LCOV - code coverage report
Current view: top level - src/lib - TextUtil.cpp (source / functions) Hit Total Coverage
Test: mediawiki/php/wikidiff2 test coverage report Lines: 89 91 97.8 %
Date: 2023-07-04 10:20:16 Functions: 5 5 100.0 %

          Line data    Source code
       1             : #include "TextUtil.h"
       2             : 
       3             : #include <thai/thailib.h>
       4             : #include <thai/thwchar.h>
       5             : 
       6             : #include <algorithm>
       7             : 
       8             : namespace wikidiff2 {
       9             : 
      10             : static thread_local TextUtil tl_textUtil;
      11             : 
      12          30 : TextUtil::TextUtil()
      13          30 :     : breaker(NULL)
      14          30 : {}
      15             : 
      16          60 : TextUtil::~TextUtil()
      17             : {
      18          30 :     if (breaker) {
      19           1 :         th_brk_delete(breaker);
      20             :     }
      21          30 : }
      22             : 
      23          47 : TextUtil & TextUtil::getInstance() {
      24          47 :     return tl_textUtil;
      25             : }
      26             : 
      27             : // Weak UTF-8 decoder
      28             : // Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
      29      117709 : int TextUtil::nextUtf8Char(String::const_iterator & p, String::const_iterator & charStart,
      30             :         String::const_iterator end)
      31             : {
      32      117709 :     int c = 0;
      33             :     unsigned char byte;
      34      117709 :     int seqLength = 0;
      35      117709 :     charStart = p;
      36      117709 :     if (p == end) {
      37         369 :         return 0;
      38             :     }
      39        1536 :     do {
      40      118876 :         byte = (unsigned char)*p;
      41      118876 :         if (byte < 0x80) {
      42      115921 :             c = byte;
      43      115921 :             seqLength = 0;
      44        2955 :         } else if (byte >= 0xc0) {
      45             :             // Start of UTF-8 character
      46             :             // If this is unexpected, due to an overshort sequence, we ignore the invalid
      47             :             // sequence and resynchronise here
      48        1419 :             if (byte < 0xe0) {
      49        1302 :                 seqLength = 1;
      50        1302 :                 c = byte & 0x1f;
      51         117 :             } else if (byte < 0xf0) {
      52         117 :                 seqLength = 2;
      53         117 :                 c = byte & 0x0f;
      54             :             } else {
      55           0 :                 seqLength = 3;
      56           0 :                 c = byte & 7;
      57             :             }
      58        1536 :         } else if (seqLength) {
      59        1536 :             c <<= 6;
      60        1536 :             c |= byte & 0x3f;
      61        1536 :             --seqLength;
      62             :         } else {
      63             :             // Unexpected continuation, ignore
      64             :         }
      65      118876 :         ++p;
      66      118876 :     } while (seqLength && p != end);
      67      117340 :     return c;
      68             : }
      69             : 
      70             : // Split a string into words
      71             : //
      72             : // TODO: I think the best way to do this would be to use ICU BreakIterator
      73             : // instead of libthai + DIY. Basically you'd run BreakIterators from several
      74             : // different locales (en, th, ja) and merge the results, i.e. if a break occurs
      75             : // in any locale at a given position, split the string. I don't know if the
      76             : // quality of the Thai dictionary in ICU matches the one in libthai, we would
      77             : // have to check this somehow.
      78         369 : void TextUtil::explodeWords(const String & text, WordVector &words)
      79             : {
      80             :     // Decode the UTF-8 in the string.
      81             :     // * Save the character sizes (in bytes)
      82             :     // * Convert the string to TIS-620, which is the internal character set of libthai.
      83             :     // * Save the character offsets of any break positions (same format as libthai).
      84             : 
      85         738 :     String tisText, charSizes;
      86         369 :     String::const_iterator suffixEnd, charStart, p;
      87         738 :     IntVector breaks;
      88             : 
      89         369 :     tisText.reserve(text.size() + 1);
      90         369 :     charSizes.reserve(text.size() + 1);
      91         369 :     breaks.reserve(text.size() + 1);
      92             :     wchar_t ch, lastChar;
      93             :     thchar_t thaiChar;
      94         369 :     bool hasThaiChars = false;
      95             : 
      96         369 :     p = text.begin();
      97         369 :     ch = nextUtf8Char(p, charStart, text.end());
      98         369 :     lastChar = 0;
      99         369 :     int charIndex = 0;
     100      117709 :     while (ch) {
     101      117340 :         thaiChar = th_uni2tis(ch);
     102      117340 :         if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
     103         104 :             hasThaiChars = true;
     104             :         }
     105      117340 :         tisText += (char)thaiChar;
     106      117340 :         charSizes += (char)(p - charStart);
     107             : 
     108      117340 :         if (isLetter(ch)) {
     109       77795 :             if (lastChar && !isLetter(lastChar)) {
     110       36861 :                 breaks.push_back(charIndex);
     111             :             }
     112             :         } else {
     113       39545 :             breaks.push_back(charIndex);
     114             :         }
     115      117340 :         charIndex++;
     116      117340 :         lastChar = ch;
     117      117340 :         ch = nextUtf8Char(p, charStart, text.end());
     118             :     }
     119             : 
     120             :     // If there were any Thai characters in the string, run th_brk on it and add
     121             :     // the resulting break positions
     122         369 :     if (hasThaiChars) {
     123           4 :         tisText += '\0';
     124           4 :         int numPlainBreaks = breaks.size();
     125           4 :         breaks.resize(numPlainBreaks + tisText.size());
     126           4 :         IntVector::iterator thaiBreaksBegin = breaks.begin() + numPlainBreaks;
     127             : 
     128             :         int numBreaks = numPlainBreaks
     129           8 :             + th_brk_find_breaks(getBreaker(), (const thchar_t*)(tisText.data()),
     130           4 :                 &*thaiBreaksBegin, tisText.size());
     131           4 :         breaks.resize(numBreaks);
     132             :         // Update invalidated iterator
     133           4 :         thaiBreaksBegin = breaks.begin() + numPlainBreaks;
     134             :         // Merge break positions and de-dupe.
     135           4 :         std::inplace_merge(breaks.begin(), thaiBreaksBegin, breaks.end());
     136           4 :         breaks.erase(std::unique(breaks.begin(), breaks.end()), breaks.end());
     137             :     }
     138             : 
     139             :     // Add a fake end-of-string character and have a break on it, so that the
     140             :     // last word gets added without special handling
     141         369 :     breaks.push_back(charSizes.size());
     142         369 :     charSizes += (char)0;
     143             : 
     144             :     // Now make the word array by traversing the breaks vector
     145         369 :     p = text.begin();
     146         369 :     IntVector::iterator pBrk = breaks.begin();
     147         369 :     String::const_iterator wordStart = text.begin();
     148         369 :     String::const_iterator suffixStart = text.end();
     149             : 
     150             :     // If there's a break at the start of the string, skip it
     151         369 :     if (pBrk != breaks.end() && *pBrk == 0) {
     152         253 :         pBrk++;
     153             :     }
     154             : 
     155      118078 :     for (charIndex = 0; charIndex < charSizes.size(); p += charSizes[charIndex++]) {
     156             :         // Assume all spaces are ASCII
     157      117709 :         if (isSpace(*p)) {
     158       36520 :             suffixStart = p;
     159             :         }
     160      117709 :         if (pBrk != breaks.end() && charIndex == *pBrk) {
     161       76538 :             if (suffixStart == text.end()) {
     162       40061 :                 words.push_back(Word(wordStart, p, p));
     163             :             } else {
     164       36477 :                 words.push_back(Word(wordStart, suffixStart, p));
     165             :             }
     166       76538 :             pBrk++;
     167       76538 :             suffixStart = text.end();
     168       76538 :             wordStart = p;
     169             :         }
     170             :     }
     171         369 : }
     172             : 
     173             : } // namespace wikidiff2

Generated by: LCOV version 1.13