Line data Source code
1 : #include "TextUtil.h"
2 :
3 : #include <thai/thailib.h>
4 : #include <thai/thwchar.h>
5 :
6 : #include <algorithm>
7 :
8 : namespace wikidiff2 {
9 :
10 : static thread_local TextUtil tl_textUtil;
11 :
12 30 : TextUtil::TextUtil()
13 30 : : breaker(NULL)
14 30 : {}
15 :
16 60 : TextUtil::~TextUtil()
17 : {
18 30 : if (breaker) {
19 1 : th_brk_delete(breaker);
20 : }
21 30 : }
22 :
23 47 : TextUtil & TextUtil::getInstance() {
24 47 : return tl_textUtil;
25 : }
26 :
27 : // Weak UTF-8 decoder
28 : // Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
29 117709 : int TextUtil::nextUtf8Char(String::const_iterator & p, String::const_iterator & charStart,
30 : String::const_iterator end)
31 : {
32 117709 : int c = 0;
33 : unsigned char byte;
34 117709 : int seqLength = 0;
35 117709 : charStart = p;
36 117709 : if (p == end) {
37 369 : return 0;
38 : }
39 1536 : do {
40 118876 : byte = (unsigned char)*p;
41 118876 : if (byte < 0x80) {
42 115921 : c = byte;
43 115921 : seqLength = 0;
44 2955 : } else if (byte >= 0xc0) {
45 : // Start of UTF-8 character
46 : // If this is unexpected, due to an overshort sequence, we ignore the invalid
47 : // sequence and resynchronise here
48 1419 : if (byte < 0xe0) {
49 1302 : seqLength = 1;
50 1302 : c = byte & 0x1f;
51 117 : } else if (byte < 0xf0) {
52 117 : seqLength = 2;
53 117 : c = byte & 0x0f;
54 : } else {
55 0 : seqLength = 3;
56 0 : c = byte & 7;
57 : }
58 1536 : } else if (seqLength) {
59 1536 : c <<= 6;
60 1536 : c |= byte & 0x3f;
61 1536 : --seqLength;
62 : } else {
63 : // Unexpected continuation, ignore
64 : }
65 118876 : ++p;
66 118876 : } while (seqLength && p != end);
67 117340 : return c;
68 : }
69 :
70 : // Split a string into words
71 : //
72 : // TODO: I think the best way to do this would be to use ICU BreakIterator
73 : // instead of libthai + DIY. Basically you'd run BreakIterators from several
74 : // different locales (en, th, ja) and merge the results, i.e. if a break occurs
75 : // in any locale at a given position, split the string. I don't know if the
76 : // quality of the Thai dictionary in ICU matches the one in libthai, we would
77 : // have to check this somehow.
78 369 : void TextUtil::explodeWords(const String & text, WordVector &words)
79 : {
80 : // Decode the UTF-8 in the string.
81 : // * Save the character sizes (in bytes)
82 : // * Convert the string to TIS-620, which is the internal character set of libthai.
83 : // * Save the character offsets of any break positions (same format as libthai).
84 :
85 738 : String tisText, charSizes;
86 369 : String::const_iterator suffixEnd, charStart, p;
87 738 : IntVector breaks;
88 :
89 369 : tisText.reserve(text.size() + 1);
90 369 : charSizes.reserve(text.size() + 1);
91 369 : breaks.reserve(text.size() + 1);
92 : wchar_t ch, lastChar;
93 : thchar_t thaiChar;
94 369 : bool hasThaiChars = false;
95 :
96 369 : p = text.begin();
97 369 : ch = nextUtf8Char(p, charStart, text.end());
98 369 : lastChar = 0;
99 369 : int charIndex = 0;
100 117709 : while (ch) {
101 117340 : thaiChar = th_uni2tis(ch);
102 117340 : if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
103 104 : hasThaiChars = true;
104 : }
105 117340 : tisText += (char)thaiChar;
106 117340 : charSizes += (char)(p - charStart);
107 :
108 117340 : if (isLetter(ch)) {
109 77795 : if (lastChar && !isLetter(lastChar)) {
110 36861 : breaks.push_back(charIndex);
111 : }
112 : } else {
113 39545 : breaks.push_back(charIndex);
114 : }
115 117340 : charIndex++;
116 117340 : lastChar = ch;
117 117340 : ch = nextUtf8Char(p, charStart, text.end());
118 : }
119 :
120 : // If there were any Thai characters in the string, run th_brk on it and add
121 : // the resulting break positions
122 369 : if (hasThaiChars) {
123 4 : tisText += '\0';
124 4 : int numPlainBreaks = breaks.size();
125 4 : breaks.resize(numPlainBreaks + tisText.size());
126 4 : IntVector::iterator thaiBreaksBegin = breaks.begin() + numPlainBreaks;
127 :
128 : int numBreaks = numPlainBreaks
129 8 : + th_brk_find_breaks(getBreaker(), (const thchar_t*)(tisText.data()),
130 4 : &*thaiBreaksBegin, tisText.size());
131 4 : breaks.resize(numBreaks);
132 : // Update invalidated iterator
133 4 : thaiBreaksBegin = breaks.begin() + numPlainBreaks;
134 : // Merge break positions and de-dupe.
135 4 : std::inplace_merge(breaks.begin(), thaiBreaksBegin, breaks.end());
136 4 : breaks.erase(std::unique(breaks.begin(), breaks.end()), breaks.end());
137 : }
138 :
139 : // Add a fake end-of-string character and have a break on it, so that the
140 : // last word gets added without special handling
141 369 : breaks.push_back(charSizes.size());
142 369 : charSizes += (char)0;
143 :
144 : // Now make the word array by traversing the breaks vector
145 369 : p = text.begin();
146 369 : IntVector::iterator pBrk = breaks.begin();
147 369 : String::const_iterator wordStart = text.begin();
148 369 : String::const_iterator suffixStart = text.end();
149 :
150 : // If there's a break at the start of the string, skip it
151 369 : if (pBrk != breaks.end() && *pBrk == 0) {
152 253 : pBrk++;
153 : }
154 :
155 118078 : for (charIndex = 0; charIndex < charSizes.size(); p += charSizes[charIndex++]) {
156 : // Assume all spaces are ASCII
157 117709 : if (isSpace(*p)) {
158 36520 : suffixStart = p;
159 : }
160 117709 : if (pBrk != breaks.end() && charIndex == *pBrk) {
161 76538 : if (suffixStart == text.end()) {
162 40061 : words.push_back(Word(wordStart, p, p));
163 : } else {
164 36477 : words.push_back(Word(wordStart, suffixStart, p));
165 : }
166 76538 : pBrk++;
167 76538 : suffixStart = text.end();
168 76538 : wordStart = p;
169 : }
170 : }
171 369 : }
172 :
173 : } // namespace wikidiff2
|