Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
20.48% |
43 / 210 |
|
17.65% |
3 / 17 |
CRAP | |
0.00% |
0 / 1 |
GenReplFst | |
20.48% |
43 / 210 |
|
17.65% |
3 / 17 |
2393.46 | |
0.00% |
0 / 1 |
addAlphabet | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
addEntry | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
nextUtf8State | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
42 | |||
utf8alphabet | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
42 | |||
splitFirstUtf8Char | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
flagDiacritic | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
addFdEdge | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
addFdEdgePair | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
buildState | |
0.00% |
0 / 74 |
|
0.00% |
0 / 1 |
380 | |||
emit | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
byteToHex | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
stringToTokens | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
tokensToString | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
applyDown | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
applyUp | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
__construct | |
100.00% |
35 / 35 |
|
100.00% |
1 / 1 |
5 | |||
writeATT | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikimedia\LangConv\Construct; |
4 | |
5 | use Wikimedia\Assert\Assert; |
6 | |
7 | /** |
8 | * GENerate a REPLacement string FST. |
9 | * |
10 | * Create an FST from a replacement string array (aka, as would be provided |
11 | * to `str_tr` or `ReplacementArray` in mediawiki core). |
12 | */ |
13 | class GenReplFst { |
14 | // This can be anything, as long as it is longer than 1 character |
15 | // (so it doesn't conflict w/ any of the single-character keys) |
16 | private const END_OF_STRING = '*END*'; |
17 | // correlation of tree nodes to state machine states |
18 | private const STATE = '*STATE*'; |
19 | // UTF-8 decode state: 0=first byte, 1/2/3=bytes remaining in char |
20 | private const UTF8STATE = '*UTF8STATE*'; |
21 | // Character index of this tree node |
22 | private const INDEX = '*INDEX*'; |
23 | |
24 | /** @var array<int|string,int|string|array> */ |
25 | private $prefixTree = []; |
26 | /** @var MutableFST */ |
27 | private $fst; |
28 | /** @var array */ |
29 | private $alphabet; |
30 | /** |
31 | * Prefix to use on flag diacritics (so they are unique to this FST). |
32 | * @var string |
33 | */ |
34 | private $fdPrefix; |
35 | /** |
36 | * How many flag diacritic features are needed. At most one less than |
37 | * the longest string, but can be shorter. |
38 | * @var int |
39 | */ |
40 | private $maxLookahead; |
41 | /** |
42 | * Cache of shift machines. |
43 | * @var array |
44 | */ |
45 | private $shiftCache = []; |
46 | |
47 | /** |
48 | * Add each letter in the given word to our alphabet. |
49 | * @param array &$alphabet |
50 | * @param string $word |
51 | */ |
52 | private static function addAlphabet( array &$alphabet, string $word ): void { |
53 | for ( $i = 0; $i < strlen( $word ); $i++ ) { |
54 | $alphabet[ord( $word[$i] )] = true; |
55 | } |
56 | } |
57 | |
58 | /** |
59 | * Add the given string (the substring of $from starting from $index) to |
60 | * the prefix tree $tree, along with the conversion output $to. |
61 | * @param array<int|string,string|array> &$tree |
62 | * @param string $from |
63 | * @param int $index |
64 | * @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char |
65 | * @param string $to |
66 | * @param bool $suppressUtf8Checks Whether to suppress the checks that the |
67 | * $from string is complete utf8. |
68 | */ |
69 | private static function addEntry( |
70 | array &$tree, string $from, int $index, int $utf8state, string $to, |
71 | bool $suppressUtf8Checks = false |
72 | ): void { |
73 | $c = ord( $from[$index] ); |
74 | if ( !isset( $tree[$c] ) ) { |
75 | $tree[$c] = []; |
76 | } |
77 | if ( !isset( $tree[self::UTF8STATE] ) ) { |
78 | $tree[self::UTF8STATE] = $utf8state; |
79 | } |
80 | Assert::invariant( |
81 | isset( $tree[self::UTF8STATE] ) && $tree[self::UTF8STATE] === $utf8state, |
82 | "Should never happen" |
83 | ); |
84 | if ( !isset( $tree[self::INDEX] ) ) { |
85 | $tree[self::INDEX] = $index; |
86 | } |
87 | Assert::invariant( |
88 | isset( $tree[self::INDEX] ) && $tree[self::INDEX] === $index, |
89 | "Should never happen" |
90 | ); |
91 | $nextUtf8State = self::nextUtf8State( $utf8state, $c ); |
92 | $nextIndex = $index + 1; |
93 | if ( $nextIndex < strlen( $from ) ) { |
94 | self::addEntry( $tree[$c], $from, $nextIndex, $nextUtf8State, $to ); |
95 | } else { |
96 | if ( !$suppressUtf8Checks ) { |
97 | Assert::invariant( $nextUtf8State === 0, "Bad UTF-8 in input" ); |
98 | } |
99 | $tree[$c][self::UTF8STATE] = $nextUtf8State; |
100 | $tree[$c][self::INDEX] = $nextIndex; |
101 | $tree[$c][self::END_OF_STRING] = $to; |
102 | } |
103 | } |
104 | |
105 | /** |
106 | * Return the next UTF-8 state, given the current state and the |
107 | * current character. |
108 | * @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char |
109 | * @param int $c The current character |
110 | * @return int The next UTF-8 state |
111 | */ |
112 | private static function nextUtf8State( int $utf8state, int $c ): int { |
113 | if ( $utf8state === 0 ) { |
114 | if ( $c <= 0x7F ) { |
115 | return 0; |
116 | } elseif ( $c <= 0xDF ) { |
117 | Assert::invariant( $c >= 0xC0, "Bad UTF-8 in input" ); |
118 | return 1; |
119 | } elseif ( $c <= 0xEF ) { |
120 | Assert::invariant( $c >= 0xE0, "Bad UTF-8 in input" ); |
121 | return 2; |
122 | } elseif ( $c <= 0xF7 ) { |
123 | Assert::invariant( $c >= 0xF0, "Bad UTF-8 in input" ); |
124 | return 3; |
125 | } |
126 | } else { |
127 | return $utf8state - 1; |
128 | } |
129 | // @phan-suppress-next-line PhanImpossibleCondition |
130 | Assert::invariant( false, "Bad UTF-8 in input" ); |
131 | } |
132 | |
133 | /** |
134 | * Return the subset of the given alphabet appropriate for the current |
135 | * UTF-8 state. |
136 | * @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char |
137 | * @return \Generator<int> |
138 | */ |
139 | private function utf8alphabet( int $utf8state ) { |
140 | foreach ( $this->alphabet as $c ) { |
141 | if ( $c >= 0x80 && $c <= 0xBF ) { |
142 | // UTF-8 continuation character |
143 | if ( $utf8state !== 0 ) { |
144 | yield $c; |
145 | } |
146 | } else { |
147 | if ( $utf8state === 0 ) { |
148 | yield $c; |
149 | } |
150 | } |
151 | } |
152 | } |
153 | |
154 | /** |
155 | * Split a given string into the first utf8 char, and "everything else". |
156 | * @param string $s |
157 | * @return string[] |
158 | */ |
159 | private static function splitFirstUtf8Char( string $s ) { |
160 | $utf8state = 0; |
161 | $i = 0; |
162 | while ( $i < strlen( $s ) ) { |
163 | $c = ord( $s[$i] ); |
164 | $utf8state = self::nextUtf8State( $utf8state, $c ); |
165 | $i += 1; |
166 | if ( $utf8state === 0 ) { |
167 | break; |
168 | } |
169 | } |
170 | return [ substr( $s, 0, $i ), substr( $s, $i ) ]; |
171 | } |
172 | |
173 | /** |
174 | * Return the name of a flag diacritic for matching character at a |
175 | * specified offset. |
176 | * @param string $type Flag diacritic operator: P/R/D/C/U |
177 | * @param int $offset Identifies the feature |
178 | * @param int|null $value The value to set/test (optional) |
179 | * @return string The symbol name for the given flag diacritic operation |
180 | */ |
181 | private function flagDiacritic( |
182 | string $type, int $offset, ?int $value = null |
183 | ): string { |
184 | $str = "@$type." . $this->fdPrefix . "char$offset"; |
185 | if ( $value !== null ) { |
186 | $str .= "."; |
187 | if ( $value >= 0 ) { |
188 | $str .= self::byteToHex( $value ); |
189 | } else { |
190 | $str .= "UNK"; |
191 | } |
192 | } |
193 | return $str . "@"; |
194 | } |
195 | |
196 | /** |
197 | * Add a flag diacritic edge between $from and $to. By convention the |
198 | * flag diacritic is on both the upper and lower slots of the edge. |
199 | * @param State $from The source of the new edge |
200 | * @param State $to The destination of the new edge |
201 | * @param string $type The flag diacritic operator |
202 | * @param int $offset The flag diacritic feature |
203 | * @param int|null $value The flag diacritic value (optional) |
204 | */ |
205 | private function addFdEdge( |
206 | State $from, State $to, string $type, int $offset, ?int $value = null |
207 | ): void { |
208 | $fdName = $this->flagDiacritic( $type, $offset, $value ); |
209 | $from->addEdge( $fdName, $fdName, $to ); |
210 | } |
211 | |
212 | /** |
213 | * Add two flag diacritic edge between $from and $to. By convention the |
214 | * flag diacritics are on both the upper and lower slots of the edge. |
215 | * @param State $from The source of the new edges |
216 | * @param State $to The destination of the new edges |
217 | * @param string $type1 The first flag diacritic operator |
218 | * @param int $offset1 The first flag diacritic feature |
219 | * @param int|null $value1 The first flag diacritic value (optional) |
220 | * @param string $type2 The second flag diacritic operator |
221 | * @param int $offset2 The second flag diacritic feature |
222 | * @param int|null $value2 The second flag diacritic value (optional) |
223 | */ |
224 | private function addFdEdgePair( |
225 | State $from, State $to, |
226 | string $type1, int $offset1, ?int $value1, |
227 | string $type2, int $offset2, ?int $value2 |
228 | ): void { |
229 | $n = $this->fst->newState(); |
230 | $this->addFdEdge( $from, $n, $type1, $offset1, $value1 ); |
231 | $this->addFdEdge( $n, $to, $type2, $offset2, $value2 ); |
232 | } |
233 | |
234 | /** |
235 | * Add edges from state $from corresponding to the prefix tree $tree, |
236 | * given the $lastMatch and the characters seen since then, $seen. |
237 | * @param State $from |
238 | * @param array &$tree |
239 | * @param ?string $lastMatch (null if we haven't seen a match) |
240 | * @param string $seen characters seen since last match |
241 | */ |
242 | private function buildState( State $from, array &$tree, ?string $lastMatch, string $seen ): void { |
243 | $tree[self::STATE] = $from; |
244 | $index = $tree[self::INDEX]; |
245 | $utf8state = $tree[self::UTF8STATE]; |
246 | if ( $index < $this->maxLookahead ) { |
247 | $noBufState = $this->fst->newState(); |
248 | $this->addFdEdge( $from, $noBufState, 'D', $index ); |
249 | } else { |
250 | $noBufState = $from; |
251 | } |
252 | $noMatchState = $this->fst->newState(); |
253 | |
254 | if ( isset( $tree[self::END_OF_STRING] ) ) { |
255 | $lastMatch = $tree[self::END_OF_STRING]; |
256 | $seen = ''; |
257 | } |
258 | foreach ( $this->utf8alphabet( $utf8state ) as $c ) { |
259 | if ( isset( $tree[$c] ) ) { |
260 | $nextSeen = $seen . chr( $c ); |
261 | $n = $this->fst->newState(); |
262 | $noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n ); |
263 | if ( $index < $this->maxLookahead ) { |
264 | $this->addFdEdge( $from, $n, 'R', $index, $c ); |
265 | } |
266 | $nn = $this->fst->newState(); |
267 | $this->addFdEdge( $n, $nn, 'P', strlen( $seen ), $c ); |
268 | $this->buildState( $nn, $tree[$c], $lastMatch, $nextSeen ); |
269 | } else { |
270 | Assert::invariant( $index !== 0, |
271 | "fake single-char matches should always exist" ); |
272 | |
273 | $n = $this->fst->newState(); |
274 | $noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n ); |
275 | if ( $index < $this->maxLookahead ) { |
276 | $this->addFdEdge( $from, $n, 'R', $index, $c ); |
277 | } |
278 | Assert::invariant( |
279 | strlen( $seen ) < $this->maxLookahead, |
280 | "Max lookahead should always account for seen" |
281 | ); |
282 | $this->addFdEdge( $n, $noMatchState, 'P', strlen( $seen ), $c ); |
283 | } |
284 | } |
285 | // our first state must echo all continuation characters, since |
286 | // the anythingState transitions there and we don't know what |
287 | // utf8 state IDENTITY will leave us in. (The characters not in |
288 | // our alphabet could consist of 1-/2-/3-/4-byte sequences.) |
289 | if ( $index === 0 ) { |
290 | foreach ( self::utf8alphabet( 1/*continuation chars*/ ) as $c ) { |
291 | $nextSeen = $seen . chr( $c ); |
292 | $n = $this->fst->newState(); |
293 | $noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n ); |
294 | if ( $index < $this->maxLookahead ) { |
295 | $this->addFdEdge( $from, $n, 'R', $index, $c ); |
296 | } |
297 | $fakeTree = []; |
298 | $fakeTree[self::UTF8STATE] = 1; |
299 | $fakeTree[self::INDEX] = $index + 1; |
300 | $fakeTree[self::END_OF_STRING] = chr( $c ); |
301 | $this->buildState( $n, $fakeTree, $lastMatch, $nextSeen ); |
302 | } |
303 | } |
304 | // "anything else" |
305 | $anythingElse = $this->fst->newState(); |
306 | $noBufState->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $anythingElse ); |
307 | if ( $index !== 0 ) { |
308 | $this->addFdEdge( $from, $anythingElse, 'R', $index, -1 ); |
309 | } |
310 | $this->addFdEdge( $anythingElse, $noMatchState, 'P', strlen( $seen ), -1 ); |
311 | // Emit the last match |
312 | if ( $lastMatch !== null ) { |
313 | $noMatchState = $this->emit( $noMatchState, MutableFST::EPSILON, $lastMatch ); |
314 | } |
315 | // Shift over queued input |
316 | for ( $i = $index + 1; true; $i++ ) { |
317 | $j = strlen( $seen ) + ( $i - $index ); |
318 | $key = $i . ':' . $j; |
319 | if ( isset( $this->shiftCache[$key] ) ) { |
320 | $noMatchState->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $this->shiftCache[$key] ); |
321 | return; |
322 | } |
323 | $this->shiftCache[$key] = $noMatchState; |
324 | if ( $i < $this->maxLookahead ) { |
325 | $n = $this->fst->newState(); |
326 | $this->addFdEdge( $noMatchState, $n, 'D', $i ); |
327 | } else { |
328 | $n = $noMatchState; |
329 | } |
330 | for ( $k = $j; $k < $i && $k < $this->maxLookahead; $k++ ) { |
331 | $nn = $this->fst->newState(); |
332 | $this->addFdEdge( $n, $nn, 'C', $k ); |
333 | $n = $nn; |
334 | } |
335 | $n->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $this->fst->getStartState() ); |
336 | if ( !( $i < $this->maxLookahead ) ) { |
337 | break; |
338 | } |
339 | $n = $this->fst->newState(); |
340 | $this->addFdEdgePair( $noMatchState, $n, 'R', $i, -1, 'P', $j, -1 ); |
341 | foreach ( $this->alphabet as $c ) { |
342 | $this->addFdEdgePair( $noMatchState, $n, 'R', $i, $c, 'P', $j, $c ); |
343 | } |
344 | $noMatchState = $n; |
345 | } |
346 | } |
347 | |
348 | /** |
349 | * Chain states together from $fromState to emit $emitStr. |
350 | * @param State $fromState |
351 | * @param string $fromChar |
352 | * @param string $emitStr |
353 | * @return State the resulting state (after the string has been emitted) |
354 | */ |
355 | private function emit( State $fromState, string $fromChar, string $emitStr ): State { |
356 | if ( strlen( $emitStr ) === 0 && $fromChar !== MutableFST::EPSILON ) { |
357 | $n = $this->fst->newState(); |
358 | $fromState->addEdge( $fromChar, MutableFST::EPSILON, $n ); |
359 | return $n; |
360 | } |
361 | for ( $i = 0; $i < strlen( $emitStr ); $i++ ) { |
362 | $c = ord( $emitStr[$i] ); |
363 | $n = $this->fst->newState(); |
364 | $fromState->addEdge( $fromChar, self::byteToHex( $c ), $n ); |
365 | $fromState = $n; |
366 | $fromChar = MutableFST::EPSILON; |
367 | } |
368 | return $fromState; |
369 | } |
370 | |
371 | /** |
372 | * Private helper function: convert a numeric byte to the string |
373 | * token we use in the FST. |
374 | * @param int $byte |
375 | * @return string Token |
376 | */ |
377 | private static function byteToHex( int $byte ): string { |
378 | $s = strtoupper( dechex( $byte ) ); |
379 | while ( strlen( $s ) < 2 ) { |
380 | $s = "0$s"; |
381 | } |
382 | return $s; |
383 | } |
384 | |
385 | /** |
386 | * Private helper function: convert a UTF-8 string byte-by-byte into |
387 | * an array of tokens. |
388 | * @param string $s |
389 | * @return string[] |
390 | */ |
391 | private static function stringToTokens( string $s ): array { |
392 | $toks = []; |
393 | for ( $i = 0; $i < strlen( $s ); $i++ ) { |
394 | $toks[] = self::byteToHex( ord( $s[$i] ) ); |
395 | } |
396 | return $toks; |
397 | } |
398 | |
399 | /** |
400 | * Private helper function: convert an array of tokens into |
401 | * a UTF-8 string. |
402 | * @param string[] $toks |
403 | * @return string |
404 | */ |
405 | private static function tokensToString( array $toks ): string { |
406 | $s = ''; |
407 | foreach ( $toks as $token ) { |
408 | if ( strlen( $token ) === 2 ) { |
409 | $s .= chr( hexdec( $token ) ); |
410 | } else { |
411 | // shouldn't happen, but handy for debugging if it does |
412 | $s .= $token; |
413 | } |
414 | } |
415 | return $s; |
416 | } |
417 | |
418 | /** |
419 | * For testing: apply the resulting FST to the given input string. |
420 | * @param string $input |
421 | * @return string[] The possible outputs. |
422 | */ |
423 | public function applyDown( string $input ): array { |
424 | // convert input to byte tokens |
425 | $result = $this->fst->applyDown( self::stringToTokens( $input ) ); |
426 | return array_map( function ( $toks ) { |
427 | return self::tokensToString( $toks ); |
428 | }, $result ); |
429 | } |
430 | |
431 | /** |
432 | * For testing: run the resulting FST "in reverse" against the given |
433 | * input string. |
434 | * @param string $input |
435 | * @return string[] The possible outputs. |
436 | */ |
437 | public function applyUp( string $input ): array { |
438 | // convert input to byte tokens |
439 | $result = $this->fst->applyUp( self::stringToTokens( $input ) ); |
440 | return array_map( function ( $toks ) { |
441 | return self::tokensToString( $toks ); |
442 | }, $result ); |
443 | } |
444 | |
445 | /** |
446 | * Convert the given $replacementTable (strtr-style) to an FST. |
447 | * @param string $name |
448 | * @param array<string,string> $replacementTable |
449 | * @param string $fdPrefix Flag diacritic feature prefix, for uniqueness |
450 | */ |
451 | public function __construct( |
452 | string $name, array $replacementTable, string $fdPrefix = '' |
453 | ) { |
454 | $this->fdPrefix = $fdPrefix; |
455 | $alphabet = []; |
456 | $longestWord = 0; |
457 | foreach ( $replacementTable as $from => $to ) { |
458 | $longestWord = max( $longestWord, strlen( $from ) ); |
459 | self::addAlphabet( $alphabet, $from ); |
460 | self::addAlphabet( $alphabet, $to ); |
461 | self::addEntry( $this->prefixTree, $from, 0, 0, $to ); |
462 | } |
463 | // fake one character matches! |
464 | foreach ( $alphabet as $sym => $value ) { |
465 | if ( $sym < 0x80 || $sym > 0xBF ) { // not continuation chars |
466 | self::addEntry( $this->prefixTree, chr( $sym ), 0, 0, chr( $sym ), true ); |
467 | } |
468 | } |
469 | $this->maxLookahead = $longestWord - 1; // XXX could be shorter |
470 | $this->alphabet = array_keys( $alphabet ); |
471 | sort( $this->alphabet, SORT_NUMERIC ); |
472 | // ok, now we're ready to emit the FST |
473 | $this->fst = new MutableFST( array_map( function ( $n ) { |
474 | return self::byteToHex( $n ); |
475 | }, $this->alphabet ) ); |
476 | $anythingState = $this->fst->newState(); |
477 | $anything2State = $this->fst->newState(); |
478 | $anythingState->addEdge( |
479 | MutableFST::UNKNOWN, MutableFST::IDENTITY, |
480 | $anything2State |
481 | ); |
482 | $this->addFdEdge( $this->fst->getStartState(), $anythingState, 'R', 0, -1 ); |
483 | $this->addFdEdge( $anything2State, $this->fst->getStartState(), 'C', 0 ); |
484 | // The anything state could also be the end of the string |
485 | // (which for modelling purposes we can think of as a special |
486 | // "EOF" token not in the alphabet) |
487 | // Important that there are no outgoing edges from the $endState! |
488 | $endState = $this->fst->newState(); |
489 | $endState->isFinal = true; |
490 | $anythingState->addEdge( |
491 | MutableFST::EPSILON, MutableFST::EPSILON, |
492 | $endState |
493 | ); |
494 | |
495 | // Create states corresponding to prefix tree nodes |
496 | $this->buildState( |
497 | $this->fst->getStartState(), $this->prefixTree, null, '' |
498 | ); |
499 | // ok, done! |
500 | $this->fst->optimize(); |
501 | } |
502 | |
503 | /** |
504 | * Write the FST to the given file handle in AT&T format. |
505 | * @param resource $handle |
506 | */ |
507 | public function writeATT( $handle ): void { |
508 | $this->fst->writeATT( $handle ); |
509 | } |
510 | |
511 | } |