Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
20.48% |
43 / 210 |
|
17.65% |
3 / 17 |
CRAP | |
0.00% |
0 / 1 |
| GenReplFst | |
20.48% |
43 / 210 |
|
17.65% |
3 / 17 |
2393.46 | |
0.00% |
0 / 1 |
| addAlphabet | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| addEntry | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
| nextUtf8State | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
42 | |||
| utf8alphabet | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
42 | |||
| splitFirstUtf8Char | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
| flagDiacritic | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
| addFdEdge | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| addFdEdgePair | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| buildState | |
0.00% |
0 / 74 |
|
0.00% |
0 / 1 |
380 | |||
| emit | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
| byteToHex | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| stringToTokens | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| tokensToString | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
| applyDown | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| applyUp | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| __construct | |
100.00% |
35 / 35 |
|
100.00% |
1 / 1 |
5 | |||
| writeATT | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikimedia\LangConv\Construct; |
| 4 | |
| 5 | use Wikimedia\Assert\Assert; |
| 6 | |
| 7 | /** |
| 8 | * GENerate a REPLacement string FST. |
| 9 | * |
| 10 | * Create an FST from a replacement string array (aka, as would be provided |
| 11 | * to `str_tr` or `ReplacementArray` in mediawiki core). |
| 12 | */ |
| 13 | class GenReplFst { |
| 14 | // This can be anything, as long as it is longer than 1 character |
| 15 | // (so it doesn't conflict w/ any of the single-character keys) |
| 16 | private const END_OF_STRING = '*END*'; |
| 17 | // correlation of tree nodes to state machine states |
| 18 | private const STATE = '*STATE*'; |
| 19 | // UTF-8 decode state: 0=first byte, 1/2/3=bytes remaining in char |
| 20 | private const UTF8STATE = '*UTF8STATE*'; |
| 21 | // Character index of this tree node |
| 22 | private const INDEX = '*INDEX*'; |
| 23 | |
| 24 | /** @var array<int|string,int|string|array> */ |
| 25 | private $prefixTree = []; |
| 26 | /** @var MutableFST */ |
| 27 | private $fst; |
| 28 | /** @var array */ |
| 29 | private $alphabet; |
| 30 | /** |
| 31 | * Prefix to use on flag diacritics (so they are unique to this FST). |
| 32 | * @var string |
| 33 | */ |
| 34 | private $fdPrefix; |
| 35 | /** |
| 36 | * How many flag diacritic features are needed. At most one less than |
| 37 | * the longest string, but can be shorter. |
| 38 | * @var int |
| 39 | */ |
| 40 | private $maxLookahead; |
| 41 | /** |
| 42 | * Cache of shift machines. |
| 43 | * @var array |
| 44 | */ |
| 45 | private $shiftCache = []; |
| 46 | |
| 47 | /** |
| 48 | * Add each letter in the given word to our alphabet. |
| 49 | * @param array &$alphabet |
| 50 | * @param string $word |
| 51 | */ |
| 52 | private static function addAlphabet( array &$alphabet, string $word ): void { |
| 53 | for ( $i = 0; $i < strlen( $word ); $i++ ) { |
| 54 | $alphabet[ord( $word[$i] )] = true; |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | /** |
| 59 | * Add the given string (the substring of $from starting from $index) to |
| 60 | * the prefix tree $tree, along with the conversion output $to. |
| 61 | * @param array<int|string,string|array> &$tree |
| 62 | * @param string $from |
| 63 | * @param int $index |
| 64 | * @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char |
| 65 | * @param string $to |
| 66 | * @param bool $suppressUtf8Checks Whether to suppress the checks that the |
| 67 | * $from string is complete utf8. |
| 68 | */ |
| 69 | private static function addEntry( |
| 70 | array &$tree, string $from, int $index, int $utf8state, string $to, |
| 71 | bool $suppressUtf8Checks = false |
| 72 | ): void { |
| 73 | $c = ord( $from[$index] ); |
| 74 | if ( !isset( $tree[$c] ) ) { |
| 75 | $tree[$c] = []; |
| 76 | } |
| 77 | if ( !isset( $tree[self::UTF8STATE] ) ) { |
| 78 | $tree[self::UTF8STATE] = $utf8state; |
| 79 | } |
| 80 | Assert::invariant( |
| 81 | isset( $tree[self::UTF8STATE] ) && $tree[self::UTF8STATE] === $utf8state, |
| 82 | "Should never happen" |
| 83 | ); |
| 84 | if ( !isset( $tree[self::INDEX] ) ) { |
| 85 | $tree[self::INDEX] = $index; |
| 86 | } |
| 87 | Assert::invariant( |
| 88 | isset( $tree[self::INDEX] ) && $tree[self::INDEX] === $index, |
| 89 | "Should never happen" |
| 90 | ); |
| 91 | $nextUtf8State = self::nextUtf8State( $utf8state, $c ); |
| 92 | $nextIndex = $index + 1; |
| 93 | if ( $nextIndex < strlen( $from ) ) { |
| 94 | self::addEntry( $tree[$c], $from, $nextIndex, $nextUtf8State, $to ); |
| 95 | } else { |
| 96 | if ( !$suppressUtf8Checks ) { |
| 97 | Assert::invariant( $nextUtf8State === 0, "Bad UTF-8 in input" ); |
| 98 | } |
| 99 | $tree[$c][self::UTF8STATE] = $nextUtf8State; |
| 100 | $tree[$c][self::INDEX] = $nextIndex; |
| 101 | $tree[$c][self::END_OF_STRING] = $to; |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | /** |
| 106 | * Return the next UTF-8 state, given the current state and the |
| 107 | * current character. |
| 108 | * @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char |
| 109 | * @param int $c The current character |
| 110 | * @return int The next UTF-8 state |
| 111 | */ |
| 112 | private static function nextUtf8State( int $utf8state, int $c ): int { |
| 113 | if ( $utf8state === 0 ) { |
| 114 | if ( $c <= 0x7F ) { |
| 115 | return 0; |
| 116 | } elseif ( $c <= 0xDF ) { |
| 117 | Assert::invariant( $c >= 0xC0, "Bad UTF-8 in input" ); |
| 118 | return 1; |
| 119 | } elseif ( $c <= 0xEF ) { |
| 120 | Assert::invariant( $c >= 0xE0, "Bad UTF-8 in input" ); |
| 121 | return 2; |
| 122 | } elseif ( $c <= 0xF7 ) { |
| 123 | Assert::invariant( $c >= 0xF0, "Bad UTF-8 in input" ); |
| 124 | return 3; |
| 125 | } |
| 126 | } else { |
| 127 | return $utf8state - 1; |
| 128 | } |
| 129 | // @phan-suppress-next-line PhanImpossibleCondition |
| 130 | Assert::invariant( false, "Bad UTF-8 in input" ); |
| 131 | } |
| 132 | |
| 133 | /** |
| 134 | * Return the subset of the given alphabet appropriate for the current |
| 135 | * UTF-8 state. |
| 136 | * @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char |
| 137 | * @return \Generator<int> |
| 138 | */ |
| 139 | private function utf8alphabet( int $utf8state ) { |
| 140 | foreach ( $this->alphabet as $c ) { |
| 141 | if ( $c >= 0x80 && $c <= 0xBF ) { |
| 142 | // UTF-8 continuation character |
| 143 | if ( $utf8state !== 0 ) { |
| 144 | yield $c; |
| 145 | } |
| 146 | } else { |
| 147 | if ( $utf8state === 0 ) { |
| 148 | yield $c; |
| 149 | } |
| 150 | } |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | /** |
| 155 | * Split a given string into the first utf8 char, and "everything else". |
| 156 | * @param string $s |
| 157 | * @return string[] |
| 158 | */ |
| 159 | private static function splitFirstUtf8Char( string $s ) { |
| 160 | $utf8state = 0; |
| 161 | $i = 0; |
| 162 | while ( $i < strlen( $s ) ) { |
| 163 | $c = ord( $s[$i] ); |
| 164 | $utf8state = self::nextUtf8State( $utf8state, $c ); |
| 165 | $i += 1; |
| 166 | if ( $utf8state === 0 ) { |
| 167 | break; |
| 168 | } |
| 169 | } |
| 170 | return [ substr( $s, 0, $i ), substr( $s, $i ) ]; |
| 171 | } |
| 172 | |
| 173 | /** |
| 174 | * Return the name of a flag diacritic for matching character at a |
| 175 | * specified offset. |
| 176 | * @param string $type Flag diacritic operator: P/R/D/C/U |
| 177 | * @param int $offset Identifies the feature |
| 178 | * @param int|null $value The value to set/test (optional) |
| 179 | * @return string The symbol name for the given flag diacritic operation |
| 180 | */ |
| 181 | private function flagDiacritic( |
| 182 | string $type, int $offset, ?int $value = null |
| 183 | ): string { |
| 184 | $str = "@$type." . $this->fdPrefix . "char$offset"; |
| 185 | if ( $value !== null ) { |
| 186 | $str .= "."; |
| 187 | if ( $value >= 0 ) { |
| 188 | $str .= self::byteToHex( $value ); |
| 189 | } else { |
| 190 | $str .= "UNK"; |
| 191 | } |
| 192 | } |
| 193 | return $str . "@"; |
| 194 | } |
| 195 | |
| 196 | /** |
| 197 | * Add a flag diacritic edge between $from and $to. By convention the |
| 198 | * flag diacritic is on both the upper and lower slots of the edge. |
| 199 | * @param State $from The source of the new edge |
| 200 | * @param State $to The destination of the new edge |
| 201 | * @param string $type The flag diacritic operator |
| 202 | * @param int $offset The flag diacritic feature |
| 203 | * @param int|null $value The flag diacritic value (optional) |
| 204 | */ |
| 205 | private function addFdEdge( |
| 206 | State $from, State $to, string $type, int $offset, ?int $value = null |
| 207 | ): void { |
| 208 | $fdName = $this->flagDiacritic( $type, $offset, $value ); |
| 209 | $from->addEdge( $fdName, $fdName, $to ); |
| 210 | } |
| 211 | |
| 212 | /** |
| 213 | * Add two flag diacritic edge between $from and $to. By convention the |
| 214 | * flag diacritics are on both the upper and lower slots of the edge. |
| 215 | * @param State $from The source of the new edges |
| 216 | * @param State $to The destination of the new edges |
| 217 | * @param string $type1 The first flag diacritic operator |
| 218 | * @param int $offset1 The first flag diacritic feature |
| 219 | * @param int|null $value1 The first flag diacritic value (optional) |
| 220 | * @param string $type2 The second flag diacritic operator |
| 221 | * @param int $offset2 The second flag diacritic feature |
| 222 | * @param int|null $value2 The second flag diacritic value (optional) |
| 223 | */ |
| 224 | private function addFdEdgePair( |
| 225 | State $from, State $to, |
| 226 | string $type1, int $offset1, ?int $value1, |
| 227 | string $type2, int $offset2, ?int $value2 |
| 228 | ): void { |
| 229 | $n = $this->fst->newState(); |
| 230 | $this->addFdEdge( $from, $n, $type1, $offset1, $value1 ); |
| 231 | $this->addFdEdge( $n, $to, $type2, $offset2, $value2 ); |
| 232 | } |
| 233 | |
| 234 | /** |
| 235 | * Add edges from state $from corresponding to the prefix tree $tree, |
| 236 | * given the $lastMatch and the characters seen since then, $seen. |
| 237 | * @param State $from |
| 238 | * @param array &$tree |
| 239 | * @param ?string $lastMatch (null if we haven't seen a match) |
| 240 | * @param string $seen characters seen since last match |
| 241 | */ |
| 242 | private function buildState( State $from, array &$tree, ?string $lastMatch, string $seen ): void { |
| 243 | $tree[self::STATE] = $from; |
| 244 | $index = $tree[self::INDEX]; |
| 245 | $utf8state = $tree[self::UTF8STATE]; |
| 246 | if ( $index < $this->maxLookahead ) { |
| 247 | $noBufState = $this->fst->newState(); |
| 248 | $this->addFdEdge( $from, $noBufState, 'D', $index ); |
| 249 | } else { |
| 250 | $noBufState = $from; |
| 251 | } |
| 252 | $noMatchState = $this->fst->newState(); |
| 253 | |
| 254 | if ( isset( $tree[self::END_OF_STRING] ) ) { |
| 255 | $lastMatch = $tree[self::END_OF_STRING]; |
| 256 | $seen = ''; |
| 257 | } |
| 258 | foreach ( $this->utf8alphabet( $utf8state ) as $c ) { |
| 259 | if ( isset( $tree[$c] ) ) { |
| 260 | $nextSeen = $seen . chr( $c ); |
| 261 | $n = $this->fst->newState(); |
| 262 | $noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n ); |
| 263 | if ( $index < $this->maxLookahead ) { |
| 264 | $this->addFdEdge( $from, $n, 'R', $index, $c ); |
| 265 | } |
| 266 | $nn = $this->fst->newState(); |
| 267 | $this->addFdEdge( $n, $nn, 'P', strlen( $seen ), $c ); |
| 268 | $this->buildState( $nn, $tree[$c], $lastMatch, $nextSeen ); |
| 269 | } else { |
| 270 | Assert::invariant( $index !== 0, |
| 271 | "fake single-char matches should always exist" ); |
| 272 | |
| 273 | $n = $this->fst->newState(); |
| 274 | $noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n ); |
| 275 | if ( $index < $this->maxLookahead ) { |
| 276 | $this->addFdEdge( $from, $n, 'R', $index, $c ); |
| 277 | } |
| 278 | Assert::invariant( |
| 279 | strlen( $seen ) < $this->maxLookahead, |
| 280 | "Max lookahead should always account for seen" |
| 281 | ); |
| 282 | $this->addFdEdge( $n, $noMatchState, 'P', strlen( $seen ), $c ); |
| 283 | } |
| 284 | } |
| 285 | // our first state must echo all continuation characters, since |
| 286 | // the anythingState transitions there and we don't know what |
| 287 | // utf8 state IDENTITY will leave us in. (The characters not in |
| 288 | // our alphabet could consist of 1-/2-/3-/4-byte sequences.) |
| 289 | if ( $index === 0 ) { |
| 290 | foreach ( self::utf8alphabet( 1/*continuation chars*/ ) as $c ) { |
| 291 | $nextSeen = $seen . chr( $c ); |
| 292 | $n = $this->fst->newState(); |
| 293 | $noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n ); |
| 294 | if ( $index < $this->maxLookahead ) { |
| 295 | $this->addFdEdge( $from, $n, 'R', $index, $c ); |
| 296 | } |
| 297 | $fakeTree = []; |
| 298 | $fakeTree[self::UTF8STATE] = 1; |
| 299 | $fakeTree[self::INDEX] = $index + 1; |
| 300 | $fakeTree[self::END_OF_STRING] = chr( $c ); |
| 301 | $this->buildState( $n, $fakeTree, $lastMatch, $nextSeen ); |
| 302 | } |
| 303 | } |
| 304 | // "anything else" |
| 305 | $anythingElse = $this->fst->newState(); |
| 306 | $noBufState->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $anythingElse ); |
| 307 | if ( $index !== 0 ) { |
| 308 | $this->addFdEdge( $from, $anythingElse, 'R', $index, -1 ); |
| 309 | } |
| 310 | $this->addFdEdge( $anythingElse, $noMatchState, 'P', strlen( $seen ), -1 ); |
| 311 | // Emit the last match |
| 312 | if ( $lastMatch !== null ) { |
| 313 | $noMatchState = $this->emit( $noMatchState, MutableFST::EPSILON, $lastMatch ); |
| 314 | } |
| 315 | // Shift over queued input |
| 316 | for ( $i = $index + 1; true; $i++ ) { |
| 317 | $j = strlen( $seen ) + ( $i - $index ); |
| 318 | $key = $i . ':' . $j; |
| 319 | if ( isset( $this->shiftCache[$key] ) ) { |
| 320 | $noMatchState->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $this->shiftCache[$key] ); |
| 321 | return; |
| 322 | } |
| 323 | $this->shiftCache[$key] = $noMatchState; |
| 324 | if ( $i < $this->maxLookahead ) { |
| 325 | $n = $this->fst->newState(); |
| 326 | $this->addFdEdge( $noMatchState, $n, 'D', $i ); |
| 327 | } else { |
| 328 | $n = $noMatchState; |
| 329 | } |
| 330 | for ( $k = $j; $k < $i && $k < $this->maxLookahead; $k++ ) { |
| 331 | $nn = $this->fst->newState(); |
| 332 | $this->addFdEdge( $n, $nn, 'C', $k ); |
| 333 | $n = $nn; |
| 334 | } |
| 335 | $n->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $this->fst->getStartState() ); |
| 336 | if ( !( $i < $this->maxLookahead ) ) { |
| 337 | break; |
| 338 | } |
| 339 | $n = $this->fst->newState(); |
| 340 | $this->addFdEdgePair( $noMatchState, $n, 'R', $i, -1, 'P', $j, -1 ); |
| 341 | foreach ( $this->alphabet as $c ) { |
| 342 | $this->addFdEdgePair( $noMatchState, $n, 'R', $i, $c, 'P', $j, $c ); |
| 343 | } |
| 344 | $noMatchState = $n; |
| 345 | } |
| 346 | } |
| 347 | |
| 348 | /** |
| 349 | * Chain states together from $fromState to emit $emitStr. |
| 350 | * @param State $fromState |
| 351 | * @param string $fromChar |
| 352 | * @param string $emitStr |
| 353 | * @return State the resulting state (after the string has been emitted) |
| 354 | */ |
| 355 | private function emit( State $fromState, string $fromChar, string $emitStr ): State { |
| 356 | if ( strlen( $emitStr ) === 0 && $fromChar !== MutableFST::EPSILON ) { |
| 357 | $n = $this->fst->newState(); |
| 358 | $fromState->addEdge( $fromChar, MutableFST::EPSILON, $n ); |
| 359 | return $n; |
| 360 | } |
| 361 | for ( $i = 0; $i < strlen( $emitStr ); $i++ ) { |
| 362 | $c = ord( $emitStr[$i] ); |
| 363 | $n = $this->fst->newState(); |
| 364 | $fromState->addEdge( $fromChar, self::byteToHex( $c ), $n ); |
| 365 | $fromState = $n; |
| 366 | $fromChar = MutableFST::EPSILON; |
| 367 | } |
| 368 | return $fromState; |
| 369 | } |
| 370 | |
| 371 | /** |
| 372 | * Private helper function: convert a numeric byte to the string |
| 373 | * token we use in the FST. |
| 374 | * @param int $byte |
| 375 | * @return string Token |
| 376 | */ |
| 377 | private static function byteToHex( int $byte ): string { |
| 378 | $s = strtoupper( dechex( $byte ) ); |
| 379 | while ( strlen( $s ) < 2 ) { |
| 380 | $s = "0$s"; |
| 381 | } |
| 382 | return $s; |
| 383 | } |
| 384 | |
| 385 | /** |
| 386 | * Private helper function: convert a UTF-8 string byte-by-byte into |
| 387 | * an array of tokens. |
| 388 | * @param string $s |
| 389 | * @return string[] |
| 390 | */ |
| 391 | private static function stringToTokens( string $s ): array { |
| 392 | $toks = []; |
| 393 | for ( $i = 0; $i < strlen( $s ); $i++ ) { |
| 394 | $toks[] = self::byteToHex( ord( $s[$i] ) ); |
| 395 | } |
| 396 | return $toks; |
| 397 | } |
| 398 | |
| 399 | /** |
| 400 | * Private helper function: convert an array of tokens into |
| 401 | * a UTF-8 string. |
| 402 | * @param string[] $toks |
| 403 | * @return string |
| 404 | */ |
| 405 | private static function tokensToString( array $toks ): string { |
| 406 | $s = ''; |
| 407 | foreach ( $toks as $token ) { |
| 408 | if ( strlen( $token ) === 2 ) { |
| 409 | $s .= chr( hexdec( $token ) ); |
| 410 | } else { |
| 411 | // shouldn't happen, but handy for debugging if it does |
| 412 | $s .= $token; |
| 413 | } |
| 414 | } |
| 415 | return $s; |
| 416 | } |
| 417 | |
| 418 | /** |
| 419 | * For testing: apply the resulting FST to the given input string. |
| 420 | * @param string $input |
| 421 | * @return string[] The possible outputs. |
| 422 | */ |
| 423 | public function applyDown( string $input ): array { |
| 424 | // convert input to byte tokens |
| 425 | $result = $this->fst->applyDown( self::stringToTokens( $input ) ); |
| 426 | return array_map( function ( $toks ) { |
| 427 | return self::tokensToString( $toks ); |
| 428 | }, $result ); |
| 429 | } |
| 430 | |
| 431 | /** |
| 432 | * For testing: run the resulting FST "in reverse" against the given |
| 433 | * input string. |
| 434 | * @param string $input |
| 435 | * @return string[] The possible outputs. |
| 436 | */ |
| 437 | public function applyUp( string $input ): array { |
| 438 | // convert input to byte tokens |
| 439 | $result = $this->fst->applyUp( self::stringToTokens( $input ) ); |
| 440 | return array_map( function ( $toks ) { |
| 441 | return self::tokensToString( $toks ); |
| 442 | }, $result ); |
| 443 | } |
| 444 | |
| 445 | /** |
| 446 | * Convert the given $replacementTable (strtr-style) to an FST. |
| 447 | * @param string $name |
| 448 | * @param array<string,string> $replacementTable |
| 449 | * @param string $fdPrefix Flag diacritic feature prefix, for uniqueness |
| 450 | */ |
| 451 | public function __construct( |
| 452 | string $name, array $replacementTable, string $fdPrefix = '' |
| 453 | ) { |
| 454 | $this->fdPrefix = $fdPrefix; |
| 455 | $alphabet = []; |
| 456 | $longestWord = 0; |
| 457 | foreach ( $replacementTable as $from => $to ) { |
| 458 | $longestWord = max( $longestWord, strlen( $from ) ); |
| 459 | self::addAlphabet( $alphabet, $from ); |
| 460 | self::addAlphabet( $alphabet, $to ); |
| 461 | self::addEntry( $this->prefixTree, $from, 0, 0, $to ); |
| 462 | } |
| 463 | // fake one character matches! |
| 464 | foreach ( $alphabet as $sym => $value ) { |
| 465 | if ( $sym < 0x80 || $sym > 0xBF ) { // not continuation chars |
| 466 | self::addEntry( $this->prefixTree, chr( $sym ), 0, 0, chr( $sym ), true ); |
| 467 | } |
| 468 | } |
| 469 | $this->maxLookahead = $longestWord - 1; // XXX could be shorter |
| 470 | $this->alphabet = array_keys( $alphabet ); |
| 471 | sort( $this->alphabet, SORT_NUMERIC ); |
| 472 | // ok, now we're ready to emit the FST |
| 473 | $this->fst = new MutableFST( array_map( function ( $n ) { |
| 474 | return self::byteToHex( $n ); |
| 475 | }, $this->alphabet ) ); |
| 476 | $anythingState = $this->fst->newState(); |
| 477 | $anything2State = $this->fst->newState(); |
| 478 | $anythingState->addEdge( |
| 479 | MutableFST::UNKNOWN, MutableFST::IDENTITY, |
| 480 | $anything2State |
| 481 | ); |
| 482 | $this->addFdEdge( $this->fst->getStartState(), $anythingState, 'R', 0, -1 ); |
| 483 | $this->addFdEdge( $anything2State, $this->fst->getStartState(), 'C', 0 ); |
| 484 | // The anything state could also be the end of the string |
| 485 | // (which for modelling purposes we can think of as a special |
| 486 | // "EOF" token not in the alphabet) |
| 487 | // Important that there are no outgoing edges from the $endState! |
| 488 | $endState = $this->fst->newState(); |
| 489 | $endState->isFinal = true; |
| 490 | $anythingState->addEdge( |
| 491 | MutableFST::EPSILON, MutableFST::EPSILON, |
| 492 | $endState |
| 493 | ); |
| 494 | |
| 495 | // Create states corresponding to prefix tree nodes |
| 496 | $this->buildState( |
| 497 | $this->fst->getStartState(), $this->prefixTree, null, '' |
| 498 | ); |
| 499 | // ok, done! |
| 500 | $this->fst->optimize(); |
| 501 | } |
| 502 | |
| 503 | /** |
| 504 | * Write the FST to the given file handle in AT&T format. |
| 505 | * @param resource $handle |
| 506 | */ |
| 507 | public function writeATT( $handle ): void { |
| 508 | $this->fst->writeATT( $handle ); |
| 509 | } |
| 510 | |
| 511 | } |