Code Coverage for /src/src/Construct/GenReplFst.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	20.48% covered (danger)	20.48%	43 / 210	17.65% covered (danger)	17.65%	3 / 17	CRAP	0.00% covered (danger)	0.00%	0 / 1
GenReplFst	20.48% covered (danger)	20.48%	43 / 210	17.65% covered (danger)	17.65%	3 / 17	2393.46	0.00% covered (danger)	0.00%	0 / 1
addAlphabet	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	6
addEntry	0.00% covered (danger)	0.00%	0 / 24	0.00% covered (danger)	0.00%	0 / 1	72
nextUtf8State	0.00% covered (danger)	0.00%	0 / 14	0.00% covered (danger)	0.00%	0 / 1	42
utf8alphabet	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	42
splitFirstUtf8Char	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	12
flagDiacritic	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	12
addFdEdge	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
addFdEdgePair	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
buildState	0.00% covered (danger)	0.00%	0 / 74	0.00% covered (danger)	0.00%	0 / 1	380
emit	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	20
byteToHex	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
stringToTokens	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
tokensToString	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	12
applyDown	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
applyUp	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
__construct	100.00% covered (success)	100.00%	35 / 35	100.00% covered (success)	100.00%	1 / 1	5
writeATT	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2
3	namespace Wikimedia\LangConv\Construct;
4
5	use Wikimedia\Assert\Assert;
6
7	/**
8	* GENerate a REPLacement string FST.
9	*
10	* Create an FST from a replacement string array (aka, as would be provided
11	* to `str_tr` or `ReplacementArray` in mediawiki core).
12	*/
13	class GenReplFst {
14	// This can be anything, as long as it is longer than 1 character
15	// (so it doesn't conflict w/ any of the single-character keys)
16	private const END_OF_STRING = 'END';
17	// correlation of tree nodes to state machine states
18	private const STATE = 'STATE';
19	// UTF-8 decode state: 0=first byte, 1/2/3=bytes remaining in char
20	private const UTF8STATE = 'UTF8STATE';
21	// Character index of this tree node
22	private const INDEX = 'INDEX';
23
24	/** @var array<int\|string,int\|string\|array> */
25	private $prefixTree = [];
26	/** @var MutableFST */
27	private $fst;
28	/** @var array */
29	private $alphabet;
30	/**
31	* Prefix to use on flag diacritics (so they are unique to this FST).
32	* @var string
33	*/
34	private $fdPrefix;
35	/**
36	* How many flag diacritic features are needed. At most one less than
37	* the longest string, but can be shorter.
38	* @var int
39	*/
40	private $maxLookahead;
41	/**
42	* Cache of shift machines.
43	* @var array
44	*/
45	private $shiftCache = [];
46
47	/**
48	* Add each letter in the given word to our alphabet.
49	* @param array &$alphabet
50	* @param string $word
51	*/
52	private static function addAlphabet( array &$alphabet, string $word ): void {
53	for ( $i = 0; $i < strlen( $word ); $i++ ) {
54	$alphabet[ord( $word[$i] )] = true;
55	}
56	}
57
58	/**
59	* Add the given string (the substring of $from starting from $index) to
60	* the prefix tree $tree, along with the conversion output $to.
61	* @param array<int\|string,string\|array> &$tree
62	* @param string $from
63	* @param int $index
64	* @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char
65	* @param string $to
66	* @param bool $suppressUtf8Checks Whether to suppress the checks that the
67	* $from string is complete utf8.
68	*/
69	private static function addEntry(
70	array &$tree, string $from, int $index, int $utf8state, string $to,
71	bool $suppressUtf8Checks = false
72	): void {
73	$c = ord( $from[$index] );
74	if ( !isset( $tree[$c] ) ) {
75	$tree[$c] = [];
76	}
77	if ( !isset( $tree[self::UTF8STATE] ) ) {
78	$tree[self::UTF8STATE] = $utf8state;
79	}
80	Assert::invariant(
81	isset( $tree[self::UTF8STATE] ) && $tree[self::UTF8STATE] === $utf8state,
82	"Should never happen"
83	);
84	if ( !isset( $tree[self::INDEX] ) ) {
85	$tree[self::INDEX] = $index;
86	}
87	Assert::invariant(
88	isset( $tree[self::INDEX] ) && $tree[self::INDEX] === $index,
89	"Should never happen"
90	);
91	$nextUtf8State = self::nextUtf8State( $utf8state, $c );
92	$nextIndex = $index + 1;
93	if ( $nextIndex < strlen( $from ) ) {
94	self::addEntry( $tree[$c], $from, $nextIndex, $nextUtf8State, $to );
95	} else {
96	if ( !$suppressUtf8Checks ) {
97	Assert::invariant( $nextUtf8State === 0, "Bad UTF-8 in input" );
98	}
99	$tree[$c][self::UTF8STATE] = $nextUtf8State;
100	$tree[$c][self::INDEX] = $nextIndex;
101	$tree[$c][self::END_OF_STRING] = $to;
102	}
103	}
104
105	/**
106	* Return the next UTF-8 state, given the current state and the
107	* current character.
108	* @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char
109	* @param int $c The current character
110	* @return int The next UTF-8 state
111	*/
112	private static function nextUtf8State( int $utf8state, int $c ): int {
113	if ( $utf8state === 0 ) {
114	if ( $c <= 0x7F ) {
115	return 0;
116	} elseif ( $c <= 0xDF ) {
117	Assert::invariant( $c >= 0xC0, "Bad UTF-8 in input" );
118	return 1;
119	} elseif ( $c <= 0xEF ) {
120	Assert::invariant( $c >= 0xE0, "Bad UTF-8 in input" );
121	return 2;
122	} elseif ( $c <= 0xF7 ) {
123	Assert::invariant( $c >= 0xF0, "Bad UTF-8 in input" );
124	return 3;
125	}
126	} else {
127	return $utf8state - 1;
128	}
129	// @phan-suppress-next-line PhanImpossibleCondition
130	Assert::invariant( false, "Bad UTF-8 in input" );
131	}
132
133	/**
134	* Return the subset of the given alphabet appropriate for the current
135	* UTF-8 state.
136	* @param int $utf8state 0=first byte, 1/2/3 = bytes remaining in char
137	* @return \Generator<int>
138	*/
139	private function utf8alphabet( int $utf8state ) {
140	foreach ( $this->alphabet as $c ) {
141	if ( $c >= 0x80 && $c <= 0xBF ) {
142	// UTF-8 continuation character
143	if ( $utf8state !== 0 ) {
144	yield $c;
145	}
146	} else {
147	if ( $utf8state === 0 ) {
148	yield $c;
149	}
150	}
151	}
152	}
153
154	/**
155	* Split a given string into the first utf8 char, and "everything else".
156	* @param string $s
157	* @return string[]
158	*/
159	private static function splitFirstUtf8Char( string $s ) {
160	$utf8state = 0;
161	$i = 0;
162	while ( $i < strlen( $s ) ) {
163	$c = ord( $s[$i] );
164	$utf8state = self::nextUtf8State( $utf8state, $c );
165	$i += 1;
166	if ( $utf8state === 0 ) {
167	break;
168	}
169	}
170	return [ substr( $s, 0, $i ), substr( $s, $i ) ];
171	}
172
173	/**
174	* Return the name of a flag diacritic for matching character at a
175	* specified offset.
176	* @param string $type Flag diacritic operator: P/R/D/C/U
177	* @param int $offset Identifies the feature
178	* @param int\|null $value The value to set/test (optional)
179	* @return string The symbol name for the given flag diacritic operation
180	*/
181	private function flagDiacritic(
182	string $type, int $offset, ?int $value = null
183	): string {
184	$str = "@$type." . $this->fdPrefix . "char$offset";
185	if ( $value !== null ) {
186	$str .= ".";
187	if ( $value >= 0 ) {
188	$str .= self::byteToHex( $value );
189	} else {
190	$str .= "UNK";
191	}
192	}
193	return $str . "@";
194	}
195
196	/**
197	* Add a flag diacritic edge between $from and $to. By convention the
198	* flag diacritic is on both the upper and lower slots of the edge.
199	* @param State $from The source of the new edge
200	* @param State $to The destination of the new edge
201	* @param string $type The flag diacritic operator
202	* @param int $offset The flag diacritic feature
203	* @param int\|null $value The flag diacritic value (optional)
204	*/
205	private function addFdEdge(
206	State $from, State $to, string $type, int $offset, ?int $value = null
207	): void {
208	$fdName = $this->flagDiacritic( $type, $offset, $value );
209	$from->addEdge( $fdName, $fdName, $to );
210	}
211
212	/**
213	* Add two flag diacritic edge between $from and $to. By convention the
214	* flag diacritics are on both the upper and lower slots of the edge.
215	* @param State $from The source of the new edges
216	* @param State $to The destination of the new edges
217	* @param string $type1 The first flag diacritic operator
218	* @param int $offset1 The first flag diacritic feature
219	* @param int\|null $value1 The first flag diacritic value (optional)
220	* @param string $type2 The second flag diacritic operator
221	* @param int $offset2 The second flag diacritic feature
222	* @param int\|null $value2 The second flag diacritic value (optional)
223	*/
224	private function addFdEdgePair(
225	State $from, State $to,
226	string $type1, int $offset1, ?int $value1,
227	string $type2, int $offset2, ?int $value2
228	): void {
229	$n = $this->fst->newState();
230	$this->addFdEdge( $from, $n, $type1, $offset1, $value1 );
231	$this->addFdEdge( $n, $to, $type2, $offset2, $value2 );
232	}
233
234	/**
235	* Add edges from state $from corresponding to the prefix tree $tree,
236	* given the $lastMatch and the characters seen since then, $seen.
237	* @param State $from
238	* @param array &$tree
239	* @param ?string $lastMatch (null if we haven't seen a match)
240	* @param string $seen characters seen since last match
241	*/
242	private function buildState( State $from, array &$tree, ?string $lastMatch, string $seen ): void {
243	$tree[self::STATE] = $from;
244	$index = $tree[self::INDEX];
245	$utf8state = $tree[self::UTF8STATE];
246	if ( $index < $this->maxLookahead ) {
247	$noBufState = $this->fst->newState();
248	$this->addFdEdge( $from, $noBufState, 'D', $index );
249	} else {
250	$noBufState = $from;
251	}
252	$noMatchState = $this->fst->newState();
253
254	if ( isset( $tree[self::END_OF_STRING] ) ) {
255	$lastMatch = $tree[self::END_OF_STRING];
256	$seen = '';
257	}
258	foreach ( $this->utf8alphabet( $utf8state ) as $c ) {
259	if ( isset( $tree[$c] ) ) {
260	$nextSeen = $seen . chr( $c );
261	$n = $this->fst->newState();
262	$noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n );
263	if ( $index < $this->maxLookahead ) {
264	$this->addFdEdge( $from, $n, 'R', $index, $c );
265	}
266	$nn = $this->fst->newState();
267	$this->addFdEdge( $n, $nn, 'P', strlen( $seen ), $c );
268	$this->buildState( $nn, $tree[$c], $lastMatch, $nextSeen );
269	} else {
270	Assert::invariant( $index !== 0,
271	"fake single-char matches should always exist" );
272
273	$n = $this->fst->newState();
274	$noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n );
275	if ( $index < $this->maxLookahead ) {
276	$this->addFdEdge( $from, $n, 'R', $index, $c );
277	}
278	Assert::invariant(
279	strlen( $seen ) < $this->maxLookahead,
280	"Max lookahead should always account for seen"
281	);
282	$this->addFdEdge( $n, $noMatchState, 'P', strlen( $seen ), $c );
283	}
284	}
285	// our first state must echo all continuation characters, since
286	// the anythingState transitions there and we don't know what
287	// utf8 state IDENTITY will leave us in. (The characters not in
288	// our alphabet could consist of 1-/2-/3-/4-byte sequences.)
289	if ( $index === 0 ) {
290	foreach ( self::utf8alphabet( 1/continuation chars/ ) as $c ) {
291	$nextSeen = $seen . chr( $c );
292	$n = $this->fst->newState();
293	$noBufState->addEdge( self::byteToHex( $c ), MutableFST::EPSILON, $n );
294	if ( $index < $this->maxLookahead ) {
295	$this->addFdEdge( $from, $n, 'R', $index, $c );
296	}
297	$fakeTree = [];
298	$fakeTree[self::UTF8STATE] = 1;
299	$fakeTree[self::INDEX] = $index + 1;
300	$fakeTree[self::END_OF_STRING] = chr( $c );
301	$this->buildState( $n, $fakeTree, $lastMatch, $nextSeen );
302	}
303	}
304	// "anything else"
305	$anythingElse = $this->fst->newState();
306	$noBufState->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $anythingElse );
307	if ( $index !== 0 ) {
308	$this->addFdEdge( $from, $anythingElse, 'R', $index, -1 );
309	}
310	$this->addFdEdge( $anythingElse, $noMatchState, 'P', strlen( $seen ), -1 );
311	// Emit the last match
312	if ( $lastMatch !== null ) {
313	$noMatchState = $this->emit( $noMatchState, MutableFST::EPSILON, $lastMatch );
314	}
315	// Shift over queued input
316	for ( $i = $index + 1; true; $i++ ) {
317	$j = strlen( $seen ) + ( $i - $index );
318	$key = $i . ':' . $j;
319	if ( isset( $this->shiftCache[$key] ) ) {
320	$noMatchState->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $this->shiftCache[$key] );
321	return;
322	}
323	$this->shiftCache[$key] = $noMatchState;
324	if ( $i < $this->maxLookahead ) {
325	$n = $this->fst->newState();
326	$this->addFdEdge( $noMatchState, $n, 'D', $i );
327	} else {
328	$n = $noMatchState;
329	}
330	for ( $k = $j; $k < $i && $k < $this->maxLookahead; $k++ ) {
331	$nn = $this->fst->newState();
332	$this->addFdEdge( $n, $nn, 'C', $k );
333	$n = $nn;
334	}
335	$n->addEdge( MutableFST::EPSILON, MutableFST::EPSILON, $this->fst->getStartState() );
336	if ( !( $i < $this->maxLookahead ) ) {
337	break;
338	}
339	$n = $this->fst->newState();
340	$this->addFdEdgePair( $noMatchState, $n, 'R', $i, -1, 'P', $j, -1 );
341	foreach ( $this->alphabet as $c ) {
342	$this->addFdEdgePair( $noMatchState, $n, 'R', $i, $c, 'P', $j, $c );
343	}
344	$noMatchState = $n;
345	}
346	}
347
348	/**
349	* Chain states together from $fromState to emit $emitStr.
350	* @param State $fromState
351	* @param string $fromChar
352	* @param string $emitStr
353	* @return State the resulting state (after the string has been emitted)
354	*/
355	private function emit( State $fromState, string $fromChar, string $emitStr ): State {
356	if ( strlen( $emitStr ) === 0 && $fromChar !== MutableFST::EPSILON ) {
357	$n = $this->fst->newState();
358	$fromState->addEdge( $fromChar, MutableFST::EPSILON, $n );
359	return $n;
360	}
361	for ( $i = 0; $i < strlen( $emitStr ); $i++ ) {
362	$c = ord( $emitStr[$i] );
363	$n = $this->fst->newState();
364	$fromState->addEdge( $fromChar, self::byteToHex( $c ), $n );
365	$fromState = $n;
366	$fromChar = MutableFST::EPSILON;
367	}
368	return $fromState;
369	}
370
371	/**
372	* Private helper function: convert a numeric byte to the string
373	* token we use in the FST.
374	* @param int $byte
375	* @return string Token
376	*/
377	private static function byteToHex( int $byte ): string {
378	$s = strtoupper( dechex( $byte ) );
379	while ( strlen( $s ) < 2 ) {
380	$s = "0$s";
381	}
382	return $s;
383	}
384
385	/**
386	* Private helper function: convert a UTF-8 string byte-by-byte into
387	* an array of tokens.
388	* @param string $s
389	* @return string[]
390	*/
391	private static function stringToTokens( string $s ): array {
392	$toks = [];
393	for ( $i = 0; $i < strlen( $s ); $i++ ) {
394	$toks[] = self::byteToHex( ord( $s[$i] ) );
395	}
396	return $toks;
397	}
398
399	/**
400	* Private helper function: convert an array of tokens into
401	* a UTF-8 string.
402	* @param string[] $toks
403	* @return string
404	*/
405	private static function tokensToString( array $toks ): string {
406	$s = '';
407	foreach ( $toks as $token ) {
408	if ( strlen( $token ) === 2 ) {
409	$s .= chr( hexdec( $token ) );
410	} else {
411	// shouldn't happen, but handy for debugging if it does
412	$s .= $token;
413	}
414	}
415	return $s;
416	}
417
418	/**
419	* For testing: apply the resulting FST to the given input string.
420	* @param string $input
421	* @return string[] The possible outputs.
422	*/
423	public function applyDown( string $input ): array {
424	// convert input to byte tokens
425	$result = $this->fst->applyDown( self::stringToTokens( $input ) );
426	return array_map( function ( $toks ) {
427	return self::tokensToString( $toks );
428	}, $result );
429	}
430
431	/**
432	* For testing: run the resulting FST "in reverse" against the given
433	* input string.
434	* @param string $input
435	* @return string[] The possible outputs.
436	*/
437	public function applyUp( string $input ): array {
438	// convert input to byte tokens
439	$result = $this->fst->applyUp( self::stringToTokens( $input ) );
440	return array_map( function ( $toks ) {
441	return self::tokensToString( $toks );
442	}, $result );
443	}
444
445	/**
446	* Convert the given $replacementTable (strtr-style) to an FST.
447	* @param string $name
448	* @param array<string,string> $replacementTable
449	* @param string $fdPrefix Flag diacritic feature prefix, for uniqueness
450	*/
451	public function __construct(
452	string $name, array $replacementTable, string $fdPrefix = ''
453	) {
454	$this->fdPrefix = $fdPrefix;
455	$alphabet = [];
456	$longestWord = 0;
457	foreach ( $replacementTable as $from => $to ) {
458	$longestWord = max( $longestWord, strlen( $from ) );
459	self::addAlphabet( $alphabet, $from );
460	self::addAlphabet( $alphabet, $to );
461	self::addEntry( $this->prefixTree, $from, 0, 0, $to );
462	}
463	// fake one character matches!
464	foreach ( $alphabet as $sym => $value ) {
465	if ( $sym < 0x80 \|\| $sym > 0xBF ) { // not continuation chars
466	self::addEntry( $this->prefixTree, chr( $sym ), 0, 0, chr( $sym ), true );
467	}
468	}
469	$this->maxLookahead = $longestWord - 1; // XXX could be shorter
470	$this->alphabet = array_keys( $alphabet );
471	sort( $this->alphabet, SORT_NUMERIC );
472	// ok, now we're ready to emit the FST
473	$this->fst = new MutableFST( array_map( function ( $n ) {
474	return self::byteToHex( $n );
475	}, $this->alphabet ) );
476	$anythingState = $this->fst->newState();
477	$anything2State = $this->fst->newState();
478	$anythingState->addEdge(
479	MutableFST::UNKNOWN, MutableFST::IDENTITY,
480	$anything2State
481	);
482	$this->addFdEdge( $this->fst->getStartState(), $anythingState, 'R', 0, -1 );
483	$this->addFdEdge( $anything2State, $this->fst->getStartState(), 'C', 0 );
484	// The anything state could also be the end of the string
485	// (which for modelling purposes we can think of as a special
486	// "EOF" token not in the alphabet)
487	// Important that there are no outgoing edges from the $endState!
488	$endState = $this->fst->newState();
489	$endState->isFinal = true;
490	$anythingState->addEdge(
491	MutableFST::EPSILON, MutableFST::EPSILON,
492	$endState
493	);
494
495	// Create states corresponding to prefix tree nodes
496	$this->buildState(
497	$this->fst->getStartState(), $this->prefixTree, null, ''
498	);
499	// ok, done!
500	$this->fst->optimize();
501	}
502
503	/**
504	* Write the FST to the given file handle in AT&T format.
505	* @param resource $handle
506	*/
507	public function writeATT( $handle ): void {
508	$this->fst->writeATT( $handle );
509	}
510
511	}