Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 123 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
FST | |
0.00% |
0 / 123 |
|
0.00% |
0 / 6 |
1332 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
split | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
readUnsignedV | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
readSignedV | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
run | |
0.00% |
0 / 88 |
|
0.00% |
0 / 1 |
812 | |||
compile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikimedia\LangConv; |
4 | |
5 | use Wikimedia\Assert\Assert; |
6 | |
7 | /** |
8 | * Load and execute a finite-state transducer (FST) based converter or |
9 | * bracketing machine from a compact string description. |
10 | */ |
11 | class FST { |
12 | private const MAGIC_BYTES = 8; // 8 byte header w/ magic bytes |
13 | |
14 | // These pseudo-characters appear in the "output" side of the FST only. |
15 | private const BYTE_IDENTITY = 0xFF; |
16 | private const BYTE_RBRACKET = 0xFE; |
17 | private const BYTE_LBRACKET = 0xFD; |
18 | private const BYTE_FAIL = 0xFC; |
19 | // These pseudo-characters appear in the "input" side of the FST. |
20 | private const BYTE_EOF = 0xF8; // The highest possible input char |
21 | private const BYTE_EPSILON = 0x00; // Always appears first in sorted order |
22 | |
23 | /** |
24 | * Name of pFST file (for debugging and error messages). |
25 | * @var string |
26 | */ |
27 | private $name; |
28 | |
29 | /** |
30 | * FST, packed as a string. |
31 | * @var string |
32 | */ |
33 | private $pfst; |
34 | |
35 | /** |
36 | * @var bool |
37 | */ |
38 | private $justBrackets; |
39 | |
40 | /** |
41 | * @param string $name |
42 | * @param string $pfst |
43 | * @param bool $justBrackets |
44 | */ |
45 | private function __construct( string $name, string $pfst, bool $justBrackets ) { |
46 | $this->name = $name; |
47 | $this->pfst = $pfst; |
48 | $this->justBrackets = $justBrackets; |
49 | Assert::precondition( |
50 | strlen( $pfst ) >= self::MAGIC_BYTES + 2, /*states, min*/ |
51 | "pFST file too short: $name" |
52 | ); |
53 | Assert::precondition( |
54 | "pFST\0WM\0" === |
55 | substr( $pfst, 0, self::MAGIC_BYTES ), |
56 | "Invalid pFST file: $name" |
57 | ); |
58 | } |
59 | |
60 | /** |
61 | * @param string $input |
62 | * @param int|null $start |
63 | * @param int|null $end |
64 | * @return array |
65 | */ |
66 | public function split( string $input, ?int $start = null, ?int $end = null ): array { |
67 | // Debugging helper: instead of an array of positions, split the |
68 | // input at the bracket locations and return an array of strings. |
69 | Assert::precondition( $this->justBrackets, |
70 | "Needs a bracket machine: " . $this->name ); |
71 | $end = $end ?? strlen( $input ); |
72 | $r = $this->run( $input, $start, $end ); |
73 | $r[] = $end; |
74 | $i = 0; |
75 | $nr = []; |
76 | foreach ( $r as $j ) { |
77 | $nr[] = substr( $input, $i, $j ); |
78 | $i = $j; |
79 | } |
80 | return $nr; |
81 | } |
82 | |
83 | /** |
84 | * Read zig-zag encoded variable length integers |
85 | * (See [[:en:Variable-length_quantity#Zigzag_encoding]]) |
86 | * @param int &$state |
87 | * @return int |
88 | */ |
89 | private function readUnsignedV( int &$state ): int { |
90 | $b = ord( $this->pfst[$state++] ); |
91 | $val = $b & 127; |
92 | while ( $b & 128 ) { |
93 | $val += 1; |
94 | $b = ord( $this->pfst[$state++] ); |
95 | $val = ( $val << 7 ) + ( $b & 127 ); |
96 | } |
97 | return $val; |
98 | } |
99 | |
100 | /** |
101 | * @param int &$state |
102 | * @return int |
103 | */ |
104 | private function readSignedV( int &$state ): int { |
105 | $v = $this->readUnsignedV( $state ); |
106 | if ( $v & 1 ) { // sign bit is in LSB |
107 | return -( $v >> 1 ) - 1; |
108 | } else { |
109 | return $v >> 1; |
110 | } |
111 | } |
112 | |
113 | /** |
114 | * @param string $input |
115 | * @param int|null $start |
116 | * @param int|null $end |
117 | * @param bool $unicode |
118 | * @return string|array |
119 | */ |
120 | public function run( string $input, ?int $start = null, ?int $end = null, bool $unicode = false ) { |
121 | $start = $start ?? 0; |
122 | $end = $end ?? strlen( $input ); |
123 | $countCodePoints = $this->justBrackets && $unicode; |
124 | $initialState = self::MAGIC_BYTES + 2; /* eof state */ |
125 | $state = $initialState; |
126 | $idx = $start; |
127 | $outpos = 0; |
128 | $stack = [ new BacktrackState( 0, 0, 0, 0 ) ]; |
129 | $result = $stack[0]; |
130 | $result->partialBrackets[] = 0; |
131 | $epsSkip = 0; |
132 | |
133 | // This runs the machine until we reach the EOF state |
134 | while ( $state >= $initialState ) { |
135 | if ( $state === $initialState && count( $stack ) > 1 ) { |
136 | // Memory efficiency: since the machine is universal we know |
137 | // we'll never fail as long as we're in the initial state. |
138 | $result = $stack[0]; |
139 | foreach ( array_splice( $stack, 1 ) as $s ) { |
140 | if ( $this->justBrackets ) { |
141 | foreach ( $s->partialBrackets as $b ) { |
142 | $result->partialBrackets[] = $b; |
143 | } |
144 | } else { |
145 | $result->partialResult .= $s->partialResult; |
146 | } |
147 | } |
148 | } |
149 | $saveState = $state; |
150 | $edgeWidth = $this->readUnsignedV( $state ); |
151 | $nEdges = $this->readUnsignedV( $state ); |
152 | Assert::invariant( $nEdges > 0, $this->name ); |
153 | $saveEdges = $nEdges; |
154 | // Read first edge to see if there are any epsilon edges |
155 | $edge0 = $state; |
156 | if ( $epsSkip > 0 ) { |
157 | $edge0 += ( $epsSkip * $edgeWidth ); |
158 | $nEdges -= $epsSkip; |
159 | $epsSkip = 0; |
160 | } |
161 | if ( ord( $this->pfst[$edge0] ) === self::BYTE_EPSILON ) { |
162 | // If this is an epsilon edge, take it immediately! |
163 | // But save a backtrack state since this non-deterministic |
164 | // edge may fail. If it does, we'll restart at the next |
165 | // edge in this state. |
166 | if ( $nEdges > 1 ) { |
167 | $result = new BacktrackState( |
168 | $saveState, |
169 | ( $saveEdges - $nEdges ) + 1, |
170 | $outpos, |
171 | $idx ); |
172 | $stack[] = $result; |
173 | } |
174 | $targetEdge = $edge0; |
175 | $outByte = ord( $this->pfst[$edge0 + 1] ); |
176 | $c = self::BYTE_EPSILON; |
177 | } else { |
178 | // Binary search for an edge matching c |
179 | $c = $idx < $end ? ord( $input[$idx++] ) : /* pseudo-character: */ self::BYTE_EOF; |
180 | $minIndex = 0; |
181 | $maxIndex = $nEdges; |
182 | while ( $minIndex !== $maxIndex ) { |
183 | $currentIndex = ( $minIndex + $maxIndex ) >> 1; |
184 | $targetEdge = $edge0 + ( $edgeWidth * $currentIndex ); |
185 | $inByte = ord( $this->pfst[$targetEdge] ); |
186 | if ( $inByte <= $c ) { |
187 | $minIndex = $currentIndex + 1; |
188 | } else { |
189 | $maxIndex = $currentIndex; |
190 | } |
191 | } |
192 | // (minIndex-1).inByte <= c, and maxIndex.inByte > c |
193 | $targetEdge = $edge0 + ( $edgeWidth * ( $minIndex - 1 ) ); |
194 | $outByte = $minIndex > 0 ? ord( $this->pfst[$targetEdge + 1] ) : self::BYTE_FAIL; |
195 | } |
196 | if ( $outByte === self::BYTE_FAIL ) { |
197 | // FAIL! Pop an element off the stack and reset our state. |
198 | Assert::invariant( count( $stack ) > 1, $this->name ); # catch underflow |
199 | $s = array_pop( $stack ); |
200 | $outpos = $s->outpos; |
201 | $result = $stack[count( $stack ) - 1]; |
202 | $idx = $s->idx; |
203 | $state = $s->epsState; |
204 | $epsSkip = $s->epsSkip; |
205 | } else { |
206 | // Emit $outByte: add a byte to the output. |
207 | if ( $outByte !== self::BYTE_EPSILON ) { |
208 | if ( $outByte === self::BYTE_IDENTITY ) { |
209 | $outByte = $c; // Copy input byte to output |
210 | Assert::invariant( |
211 | $outByte !== self::BYTE_EPSILON, "bad pFST" |
212 | ); |
213 | } |
214 | if ( $this->justBrackets ) { |
215 | // Count brackets, if that's what we're doing. |
216 | if ( $outByte === self::BYTE_LBRACKET || |
217 | $outByte === self::BYTE_RBRACKET ) { |
218 | $result->partialBrackets[] = $outpos; |
219 | } elseif ( $countCodePoints && |
220 | $outByte >= 0x80 && $outByte < 0xC0 ) { |
221 | /* Ignore UTF-8 continuation characters */ |
222 | } else { |
223 | $outpos++; |
224 | } |
225 | } else { |
226 | // Add this byte to the partial result |
227 | $result->partialResult .= chr( $outByte ); |
228 | $outpos++; |
229 | } |
230 | } |
231 | // Done emitting, go on to the next state. |
232 | $state = $targetEdge + 2; // skip over inByte/outByte |
233 | $state = $this->readSignedV( $state ) + ( $targetEdge + 2 ); |
234 | } |
235 | } |
236 | |
237 | // Ok, process the final state and return something. |
238 | $result = $stack[0]; |
239 | foreach ( array_splice( $stack, 1 ) as $s ) { |
240 | if ( $this->justBrackets ) { |
241 | foreach ( $s->partialBrackets as $b ) { |
242 | $result->partialBrackets[] = $b; |
243 | } |
244 | } else { |
245 | $result->partialResult .= $s->partialResult; |
246 | } |
247 | } |
248 | if ( $this->justBrackets ) { |
249 | $result->partialBrackets[] = $outpos; |
250 | return $result->partialBrackets; |
251 | } |
252 | Assert::invariant( strlen( $result->partialResult ) === $outpos, $this->name ); |
253 | return $result->partialResult; |
254 | } |
255 | |
256 | /** |
257 | * Load an FST description and return a function which runs the machine. |
258 | * @param string $pfst The FST description as a filename (to be loaded synchronously) |
259 | * @param bool $justBrackets The machine will return an array of bracket locations, |
260 | * instead of the converted text. |
261 | * @return FST |
262 | */ |
263 | public static function compile( string $pfst, $justBrackets = false ): FST { |
264 | return new FST( $pfst, file_get_contents( $pfst ), $justBrackets ); |
265 | } |
266 | } |