Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 122 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
FST | |
0.00% |
0 / 122 |
|
0.00% |
0 / 6 |
1332 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
split | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
readUnsignedV | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
readSignedV | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
run | |
0.00% |
0 / 88 |
|
0.00% |
0 / 1 |
812 | |||
compile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikimedia\LangConv; |
4 | |
5 | use Wikimedia\Assert\Assert; |
6 | |
7 | /** |
8 | * Load and execute a finite-state transducer (FST) based converter or |
9 | * bracketing machine from a compact string description. |
10 | */ |
11 | class FST { |
12 | private const MAGIC_BYTES = 8; // 8 byte header w/ magic bytes |
13 | |
14 | // These pseudo-characters appear in the "output" side of the FST only. |
15 | private const BYTE_IDENTITY = 0xFF; |
16 | private const BYTE_RBRACKET = 0xFE; |
17 | private const BYTE_LBRACKET = 0xFD; |
18 | private const BYTE_FAIL = 0xFC; |
19 | // These pseudo-characters appear in the "input" side of the FST. |
20 | private const BYTE_EOF = 0xF8; // The highest possible input char |
21 | private const BYTE_EPSILON = 0x00; // Always appears first in sorted order |
22 | |
23 | /** |
24 | * Name of pFST file (for debugging and error messages). |
25 | * @var string |
26 | */ |
27 | private $name; |
28 | |
29 | /** |
30 | * FST, packed as a string. |
31 | * @var string |
32 | */ |
33 | private $pfst; |
34 | |
35 | /** |
36 | * @var bool |
37 | */ |
38 | private $justBrackets; |
39 | |
40 | /** |
41 | * @param string $name |
42 | * @param string $pfst |
43 | * @param bool $justBrackets |
44 | */ |
45 | private function __construct( string $name, string $pfst, bool $justBrackets ) { |
46 | $this->name = $name; |
47 | $this->pfst = $pfst; |
48 | $this->justBrackets = $justBrackets; |
49 | Assert::precondition( |
50 | strlen( $pfst ) >= self::MAGIC_BYTES + 2, /*states, min*/ |
51 | "pFST file too short: $name" |
52 | ); |
53 | Assert::precondition( |
54 | substr( $pfst, 0, self::MAGIC_BYTES ) === "pFST\0WM\0", |
55 | "Invalid pFST file: $name" |
56 | ); |
57 | } |
58 | |
59 | /** |
60 | * @param string $input |
61 | * @param int|null $start |
62 | * @param int|null $end |
63 | * @return array |
64 | */ |
65 | public function split( string $input, ?int $start = null, ?int $end = null ): array { |
66 | // Debugging helper: instead of an array of positions, split the |
67 | // input at the bracket locations and return an array of strings. |
68 | Assert::precondition( $this->justBrackets, |
69 | "Needs a bracket machine: " . $this->name ); |
70 | $end = $end ?? strlen( $input ); |
71 | $r = $this->run( $input, $start, $end ); |
72 | $r[] = $end; |
73 | $i = 0; |
74 | $nr = []; |
75 | foreach ( $r as $j ) { |
76 | $nr[] = substr( $input, $i, $j ); |
77 | $i = $j; |
78 | } |
79 | return $nr; |
80 | } |
81 | |
82 | /** |
83 | * Read zig-zag encoded variable length integers |
84 | * (See [[:en:Variable-length_quantity#Zigzag_encoding]]) |
85 | * @param int &$state |
86 | * @return int |
87 | */ |
88 | private function readUnsignedV( int &$state ): int { |
89 | $b = ord( $this->pfst[$state++] ); |
90 | $val = $b & 127; |
91 | while ( $b & 128 ) { |
92 | $val += 1; |
93 | $b = ord( $this->pfst[$state++] ); |
94 | $val = ( $val << 7 ) + ( $b & 127 ); |
95 | } |
96 | return $val; |
97 | } |
98 | |
99 | /** |
100 | * @param int &$state |
101 | * @return int |
102 | */ |
103 | private function readSignedV( int &$state ): int { |
104 | $v = $this->readUnsignedV( $state ); |
105 | if ( $v & 1 ) { // sign bit is in LSB |
106 | return -( $v >> 1 ) - 1; |
107 | } else { |
108 | return $v >> 1; |
109 | } |
110 | } |
111 | |
112 | /** |
113 | * @param string $input |
114 | * @param int|null $start |
115 | * @param int|null $end |
116 | * @param bool $unicode |
117 | * @return string|array |
118 | */ |
119 | public function run( string $input, ?int $start = null, ?int $end = null, bool $unicode = false ) { |
120 | $start = $start ?? 0; |
121 | $end = $end ?? strlen( $input ); |
122 | $countCodePoints = $this->justBrackets && $unicode; |
123 | $initialState = self::MAGIC_BYTES + 2; /* eof state */ |
124 | $state = $initialState; |
125 | $idx = $start; |
126 | $outpos = 0; |
127 | $stack = [ new BacktrackState( 0, 0, 0, 0 ) ]; |
128 | $result = $stack[0]; |
129 | $result->partialBrackets[] = 0; |
130 | $epsSkip = 0; |
131 | |
132 | // This runs the machine until we reach the EOF state |
133 | while ( $state >= $initialState ) { |
134 | if ( $state === $initialState && count( $stack ) > 1 ) { |
135 | // Memory efficiency: since the machine is universal we know |
136 | // we'll never fail as long as we're in the initial state. |
137 | $result = $stack[0]; |
138 | foreach ( array_splice( $stack, 1 ) as $s ) { |
139 | if ( $this->justBrackets ) { |
140 | foreach ( $s->partialBrackets as $b ) { |
141 | $result->partialBrackets[] = $b; |
142 | } |
143 | } else { |
144 | $result->partialResult .= $s->partialResult; |
145 | } |
146 | } |
147 | } |
148 | $saveState = $state; |
149 | $edgeWidth = $this->readUnsignedV( $state ); |
150 | $nEdges = $this->readUnsignedV( $state ); |
151 | Assert::invariant( $nEdges > 0, $this->name ); |
152 | $saveEdges = $nEdges; |
153 | // Read first edge to see if there are any epsilon edges |
154 | $edge0 = $state; |
155 | if ( $epsSkip > 0 ) { |
156 | $edge0 += ( $epsSkip * $edgeWidth ); |
157 | $nEdges -= $epsSkip; |
158 | $epsSkip = 0; |
159 | } |
160 | if ( ord( $this->pfst[$edge0] ) === self::BYTE_EPSILON ) { |
161 | // If this is an epsilon edge, take it immediately! |
162 | // But save a backtrack state since this non-deterministic |
163 | // edge may fail. If it does, we'll restart at the next |
164 | // edge in this state. |
165 | if ( $nEdges > 1 ) { |
166 | $result = new BacktrackState( |
167 | $saveState, |
168 | ( $saveEdges - $nEdges ) + 1, |
169 | $outpos, |
170 | $idx ); |
171 | $stack[] = $result; |
172 | } |
173 | $targetEdge = $edge0; |
174 | $outByte = ord( $this->pfst[$edge0 + 1] ); |
175 | $c = self::BYTE_EPSILON; |
176 | } else { |
177 | // Binary search for an edge matching c |
178 | $c = $idx < $end ? ord( $input[$idx++] ) : /* pseudo-character: */ self::BYTE_EOF; |
179 | $minIndex = 0; |
180 | $maxIndex = $nEdges; |
181 | while ( $minIndex !== $maxIndex ) { |
182 | $currentIndex = ( $minIndex + $maxIndex ) >> 1; |
183 | $targetEdge = $edge0 + ( $edgeWidth * $currentIndex ); |
184 | $inByte = ord( $this->pfst[$targetEdge] ); |
185 | if ( $inByte <= $c ) { |
186 | $minIndex = $currentIndex + 1; |
187 | } else { |
188 | $maxIndex = $currentIndex; |
189 | } |
190 | } |
191 | // (minIndex-1).inByte <= c, and maxIndex.inByte > c |
192 | $targetEdge = $edge0 + ( $edgeWidth * ( $minIndex - 1 ) ); |
193 | $outByte = $minIndex > 0 ? ord( $this->pfst[$targetEdge + 1] ) : self::BYTE_FAIL; |
194 | } |
195 | if ( $outByte === self::BYTE_FAIL ) { |
196 | // FAIL! Pop an element off the stack and reset our state. |
197 | Assert::invariant( count( $stack ) > 1, $this->name ); # catch underflow |
198 | $s = array_pop( $stack ); |
199 | $outpos = $s->outpos; |
200 | $result = $stack[count( $stack ) - 1]; |
201 | $idx = $s->idx; |
202 | $state = $s->epsState; |
203 | $epsSkip = $s->epsSkip; |
204 | } else { |
205 | // Emit $outByte: add a byte to the output. |
206 | if ( $outByte !== self::BYTE_EPSILON ) { |
207 | if ( $outByte === self::BYTE_IDENTITY ) { |
208 | $outByte = $c; // Copy input byte to output |
209 | Assert::invariant( |
210 | $outByte !== self::BYTE_EPSILON, "bad pFST" |
211 | ); |
212 | } |
213 | if ( $this->justBrackets ) { |
214 | // Count brackets, if that's what we're doing. |
215 | if ( $outByte === self::BYTE_LBRACKET || |
216 | $outByte === self::BYTE_RBRACKET ) { |
217 | $result->partialBrackets[] = $outpos; |
218 | } elseif ( $countCodePoints && |
219 | $outByte >= 0x80 && $outByte < 0xC0 ) { |
220 | /* Ignore UTF-8 continuation characters */ |
221 | } else { |
222 | $outpos++; |
223 | } |
224 | } else { |
225 | // Add this byte to the partial result |
226 | $result->partialResult .= chr( $outByte ); |
227 | $outpos++; |
228 | } |
229 | } |
230 | // Done emitting, go on to the next state. |
231 | $state = $targetEdge + 2; // skip over inByte/outByte |
232 | $state = $this->readSignedV( $state ) + ( $targetEdge + 2 ); |
233 | } |
234 | } |
235 | |
236 | // Ok, process the final state and return something. |
237 | $result = $stack[0]; |
238 | foreach ( array_splice( $stack, 1 ) as $s ) { |
239 | if ( $this->justBrackets ) { |
240 | foreach ( $s->partialBrackets as $b ) { |
241 | $result->partialBrackets[] = $b; |
242 | } |
243 | } else { |
244 | $result->partialResult .= $s->partialResult; |
245 | } |
246 | } |
247 | if ( $this->justBrackets ) { |
248 | $result->partialBrackets[] = $outpos; |
249 | return $result->partialBrackets; |
250 | } |
251 | Assert::invariant( strlen( $result->partialResult ) === $outpos, $this->name ); |
252 | return $result->partialResult; |
253 | } |
254 | |
255 | /** |
256 | * Load an FST description and return a function which runs the machine. |
257 | * @param string $pfst The FST description as a filename (to be loaded synchronously) |
258 | * @param bool $justBrackets The machine will return an array of bracket locations, |
259 | * instead of the converted text. |
260 | * @return FST |
261 | */ |
262 | public static function compile( string $pfst, $justBrackets = false ): FST { |
263 | return new FST( $pfst, file_get_contents( $pfst ), $justBrackets ); |
264 | } |
265 | } |