Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 122 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
| FST | |
0.00% |
0 / 122 |
|
0.00% |
0 / 6 |
1332 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
| split | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
| readUnsignedV | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| readSignedV | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| run | |
0.00% |
0 / 88 |
|
0.00% |
0 / 1 |
812 | |||
| compile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikimedia\LangConv; |
| 4 | |
| 5 | use Wikimedia\Assert\Assert; |
| 6 | |
| 7 | /** |
| 8 | * Load and execute a finite-state transducer (FST) based converter or |
| 9 | * bracketing machine from a compact string description. |
| 10 | */ |
| 11 | class FST { |
| 12 | private const MAGIC_BYTES = 8; // 8 byte header w/ magic bytes |
| 13 | |
| 14 | // These pseudo-characters appear in the "output" side of the FST only. |
| 15 | private const BYTE_IDENTITY = 0xFF; |
| 16 | private const BYTE_RBRACKET = 0xFE; |
| 17 | private const BYTE_LBRACKET = 0xFD; |
| 18 | private const BYTE_FAIL = 0xFC; |
| 19 | // These pseudo-characters appear in the "input" side of the FST. |
| 20 | private const BYTE_EOF = 0xF8; // The highest possible input char |
| 21 | private const BYTE_EPSILON = 0x00; // Always appears first in sorted order |
| 22 | |
| 23 | /** |
| 24 | * Name of pFST file (for debugging and error messages). |
| 25 | * @var string |
| 26 | */ |
| 27 | private $name; |
| 28 | |
| 29 | /** |
| 30 | * FST, packed as a string. |
| 31 | * @var string |
| 32 | */ |
| 33 | private $pfst; |
| 34 | |
| 35 | /** |
| 36 | * @var bool |
| 37 | */ |
| 38 | private $justBrackets; |
| 39 | |
| 40 | /** |
| 41 | * @param string $name |
| 42 | * @param string $pfst |
| 43 | * @param bool $justBrackets |
| 44 | */ |
| 45 | private function __construct( string $name, string $pfst, bool $justBrackets ) { |
| 46 | $this->name = $name; |
| 47 | $this->pfst = $pfst; |
| 48 | $this->justBrackets = $justBrackets; |
| 49 | Assert::precondition( |
| 50 | strlen( $pfst ) >= self::MAGIC_BYTES + 2, /*states, min*/ |
| 51 | "pFST file too short: $name" |
| 52 | ); |
| 53 | Assert::precondition( |
| 54 | substr( $pfst, 0, self::MAGIC_BYTES ) === "pFST\0WM\0", |
| 55 | "Invalid pFST file: $name" |
| 56 | ); |
| 57 | } |
| 58 | |
| 59 | /** |
| 60 | * @param string $input |
| 61 | * @param int|null $start |
| 62 | * @param int|null $end |
| 63 | * @return array |
| 64 | */ |
| 65 | public function split( string $input, ?int $start = null, ?int $end = null ): array { |
| 66 | // Debugging helper: instead of an array of positions, split the |
| 67 | // input at the bracket locations and return an array of strings. |
| 68 | Assert::precondition( $this->justBrackets, |
| 69 | "Needs a bracket machine: " . $this->name ); |
| 70 | $end = $end ?? strlen( $input ); |
| 71 | $r = $this->run( $input, $start, $end ); |
| 72 | $r[] = $end; |
| 73 | $i = 0; |
| 74 | $nr = []; |
| 75 | foreach ( $r as $j ) { |
| 76 | $nr[] = substr( $input, $i, $j ); |
| 77 | $i = $j; |
| 78 | } |
| 79 | return $nr; |
| 80 | } |
| 81 | |
| 82 | /** |
| 83 | * Read zig-zag encoded variable length integers |
| 84 | * (See [[:en:Variable-length_quantity#Zigzag_encoding]]) |
| 85 | * @param int &$state |
| 86 | * @return int |
| 87 | */ |
| 88 | private function readUnsignedV( int &$state ): int { |
| 89 | $b = ord( $this->pfst[$state++] ); |
| 90 | $val = $b & 127; |
| 91 | while ( $b & 128 ) { |
| 92 | $val += 1; |
| 93 | $b = ord( $this->pfst[$state++] ); |
| 94 | $val = ( $val << 7 ) + ( $b & 127 ); |
| 95 | } |
| 96 | return $val; |
| 97 | } |
| 98 | |
| 99 | /** |
| 100 | * @param int &$state |
| 101 | * @return int |
| 102 | */ |
| 103 | private function readSignedV( int &$state ): int { |
| 104 | $v = $this->readUnsignedV( $state ); |
| 105 | if ( $v & 1 ) { // sign bit is in LSB |
| 106 | return -( $v >> 1 ) - 1; |
| 107 | } else { |
| 108 | return $v >> 1; |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | /** |
| 113 | * @param string $input |
| 114 | * @param int|null $start |
| 115 | * @param int|null $end |
| 116 | * @param bool $unicode |
| 117 | * @return string|array |
| 118 | */ |
| 119 | public function run( string $input, ?int $start = null, ?int $end = null, bool $unicode = false ) { |
| 120 | $start = $start ?? 0; |
| 121 | $end = $end ?? strlen( $input ); |
| 122 | $countCodePoints = $this->justBrackets && $unicode; |
| 123 | $initialState = self::MAGIC_BYTES + 2; /* eof state */ |
| 124 | $state = $initialState; |
| 125 | $idx = $start; |
| 126 | $outpos = 0; |
| 127 | $stack = [ new BacktrackState( 0, 0, 0, 0 ) ]; |
| 128 | $result = $stack[0]; |
| 129 | $result->partialBrackets[] = 0; |
| 130 | $epsSkip = 0; |
| 131 | |
| 132 | // This runs the machine until we reach the EOF state |
| 133 | while ( $state >= $initialState ) { |
| 134 | if ( $state === $initialState && count( $stack ) > 1 ) { |
| 135 | // Memory efficiency: since the machine is universal we know |
| 136 | // we'll never fail as long as we're in the initial state. |
| 137 | $result = $stack[0]; |
| 138 | foreach ( array_splice( $stack, 1 ) as $s ) { |
| 139 | if ( $this->justBrackets ) { |
| 140 | foreach ( $s->partialBrackets as $b ) { |
| 141 | $result->partialBrackets[] = $b; |
| 142 | } |
| 143 | } else { |
| 144 | $result->partialResult .= $s->partialResult; |
| 145 | } |
| 146 | } |
| 147 | } |
| 148 | $saveState = $state; |
| 149 | $edgeWidth = $this->readUnsignedV( $state ); |
| 150 | $nEdges = $this->readUnsignedV( $state ); |
| 151 | Assert::invariant( $nEdges > 0, $this->name ); |
| 152 | $saveEdges = $nEdges; |
| 153 | // Read first edge to see if there are any epsilon edges |
| 154 | $edge0 = $state; |
| 155 | if ( $epsSkip > 0 ) { |
| 156 | $edge0 += ( $epsSkip * $edgeWidth ); |
| 157 | $nEdges -= $epsSkip; |
| 158 | $epsSkip = 0; |
| 159 | } |
| 160 | if ( ord( $this->pfst[$edge0] ) === self::BYTE_EPSILON ) { |
| 161 | // If this is an epsilon edge, take it immediately! |
| 162 | // But save a backtrack state since this non-deterministic |
| 163 | // edge may fail. If it does, we'll restart at the next |
| 164 | // edge in this state. |
| 165 | if ( $nEdges > 1 ) { |
| 166 | $result = new BacktrackState( |
| 167 | $saveState, |
| 168 | ( $saveEdges - $nEdges ) + 1, |
| 169 | $outpos, |
| 170 | $idx ); |
| 171 | $stack[] = $result; |
| 172 | } |
| 173 | $targetEdge = $edge0; |
| 174 | $outByte = ord( $this->pfst[$edge0 + 1] ); |
| 175 | $c = self::BYTE_EPSILON; |
| 176 | } else { |
| 177 | // Binary search for an edge matching c |
| 178 | $c = $idx < $end ? ord( $input[$idx++] ) : /* pseudo-character: */ self::BYTE_EOF; |
| 179 | $minIndex = 0; |
| 180 | $maxIndex = $nEdges; |
| 181 | while ( $minIndex !== $maxIndex ) { |
| 182 | $currentIndex = ( $minIndex + $maxIndex ) >> 1; |
| 183 | $targetEdge = $edge0 + ( $edgeWidth * $currentIndex ); |
| 184 | $inByte = ord( $this->pfst[$targetEdge] ); |
| 185 | if ( $inByte <= $c ) { |
| 186 | $minIndex = $currentIndex + 1; |
| 187 | } else { |
| 188 | $maxIndex = $currentIndex; |
| 189 | } |
| 190 | } |
| 191 | // (minIndex-1).inByte <= c, and maxIndex.inByte > c |
| 192 | $targetEdge = $edge0 + ( $edgeWidth * ( $minIndex - 1 ) ); |
| 193 | $outByte = $minIndex > 0 ? ord( $this->pfst[$targetEdge + 1] ) : self::BYTE_FAIL; |
| 194 | } |
| 195 | if ( $outByte === self::BYTE_FAIL ) { |
| 196 | // FAIL! Pop an element off the stack and reset our state. |
| 197 | Assert::invariant( count( $stack ) > 1, $this->name ); # catch underflow |
| 198 | $s = array_pop( $stack ); |
| 199 | $outpos = $s->outpos; |
| 200 | $result = $stack[count( $stack ) - 1]; |
| 201 | $idx = $s->idx; |
| 202 | $state = $s->epsState; |
| 203 | $epsSkip = $s->epsSkip; |
| 204 | } else { |
| 205 | // Emit $outByte: add a byte to the output. |
| 206 | if ( $outByte !== self::BYTE_EPSILON ) { |
| 207 | if ( $outByte === self::BYTE_IDENTITY ) { |
| 208 | $outByte = $c; // Copy input byte to output |
| 209 | Assert::invariant( |
| 210 | $outByte !== self::BYTE_EPSILON, "bad pFST" |
| 211 | ); |
| 212 | } |
| 213 | if ( $this->justBrackets ) { |
| 214 | // Count brackets, if that's what we're doing. |
| 215 | if ( $outByte === self::BYTE_LBRACKET || |
| 216 | $outByte === self::BYTE_RBRACKET ) { |
| 217 | $result->partialBrackets[] = $outpos; |
| 218 | } elseif ( $countCodePoints && |
| 219 | $outByte >= 0x80 && $outByte < 0xC0 ) { |
| 220 | /* Ignore UTF-8 continuation characters */ |
| 221 | } else { |
| 222 | $outpos++; |
| 223 | } |
| 224 | } else { |
| 225 | // Add this byte to the partial result |
| 226 | $result->partialResult .= chr( $outByte ); |
| 227 | $outpos++; |
| 228 | } |
| 229 | } |
| 230 | // Done emitting, go on to the next state. |
| 231 | $state = $targetEdge + 2; // skip over inByte/outByte |
| 232 | $state = $this->readSignedV( $state ) + ( $targetEdge + 2 ); |
| 233 | } |
| 234 | } |
| 235 | |
| 236 | // Ok, process the final state and return something. |
| 237 | $result = $stack[0]; |
| 238 | foreach ( array_splice( $stack, 1 ) as $s ) { |
| 239 | if ( $this->justBrackets ) { |
| 240 | foreach ( $s->partialBrackets as $b ) { |
| 241 | $result->partialBrackets[] = $b; |
| 242 | } |
| 243 | } else { |
| 244 | $result->partialResult .= $s->partialResult; |
| 245 | } |
| 246 | } |
| 247 | if ( $this->justBrackets ) { |
| 248 | $result->partialBrackets[] = $outpos; |
| 249 | return $result->partialBrackets; |
| 250 | } |
| 251 | Assert::invariant( strlen( $result->partialResult ) === $outpos, $this->name ); |
| 252 | return $result->partialResult; |
| 253 | } |
| 254 | |
| 255 | /** |
| 256 | * Load an FST description and return a function which runs the machine. |
| 257 | * @param string $pfst The FST description as a filename (to be loaded synchronously) |
| 258 | * @param bool $justBrackets The machine will return an array of bracket locations, |
| 259 | * instead of the converted text. |
| 260 | * @return FST |
| 261 | */ |
| 262 | public static function compile( string $pfst, $justBrackets = false ): FST { |
| 263 | return new FST( $pfst, file_get_contents( $pfst ), $justBrackets ); |
| 264 | } |
| 265 | } |