Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 135
0.00% covered (danger)
0.00%
0 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
MutableFST
0.00% covered (danger)
0.00%
0 / 135
0.00% covered (danger)
0.00%
0 / 8
3906
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 newState
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 getStartState
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyDown
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyUp
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyUpDown
0.00% covered (danger)
0.00%
0 / 56
0.00% covered (danger)
0.00%
0 / 1
930
 optimize
0.00% covered (danger)
0.00%
0 / 62
0.00% covered (danger)
0.00%
0 / 1
506
 writeATT
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
20
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\LangConv\Construct;
5
6use Wikimedia\Assert\Assert;
7
8/**
9 * A mutable FST.
10 */
11class MutableFST {
12    /** Special alphabet symbol for `nothing` used in AT&T format FST files. */
13    public const EPSILON = '@0@';
14    /** Special alphabet symbol for upper-side `?` used in AT&T format FST files. */
15    public const UNKNOWN = '@_UNKNOWN_SYMBOL_@';
16    /** Special alphabet symbol for lower-side `?` used in AT&T format FST files. */
17    public const IDENTITY = '@_IDENTITY_SYMBOL_@';
18
19    /** @var array<string,true> */
20    private $alphabet = [];
21    /** @var State[] */
22    private $states = [];
23
24    /**
25     * Create a new MutableFST with the given alphabet.
26     * @param string[] $alphabet
27     */
28    public function __construct( array $alphabet ) {
29        // Create start state
30        $this->newState();
31        // Set up alphabet
32        foreach ( $alphabet as $tok ) {
33            $this->alphabet[$tok] = true;
34        }
35        $this->alphabet[self::EPSILON] = true;
36        $this->alphabet[self::IDENTITY] = true;
37        $this->alphabet[self::UNKNOWN] = true;
38    }
39
40    /**
41     * Add a new state to this FST.
42     * @return State the new state.
43     */
44    public function newState(): State {
45        $id = count( $this->states );
46        $s = $this->states[] = new State( $id );
47        return $s;
48    }
49
50    /**
51     * Return the start state of this FST.
52     * @return State the start state.
53     */
54    public function getStartState(): State {
55        return $this->states[0];
56    }
57
58    /**
59     * Apply the given FST to the array of input tokens.  Symbols in
60     * 'upper' are matched and symbols in 'lower' are emitted.
61     * @param string[] $input
62     * @return array<array<string>>
63     */
64    public function applyDown( array $input ): array {
65        return $this->applyUpDown( $input, true );
66    }
67
68    /**
69     * Apply the given FST to the array of input tokens.  Symbols in
70     * 'lower' are matched and symbols in 'upper' are emitted.
71     * @param string[] $input
72     * @return array<array<string>>
73     */
74    public function applyUp( array $input ): array {
75        return $this->applyUpDown( $input, false );
76    }
77
78    /**
79     * Apply the given FST to the array of input tokens.
80     * @param string[] $input array of input tokens
81     * @param bool $isDown Whether to apply 'up' or 'down'
82     * @return array<array<string>>
83     */
84    private function applyUpDown( array $input, bool $isDown ): array {
85        // This is an extremely simple implementation, made for validating
86        // the FST not efficient execution.
87        $results = [];
88        $stack = [];
89        $flagRE = '/^@([PNRDCU])[.]([^.@]+)(?:[.]([^@]+))?@$/D';
90        $addTok = static function ( $toks, $t ) use ( $flagRE ) {
91            if ( $t === self::EPSILON || preg_match( $flagRE, $t ) ) {
92                // skip token
93            } else {
94                $toks[] = $t;
95            }
96            return $toks;
97        };
98        $stack[] = [ $this->getStartState(), 0, [], [] ];
99        while ( count( $stack ) > 0 ) {
100            [ $state, $pos, $flags, $emitted ] = array_pop( $stack );
101            $tok = $pos < count( $input ) ? $input[$pos] : null;
102            if ( $tok !== null && !isset( $this->alphabet[$tok] ) ) {
103                $tok = self::UNKNOWN;
104            }
105            foreach ( $state->edges as $e ) {
106                if ( $isDown ) {
107                    $match = $e->upper;
108                    $output = $e->lower;
109                } else {
110                    $match = $e->lower;
111                    $output = $e->upper;
112                }
113                // Treat IDENTITY and UNKNOWN as identical (applyDown/applyUp)
114                if ( $match === self::IDENTITY ) {
115                    $match = self::UNKNOWN;
116                }
117                if ( $output === self::UNKNOWN ) {
118                    $output = self::IDENTITY;
119                }
120                // Does this edge match?
121                if ( $output === self::IDENTITY && $tok !== null ) {
122                    $output = $input[$pos];
123                }
124                if ( $match === self::EPSILON ) {
125                    $stack[] = [ $e->to, $pos, $flags, $addTok( $emitted, $output ) ];
126                } elseif ( preg_match( $flagRE, $match, $flagInfo, PREG_UNMATCHED_AS_NULL ) === 1 ) {
127                    $op = $flagInfo[1];
128                    $feature = $flagInfo[2];
129                    $value = $flagInfo[3] ?? null;
130                    $f = $flags[$feature] ?? null;
131                    if ( $op === 'P' ) {
132                        Assert::invariant( $value !== null, "P not C!" );
133                        $flags[$feature] = $value;
134                    } elseif ( $op === 'C' ) {
135                        Assert::invariant( $value === null, "C not P!" );
136                        $flags[$feature] = null;
137                    } elseif ( $op === 'N' ) {
138                        // @phan-suppress-next-line PhanImpossibleCondition
139                        Assert::invariant( false, 'N not supported' );
140                    } elseif ( $op === 'R' || $op === 'D' ) {
141                        $m = ( $value === null ) ? ( $f !== null ) : ( $f === $value );
142                        if ( $op === 'R' ? ( !$m ) : $m ) {
143                            continue;
144                        }
145                    } elseif ( $op === 'U' ) {
146                        Assert::invariant( $value !== null, "Unify with what?" );
147                        if ( $f === null || $f === $value ) {
148                            $flags[$feature] = $value;
149                        } else {
150                            continue;
151                        }
152                    }
153                    $stack[] = [ $e->to, $pos, $flags, $addTok( $emitted, $output ) ];
154                } elseif ( $match === $tok && $tok !== null ) {
155                    $stack[] = [ $e->to, $pos + 1, $flags, $addTok( $emitted, $output ) ];
156                }
157            }
158            if ( $tok === null && $state->isFinal ) {
159                $results[] = $emitted;
160            }
161        }
162        return $results;
163    }
164
165    /**
166     * Perform simple optimization on the FST, trimming unreachable states
167     * and attempting to combine edges containing epsilons.
168     */
169    public function optimize(): void {
170        $reachable = [];
171        $stateWorklist = [];
172        $stateWorklist[$this->getStartState()->id] = $this->getStartState();
173        while ( count( $stateWorklist ) > 0 ) {
174            $state = array_pop( $stateWorklist );
175            $reachable[$state->id] = true;
176            $edgeWorklist = $state->edges; // worklist
177            while ( count( $edgeWorklist ) > 0 ) {
178                $edge = array_pop( $edgeWorklist );
179                if ( $edge === null ) {
180                    continue; /* removed edge */
181                }
182
183                if ( $edge->to->isFinal ) {
184                    if ( !isset( $reachable[$edge->to->id] ) ) {
185                        $stateWorklist[$edge->to->id] = $edge->to;
186                    }
187                    continue;
188                }
189                $nEdge = count( $edge->to->edges );
190                if ( $nEdge === 0 ) {
191                    $state->edges[$edge->id] = null; // remove
192                    // XXX requeue all states which point to this
193                    continue;
194                } elseif ( $nEdge === 1 ) {
195                    $nextEdge = $edge->to->edges[0];
196                    if ( $nextEdge === null ) {
197                        /* deleted.  Fall through. */
198                    } elseif (
199                        $edge->lower === self::EPSILON &&
200                        $nextEdge->upper === self::EPSILON
201                    ) {
202                        $edge->lower = $nextEdge->lower;
203                        $edge->to = $nextEdge->to;
204                        $edgeWorklist[] = $edge;
205                        continue;
206                    } elseif (
207                        $edge->upper === self::EPSILON &&
208                        $nextEdge->lower == self::EPSILON
209                    ) {
210                        $edge->upper = $nextEdge->upper;
211                        $edge->to = $nextEdge->to;
212                        $edgeWorklist[] = $edge;
213                        continue;
214                    } elseif (
215                        $edge->upper === self::EPSILON &&
216                        $edge->lower === self::EPSILON
217                    ) {
218                        $edge->upper = $nextEdge->upper;
219                        $edge->lower = $nextEdge->lower;
220                        $edge->to = $nextEdge->to;
221                        $edgeWorklist[] = $edge;
222                        continue;
223                    } elseif (
224                        $nextEdge->upper === self::EPSILON &&
225                        $nextEdge->lower === self::EPSILON
226                    ) {
227                        $edge->to = $nextEdge->to;
228                        $edgeWorklist[] = $edge;
229                        continue;
230                    }
231                }
232                // fall through
233                if ( !isset( $reachable[$edge->to->id] ) ) {
234                    $stateWorklist[$edge->to->id] = $edge->to;
235                }
236            }
237        }
238        // Renumber states and edges
239        $j = 0;
240        for ( $i = 0; $i < count( $this->states ); $i++ ) {
241            $s = $this->states[$i];
242            if ( isset( $reachable[$s->id] ) ) {
243                $s->id = $j++;
244                $this->states[$s->id] = $s;
245                // renumber edges
246                $k = 0;
247                for ( $l = 0; $l < count( $s->edges ); $l++ ) {
248                    $e = $s->edges[$l];
249                    if ( $e !== null ) {
250                        $e->id = $k++;
251                        $s->edges[$e->id] = $e;
252                    }
253                }
254                array_splice( $s->edges, $k );
255            }
256        }
257        array_splice( $this->states, $j );
258    }
259
260    /**
261     * Write an AT&T format description of this FST to the given filehandle.
262     * @param resource $handle
263     */
264    public function writeATT( $handle ): void {
265        // Write an AT&T format FST file
266        foreach ( $this->states as $state ) {
267            $state->writeATT( $handle );
268        }
269        // Now write the final states
270        foreach ( $this->states as $state ) {
271            if ( $state->isFinal ) {
272                fwrite( $handle, strval( $state->id ) . "\n" );
273            }
274        }
275    }
276}