Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 135
0.00% covered (danger)
0.00%
0 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
MutableFST
0.00% covered (danger)
0.00%
0 / 135
0.00% covered (danger)
0.00%
0 / 8
3906
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 newState
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 getStartState
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyDown
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyUp
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyUpDown
0.00% covered (danger)
0.00%
0 / 56
0.00% covered (danger)
0.00%
0 / 1
930
 optimize
0.00% covered (danger)
0.00%
0 / 62
0.00% covered (danger)
0.00%
0 / 1
506
 writeATT
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
20
1<?php
2
3namespace Wikimedia\LangConv\Construct;
4
5use Wikimedia\Assert\Assert;
6
7/**
8 * A mutable FST.
9 */
10class MutableFST {
11    /** Special alphabet symbol for `nothing` used in AT&T format FST files. */
12    public const EPSILON = '@0@';
13    /** Special alphabet symbol for upper-side `?` used in AT&T format FST files. */
14    public const UNKNOWN = '@_UNKNOWN_SYMBOL_@';
15    /** Special alphabet symbol for lower-side `?` used in AT&T format FST files. */
16    public const IDENTITY = '@_IDENTITY_SYMBOL_@';
17
18    /** @var array<string,true> */
19    private $alphabet = [];
20    /** @var State[] */
21    private $states = [];
22
23    /**
24     * Create a new MutableFST with the given alphabet.
25     * @param string[] $alphabet
26     */
27    public function __construct( array $alphabet ) {
28        // Create start state
29        $this->newState();
30        // Set up alphabet
31        foreach ( $alphabet as $tok ) {
32            $this->alphabet[$tok] = true;
33        }
34        $this->alphabet[self::EPSILON] = true;
35        $this->alphabet[self::IDENTITY] = true;
36        $this->alphabet[self::UNKNOWN] = true;
37    }
38
39    /**
40     * Add a new state to this FST.
41     * @return State the new state.
42     */
43    public function newState(): State {
44        $id = count( $this->states );
45        $s = $this->states[] = new State( $id );
46        return $s;
47    }
48
49    /**
50     * Return the start state of this FST.
51     * @return State the start state.
52     */
53    public function getStartState(): State {
54        return $this->states[0];
55    }
56
57    /**
58     * Apply the given FST to the array of input tokens.  Symbols in
59     * 'upper' are matched and symbols in 'lower' are emitted.
60     * @param string[] $input
61     * @return array<array<string>>
62     */
63    public function applyDown( array $input ): array {
64        return $this->applyUpDown( $input, true );
65    }
66
67    /**
68     * Apply the given FST to the array of input tokens.  Symbols in
69     * 'lower' are matched and symbols in 'upper' are emitted.
70     * @param string[] $input
71     * @return array<array<string>>
72     */
73    public function applyUp( array $input ): array {
74        return $this->applyUpDown( $input, false );
75    }
76
77    /**
78     * Apply the given FST to the array of input tokens.
79     * @param string[] $input array of input tokens
80     * @param bool $isDown Whether to apply 'up' or 'down'
81     * @return array<array<string>>
82     */
83    private function applyUpDown( array $input, bool $isDown ): array {
84        // This is an extremely simple implementation, made for validating
85        // the FST not efficient execution.
86        $results = [];
87        $stack = [];
88        $flagRE = '/^@([PNRDCU])[.]([^.@]+)(?:[.]([^@]+))?@$/D';
89        $addTok = static function ( $toks, $t ) use ( $flagRE ) {
90            if ( $t === self::EPSILON || preg_match( $flagRE, $t ) ) {
91                // skip token
92            } else {
93                $toks[] = $t;
94            }
95            return $toks;
96        };
97        $stack[] = [ $this->getStartState(), 0, [], [] ];
98        while ( count( $stack ) > 0 ) {
99            [ $state, $pos, $flags, $emitted ] = array_pop( $stack );
100            $tok = $pos < count( $input ) ? $input[$pos] : null;
101            if ( $tok !== null && !isset( $this->alphabet[$tok] ) ) {
102                $tok = self::UNKNOWN;
103            }
104            foreach ( $state->edges as $e ) {
105                if ( $isDown ) {
106                    $match = $e->upper;
107                    $output = $e->lower;
108                } else {
109                    $match = $e->lower;
110                    $output = $e->upper;
111                }
112                // Treat IDENTITY and UNKNOWN as identical (applyDown/applyUp)
113                if ( $match === self::IDENTITY ) {
114                    $match = self::UNKNOWN;
115                }
116                if ( $output === self::UNKNOWN ) {
117                    $output = self::IDENTITY;
118                }
119                // Does this edge match?
120                if ( $output === self::IDENTITY && $tok !== null ) {
121                    $output = $input[$pos];
122                }
123                if ( $match === self::EPSILON ) {
124                    $stack[] = [ $e->to, $pos, $flags, $addTok( $emitted, $output ) ];
125                } elseif ( preg_match( $flagRE, $match, $flagInfo, PREG_UNMATCHED_AS_NULL ) === 1 ) {
126                    $op = $flagInfo[1];
127                    $feature = $flagInfo[2];
128                    $value = $flagInfo[3] ?? null;
129                    $f = $flags[$feature] ?? null;
130                    if ( $op === 'P' ) {
131                        Assert::invariant( $value !== null, "P not C!" );
132                        $flags[$feature] = $value;
133                    } elseif ( $op === 'C' ) {
134                        Assert::invariant( $value === null, "C not P!" );
135                        $flags[$feature] = null;
136                    } elseif ( $op === 'N' ) {
137                        // @phan-suppress-next-line PhanImpossibleCondition
138                        Assert::invariant( false, 'N not supported' );
139                    } elseif ( $op === 'R' || $op === 'D' ) {
140                        $m = ( $value === null ) ? ( $f !== null ) : ( $f === $value );
141                        if ( $op === 'R' ? ( !$m ) : $m ) {
142                            continue;
143                        }
144                    } elseif ( $op === 'U' ) {
145                        Assert::invariant( $value !== null, "Unify with what?" );
146                        if ( $f === null || $f === $value ) {
147                            $flags[$feature] = $value;
148                        } else {
149                            continue;
150                        }
151                    }
152                    $stack[] = [ $e->to, $pos, $flags, $addTok( $emitted, $output ) ];
153                } elseif ( $match === $tok && $tok !== null ) {
154                    $stack[] = [ $e->to, $pos + 1, $flags, $addTok( $emitted, $output ) ];
155                }
156            }
157            if ( $tok === null && $state->isFinal ) {
158                $results[] = $emitted;
159            }
160        }
161        return $results;
162    }
163
164    /**
165     * Perform simple optimization on the FST, trimming unreachable states
166     * and attempting to combine edges containing epsilons.
167     */
168    public function optimize(): void {
169        $reachable = [];
170        $stateWorklist = [];
171        $stateWorklist[$this->getStartState()->id] = $this->getStartState();
172        while ( count( $stateWorklist ) > 0 ) {
173            $state = array_pop( $stateWorklist );
174            $reachable[$state->id] = true;
175            $edgeWorklist = $state->edges; // worklist
176            while ( count( $edgeWorklist ) > 0 ) {
177                $edge = array_pop( $edgeWorklist );
178                if ( $edge === null ) {
179                    continue; /* removed edge */
180                }
181
182                if ( $edge->to->isFinal ) {
183                    if ( !isset( $reachable[$edge->to->id] ) ) {
184                        $stateWorklist[$edge->to->id] = $edge->to;
185                    }
186                    continue;
187                }
188                $nEdge = count( $edge->to->edges );
189                if ( $nEdge === 0 ) {
190                    $state->edges[$edge->id] = null; // remove
191                    // XXX requeue all states which point to this
192                    continue;
193                } elseif ( $nEdge === 1 ) {
194                    $nextEdge = $edge->to->edges[0];
195                    if ( $nextEdge === null ) {
196                        /* deleted.  Fall through. */
197                    } elseif (
198                        $edge->lower === self::EPSILON &&
199                        $nextEdge->upper === self::EPSILON
200                    ) {
201                        $edge->lower = $nextEdge->lower;
202                        $edge->to = $nextEdge->to;
203                        $edgeWorklist[] = $edge;
204                        continue;
205                    } elseif (
206                        $edge->upper === self::EPSILON &&
207                        $nextEdge->lower == self::EPSILON
208                    ) {
209                        $edge->upper = $nextEdge->upper;
210                        $edge->to = $nextEdge->to;
211                        $edgeWorklist[] = $edge;
212                        continue;
213                    } elseif (
214                        $edge->upper === self::EPSILON &&
215                        $edge->lower === self::EPSILON
216                    ) {
217                        $edge->upper = $nextEdge->upper;
218                        $edge->lower = $nextEdge->lower;
219                        $edge->to = $nextEdge->to;
220                        $edgeWorklist[] = $edge;
221                        continue;
222                    } elseif (
223                        $nextEdge->upper === self::EPSILON &&
224                        $nextEdge->lower === self::EPSILON
225                    ) {
226                        $edge->to = $nextEdge->to;
227                        $edgeWorklist[] = $edge;
228                        continue;
229                    }
230                }
231                // fall through
232                if ( !isset( $reachable[$edge->to->id] ) ) {
233                    $stateWorklist[$edge->to->id] = $edge->to;
234                }
235            }
236        }
237        // Renumber states and edges
238        $j = 0;
239        for ( $i = 0; $i < count( $this->states ); $i++ ) {
240            $s = $this->states[$i];
241            if ( isset( $reachable[$s->id] ) ) {
242                $s->id = $j++;
243                $this->states[$s->id] = $s;
244                // renumber edges
245                $k = 0;
246                for ( $l = 0; $l < count( $s->edges ); $l++ ) {
247                    $e = $s->edges[$l];
248                    if ( $e !== null ) {
249                        $e->id = $k++;
250                        $s->edges[$e->id] = $e;
251                    }
252                }
253                array_splice( $s->edges, $k );
254            }
255        }
256        array_splice( $this->states, $j );
257    }
258
259    /**
260     * Write an AT&T format description of this FST to the given filehandle.
261     * @param resource $handle
262     */
263    public function writeATT( $handle ): void {
264        // Write an AT&T format FST file
265        foreach ( $this->states as $state ) {
266            $state->writeATT( $handle );
267        }
268        // Now write the final states
269        foreach ( $this->states as $state ) {
270            if ( $state->isFinal ) {
271                fwrite( $handle, strval( $state->id ) . "\n" );
272            }
273        }
274    }
275}