Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 135
0.00% covered (danger)
0.00%
0 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
MutableFST
0.00% covered (danger)
0.00%
0 / 135
0.00% covered (danger)
0.00%
0 / 8
3906
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 newState
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 getStartState
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyDown
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyUp
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 applyUpDown
0.00% covered (danger)
0.00%
0 / 56
0.00% covered (danger)
0.00%
0 / 1
930
 optimize
0.00% covered (danger)
0.00%
0 / 62
0.00% covered (danger)
0.00%
0 / 1
506
 writeATT
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
20
1<?php
2
3namespace Wikimedia\LangConv\Construct;
4
5use Wikimedia\Assert\Assert;
6
7/**
8 * A mutable FST.
9 */
10class MutableFST {
11    /** Special alphabet symbol for `nothing` used in AT&T format FST files. */
12    public const EPSILON = '@0@';
13    /** Special alphabet symbol for upper-side `?` used in AT&T format FST files. */
14    public const UNKNOWN = '@_UNKNOWN_SYMBOL_@';
15    /** Special alphabet symbol for lower-side `?` used in AT&T format FST files. */
16    public const IDENTITY = '@_IDENTITY_SYMBOL_@';
17
18    private $alphabet = [];
19    private $states = [];
20
21    /**
22     * Create a new MutableFST with the given alphabet.
23     * @param string[] $alphabet
24     */
25    public function __construct( array $alphabet ) {
26        // Create start state
27        $this->newState();
28        // Set up alphabet
29        foreach ( $alphabet as $tok ) {
30            $this->alphabet[$tok] = true;
31        }
32        $this->alphabet[self::EPSILON] = true;
33        $this->alphabet[self::IDENTITY] = true;
34        $this->alphabet[self::UNKNOWN] = true;
35    }
36
37    /**
38     * Add a new state to this FST.
39     * @return State the new state.
40     */
41    public function newState(): State {
42        $id = count( $this->states );
43        $s = $this->states[] = new State( $id );
44        return $s;
45    }
46
47    /**
48     * Return the start state of this FST.
49     * @return State the start state.
50     */
51    public function getStartState(): State {
52        return $this->states[0];
53    }
54
55    /**
56     * Apply the given FST to the array of input tokens.  Symbols in
57     * 'upper' are matched and symbols in 'lower' are emitted.
58     * @param string[] $input
59     * @return array<array<string>>
60     */
61    public function applyDown( array $input ): array {
62        return $this->applyUpDown( $input, true );
63    }
64
65    /**
66     * Apply the given FST to the array of input tokens.  Symbols in
67     * 'lower' are matched and symbols in 'upper' are emitted.
68     * @param string[] $input
69     * @return array<array<string>>
70     */
71    public function applyUp( array $input ): array {
72        return $this->applyUpDown( $input, false );
73    }
74
75    /**
76     * Apply the given FST to the array of input tokens.
77     * @param string[] $input array of input tokens
78     * @param bool $isDown Whether to apply 'up' or 'down'
79     * @return array<array<string>>
80     */
81    private function applyUpDown( array $input, bool $isDown ): array {
82        // This is an extremely simple implementation, made for validating
83        // the FST not efficient execution.
84        $results = [];
85        $stack = [];
86        $flagRE = '/^@([PNRDCU])[.]([^.@]+)(?:[.]([^@]+))?@$/D';
87        $addTok = static function ( $toks, $t ) use ( $flagRE ) {
88            if ( $t === self::EPSILON || preg_match( $flagRE, $t ) ) {
89                // skip token
90            } else {
91                $toks[] = $t;
92            }
93            return $toks;
94        };
95        $stack[] = [ $this->getStartState(), 0, [], [] ];
96        while ( count( $stack ) > 0 ) {
97            [ $state, $pos, $flags, $emitted ] = array_pop( $stack );
98            $tok = $pos < count( $input ) ? $input[$pos] : null;
99            if ( $tok !== null && !isset( $this->alphabet[$tok] ) ) {
100                $tok = self::UNKNOWN;
101            }
102            foreach ( $state->edges as $e ) {
103                if ( $isDown ) {
104                    $match = $e->upper;
105                    $output = $e->lower;
106                } else {
107                    $match = $e->lower;
108                    $output = $e->upper;
109                }
110                // Treat IDENTITY and UNKNOWN as identical (applyDown/applyUp)
111                if ( $match === self::IDENTITY ) {
112                    $match = self::UNKNOWN;
113                }
114                if ( $output === self::UNKNOWN ) {
115                    $output = self::IDENTITY;
116                }
117                // Does this edge match?
118                if ( $output === self::IDENTITY && $tok !== null ) {
119                    $output = $input[$pos];
120                }
121                if ( $match === self::EPSILON ) {
122                    $stack[] = [ $e->to, $pos, $flags, $addTok( $emitted, $output ) ];
123                } elseif ( preg_match( $flagRE, $match, $flagInfo, PREG_UNMATCHED_AS_NULL ) === 1 ) {
124                    $op = $flagInfo[1];
125                    $feature = $flagInfo[2];
126                    $value = $flagInfo[3] ?? null;
127                    $f = $flags[$feature] ?? null;
128                    if ( $op === 'P' ) {
129                        Assert::invariant( $value !== null, "P not C!" );
130                        $flags[$feature] = $value;
131                    } elseif ( $op === 'C' ) {
132                        Assert::invariant( $value === null, "C not P!" );
133                        $flags[$feature] = null;
134                    } elseif ( $op === 'N' ) {
135                        // @phan-suppress-next-line PhanImpossibleCondition
136                        Assert::invariant( false, 'N not supported' );
137                    } elseif ( $op === 'R' || $op === 'D' ) {
138                        $m = ( $value === null ) ? ( $f !== null ) : ( $f === $value );
139                        if ( $op === 'R' ? ( !$m ) : $m ) {
140                            continue;
141                        }
142                    } elseif ( $op === 'U' ) {
143                        Assert::invariant( $value !== null, "Unify with what?" );
144                        if ( $f === null || $f === $value ) {
145                            $flags[$feature] = $value;
146                        } else {
147                            continue;
148                        }
149                    }
150                    $stack[] = [ $e->to, $pos, $flags, $addTok( $emitted, $output ) ];
151                } elseif ( $match === $tok && $tok !== null ) {
152                    $stack[] = [ $e->to, $pos + 1, $flags, $addTok( $emitted, $output ) ];
153                }
154            }
155            if ( $tok === null && $state->isFinal ) {
156                $results[] = $emitted;
157            }
158        }
159        return $results;
160    }
161
162    /**
163     * Perform simple optimization on the FST, trimming unreachable states
164     * and attempting to combine edges containing epsilons.
165     */
166    public function optimize(): void {
167        $reachable = [];
168        $stateWorklist = [];
169        $stateWorklist[$this->getStartState()->id] = $this->getStartState();
170        while ( count( $stateWorklist ) > 0 ) {
171            $state = array_pop( $stateWorklist );
172            $reachable[$state->id] = true;
173            $edgeWorklist = $state->edges; // worklist
174            while ( count( $edgeWorklist ) > 0 ) {
175                $edge = array_pop( $edgeWorklist );
176                if ( $edge === null ) {
177                    continue; /* removed edge */
178                }
179
180                if ( $edge->to->isFinal ) {
181                    if ( !isset( $reachable[$edge->to->id] ) ) {
182                        $stateWorklist[$edge->to->id] = $edge->to;
183                    }
184                    continue;
185                }
186                $nEdge = count( $edge->to->edges );
187                if ( $nEdge === 0 ) {
188                    $state->edges[$edge->id] = null; // remove
189                    // XXX requeue all states which point to this
190                    continue;
191                } elseif ( $nEdge === 1 ) {
192                    $nextEdge = $edge->to->edges[0];
193                    if ( $nextEdge === null ) {
194                        /* deleted.  Fall through. */
195                    } elseif (
196                        $edge->lower === self::EPSILON &&
197                        $nextEdge->upper === self::EPSILON
198                    ) {
199                        $edge->lower = $nextEdge->lower;
200                        $edge->to = $nextEdge->to;
201                        $edgeWorklist[] = $edge;
202                        continue;
203                    } elseif (
204                        $edge->upper === self::EPSILON &&
205                        $nextEdge->lower == self::EPSILON
206                    ) {
207                        $edge->upper = $nextEdge->upper;
208                        $edge->to = $nextEdge->to;
209                        $edgeWorklist[] = $edge;
210                        continue;
211                    } elseif (
212                        $edge->upper === self::EPSILON &&
213                        $edge->lower === self::EPSILON
214                    ) {
215                        $edge->upper = $nextEdge->upper;
216                        $edge->lower = $nextEdge->lower;
217                        $edge->to = $nextEdge->to;
218                        $edgeWorklist[] = $edge;
219                        continue;
220                    } elseif (
221                        $nextEdge->upper === self::EPSILON &&
222                        $nextEdge->lower === self::EPSILON
223                    ) {
224                        $edge->to = $nextEdge->to;
225                        $edgeWorklist[] = $edge;
226                        continue;
227                    }
228                }
229                // fall through
230                if ( !isset( $reachable[$edge->to->id] ) ) {
231                    $stateWorklist[$edge->to->id] = $edge->to;
232                }
233            }
234        }
235        // Renumber states and edges
236        $j = 0;
237        for ( $i = 0; $i < count( $this->states ); $i++ ) {
238            $s = $this->states[$i];
239            if ( isset( $reachable[$s->id] ) ) {
240                $s->id = $j++;
241                $this->states[$s->id] = $s;
242                // renumber edges
243                $k = 0;
244                for ( $l = 0; $l < count( $s->edges ); $l++ ) {
245                    $e = $s->edges[$l];
246                    if ( $e !== null ) {
247                        $e->id = $k++;
248                        $s->edges[$e->id] = $e;
249                    }
250                }
251                array_splice( $s->edges, $k );
252            }
253        }
254        array_splice( $this->states, $j );
255    }
256
257    /**
258     * Write an AT&T format description of this FST to the given filehandle.
259     * @param resource $handle
260     */
261    public function writeATT( $handle ): void {
262        // Write an AT&T format FST file
263        foreach ( $this->states as $state ) {
264            $state->writeATT( $handle );
265        }
266        // Now write the final states
267        foreach ( $this->states as $state ) {
268            if ( $state->isFinal ) {
269                fwrite( $handle, strval( $state->id ) . "\n" );
270            }
271        }
272    }
273}