Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
15.28% |
11 / 72 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
FstReplacementMachine | |
15.28% |
11 / 72 |
|
0.00% |
0 / 5 |
192.75 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
getCodes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
loadFST | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
countBrackets | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
convert | |
25.58% |
11 / 43 |
|
0.00% |
0 / 1 |
27.19 |
1 | <?php |
2 | |
3 | namespace Wikimedia\LangConv; |
4 | |
5 | use DOMDocument; |
6 | use DOMDocumentFragment; |
7 | use Wikimedia\Assert\Assert; |
8 | |
9 | class FstReplacementMachine extends ReplacementMachine { |
10 | |
11 | private $baseLanguage; |
12 | private $codes = []; |
13 | private $machines = []; |
14 | |
/**
 * Build the conversion and bracket machines for every supported language code.
 *
 * For each code, one "trans-<code>" machine is loaded, along with one bracket
 * machine for every other code that forms a valid pair with it.
 *
 * @param string $baseLanguage
 * @param string[] $codes
 */
public function __construct( $baseLanguage, $codes ) {
	parent::__construct();
	$this->baseLanguage = $baseLanguage;
	foreach ( $codes as $srcCode ) {
		// Key and value are identical so that `codes` can double as a set.
		$this->codes[ $srcCode ] = $srcCode;
		$brackets = [];
		foreach ( $codes as $otherCode ) {
			if ( $this->isValidCodePair( $srcCode, $otherCode ) ) {
				// A code paired with itself loads the "noop" bracket file.
				$suffix = ( $srcCode !== $otherCode ) ? $otherCode : 'noop';
				$brackets[$otherCode] = $this->loadFST( "brack-$srcCode-$suffix", true );
			}
		}
		// Bracket machines are loaded before the conversion machine.
		$convertMachine = $this->loadFST( "trans-$srcCode" );
		$this->machines[$srcCode] = [
			'convert' => $convertMachine,
			'bracket' => $brackets,
		];
	}
}
40 | |
/**
 * Get the supported language codes as a set-like array.
 *
 * Each entry uses the language code as both its key and its value, which
 * lets callers test for membership by key.
 *
 * @return array<string,string>
 */
public function getCodes() {
	$supported = $this->codes;
	return $supported;
}
50 | |
/**
 * Compile a conversion machine from a pFST file in the sibling `fst` directory.
 *
 * @param string $filename base name of the file, without the .pfst extension
 * @param bool $justBrackets whether to return only the bracket locations
 * @return FST
 */
public function loadFST( string $filename, bool $justBrackets = false ): FST {
	// The fst directory lives one level above this source file.
	$path = __DIR__ . "/../fst/$filename.pfst";
	return FST::compile( $path, $justBrackets );
}
60 | |
/**
 * Score a guess about the "native" language of the string `s`.
 *
 * The text will be converted *to* `destCode`; `invertCode` is the guess for
 * the language `s` is actually written in, i.e. the language we expect to
 * convert back to when round-tripping.
 *
 * In the result, `unsafe` is the number of codepoints that would need special
 * escaping under this guess, `safe` is the number that would not, and `len`
 * is the total number of codepoints in `s`. Generally, a lower `unsafe` value
 * indicates a better guess for `invertCode`.
 *
 * @param string $s
 * @param string $destCode
 * @param string $invertCode
 * @return BracketResult Statistics about the given guess.
 */
public function countBrackets( string $s, $destCode, $invertCode ) {
	Assert::precondition(
		$this->isValidCodePair( $destCode, $invertCode ),
		"Invalid code pair: $destCode/$invertCode"
	);
	$bracketMachine = $this->machines[$destCode]['bracket'][$invertCode];
	// NOTE(review): the trailing `true` presumably makes run() report offsets in
	// codepoints rather than UTF-8 code units - confirm against FST::run().
	$brackets = $bracketMachine->run( $s, 0, strlen( $s ), true );
	$total = count( $brackets );
	$safe = 0;
	$unsafe = 0;
	// Boundaries alternate: each odd index closes a safe run, and the even
	// index after it (when present) closes the following unsafe run.
	for ( $i = 1; $i < $total; $i += 2 ) {
		$safe += $brackets[$i] - $brackets[$i - 1];
		if ( $i + 1 < $total ) {
			$unsafe += $brackets[$i + 1] - $brackets[$i];
		}
	}
	// The final boundary is passed as the overall length.
	return new BracketResult( $safe, $unsafe, $brackets[$total - 1] );
}
94 | |
/**
 * Convert a string of text.
 *
 * The bracket machine for ($destCode, $invertCode) splits `$s` into
 * alternating "safe" and "unsafe" runs. Safe runs are converted and appended
 * as plain text nodes. Unsafe runs are converted and wrapped in a
 * `<span typeof="mw:LanguageVariant">` whose `data-mw-variant` attribute
 * records the original text so that it can be round-tripped.
 *
 * @param DOMDocument $document document used to create the returned nodes
 * @param string $s text to convert
 * @param string $destCode destination language code
 * @param string $invertCode guess for the language `$s` is written in
 * @return DOMDocumentFragment DocumentFragment containing converted text
 */
public function convert( $document, $s, $destCode, $invertCode ) {
	$machine = $this->machines[$destCode];
	$convertM = $machine['convert'];
	$bracketM = $machine['bracket'][$invertCode];
	$result = $document->createDocumentFragment();

	$brackets = $bracketM->run( $s );

	for ( $i = 1, $len = count( $brackets ); $i < $len; $i++ ) {
		// A safe string
		$safe = $convertM->run( $s, $brackets[$i - 1], $brackets[$i] );
		if ( strlen( $safe ) > 0 ) {
			$result->appendChild( $document->createTextNode( $safe ) );
		}
		if ( ++$i < $len ) {
			// An unsafe string
			$orig = substr( $s, $brackets[$i - 1], $brackets[$i] - $brackets[$i - 1] );
			$unsafe = $convertM->run( $s, $brackets[$i - 1], $brackets[$i] );
			$span = $document->createElement( 'span' );
			$span->textContent = $unsafe;
			$span->setAttribute( 'typeof', 'mw:LanguageVariant' );
			// If this is an anomalous piece of text in a paragraph otherwise written in
			// destCode, then it's possible invertCode === destCode. In this case try to pick a
			// more appropriate invertCode !== destCode.
			$ic = $invertCode;
			if ( $ic === $destCode ) {
				$candidates = array_values( array_filter(
					$this->codes,
					static function ( $code ) use ( $destCode ) {
						return $code !== $destCode;
					}
				) );
				$candidates = array_map( function ( $code ) use ( $orig ) {
					return [
						'code' => $code,
						'stats' => $this->countBrackets( $orig, $code, $code ),
					];
				}, $candidates );
				// usort() (not uasort()) so the array is reindexed and element 0
				// really is the candidate with the fewest unsafe codepoints;
				// uasort() keeps the original keys, which made the sort a no-op
				// for the `[0]` lookup below.
				usort( $candidates, static function ( $a, $b ) {
					return $a['stats']->unsafe <=> $b['stats']->unsafe;
				} );
				if ( count( $candidates ) === 0 ) {
					$ic = '-';
				} else {
					$ic = $candidates[0]['code'];
					$span->setAttribute( 'data-mw-variant-lang', $ic );
				}
			}
			$span->setAttribute( 'data-mw-variant', $this->jsonEncode( [
				'twoway' => [
					[ 'l' => $ic, 't' => $orig ],
					[ 'l' => $destCode, 't' => $unsafe ],
				],
				'rt' => true /* Synthetic markup used for round-tripping */
			] ) );
			// Spans whose converted text is empty are not emitted.
			if ( strlen( $unsafe ) > 0 ) {
				$result->appendChild( $span );
			}
		}
	}
	return $result;
}
162 | } |