Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
15.28% |
11 / 72 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
FstReplacementMachine | |
15.28% |
11 / 72 |
|
0.00% |
0 / 5 |
192.75 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
getCodes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
loadFST | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
countBrackets | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
convert | |
25.58% |
11 / 43 |
|
0.00% |
0 / 1 |
27.19 |
1 | <?php |
2 | |
3 | namespace Wikimedia\LangConv; |
4 | |
5 | use DOMDocument; |
6 | use DOMDocumentFragment; |
7 | use Wikimedia\Assert\Assert; |
8 | |
/**
 * Replacement machine backed by pFST files loaded from the `fst` directory.
 *
 * For every supported language code one "convert" machine (`trans-<code>`) is
 * loaded, plus one "bracket" machine (`brack-<code>-<other>`) for every valid
 * code pair.
 */
class FstReplacementMachine extends ReplacementMachine {

	/** @var string Base language code passed to the constructor */
	private $baseLanguage;
	/** @var array<string,string> Supported codes; key and value are both the code */
	private $codes = [];
	/**
	 * @var array<string,array> Per-code machines: `convert` holds the `trans-*` FST and
	 *  `bracket` maps each valid partner code to its `brack-*` FST.
	 */
	private $machines = [];

	/**
	 * FstReplacementMachine constructor.
	 *
	 * Loads the conversion machine for each code and the bracket machine for each
	 * pair of codes accepted by isValidCodePair().
	 *
	 * @param string $baseLanguage
	 * @param string[] $codes
	 */
	public function __construct( $baseLanguage, $codes ) {
		parent::__construct();
		$this->baseLanguage = $baseLanguage;
		foreach ( $codes as $code ) {
			// Set key *and* value of `codes` to allow use as set
			$this->codes[ $code ] = $code;
			$bracketMachines = [];
			foreach ( $codes as $code2 ) {
				if ( !$this->isValidCodePair( $code, $code2 ) ) {
					continue;
				}
				// A code paired with itself uses the dedicated "noop" bracket machine.
				$dstCode = $code === $code2 ? 'noop' : $code2;
				$bracketMachines[$code2] = $this->loadFST( "brack-$code-$dstCode", true );
			}
			$this->machines[$code] = [
				'convert' => $this->loadFST( "trans-$code" ),
				'bracket' => $bracketMachines,
			];
		}
	}

	/**
	 * Return the set of language codes supported. Both key and value are
	 * set in order to facilitate inclusion testing.
	 *
	 * @return array<string,string>
	 */
	public function getCodes() {
		return $this->codes;
	}

	/**
	 * Load a conversion machine from a pFST file with filename $filename from the fst directory.
	 * @param string $filename filename, omitting the .pfst file extension
	 * @param bool $justBrackets whether to return only the bracket locations
	 * @return FST
	 */
	public function loadFST( string $filename, bool $justBrackets = false ): FST {
		return FST::compile( __DIR__ . "/../fst/$filename.pfst", $justBrackets );
	}

	/**
	 * Quantify a guess about the "native" language of string `s`.
	 * We will be converting *to* `destCode`, and our guess is that when we round trip we'll want
	 * to convert back to `invertCode` (so `invertCode` is our guess about the actual language of
	 * `s`).
	 * If we were to make this encoding, the returned value `unsafe` is the number of codepoints
	 * we'd have to specially-escape, `safe` is the number of codepoints we wouldn't have to
	 * escape, and `len` is the total number of codepoints in `s`. Generally lower values of
	 * `unsafe` indicate a better guess for `invertCode`.
	 * @param string $s
	 * @param string $destCode
	 * @param string $invertCode
	 * @return BracketResult Statistics about the given guess.
	 */
	public function countBrackets( string $s, $destCode, $invertCode ) {
		Assert::precondition( $this->isValidCodePair( $destCode, $invertCode ),
			"Invalid code pair: $destCode/$invertCode" );
		$m = $this->machines[$destCode]['bracket'][$invertCode];
		// NOTE(review): the trailing `true` presumably asks for codepoint offsets
		// (see the note before the return) -- confirm against FST::run().
		$brackets = $m->run( $s, 0, strlen( $s ), true );
		$n = count( $brackets );
		$safe = 0;
		$unsafe = 0;
		// The offsets delimit alternating regions: a safe one first, then an unsafe
		// one, and so on. Walk them two boundaries at a time.
		for ( $i = 1; $i < $n; $i += 2 ) {
			$safe += ( $brackets[$i] - $brackets[$i - 1] );
			if ( $i + 1 < $n ) {
				$unsafe += ( $brackets[$i + 1] - $brackets[$i] );
			}
		}
		// Note that this is counting codepoints, not UTF-8 code units.
		// Guard the final-offset lookup so an empty offset list yields a length of 0
		// instead of reading index -1.
		return new BracketResult(
			$safe, $unsafe, $n > 0 ? $brackets[$n - 1] : 0
		);
	}

	/**
	 * Convert a string of text.
	 *
	 * Safe regions are emitted as plain text nodes. Unsafe regions are wrapped in a
	 * `<span typeof="mw:LanguageVariant">` whose `data-mw-variant` attribute records
	 * both the original and the converted text.
	 *
	 * @param DOMDocument $document
	 * @param string $s text to convert
	 * @param string $destCode destination language code
	 * @param string $invertCode code selecting which bracket machine of `destCode` is used
	 * @return DOMDocumentFragment DocumentFragment containing converted text
	 */
	public function convert( $document, $s, $destCode, $invertCode ) {
		$machine = $this->machines[$destCode];
		$convertM = $machine['convert'];
		$bracketM = $machine['bracket'][$invertCode];
		$result = $document->createDocumentFragment();

		$brackets = $bracketM->run( $s );

		for ( $i = 1, $len = count( $brackets ); $i < $len; $i++ ) {
			// A safe string
			$safe = $convertM->run( $s, $brackets[$i - 1], $brackets[$i] );
			if ( strlen( $safe ) > 0 ) {
				$result->appendChild( $document->createTextNode( $safe ) );
			}
			if ( ++$i < $len ) {
				// An unsafe string
				$orig = substr( $s, $brackets[$i - 1], $brackets[$i] - $brackets[$i - 1] );
				$unsafe = $convertM->run( $s, $brackets[$i - 1], $brackets[$i] );
				$span = $document->createElement( 'span' );
				$span->textContent = $unsafe;
				$span->setAttribute( 'typeof', 'mw:LanguageVariant' );
				// If this is an anomalous piece of text in a paragraph otherwise written in
				// destCode, then it's possible invertCode === destCode. In this case try to pick a
				// more appropriate invertCode !== destCode.
				$ic = $invertCode;
				if ( $ic === $destCode ) {
					$cs = array_values( array_filter( $this->codes, static function ( $code ) use ( $destCode ) {
						return $code !== $destCode;
					} ) );
					$cs = array_map( function ( $code ) use ( $orig ) {
						return [
							'code' => $code,
							'stats' => $this->countBrackets( $orig, $code, $code ),
						];
					}, $cs );
					// usort() reindexes from 0, so $cs[0] below really is the candidate
					// with the fewest unsafe codepoints. (uasort() would keep the original
					// keys, leaving $cs[0] as the first candidate regardless of its score.)
					usort( $cs, static function ( $a, $b ) {
						return $a['stats']->unsafe <=> $b['stats']->unsafe;
					} );
					if ( count( $cs ) === 0 ) {
						$ic = '-';
					} else {
						$ic = $cs[0]['code'];
						$span->setAttribute( 'data-mw-variant-lang', $ic );
					}
				}
				$span->setAttribute( 'data-mw-variant', $this->jsonEncode( [
					'twoway' => [
						[ 'l' => $ic, 't' => $orig ],
						[ 'l' => $destCode, 't' => $unsafe ],
					],
					'rt' => true /* Synthetic markup used for round-tripping */
				] ) );
				if ( strlen( $unsafe ) > 0 ) {
					$result->appendChild( $span );
				}
			}
		}
		return $result;
	}
}