Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
15.28% |
11 / 72 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
| FstReplacementMachine | |
15.28% |
11 / 72 |
|
0.00% |
0 / 5 |
192.75 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
| getCodes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| loadFST | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| countBrackets | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
| convert | |
25.58% |
11 / 43 |
|
0.00% |
0 / 1 |
27.19 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikimedia\LangConv; |
| 4 | |
| 5 | use DOMDocument; |
| 6 | use DOMDocumentFragment; |
| 7 | use Wikimedia\Assert\Assert; |
| 8 | |
/**
 * A ReplacementMachine whose conversions are performed by finite-state
 * transducers (FSTs) loaded from `.pfst` files in the `fst` directory.
 *
 * For each supported language code the machine holds one "trans-*" FST used to
 * convert text, plus one "brack-*" FST per valid partner code used to locate
 * the boundaries between spans of the input.
 */
class FstReplacementMachine extends ReplacementMachine {

	/** @var string Base language code, as passed to the constructor. */
	private $baseLanguage;
	/** @var array<string,string> Supported codes; key and value are identical so it can be used as a set. */
	private $codes = [];
	/**
	 * @var array<string,array> Machines indexed by language code. Each entry has a
	 *  'convert' FST and a 'bracket' map from partner code to bracket FST.
	 */
	private $machines = [];

	/**
	 * FstReplacementMachine constructor.
	 *
	 * Eagerly loads, for every code, its "trans-$code" machine and one bracket
	 * machine per code pair accepted by isValidCodePair().
	 *
	 * @param string $baseLanguage
	 * @param string[] $codes
	 */
	public function __construct( $baseLanguage, $codes ) {
		parent::__construct();
		$this->baseLanguage = $baseLanguage;
		foreach ( $codes as $code ) {
			// Set key *and* value of `codes` to allow use as set
			$this->codes[ $code ] = $code;
			$bracketMachines = [];
			foreach ( $codes as $code2 ) {
				if ( !$this->isValidCodePair( $code, $code2 ) ) {
					continue;
				}
				// A code paired with itself uses the dedicated "noop" bracket machine.
				$dstCode = $code === $code2 ? 'noop' : $code2;
				$bracketMachines[$code2] = $this->loadFST( "brack-$code-$dstCode", true );
			}
			$this->machines[$code] = [
				'convert' => $this->loadFST( "trans-$code" ),
				'bracket' => $bracketMachines,
			];
		}
	}

	/**
	 * Return the set of language codes supported. Both key and value are
	 * set in order to facilitate inclusion testing.
	 *
	 * @return array<string,string>
	 */
	public function getCodes() {
		return $this->codes;
	}

	/**
	 * Load a conversion machine from a pFST file with filename $filename from the fst directory.
	 * @param string $filename filename, omitting the .pfst file extension
	 * @param bool $justBrackets whether to return only the bracket locations
	 * @return FST
	 */
	public function loadFST( string $filename, bool $justBrackets = false ): FST {
		return FST::compile( __DIR__ . "/../fst/$filename.pfst", $justBrackets );
	}

	/**
	 * Quantify a guess about the "native" language of string `s`.
	 * We will be converting *to* `destCode`, and our guess is that when we round trip we'll want
	 * to convert back to `invertCode` (so `invertCode` is our guess about the actual language of
	 * `s`).
	 * If we were to make this encoding, the returned value `unsafe` is the number of codepoints
	 * we'd have to specially-escape, `safe` is the number of codepoints we wouldn't have to
	 * escape, and `len` is the total number of codepoints in `s`. Generally lower values of
	 * `unsafe` indicate a better guess for `invertCode`.
	 * @param string $s
	 * @param string $destCode
	 * @param string $invertCode
	 * @return BracketResult Statistics about the given guess.
	 */
	public function countBrackets( string $s, $destCode, $invertCode ) {
		Assert::precondition( $this->isValidCodePair( $destCode, $invertCode ),
			"Invalid code pair: $destCode/$invertCode" );
		$m = $this->machines[$destCode]['bracket'][$invertCode];
		$brackets = $m->run( $s, 0, strlen( $s ), true );
		$n = count( $brackets );
		$safe = 0;
		$unsafe = 0;
		// The boundary list is consumed pairwise: the gap ending at an odd index
		// is counted as safe, the gap ending at the following even index as unsafe.
		for ( $i = 1; $i < $n; $i += 2 ) {
			$safe += ( $brackets[$i] - $brackets[$i - 1] );
			if ( $i + 1 < $n ) {
				$unsafe += ( $brackets[$i + 1] - $brackets[$i] );
			}
		}
		// Note that this is counting codepoints, not UTF-8 code units.
		return new BracketResult(
			$safe, $unsafe, $brackets[$n - 1]
		);
	}

	/**
	 * Convert a string of text.
	 *
	 * Safe spans are emitted as plain text nodes. Each non-empty unsafe span is
	 * wrapped in a `<span typeof="mw:LanguageVariant">` carrying both the
	 * original and the converted text so the conversion can be round-tripped.
	 *
	 * @param DOMDocument $document
	 * @param string $s text to convert
	 * @param string $destCode destination language code
	 * @param string $invertCode
	 * @return DOMDocumentFragment DocumentFragment containing converted text
	 */
	public function convert( $document, $s, $destCode, $invertCode ) {
		$machine = $this->machines[$destCode];
		$convertM = $machine['convert'];
		$bracketM = $machine['bracket'][$invertCode];
		$result = $document->createDocumentFragment();

		$brackets = $bracketM->run( $s );

		for ( $i = 1, $len = count( $brackets ); $i < $len; $i += 2 ) {
			// A safe string
			$safe = $convertM->run( $s, $brackets[$i - 1], $brackets[$i] );
			if ( strlen( $safe ) > 0 ) {
				$result->appendChild( $document->createTextNode( $safe ) );
			}
			if ( $i + 1 >= $len ) {
				// No unsafe span follows the final safe span.
				break;
			}

			// An unsafe string
			$start = $brackets[$i];
			$end = $brackets[$i + 1];
			$orig = substr( $s, $start, $end - $start );
			$unsafe = $convertM->run( $s, $start, $end );
			if ( strlen( $unsafe ) === 0 ) {
				// Nothing would be appended for this span, so skip building the
				// markup (and the per-code bracket counting below) entirely.
				continue;
			}

			$span = $document->createElement( 'span' );
			$span->textContent = $unsafe;
			$span->setAttribute( 'typeof', 'mw:LanguageVariant' );
			// If this is an anomalous piece of text in a paragraph otherwise written in
			// destCode, then it's possible invertCode === destCode. In this case try to pick a
			// more appropriate invertCode !== destCode.
			$ic = $invertCode;
			if ( $ic === $destCode ) {
				// Pick the first code (in declaration order) with the fewest unsafe
				// codepoints. A linear scan is used instead of sorting: the previous
				// uasort() kept the original keys, so reading index 0 afterwards
				// ignored the sort result.
				$ic = '-';
				$fewestUnsafe = null;
				foreach ( $this->codes as $code ) {
					if ( $code === $destCode ) {
						continue;
					}
					$stats = $this->countBrackets( $orig, $code, $code );
					if ( $fewestUnsafe === null || $stats->unsafe < $fewestUnsafe ) {
						$fewestUnsafe = $stats->unsafe;
						$ic = $code;
					}
				}
				if ( $fewestUnsafe !== null ) {
					// Only record the language when an alternative code exists;
					// otherwise $ic stays '-' and no attribute is written.
					$span->setAttribute( 'data-mw-variant-lang', $ic );
				}
			}
			$span->setAttribute( 'data-mw-variant', $this->jsonEncode( [
				'twoway' => [
					[ 'l' => $ic, 't' => $orig ],
					[ 'l' => $destCode, 't' => $unsafe ],
				],
				'rt' => true /* Synthetic markup used for round-tripping */
			] ) );
			$result->appendChild( $span );
		}
		return $result;
	}
}