Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 48 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
| MachineLanguageGuesser | |
0.00% |
0 / 48 |
|
0.00% |
0 / 3 |
272 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 43 |
|
0.00% |
0 / 1 |
182 | |||
| getNodeData | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| guessLang | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Language; |
| 5 | |
| 6 | use stdClass; |
| 7 | use Wikimedia\Bcp47Code\Bcp47Code; |
| 8 | use Wikimedia\LangConv\FstReplacementMachine; |
| 9 | use Wikimedia\Parsoid\DOM\Element; |
| 10 | use Wikimedia\Parsoid\DOM\Node; |
| 11 | use Wikimedia\Parsoid\DOM\Text; |
| 12 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 13 | use Wikimedia\Parsoid\Utils\DOMPostOrder; |
| 14 | use Wikimedia\Parsoid\Utils\Utils; |
| 15 | |
/**
 * Use a {@link FstReplacementMachine} to predict the best "source language" for every node in a DOM.
 * Appropriate for wikis which are written in a mix of variants.
 *
 * The constructor annotates every Element under the given root with per-variant
 * counts and a best guess; {@see MachineLanguageGuesser::guessLang()} then just
 * reads the stored guess back.
 */
class MachineLanguageGuesser extends LanguageGuesser {

	/**
	 * MachineLanguageGuesser constructor.
	 *
	 * Walks the tree below $root in post-order. For each Element it sums, per
	 * candidate source code, the `safe` value reported by
	 * FstReplacementMachine::countBrackets() for its Text children plus the
	 * totals already stored on its Element children, and records both the
	 * totals (`countMap`) and the winning code (`guessLang`) in the node data.
	 *
	 * @param FstReplacementMachine $machine
	 * @param Node $root
	 * @param Bcp47Code $destCode a language code; converted to an MW-internal
	 *   code with Utils::bcp47ToMwCode() before use
	 */
	public function __construct( FstReplacementMachine $machine, Node $root, $destCode ) {
		# T320662 This code uses MW-internal codes internally
		$destCode = Utils::bcp47ToMwCode( $destCode );

		// Candidate source codes: every code the machine knows that forms a
		// valid pair with the destination code.
		$codes = [];
		foreach ( $machine->getCodes() as $invertCode => $_ignore ) {
			if ( $machine->isValidCodePair( $destCode, $invertCode ) ) {
				$codes[] = $invertCode;
			}
		}
		$zeroCounts = array_fill_keys( $codes, 0 );

		DOMPostOrder::traverse(
			$root,
			// Static: the visitor needs no access to the object being constructed.
			static function ( Node $node ) use ( $machine, $codes, $destCode, $zeroCounts ): void {
				if ( !( $node instanceof Element ) ) {
					// Elements only!
					return;
				}
				// XXX look at `lang` attribute and use it to inform guess?
				$nodeData = self::getNodeData( $node );

				// Start from all-zero totals so that an element without any
				// text/element children still gets a complete count map.
				$totals = $zeroCounts;

				// Iterate over child *nodes* (not just elements)
				for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
					if ( $child instanceof Text ) {
						$text = $child->textContent;
						foreach ( $codes as $code ) {
							$totals[$code] += $machine->countBrackets( $text, $destCode, $code )->safe;
						}
					} elseif ( $child instanceof Element ) {
						// Post-order traversal: this child was visited before its
						// parent, so its totals are already stored.
						$childCounts = self::getNodeData( $child )->countMap;
						foreach ( $codes as $code ) {
							$totals[$code] += $childCounts[$code];
						}
					}
					// Any other node type contributes nothing.
				}

				$nodeData->countMap = $totals;
				// Compute best guess for language
				$nodeData->guessLang = self::pickBestCode( $totals, $destCode );
			}
		);
	}

	/**
	 * Return the code with the highest count.
	 *
	 * Ties are resolved in favour of the code that appears first in $counts,
	 * so the result does not depend on the sort stability of the PHP version
	 * in use (arsort() only became stable in PHP 8.0).
	 *
	 * @param array $counts map of code => count
	 * @param mixed $fallback value returned when $counts is empty, i.e. when
	 *   there is no candidate code at all
	 * @return mixed the winning key of $counts, or $fallback
	 */
	private static function pickBestCode( array $counts, $fallback ) {
		$bestCode = $fallback;
		$bestCount = null;
		foreach ( $counts as $code => $count ) {
			// Strict `>` keeps the earliest code on a tie.
			if ( $bestCount === null || $count > $bestCount ) {
				$bestCode = $code;
				$bestCount = $count;
			}
		}
		return $bestCode;
	}

	/**
	 * Helper function that namespaces all of our node data used in
	 * this class into the top-level `mw_variant` key.
	 *
	 * The `mw_variant` object is created on first access, so the returned
	 * object can be written to directly.
	 *
	 * @param Element $node
	 * @return stdClass
	 */
	private static function getNodeData( Element $node ): stdClass {
		$nodeData = DOMDataUtils::getNodeData( $node );
		if ( !isset( $nodeData->mw_variant ) ) {
			$nodeData->mw_variant = new stdClass;
		}
		return $nodeData->mw_variant;
	}

	/** @inheritDoc */
	public function guessLang( Element $node ): Bcp47Code {
		// Reads the guess stored by the constructor; $node is expected to be an
		// Element that was reached by the traversal performed there.
		return Utils::mwCodeToBcp47( self::getNodeData( $node )->guessLang );
	}
}