Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 48 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
MachineLanguageGuesser | |
0.00% |
0 / 48 |
|
0.00% |
0 / 3 |
272 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 43 |
|
0.00% |
0 / 1 |
182 | |||
getNodeData | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
guessLang | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Language; |
5 | |
6 | use stdClass; |
7 | use Wikimedia\Bcp47Code\Bcp47Code; |
8 | use Wikimedia\LangConv\FstReplacementMachine; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
13 | use Wikimedia\Parsoid\Utils\DOMPostOrder; |
14 | use Wikimedia\Parsoid\Utils\Utils; |
15 | |
/**
 * Predicts the best "source language" for every element of a DOM by
 * consulting an {@link FstReplacementMachine}.
 * Appropriate for wikis which are written in a mix of variants.
 */
class MachineLanguageGuesser extends LanguageGuesser {

	/**
	 * Annotate every Element reachable from $root with a language guess.
	 *
	 * For each Element handed to the visitor, the per-code `safe` bracket
	 * counts of its Text children and the already stored `countMap` of its
	 * Element children are summed. The totals are saved as `countMap` in the
	 * element's `mw_variant` node data, and the code with the largest total
	 * is saved next to it as `guessLang`.
	 *
	 * @param FstReplacementMachine $machine
	 * @param Node $root
	 * @param Bcp47Code $destCode a language code
	 */
	public function __construct( FstReplacementMachine $machine, Node $root, $destCode ) {
		# T320662 This code uses MW-internal codes internally
		$destCode = Utils::bcp47ToMwCode( $destCode );

		// Keep only the machine's codes that form a valid pair with the
		// destination code; these are the candidate source languages.
		$candidates = [];
		foreach ( $machine->getCodes() as $candidate => $unused ) {
			if ( !$machine->isValidCodePair( $destCode, $candidate ) ) {
				continue;
			}
			$candidates[] = $candidate;
		}
		// Count map used for elements without any text or element children.
		$noCounts = array_fill_keys( $candidates, 0 );

		$annotate = function ( Node $node ) use ( $machine, $candidates, $destCode, $noCounts ) {
			// Elements only!
			if ( !( $node instanceof Element ) ) {
				return;
			}
			// XXX look at `lang` attribute and use it to inform guess?
			$data = self::getNodeData( $node );

			// Gather one count map per relevant child *node* (not just elements).
			$childMaps = [];
			$next = $node->firstChild;
			while ( $next ) {
				$child = $next;
				$next = $child->nextSibling;

				if ( $child instanceof Text ) {
					$text = $child->textContent;
					$map = [];
					foreach ( $candidates as $candidate ) {
						$brackets = $machine->countBrackets( $text, $destCode, $candidate );
						$map[$candidate] = $brackets->safe;
					}
					$childMaps[] = $map;
				} elseif ( $child instanceof Element ) {
					// Reads the map stored on the child; this relies on the
					// child having been visited before its parent, as the
					// DOMPostOrder name suggests.
					$childMaps[] = self::getNodeData( $child )->countMap;
				}
				// Any other node type contributes nothing.
			}

			if ( $childMaps === [] ) {
				$data->countMap = $noCounts;
			} else {
				// Start from the first child's map and add the others to it.
				$totals = array_shift( $childMaps );
				foreach ( $childMaps as $map ) {
					foreach ( $candidates as $candidate ) {
						$totals[$candidate] += $map[$candidate];
					}
				}
				$data->countMap = $totals;
			}

			// Compute best guess for language: highest count first.
			$ranked = [];
			foreach ( $candidates as $candidate ) {
				$ranked[$candidate] = $data->countMap[$candidate];
			}
			arsort( $ranked );
			$ordered = array_keys( $ranked );
			// NOTE(review): when there are no candidate codes, $ordered is
			// empty and index 0 does not exist -- confirm whether callers
			// can ever hit this.
			$data->guessLang = $ordered[0];
		};

		DOMPostOrder::traverse( $root, $annotate );
	}

	/**
	 * Fetch the private data bag of this class for a node.
	 *
	 * Everything this class stores on a node lives under the top-level
	 * `mw_variant` key of the node data; the bag is created on first access.
	 *
	 * @param Element $node
	 * @return stdClass
	 */
	private static function getNodeData( Element $node ): stdClass {
		$data = DOMDataUtils::getNodeData( $node );
		if ( isset( $data->mw_variant ) ) {
			return $data->mw_variant;
		}
		$bag = new stdClass;
		$data->mw_variant = $bag;
		return $bag;
	}

	/** @inheritDoc */
	public function guessLang( Element $node ): Bcp47Code {
		// The guess is stored as an MW-internal code; convert it on the way out.
		$mwCode = self::getNodeData( $node )->guessLang;
		return Utils::mwCodeToBcp47( $mwCode );
	}
}