Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 64 |
MachineLanguageGuesser | |
0.00% |
0 / 1 |
|
0.00% |
0 / 3 |
272 | |
0.00% |
0 / 64 |
__construct | |
0.00% |
0 / 1 |
182 | |
0.00% |
0 / 56 |
|||
getNodeData | |
0.00% |
0 / 1 |
6 | |
0.00% |
0 / 6 |
|||
guessLang | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 2 |
<?php | |
declare( strict_types = 1 ); | |
namespace Wikimedia\Parsoid\Language; | |
use stdClass; | |
use Wikimedia\LangConv\FstReplacementMachine; | |
use Wikimedia\Parsoid\DOM\Element; | |
use Wikimedia\Parsoid\DOM\Node; | |
use Wikimedia\Parsoid\DOM\Text; | |
use Wikimedia\Parsoid\Utils\DOMDataUtils; | |
use Wikimedia\Parsoid\Utils\DOMPostOrder; | |
/** | |
* Use a {@Link ReplacementMachine} to predict the best "source language" for every node in a DOM. | |
* Appropriate for wikis which are written in a mix of variants. | |
*/ | |
class MachineLanguageGuesser extends LanguageGuesser { | |
/** | |
* MachineLanguageGuesser constructor. | |
* @param FstReplacementMachine $machine | |
* @param Node $root | |
* @param string $destCode | |
*/ | |
public function __construct( FstReplacementMachine $machine, Node $root, $destCode ) { | |
$codes = []; | |
foreach ( $machine->getCodes() as $invertCode => $ignore ) { | |
if ( $machine->isValidCodePair( $destCode, $invertCode ) ) { | |
$codes[] = $invertCode; | |
} | |
} | |
$zeroCounts = []; | |
foreach ( $codes as $invertCode ) { | |
$zeroCounts[$invertCode] = 0; | |
} | |
DOMPostOrder::traverse( | |
$root, function ( Node &$node ) use ( | |
$machine, $codes, $destCode, $zeroCounts | |
) { | |
if ( !( $node instanceof Element ) ) { | |
// Elements only! | |
return; | |
} | |
// XXX look at `lang` attribute and use it to inform guess? | |
$nodeData = self::getNodeData( $node ); | |
$first = true; | |
// Iterate over child *nodes* (not just elements) | |
for ( $child = $node->firstChild; | |
$child; | |
$child = $child->nextSibling | |
) { | |
if ( $child instanceof Text ) { | |
$countMap = []; | |
foreach ( $codes as $invertCode ) { | |
$countMap[$invertCode] = $machine->countBrackets( | |
$child->textContent, | |
$destCode, | |
$invertCode | |
)->safe; | |
} | |
} elseif ( $child instanceof Element ) { | |
$countMap = self::getNodeData( $child )->countMap; | |
} else { | |
continue; // skip this non-element non-text node | |
} | |
if ( $first ) { | |
$nodeData->countMap = $countMap; | |
$first = false; | |
} else { | |
// accumulate child counts! | |
foreach ( $codes as $c ) { | |
$nodeData->countMap[$c] += $countMap[$c]; | |
} | |
} | |
} | |
if ( $first ) { | |
$nodeData->countMap = $zeroCounts; | |
} | |
// Compute best guess for language | |
$safe = []; | |
foreach ( $codes as $code ) { | |
$safe[$code] = $nodeData->countMap[$code]; | |
} | |
arsort( $safe ); | |
$nodeData->guessLang = array_keys( $safe )[0]; | |
} ); | |
} | |
/** | |
* Helper function that namespaces all of our node data used in | |
* this class into the top-level `mw_variant` key. | |
* | |
* @param Element $node | |
* @return stdClass | |
*/ | |
private static function getNodeData( Element $node ): stdClass { | |
$nodeData = DOMDataUtils::getNodeData( $node ); | |
if ( !isset( $nodeData->mw_variant ) ) { | |
$nodeData->mw_variant = new stdClass; | |
} | |
return $nodeData->mw_variant; | |
} | |
/** @inheritDoc */ | |
public function guessLang( Element $node ): string { | |
return self::getNodeData( $node )->guessLang; | |
} | |
} |