Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 23 |
|
0.00% |
0 / 2 |
CRAP | |
0.00% |
0 / 1 |
UnderlinkedFunctionScoreBuilder | |
0.00% |
0 / 23 |
|
0.00% |
0 / 2 |
6 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
append | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace GrowthExperiments\NewcomerTasks\TaskSuggester; |
4 | |
5 | use CirrusSearch\Search\Rescore\BoostFunctionBuilder; |
6 | use Elastica\Query\FunctionScore; |
7 | use Elastica\Script\Script; |
8 | |
/**
 * CirrusSearch rescore function that ranks underlinked articles higher and blends that
 * ranking with a random score.
 * @see https://www.mediawiki.org/wiki/Extension:CirrusSearch/Scoring#Rescoring
 */
class UnderlinkedFunctionScoreBuilder implements BoostFunctionBuilder {

	/** Function type used in the rescore profile */
	public const TYPE = 'growth_underlinked';

	/** @var float */
	private $weight;

	/** @var int */
	private $minimumLength;

	/**
	 * @param float $weight Weight given to the underlinkedness score; the random score
	 *   function receives the complementary weight (1 - $weight).
	 * @param int $minimumLength Articles whose text_bytes is below this value get an
	 *   underlinkedness score of 0.
	 */
	public function __construct( float $weight, int $minimumLength ) {
		$this->minimumLength = $minimumLength;
		$this->weight = $weight;
	}

	/**
	 * Adds the underlinkedness script score and a random score to the container,
	 * and makes the container sum the two.
	 *
	 * @inheritDoc
	 */
	public function append( FunctionScore $container ) {
		$container->addScriptScoreFunction(
			$this->buildUnderlinkednessScript(),
			null,
			$this->weight
		);

		// Mix with a random factor so everyone doesn't get the same list of articles.
		$randomSeed = random_int( 1, PHP_INT_MAX );
		$randomWeight = 1 - $this->weight;
		$container->addRandomScoreFunction( $randomSeed, null, $randomWeight, '_seq_no' );

		$container->setScoreMode( FunctionScore::SCORE_MODE_SUM );
	}

	/**
	 * Builds the expression script that computes the underlinkedness score.
	 *
	 * The score is 0 when text_bytes is below the minimum length. Otherwise it is
	 * one minus the ratio of outgoing links to words (not allowed to drop below 0),
	 * i.e. roughly the chance that a randomly picked word is not a link - multi-word
	 * links are not accounted for. That value sits very close to 1, so it is raised
	 * to a power to spread it out over the [0,1] range.
	 * See https://phabricator.wikimedia.org/T317546#8246903 for why .length is used.
	 *
	 * @return Script
	 */
	private function buildUnderlinkednessScript(): Script {
		// Picked arbitrarily: it produced nice values for a few sample articles.
		$smoothingFactor = 4;

		// Keep this in sync with tests/api-testing/underlinked-rescore.js
		$expression = /** @lang JavaScript */
			<<<'SCRIPT'
doc['text_bytes'] >= minimumLength
? pow(
max(
0,
1 - (
doc['outgoing_link.token_count'].length
/ max( 1, doc['text.word_count'] )
)
),
smoothingFactor
)
: 0
SCRIPT;
		// Collapse every whitespace run to a single space so the script is one line.
		$singleLine = trim( preg_replace( '/\s+/', ' ', $expression ) );

		return new Script(
			$singleLine,
			[
				'minimumLength' => $this->minimumLength,
				'smoothingFactor' => $smoothingFactor,
			],
			Script::LANG_EXPRESSION
		);
	}

}