Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 23
0.00% covered (danger)
0.00%
0 / 2
CRAP
0.00% covered (danger)
0.00%
0 / 1
UnderlinkedFunctionScoreBuilder
0.00% covered (danger)
0.00%
0 / 23
0.00% covered (danger)
0.00%
0 / 2
6
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 append
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace GrowthExperiments\NewcomerTasks\TaskSuggester;
4
5use CirrusSearch\Search\Rescore\BoostFunctionBuilder;
6use Elastica\Query\FunctionScore;
7use Elastica\Script\Script;
8
/**
 * CirrusSearch rescore function builder: ranks underlinked articles higher and blends
 * that ranking with a random component.
 * @see https://www.mediawiki.org/wiki/Extension:CirrusSearch/Scoring#Rescoring
 */
class UnderlinkedFunctionScoreBuilder implements BoostFunctionBuilder {

    /** Name under which this function type is referenced from the rescore profile */
    public const TYPE = 'growth_underlinked';

    /** @var float Weight of the underlinkedness score; the random score is weighted 1 - weight. */
    private $weight;

    /** @var int Threshold compared against the document's text_bytes field. */
    private $minimumLength;

    /**
     * @param float $weight How much the underlinkedness metric counts in the sort order,
     *   relative to the random factor (which receives 1 - $weight).
     * @param int $minimumLength Articles shorter than this are never treated as underlinked.
     */
    public function __construct( float $weight, int $minimumLength ) {
        $this->minimumLength = $minimumLength;
        $this->weight = $weight;
    }

    /**
     * @inheritDoc
     */
    public function append( FunctionScore $container ) {
        // Exponent passed to the script as 'smoothingFactor'. An arbitrary pick that
        // produced nice values on a few sample articles.
        $exponent = 4;

        // Underlinkedness score:
        // - 0 for articles below the minimum length;
        // - otherwise (approximately, as multi-word links are not accounted for) the chance
        //   that a word picked at random from the article is not a link. That value sits
        //   very close to 1, so it is raised to a power to spread it over the [0,1] range.
        // See https://phabricator.wikimedia.org/T317546#8246903 for why .length is used.
        //
        // Keep this in sync with tests/api-testing/underlinked-rescore.js
        $expression = /** @lang JavaScript */
<<<'SCRIPT'
        doc['text_bytes'] >= minimumLength
            ? pow(
                max(
                    0,
                    1 - (
                        doc['outgoing_link.token_count'].length
                        / max( 1, doc['text.word_count'] )
                    )
                ),
                smoothingFactor
            )
            : 0
SCRIPT;
        // Collapse the multi-line layout above into a single-line expression.
        $expression = preg_replace( '/\s+/', ' ', $expression );
        $expression = trim( $expression );

        $underlinkedScript = new Script(
            $expression,
            [
                'minimumLength' => $this->minimumLength,
                'smoothingFactor' => $exponent,
            ],
            Script::LANG_EXPRESSION
        );
        $container->addScriptScoreFunction( $underlinkedScript, null, $this->weight );

        // Blend in a randomly seeded score so that not everyone is served the same
        // list of articles.
        $seed = random_int( 1, PHP_INT_MAX );
        $randomWeight = 1 - $this->weight;
        $container->addRandomScoreFunction( $seed, null, $randomWeight, '_seq_no' );

        // The two weighted functions are added together to form the final score.
        $container->setScoreMode( FunctionScore::SCORE_MODE_SUM );
    }

}