Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
92.26% |
143 / 155 |
|
87.50% |
14 / 16 |
CRAP | |
0.00% |
0 / 1 |
QualityScore | |
92.26% |
143 / 155 |
|
87.50% |
14 / 16 |
38.67 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
score | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
intermediateScore | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
1 | |||
scoreNormLog2 | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
scoreNorm | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
boostTemplates | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 | |||
boost | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
getRequiredFields | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
setMaxDocs | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
explain | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
intermediateExplain | |
100.00% |
34 / 34 |
|
100.00% |
1 / 1 |
3 | |||
explainTemplateBoosts | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
6.09 | |||
explainBoostTemplates | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
4 | |||
explainScoreNormLog2 | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
explainScoreNorm | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
explainWeight | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument\Completion; |
4 | |
5 | use CirrusSearch\Util; |
6 | |
/**
 * Score that tries to reflect the quality of a page.
 * NOTE: Experimental
 *
 * This score makes the assumption that bigger is better.
 *
 * Small cities/villages which have a high number of incoming links because they
 * link to each other ( see https://en.wikipedia.org/wiki/Villefort,_Loz%C3%A8re )
 * will be discounted correctly because the other variables are very low.
 *
 * On the other hand some pages, such as lists, will sometimes get a very high but
 * unjustified score.
 *
 * The boost templates feature might help but it's a System message that is not necessarily
 * configured by wiki admins.
 *
 * The incoming links normalisation factor is only assigned by setMaxDocs(), so
 * setMaxDocs() has to be called before score() or explain() are used.
 */
class QualityScore implements SuggestScoringMethod {
	// TODO: move these constants into a cirrus profile
	// Fraction of the index size used as the normalisation factor for incoming links
	public const INCOMING_LINKS_MAX_DOCS_FACTOR = 0.1;

	// Values at (or above) which the corresponding variable reaches its maximum of 1
	public const EXTERNAL_LINKS_NORM = 20;
	public const PAGE_SIZE_NORM = 50000;
	public const HEADING_NORM = 20;
	public const REDIRECT_NORM = 30;

	// Relative weights of each variable; the weighted sum is divided by the
	// sum of all the weights so they do not need to add up to 1.
	private const INCOMING_LINKS_WEIGHT = 0.6;
	private const EXTERNAL_LINKS_WEIGHT = 0.1;
	private const PAGE_SIZE_WEIGHT = 0.1;
	private const HEADING_WEIGHT = 0.2;
	private const REDIRECT_WEIGHT = 0.1;

	// The final score will be in the range [0, SCORE_RANGE]
	public const SCORE_RANGE = 10000000;

	/**
	 * Template boosts configured by the mediawiki admin.
	 *
	 * @var float[] array of key values, key is the template and value is a float
	 */
	private $boostTemplates;

	/**
	 * @var int the number of docs in the index
	 */
	protected $maxDocs;

	/**
	 * @var int normalisation factor for incoming links, assigned by setMaxDocs()
	 */
	private $incomingLinksNorm;

	/**
	 * @param float[]|null $boostTemplates Array of key values, key is the template name, value the
	 *  boost factor. Defaults to Util::getDefaultBoostTemplates()
	 */
	public function __construct( $boostTemplates = null ) {
		$this->boostTemplates = $boostTemplates === null ? Util::getDefaultBoostTemplates() : $boostTemplates;
	}

	/**
	 * @inheritDoc
	 */
	public function score( array $doc ) {
		// Scale the intermediate score by SCORE_RANGE and truncate to an integer
		return intval( $this->intermediateScore( $doc ) * self::SCORE_RANGE );
	}

	/**
	 * Compute the score before it is scaled to an integer: the weighted
	 * average of the normalized document variables, adjusted by the
	 * template boosts.
	 *
	 * Missing document fields are treated as 0 (or as an empty list).
	 *
	 * @param array $doc Document score is generated for
	 * @return float Score after boosting templates
	 */
	protected function intermediateScore( array $doc ) {
		$incLinks = $this->scoreNormLog2( $doc['incoming_links'] ?? 0,
			$this->incomingLinksNorm );
		$pageSize = $this->scoreNormLog2( $doc['text_bytes'] ?? 0,
			self::PAGE_SIZE_NORM );
		$extLinks = $this->scoreNorm( count( $doc['external_link'] ?? [] ),
			self::EXTERNAL_LINKS_NORM );
		$headings = $this->scoreNorm( count( $doc['heading'] ?? [] ),
			self::HEADING_NORM );
		$redirects = $this->scoreNorm( count( $doc['redirect'] ?? [] ),
			self::REDIRECT_NORM );

		$score = $incLinks * self::INCOMING_LINKS_WEIGHT;

		$score += $extLinks * self::EXTERNAL_LINKS_WEIGHT;
		$score += $pageSize * self::PAGE_SIZE_WEIGHT;
		$score += $headings * self::HEADING_WEIGHT;
		$score += $redirects * self::REDIRECT_WEIGHT;

		// We have a standardized composite score between 0 and 1
		$score /= self::INCOMING_LINKS_WEIGHT + self::EXTERNAL_LINKS_WEIGHT +
			self::PAGE_SIZE_WEIGHT + self::HEADING_WEIGHT + self::REDIRECT_WEIGHT;

		return $this->boostTemplates( $doc, $score );
	}

	/**
	 * log2( ( value / norm ) + 1 ) => [0-1]
	 *
	 * Values greater than the norm are capped so that the result is 1.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNormLog2( $value, $norm ) {
		return log( $value > $norm ? 2 : ( $value / $norm ) + 1, 2 );
	}

	/**
	 * value / norm => [0-1]
	 *
	 * Values greater than the norm are capped so that the result is 1.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNorm( $value, $norm ) {
		return $value > $norm ? 1 : $value / $norm;
	}

	/**
	 * Modify an existing score based on templates contained
	 * by the document.
	 *
	 * The boosts of all the configured templates found in the document are
	 * multiplied together and the resulting global boost is applied with boost().
	 *
	 * @param array $doc Document score is generated for
	 * @param float $score Current score between 0 and 1
	 * @return float Score after boosting templates
	 */
	public function boostTemplates( array $doc, $score ) {
		if ( !isset( $doc['template'] ) ) {
			return $score;
		}

		if ( $this->boostTemplates ) {
			$boost = 1;
			// compute the global boost
			foreach ( $this->boostTemplates as $k => $v ) {
				// NOTE(review): loose comparison is kept on purpose here, PHP casts
				// numeric string array keys to int so a strict check could miss them.
				if ( in_array( $k, $doc['template'] ) ) {
					$boost *= $v;
				}
			}
			if ( $boost != 1 ) {
				return $this->boost( $score, $boost );
			}
		}
		return $score;
	}

	/**
	 * Boost the score :
	 *   boost value lower than 1 will decrease the score
	 *   boost value set to 1 will keep the score unchanged
	 *   boost value greater than 1 will increase the score
	 *
	 * score = 0.5, boost = 0.5 result is 0.375
	 * score = 0.1, boost = 2 result is 0.325
	 *
	 * @param float $score
	 * @param float $boost
	 * @return float adjusted score
	 */
	public function boost( $score, $boost ) {
		if ( $boost == 1 ) {
			return $score;
		}

		// Transform the boost to a value between -1 and 1
		$boost = $boost > 1 ? 1 - ( 1 / $boost ) : -( 1 - $boost );
		// @todo: the 0.5 ratio is hardcoded we could maybe allow customization
		// here, this would be a way to increase the impact of template boost
		if ( $boost > 0 ) {
			// Move the score up by at most half of the distance to 1
			return $score + ( ( ( 1 - $score ) / 2 ) * $boost );
		} else {
			// Move the score down by at most half of its value
			return $score + ( ( $score / 2 ) * $boost );
		}
	}

	/**
	 * @inheritDoc
	 */
	public function getRequiredFields() {
		return [
			'incoming_links',
			'external_link',
			'text_bytes',
			'heading',
			'redirect',
			'template',
		];
	}

	/**
	 * Set the number of docs in the index and derive the incoming links
	 * normalisation factor from it.
	 *
	 * @param int $maxDocs
	 */
	public function setMaxDocs( $maxDocs ) {
		$this->maxDocs = $maxDocs;
		// We normalize incoming links according to the size of the index
		$this->incomingLinksNorm = (int)( $maxDocs * self::INCOMING_LINKS_MAX_DOCS_FACTOR );
		if ( $this->incomingLinksNorm < 1 ) {
			// it's a very small wiki let's force the norm to 1
			$this->incomingLinksNorm = 1;
		}
	}

	/**
	 * Explain the score
	 *
	 * Every explanation node is an array with a 'value', a 'description' and,
	 * when it is derived from other nodes, a 'details' array.
	 *
	 * @param array $doc
	 * @return array
	 */
	public function explain( array $doc ) {
		$intermediateExplain = $this->intermediateExplain( $doc );
		return [
			'value' => (int)( $intermediateExplain['value'] * self::SCORE_RANGE ),
			'description' => 'Convert to an integer score: ' . $intermediateExplain['value'] . ' * ' . self::SCORE_RANGE,
			'details' => [ 'normalized_score' => $intermediateExplain ]
		];
	}

	/**
	 * Explain the score before it is scaled to an integer: the weighted
	 * document variables and, when boost templates are configured, the
	 * template boosts applied on top of them.
	 *
	 * @param array $doc
	 * @return array
	 */
	protected function intermediateExplain( array $doc ) {
		$incLinks = $this->explainScoreNormLog2( $doc['incoming_links'] ?? 0,
			$this->incomingLinksNorm, 'incoming_links' );
		$pageSize = $this->explainScoreNormLog2( $doc['text_bytes'] ?? 0,
			self::PAGE_SIZE_NORM, 'text_bytes' );
		$extLinks = $this->explainScoreNorm( count( $doc['external_link'] ?? [] ),
			self::EXTERNAL_LINKS_NORM, 'external_links_count' );
		$headings = $this->explainScoreNorm( count( $doc['heading'] ?? [] ),
			self::HEADING_NORM, 'headings_count' );
		$redirects = $this->explainScoreNorm( count( $doc['redirect'] ?? [] ),
			self::REDIRECT_NORM, 'redirects_count' );

		$details = [];
		$total = self::INCOMING_LINKS_WEIGHT + self::EXTERNAL_LINKS_WEIGHT +
			self::PAGE_SIZE_WEIGHT + self::HEADING_WEIGHT + self::REDIRECT_WEIGHT;
		$details['incoming_links_weighted'] = $this->explainWeight( $incLinks, self::INCOMING_LINKS_WEIGHT,
			$total, 'incoming_links_normalized' );
		$details['external_links_weighted'] = $this->explainWeight( $extLinks, self::EXTERNAL_LINKS_WEIGHT,
			$total, 'external_links_count_normalized' );
		$details['text_bytes_weighted'] = $this->explainWeight( $pageSize, self::PAGE_SIZE_WEIGHT,
			$total, 'text_bytes_normalized' );
		$details['headings_count_weighted'] = $this->explainWeight( $headings, self::HEADING_WEIGHT,
			$total, 'headings_count_normalized' );
		$details['redirects_count_weighted'] = $this->explainWeight( $redirects, self::REDIRECT_WEIGHT,
			$total, 'redirects_count_normalized' );

		// Each detail is already divided by the total weight, the score is their sum
		$score = 0;
		foreach ( $details as $detail ) {
			$score += $detail['value'];
		}
		$metadataExplain = [
			'value' => $score,
			'description' => 'weighted sum of document metadata',
			'details' => $details
		];

		if ( $this->boostTemplates ) {
			return $this->explainBoostTemplates( $metadataExplain, $doc );
		}
		return $metadataExplain;
	}

	/**
	 * Explain the global template boost of a document: the product of the
	 * boosts of all the configured templates found in the document, or 1
	 * when no boost applies.
	 *
	 * @param array $doc
	 * @return array
	 */
	private function explainTemplateBoosts( array $doc ) {
		if ( !isset( $doc['template'] ) ) {
			return [
				'value' => 1,
				'description' => 'No templates'
			];
		}

		if ( $this->boostTemplates ) {
			$details = [];
			$boost = 1;
			// compute the global boost
			foreach ( $this->boostTemplates as $k => $v ) {
				if ( in_array( $k, $doc['template'] ) ) {
					$details["$k: boost for " . $v] = [
						'value' => $v,
						'description' => $k
					];
					$boost *= $v;
				}
			}
			if ( $details !== [] ) {
				return [
					'value' => $boost,
					'description' => 'Product of all template boosts',
					'details' => $details
				];
			}
			return [
				'value' => 1,
				'description' => "No templates match any boosted templates"
			];
		} else {
			return [
				'value' => 1,
				'description' => "No configured boosted templates"
			];
		}
	}

	/**
	 * Explain how the template boosts modify the metadata score, using the
	 * same formulas as boost().
	 *
	 * @param array $metadataExplain explanation of the score before boosting
	 * @param array $doc
	 * @return array
	 */
	private function explainBoostTemplates( array $metadataExplain, array $doc ) {
		$boostExplain = $this->explainTemplateBoosts( $doc );
		$score = $metadataExplain['value'];
		$boost = $boostExplain['value'];
		// Transform the boost to a value between -1 and 1
		$boostExplain = [
			'value' => $boost > 1 ? 1 - ( 1 / $boost ) : -( 1 - $boost ),
			'description' => ( $boost > 1 ? "1-(1/boost)" : "-(1-boost)" ) . "; boost = $boost",
			'details' => [ 'template_boosts' => $boostExplain ]
		];
		$boost = $boostExplain['value'];

		// NOTE(review): the two branches do not use the same keys for 'details'
		// (list vs 'score'/'boost'), kept as is to not change the explain output.
		if ( $boost > 0 ) {
			return [
				'value' => $score + ( ( ( 1 - $score ) / 2 ) * $boost ),
				'description' => "score + (((1-score)/2)*boost); score = $score, boost = $boost",
				'details' => [ $metadataExplain, $boostExplain ]
			];
		} else {
			return [
				'value' => $score + ( ( $score / 2 ) * $boost ),
				// The description has to reflect the formula used for negative boosts
				'description' => "score + ((score/2)*boost); score = $score, boost = $boost",
				'details' => [ 'score' => $metadataExplain, 'boost' => $boostExplain ]
			];
		}
	}

	/**
	 * Explain a value normalized with scoreNormLog2().
	 *
	 * @param float|int $value
	 * @param float|int $norm
	 * @param string $valueName name of the value used in the description
	 * @return array
	 */
	private function explainScoreNormLog2( $value, $norm, $valueName ) {
		$score = $this->scoreNormLog2( $value, $norm );
		return [
			'value' => $score,
			'description' => "log₂((min($valueName,max)/max)+1); $valueName = $value, max = $norm",
		];
	}

	/**
	 * Explain a value normalized with scoreNorm().
	 *
	 * @param int|float $value
	 * @param int|float $norm
	 * @param string $valueName name of the value used in the description
	 * @return array
	 */
	private function explainScoreNorm( $value, $norm, $valueName ) {
		$score = $this->scoreNorm( $value, $norm );
		return [
			'value' => $score,
			'description' => "min($valueName,max)/max; $valueName = $value, max = $norm",
		];
	}

	/**
	 * Explain the contribution of a normalized value to the weighted sum.
	 *
	 * @param array $detail explanation of the normalized value
	 * @param float $weight weight of this value
	 * @param float $allWeights sum of all the weights
	 * @param string $valueName name of the value used in the description
	 * @return array
	 */
	protected function explainWeight( array $detail, $weight, $allWeights, $valueName ) {
		$value = $detail['value'];
		return [
			'value' => $value * $weight / $allWeights,
			'description' => "$valueName*weight/total; $valueName = $value, weight = $weight, total = $allWeights",
			'details' => [ $valueName => $detail ]
		];
	}
}