Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
92.26% |
143 / 155 |
|
87.50% |
14 / 16 |
CRAP | |
0.00% |
0 / 1 |
QualityScore | |
92.26% |
143 / 155 |
|
87.50% |
14 / 16 |
38.67 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
score | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
intermediateScore | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
1 | |||
scoreNormLog2 | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
scoreNorm | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
boostTemplates | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 | |||
boost | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
getRequiredFields | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
setMaxDocs | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
explain | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
intermediateExplain | |
100.00% |
34 / 34 |
|
100.00% |
1 / 1 |
3 | |||
explainTemplateBoosts | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
6.09 | |||
explainBoostTemplates | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
4 | |||
explainScoreNormLog2 | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
explainScoreNorm | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
explainWeight | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument\Completion; |
4 | |
5 | use CirrusSearch\Util; |
6 | |
/**
 * Score that tries to reflect the quality of a page.
 * NOTE: Experimental
 *
 * This score makes the assumption that bigger is better.
 *
 * Small cities/villages which have a high number of incoming links because they
 * link to each other ( see https://en.wikipedia.org/wiki/Villefort,_Loz%C3%A8re )
 * will be discounted correctly because the other variables are very low.
 *
 * On the other hand some pages, like lists, will sometimes get a very high but
 * unjustified score.
 *
 * The boost templates feature might help but it's a System message that is not necessarily
 * configured by wiki admins.
 */
class QualityScore implements SuggestScoringMethod {
	// TODO: move these constants into a cirrus profile
	public const INCOMING_LINKS_MAX_DOCS_FACTOR = 0.1;

	public const EXTERNAL_LINKS_NORM = 20;
	public const PAGE_SIZE_NORM = 50000;
	public const HEADING_NORM = 20;
	public const REDIRECT_NORM = 30;

	private const INCOMING_LINKS_WEIGHT = 0.6;
	private const EXTERNAL_LINKS_WEIGHT = 0.1;
	private const PAGE_SIZE_WEIGHT = 0.1;
	private const HEADING_WEIGHT = 0.2;
	private const REDIRECT_WEIGHT = 0.1;

	// Sum of all the weights above; used to bring the weighted sum back into [0, 1].
	// Kept in one place so that scoring and explaining always use the same total.
	private const TOTAL_WEIGHT = self::INCOMING_LINKS_WEIGHT + self::EXTERNAL_LINKS_WEIGHT +
		self::PAGE_SIZE_WEIGHT + self::HEADING_WEIGHT + self::REDIRECT_WEIGHT;

	// The final score will be in the range [0, SCORE_RANGE]
	public const SCORE_RANGE = 10000000;

	/**
	 * Template boosts configured by the mediawiki admin.
	 *
	 * @var float[] array of key values, key is the template and value is a float
	 */
	private $boostTemplates;

	/**
	 * @var int the number of docs in the index
	 */
	protected $maxDocs;

	/**
	 * @var int normalisation factor for incoming links, set by setMaxDocs()
	 */
	private $incomingLinksNorm;

	/**
	 * @param float[]|null $boostTemplates Array of key values, key is the template name, value the
	 *  boost factor. Defaults to Util::getDefaultBoostTemplates()
	 */
	public function __construct( $boostTemplates = null ) {
		$this->boostTemplates = $boostTemplates ?? Util::getDefaultBoostTemplates();
	}

	/**
	 * @inheritDoc
	 */
	public function score( array $doc ) {
		// Scale the [0, 1] intermediate score to an integer in [0, SCORE_RANGE]
		return intval( $this->intermediateScore( $doc ) * self::SCORE_RANGE );
	}

	/**
	 * Compute the normalized score of a document: a weighted average of its
	 * normalized metadata, adjusted by the template boosts.
	 *
	 * Missing fields are treated as 0 (numeric fields) or as empty lists
	 * (counted fields).
	 *
	 * @param array $doc
	 * @return float
	 */
	protected function intermediateScore( array $doc ) {
		$incLinks = $this->scoreNormLog2( $doc['incoming_links'] ?? 0,
			$this->incomingLinksNorm );
		$pageSize = $this->scoreNormLog2( $doc['text_bytes'] ?? 0,
			self::PAGE_SIZE_NORM );
		$extLinks = $this->scoreNorm( count( $doc['external_link'] ?? [] ),
			self::EXTERNAL_LINKS_NORM );
		$headings = $this->scoreNorm( count( $doc['heading'] ?? [] ),
			self::HEADING_NORM );
		$redirects = $this->scoreNorm( count( $doc['redirect'] ?? [] ),
			self::REDIRECT_NORM );

		$score = $incLinks * self::INCOMING_LINKS_WEIGHT;

		$score += $extLinks * self::EXTERNAL_LINKS_WEIGHT;
		$score += $pageSize * self::PAGE_SIZE_WEIGHT;
		$score += $headings * self::HEADING_WEIGHT;
		$score += $redirects * self::REDIRECT_WEIGHT;

		// We have a standardized composite score between 0 and 1
		$score /= self::TOTAL_WEIGHT;

		return $this->boostTemplates( $doc, $score );
	}

	/**
	 * log2( ( value / norm ) + 1 ) => [0-1]
	 *
	 * Values greater than the norm are capped to 1.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNormLog2( $value, $norm ) {
		if ( !$norm ) {
			// A zero (or unset) norm would be a division by zero below: any
			// positive value exceeds such a norm, anything else scores nothing.
			return $value > 0 ? 1.0 : 0.0;
		}
		return log( $value > $norm ? 2 : ( $value / $norm ) + 1, 2 );
	}

	/**
	 * value / norm => [0-1]
	 *
	 * Values greater than the norm are capped to 1.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNorm( $value, $norm ) {
		if ( !$norm ) {
			// A zero (or unset) norm would be a division by zero below: any
			// positive value exceeds such a norm, anything else scores nothing.
			return $value > 0 ? 1 : 0;
		}
		return $value > $norm ? 1 : $value / $norm;
	}

	/**
	 * Find the configured boosted templates that are used by the document.
	 *
	 * @param array $doc
	 * @return float[] boost factors of the matching templates, keyed by template
	 *  name, in configuration order. Empty when the document has no 'template'
	 *  field or when no boost is configured.
	 */
	private function matchingTemplateBoosts( array $doc ) {
		if ( !isset( $doc['template'] ) || !$this->boostTemplates ) {
			return [];
		}

		$matches = [];
		foreach ( $this->boostTemplates as $template => $factor ) {
			// PHP turns numeric-looking array keys into integers, so cast the
			// key back to a string to allow a strict comparison against the
			// template names of the document.
			if ( in_array( (string)$template, $doc['template'], true ) ) {
				$matches[$template] = $factor;
			}
		}
		return $matches;
	}

	/**
	 * Modify an existing score based on templates contained
	 * by the document.
	 *
	 * @param array $doc Document score is generated for
	 * @param float $score Current score between 0 and 1
	 * @return float Score after boosting templates
	 */
	public function boostTemplates( array $doc, $score ) {
		// The global boost is the product of the boosts of all matching templates
		$boost = 1;
		foreach ( $this->matchingTemplateBoosts( $doc ) as $factor ) {
			$boost *= $factor;
		}

		// Loose comparison on purpose: the product may be the float 1.0
		if ( $boost != 1 ) {
			return $this->boost( $score, $boost );
		}
		return $score;
	}

	/**
	 * Boost the score :
	 *   boost value lower than 1 will decrease the score
	 *   boost value set to 1 will keep the score unchanged
	 *   boost value greater than 1 will increase the score
	 *
	 * score = 0.5, boost = 0.5 result is 0.375
	 * score = 0.1, boost = 2 result is 0.325
	 *
	 * @param float $score
	 * @param float $boost
	 * @return float adjusted score
	 */
	public function boost( $score, $boost ) {
		if ( $boost == 1 ) {
			return $score;
		}

		// Transform the boost to a value between -1 and 1
		$boost = $boost > 1 ? 1 - ( 1 / $boost ) : -( 1 - $boost );
		// @todo: the 0.5 ratio is hardcoded we could maybe allow customization
		// here, this would be a way to increase the impact of template boost
		if ( $boost > 0 ) {
			// Move the score up by at most half of the distance to 1
			return $score + ( ( ( 1 - $score ) / 2 ) * $boost );
		}
		// Move the score down by at most half of its value
		return $score + ( ( $score / 2 ) * $boost );
	}

	/**
	 * @inheritDoc
	 */
	public function getRequiredFields() {
		return [
			'incoming_links',
			'external_link',
			'text_bytes',
			'heading',
			'redirect',
			'template',
		];
	}

	/**
	 * @param int $maxDocs
	 */
	public function setMaxDocs( $maxDocs ) {
		$this->maxDocs = $maxDocs;
		// We normalize incoming links according to the size of the index
		$this->incomingLinksNorm = (int)( $maxDocs * self::INCOMING_LINKS_MAX_DOCS_FACTOR );
		if ( $this->incomingLinksNorm < 1 ) {
			// it's a very small wiki let's force the norm to 1
			$this->incomingLinksNorm = 1;
		}
	}

	/**
	 * Explain the score
	 * @param array $doc
	 * @return array with the keys 'value', 'description' and 'details'
	 */
	public function explain( array $doc ) {
		$intermediateExplain = $this->intermediateExplain( $doc );
		return [
			'value' => (int)( $intermediateExplain['value'] * self::SCORE_RANGE ),
			'description' => 'Convert to an integer score: ' . $intermediateExplain['value'] . ' * ' . self::SCORE_RANGE,
			'details' => [ 'normalized_score' => $intermediateExplain ]
		];
	}

	/**
	 * Explain the normalized score: every metadata value is explained, then
	 * weighted, then summed; template boosts are explained on top of that sum
	 * when some are configured.
	 *
	 * @param array $doc
	 * @return array
	 */
	protected function intermediateExplain( array $doc ) {
		$incLinks = $this->explainScoreNormLog2( $doc['incoming_links'] ?? 0,
			$this->incomingLinksNorm, 'incoming_links' );
		$pageSize = $this->explainScoreNormLog2( $doc['text_bytes'] ?? 0,
			self::PAGE_SIZE_NORM, 'text_bytes' );
		$extLinks = $this->explainScoreNorm( count( $doc['external_link'] ?? [] ),
			self::EXTERNAL_LINKS_NORM, 'external_links_count' );
		$headings = $this->explainScoreNorm( count( $doc['heading'] ?? [] ),
			self::HEADING_NORM, 'headings_count' );
		$redirects = $this->explainScoreNorm( count( $doc['redirect'] ?? [] ),
			self::REDIRECT_NORM, 'redirects_count' );

		$total = self::TOTAL_WEIGHT;
		$details = [
			'incoming_links_weighted' => $this->explainWeight( $incLinks, self::INCOMING_LINKS_WEIGHT,
				$total, 'incoming_links_normalized' ),
			'external_links_weighted' => $this->explainWeight( $extLinks, self::EXTERNAL_LINKS_WEIGHT,
				$total, 'external_links_count_normalized' ),
			'text_bytes_weighted' => $this->explainWeight( $pageSize, self::PAGE_SIZE_WEIGHT,
				$total, 'text_bytes_normalized' ),
			'headings_count_weighted' => $this->explainWeight( $headings, self::HEADING_WEIGHT,
				$total, 'headings_count_normalized' ),
			'redirects_count_weighted' => $this->explainWeight( $redirects, self::REDIRECT_WEIGHT,
				$total, 'redirects_count_normalized' ),
		];

		$metadataExplain = [
			'value' => array_sum( array_column( $details, 'value' ) ),
			'description' => 'weighted sum of document metadata',
			'details' => $details
		];

		if ( $this->boostTemplates ) {
			return $this->explainBoostTemplates( $metadataExplain, $doc );
		}
		return $metadataExplain;
	}

	/**
	 * Explain the global template boost of a document, i.e. the product of the
	 * boosts of all the configured templates the document uses.
	 *
	 * @param array $doc
	 * @return array
	 */
	private function explainTemplateBoosts( array $doc ) {
		if ( !isset( $doc['template'] ) ) {
			return [
				'value' => 1,
				'description' => 'No templates'
			];
		}

		if ( !$this->boostTemplates ) {
			return [
				'value' => 1,
				'description' => "No configured boosted templates"
			];
		}

		$details = [];
		$boost = 1;
		// compute the global boost
		foreach ( $this->matchingTemplateBoosts( $doc ) as $template => $factor ) {
			$details["$template: boost for " . $factor] = [
				'value' => $factor,
				'description' => $template
			];
			$boost *= $factor;
		}

		if ( $details === [] ) {
			return [
				'value' => 1,
				'description' => "No templates match any boosted templates"
			];
		}
		return [
			'value' => $boost,
			'description' => 'Product of all template boosts',
			'details' => $details
		];
	}

	/**
	 * Explain how the template boosts modify the weighted metadata score.
	 * Mirrors the computation done by boost().
	 *
	 * @param array $metadataExplain
	 * @param array $doc
	 * @return array
	 */
	private function explainBoostTemplates( array $metadataExplain, array $doc ) {
		$boostExplain = $this->explainTemplateBoosts( $doc );
		$score = $metadataExplain['value'];
		$boost = $boostExplain['value'];
		// Transform the boost to a value between -1 and 1
		$boostExplain = [
			'value' => $boost > 1 ? 1 - ( 1 / $boost ) : -( 1 - $boost ),
			'description' => ( $boost > 1 ? "1-(1/boost)" : "-(1-boost)" ) . "; boost = $boost",
			'details' => [ 'template_boosts' => $boostExplain ]
		];
		$boost = $boostExplain['value'];

		if ( $boost > 0 ) {
			// NOTE: the details are positional here but keyed in the other
			// branch; both shapes are kept as they are so that existing
			// consumers of the explain output are unaffected.
			return [
				'value' => $score + ( ( ( 1 - $score ) / 2 ) * $boost ),
				'description' => "score + (((1-score)/2)*boost); score = $score, boost = $boost",
				'details' => [ $metadataExplain, $boostExplain ]
			];
		}
		return [
			'value' => $score + ( ( $score / 2 ) * $boost ),
			// The description must match the formula used for 'value' above
			'description' => "score + ((score/2)*boost); score = $score, boost = $boost",
			'details' => [ 'score' => $metadataExplain, 'boost' => $boostExplain ]
		];
	}

	/**
	 * @param float|int $value
	 * @param float|int $norm
	 * @param string $valueName
	 * @return array
	 */
	private function explainScoreNormLog2( $value, $norm, $valueName ) {
		$score = $this->scoreNormLog2( $value, $norm );
		return [
			'value' => $score,
			'description' => "log₂((min($valueName,max)/max)+1); $valueName = $value, max = $norm",
		];
	}

	/**
	 * @param int|float $value
	 * @param int|float $norm
	 * @param string $valueName
	 * @return array
	 */
	private function explainScoreNorm( $value, $norm, $valueName ) {
		$score = $this->scoreNorm( $value, $norm );
		return [
			'value' => $score,
			'description' => "min($valueName,max)/max; $valueName = $value, max = $norm",
		];
	}

	/**
	 * Explain the contribution of one normalized value to the weighted sum.
	 *
	 * @param array $detail explanation of the normalized value, read from its 'value' key
	 * @param float $weight
	 * @param float $allWeights
	 * @param string $valueName
	 * @return array
	 */
	protected function explainWeight( array $detail, $weight, $allWeights, $valueName ) {
		$value = $detail['value'];
		return [
			'value' => $value * $weight / $allWeights,
			'description' => "$valueName*weight/total; $valueName = $value, weight = $weight, total = $allWeights",
			'details' => [ $valueName => $detail ]
		];
	}
}