Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
92.26% |
143 / 155 |
|
87.50% |
14 / 16 |
CRAP | |
0.00% |
0 / 1 |
| QualityScore | |
92.26% |
143 / 155 |
|
87.50% |
14 / 16 |
38.67 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| score | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| intermediateScore | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
1 | |||
| scoreNormLog2 | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| scoreNorm | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| boostTemplates | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 | |||
| boost | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
| getRequiredFields | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
| setMaxDocs | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| explain | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| intermediateExplain | |
100.00% |
34 / 34 |
|
100.00% |
1 / 1 |
3 | |||
| explainTemplateBoosts | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
6.09 | |||
| explainBoostTemplates | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
4 | |||
| explainScoreNormLog2 | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| explainScoreNorm | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| explainWeight | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\BuildDocument\Completion; |
| 4 | |
| 5 | use CirrusSearch\Util; |
| 6 | |
/**
 * Score that tries to reflect the quality of a page.
 * NOTE: Experimental
 *
 * This score makes the assumption that bigger is better.
 *
 * Small cities/villages which have a high number of incoming links because they
 * link to each other ( see https://en.wikipedia.org/wiki/Villefort,_Loz%C3%A8re )
 * will be discounted correctly because the other variables are very low.
 *
 * On the other hand some pages, like lists, will sometimes get a very high but
 * unjustified score.
 *
 * The boost templates feature might help but it's a System message that is not necessarily
 * configured by wiki admins.
 */
class QualityScore implements SuggestScoringMethod {
	// TODO: move these constants into a cirrus profile
	public const INCOMING_LINKS_MAX_DOCS_FACTOR = 0.1;

	// Values at or above these norms yield the maximum (1) for their component.
	public const EXTERNAL_LINKS_NORM = 20;
	public const PAGE_SIZE_NORM = 50000;
	public const HEADING_NORM = 20;
	public const REDIRECT_NORM = 30;

	// Relative weights of each component; they are divided by their sum when
	// combined, so they do not need to add up to 1.
	private const INCOMING_LINKS_WEIGHT = 0.6;
	private const EXTERNAL_LINKS_WEIGHT = 0.1;
	private const PAGE_SIZE_WEIGHT = 0.1;
	private const HEADING_WEIGHT = 0.2;
	private const REDIRECT_WEIGHT = 0.1;

	// The final score will be in the range [0, SCORE_RANGE]
	public const SCORE_RANGE = 10000000;

	/**
	 * Template boosts configured by the mediawiki admin.
	 *
	 * @var float[] array of key values, key is the template and value is a float
	 */
	private $boostTemplates;

	/**
	 * @var int the number of docs in the index
	 */
	protected $maxDocs;

	/**
	 * @var int normalisation factor for incoming links. Starts at 1 (the same
	 *  floor setMaxDocs() enforces) so that scoring a document before
	 *  setMaxDocs() has been called never divides by an unset norm.
	 */
	private $incomingLinksNorm = 1;

	/**
	 * @param float[]|null $boostTemplates Array of key values, key is the template name, value the
	 *  boost factor. Defaults to Util::getDefaultBoostTemplates()
	 */
	public function __construct( $boostTemplates = null ) {
		$this->boostTemplates = $boostTemplates ?? Util::getDefaultBoostTemplates();
	}

	/**
	 * Scale the normalized score to an integer in [0, SCORE_RANGE].
	 *
	 * @inheritDoc
	 */
	public function score( array $doc ) {
		return intval( $this->intermediateScore( $doc ) * self::SCORE_RANGE );
	}

	/**
	 * Weighted combination of the normalized document metadata, adjusted by
	 * the template boosts. Missing fields count as 0 / empty.
	 *
	 * @param array $doc
	 * @return float
	 */
	protected function intermediateScore( array $doc ) {
		$incLinks = $this->scoreNormLog2( $doc['incoming_links'] ?? 0,
			$this->incomingLinksNorm );
		$pageSize = $this->scoreNormLog2( $doc['text_bytes'] ?? 0,
			self::PAGE_SIZE_NORM );
		$extLinks = $this->scoreNorm( count( $doc['external_link'] ?? [] ),
			self::EXTERNAL_LINKS_NORM );
		$headings = $this->scoreNorm( count( $doc['heading'] ?? [] ),
			self::HEADING_NORM );
		$redirects = $this->scoreNorm( count( $doc['redirect'] ?? [] ),
			self::REDIRECT_NORM );

		$score = $incLinks * self::INCOMING_LINKS_WEIGHT;

		$score += $extLinks * self::EXTERNAL_LINKS_WEIGHT;
		$score += $pageSize * self::PAGE_SIZE_WEIGHT;
		$score += $headings * self::HEADING_WEIGHT;
		$score += $redirects * self::REDIRECT_WEIGHT;

		// We have a standardized composite score between 0 and 1
		$score /= self::INCOMING_LINKS_WEIGHT + self::EXTERNAL_LINKS_WEIGHT +
			self::PAGE_SIZE_WEIGHT + self::HEADING_WEIGHT + self::REDIRECT_WEIGHT;

		return $this->boostTemplates( $doc, $score );
	}

	/**
	 * log2( ( value / norm ) + 1 ) => [0-1]
	 *
	 * Values above the norm are capped: they yield log2( 2 ) = 1.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNormLog2( $value, $norm ) {
		return log( $value > $norm ? 2 : ( $value / $norm ) + 1, 2 );
	}

	/**
	 * value / norm => [0-1]
	 *
	 * Values above the norm are capped to 1.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNorm( $value, $norm ) {
		return $value > $norm ? 1 : $value / $norm;
	}

	/**
	 * Modify an existing score based on templates contained
	 * by the document.
	 *
	 * @param array $doc Document score is generated for
	 * @param float $score Current score between 0 and 1
	 * @return float Score after boosting templates
	 */
	public function boostTemplates( array $doc, $score ) {
		if ( !isset( $doc['template'] ) ) {
			return $score;
		}

		if ( $this->boostTemplates ) {
			$boost = 1;
			// compute the global boost
			foreach ( $this->boostTemplates as $k => $v ) {
				// NOTE: the comparison is left non-strict on purpose. PHP casts
				// numeric-looking array keys to int, so a strict in_array() would
				// stop matching such template names (presumably strings in
				// $doc['template'] - verify against the document builder).
				if ( in_array( $k, $doc['template'] ) ) {
					$boost *= $v;
				}
			}
			if ( $boost != 1 ) {
				return $this->boost( $score, $boost );
			}
		}
		return $score;
	}

	/**
	 * Boost the score :
	 *   boost value lower than 1 will decrease the score
	 *   boost value set to 1 will keep the score unchanged
	 *   boost value greater than 1 will increase the score
	 *
	 * score = 0.5, boost = 0.5 result is 0.375
	 * score = 0.1, boost = 2 result is 0.325
	 *
	 * @param float $score
	 * @param float $boost
	 * @return float adjusted score
	 */
	public function boost( $score, $boost ) {
		if ( $boost == 1 ) {
			return $score;
		}

		// Transform the boost to a value between -1 and 1
		// (this range holds for boost values >= 0)
		$boost = $boost > 1 ? 1 - ( 1 / $boost ) : -( 1 - $boost );
		// @todo: the 0.5 ratio is hardcoded we could maybe allow customization
		// here, this would be a way to increase the impact of template boost
		if ( $boost > 0 ) {
			// Move the score up by at most half of the distance to 1
			return $score + ( ( ( 1 - $score ) / 2 ) * $boost );
		} else {
			// Move the score down by at most half of its current value
			return $score + ( ( $score / 2 ) * $boost );
		}
	}

	/**
	 * @inheritDoc
	 */
	public function getRequiredFields() {
		return [
			'incoming_links',
			'external_link',
			'text_bytes',
			'heading',
			'redirect',
			'template',
		];
	}

	/**
	 * Record the index size and derive the incoming links norm from it.
	 *
	 * @param int $maxDocs
	 */
	public function setMaxDocs( $maxDocs ) {
		$this->maxDocs = $maxDocs;
		// We normalize incoming links according to the size of the index;
		// on a very small wiki the norm is forced to 1.
		$this->incomingLinksNorm = max( 1, (int)( $maxDocs * self::INCOMING_LINKS_MAX_DOCS_FACTOR ) );
	}

	/**
	 * Explain the score
	 *
	 * @param array $doc
	 * @return array with 'value', 'description' and 'details' keys
	 */
	public function explain( array $doc ) {
		$intermediateExplain = $this->intermediateExplain( $doc );
		return [
			'value' => (int)( $intermediateExplain['value'] * self::SCORE_RANGE ),
			'description' => 'Convert to an integer score: ' . $intermediateExplain['value'] . ' * ' . self::SCORE_RANGE,
			'details' => [ 'normalized_score' => $intermediateExplain ]
		];
	}

	/**
	 * Explain counterpart of intermediateScore(): same computation, but each
	 * step is returned as a 'value' / 'description' / 'details' array.
	 *
	 * @param array $doc
	 * @return array
	 */
	protected function intermediateExplain( array $doc ) {
		$incLinks = $this->explainScoreNormLog2( $doc['incoming_links'] ?? 0,
			$this->incomingLinksNorm, 'incoming_links' );
		$pageSize = $this->explainScoreNormLog2( $doc['text_bytes'] ?? 0,
			self::PAGE_SIZE_NORM, 'text_bytes' );
		$extLinks = $this->explainScoreNorm( count( $doc['external_link'] ?? [] ),
			self::EXTERNAL_LINKS_NORM, 'external_links_count' );
		$headings = $this->explainScoreNorm( count( $doc['heading'] ?? [] ),
			self::HEADING_NORM, 'headings_count' );
		$redirects = $this->explainScoreNorm( count( $doc['redirect'] ?? [] ),
			self::REDIRECT_NORM, 'redirects_count' );

		$details = [];
		$total = self::INCOMING_LINKS_WEIGHT + self::EXTERNAL_LINKS_WEIGHT +
			self::PAGE_SIZE_WEIGHT + self::HEADING_WEIGHT + self::REDIRECT_WEIGHT;
		$details['incoming_links_weighted'] = $this->explainWeight( $incLinks, self::INCOMING_LINKS_WEIGHT,
			$total, 'incoming_links_normalized' );
		$details['external_links_weighted'] = $this->explainWeight( $extLinks, self::EXTERNAL_LINKS_WEIGHT,
			$total, 'external_links_count_normalized' );
		$details['text_bytes_weighted'] = $this->explainWeight( $pageSize, self::PAGE_SIZE_WEIGHT,
			$total, 'text_bytes_normalized' );
		$details['headings_count_weighted'] = $this->explainWeight( $headings, self::HEADING_WEIGHT,
			$total, 'headings_count_normalized' );
		$details['redirects_count_weighted'] = $this->explainWeight( $redirects, self::REDIRECT_WEIGHT,
			$total, 'redirects_count_normalized' );

		$score = 0;
		foreach ( $details as $detail ) {
			$score += $detail['value'];
		}
		$metadataExplain = [
			'value' => $score,
			'description' => 'weighted sum of document metadata',
			'details' => $details
		];

		if ( $this->boostTemplates ) {
			return $this->explainBoostTemplates( $metadataExplain, $doc );
		}
		return $metadataExplain;
	}

	/**
	 * Explain the global template boost (product of the boosts of every
	 * configured template found in the document).
	 *
	 * @param array $doc
	 * @return array
	 */
	private function explainTemplateBoosts( array $doc ) {
		if ( !isset( $doc['template'] ) ) {
			return [
				'value' => 1,
				'description' => 'No templates'
			];
		}

		if ( $this->boostTemplates ) {
			$details = [];
			$boost = 1;
			// compute the global boost
			foreach ( $this->boostTemplates as $k => $v ) {
				// Non-strict on purpose, same matching as boostTemplates()
				if ( in_array( $k, $doc['template'] ) ) {
					$details["$k: boost for " . $v] = [
						'value' => $v,
						'description' => $k
					];
					$boost *= $v;
				}
			}
			if ( $details !== [] ) {
				return [
					'value' => $boost,
					'description' => 'Product of all template boosts',
					'details' => $details
				];
			}
			return [
				'value' => 1,
				'description' => "No templates match any boosted templates"
			];
		} else {
			return [
				'value' => 1,
				'description' => "No configured boosted templates"
			];
		}
	}

	/**
	 * Explain counterpart of boostTemplates() / boost().
	 *
	 * @param array $metadataExplain explanation of the un-boosted score
	 * @param array $doc
	 * @return array
	 */
	private function explainBoostTemplates( array $metadataExplain, array $doc ) {
		$boostExplain = $this->explainTemplateBoosts( $doc );
		$score = $metadataExplain['value'];
		$boost = $boostExplain['value'];
		// Same transformation as boost(): map the raw boost to a signed ratio
		$boostExplain = [
			'value' => $boost > 1 ? 1 - ( 1 / $boost ) : -( 1 - $boost ),
			'description' => ( $boost > 1 ? "1-(1/boost)" : "-(1-boost)" ) . "; boost = $boost",
			'details' => [ 'template_boosts' => $boostExplain ]
		];
		$boost = $boostExplain['value'];

		if ( $boost > 0 ) {
			return [
				'value' => $score + ( ( ( 1 - $score ) / 2 ) * $boost ),
				'description' => "score + (((1-score)/2)*boost); score = $score, boost = $boost",
				// NOTE: plain list here, keyed array in the other branch; the
				// shapes are kept as they are for output compatibility.
				'details' => [ $metadataExplain, $boostExplain ]
			];
		} else {
			return [
				'value' => $score + ( ( $score / 2 ) * $boost ),
				// The description must match the formula actually applied to
				// negative (or neutral) boosts.
				'description' => "score + ((score/2)*boost); score = $score, boost = $boost",
				'details' => [ 'score' => $metadataExplain, 'boost' => $boostExplain ]
			];
		}
	}

	/**
	 * @param float|int $value
	 * @param float|int $norm
	 * @param string $valueName
	 * @return array
	 */
	private function explainScoreNormLog2( $value, $norm, $valueName ) {
		$score = $this->scoreNormLog2( $value, $norm );
		return [
			'value' => $score,
			'description' => "log₂((min($valueName,max)/max)+1); $valueName = $value, max = $norm",
		];
	}

	/**
	 * @param int|float $value
	 * @param int|float $norm
	 * @param string $valueName
	 * @return array
	 */
	private function explainScoreNorm( $value, $norm, $valueName ) {
		$score = $this->scoreNorm( $value, $norm );
		return [
			'value' => $score,
			'description' => "min($valueName,max)/max; $valueName = $value, max = $norm",
		];
	}

	/**
	 * Explain the contribution of one normalized component to the weighted sum.
	 *
	 * @param array $detail explanation of the normalized component
	 * @param float $weight weight of this component
	 * @param float $allWeights sum of the weights of all components
	 * @param string $valueName
	 * @return array
	 */
	protected function explainWeight( array $detail, $weight, $allWeights, $valueName ) {
		$value = $detail['value'];
		return [
			'value' => $value * $weight / $allWeights,
			'description' => "$valueName*weight/total; $valueName = $value, weight = $weight, total = $allWeights",
			'details' => [ $valueName => $detail ]
		];
	}
}