Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
75.42% |
135 / 179 |
|
41.18% |
7 / 17 |
CRAP | |
0.00% |
0 / 1 |
SuggestBuilder | |
75.42% |
135 / 179 |
|
41.18% |
7 / 17 |
102.58 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
create | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
build | |
86.67% |
39 / 45 |
|
0.00% |
0 / 1 |
13.40 | |||
buildNormalSuggestions | |
90.91% |
10 / 11 |
|
0.00% |
0 / 1 |
4.01 | |||
getRequiredFields | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
buildTitleSuggestion | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
2 | |||
buildRedirectsSuggestion | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
buildSuggestion | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
3 | |||
trimForDistanceCheck | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
extractTitleAndSimilarRedirects | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
extractSimilars | |
94.74% |
18 / 19 |
|
0.00% |
0 / 1 |
7.01 | |||
distance | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
4.05 | |||
encodeDocId | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
encodePossibleDocIds | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getBatchId | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getTargetNamespace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fetchMaxDoc | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument\Completion; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use Elastica\Multi\Search as MultiSearch; |
7 | use Elastica\Search; |
8 | use MediaWiki\MediaWikiServices; |
9 | use Title; |
10 | |
11 | /** |
12 | * Build a doc ready for the titlesuggest index. |
13 | * |
14 | * This program is free software; you can redistribute it and/or modify |
15 | * it under the terms of the GNU General Public License as published by |
16 | * the Free Software Foundation; either version 2 of the License, or |
17 | * (at your option) any later version. |
18 | * |
19 | * This program is distributed in the hope that it will be useful, |
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22 | * GNU General Public License for more details. |
23 | * |
24 | * You should have received a copy of the GNU General Public License along |
25 | * with this program; if not, write to the Free Software Foundation, Inc., |
26 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
27 | * http://www.gnu.org/copyleft/gpl.html |
28 | */ |
29 | |
30 | /** |
31 | * Builder used to create suggester docs |
32 | * NOTE: Experimental |
33 | */ |
34 | class SuggestBuilder { |
35 | /** |
36 | * We limit the input to 50 chars the search requests |
37 | * It'll be used when searching to trim the input query |
38 | * and when determining close redirects |
39 | */ |
40 | public const MAX_INPUT_LENGTH = 50; |
41 | |
42 | /** |
43 | * The acceptable edit distance to group similar strings |
44 | */ |
45 | private const GROUP_ACCEPTABLE_DISTANCE = 2; |
46 | |
47 | /** |
48 | * Discount suggestions based on redirects |
49 | */ |
50 | public const REDIRECT_DISCOUNT = 0.1; |
51 | |
52 | /** |
53 | * Discount suggestions based on cross namespace redirects |
54 | */ |
55 | public const CROSSNS_DISCOUNT = 0.005; |
56 | |
57 | /** |
58 | * Redirect suggestion type |
59 | */ |
60 | public const REDIRECT_SUGGESTION = 'r'; |
61 | |
62 | /** |
63 | * Title suggestion type |
64 | */ |
65 | public const TITLE_SUGGESTION = 't'; |
66 | |
67 | /** |
68 | * Number of common prefix chars a redirect must share with the title to be |
69 | * promoted as a title suggestion. |
70 | * This is useful not to promote Eraq as a title suggestion for Iraq |
71 | * Less than 3 can lead to weird results like oba => Osama Bin Laden |
72 | */ |
73 | private const REDIRECT_COMMON_PREFIX_LEN = 3; |
74 | |
75 | /** |
76 | * @var SuggestScoringMethod the scoring function |
77 | */ |
78 | private $scoringMethod; |
79 | |
80 | /** |
81 | * @var int batch id |
82 | */ |
83 | private $batchId; |
84 | |
85 | /** |
86 | * @var ExtraSuggestionsBuilder[] |
87 | */ |
88 | private $extraBuilders; |
89 | |
90 | /** |
91 | * NOTE: Currently a fixed value because the completion suggester does not support |
92 | * multi namespace suggestion. |
93 | * |
94 | * @var int |
95 | */ |
96 | private $targetNamespace = NS_MAIN; // @phan-suppress-current-line PhanUndeclaredConstant NS_MAIN is defined |
97 | |
98 | /** |
99 | * @param SuggestScoringMethod $scoringMethod the scoring function to use |
100 | * @param ExtraSuggestionsBuilder[] $extraBuilders set of extra builders |
101 | */ |
102 | public function __construct( SuggestScoringMethod $scoringMethod, array $extraBuilders = [] ) { |
103 | $this->scoringMethod = $scoringMethod; |
104 | $this->extraBuilders = $extraBuilders; |
105 | $this->batchId = time(); |
106 | } |
107 | |
108 | /** |
109 | * @param Connection $connection |
110 | * @param string|null $scoreMethodName |
111 | * @param string|null $indexBaseName |
112 | * @return SuggestBuilder |
113 | * @throws \Exception |
114 | */ |
115 | public static function create( Connection $connection, $scoreMethodName = null, $indexBaseName = null ): SuggestBuilder { |
116 | $config = $connection->getConfig(); |
117 | $scoreMethodName = $scoreMethodName ?: $config->get( 'CirrusSearchCompletionDefaultScore' ); |
118 | $scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName ); |
119 | |
120 | $extraBuilders = []; |
121 | if ( $config->get( 'CirrusSearchCompletionSuggesterUseDefaultSort' ) ) { |
122 | $extraBuilders[] = new DefaultSortSuggestionsBuilder(); |
123 | } |
124 | $subPhrasesConfig = $config->get( 'CirrusSearchCompletionSuggesterSubphrases' ); |
125 | if ( $subPhrasesConfig['build'] ) { |
126 | $extraBuilders[] = NaiveSubphrasesSuggestionsBuilder::create( $subPhrasesConfig ); |
127 | } |
128 | $scoreMethod->setMaxDocs( self::fetchMaxDoc( $connection, $indexBaseName ) ); |
129 | return new SuggestBuilder( $scoreMethod, $extraBuilders ); |
130 | } |
131 | |
132 | /** |
133 | * @param array[] $inputDocs a batch of docs to build |
134 | * @param bool $explain |
135 | * @return \Elastica\Document[] a set of suggest documents |
136 | */ |
137 | public function build( $inputDocs, $explain = false ) { |
138 | // Cross namespace titles |
139 | $crossNsTitles = []; |
140 | $docs = []; |
141 | foreach ( $inputDocs as $sourceDoc ) { |
142 | $inputDoc = $sourceDoc['source']; |
143 | $docId = $sourceDoc['id']; |
144 | // a bit of a hack but it's convenient to carry |
145 | // the id around |
146 | $inputDoc['id'] = $docId; |
147 | if ( !isset( $inputDoc['namespace'] ) ) { |
148 | // Bad doc, nothing to do here. |
149 | continue; |
150 | } |
151 | if ( $inputDoc['namespace'] == $this->targetNamespace ) { |
152 | if ( !isset( $inputDoc['title'] ) ) { |
153 | // Bad doc, nothing to do here. |
154 | continue; |
155 | } |
156 | $docs = array_merge( $docs, $this->buildNormalSuggestions( $docId, $inputDoc, $explain ) ); |
157 | } else { |
158 | if ( !isset( $inputDoc['redirect'] ) ) { |
159 | // Bad doc, nothing to do here. |
160 | continue; |
161 | } |
162 | |
163 | foreach ( $inputDoc['redirect'] as $redir ) { |
164 | if ( !isset( $redir['namespace'] ) || !isset( $redir['title'] ) ) { |
165 | continue; |
166 | } |
167 | if ( $redir['namespace'] != $this->targetNamespace ) { |
168 | continue; |
169 | } |
170 | $score = $this->scoringMethod->score( $inputDoc ); |
171 | // Discount the score of these suggestions. |
172 | $score = (int)( $score * self::CROSSNS_DISCOUNT ); |
173 | $explainDetails = null; |
174 | if ( $explain ) { |
175 | // TODO: add explanation of the crossns discount |
176 | $explainDetails = $this->scoringMethod->explain( $inputDoc ); |
177 | } |
178 | |
179 | $title = Title::makeTitle( $redir['namespace'], $redir['title'] ); |
180 | $crossNsTitles[] = [ |
181 | 'title' => $title, |
182 | 'score' => $score, |
183 | 'text' => $redir['title'], |
184 | 'inputDoc' => $inputDoc, |
185 | 'explain' => $explainDetails, |
186 | ]; |
187 | } |
188 | } |
189 | } |
190 | |
191 | // Build cross ns suggestions |
192 | if ( !empty( $crossNsTitles ) ) { |
193 | $titles = array_column( $crossNsTitles, 'title' ); |
194 | $lb = MediaWikiServices::getInstance()->getLinkBatchFactory()->newLinkBatch( $titles ); |
195 | $lb->setCaller( __METHOD__ ); |
196 | $lb->execute(); |
197 | // This is far from perfect: |
198 | // - we won't try to group similar redirects since we don't know which one |
199 | // is the official one |
200 | // - we will certainly suggest multiple times the same pages |
201 | // - we must not run a second pass at query time: no redirect suggestion |
202 | foreach ( $crossNsTitles as $data ) { |
203 | $suggestion = [ |
204 | 'text' => $data['text'], |
205 | 'variants' => [] |
206 | ]; |
207 | $docs[] = $this->buildTitleSuggestion( (string)$data['title']->getArticleID(), $suggestion, |
208 | $data['score'], $data['inputDoc'], $data['explain'] ); |
209 | } |
210 | } |
211 | return $docs; |
212 | } |
213 | |
214 | /** |
215 | * Build classic suggestion |
216 | * |
217 | * @param string $docId |
218 | * @param array $inputDoc |
219 | * @param bool $explain |
220 | * @return \Elastica\Document[] a set of suggest documents |
221 | */ |
222 | private function buildNormalSuggestions( $docId, array $inputDoc, $explain = false ) { |
223 | if ( !isset( $inputDoc['title'] ) ) { |
224 | // Bad doc, nothing to do here. |
225 | return []; |
226 | } |
227 | |
228 | $score = $this->scoringMethod->score( $inputDoc ); |
229 | $explainDetails = null; |
230 | if ( $explain ) { |
231 | $explainDetails = $this->scoringMethod->explain( $inputDoc ); |
232 | } |
233 | |
234 | $suggestions = $this->extractTitleAndSimilarRedirects( $inputDoc ); |
235 | |
236 | $docs = [ $this->buildTitleSuggestion( $docId, $suggestions['group'], $score, $inputDoc, $explainDetails ) ]; |
237 | if ( !empty( $suggestions['candidates'] ) ) { |
238 | $docs[] = $this->buildRedirectsSuggestion( $docId, $suggestions['candidates'], $score, $inputDoc, $explainDetails ); |
239 | } |
240 | return $docs; |
241 | } |
242 | |
243 | /** |
244 | * The fields needed to build and score documents. |
245 | * |
246 | * @return string[] the list of fields |
247 | */ |
248 | public function getRequiredFields() { |
249 | $fields = $this->scoringMethod->getRequiredFields(); |
250 | $fields = array_merge( $fields, [ 'title', 'redirect', 'namespace' ] ); |
251 | foreach ( $this->extraBuilders as $extraBuilder ) { |
252 | $fields = array_merge( $fields, $extraBuilder->getRequiredFields() ); |
253 | } |
254 | return array_values( array_unique( $fields ) ); |
255 | } |
256 | |
257 | /** |
258 | * Builds the 'title' suggestion. |
259 | * |
260 | * @param string $docId the page id |
261 | * @param array $title the title in 'text' and an array of similar redirects in 'variants' |
262 | * @param int $score the weight of the suggestion |
263 | * @param mixed[] $inputDoc |
264 | * @param array|null $scoreExplanation |
265 | * @return \Elastica\Document the suggestion document |
266 | */ |
267 | private function buildTitleSuggestion( $docId, array $title, $score, array $inputDoc, array $scoreExplanation = null ) { |
268 | $inputs = [ $title['text'] ]; |
269 | foreach ( $title['variants'] as $variant ) { |
270 | $inputs[] = $variant; |
271 | } |
272 | return $this->buildSuggestion( |
273 | self::TITLE_SUGGESTION, |
274 | $docId, |
275 | $inputs, |
276 | $score, |
277 | $inputDoc, |
278 | $scoreExplanation |
279 | ); |
280 | } |
281 | |
282 | /** |
283 | * Builds the 'redirects' suggestion. |
284 | * The score will be discounted by the REDIRECT_DISCOUNT factor. |
285 | * NOTE: the client will have to fetch the doc redirects when searching |
286 | * and choose the best one to display. This is because we are unable |
287 | * to make this decision at index time. |
288 | * |
289 | * @param string $docId the elasticsearch document id |
290 | * @param string[] $redirects |
291 | * @param int $score the weight of the suggestion |
292 | * @param mixed[] $inputDoc |
293 | * @param array|null $scoreExplanation |
294 | * @return \Elastica\Document the suggestion document |
295 | */ |
296 | private function buildRedirectsSuggestion( $docId, array $redirects, $score, array $inputDoc, array $scoreExplanation = null ) { |
297 | $inputs = []; |
298 | foreach ( $redirects as $redirect ) { |
299 | $inputs[] = $redirect; |
300 | } |
301 | // TODO: add redirect discount explanation |
302 | $score = (int)( $score * self::REDIRECT_DISCOUNT ); |
303 | return $this->buildSuggestion( self::REDIRECT_SUGGESTION, $docId, $inputs, |
304 | $score, $inputDoc, $scoreExplanation ); |
305 | } |
306 | |
307 | /** |
308 | * Builds a suggestion document. |
309 | * |
310 | * @param string $suggestionType suggestion type (title or redirect) |
311 | * @param string $docId The document id |
312 | * @param string[] $inputs the suggestion inputs |
313 | * @param int $score the weight of the suggestion |
314 | * @param mixed[] $inputDoc |
315 | * @param array|null $scoreExplanation |
316 | * @return \Elastica\Document a doc ready to be indexed in the completion suggester |
317 | */ |
318 | private function buildSuggestion( $suggestionType, $docId, array $inputs, $score, array $inputDoc, array $scoreExplanation = null ) { |
319 | $doc = [ |
320 | 'batch_id' => $this->batchId, |
321 | 'source_doc_id' => $inputDoc['id'], |
322 | 'target_title' => [ |
323 | 'title' => $inputDoc['title'], |
324 | 'namespace' => $inputDoc['namespace'], |
325 | ], |
326 | 'suggest' => [ |
327 | 'input' => $inputs, |
328 | 'weight' => $score |
329 | ], |
330 | 'suggest-stop' => [ |
331 | 'input' => $inputs, |
332 | 'weight' => $score |
333 | ] |
334 | ]; |
335 | |
336 | $suggestDoc = new \Elastica\Document( self::encodeDocId( $suggestionType, $docId ), $doc ); |
337 | foreach ( $this->extraBuilders as $builder ) { |
338 | $builder->build( $inputDoc, $suggestionType, $score, $suggestDoc, $this->targetNamespace ); |
339 | } |
340 | if ( $scoreExplanation !== null ) { |
341 | $suggestDoc->set( 'score_explanation', $scoreExplanation ); |
342 | } |
343 | return $suggestDoc; |
344 | } |
345 | |
346 | /** |
347 | * @param string $input A page title |
348 | * @return string A page title short enough to not cause indexing |
349 | * issues. |
350 | */ |
351 | public function trimForDistanceCheck( $input ) { |
352 | if ( mb_strlen( $input ) > self::MAX_INPUT_LENGTH ) { |
353 | $input = mb_substr( $input, 0, self::MAX_INPUT_LENGTH ); |
354 | } |
355 | return $input; |
356 | } |
357 | |
358 | /** |
359 | * Extracts title with redirects that are very close. |
360 | * It will allow to make one suggestion with title as the |
361 | * output and title + similar redirects as the inputs. |
362 | * It can be useful to avoid displaying redirects created to |
363 | * to handle typos. |
364 | * |
365 | * e.g. : |
366 | * title: Giraffe |
367 | * redirects: Girafe, Girraffe, Mating Giraffes |
368 | * will output |
369 | * - 'group' : { 'text': 'Giraffe', 'variants': ['Girafe', 'Girraffe'] } |
370 | * - 'candidates' : ['Mating Giraffes'] |
371 | * |
372 | * It would be nice to do this for redirects but we have no way to decide |
373 | * which redirect is a typo and this technique would simply take the first |
374 | * redirect in the list. |
375 | * |
376 | * @param array $doc |
377 | * @return array mixed 'group' key contains the group with the |
378 | * lead and its variants and 'candidates' contains the remaining |
379 | * candidates that were not close enough to $groupHead. |
380 | */ |
381 | public function extractTitleAndSimilarRedirects( array $doc ) { |
382 | $redirects = []; |
383 | if ( isset( $doc['redirect'] ) ) { |
384 | foreach ( $doc['redirect'] as $redir ) { |
385 | // Avoid suggesting/displaying non existent titles |
386 | // in the target namespace |
387 | if ( $redir['namespace'] == $this->targetNamespace ) { |
388 | $redirects[] = $redir['title']; |
389 | } |
390 | } |
391 | } |
392 | return $this->extractSimilars( $doc['title'], $redirects, true ); |
393 | } |
394 | |
395 | /** |
396 | * Extracts from $candidates the values that are "similar" to $groupHead |
397 | * |
398 | * @param string $groupHead |
399 | * @param string[] $candidates |
400 | * @param bool $checkVariants if the candidate does not match the groupHead try to match a variant |
401 | * @return array 'group' key contains the group with the |
402 | * head and its variants and 'candidates' contains the remaining |
403 | * candidates that were not close enough to $groupHead. |
404 | */ |
405 | private function extractSimilars( $groupHead, array $candidates, $checkVariants = false ) { |
406 | $group = [ |
407 | 'text' => $groupHead, |
408 | 'variants' => [] |
409 | ]; |
410 | $newCandidates = []; |
411 | foreach ( $candidates as $c ) { |
412 | $distance = $this->distance( $groupHead, $c ); |
413 | if ( $distance > self::GROUP_ACCEPTABLE_DISTANCE && $checkVariants ) { |
414 | // Run a second pass over the variants |
415 | foreach ( $group['variants'] as $v ) { |
416 | $distance = $this->distance( $v, $c ); |
417 | if ( $distance <= self::GROUP_ACCEPTABLE_DISTANCE ) { |
418 | break; |
419 | } |
420 | } |
421 | } |
422 | if ( $distance <= self::GROUP_ACCEPTABLE_DISTANCE ) { |
423 | $group['variants'][] = $c; |
424 | } else { |
425 | $newCandidates[] = $c; |
426 | } |
427 | } |
428 | |
429 | return [ |
430 | 'group' => $group, |
431 | 'candidates' => $newCandidates |
432 | ]; |
433 | } |
434 | |
435 | /** |
436 | * Computes the edit distance between $a and $b. |
437 | * |
438 | * @param string $a |
439 | * @param string $b |
440 | * @return int the edit distance between a and b |
441 | */ |
442 | private function distance( $a, $b ) { |
443 | $a = $this->trimForDistanceCheck( $a ); |
444 | $b = $this->trimForDistanceCheck( $b ); |
445 | $a = mb_strtolower( $a ); |
446 | $b = mb_strtolower( $b ); |
447 | |
448 | $aLength = mb_strlen( $a ); |
449 | $bLength = mb_strlen( $b ); |
450 | |
451 | $commonPrefixLen = self::REDIRECT_COMMON_PREFIX_LEN; |
452 | |
453 | if ( $aLength < $commonPrefixLen ) { |
454 | $commonPrefixLen = $aLength; |
455 | } |
456 | if ( $bLength < $commonPrefixLen ) { |
457 | $commonPrefixLen = $bLength; |
458 | } |
459 | |
460 | // check the common prefix |
461 | if ( mb_substr( $a, 0, $commonPrefixLen ) != mb_substr( $b, 0, $commonPrefixLen ) ) { |
462 | return PHP_INT_MAX; |
463 | } |
464 | |
465 | // TODO: switch to a ratio instead of raw distance would help to group |
466 | // longer strings |
467 | return levenshtein( $a, $b ); |
468 | } |
469 | |
470 | /** |
471 | * Encode the suggestion doc id |
472 | * @param string $suggestionType |
473 | * @param string $docId |
474 | * @return string |
475 | */ |
476 | public static function encodeDocId( $suggestionType, $docId ) { |
477 | return $docId . $suggestionType; |
478 | } |
479 | |
480 | /** |
481 | * Encode possible docIds used by the completion suggester index |
482 | * |
483 | * @param string $docId |
484 | * @return string[] list of docIds |
485 | */ |
486 | public static function encodePossibleDocIds( $docId ) { |
487 | return [ |
488 | self::encodeDocId( self::TITLE_SUGGESTION, $docId ), |
489 | self::encodeDocId( self::REDIRECT_SUGGESTION, $docId ), |
490 | ]; |
491 | } |
492 | |
493 | /** |
494 | * @return int the batchId |
495 | */ |
496 | public function getBatchId() { |
497 | return $this->batchId; |
498 | } |
499 | |
500 | /** |
501 | * @return int the target namespace |
502 | */ |
503 | public function getTargetNamespace() { |
504 | return $this->targetNamespace; |
505 | } |
506 | |
507 | /** |
508 | * @param Connection $connection |
509 | * @param string|null $indexBaseName |
510 | * @return int |
511 | */ |
512 | private static function fetchMaxDoc( Connection $connection, $indexBaseName = null ) { |
513 | // Indices to use for counting max_docs used by scoring functions |
514 | // Since we work mostly on the content namespace it seems OK to count |
515 | // only docs in the CONTENT index. |
516 | $countIndices = [ Connection::CONTENT_INDEX_SUFFIX ]; |
517 | |
518 | $indexBaseName = $indexBaseName ?: $connection->getConfig()->get( 'CirrusSearchIndexBaseName' ); |
519 | |
520 | // Run a first query to count the number of docs. |
521 | // This is needed for the scoring methods that need |
522 | // to normalize values against wiki size. |
523 | $mSearch = new MultiSearch( $connection->getClient() ); |
524 | foreach ( $countIndices as $sourceIndexSuffix ) { |
525 | $search = new Search( $connection->getClient() ); |
526 | $search->addIndex( |
527 | $connection->getIndex( $indexBaseName, $sourceIndexSuffix ) |
528 | ); |
529 | $search->getQuery()->setSize( 0 ); |
530 | $search->getQuery()->setTrackTotalHits( true ); |
531 | $mSearch->addSearch( $search ); |
532 | } |
533 | |
534 | $mSearchRes = $mSearch->search(); |
535 | $total = 0; |
536 | foreach ( $mSearchRes as $res ) { |
537 | $total += $res->getTotalHits(); |
538 | } |
539 | return $total; |
540 | } |
541 | } |