Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
75.56% covered (warning)
75.56%
136 / 180
41.18% covered (danger)
41.18%
7 / 17
CRAP
0.00% covered (danger)
0.00%
0 / 1
SuggestBuilder
75.56% covered (warning)
75.56%
136 / 180
41.18% covered (danger)
41.18%
7 / 17
101.81
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 create
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
20
 build
86.96% covered (warning)
86.96%
40 / 46
0.00% covered (danger)
0.00%
0 / 1
13.38
 buildNormalSuggestions
90.91% covered (success)
90.91%
10 / 11
0.00% covered (danger)
0.00%
0 / 1
4.01
 getRequiredFields
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 buildTitleSuggestion
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
2
 buildRedirectsSuggestion
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 buildSuggestion
100.00% covered (success)
100.00%
22 / 22
100.00% covered (success)
100.00%
1 / 1
3
 trimForDistanceCheck
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 extractTitleAndSimilarRedirects
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 extractSimilars
94.74% covered (success)
94.74%
18 / 19
0.00% covered (danger)
0.00%
0 / 1
7.01
 distance
85.71% covered (warning)
85.71%
12 / 14
0.00% covered (danger)
0.00%
0 / 1
4.05
 encodeDocId
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 encodePossibleDocIds
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 getBatchId
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getTargetNamespace
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 fetchMaxDoc
0.00% covered (danger)
0.00%
0 / 16
0.00% covered (danger)
0.00%
0 / 1
20
1<?php
2
3namespace CirrusSearch\BuildDocument\Completion;
4
5use CirrusSearch\Connection;
6use Elastica\Multi\Search as MultiSearch;
7use Elastica\Search;
8use MediaWiki\MediaWikiServices;
9
10/**
11 * Build a doc ready for the titlesuggest index.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License along
24 * with this program; if not, write to the Free Software Foundation, Inc.,
25 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 * http://www.gnu.org/copyleft/gpl.html
27 */
28
29/**
30 * Builder used to create suggester docs
31 * NOTE: Experimental
32 */
33class SuggestBuilder {
34    /**
35     * We limit the input to 50 chars the search requests
36     * It'll be used when searching to trim the input query
37     * and when determining close redirects
38     */
39    public const MAX_INPUT_LENGTH = 50;
40
41    /**
42     * The acceptable edit distance to group similar strings
43     */
44    private const GROUP_ACCEPTABLE_DISTANCE = 2;
45
46    /**
47     * Discount suggestions based on redirects
48     */
49    public const REDIRECT_DISCOUNT = 0.1;
50
51    /**
52     * Discount suggestions based on cross namespace redirects
53     */
54    public const CROSSNS_DISCOUNT = 0.005;
55
56    /**
57     * Redirect suggestion type
58     */
59    public const REDIRECT_SUGGESTION = 'r';
60
61    /**
62     * Title suggestion type
63     */
64    public const TITLE_SUGGESTION = 't';
65
66    /**
67     * Number of common prefix chars a redirect must share with the title to be
68     * promoted as a title suggestion.
69     * This is useful not to promote Eraq as a title suggestion for Iraq
70     * Less than 3 can lead to weird results like oba => Osama Bin Laden
71     */
72    private const REDIRECT_COMMON_PREFIX_LEN = 3;
73
74    /**
75     * @var SuggestScoringMethod the scoring function
76     */
77    private $scoringMethod;
78
79    /**
80     * @var int batch id
81     */
82    private $batchId;
83
84    /**
85     * @var ExtraSuggestionsBuilder[]
86     */
87    private $extraBuilders;
88
89    /**
90     * NOTE: Currently a fixed value because the completion suggester does not support
91     * multi namespace suggestion.
92     *
93     * @var int
94     */
95    private $targetNamespace = NS_MAIN; // @phan-suppress-current-line PhanUndeclaredConstant NS_MAIN is defined
96
97    /**
98     * @param SuggestScoringMethod $scoringMethod the scoring function to use
99     * @param ExtraSuggestionsBuilder[] $extraBuilders set of extra builders
100     */
101    public function __construct( SuggestScoringMethod $scoringMethod, array $extraBuilders = [] ) {
102        $this->scoringMethod = $scoringMethod;
103        $this->extraBuilders = $extraBuilders;
104        $this->batchId = time();
105    }
106
107    /**
108     * @param Connection $connection
109     * @param string|null $scoreMethodName
110     * @param string|null $indexBaseName
111     * @return SuggestBuilder
112     * @throws \Exception
113     */
114    public static function create( Connection $connection, $scoreMethodName = null, $indexBaseName = null ): SuggestBuilder {
115        $config  = $connection->getConfig();
116        $scoreMethodName = $scoreMethodName ?: $config->get( 'CirrusSearchCompletionDefaultScore' );
117        $scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
118
119        $extraBuilders = [];
120        if ( $config->get( 'CirrusSearchCompletionSuggesterUseDefaultSort' ) ) {
121            $extraBuilders[] = new DefaultSortSuggestionsBuilder();
122        }
123        $subPhrasesConfig = $config->get( 'CirrusSearchCompletionSuggesterSubphrases' );
124        if ( $subPhrasesConfig['build'] ) {
125            $extraBuilders[] = NaiveSubphrasesSuggestionsBuilder::create( $subPhrasesConfig );
126        }
127        $scoreMethod->setMaxDocs( self::fetchMaxDoc( $connection, $indexBaseName ) );
128        return new SuggestBuilder( $scoreMethod, $extraBuilders );
129    }
130
131    /**
132     * @param array[] $inputDocs a batch of docs to build
133     * @param bool $explain
134     * @return \Elastica\Document[] a set of suggest documents
135     */
136    public function build( $inputDocs, $explain = false ) {
137        $titleFactory = MediaWikiServices::getInstance()->getTitleFactory();
138        // Cross namespace titles
139        $crossNsTitles = [];
140        $docs = [];
141        foreach ( $inputDocs as $sourceDoc ) {
142            $inputDoc = $sourceDoc['source'];
143            $docId = $sourceDoc['id'];
144            // a bit of a hack but it's convenient to carry
145            // the id around
146            $inputDoc['id'] = $docId;
147            if ( !isset( $inputDoc['namespace'] ) ) {
148                // Bad doc, nothing to do here.
149                continue;
150            }
151            if ( $inputDoc['namespace'] == $this->targetNamespace ) {
152                if ( !isset( $inputDoc['title'] ) ) {
153                    // Bad doc, nothing to do here.
154                    continue;
155                }
156                $docs = array_merge( $docs, $this->buildNormalSuggestions( $docId, $inputDoc, $explain ) );
157            } else {
158                if ( !isset( $inputDoc['redirect'] ) ) {
159                    // Bad doc, nothing to do here.
160                    continue;
161                }
162
163                foreach ( $inputDoc['redirect'] as $redir ) {
164                    if ( !isset( $redir['namespace'] ) || !isset( $redir['title'] ) ) {
165                        continue;
166                    }
167                    if ( $redir['namespace'] != $this->targetNamespace ) {
168                        continue;
169                    }
170                    $score = $this->scoringMethod->score( $inputDoc );
171                    // Discount the score of these suggestions.
172                    $score = (int)( $score * self::CROSSNS_DISCOUNT );
173                    $explainDetails = null;
174                    if ( $explain ) {
175                        // TODO: add explanation of the crossns discount
176                        $explainDetails = $this->scoringMethod->explain( $inputDoc );
177                    }
178
179                    $title = $titleFactory->makeTitle( $redir['namespace'], $redir['title'] );
180                    $crossNsTitles[] = [
181                        'title' => $title,
182                        'score' => $score,
183                        'text' => $redir['title'],
184                        'inputDoc' => $inputDoc,
185                        'explain' => $explainDetails,
186                    ];
187                }
188            }
189        }
190
191        // Build cross ns suggestions
192        if ( $crossNsTitles ) {
193            $titles = array_column( $crossNsTitles, 'title' );
194            $lb = MediaWikiServices::getInstance()->getLinkBatchFactory()->newLinkBatch( $titles );
195            $lb->setCaller( __METHOD__ );
196            $lb->execute();
197            // This is far from perfect:
198            // - we won't try to group similar redirects since we don't know which one
199            // is the official one
200            // - we will certainly suggest multiple times the same pages
201            // - we must not run a second pass at query time: no redirect suggestion
202            foreach ( $crossNsTitles as $data ) {
203                $suggestion = [
204                    'text' => $data['text'],
205                    'variants' => []
206                ];
207                $docs[] = $this->buildTitleSuggestion( (string)$data['title']->getArticleID(), $suggestion,
208                    $data['score'], $data['inputDoc'], $data['explain'] );
209            }
210        }
211        return $docs;
212    }
213
214    /**
215     * Build classic suggestion
216     *
217     * @param string $docId
218     * @param array $inputDoc
219     * @param bool $explain
220     * @return \Elastica\Document[] a set of suggest documents
221     */
222    private function buildNormalSuggestions( $docId, array $inputDoc, $explain = false ) {
223        if ( !isset( $inputDoc['title'] ) ) {
224            // Bad doc, nothing to do here.
225            return [];
226        }
227
228        $score = $this->scoringMethod->score( $inputDoc );
229        $explainDetails = null;
230        if ( $explain ) {
231            $explainDetails = $this->scoringMethod->explain( $inputDoc );
232        }
233
234        $suggestions = $this->extractTitleAndSimilarRedirects( $inputDoc );
235
236        $docs = [ $this->buildTitleSuggestion( $docId, $suggestions['group'], $score, $inputDoc, $explainDetails ) ];
237        if ( !empty( $suggestions['candidates'] ) ) {
238            $docs[] = $this->buildRedirectsSuggestion( $docId, $suggestions['candidates'], $score, $inputDoc, $explainDetails );
239        }
240        return $docs;
241    }
242
243    /**
244     * The fields needed to build and score documents.
245     *
246     * @return string[] the list of fields
247     */
248    public function getRequiredFields() {
249        $fields = $this->scoringMethod->getRequiredFields();
250        $fields = array_merge( $fields, [ 'title', 'redirect', 'namespace' ] );
251        foreach ( $this->extraBuilders as $extraBuilder ) {
252            $fields = array_merge( $fields, $extraBuilder->getRequiredFields() );
253        }
254        return array_values( array_unique( $fields ) );
255    }
256
257    /**
258     * Builds the 'title' suggestion.
259     *
260     * @param string $docId the page id
261     * @param array $title the title in 'text' and an array of similar redirects in 'variants'
262     * @param int $score the weight of the suggestion
263     * @param mixed[] $inputDoc
264     * @param array|null $scoreExplanation
265     * @return \Elastica\Document the suggestion document
266     */
267    private function buildTitleSuggestion( $docId, array $title, $score, array $inputDoc, array $scoreExplanation = null ) {
268        $inputs = [ $title['text'] ];
269        foreach ( $title['variants'] as $variant ) {
270            $inputs[] = $variant;
271        }
272        return $this->buildSuggestion(
273            self::TITLE_SUGGESTION,
274            $docId,
275            $inputs,
276            $score,
277            $inputDoc,
278            $scoreExplanation
279        );
280    }
281
282    /**
283     * Builds the 'redirects' suggestion.
284     * The score will be discounted by the REDIRECT_DISCOUNT factor.
285     * NOTE: the client will have to fetch the doc redirects when searching
286     * and choose the best one to display. This is because we are unable
287     * to make this decision at index time.
288     *
289     * @param string $docId the elasticsearch document id
290     * @param string[] $redirects
291     * @param int $score the weight of the suggestion
292     * @param mixed[] $inputDoc
293     * @param array|null $scoreExplanation
294     * @return \Elastica\Document the suggestion document
295     */
296    private function buildRedirectsSuggestion( $docId, array $redirects, $score, array $inputDoc, array $scoreExplanation = null ) {
297        $inputs = [];
298        foreach ( $redirects as $redirect ) {
299            $inputs[] = $redirect;
300        }
301        // TODO: add redirect discount explanation
302        $score = (int)( $score * self::REDIRECT_DISCOUNT );
303        return $this->buildSuggestion( self::REDIRECT_SUGGESTION, $docId, $inputs,
304            $score, $inputDoc, $scoreExplanation );
305    }
306
307    /**
308     * Builds a suggestion document.
309     *
310     * @param string $suggestionType suggestion type (title or redirect)
311     * @param string $docId The document id
312     * @param string[] $inputs the suggestion inputs
313     * @param int $score the weight of the suggestion
314     * @param mixed[] $inputDoc
315     * @param array|null $scoreExplanation
316     * @return \Elastica\Document a doc ready to be indexed in the completion suggester
317     */
318    private function buildSuggestion( $suggestionType, $docId, array $inputs, $score, array $inputDoc, array $scoreExplanation = null ) {
319        $doc = [
320            'batch_id' => $this->batchId,
321            'source_doc_id' => $inputDoc['id'],
322            'target_title' => [
323                'title' => $inputDoc['title'],
324                'namespace' => $inputDoc['namespace'],
325            ],
326            'suggest' => [
327                'input' => $inputs,
328                'weight' => $score
329            ],
330            'suggest-stop' => [
331                'input' => $inputs,
332                'weight' => $score
333            ]
334        ];
335
336        $suggestDoc = new \Elastica\Document( self::encodeDocId( $suggestionType, $docId ), $doc );
337        foreach ( $this->extraBuilders as $builder ) {
338            $builder->build( $inputDoc, $suggestionType, $score, $suggestDoc, $this->targetNamespace );
339        }
340        if ( $scoreExplanation !== null ) {
341            $suggestDoc->set( 'score_explanation', $scoreExplanation );
342        }
343        return $suggestDoc;
344    }
345
346    /**
347     * @param string $input A page title
348     * @return string A page title short enough to not cause indexing
349     *  issues.
350     */
351    public function trimForDistanceCheck( $input ) {
352        if ( mb_strlen( $input ) > self::MAX_INPUT_LENGTH ) {
353            $input = mb_substr( $input, 0, self::MAX_INPUT_LENGTH );
354        }
355        return $input;
356    }
357
358    /**
359     * Extracts title with redirects that are very close.
360     * It will allow to make one suggestion with title as the
361     * output and title + similar redirects as the inputs.
362     * It can be useful to avoid displaying redirects created to
363     * to handle typos.
364     *
365     * e.g. :
366     *   title: Giraffe
367     *   redirects: Girafe, Girraffe, Mating Giraffes
368     * will output
369     *   - 'group' : { 'text': 'Giraffe', 'variants': ['Girafe', 'Girraffe'] }
370     *   - 'candidates' : ['Mating Giraffes']
371     *
372     * It would be nice to do this for redirects but we have no way to decide
373     * which redirect is a typo and this technique would simply take the first
374     * redirect in the list.
375     *
376     * @param array $doc
377     * @return array mixed 'group' key contains the group with the
378     *         lead and its variants and 'candidates' contains the remaining
379     *         candidates that were not close enough to $groupHead.
380     */
381    public function extractTitleAndSimilarRedirects( array $doc ) {
382        $redirects = [];
383        if ( isset( $doc['redirect'] ) ) {
384            foreach ( $doc['redirect'] as $redir ) {
385                // Avoid suggesting/displaying non existent titles
386                // in the target namespace
387                if ( $redir['namespace'] == $this->targetNamespace ) {
388                    $redirects[] = $redir['title'];
389                }
390            }
391        }
392        return $this->extractSimilars( $doc['title'], $redirects, true );
393    }
394
395    /**
396     * Extracts from $candidates the values that are "similar" to $groupHead
397     *
398     * @param string $groupHead
399     * @param string[] $candidates
400     * @param bool $checkVariants if the candidate does not match the groupHead try to match a variant
401     * @return array 'group' key contains the group with the
402     *         head and its variants and 'candidates' contains the remaining
403     *         candidates that were not close enough to $groupHead.
404     */
405    private function extractSimilars( $groupHead, array $candidates, $checkVariants = false ) {
406        $group = [
407            'text' => $groupHead,
408            'variants' => []
409        ];
410        $newCandidates = [];
411        foreach ( $candidates as $c ) {
412            $distance = $this->distance( $groupHead, $c );
413            if ( $distance > self::GROUP_ACCEPTABLE_DISTANCE && $checkVariants ) {
414                // Run a second pass over the variants
415                foreach ( $group['variants'] as $v ) {
416                    $distance = $this->distance( $v, $c );
417                    if ( $distance <= self::GROUP_ACCEPTABLE_DISTANCE ) {
418                        break;
419                    }
420                }
421            }
422            if ( $distance <= self::GROUP_ACCEPTABLE_DISTANCE ) {
423                $group['variants'][] = $c;
424            } else {
425                $newCandidates[] = $c;
426            }
427        }
428
429        return [
430            'group' => $group,
431            'candidates' => $newCandidates
432        ];
433    }
434
435    /**
436     * Computes the edit distance between $a and $b.
437     *
438     * @param string $a
439     * @param string $b
440     * @return int the edit distance between a and b
441     */
442    private function distance( $a, $b ) {
443        $a = $this->trimForDistanceCheck( $a );
444        $b = $this->trimForDistanceCheck( $b );
445        $a = mb_strtolower( $a );
446        $b = mb_strtolower( $b );
447
448        $aLength = mb_strlen( $a );
449        $bLength = mb_strlen( $b );
450
451        $commonPrefixLen = self::REDIRECT_COMMON_PREFIX_LEN;
452
453        if ( $aLength < $commonPrefixLen ) {
454            $commonPrefixLen = $aLength;
455        }
456        if ( $bLength < $commonPrefixLen ) {
457            $commonPrefixLen = $bLength;
458        }
459
460        // check the common prefix
461        if ( mb_substr( $a, 0, $commonPrefixLen ) != mb_substr( $b, 0, $commonPrefixLen ) ) {
462            return PHP_INT_MAX;
463        }
464
465        // TODO: switch to a ratio instead of raw distance would help to group
466        // longer strings
467        return levenshtein( $a, $b );
468    }
469
470    /**
471     * Encode the suggestion doc id
472     * @param string $suggestionType
473     * @param string $docId
474     * @return string
475     */
476    public static function encodeDocId( $suggestionType, $docId ) {
477        return $docId . $suggestionType;
478    }
479
480    /**
481     * Encode possible docIds used by the completion suggester index
482     *
483     * @param string $docId
484     * @return string[] list of docIds
485     */
486    public static function encodePossibleDocIds( $docId ) {
487        return [
488            self::encodeDocId( self::TITLE_SUGGESTION, $docId ),
489            self::encodeDocId( self::REDIRECT_SUGGESTION, $docId ),
490        ];
491    }
492
493    /**
494     * @return int the batchId
495     */
496    public function getBatchId() {
497        return $this->batchId;
498    }
499
500    /**
501     * @return int the target namespace
502     */
503    public function getTargetNamespace() {
504        return $this->targetNamespace;
505    }
506
507    /**
508     * @param Connection $connection
509     * @param string|null $indexBaseName
510     * @return int
511     */
512    private static function fetchMaxDoc( Connection $connection, $indexBaseName = null ) {
513        // Indices to use for counting max_docs used by scoring functions
514        // Since we work mostly on the content namespace it seems OK to count
515        // only docs in the CONTENT index.
516        $countIndices = [ Connection::CONTENT_INDEX_SUFFIX ];
517
518        $indexBaseName = $indexBaseName ?: $connection->getConfig()->get( 'CirrusSearchIndexBaseName' );
519
520        // Run a first query to count the number of docs.
521        // This is needed for the scoring methods that need
522        // to normalize values against wiki size.
523        $mSearch = new MultiSearch( $connection->getClient() );
524        foreach ( $countIndices as $sourceIndexSuffix ) {
525            $search = new Search( $connection->getClient() );
526            $search->addIndex(
527                $connection->getIndex( $indexBaseName, $sourceIndexSuffix )
528            );
529            $search->getQuery()->setSize( 0 );
530            $search->getQuery()->setTrackTotalHits( true );
531            $mSearch->addSearch( $search );
532        }
533
534        $mSearchRes = $mSearch->search();
535        $total = 0;
536        foreach ( $mSearchRes as $res ) {
537            $total += $res->getTotalHits();
538        }
539        return $total;
540    }
541}