Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
51.45% covered (warning)
51.45%
230 / 447
36.36% covered (danger)
36.36%
12 / 33
CRAP
0.00% covered (danger)
0.00%
0 / 1
Searcher
51.45% covered (warning)
51.45%
230 / 447
36.36% covered (danger)
36.36%
12 / 33
1600.85
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
5
 search
89.47% covered (warning)
89.47%
17 / 19
0.00% covered (danger)
0.00%
0 / 1
3.01
 setResultsType
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 isReturnRaw
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setSort
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 limitSearchToLocalWiki
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 nearMatchTitleSearch
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 countContentWords
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 prefixSearch
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 buildFullTextSearch
94.74% covered (success)
94.74%
18 / 19
0.00% covered (danger)
0.00%
0 / 1
5.00
 searchTextInternal
61.22% covered (warning)
61.22%
30 / 49
0.00% covered (danger)
0.00%
0 / 1
25.43
 get
0.00% covered (danger)
0.00%
0 / 39
0.00% covered (danger)
0.00%
0 / 1
72
 findNamespace
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
6
 buildSearch
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
1
 searchOne
38.46% covered (danger)
38.46%
5 / 13
0.00% covered (danger)
0.00%
0 / 1
10.83
 searchMulti
38.53% covered (danger)
38.53%
42 / 109
0.00% covered (danger)
0.00%
0 / 1
134.41
 updateNamespacesFromQuery
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
20
 getSearchContext
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getPoolCounterType
76.92% covered (warning)
76.92%
10 / 13
0.00% covered (danger)
0.00%
0 / 1
5.31
 isAutomatedRequest
40.00% covered (danger)
40.00%
4 / 10
0.00% covered (danger)
0.00%
0 / 1
4.94
 getOverriddenConnection
60.00% covered (warning)
60.00%
3 / 5
0.00% covered (danger)
0.00%
0 / 1
3.58
 getQueryCacheStatsKey
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 newLog
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 processRawReturn
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 searchArchive
100.00% covered (success)
100.00%
28 / 28
100.00% covered (success)
100.00%
1 / 1
1
 areSearchesTheSame
0.00% covered (danger)
0.00%
0 / 14
0.00% covered (danger)
0.00%
0 / 1
42
 buildInterleaveSearcher
50.00% covered (danger)
50.00%
5 / 10
0.00% covered (danger)
0.00%
0 / 1
6.00
 emptyResultSet
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 applyDebugOptionsToQuery
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 makeSearcher
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 setOffsetLimit
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 getOffsetLimit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 buildFullTextBuilder
90.48% covered (success)
90.48%
19 / 21
0.00% covered (danger)
0.00%
0 / 1
4.01
1<?php
2
3namespace CirrusSearch;
4
5use CirrusSearch\Fallbacks\FallbackRunner;
6use CirrusSearch\Fallbacks\SearcherFactory;
7use CirrusSearch\Maintenance\NullPrinter;
8use CirrusSearch\MetaStore\MetaStoreIndex;
9use CirrusSearch\Parser\BasicQueryClassifier;
10use CirrusSearch\Parser\FullTextKeywordRegistry;
11use CirrusSearch\Parser\NamespacePrefixParser;
12use CirrusSearch\Profile\SearchProfileService;
13use CirrusSearch\Query\CountContentWordsBuilder;
14use CirrusSearch\Query\FullTextQueryBuilder;
15use CirrusSearch\Query\KeywordFeature;
16use CirrusSearch\Query\NearMatchQueryBuilder;
17use CirrusSearch\Query\PrefixSearchQueryBuilder;
18use CirrusSearch\Search\BaseCirrusSearchResultSet;
19use CirrusSearch\Search\FullTextResultsType;
20use CirrusSearch\Search\MSearchRequests;
21use CirrusSearch\Search\MSearchResponses;
22use CirrusSearch\Search\ResultsType;
23use CirrusSearch\Search\SearchContext;
24use CirrusSearch\Search\SearchQuery;
25use CirrusSearch\Search\SearchRequestBuilder;
26use CirrusSearch\Search\TeamDraftInterleaver;
27use CirrusSearch\Search\TitleHelper;
28use CirrusSearch\Search\TitleResultsType;
29use Elastica\Exception\ResponseException;
30use Elastica\Exception\RuntimeException;
31use Elastica\Multi\Search as MultiSearch;
32use Elastica\Query;
33use Elastica\Query\BoolQuery;
34use Elastica\Query\MultiMatch;
35use Elastica\Search;
36use MediaWiki\Logger\LoggerFactory;
37use MediaWiki\MediaWikiServices;
38use MediaWiki\Request\WebRequest;
39use MediaWiki\Status\Status;
40use MediaWiki\Title\Title;
41use MediaWiki\User\User;
42use MediaWiki\WikiMap\WikiMap;
43use RequestContext;
44use Wikimedia\Assert\Assert;
45use Wikimedia\ObjectFactory\ObjectFactory;
46
47/**
48 * Performs searches using Elasticsearch.  Note that each instance of this class
49 * is single use only.
50 *
51 * This program is free software; you can redistribute it and/or modify
52 * it under the terms of the GNU General Public License as published by
53 * the Free Software Foundation; either version 2 of the License, or
54 * (at your option) any later version.
55 *
56 * This program is distributed in the hope that it will be useful,
57 * but WITHOUT ANY WARRANTY; without even the implied warranty of
58 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
59 * GNU General Public License for more details.
60 *
61 * You should have received a copy of the GNU General Public License along
62 * with this program; if not, write to the Free Software Foundation, Inc.,
63 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
64 * http://www.gnu.org/copyleft/gpl.html
65 */
66class Searcher extends ElasticsearchIntermediary implements SearcherFactory {
67    public const SUGGESTION_HIGHLIGHT_PRE = '<em>';
68    public const SUGGESTION_HIGHLIGHT_POST = '</em>';
69    public const HIGHLIGHT_PRE_MARKER = ''; // \uE000. Can't be a unicode literal until php7
70    public const HIGHLIGHT_PRE = '<span class="searchmatch">';
71    public const HIGHLIGHT_POST_MARKER = ''; // \uE001
72    public const HIGHLIGHT_POST = '</span>';
73
74    /**
75     * Maximum offset + limit depth allowed. As in the deepest possible result
76     * to return. Too deep will cause very slow queries. 10,000 feels plenty
77     * deep. This should be <= index.max_result_window in elasticsearch.
78     */
79    private const MAX_OFFSET_LIMIT = 10000;
80
81    /**
82     * Identifies the main search in MSearchRequests/MSearchResponses
83     */
84    public const MAINSEARCH_MSEARCH_KEY = '__main__';
85
86    /**
87     * Identifies the "tested" search request in MSearchRequests/MSearchResponses
88     */
89    private const INTERLEAVED_MSEARCH_KEY = '__interleaved__';
90
91    /**
92     * @var int search offset
93     */
94    protected $offset;
95
96    /**
97     * @var int maximum number of results
98     */
99    protected $limit;
100
101    /**
102     * @var string sort type
103     */
104    private $sort = 'relevance';
105
106    /**
107     * @var string index base name to use
108     */
109    protected $indexBaseName;
110
111    /**
112     * Search environment configuration
113     * @var SearchConfig
114     */
115    protected $config;
116
117    /**
118     * @var SearchContext
119     */
120    protected $searchContext;
121
122    /**
123     * Indexing type we'll be using.
124     * @var string|\Elastica\Index
125     */
126    private $index;
127
128    /**
129     * @var NamespacePrefixParser|null
130     */
131    private $namespacePrefixParser;
132    /**
133     * @var InterwikiResolver
134     */
135    protected $interwikiResolver;
136
137    /** @var TitleHelper */
138    protected $titleHelper;
139    /**
140     * @var CirrusSearchHookRunner
141     */
142    protected $cirrusSearchHookRunner;
143
144    /**
145     * @param Connection $conn
146     * @param int $offset Offset the results by this much
147     * @param int $limit Limit the results to this many
148     * @param SearchConfig $config Configuration settings
149     * @param int[]|null $namespaces Array of namespace numbers to search or null to search all namespaces.
150     * @param User|null $user user for which this search is being performed.  Attached to slow request logs.
151     * @param string|bool $index Base name for index to search from, defaults to $wgCirrusSearchIndexBaseName
152     * @param CirrusDebugOptions|null $options the debugging options to use or null to use defaults
153     * @param NamespacePrefixParser|null $namespacePrefixParser
154     * @param InterwikiResolver|null $interwikiResolver
155     * @param TitleHelper|null $titleHelper
156     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
157     * @see CirrusDebugOptions::defaultOptions()
158     */
159    public function __construct(
160        Connection $conn, $offset,
161        $limit,
162        SearchConfig $config,
163        array $namespaces = null,
164        User $user = null,
165        $index = false,
166        CirrusDebugOptions $options = null,
167        NamespacePrefixParser $namespacePrefixParser = null,
168        InterwikiResolver $interwikiResolver = null,
169        TitleHelper $titleHelper = null,
170        CirrusSearchHookRunner $cirrusSearchHookRunner = null
171    ) {
172        parent::__construct(
173            $conn,
174            $user,
175            $config->get( 'CirrusSearchSlowSearch' ),
176            $config->get( 'CirrusSearchExtraBackendLatency' )
177        );
178        $this->config = $config;
179        $this->setOffsetLimit( $offset, $limit );
180        $this->indexBaseName = $index ?: $config->get( SearchConfig::INDEX_BASE_NAME );
181        // TODO: Make these params mandatory once WBCS stops extending this class
182        $this->namespacePrefixParser = $namespacePrefixParser;
183        $this->interwikiResolver = $interwikiResolver ?: MediaWikiServices::getInstance()->getService( InterwikiResolver::SERVICE );
184        $this->titleHelper = $titleHelper ?: new TitleHelper( WikiMap::getCurrentWikiId(), $this->interwikiResolver );
185        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
186            MediaWikiServices::getInstance()->getHookContainer() );
187        $this->searchContext = new SearchContext( $this->config, $namespaces, $options, null, null, $this->cirrusSearchHookRunner );
188    }
189
190    /**
191     * Unified search public entry-point.
192     *
193     * NOTE: only fulltext search supported for now.
194     * @param SearchQuery $query
195     * @return Status
196     */
197    public function search( SearchQuery $query ) {
198        if ( $query->getDebugOptions()->isCirrusDumpQueryAST() ) {
199            return Status::newGood( [ 'ast' => $query->getParsedQuery()->toArray() ] );
200        }
201        // TODO: properly pass the profile context name and its params once we have a dispatch service.
202        $this->searchContext = SearchContext::fromSearchQuery( $query, FallbackRunner::create( $query, $this->interwikiResolver ),
203            $this->cirrusSearchHookRunner );
204        $this->setOffsetLimit( $query->getOffset(), $query->getLimit() );
205        $this->config = $query->getSearchConfig();
206        $this->sort = $query->getSort();
207
208        if ( $query->getSearchEngineEntryPoint() === SearchQuery::SEARCH_TEXT ) {
209            $this->searchContext->setResultsType(
210                new FullTextResultsType(
211                    $this->searchContext->getFetchPhaseBuilder(),
212                    $query->getParsedQuery()->isQueryOfClass( BasicQueryClassifier::COMPLEX_QUERY ),
213                    $this->titleHelper,
214                    $query->getExtraFieldsToExtract(),
215                    $this->searchContext->getConfig()->getElement( 'CirrusSearchDeduplicateInMemory' ) === true
216                )
217            );
218            return $this->searchTextInternal( $query->getParsedQuery()->getQueryWithoutNsHeader() );
219        } else {
220            throw new \RuntimeException( 'Only ' . SearchQuery::SEARCH_TEXT . ' is supported for now' );
221        }
222    }
223
224    /**
225     * @param ResultsType $resultsType results type to return
226     */
227    public function setResultsType( $resultsType ) {
228        $this->searchContext->setResultsType( $resultsType );
229    }
230
231    /**
232     * Is this searcher used to return debugging info?
233     * @return bool true if the search will return raw output
234     */
235    public function isReturnRaw() {
236        return $this->searchContext->getDebugOptions()->isReturnRaw();
237    }
238
239    /**
240     * Set the type of sort to perform.  Must be 'relevance', 'title_asc', 'title_desc'.
241     * @param string $sort sort type
242     */
243    public function setSort( $sort ) {
244        $this->sort = $sort;
245    }
246
247    /**
248     * Should this search limit results to the local wiki?  If not called the default is false.
249     * @param bool $limitSearchToLocalWiki should the results be limited?
250     */
251    public function limitSearchToLocalWiki( $limitSearchToLocalWiki ) {
252        $this->searchContext->setLimitSearchToLocalWiki( $limitSearchToLocalWiki );
253    }
254
255    /**
256     * Perform a "near match" title search which is pretty much a prefix match without the prefixes.
257     * @param string $term text by which to search
258     * @return Status status containing results defined by resultsType on success
259     */
260    public function nearMatchTitleSearch( $term ) {
261        ( new NearMatchQueryBuilder() )->build( $this->searchContext, $term );
262        return $this->searchOne();
263    }
264
265    /**
266     * Perform a sum over the number of words in the content index
267     * @return Status status containing a single integer
268     */
269    public function countContentWords() {
270        ( new CountContentWordsBuilder() )->build( $this->searchContext );
271        $this->limit = 1;
272        return $this->searchOne();
273    }
274
275    /**
276     * Perform a prefix search.
277     * @param string $term text by which to search
278     * @param string[] $variants variants to search for
279     * @return Status status containing results defined by resultsType on success
280     */
281    public function prefixSearch( $term, $variants = [] ) {
282        ( new PrefixSearchQueryBuilder() )->build( $this->searchContext, $term, $variants );
283        return $this->searchOne();
284    }
285
286    /**
287     * Build full text search for articles with provided term. All the
288     * state is applied to $this->searchContext. The returned query
289     * builder can be used to build a degraded query if necessary.
290     *
291     * @param string $term term to search
292     * @return FullTextQueryBuilder
293     */
294    protected function buildFullTextSearch( $term ) {
295        // Convert the unicode character 'ideographic whitespace' into standard
296        // whitespace. Cirrussearch treats them both as normal whitespace, but
297        // the preceding isn't appropriately trimmed.
298        // No searching for nothing! That takes forever!
299        $term = trim( str_replace( "\xE3\x80\x80", " ", $term ) );
300        if ( $term === '' ) {
301            $this->searchContext->setResultsPossible( false );
302        }
303
304        $builderSettings = $this->config->getProfileService()
305            ->loadProfileByName( SearchProfileService::FT_QUERY_BUILDER,
306                $this->searchContext->getFulltextQueryBuilderProfile() );
307        $features = ( new FullTextKeywordRegistry( $this->config ) )->getKeywords();
308        $qb = self::buildFullTextBuilder( $builderSettings, $this->config, $features );
309
310        $qb->build( $this->searchContext, $term );
311
312        if ( $this->searchContext->getSearchQuery() !== null ) {
313            $degradeOnParseWarnings = [
314                // && test, test AND && test
315                'cirrussearch-parse-error-unexpected-token',
316                // test AND
317                'cirrussearch-parse-error-unexpected-end'
318            ];
319            // Quick hack to avoid sending bad queries to the backend
320            foreach ( $this->searchContext->getSearchQuery()->getParsedQuery()->getParseWarnings() as $warning ) {
321                if ( in_array( $warning->getMessage(), $degradeOnParseWarnings ) ) {
322                    $qb->buildDegraded( $this->searchContext );
323                    return $qb;
324                }
325            }
326        }
327
328        return $qb;
329    }
330
331    /**
332     * @param string $term
333     * @return Status
334     */
335    private function searchTextInternal( $term ) {
336        // Searcher needs to be cloned before any actual query building is done.
337        $interleaveSearcher = $this->buildInterleaveSearcher();
338
339        $qb = $this->buildFullTextSearch( $term );
340        $mainSearch = $this->buildSearch();
341        $searches = MSearchRequests::build( self::MAINSEARCH_MSEARCH_KEY, $mainSearch );
342        $description = "{$this->searchContext->getSearchType()} search for '{$this->searchContext->getOriginalSearchTerm()}'";
343
344        if ( !$this->searchContext->areResultsPossible() ) {
345            if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
346                // return the empty array to suggest that no query will be run
347                return Status::newGood( [] );
348            }
349            $status = $this->emptyResultSet();
350            if ( $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
351                return Status::newGood(
352                    ( new MSearchResponses( [ $status->getValue() ], [] ) )->dumpResults( $description )
353                );
354            }
355            return $status;
356        }
357
358        if ( $interleaveSearcher !== null ) {
359            $interleaveSearcher->buildFullTextSearch( $term );
360            $interleaveSearch = $interleaveSearcher->buildSearch();
361            if ( $this->areSearchesTheSame( $mainSearch, $interleaveSearch ) ) {
362                $interleaveSearcher = null;
363            } else {
364                $searches->addRequest( self::INTERLEAVED_MSEARCH_KEY, $interleaveSearch );
365            }
366        }
367
368        $fallbackRunner = $this->searchContext->getFallbackRunner();
369        $fallbackRunner->attachSearchRequests( $searches, $this->connection->getClient() );
370
371        if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
372            return $searches->dumpQuery( $description );
373        }
374
375        $responses = $this->searchMulti( $searches );
376        if ( $responses->hasFailure() ) {
377            $status = $responses->getFailure();
378            if ( ElasticaErrorHandler::isParseError( $status ) ) {
379                // Rebuild the search context because we need a fresh fetchPhaseBuilder
380                $this->searchContext = $this->searchContext->withConfig( $this->config );
381                if ( $qb->buildDegraded( $this->searchContext ) ) {
382                    // If that doesn't work we're out of luck but it should.
383                    // There no guarantee it'll work properly with the syntax
384                    // we've built above but it'll do _something_ and we'll
385                    // still work on fixing all the parse errors that come in.
386                    $status = $this->searchOne();
387                }
388            }
389            return $status;
390        }
391
392        if ( $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
393            return $responses->dumpResults( $description );
394        }
395
396        $rType = $this->getSearchContext()->getResultsType();
397        $mainSet = $responses->transformAsResultSet( $rType, self::MAINSEARCH_MSEARCH_KEY );
398        if ( $interleaveSearcher !== null ) {
399            $interleaver = new TeamDraftInterleaver( $this->searchContext->getOriginalSearchTerm() );
400            $testedSet = $responses->transformAsResultSet( $rType, self::INTERLEAVED_MSEARCH_KEY );
401            $response = $interleaver->interleave( $mainSet, $testedSet, $this->limit );
402        } else {
403            $response = $mainSet;
404        }
405
406        $status = Status::newGood();
407        if ( $this->namespacePrefixParser !== null ) {
408            $status = Status::newGood( $fallbackRunner->run( $this, $response, $responses,
409                $this->namespacePrefixParser, $this->cirrusSearchHookRunner ) );
410            $this->appendMetrics( $fallbackRunner );
411        }
412
413        foreach ( $this->searchContext->getWarnings() as $warning ) {
414            $status->warning( ...$warning );
415        }
416        return $status;
417    }
418
419    /**
420     * Get the page with $docId.  Note that the result is a status containing _all_ pages found.
421     * It is possible to find more then one page if the page is in multiple indexes.
422     * @param string[] $docIds array of document ids
423     * @param string[]|bool $sourceFiltering source filtering to apply
424     * @param bool $usePoolCounter false to disable the pool counter
425     * @return Status containing pages found, containing an empty array if not found,
426     *    or an error if there was an error
427     */
428    public function get( array $docIds, $sourceFiltering, $usePoolCounter = true ) {
429        $connection = $this->getOverriddenConnection();
430        $indexSuffix = $connection->pickIndexSuffixForNamespaces(
431            $this->searchContext->getNamespaces()
432        );
433
434        // The worst case would be to have all ids duplicated in all available indices.
435        // We set the limit accordingly
436        $size = count( $connection->getAllIndexSuffixesForNamespaces(
437            $this->searchContext->getNamespaces()
438        ) );
439        $size *= count( $docIds );
440
441        $work = function () use ( $docIds, $sourceFiltering, $indexSuffix, $size, $connection ) {
442            try {
443                $this->startNewLog( 'get of {indexSuffix}.{docIds}', 'get', [
444                    'indexSuffix' => $indexSuffix,
445                    'docIds' => $docIds,
446                ] );
447                // Shard timeout not supported on get requests so we just use the client side timeout
448                $connection->setTimeout( $this->getClientTimeout( 'get' ) );
449                // We use a search query instead of _get/_mget, these methods are
450                // theorically well suited for this kind of job but they are not
451                // supported on aliases with multiple indices (content/general)
452                $index = $connection->getIndex( $this->indexBaseName, $indexSuffix );
453                $query = new \Elastica\Query( new \Elastica\Query\Ids( $docIds ) );
454                if ( is_array( $sourceFiltering ) ) {
455                    // The title is a required field in the ApiTrait
456                    if ( !in_array( "title", $sourceFiltering ) ) {
457                        array_push( $sourceFiltering, "title" );
458                    }
459                    $query->setParam( '_source', $sourceFiltering );
460                }
461                $query->addParam( 'stats', 'get' );
462                // We ignore limits provided to the searcher
463                // otherwize we could return fewer results than
464                // the ids requested.
465                $query->setFrom( 0 );
466                $query->setSize( $size );
467                $resultSet = $index->search( $query, [ 'search_type' => 'query_then_fetch' ] );
468                if ( !$resultSet->getResponse()->isOK() ) {
469                    $request = $connection->getClient()->getLastRequest();
470                    if ( $request == null ) {
471                        // I can't imagine how this would happen, but the type signature allows
472                        // for a null last request so we provide a minimal workaround.
473                        throw new \Elastica\Exception\RuntimeException(
474                            "Response reports failure, but no last request available" );
475                    }
476                    throw new ResponseException( $request, $resultSet->getResponse() );
477                }
478                return $this->success( $resultSet->getResults(), $connection );
479            } catch ( \Elastica\Exception\NotFoundException $e ) {
480                // NotFoundException just means the field didn't exist.
481                // It is up to the caller to decide if that is an error.
482                return $this->success( [], $connection );
483            } catch ( \Elastica\Exception\ExceptionInterface $e ) {
484                return $this->failure( $e, $connection );
485            }
486        };
487
488        if ( $usePoolCounter ) {
489            return Util::doPoolCounterWork( $this->getPoolCounterType(), $this->user, $work );
490        } else {
491            return $work();
492        }
493    }
494
495    /**
496     * @param string $name
497     * @return Status
498     */
499    private function findNamespace( $name ) {
500        return Util::doPoolCounterWork(
501            'CirrusSearch-NamespaceLookup',
502            $this->user,
503            function () use ( $name ) {
504                try {
505                    $this->startNewLog( 'lookup namespace for {namespaceName}', 'namespace', [
506                        'namespaceName' => $name,
507                        'query' => $name,
508                    ] );
509                    $connection = $this->getOverriddenConnection();
510                    $connection->setTimeout( $this->getClientTimeout( 'namespace' ) );
511
512                    // A bit awkward, but accepted as this is the backup
513                    // implementation of namespace lookup. Deployments should
514                    // prefer to install php-intl and use utr30.
515                    $store = ( new MetaStoreIndex( $connection, new NullPrinter(), $this->config ) )
516                        ->namespaceStore();
517                    $resultSet = $store->find( $name, [
518                        'timeout' => $this->getTimeout( 'namespace' ),
519                    ] );
520                    return $this->success( $resultSet->getResults(), $connection );
521                } catch ( \Elastica\Exception\ExceptionInterface $e ) {
522                    return $this->failure( $e, $connection );
523                }
524            } );
525    }
526
527    /**
528     * @return \Elastica\Search
529     */
530    protected function buildSearch() {
531        $builder = new SearchRequestBuilder(
532            $this->searchContext, $this->getOverriddenConnection(), $this->indexBaseName );
533        return $builder->setLimit( $this->limit )
534            ->setOffset( $this->offset )
535            ->setIndex( $this->index )
536            ->setSort( $this->sort )
537            ->setTimeout( $this->getTimeout( $this->searchContext->getSearchType() ) )
538            ->build();
539    }
540
541    /**
542     * Perform a single-query search.
543     * @return Status
544     */
545    protected function searchOne() {
546        $search = $this->buildSearch();
547        $description = "{$this->searchContext->getSearchType()} search for '{$this->searchContext->getOriginalSearchTerm()}'";
548        $msearch = MSearchRequests::build( self::MAINSEARCH_MSEARCH_KEY, $search );
549        if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
550            return $msearch->dumpQuery( $description );
551        }
552        if ( !$this->searchContext->areResultsPossible() ) {
553            return $this->emptyResultSet();
554        }
555
556        $mresults = $this->searchMulti( $msearch );
557
558        if ( $mresults->hasFailure() ) {
559            return $mresults->getFailure();
560        }
561
562        if ( $this->searchContext->getDebugOptions()->isReturnRaw() ) {
563            return $mresults->dumpResults( $description );
564        }
565        return $mresults->transformAndGetSingle( $this->searchContext->getResultsType(), self::MAINSEARCH_MSEARCH_KEY );
566    }
567
568    /**
569     * Powers full-text-like searches including prefix search.
570     *
571     * @param MSearchRequests $msearches
572     * @return MSearchResponses search responses
573     */
574    protected function searchMulti( MSearchRequests $msearches ) {
575        $searches = $msearches->getRequests();
576        $contextResultsType = $this->searchContext->getResultsType();
577        $cirrusDebugOptions = $this->searchContext->getDebugOptions();
578        Assert::precondition( !$cirrusDebugOptions->isCirrusDumpQuery(), 'Must not reach this method when dumping the query' );
579
580        // TODO: should this be moved upper in the stack?
581        if ( $this->limit <= 0 ) {
582            return $msearches->failure( Status::newFatal( 'cirrussearch-offset-too-large',
583                self::MAX_OFFSET_LIMIT, $this->offset ) );
584        }
585
586        $connection = $this->getOverriddenConnection();
587        $log = new MultiSearchRequestLog(
588            $connection->getClient(),
589            "{queryType} search for '{query}'",
590            $this->searchContext->getSearchType(),
591            [
592                'query' => $this->searchContext->getOriginalSearchTerm(),
593                'limit' => $this->limit ?: null,
594                // Used syntax
595                'syntax' => $this->searchContext->getSyntaxUsed(),
596            ],
597            $this->searchContext->getNamespaces() ?: []
598        );
599
600        // Similar to indexing support only the bulk code path, rather than
601        // single and bulk. The extra overhead should be minimal, and the
602        // reduced complexity is welcomed.
603        $search = new MultiSearch( $connection->getClient() );
604        $search->addSearches( $searches );
605
606        $connection->setTimeout( $this->getClientTimeout( $this->searchContext->getSearchType() ) );
607
608        if ( $this->config->get( 'CirrusSearchMoreAccurateScoringMode' ) ) {
609            $search->setSearchType( \Elastica\Search::OPTION_SEARCH_TYPE_DFS_QUERY_THEN_FETCH );
610        }
611
612        // Perform the search
613        $work = function () use ( $search, $log, $connection ) {
614            return Util::doPoolCounterWork(
615                $this->getPoolCounterType(),
616                $this->user,
617                function () use ( $search, $log, $connection ) {
618                    // @todo only reports the first error, also turns
619                    // a partial (single search) error into a complete
620                    // failure across the board. Should be addressed
621                    // at some point.
622                    return $this->runMSearch( $search, $log, $connection );
623                },
624                $this->searchContext->isSyntaxUsed( 'regex' ) ?
625                    'cirrussearch-regex-too-busy-error' : null
626            );
627        };
628
629        // Wrap with caching if needed, but don't cache debugging queries
630        $skipCache = $cirrusDebugOptions->mustNeverBeCached();
631        if ( $this->searchContext->getCacheTtl() > 0 && !$skipCache ) {
632            $work = function () use ( $work, $searches, $log, $contextResultsType ) {
633                $services = MediaWikiServices::getInstance();
634                $requestStats = $services->getStatsdDataFactory();
635                $cache = $services->getMainWANObjectCache();
636                $keyParts = [];
637                foreach ( $searches as $key => $search ) {
638                    $keyParts[] = $search->getPath() .
639                        serialize( $search->getOptions() ) .
640                        serialize( $search->getQuery()->toArray() ) .
641                        ( $contextResultsType !== null ? get_class( $contextResultsType ) : "NONE" );
642                }
643                $key = $cache->makeKey( 'cirrussearch', 'search', 'v2', md5(
644                    implode( '|', $keyParts )
645                ) );
646                $cacheResult = $cache->get( $key );
647                $statsKey = $this->getQueryCacheStatsKey();
648                if ( $cacheResult ) {
649                    [ $logVariables, $multiResultSet ] = $cacheResult;
650                    $requestStats->increment( "$statsKey.hit" );
651                    $log->setCachedResult( $logVariables );
652                    $this->successViaCache( $log );
653
654                    if ( $multiResultSet->isOK() ) {
655                        /**
656                         * @var $cachedMResultSet \Elastica\Multi\ResultSet
657                         */
658                        $cachedMResultSet = $multiResultSet->getValue();
659                        if ( count( $cachedMResultSet->getResultSets() ) !== count( $searches ) ) {
660                            LoggerFactory::getInstance( 'CirrusSearch' )
661                                ->warning( 'Ignoring a cached Multi/ResultSet wanted {nb_queries} response(s) but received {nb_responses}',
662                                    [
663                                        'nb_queries' => count( $searches ),
664                                        'nb_responses' => count( $cachedMResultSet->getResultSets() )
665                                    ] );
666                            $requestStats->increment( "$statsKey.incoherent" );
667                        } else {
668                            return $multiResultSet;
669                        }
670                    } else {
671                        LoggerFactory::getInstance( 'CirrusSearch' )
672                            ->warning( 'Cached a Status value that is not OK' );
673                        $requestStats->increment( "$statsKey.nok" );
674                    }
675                } else {
676                    $requestStats->increment( "$statsKey.miss" );
677                }
678
679                $multiResultSet = $work();
680
681                if ( $multiResultSet->isOK() ) {
682                    $isPartialResult = false;
683                    foreach ( $multiResultSet->getValue()->getResultSets() as $resultSet ) {
684                        $responseData = $resultSet->getResponse()->getData();
685                        if ( isset( $responseData['timed_out'] ) && $responseData['timed_out'] ) {
686                            $isPartialResult = true;
687                            break;
688                        }
689                    }
690                    if ( !$isPartialResult ) {
691                        $requestStats->increment( "$statsKey.set" );
692                        $cache->set(
693                            $key,
694                            [ $log->getLogVariables(), $multiResultSet ],
695                            $this->searchContext->getCacheTtl()
696                        );
697                    }
698                }
699
700                return $multiResultSet;
701            };
702        }
703
704        $status = $work();
705
706        // @todo Does this need anything special for multi-search changes?
707        if ( !$status->isOK() ) {
708            return $msearches->failure( $status );
709        }
710
711        $response = $status->getValue();
712        /**
713         * @var $response \Elastica\Multi\ResultSet
714         */
715        if ( count( $response->getResultSets() ) !== count( $msearches->getRequests() ) ) {
716            // Temp hack to investigate T231023 (use php serialize just in case it has some invalid
717            // UTF8 sequences that would prevent this message from being sent to logstash
718            LoggerFactory::getInstance( 'CirrusSearch' )
719                ->warning( "Incoherent response received (#searches != #responses) for {query}: {response}",
720                    [ 'query' => $this->searchContext->getOriginalSearchTerm(), 'response' => serialize( $response->getResponse() ) ] );
721            return $msearches->failure( Status::newFatal( 'cirrussearch-backend-error' ) );
722        }
723        $mreponses = $msearches->toMSearchResponses( $response->getResultSets() );
724        if ( $mreponses->hasTimeout() ) {
725            LoggerFactory::getInstance( 'CirrusSearch' )->warning(
726                $log->getDescription() . " timed out and only returned partial results!",
727                $log->getLogVariables()
728            );
729            $this->searchContext->addWarning( $this->searchContext->isSyntaxUsed( 'regex' )
730                ? 'cirrussearch-regex-timed-out'
731                : 'cirrussearch-timed-out'
732            );
733        }
734        return $mreponses;
735    }
736
737    /**
738     * Attempt to suck a leading namespace followed by a colon from the query string.
739     * Reaches out to Elasticsearch to perform normalized lookup against the namespaces.
740     * Should be fast but for the network hop.
741     *
742     * @param string &$query
743     */
744    public function updateNamespacesFromQuery( &$query ) {
745        $colon = strpos( $query, ':' );
746        if ( $colon === false ) {
747            return;
748        }
749        $namespaceName = substr( $query, 0, $colon );
750        $status = $this->findNamespace( $namespaceName );
751        // Failure case is already logged so just handle success case
752        if ( !$status->isOK() ) {
753            return;
754        }
755        $foundNamespace = $status->getValue();
756        if ( !$foundNamespace ) {
757            return;
758        }
759        $foundNamespace = $foundNamespace[ 0 ];
760        $query = substr( $query, $colon + 1 );
761        $this->searchContext->setNamespaces( [ $foundNamespace->namespace_id ] );
762    }
763
764    /**
765     * @return SearchContext
766     */
767    public function getSearchContext() {
768        return $this->searchContext;
769    }
770
771    private function getPoolCounterType(): string {
772        // Default pool counter for all search requests. Note that not all
773        // possible requests go through Searcher, so this isn't globally
774        // definitive.
775        $pool = 'CirrusSearch-Search';
776        // Pool counter overrides based on query syntax. Goal is to
777        // separate expensive or high-volume traffic into dedicated
778        // pools with specific limits. Prefix is only high volume
779        // when completion is disabled.
780        $poolCounterTypes = [
781            'regex' => 'CirrusSearch-Regex',
782            'prefix' => 'CirrusSearch-Prefix',
783            'more_like' => 'CirrusSearch-MoreLike',
784        ];
785        foreach ( $poolCounterTypes as $type => $counter ) {
786            if ( $this->searchContext->isSyntaxUsed( $type ) ) {
787                $pool = $counter;
788                break;
789            }
790        }
791        // Put external automated requests into their own bucket The main idea
792        // here is to allow automated access, but prevent that automation from
793        // capping out the pools used by interactive queries.
794        // It's not clear when the automation bucket should not override other
795        // bucketing decisions, for now override everything except Regex since
796        // those can be very expensive and usually use a small pool. If both
797        // the automation and regex pools filled with regexes it would be
798        // significantly more load than expected.
799        if ( $pool !== 'CirrusSearch-Regex' && $this->isAutomatedRequest() ) {
800            $pool = 'CirrusSearch-Automated';
801        }
802        return $pool;
803    }
804
805    private function isAutomatedRequest(): bool {
806        $req = RequestContext::getMain()->getRequest();
807        try {
808            $ip = $req->getIP();
809        } catch ( \MWException $e ) {
810            // No IP, typically this means a CLI invocation. We are attempting
811            // to segregate external automation, internal automation has its
812            // own ability to control configuration and shouldn't be flagged
813            if ( MW_ENTRY_POINT === 'cli' ) {
814                return false;
815            }
816            // When can we get here? Is this ever run?
817            LoggerFactory::getInstance( 'CirrusSearch' )->info(
818                'No IP available during automated request check' );
819            return false;
820        }
821        return Util::looksLikeAutomation(
822            $this->config, $ip, $req->getAllHeaders() );
823    }
824
825    /**
826     * Some queries, like more like this, are quite expensive and can cause
827     * latency spikes. This allows redirecting queries using particular
828     * features to specific clusters.
829     * @return Connection
830     */
831    private function getOverriddenConnection() {
832        $overrides = $this->config->get( 'CirrusSearchClusterOverrides' );
833        foreach ( $overrides as $feature => $cluster ) {
834            if ( $this->searchContext->isSyntaxUsed( $feature ) ) {
835                return Connection::getPool( $this->config, $cluster );
836            }
837        }
838        return $this->connection;
839    }
840
841    /**
842     * @return string The stats key used for reporting hit/miss rates of the
843     *  application side query cache.
844     */
845    protected function getQueryCacheStatsKey() {
846        $type = $this->searchContext->getSearchType();
847        return "CirrusSearch.query_cache.$type";
848    }
849
850    /**
851     * @param string $description
852     * @param string $queryType
853     * @param string[] $extra
854     * @return SearchRequestLog
855     */
856    protected function newLog( $description, $queryType, array $extra = [] ) {
857        return new SearchRequestLog(
858            $this->getOverriddenConnection()->getClient(),
859            $description,
860            $queryType,
861            $extra
862        );
863    }
864
865    /**
866     * If we're supposed to create raw result, create and return it,
867     * or output it and finish.
868     * @param mixed $result Search result data
869     * @param WebRequest $request Request context
870     * @return string The new raw result.
871     */
872    public function processRawReturn( $result, WebRequest $request ) {
873        return Util::processSearchRawReturn( $result, $request,
874            $this->searchContext->getDebugOptions() );
875    }
876
877    /**
878     * Search titles in archive
879     * @param string $term
880     * @return Status<Title[]>
881     */
882    public function searchArchive( $term ) {
883        $this->searchContext->setOriginalSearchTerm( $term );
884        $term = $this->searchContext->escaper()->fixupWholeQueryString( $term );
885        $this->setResultsType( new TitleResultsType() );
886
887        // This does not support cross-cluster search, but there is also no use case
888        // for cross-wiki archive search.
889        $this->index = $this->getOverriddenConnection()->getArchiveIndex( $this->indexBaseName );
890
891        // Setup the search query
892        $query = new BoolQuery();
893
894        $multi = new MultiMatch();
895        $multi->setType( 'best_fields' );
896        $multi->setTieBreaker( 0 );
897        $multi->setQuery( $term );
898        $multi->setFields( [
899            'title.near_match^100',
900            'title.near_match_asciifolding^75',
901            'title.plain^50',
902            'title^25'
903        ] );
904        $multi->setOperator( 'AND' );
905
906        $fuzzy = new \Elastica\Query\MatchQuery();
907        $fuzzy->setFieldQuery( 'title.plain', $term );
908        $fuzzy->setFieldFuzziness( 'title.plain', 'AUTO' );
909        $fuzzy->setFieldOperator( 'title.plain', 'AND' );
910
911        $query->addShould( $multi );
912        $query->addShould( $fuzzy );
913        $query->setMinimumShouldMatch( 1 );
914
915        $this->sort = 'just_match';
916
917        $this->searchContext->setMainQuery( $query );
918        $this->searchContext->addSyntaxUsed( 'archive' );
919        $this->searchContext->setRescoreProfile( 'empty' );
920
921        return $this->searchOne();
922    }
923
924    /**
925     * Tests if two search objects are equivalent
926     *
927     * @param Search $a
928     * @param Search $b
929     * @return bool
930     */
931    private function areSearchesTheSame( Search $a, Search $b ) {
932        // same object.
933        if ( $a === $b ) {
934            return true;
935        }
936
937        // Check values not included in toArray()
938        if ( $a->getPath() !== $b->getPath()
939            || $a->getOptions() != $b->getOptions()
940        ) {
941            return false;
942        }
943
944        $aArray = $a->getQuery()->toArray();
945        $bArray = $b->getQuery()->toArray();
946
947        // normalize the 'now' value which contains a timestamp that
948        // may vary.
949        $fixNow = static function ( &$value, $key ) {
950            if ( $key === 'now' && is_int( $value ) ) {
951                $value = 12345678;
952            }
953        };
954        array_walk_recursive( $aArray, $fixNow );
955        array_walk_recursive( $bArray, $fixNow );
956
957        // Simplest form, requires both arrays to have exact same ordering,
958        // types, keys, etc. We could try much harder to remove edge cases,
959        // but they probably don't matter too much. The main thing we are
960        // looking for is if configuration used for interleaved search didn't
961        // have an effect query building. If we get it wrong in some rare
962        // cases it should have minimal effects on the interleaved search test.
963        return $aArray === $bArray;
964    }
965
966    private function buildInterleaveSearcher() {
967        // If we aren't on the first page, or the user has specified
968        // some custom magic query options (override rescore profile,
969        // etc) then don't interleave.
970        if ( $this->offset > 0 || $this->searchContext->isDirty() ) {
971            return null;
972        }
973
974        // Is interleaving configured?
975        $overrides = $this->config->get( 'CirrusSearchInterleaveConfig' );
976        if ( $overrides === null ) {
977            return null;
978        }
979
980        $config = new HashSearchConfig( $overrides, [ HashSearchConfig::FLAG_INHERIT ] );
981        $other = clone $this;
982        $other->config = $config;
983        $other->searchContext = $other->searchContext->withConfig( $config );
984
985        return $other;
986    }
987
988    /**
989     * @return Status
990     */
991    private function emptyResultSet() {
992        $results = $this->searchContext->getResultsType()->createEmptyResult();
993        if ( $results instanceof BaseCirrusSearchResultSet ) {
994            // TODO: Keywords are very specific to full-text search, while
995            // ResultsType and this method are much more general.
996            // While awkward, this maintains BC until we decide what to do.
997            $results = BaseCirrusSearchResultSet::emptyResultSet(
998                $this->searchContext->isSpecialKeywordUsed()
999            );
1000        }
1001        $status = Status::newGood( $results );
1002        foreach ( $this->searchContext->getWarnings() as $warning ) {
1003            $status->warning( ...$warning );
1004        }
1005        return $status;
1006    }
1007
1008    /**
1009     * Apply debug options to the elastica query
1010     * @param Query $query
1011     * @return Query
1012     */
1013    public function applyDebugOptionsToQuery( Query $query ) {
1014        return $this->searchContext->getDebugOptions()->applyDebugOptions( $query );
1015    }
1016
1017    /**
1018     * @param SearchQuery $query
1019     * @return Searcher
1020     */
1021    public function makeSearcher( SearchQuery $query ) {
1022        return new self( $this->connection, $query->getOffset(), $query->getLimit(),
1023            $query->getSearchConfig(), $query->getNamespaces(), $this->user,
1024            false, $query->getDebugOptions(), $this->namespacePrefixParser, $this->interwikiResolver,
1025            $this->titleHelper, $this->cirrusSearchHookRunner );
1026    }
1027
1028    /**
1029     * @param int $offset
1030     * @param int $limit
1031     */
1032    private function setOffsetLimit( $offset, $limit ) {
1033        $this->offset = $offset;
1034        if ( $offset + $limit > self::MAX_OFFSET_LIMIT ) {
1035            $this->limit = self::MAX_OFFSET_LIMIT - $offset;
1036        } else {
1037            $this->limit = $limit;
1038        }
1039    }
1040
1041    /**
1042     * Visible for testing
1043     * @return int[] 2 elements array
1044     */
1045    public function getOffsetLimit() {
1046        Assert::precondition( defined( 'MW_PHPUNIT_TEST' ),
1047            'getOffsetLimit must only be called for testing purposes' );
1048        return [ $this->offset, $this->limit ];
1049    }
1050
1051    /**
1052     * Build a FullTextQueryBuilder defined in the $builderSettings:
1053     * format is:
1054     * [
1055     *     'builder_factory' => callback
1056     *     'settings' => ...
1057     * ]
1058     * where callback must be function that accepts the settings array and returns a FullTextQueryBuilder
1059     *
1060     * Legacy version:
1061     * [
1062     *     'builder_class' => ClassName
1063     *     'settings' => ...
1064     * ]
1065     * where ClassName must declare a constructor with these arguments:
1066     *   SearchConfig $config, KeywordFeature[] $features, $settings
1067     *
1068     * Visible for testing only
1069     * @param array $builderSettings
1070     * @param SearchConfig $config
1071     * @param KeywordFeature[] $features
1072     * @return FullTextQueryBuilder
1073     * @throws \ReflectionException
1074     */
1075    final public static function buildFullTextBuilder(
1076        array $builderSettings,
1077        SearchConfig $config,
1078        array $features
1079    ): FullTextQueryBuilder {
1080        if ( isset( $builderSettings['builder_class'] ) ) {
1081            $objectFactorySpecs = [
1082                'class' => $builderSettings['builder_class'],
1083                'args' => [
1084                    $config,
1085                    $features,
1086                    $builderSettings['settings']
1087                ]
1088            ];
1089        } elseif ( $builderSettings['builder_factory'] ) {
1090            $objectFactorySpecs = [
1091                'factory' => $builderSettings['builder_factory'],
1092                'args' => [
1093                    $builderSettings['settings']
1094                ]
1095            ];
1096        } else {
1097            throw new \InvalidArgumentException( 'Missing builder_class or builder_factory in the builderSettings' );
1098        }
1099
1100        /** @var FullTextQueryBuilder $qb */
1101        // @phan-suppress-next-line PhanTypeInvalidCallableArraySize
1102        $qb = ObjectFactory::getObjectFromSpec( $objectFactorySpecs );
1103        if ( !( $qb instanceof FullTextQueryBuilder ) ) {
1104            throw new RuntimeException( 'Bad builder class configured.' );
1105        }
1106
1107        return $qb;
1108    }
1109}