Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
51.46% covered (warning)
51.46%
229 / 445
36.36% covered (danger)
36.36%
12 / 33
CRAP
0.00% covered (danger)
0.00%
0 / 1
Searcher
51.46% covered (warning)
51.46%
229 / 445
36.36% covered (danger)
36.36%
12 / 33
1573.29
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
5
 search
89.47% covered (warning)
89.47%
17 / 19
0.00% covered (danger)
0.00%
0 / 1
3.01
 setResultsType
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 isReturnRaw
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setSort
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 limitSearchToLocalWiki
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 nearMatchTitleSearch
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 countContentWords
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 prefixSearch
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 buildFullTextSearch
84.21% covered (warning)
84.21%
16 / 19
0.00% covered (danger)
0.00%
0 / 1
5.10
 searchTextInternal
61.22% covered (warning)
61.22%
30 / 49
0.00% covered (danger)
0.00%
0 / 1
25.43
 get
0.00% covered (danger)
0.00%
0 / 34
0.00% covered (danger)
0.00%
0 / 1
42
 findNamespace
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
6
 buildSearch
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
1
 searchOne
38.46% covered (danger)
38.46%
5 / 13
0.00% covered (danger)
0.00%
0 / 1
10.83
 searchMulti
38.89% covered (danger)
38.89%
42 / 108
0.00% covered (danger)
0.00%
0 / 1
132.46
 updateNamespacesFromQuery
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
20
 getSearchContext
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getPoolCounterType
78.57% covered (warning)
78.57%
11 / 14
0.00% covered (danger)
0.00%
0 / 1
5.25
 isAutomatedRequest
40.00% covered (danger)
40.00%
4 / 10
0.00% covered (danger)
0.00%
0 / 1
4.94
 getOverriddenConnection
60.00% covered (warning)
60.00%
3 / 5
0.00% covered (danger)
0.00%
0 / 1
3.58
 recordQueryCacheMetrics
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
6
 newLog
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 processRawReturn
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 searchArchive
100.00% covered (success)
100.00%
28 / 28
100.00% covered (success)
100.00%
1 / 1
1
 areSearchesTheSame
0.00% covered (danger)
0.00%
0 / 14
0.00% covered (danger)
0.00%
0 / 1
42
 buildInterleaveSearcher
50.00% covered (danger)
50.00%
5 / 10
0.00% covered (danger)
0.00%
0 / 1
6.00
 emptyResultSet
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 applyDebugOptionsToQuery
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 makeSearcher
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 setOffsetLimit
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 getOffsetLimit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 buildFullTextBuilder
90.48% covered (success)
90.48%
19 / 21
0.00% covered (danger)
0.00%
0 / 1
4.01
1<?php
2
3namespace CirrusSearch;
4
5use CirrusSearch\Fallbacks\FallbackRunner;
6use CirrusSearch\Fallbacks\SearcherFactory;
7use CirrusSearch\Maintenance\NullPrinter;
8use CirrusSearch\MetaStore\MetaStoreIndex;
9use CirrusSearch\Parser\BasicQueryClassifier;
10use CirrusSearch\Parser\FullTextKeywordRegistry;
11use CirrusSearch\Parser\NamespacePrefixParser;
12use CirrusSearch\Profile\SearchProfileService;
13use CirrusSearch\Query\CountContentWordsBuilder;
14use CirrusSearch\Query\FullTextQueryBuilder;
15use CirrusSearch\Query\KeywordFeature;
16use CirrusSearch\Query\NearMatchQueryBuilder;
17use CirrusSearch\Query\PrefixSearchQueryBuilder;
18use CirrusSearch\Search\BaseCirrusSearchResultSet;
19use CirrusSearch\Search\FullTextResultsType;
20use CirrusSearch\Search\MSearchRequests;
21use CirrusSearch\Search\MSearchResponses;
22use CirrusSearch\Search\ResultsType;
23use CirrusSearch\Search\SearchContext;
24use CirrusSearch\Search\SearchQuery;
25use CirrusSearch\Search\SearchRequestBuilder;
26use CirrusSearch\Search\TeamDraftInterleaver;
27use CirrusSearch\Search\TitleHelper;
28use CirrusSearch\Search\TitleResultsType;
29use Elastica\Exception\RuntimeException;
30use Elastica\Multi\Search as MultiSearch;
31use Elastica\Query;
32use Elastica\Query\BoolQuery;
33use Elastica\Query\MultiMatch;
34use Elastica\Search;
35use MediaWiki\Context\RequestContext;
36use MediaWiki\Exception\MWException;
37use MediaWiki\Logger\LoggerFactory;
38use MediaWiki\MediaWikiServices;
39use MediaWiki\Request\WebRequest;
40use MediaWiki\Status\Status;
41use MediaWiki\Title\Title;
42use MediaWiki\User\User;
43use MediaWiki\WikiMap\WikiMap;
44use Wikimedia\Assert\Assert;
45use Wikimedia\ObjectFactory\ObjectFactory;
46use Wikimedia\Stats\StatsFactory;
47
48/**
49 * Performs searches using Elasticsearch.  Note that each instance of this class
50 * is single use only.
51 *
52 * This program is free software; you can redistribute it and/or modify
53 * it under the terms of the GNU General Public License as published by
54 * the Free Software Foundation; either version 2 of the License, or
55 * (at your option) any later version.
56 *
57 * This program is distributed in the hope that it will be useful,
58 * but WITHOUT ANY WARRANTY; without even the implied warranty of
59 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
60 * GNU General Public License for more details.
61 *
62 * You should have received a copy of the GNU General Public License along
63 * with this program; if not, write to the Free Software Foundation, Inc.,
64 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
65 * http://www.gnu.org/copyleft/gpl.html
66 */
67class Searcher extends ElasticsearchIntermediary implements SearcherFactory {
68    public const SUGGESTION_HIGHLIGHT_PRE = '<em>';
69    public const SUGGESTION_HIGHLIGHT_POST = '</em>';
70    public const HIGHLIGHT_PRE_MARKER = ''; // \uE000. Can't be a unicode literal until php7
71    public const HIGHLIGHT_PRE = '<span class="searchmatch">';
72    public const HIGHLIGHT_POST_MARKER = ''; // \uE001
73    public const HIGHLIGHT_POST = '</span>';
74
75    /**
76     * Maximum offset + limit depth allowed. As in the deepest possible result
77     * to return. Too deep will cause very slow queries. 10,000 feels plenty
78     * deep. This should be <= index.max_result_window in elasticsearch.
79     */
80    private const MAX_OFFSET_LIMIT = 10000;
81
82    /**
83     * Identifies the main search in MSearchRequests/MSearchResponses
84     */
85    public const MAINSEARCH_MSEARCH_KEY = '__main__';
86
87    /**
88     * Identifies the "tested" search request in MSearchRequests/MSearchResponses
89     */
90    private const INTERLEAVED_MSEARCH_KEY = '__interleaved__';
91
92    /**
93     * @var int search offset
94     */
95    protected $offset;
96
97    /**
98     * @var int maximum number of results
99     */
100    protected $limit;
101
102    /**
103     * @var string sort type
104     */
105    private $sort = 'relevance';
106
107    /**
108     * @var string index base name to use
109     */
110    protected $indexBaseName;
111
112    /**
113     * Search environment configuration
114     * @var SearchConfig
115     */
116    protected $config;
117
118    /**
119     * @var SearchContext
120     */
121    protected $searchContext;
122
123    /**
124     * Indexing type we'll be using.
125     * @var string|\Elastica\Index
126     */
127    private $index;
128
129    /**
130     * @var NamespacePrefixParser|null
131     */
132    private $namespacePrefixParser;
133    /**
134     * @var InterwikiResolver
135     */
136    protected $interwikiResolver;
137
138    /** @var TitleHelper */
139    protected $titleHelper;
140    /**
141     * @var CirrusSearchHookRunner
142     */
143    protected $cirrusSearchHookRunner;
144
145    /**
146     * @param Connection $conn
147     * @param int $offset Offset the results by this much
148     * @param int $limit Limit the results to this many
149     * @param SearchConfig $config Configuration settings
150     * @param int[]|null $namespaces Array of namespace numbers to search or null to search all namespaces.
151     * @param User|null $user user for which this search is being performed.  Attached to slow request logs.
152     * @param string|bool $index Base name for index to search from, defaults to $wgCirrusSearchIndexBaseName
153     * @param CirrusDebugOptions|null $options the debugging options to use or null to use defaults
154     * @param NamespacePrefixParser|null $namespacePrefixParser
155     * @param InterwikiResolver|null $interwikiResolver
156     * @param TitleHelper|null $titleHelper
157     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
158     * @see CirrusDebugOptions::defaultOptions()
159     */
160    public function __construct(
161        Connection $conn, $offset,
162        $limit,
163        SearchConfig $config,
164        ?array $namespaces = null,
165        ?User $user = null,
166        $index = false,
167        ?CirrusDebugOptions $options = null,
168        ?NamespacePrefixParser $namespacePrefixParser = null,
169        ?InterwikiResolver $interwikiResolver = null,
170        ?TitleHelper $titleHelper = null,
171        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
172    ) {
173        parent::__construct(
174            $conn,
175            $user,
176            $config->get( 'CirrusSearchSlowSearch' ),
177            $config->get( 'CirrusSearchExtraBackendLatency' )
178        );
179        $this->config = $config;
180        $this->setOffsetLimit( $offset, $limit );
181        $this->indexBaseName = $index ?: $config->get( SearchConfig::INDEX_BASE_NAME );
182        // TODO: Make these params mandatory once WBCS stops extending this class
183        $this->namespacePrefixParser = $namespacePrefixParser;
184        $this->interwikiResolver = $interwikiResolver ?: MediaWikiServices::getInstance()->getService( InterwikiResolver::SERVICE );
185        $this->titleHelper = $titleHelper ?: new TitleHelper( WikiMap::getCurrentWikiId(), $this->interwikiResolver );
186        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
187            MediaWikiServices::getInstance()->getHookContainer() );
188        $this->searchContext = new SearchContext( $this->config, $namespaces, $options, null, null, $this->cirrusSearchHookRunner );
189    }
190
191    /**
192     * Unified search public entry-point.
193     *
194     * NOTE: only fulltext search supported for now.
195     * @param SearchQuery $query
196     * @return Status
197     */
198    public function search( SearchQuery $query ) {
199        if ( $query->getDebugOptions()->isCirrusDumpQueryAST() ) {
200            return Status::newGood( [ 'ast' => $query->getParsedQuery()->toArray() ] );
201        }
202        // TODO: properly pass the profile context name and its params once we have a dispatch service.
203        $this->searchContext = SearchContext::fromSearchQuery( $query, FallbackRunner::create( $query, $this->interwikiResolver ),
204            $this->cirrusSearchHookRunner );
205        $this->setOffsetLimit( $query->getOffset(), $query->getLimit() );
206        $this->config = $query->getSearchConfig();
207        $this->sort = $query->getSort();
208
209        if ( $query->getSearchEngineEntryPoint() === SearchQuery::SEARCH_TEXT ) {
210            $this->searchContext->setResultsType(
211                new FullTextResultsType(
212                    $this->searchContext->getFetchPhaseBuilder(),
213                    $query->getParsedQuery()->isQueryOfClass( BasicQueryClassifier::COMPLEX_QUERY ),
214                    $this->titleHelper,
215                    $query->getExtraFieldsToExtract(),
216                    $this->searchContext->getConfig()->getElement( 'CirrusSearchDeduplicateInMemory' ) === true
217                )
218            );
219            return $this->searchTextInternal( $query->getParsedQuery()->getQueryWithoutNsHeader() );
220        } else {
221            throw new \RuntimeException( 'Only ' . SearchQuery::SEARCH_TEXT . ' is supported for now' );
222        }
223    }
224
225    /**
226     * @param ResultsType $resultsType results type to return
227     */
228    public function setResultsType( $resultsType ) {
229        $this->searchContext->setResultsType( $resultsType );
230    }
231
232    /**
233     * Is this searcher used to return debugging info?
234     * @return bool true if the search will return raw output
235     */
236    public function isReturnRaw() {
237        return $this->searchContext->getDebugOptions()->isReturnRaw();
238    }
239
240    /**
241     * Set the type of sort to perform.  Must be 'relevance', 'title_asc', 'title_desc'.
242     * @param string $sort sort type
243     */
244    public function setSort( $sort ) {
245        $this->sort = $sort;
246    }
247
248    /**
249     * Should this search limit results to the local wiki?  If not called the default is false.
250     * @param bool $limitSearchToLocalWiki should the results be limited?
251     */
252    public function limitSearchToLocalWiki( $limitSearchToLocalWiki ) {
253        $this->searchContext->setLimitSearchToLocalWiki( $limitSearchToLocalWiki );
254    }
255
256    /**
257     * Perform a "near match" title search which is pretty much a prefix match without the prefixes.
258     * @param string $term text by which to search
259     * @return Status status containing results defined by resultsType on success
260     */
261    public function nearMatchTitleSearch( $term ) {
262        ( new NearMatchQueryBuilder() )->build( $this->searchContext, $term );
263        return $this->searchOne();
264    }
265
266    /**
267     * Perform a sum over the number of words in the content index
268     * @return Status status containing a single integer
269     */
270    public function countContentWords() {
271        ( new CountContentWordsBuilder() )->build( $this->searchContext );
272        $this->limit = 1;
273        return $this->searchOne();
274    }
275
276    /**
277     * Perform a prefix search.
278     * @param string $term text by which to search
279     * @param string[] $variants variants to search for
280     * @return Status status containing results defined by resultsType on success
281     */
282    public function prefixSearch( $term, $variants = [] ) {
283        ( new PrefixSearchQueryBuilder() )->build( $this->searchContext, $term, $variants );
284        return $this->searchOne();
285    }
286
287    /**
288     * Build full text search for articles with provided term. All the
289     * state is applied to $this->searchContext. The returned query
290     * builder can be used to build a degraded query if necessary.
291     *
292     * @param string $term term to search
293     * @return FullTextQueryBuilder
294     */
295    protected function buildFullTextSearch( $term ) {
296        // Convert the unicode character 'ideographic whitespace' into standard
297        // whitespace. Cirrussearch treats them both as normal whitespace, but
298        // the preceding isn't appropriately trimmed.
299        // No searching for nothing! That takes forever!
300        $term = trim( str_replace( "\xE3\x80\x80", " ", $term ) );
301        if ( $term === '' ) {
302            $this->searchContext->setResultsPossible( false );
303        }
304
305        $builderSettings = $this->config->getProfileService()
306            ->loadProfileByName( SearchProfileService::FT_QUERY_BUILDER,
307                $this->searchContext->getFulltextQueryBuilderProfile() );
308        $features = ( new FullTextKeywordRegistry( $this->config ) )->getKeywords();
309        $qb = self::buildFullTextBuilder( $builderSettings, $this->config, $features );
310
311        $qb->build( $this->searchContext, $term );
312
313        if ( $this->searchContext->getSearchQuery() !== null ) {
314            $degradeOnParseWarnings = [
315                // && test, test AND && test
316                'cirrussearch-parse-error-unexpected-token',
317                // test AND
318                'cirrussearch-parse-error-unexpected-end'
319            ];
320            // Quick hack to avoid sending bad queries to the backend
321            foreach ( $this->searchContext->getSearchQuery()->getParsedQuery()->getParseWarnings() as $warning ) {
322                if ( in_array( $warning->getMessage(), $degradeOnParseWarnings ) ) {
323                    $qb->buildDegraded( $this->searchContext );
324                    return $qb;
325                }
326            }
327        }
328
329        return $qb;
330    }
331
332    /**
333     * @param string $term
334     * @return Status
335     */
336    private function searchTextInternal( $term ) {
337        // Searcher needs to be cloned before any actual query building is done.
338        $interleaveSearcher = $this->buildInterleaveSearcher();
339
340        $qb = $this->buildFullTextSearch( $term );
341        $mainSearch = $this->buildSearch();
342        $searches = MSearchRequests::build( self::MAINSEARCH_MSEARCH_KEY, $mainSearch );
343        $description = "{$this->searchContext->getSearchType()} search for '{$this->searchContext->getOriginalSearchTerm()}'";
344
345        if ( !$this->searchContext->areResultsPossible() ) {
346            if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
347                // return the empty array to suggest that no query will be run
348                return Status::newGood( [] );
349            }
350            $status = $this->emptyResultSet();
351            if ( $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
352                return Status::newGood(
353                    ( new MSearchResponses( [ $status->getValue() ], [] ) )->dumpResults( $description )
354                );
355            }
356            return $status;
357        }
358
359        if ( $interleaveSearcher !== null ) {
360            $interleaveSearcher->buildFullTextSearch( $term );
361            $interleaveSearch = $interleaveSearcher->buildSearch();
362            if ( $this->areSearchesTheSame( $mainSearch, $interleaveSearch ) ) {
363                $interleaveSearcher = null;
364            } else {
365                $searches->addRequest( self::INTERLEAVED_MSEARCH_KEY, $interleaveSearch );
366            }
367        }
368
369        $fallbackRunner = $this->searchContext->getFallbackRunner();
370        $fallbackRunner->attachSearchRequests( $searches, $this->connection->getClient() );
371
372        if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
373            return $searches->dumpQuery( $description );
374        }
375
376        $responses = $this->searchMulti( $searches );
377        if ( $responses->hasFailure() ) {
378            $status = $responses->getFailure();
379            if ( ElasticaErrorHandler::isParseError( $status ) ) {
380                // Rebuild the search context because we need a fresh fetchPhaseBuilder
381                $this->searchContext = $this->searchContext->withConfig( $this->config );
382                if ( $qb->buildDegraded( $this->searchContext ) ) {
383                    // If that doesn't work we're out of luck but it should.
384                    // There no guarantee it'll work properly with the syntax
385                    // we've built above but it'll do _something_ and we'll
386                    // still work on fixing all the parse errors that come in.
387                    $status = $this->searchOne();
388                }
389            }
390            return $status;
391        }
392
393        if ( $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
394            return $responses->dumpResults( $description );
395        }
396
397        $rType = $this->getSearchContext()->getResultsType();
398        $mainSet = $responses->transformAsResultSet( $rType, self::MAINSEARCH_MSEARCH_KEY );
399        if ( $interleaveSearcher !== null ) {
400            $interleaver = new TeamDraftInterleaver( $this->searchContext->getOriginalSearchTerm() );
401            $testedSet = $responses->transformAsResultSet( $rType, self::INTERLEAVED_MSEARCH_KEY );
402            $response = $interleaver->interleave( $mainSet, $testedSet, $this->limit );
403        } else {
404            $response = $mainSet;
405        }
406
407        $status = Status::newGood();
408        if ( $this->namespacePrefixParser !== null ) {
409            $status = Status::newGood( $fallbackRunner->run( $this, $response, $responses,
410                $this->namespacePrefixParser, $this->cirrusSearchHookRunner ) );
411            $this->appendMetrics( $fallbackRunner );
412        }
413
414        foreach ( $this->searchContext->getWarnings() as $warning ) {
415            $status->warning( ...$warning );
416        }
417        return $status;
418    }
419
420    /**
421     * Get the page with $docId.  Note that the result is a status containing _all_ pages found.
422     * It is possible to find more then one page if the page is in multiple indexes.
423     * @param string[] $docIds array of document ids
424     * @param string[]|bool $sourceFiltering source filtering to apply
425     * @param bool $usePoolCounter false to disable the pool counter
426     * @return Status containing pages found, containing an empty array if not found,
427     *    or an error if there was an error
428     */
429    public function get( array $docIds, $sourceFiltering, $usePoolCounter = true ) {
430        $connection = $this->getOverriddenConnection();
431        $indexSuffix = $connection->pickIndexSuffixForNamespaces(
432            $this->searchContext->getNamespaces()
433        );
434
435        // The worst case would be to have all ids duplicated in all available indices.
436        // We set the limit accordingly
437        $size = count( $connection->getAllIndexSuffixesForNamespaces(
438            $this->searchContext->getNamespaces()
439        ) );
440        $size *= count( $docIds );
441
442        $work = function () use ( $docIds, $sourceFiltering, $indexSuffix, $size, $connection ) {
443            try {
444                $this->startNewLog( 'get of {indexSuffix}.{docIds}', 'get', [
445                    'indexSuffix' => $indexSuffix,
446                    'docIds' => $docIds,
447                ] );
448                // Shard timeout not supported on get requests so we just use the client side timeout
449                $connection->setTimeout( $this->getClientTimeout( 'get' ) );
450                // We use a search query instead of _get/_mget, these methods are
451                // theorically well suited for this kind of job but they are not
452                // supported on aliases with multiple indices (content/general)
453                $index = $connection->getIndex( $this->indexBaseName, $indexSuffix );
454                $query = new \Elastica\Query( new \Elastica\Query\Ids( $docIds ) );
455                if ( is_array( $sourceFiltering ) ) {
456                    // The title is a required field in the ApiTrait
457                    if ( !in_array( "title", $sourceFiltering ) ) {
458                        array_push( $sourceFiltering, "title" );
459                    }
460                    $query->setParam( '_source', $sourceFiltering );
461                }
462                $query->addParam( 'stats', 'get' );
463                // We ignore limits provided to the searcher
464                // otherwize we could return fewer results than
465                // the ids requested.
466                $query->setFrom( 0 );
467                $query->setSize( $size );
468                $resultSet = $index->search( $query, [ 'search_type' => 'query_then_fetch' ] );
469                self::throwIfNotOk( $connection, $resultSet->getResponse() );
470                return $this->success( $resultSet->getResults(), $connection );
471            } catch ( \Elastica\Exception\NotFoundException $e ) {
472                // NotFoundException just means the field didn't exist.
473                // It is up to the caller to decide if that is an error.
474                return $this->success( [], $connection );
475            } catch ( \Elastica\Exception\ExceptionInterface $e ) {
476                return $this->failure( $e, $connection );
477            }
478        };
479
480        if ( $usePoolCounter ) {
481            return Util::doPoolCounterWork( $this->getPoolCounterType(), $this->user, $work );
482        } else {
483            return $work();
484        }
485    }
486
487    /**
488     * @param string $name
489     * @return Status
490     */
491    private function findNamespace( $name ) {
492        return Util::doPoolCounterWork(
493            'CirrusSearch-NamespaceLookup',
494            $this->user,
495            function () use ( $name ) {
496                try {
497                    $this->startNewLog( 'lookup namespace for {namespaceName}', 'namespace', [
498                        'namespaceName' => $name,
499                        'query' => $name,
500                    ] );
501                    $connection = $this->getOverriddenConnection();
502                    $connection->setTimeout( $this->getClientTimeout( 'namespace' ) );
503
504                    // A bit awkward, but accepted as this is the backup
505                    // implementation of namespace lookup. Deployments should
506                    // prefer to install php-intl and use utr30.
507                    $store = ( new MetaStoreIndex( $connection, new NullPrinter(), $this->config ) )
508                        ->namespaceStore();
509                    $resultSet = $store->find( $name, [
510                        'timeout' => $this->getTimeout( 'namespace' ),
511                    ] );
512                    return $this->success( $resultSet->getResults(), $connection );
513                } catch ( \Elastica\Exception\ExceptionInterface $e ) {
514                    return $this->failure( $e, $connection );
515                }
516            } );
517    }
518
519    /**
520     * @return \Elastica\Search
521     */
522    protected function buildSearch() {
523        $builder = new SearchRequestBuilder(
524            $this->searchContext, $this->getOverriddenConnection(), $this->indexBaseName );
525        return $builder->setLimit( $this->limit )
526            ->setOffset( $this->offset )
527            ->setIndex( $this->index )
528            ->setSort( $this->sort )
529            ->setTimeout( $this->getTimeout( $this->searchContext->getSearchType() ) )
530            ->build();
531    }
532
533    /**
534     * Perform a single-query search.
535     * @return Status
536     */
537    protected function searchOne() {
538        $search = $this->buildSearch();
539        $description = "{$this->searchContext->getSearchType()} search for '{$this->searchContext->getOriginalSearchTerm()}'";
540        $msearch = MSearchRequests::build( self::MAINSEARCH_MSEARCH_KEY, $search );
541        if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
542            return $msearch->dumpQuery( $description );
543        }
544        if ( !$this->searchContext->areResultsPossible() ) {
545            return $this->emptyResultSet();
546        }
547
548        $mresults = $this->searchMulti( $msearch );
549
550        if ( $mresults->hasFailure() ) {
551            return $mresults->getFailure();
552        }
553
554        if ( $this->searchContext->getDebugOptions()->isReturnRaw() ) {
555            return $mresults->dumpResults( $description );
556        }
557        return $mresults->transformAndGetSingle( $this->searchContext->getResultsType(), self::MAINSEARCH_MSEARCH_KEY );
558    }
559
560    /**
561     * Powers full-text-like searches including prefix search.
562     *
563     * @param MSearchRequests $msearches
564     * @return MSearchResponses search responses
565     */
566    protected function searchMulti( MSearchRequests $msearches ) {
567        $searches = $msearches->getRequests();
568        $contextResultsType = $this->searchContext->getResultsType();
569        $cirrusDebugOptions = $this->searchContext->getDebugOptions();
570        Assert::precondition( !$cirrusDebugOptions->isCirrusDumpQuery(), 'Must not reach this method when dumping the query' );
571
572        // TODO: should this be moved upper in the stack?
573        if ( $this->limit <= 0 ) {
574            return $msearches->failure( Status::newFatal( 'cirrussearch-offset-too-large',
575                self::MAX_OFFSET_LIMIT, $this->offset ) );
576        }
577
578        $connection = $this->getOverriddenConnection();
579        $log = new MultiSearchRequestLog(
580            $connection->getClient(),
581            "{queryType} search for '{query}'",
582            $this->searchContext->getSearchType(),
583            [
584                'query' => $this->searchContext->getOriginalSearchTerm(),
585                'limit' => $this->limit ?: null,
586                // Used syntax
587                'syntax' => $this->searchContext->getSyntaxUsed(),
588            ],
589            $this->searchContext->getNamespaces() ?: []
590        );
591
592        // Similar to indexing support only the bulk code path, rather than
593        // single and bulk. The extra overhead should be minimal, and the
594        // reduced complexity is welcomed.
595        $search = new MultiSearch( $connection->getClient() );
596        $search->addSearches( $searches );
597
598        $connection->setTimeout( $this->getClientTimeout( $this->searchContext->getSearchType() ) );
599
600        if ( $this->config->get( 'CirrusSearchMoreAccurateScoringMode' ) ) {
601            $search->setSearchType( \Elastica\Search::OPTION_SEARCH_TYPE_DFS_QUERY_THEN_FETCH );
602        }
603
604        // Perform the search
605        $work = function () use ( $search, $log, $connection ) {
606            return Util::doPoolCounterWork(
607                $this->getPoolCounterType(),
608                $this->user,
609                function () use ( $search, $log, $connection ) {
610                    // @todo only reports the first error, also turns
611                    // a partial (single search) error into a complete
612                    // failure across the board. Should be addressed
613                    // at some point.
614                    return $this->runMSearch( $search, $log, $connection );
615                },
616                $this->searchContext->isSyntaxUsed( 'regex' ) ?
617                    'cirrussearch-regex-too-busy-error' : null
618            );
619        };
620
621        // Wrap with caching if needed, but don't cache debugging queries
622        $skipCache = $cirrusDebugOptions->mustNeverBeCached();
623        if ( $this->searchContext->getCacheTtl() > 0 && !$skipCache ) {
624            $work = function () use ( $work, $searches, $log, $contextResultsType ) {
625                $services = MediaWikiServices::getInstance();
626                $requestStats = Util::getStatsFactory();
627                $cache = $services->getMainWANObjectCache();
628                $keyParts = [];
629                foreach ( $searches as $key => $search ) {
630                    $keyParts[] = $search->getPath() .
631                        serialize( $search->getOptions() ) .
632                        serialize( $search->getQuery()->toArray() ) .
633                        ( $contextResultsType !== null ? get_class( $contextResultsType ) : "NONE" );
634                }
635                $key = $cache->makeKey( 'cirrussearch', 'search', 'v2', md5(
636                    implode( '|', $keyParts )
637                ) );
638                $cacheResult = $cache->get( $key );
639                if ( $cacheResult ) {
640                    [ $logVariables, $multiResultSet ] = $cacheResult;
641                    $this->recordQueryCacheMetrics( $requestStats, "hit" );
642                    $log->setCachedResult( $logVariables );
643                    $this->successViaCache( $log );
644
645                    if ( $multiResultSet->isOK() ) {
646                        /** @var \Elastica\Multi\ResultSet $cachedMResultSet */
647                        $cachedMResultSet = $multiResultSet->getValue();
648                        if ( count( $cachedMResultSet->getResultSets() ) !== count( $searches ) ) {
649                            LoggerFactory::getInstance( 'CirrusSearch' )
650                                ->warning( 'Ignoring a cached Multi/ResultSet wanted {nb_queries} response(s) but received {nb_responses}',
651                                    [
652                                        'nb_queries' => count( $searches ),
653                                        'nb_responses' => count( $cachedMResultSet->getResultSets() )
654                                    ] );
655                            $this->recordQueryCacheMetrics( $requestStats, "incoherent" );
656                        } else {
657                            return $multiResultSet;
658                        }
659                    } else {
660                        LoggerFactory::getInstance( 'CirrusSearch' )
661                            ->warning( 'Cached a Status value that is not OK' );
662                        $this->recordQueryCacheMetrics( $requestStats, "nok" );
663                    }
664                } else {
665                    $this->recordQueryCacheMetrics( $requestStats, "miss" );
666                }
667
668                $multiResultSet = $work();
669
670                if ( $multiResultSet->isOK() ) {
671                    $isPartialResult = false;
672                    foreach ( $multiResultSet->getValue()->getResultSets() as $resultSet ) {
673                        $responseData = $resultSet->getResponse()->getData();
674                        if ( isset( $responseData['timed_out'] ) && $responseData['timed_out'] ) {
675                            $isPartialResult = true;
676                            break;
677                        }
678                    }
679                    if ( !$isPartialResult ) {
680                        $this->recordQueryCacheMetrics( $requestStats, "set" );
681                        $cache->set(
682                            $key,
683                            [ $log->getLogVariables(), $multiResultSet ],
684                            $this->searchContext->getCacheTtl()
685                        );
686                    }
687                }
688
689                return $multiResultSet;
690            };
691        }
692
693        $status = $work();
694
695        // @todo Does this need anything special for multi-search changes?
696        if ( !$status->isOK() ) {
697            return $msearches->failure( $status );
698        }
699
700        /** @var \Elastica\Multi\ResultSet $response */
701        $response = $status->getValue();
702        if ( count( $response->getResultSets() ) !== count( $msearches->getRequests() ) ) {
703            // Temp hack to investigate T231023 (use php serialize just in case it has some invalid
704            // UTF8 sequences that would prevent this message from being sent to logstash
705            LoggerFactory::getInstance( 'CirrusSearch' )
706                ->warning( "Incoherent response received (#searches != #responses) for {query}: {response}",
707                    [ 'query' => $this->searchContext->getOriginalSearchTerm(), 'response' => serialize( $response->getResponse() ) ] );
708            return $msearches->failure( Status::newFatal( 'cirrussearch-backend-error' ) );
709        }
710        $mreponses = $msearches->toMSearchResponses( $response->getResultSets() );
711        if ( $mreponses->hasTimeout() ) {
712            LoggerFactory::getInstance( 'CirrusSearch' )->warning(
713                $log->getDescription() . " timed out and only returned partial results!",
714                $log->getLogVariables()
715            );
716            $this->searchContext->addWarning( $this->searchContext->isSyntaxUsed( 'regex' )
717                ? 'cirrussearch-regex-timed-out'
718                : 'cirrussearch-timed-out'
719            );
720        }
721        return $mreponses;
722    }
723
724    /**
725     * Attempt to suck a leading namespace followed by a colon from the query string.
726     * Reaches out to Elasticsearch to perform normalized lookup against the namespaces.
727     * Should be fast but for the network hop.
728     *
729     * @param string &$query
730     */
731    public function updateNamespacesFromQuery( &$query ) {
732        $colon = strpos( $query, ':' );
733        if ( $colon === false ) {
734            return;
735        }
736        $namespaceName = substr( $query, 0, $colon );
737        $status = $this->findNamespace( $namespaceName );
738        // Failure case is already logged so just handle success case
739        if ( !$status->isOK() ) {
740            return;
741        }
742        $foundNamespace = $status->getValue();
743        if ( !$foundNamespace ) {
744            return;
745        }
746        $foundNamespace = $foundNamespace[ 0 ];
747        $query = substr( $query, $colon + 1 );
748        $this->searchContext->setNamespaces( [ $foundNamespace->namespace_id ] );
749    }
750
751    /**
752     * @return SearchContext
753     */
754    public function getSearchContext() {
755        return $this->searchContext;
756    }
757
758    private function getPoolCounterType(): string {
759        // Default pool counter for all search requests. Note that not all
760        // possible requests go through Searcher, so this isn't globally
761        // definitive.
762        $pool = 'CirrusSearch-Search';
763        // Pool counter overrides based on query syntax. Goal is to
764        // separate expensive or high-volume traffic into dedicated
765        // pools with specific limits. Prefix is only high volume
766        // when completion is disabled.
767        $poolCounterTypes = [
768            'deepcat' => 'CirrusSearch-ExpensiveFullText',
769            'regex' => 'CirrusSearch-ExpensiveFullText',
770            'prefix' => 'CirrusSearch-Prefix',
771            'more_like' => 'CirrusSearch-MoreLike',
772        ];
773        foreach ( $poolCounterTypes as $type => $counter ) {
774            if ( $this->searchContext->isSyntaxUsed( $type ) ) {
775                $pool = $counter;
776                break;
777            }
778        }
779        // Put external automated requests into their own bucket The main idea
780        // here is to allow automated access, but prevent that automation from
781        // capping out the pools used by interactive queries.
782        // It's not clear when the automation bucket should not override other
783        // bucketing decisions, for now override everything except Regex since
784        // those can be very expensive and usually use a small pool. If both
785        // the automation and regex pools filled with regexes it would be
786        // significantly more load than expected.
787        if ( $pool !== 'CirrusSearch-ExpensiveFullText' && $this->isAutomatedRequest() ) {
788            $pool = 'CirrusSearch-Automated';
789        }
790        return $pool;
791    }
792
793    private function isAutomatedRequest(): bool {
794        $req = RequestContext::getMain()->getRequest();
795        try {
796            $ip = $req->getIP();
797        } catch ( MWException $e ) {
798            // No IP, typically this means a CLI invocation. We are attempting
799            // to segregate external automation, internal automation has its
800            // own ability to control configuration and shouldn't be flagged
801            if ( MW_ENTRY_POINT === 'cli' ) {
802                return false;
803            }
804            // When can we get here? Is this ever run?
805            LoggerFactory::getInstance( 'CirrusSearch' )->info(
806                'No IP available during automated request check' );
807            return false;
808        }
809        return Util::looksLikeAutomation(
810            $this->config, $ip, $req->getAllHeaders() );
811    }
812
813    /**
814     * Some queries, like more like this, are quite expensive and can cause
815     * latency spikes. This allows redirecting queries using particular
816     * features to specific clusters.
817     * @return Connection
818     */
819    private function getOverriddenConnection() {
820        $overrides = $this->config->get( 'CirrusSearchClusterOverrides' );
821        foreach ( $overrides as $feature => $cluster ) {
822            if ( $this->searchContext->isSyntaxUsed( $feature ) ) {
823                return Connection::getPool( $this->config, $cluster );
824            }
825        }
826        return $this->connection;
827    }
828
829    protected function recordQueryCacheMetrics( StatsFactory $requestStats, string $cacheStatus, ?string $type = null ): void {
830        $type = $type ?: $this->getSearchContext()->getSearchType();
831        $requestStats->getCounter( "query_cache_total" )
832            ->setLabel( "type", $type )
833            ->setLabel( "status", $cacheStatus )
834            ->increment();
835    }
836
837    /**
838     * @param string $description
839     * @param string $queryType
840     * @param string[] $extra
841     * @return SearchRequestLog
842     */
843    protected function newLog( $description, $queryType, array $extra = [] ) {
844        return new SearchRequestLog(
845            $this->getOverriddenConnection()->getClient(),
846            $description,
847            $queryType,
848            $extra
849        );
850    }
851
852    /**
853     * If we're supposed to create raw result, create and return it,
854     * or output it and finish.
855     * @param mixed $result Search result data
856     * @param WebRequest $request Request context
857     * @return string The new raw result.
858     */
859    public function processRawReturn( $result, WebRequest $request ) {
860        return Util::processSearchRawReturn( $result, $request,
861            $this->searchContext->getDebugOptions() );
862    }
863
864    /**
865     * Search titles in archive
866     * @param string $term
867     * @return Status<Title[]>
868     */
869    public function searchArchive( $term ) {
870        $this->searchContext->setOriginalSearchTerm( $term );
871        $term = $this->searchContext->escaper()->fixupWholeQueryString( $term );
872        $this->setResultsType( new TitleResultsType() );
873
874        // This does not support cross-cluster search, but there is also no use case
875        // for cross-wiki archive search.
876        $this->index = $this->getOverriddenConnection()->getArchiveIndex( $this->indexBaseName );
877
878        // Setup the search query
879        $query = new BoolQuery();
880
881        $multi = new MultiMatch();
882        $multi->setType( 'best_fields' );
883        $multi->setTieBreaker( 0 );
884        $multi->setQuery( $term );
885        $multi->setFields( [
886            'title.near_match^100',
887            'title.near_match_asciifolding^75',
888            'title.plain^50',
889            'title^25'
890        ] );
891        $multi->setOperator( 'AND' );
892
893        $fuzzy = new \Elastica\Query\MatchQuery();
894        $fuzzy->setFieldQuery( 'title.plain', $term );
895        $fuzzy->setFieldFuzziness( 'title.plain', 'AUTO' );
896        $fuzzy->setFieldOperator( 'title.plain', 'AND' );
897
898        $query->addShould( $multi );
899        $query->addShould( $fuzzy );
900        $query->setMinimumShouldMatch( 1 );
901
902        $this->sort = 'just_match';
903
904        $this->searchContext->setMainQuery( $query );
905        $this->searchContext->addSyntaxUsed( 'archive' );
906        $this->searchContext->setRescoreProfile( 'empty' );
907
908        return $this->searchOne();
909    }
910
911    /**
912     * Tests if two search objects are equivalent
913     *
914     * @param Search $a
915     * @param Search $b
916     * @return bool
917     */
918    private function areSearchesTheSame( Search $a, Search $b ) {
919        // same object.
920        if ( $a === $b ) {
921            return true;
922        }
923
924        // Check values not included in toArray()
925        if ( $a->getPath() !== $b->getPath()
926            || $a->getOptions() != $b->getOptions()
927        ) {
928            return false;
929        }
930
931        $aArray = $a->getQuery()->toArray();
932        $bArray = $b->getQuery()->toArray();
933
934        // normalize the 'now' value which contains a timestamp that
935        // may vary.
936        $fixNow = static function ( &$value, $key ) {
937            if ( $key === 'now' && is_int( $value ) ) {
938                $value = 12345678;
939            }
940        };
941        array_walk_recursive( $aArray, $fixNow );
942        array_walk_recursive( $bArray, $fixNow );
943
944        // Simplest form, requires both arrays to have exact same ordering,
945        // types, keys, etc. We could try much harder to remove edge cases,
946        // but they probably don't matter too much. The main thing we are
947        // looking for is if configuration used for interleaved search didn't
948        // have an effect query building. If we get it wrong in some rare
949        // cases it should have minimal effects on the interleaved search test.
950        return $aArray === $bArray;
951    }
952
953    private function buildInterleaveSearcher(): ?self {
954        // If we aren't on the first page, or the user has specified
955        // some custom magic query options (override rescore profile,
956        // etc) then don't interleave.
957        if ( $this->offset > 0 || $this->searchContext->isDirty() ) {
958            return null;
959        }
960
961        // Is interleaving configured?
962        $overrides = $this->config->get( 'CirrusSearchInterleaveConfig' );
963        if ( $overrides === null ) {
964            return null;
965        }
966
967        $config = new HashSearchConfig( $overrides, [ HashSearchConfig::FLAG_INHERIT ] );
968        $other = clone $this;
969        $other->config = $config;
970        $other->searchContext = $other->searchContext->withConfig( $config );
971
972        return $other;
973    }
974
975    /**
976     * @return Status
977     */
978    private function emptyResultSet() {
979        $results = $this->searchContext->getResultsType()->createEmptyResult();
980        if ( $results instanceof BaseCirrusSearchResultSet ) {
981            // TODO: Keywords are very specific to full-text search, while
982            // ResultsType and this method are much more general.
983            // While awkward, this maintains BC until we decide what to do.
984            $results = BaseCirrusSearchResultSet::emptyResultSet(
985                $this->searchContext->isSpecialKeywordUsed()
986            );
987        }
988        $status = Status::newGood( $results );
989        foreach ( $this->searchContext->getWarnings() as $warning ) {
990            $status->warning( ...$warning );
991        }
992        return $status;
993    }
994
995    /**
996     * Apply debug options to the elastica query
997     * @param Query $query
998     * @return Query
999     */
1000    public function applyDebugOptionsToQuery( Query $query ) {
1001        return $this->searchContext->getDebugOptions()->applyDebugOptions( $query );
1002    }
1003
1004    public function makeSearcher( SearchQuery $query ): self {
1005        return new self( $this->connection, $query->getOffset(), $query->getLimit(),
1006            $query->getSearchConfig(), $query->getNamespaces(), $this->user,
1007            false, $query->getDebugOptions(), $this->namespacePrefixParser, $this->interwikiResolver,
1008            $this->titleHelper, $this->cirrusSearchHookRunner );
1009    }
1010
1011    /**
1012     * @param int $offset
1013     * @param int $limit
1014     */
1015    private function setOffsetLimit( $offset, $limit ) {
1016        $this->offset = $offset;
1017        if ( $offset + $limit > self::MAX_OFFSET_LIMIT ) {
1018            $this->limit = self::MAX_OFFSET_LIMIT - $offset;
1019        } else {
1020            $this->limit = $limit;
1021        }
1022    }
1023
1024    /**
1025     * Visible for testing
1026     * @return int[] 2 elements array
1027     */
1028    public function getOffsetLimit() {
1029        Assert::precondition( defined( 'MW_PHPUNIT_TEST' ),
1030            'getOffsetLimit must only be called for testing purposes' );
1031        return [ $this->offset, $this->limit ];
1032    }
1033
1034    /**
1035     * Build a FullTextQueryBuilder defined in the $builderSettings:
1036     * format is:
1037     * [
1038     *     'builder_factory' => callback
1039     *     'settings' => ...
1040     * ]
1041     * where callback must be function that accepts the settings array and returns a FullTextQueryBuilder
1042     *
1043     * Legacy version:
1044     * [
1045     *     'builder_class' => ClassName
1046     *     'settings' => ...
1047     * ]
1048     * where ClassName must declare a constructor with these arguments:
1049     *   SearchConfig $config, KeywordFeature[] $features, $settings
1050     *
1051     * Visible for testing only
1052     * @param array $builderSettings
1053     * @param SearchConfig $config
1054     * @param KeywordFeature[] $features
1055     * @return FullTextQueryBuilder
1056     * @throws \ReflectionException
1057     */
1058    final public static function buildFullTextBuilder(
1059        array $builderSettings,
1060        SearchConfig $config,
1061        array $features
1062    ): FullTextQueryBuilder {
1063        if ( isset( $builderSettings['builder_class'] ) ) {
1064            $objectFactorySpecs = [
1065                'class' => $builderSettings['builder_class'],
1066                'args' => [
1067                    $config,
1068                    $features,
1069                    $builderSettings['settings']
1070                ]
1071            ];
1072        } elseif ( $builderSettings['builder_factory'] ) {
1073            $objectFactorySpecs = [
1074                'factory' => $builderSettings['builder_factory'],
1075                'args' => [
1076                    $builderSettings['settings']
1077                ]
1078            ];
1079        } else {
1080            throw new \InvalidArgumentException( 'Missing builder_class or builder_factory in the builderSettings' );
1081        }
1082
1083        /** @var FullTextQueryBuilder $qb */
1084        // @phan-suppress-next-line PhanTypeInvalidCallableArraySize
1085        $qb = ObjectFactory::getObjectFromSpec( $objectFactorySpecs );
1086        if ( !( $qb instanceof FullTextQueryBuilder ) ) {
1087            throw new RuntimeException( 'Bad builder class configured.' );
1088        }
1089
1090        return $qb;
1091    }
1092}