Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 372 |
|
0.00% |
0 / 33 |
CRAP | |
0.00% |
0 / 1 |
ElasticSearchTtmServer | |
0.00% |
0 / 371 |
|
0.00% |
0 / 33 |
7310 | |
0.00% |
0 / 1 |
isLocalSuggestion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
expandLocation | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
query | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
doQuery | |
0.00% |
0 / 78 |
|
0.00% |
0 / 1 |
110 | |||
update | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
42 | |||
createDocument | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
createIndex | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
6 | |||
beginBootstrap | |
0.00% |
0 / 41 |
|
0.00% |
0 / 1 |
12 | |||
beginBatch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
batchInsertDefinitions | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
batchInsertTranslations | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
6 | |||
endBatch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
endBootstrap | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getClient | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
useWikimediaExtraPlugin | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
getIndexName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getIndex | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getShardCount | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getReplicaCount | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
waitUntilReady | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
setLogger | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
logOutput | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
setDoReIndex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
parseQueryString | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
110 | |||
createSearch | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
30 | |||
search | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
getFacets | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
12 | |||
getTotalHits | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getDocuments | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
deleteByQuery | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
getElasticsearchVersion | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
checkElasticsearchVersion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
assertResultSetInstance | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace MediaWiki\Extension\Translate\TtmServer; |
5 | |
6 | use Elastica\Aggregation\Terms; |
7 | use Elastica\Client; |
8 | use Elastica\Document; |
9 | use Elastica\Exception\ExceptionInterface; |
10 | use Elastica\Index; |
11 | use Elastica\Mapping; |
12 | use Elastica\Query; |
13 | use Elastica\Query\BoolQuery; |
14 | use Elastica\Query\FunctionScore; |
15 | use Elastica\Query\MatchQuery; |
16 | use Elastica\Query\Term; |
17 | use Elastica\ResultSet; |
18 | use Elastica\Search; |
19 | use Exception; |
20 | use MediaWiki\Extension\Elastica\MWElasticUtils; |
21 | use MediaWiki\Extension\Translate\LogNames; |
22 | use MediaWiki\Extension\Translate\MessageLoading\MessageHandle; |
23 | use MediaWiki\Extension\Translate\TranslatorInterface\TranslationHelperException; |
24 | use MediaWiki\Logger\LoggerFactory; |
25 | use MediaWiki\MediaWikiServices; |
26 | use MediaWiki\Title\Title; |
27 | use MediaWiki\WikiMap\WikiMap; |
28 | use RuntimeException; |
29 | use TTMServerBootstrap; |
30 | |
31 | /** |
32 | * TtmServer backend based on ElasticSearch. Depends on Elastica. |
33 | * @author Niklas Laxström |
34 | * @license GPL-2.0-or-later |
35 | * @since 2014.04 |
36 | * @ingroup TTMServer |
37 | */ |
38 | class ElasticSearchTtmServer |
39 | extends TtmServer |
40 | implements ReadableTtmServer, WritableTtmServer, SearchableTtmServer |
41 | { |
/**
 * @const int Number of times a write operation is retried when it fails
 * during a batch process.
 */
private const BULK_INDEX_RETRY_ATTEMPTS = 5;

/**
 * @const int Time (in seconds) to wait for the index to become ready before
 * starting to index. Since we wait for index status it can be relatively
 * long, especially if some nodes are restarted.
 */
private const WAIT_UNTIL_READY_TIMEOUT = 3600;

/** Elastica client, created lazily on first use by getClient(). */
private ?Client $client = null;
/** Reference to the maintenance script to relay logging output. */
private ?TTMServerBootstrap $logger = null;
/** Used for reindex: when true, beginBootstrap() recreates the index. */
private bool $updateMapping = false;
61 | |
/**
 * Tell whether a suggestion comes from the current wiki.
 *
 * @param array $suggestion Suggestion with a 'wiki' key
 * @return bool True when the suggestion's wiki id equals the current wiki id
 */
public function isLocalSuggestion( array $suggestion ): bool {
	$originWiki = $suggestion['wiki'];

	return $originWiki === WikiMap::getCurrentWikiId();
}
65 | |
/**
 * Return the location of a suggestion, which is the value stored under
 * its 'uri' key.
 *
 * @param array $suggestion Suggestion with a 'uri' key
 * @return string
 */
public function expandLocation( array $suggestion ): string {
	[ 'uri' => $uri ] = $suggestion;

	return $uri;
}
69 | |
/**
 * Fetch translation suggestions for a text.
 *
 * Delegates to doQuery() and converts any exception it raises into a
 * TranslationHelperException whose message contains the string form of
 * the original exception.
 *
 * @param string $sourceLanguage
 * @param string $targetLanguage
 * @param string $text
 * @return array
 * @throws TranslationHelperException
 */
public function query( string $sourceLanguage, string $targetLanguage, string $text ): array {
	try {
		$suggestions = $this->doQuery( $sourceLanguage, $targetLanguage, $text );
	} catch ( Exception $failure ) {
		throw new TranslationHelperException( 'Elastica exception: ' . $failure );
	}

	return $suggestions;
}
77 | |
/**
 * Look up translation suggestions for the given text.
 *
 * Two kinds of queries are run against the index: first a fuzzy search for
 * messages in the source language whose content resembles $text, then a
 * lookup of the translations of those messages in the target language.
 *
 * The connection timeout is lowered to 10 seconds for the duration of the
 * lookup and is always restored afterwards, even when one of the queries
 * throws.
 *
 * @param string $sourceLanguage
 * @param string $targetLanguage
 * @param string $text
 * @return array[] Suggestions sorted by descending quality, each with the keys
 *  source, target, context, quality, wiki, location and uri. The sort keeps
 *  the original array keys, so keys do not reflect the order.
 * @throws RuntimeException If the wikimedia extra plugin is not enabled
 */
private function doQuery( string $sourceLanguage, string $targetLanguage, string $text ): array {
	if ( !$this->useWikimediaExtraPlugin() ) {
		// ElasticTTM is currently not compatible with elasticsearch 2.x/5.x
		// It needs FuzzyLikeThis ported via the wmf extra plugin
		throw new RuntimeException( 'The wikimedia extra plugin is mandatory.' );
	}
	/* Two query system:
	 * 1) Find all strings in source language that match text
	 * 2) Do another query for translations for those strings
	 */
	$connection = $this->getClient()->getConnection();
	$oldTimeout = $connection->getTimeout();
	$connection->setTimeout( 10 );

	$suggestions = [];

	// The finally block guarantees the timeout is reset on every exit path,
	// including exceptions thrown by the searches below.
	try {
		$fuzzyQuery = new FuzzyLikeThis();
		$fuzzyQuery->setLikeText( $text );
		$fuzzyQuery->addFieldNames( [ 'content' ] );

		$boostQuery = new FunctionScore();
		$boostQuery->addFunction(
			'levenshtein_distance_score',
			[
				'text' => $text,
				'field' => 'content'
			]
		);
		$boostQuery->setBoostMode( FunctionScore::BOOST_MODE_REPLACE );

		// Wrap the fuzzy query, so it can be used as a filter.
		// This is slightly faster, as ES can throw away the scores by this query.
		$bool = new BoolQuery();
		$bool->addFilter( $fuzzyQuery );
		$bool->addMust( $boostQuery );

		$languageFilter = new Term();
		$languageFilter->setTerm( 'language', $sourceLanguage );
		$bool->addFilter( $languageFilter );

		// The whole query
		$query = new Query();
		$query->setQuery( $bool );

		// The interface usually displays three best candidates. These might
		// come from more than three source things, if the translations are
		// the same. In other words suggestions are grouped by the suggested
		// translation. This algorithm might not find all suggestions, if the
		// top N best matching source texts don't have equivalent translations
		// in the target language, but worse matches which we did not fetch do.
		// This code tries to balance between doing too many or too big queries
		// and not fetching enough results to show all possible suggestions.
		$sizeFirst = 100;
		$sizeSecond = $sizeFirst * 5;

		$query->setFrom( 0 );
		$query->setSize( $sizeFirst );
		$query->setParam( '_source', [ 'content' ] );
		$cutoff = $this->config['cutoff'] ?? 0.65;
		$query->setParam( 'min_score', $cutoff );
		$query->setSort( [ '_score', 'wiki', 'localid' ] );

		/* This query is doing two unrelated things:
		 * 1) Collect the message contents and scores so that they can
		 *    be accessed later for the translations we found.
		 * 2) Build the query string for the query that fetches the translations.
		 */
		$contents = $scores = $terms = [];
		do {
			$resultset = $this->getIndex()->search( $query );

			if ( count( $resultset ) === 0 ) {
				break;
			}

			foreach ( $resultset->getResults() as $result ) {
				$data = $result->getData();
				$score = $result->getScore();

				// Document ids end in "/<language>"; strip that to get the source id.
				$sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() );
				$contents[$sourceId] = $data['content'];
				$scores[$sourceId] = $score;
				$terms[] = "$sourceId/$targetLanguage";
			}

			// Check if it looks like that we are hitting the long tail already.
			// Otherwise, we'll do a query to fetch some more to reach a "sane"
			// breaking point, i.e. include all suggestions with same content
			// for reliable used X times statistics.
			if ( count( array_unique( $scores ) ) > 5 ) {
				break;
			}

			// Okay, We are now in second iteration of the loop. We already got
			// lots of suggestions. We will give up for now even if it means we
			// return in some sense incomplete results.
			if ( count( $resultset ) === $sizeSecond ) {
				break;
			}

			// After the first query, the smallest score is the new threshold.
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
			$query->setParam( 'min_score', $score );
			$query->setFrom( $query->getParam( 'size' ) + $query->getParam( 'from' ) );
			$query->setSize( $sizeSecond );

			// Break if we already got all hits
		} while ( $resultset->getTotalHits() > count( $contents ) );

		// Skip second query if first query found nothing.
		if ( $terms !== [] ) {
			$idQuery = new Query\Terms( '_id', $terms );

			$query = new Query( $idQuery );
			$query->setSize( 25 );
			$query->setParam( '_source', [ 'wiki', 'uri', 'content', 'localid' ] );
			$resultset = $this->getIndex()->search( $query );

			foreach ( $resultset->getResults() as $result ) {
				$data = $result->getData();

				// Construct the matching source id
				$sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() );

				$suggestions[] = [
					'source' => $contents[$sourceId],
					'target' => $data['content'],
					'context' => $data['localid'],
					'quality' => $scores[$sourceId],
					'wiki' => $data['wiki'],
					'location' => $data['localid'] . '/' . $targetLanguage,
					'uri' => $data['uri'],
				];
			}

			// Ensure results are in quality order (best first)
			uasort(
				$suggestions,
				static fn ( array $a, array $b ): int => $b['quality'] <=> $a['quality']
			);
		}
	} finally {
		$connection->setTimeout( $oldTimeout );
	}

	return $suggestions;
}
228 | |
229 | /* Write functions */ |
230 | |
/**
 * Bring the index up to date after a message was changed.
 *
 * For a known message with a language code:
 *  - an existing translation document for this wiki, language and message
 *    is deleted, unless the handle is for the source language (definitions
 *    are kept because translations are attached to them);
 *  - a new document is indexed when $targetText is given and the group
 *    has a source language.
 *
 * @param MessageHandle $handle
 * @param ?string $targetText New text, or null if nothing should be added
 *  (the translation was made fuzzy)
 * @return bool False if the handle is invalid or has no language code,
 *  true otherwise
 */
public function update( MessageHandle $handle, ?string $targetText ): bool {
	if ( !( $handle->isValid() && $handle->getCode() !== '' ) ) {
		return false;
	}

	$sourceLanguage = $handle->getGroup()->getSourceLanguage();
	$languageCode = $handle->getCode();

	// Only translations are removed; definitions stay in place.
	if ( $languageCode !== $sourceLanguage ) {
		$messageKey = $handle->getTitleForBase()->getPrefixedText();

		$staleTranslation = new BoolQuery();
		$staleTranslation->addFilter( new Term( [ 'wiki' => WikiMap::getCurrentWikiId() ] ) );
		$staleTranslation->addFilter( new Term( [ 'language' => $languageCode ] ) );
		$staleTranslation->addFilter( new Term( [ 'localid' => $messageKey ] ) );

		$this->deleteByQuery( $this->getIndex(), Query::create( $staleTranslation ) );
	}

	// Nothing to add for fuzzied translations or when the source language is unknown.
	if ( $targetText === null || $sourceLanguage === null ) {
		return true;
	}

	$sourceRevision = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
	$document = $this->createDocument( $handle, $targetText, $sourceRevision );
	// Captured here because __METHOD__ would not name this method inside the closure.
	$fname = __METHOD__;

	$indexDocument = function () use ( $document ) {
		$this->getIndex()->addDocuments( [ $document ] );
	};
	$reportFailure = static function ( $failure, $errors ) use ( $fname ) {
		$class = get_class( $failure );
		$reason = $failure->getMessage();
		error_log( $fname . ": update failed ($class: $reason); retrying." );
		sleep( 10 );
	};

	MWElasticUtils::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS, $indexDocument, $reportFailure );

	return true;
}
286 | |
/**
 * Build the Elastica document for a message text.
 *
 * The document id has the form "<wiki>-<localid>-<revId>/<language>".
 *
 * @param MessageHandle $handle
 * @param string $text Stored as the document content
 * @param int $revId Revision id that becomes part of the document id
 * @return Document
 */
private function createDocument( MessageHandle $handle, string $text, int $revId ): Document {
	$languageCode = $handle->getCode();
	$messageKey = $handle->getTitleForBase()->getPrefixedText();
	$wikiId = WikiMap::getCurrentWikiId();

	$fields = [
		'wiki' => $wikiId,
		'uri' => $handle->getTitle()->getCanonicalURL(),
		'localid' => $messageKey,
		'language' => $languageCode,
		'content' => $text,
		'group' => $handle->getGroupIds(),
	];

	return new Document( "$wikiId-$messageKey-$revId/$languageCode", $fields, '_doc' );
}
305 | |
/**
 * Create the index with the analysis settings used by the TTM server.
 *
 * A replica setting containing a dash (such as "0-2") is sent as
 * auto_expand_replicas, anything else as number_of_replicas.
 *
 * @param bool $rebuild Deletes index first if already exists
 */
private function createIndex( bool $rebuild ): void {
	$analysis = [
		'filter' => [
			'prefix_filter' => [
				'type' => 'edge_ngram',
				'min_gram' => 2,
				'max_gram' => 20
			]
		],
		'analyzer' => [
			'prefix' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'prefix_filter' ]
			],
			'casesensitive' => [
				'tokenizer' => 'standard'
			]
		]
	];

	$replicas = $this->getReplicaCount();
	$replicaSetting = str_contains( $replicas, '-' ) ? 'auto_expand_replicas' : 'number_of_replicas';

	$indexSettings = [
		'settings' => [
			'index' => [
				'number_of_shards' => $this->getShardCount(),
				'analysis' => $analysis,
				$replicaSetting => $replicas,
			],
		],
	];

	$this->getIndex()->create( $indexSettings, $rebuild );
}
340 | |
/**
 * Begin the bootstrap process.
 *
 * Steps, in order: verify the Elasticsearch version, create the index if
 * it is missing (or recreate it when a reindex was requested), disable
 * index refreshing, delete all documents of the current wiki, send the
 * field mapping and wait until the index is ready.
 *
 * @throws RuntimeException
 */
public function beginBootstrap(): void {
	$this->checkElasticsearchVersion();
	$index = $this->getIndex();

	$recreate = $this->updateMapping;
	if ( $recreate ) {
		$this->logOutput( 'Updating the index mappings...' );
	}
	// exists() is only consulted when no reindex was requested.
	if ( $recreate || !$index->exists() ) {
		$this->createIndex( $recreate );
	}

	// Refreshing stays off during the bootstrap; endBootstrap() sets an interval again.
	$index->getSettings()->setRefreshInterval( '-1' );

	$currentWiki = new Term();
	$currentWiki->setTerm( 'wiki', WikiMap::getCurrentWikiId() );
	$this->deleteByQuery( $this->getIndex(), Query::create( $currentWiki ) );

	$keyword = [ 'type' => 'keyword' ];
	$properties = [
		'wiki' => $keyword,
		'localid' => $keyword,
		'uri' => $keyword,
		'language' => $keyword,
		'group' => $keyword,
		'content' => [
			'type' => 'text',
			'fields' => [
				'content' => [
					'type' => 'text',
					'term_vector' => 'yes'
				],
				'prefix_complete' => [
					'type' => 'text',
					'analyzer' => 'prefix',
					'search_analyzer' => 'standard',
					'term_vector' => 'yes'
				],
				'case_sensitive' => [
					'type' => 'text',
					'analyzer' => 'casesensitive',
					'term_vector' => 'yes'
				]
			]
		],
	];

	( new Mapping( $properties ) )->send( $index, [ 'include_type_name' => 'false' ] );

	$this->waitUntilReady();
}
394 | |
/** Intentionally empty: this backend performs no work at the start of a batch. */
public function beginBatch(): void {
}
397 | |
/**
 * Index a batch of message definitions.
 *
 * Runs a link batch for the titles of all handles first, then indexes the
 * batch the same way as translations.
 *
 * @param array[] $batch
 * @phan-param array<int,array{0:MessageHandle,1:string,2:string}> $batch
 */
public function batchInsertDefinitions( array $batch ): void {
	$linkBatch = MediaWikiServices::getInstance()->getLinkBatchFactory()->newLinkBatch();
	foreach ( $batch as [ $handle ] ) {
		$linkBatch->addObj( $handle->getTitle() );
	}
	$linkBatch->execute();

	$this->batchInsertTranslations( $batch );
}
411 | |
/**
 * Index a batch of message texts.
 *
 * Indexing is retried up to BULK_INDEX_RETRY_ATTEMPTS times, with a
 * 10 second pause after each failure. An empty batch is a no-op.
 *
 * @param array[] $batch Each item is [ MessageHandle, source language code, text ]
 */
public function batchInsertTranslations( array $batch ): void {
	// Nothing to index. Sending an empty bulk request would only fail and
	// then go through the whole retry cycle for no reason.
	if ( $batch === [] ) {
		return;
	}

	$docs = [];
	foreach ( $batch as $data ) {
		[ $handle, $sourceLanguage, $text ] = $data;
		$revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
		$docs[] = $this->createDocument( $handle, $text, $revId );
	}

	MWElasticUtils::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS,
		function () use ( $docs ) {
			$this->getIndex()->addDocuments( $docs );
		},
		function ( $e, $errors ) {
			$c = get_class( $e );
			$msg = $e->getMessage();
			$this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
			sleep( 10 );
		}
	);
}
432 | |
/** Intentionally empty: this backend performs no work at the end of a batch. */
public function endBatch(): void {
}
435 | |
/**
 * Finish the bootstrap process: refresh and force-merge the index, then
 * set its refresh interval to five seconds.
 */
public function endBootstrap(): void {
	$index = $this->getIndex();

	$index->refresh();
	$index->forcemerge();

	$settings = $index->getSettings();
	$settings->setRefreshInterval( '5s' );
}
442 | |
/**
 * Return the Elastica client, creating it on first use.
 *
 * The client is built from the 'config' entry of the server configuration
 * when that entry is set, and with Elastica's defaults otherwise.
 */
public function getClient(): Client {
	$this->client ??= isset( $this->config['config'] )
		? new Client( $this->config['config'] )
		: new Client();

	return $this->client;
}
453 | |
/**
 * @return bool True if the 'use_wikimedia_extra' configuration entry is set
 *  to a truthy value, i.e. the backend is configured with the wikimedia
 *  extra plugin
 */
public function useWikimediaExtraPlugin(): bool {
	return (bool)( $this->config['use_wikimedia_extra'] ?? false );
}
458 | |
/** Name of the index: the 'index' configuration entry, or 'ttmserver' by default. */
private function getIndexName(): string {
	$indexName = $this->config['index'] ?? 'ttmserver';

	return $indexName;
}
462 | |
/** Return the Elastica index object for the configured index name. */
public function getIndex(): Index {
	$client = $this->getClient();

	return $client->getIndex( $this->getIndexName() );
}
467 | |
/**
 * Number of shards for the index: the 'shards' configuration entry, or 1
 * by default.
 *
 * The value is cast because the configuration is untyped: with strict
 * types enabled, a value such as '2' would otherwise violate the int
 * return type and raise a TypeError.
 */
private function getShardCount(): int {
	return (int)( $this->config['shards'] ?? 1 );
}
471 | |
/**
 * Replica setting for the index: the 'replicas' configuration entry, or
 * '0-2' by default.
 *
 * The value is cast because the configuration is untyped: with strict
 * types enabled, an integer such as 1 would otherwise violate the string
 * return type and raise a TypeError.
 */
private function getReplicaCount(): string {
	return (string)( $this->config['replicas'] ?? '0-2' );
}
475 | |
/**
 * Wait for the index to go green, relaying every status message to the
 * logger. Terminates the script if the wait does not succeed within
 * WAIT_UNTIL_READY_TIMEOUT seconds.
 */
private function waitUntilReady(): void {
	$indexName = $this->getIndexName();
	$progress = MWElasticUtils::waitForGreen(
		$this->getClient(),
		$indexName,
		self::WAIT_UNTIL_READY_TIMEOUT
	);

	$this->logOutput( "Waiting for the index to go green..." );
	foreach ( $progress as $statusLine ) {
		$this->logOutput( $statusLine );
	}

	// NOTE(review): $progress is iterated and then asked for its return value,
	// so it is presumably a Generator that returns whether the wait succeeded.
	$ready = $progress->getReturn();
	if ( !$ready ) {
		die( "Timeout! Please check server logs for {$indexName}." );
	}
}
490 | |
/**
 * Set the maintenance script that receives the output of logOutput().
 *
 * @param TTMServerBootstrap $logger
 */
public function setLogger( TTMServerBootstrap $logger ): void {
	$this->logger = $logger;
}
494 | |
/**
 * Relay a line of output to the logger, if one has been set. A newline is
 * appended to the text. Without a logger the text is discarded.
 */
private function logOutput( string $text ): void {
	$this->logger?->statusLine( "$text\n" );
}
501 | |
/** Request a reindex: the next beginBootstrap() call recreates the index. */
public function setDoReIndex(): void {
	$this->updateMapping = true;
}
505 | |
/**
 * Parse query string and build the search query.
 *
 * The query string is split on whitespace. Each word is assigned to a
 * content field: words containing "*" use the part before the asterisk on
 * the prefix field, other words use the case-sensitive field when
 * $opts['case'] is '1' and the plain content field otherwise.
 *
 * @param string $queryString
 * @param array $opts Must contain the keys 'match' and 'case'
 * @return array [ BoolQuery $searchQuery, array $highlights ] where
 *  $highlights maps each used field to its highlighting options
 */
private function parseQueryString( string $queryString, array $opts ): array {
	$matchMode = $opts['match'];
	$caseSensitive = $opts['case'] === '1';

	// Group the words of the query string by the field they are searched in
	$wordsByField = [];
	foreach ( preg_split( '/\s+/', $queryString ) as $rawWord ) {
		$prefix = strstr( $rawWord, '*', true );
		if ( $prefix ) {
			// For wildcard search
			$field = 'content.prefix_complete';
			$word = $prefix;
		} elseif ( $caseSensitive ) {
			// For case-sensitive search
			$field = 'content.case_sensitive';
			$word = $rawWord;
		} else {
			$field = 'content';
			$word = $rawWord;
		}
		$wordsByField[$field][] = $word;
	}

	$searchQuery = new BoolQuery();
	$highlights = [];
	foreach ( $wordsByField as $field => $words ) {
		// Fields for highlighting
		$highlights[$field] = [
			'number_of_fragments' => 0
		];

		foreach ( $words as $word ) {
			// A word matches either the message content or, with exact match
			// only, the message id (page name without language subpage).
			$contentMatch = new MatchQuery();
			$contentMatch->setFieldQuery( $field, $word );
			$idMatch = new Term();
			$idMatch->setTerm( 'localid', $word );

			$wordQuery = new BoolQuery();
			$wordQuery->addShould( $contentMatch );
			$wordQuery->addShould( $idMatch );

			if ( $matchMode === 'all' ) {
				$searchQuery->addMust( $wordQuery );
			} else {
				$searchQuery->addShould( $wordQuery );
			}

			// Allow searching by exact message title (page name with
			// language subpage).
			$title = Title::newFromText( $word );
			if ( $title ) {
				$handle = new MessageHandle( $title );
				if ( $handle->isValid() && $handle->getCode() !== '' ) {
					$titleMatch = new Term();
					$titleMatch->setTerm( 'localid', $handle->getTitleForBase()->getPrefixedText() );

					$titleQuery = new BoolQuery();
					$titleQuery->addMust( $titleMatch );
					$searchQuery->addShould( $titleQuery );
				}
			}
		}
	}

	return [ $searchQuery, $highlights ];
}
571 | |
/**
 * Search interface: build the search object for a query string.
 *
 * @param string $queryString
 * @param array $opts Must contain the keys 'match', 'case', 'limit',
 *  'offset', 'language' and 'group'
 * @param array $highlight [ pre tag, post tag ] wrapped around highlighted text
 * @return Search
 */
public function createSearch( string $queryString, array $opts, array $highlight ): Search {
	[ $searchQuery, $highlightFields ] = $this->parseQueryString( $queryString, $opts );

	$query = new Query();
	$query->setQuery( $searchQuery );

	$languageFacet = new Terms( 'language' );
	$languageFacet->setField( 'language' );
	$languageFacet->setSize( 500 );
	$query->addAggregation( $languageFacet );

	// Would like to prioritize the top level groups and not show subgroups
	// if the top group has only few hits, but that doesn't seem to be possible.
	$groupFacet = new Terms( 'group' );
	$groupFacet->setField( 'group' );
	$groupFacet->setSize( 500 );
	$query->addAggregation( $groupFacet );

	$query->setSize( $opts['limit'] );
	$query->setFrom( $opts['offset'] );

	// BoolAnd filters are executed in sequence per document. Bool filters with
	// multiple must clauses are executed by converting each filter into a bit
	// field then anding them together. The latter is normally faster if either
	// of the subfilters are reused. May not make a difference in this context.
	$filters = new BoolQuery();
	$hasFilter = false;

	$languageCode = $opts['language'];
	if ( $languageCode !== '' ) {
		$languageFilter = new Term();
		$languageFilter->setTerm( 'language', $languageCode );
		$filters->addFilter( $languageFilter );
		$hasFilter = true;
	}

	$groupId = $opts['group'];
	if ( $groupId !== '' ) {
		$groupFilter = new Term();
		$groupFilter->setTerm( 'group', $groupId );
		$filters->addFilter( $groupFilter );
		$hasFilter = true;
	}

	// Only attach the filter when it has at least one clause, to avoid
	// invalid query errors.
	if ( $hasFilter ) {
		// TODO: This seems wrong, but perhaps for aggregation purposes?
		// should make $search a must clause and use the bool query
		// as main.
		$query->setPostFilter( $filters );
	}

	[ $preTag, $postTag ] = $highlight;
	$query->setHighlight( [
		// The value must be an object
		'pre_tags' => [ $preTag ],
		'post_tags' => [ $postTag ],
		'fields' => $highlightFields,
	] );

	return $this->getIndex()->createSearch( $query );
}
632 | |
/**
 * Search interface: build and run the search for a query string.
 *
 * Elastica exceptions raised while running the search are rethrown as
 * TtmServerException with the same message.
 *
 * @throws TtmServerException
 */
public function search( string $queryString, array $opts, array $highlight ): ResultSet {
	// Built outside the try block: only failures of the search itself are converted.
	$search = $this->createSearch( $queryString, $opts, $highlight );

	try {
		$resultSet = $search->search();
	} catch ( ExceptionInterface $failure ) {
		throw new TtmServerException( $failure->getMessage() );
	}

	return $resultSet;
}
646 | |
/**
 * @inheritDoc
 *
 * Returns the document counts per aggregation bucket, keyed by aggregation
 * name and bucket key. The 'language' and 'group' entries are always present.
 */
public function getFacets( $resultset ): array {
	$this->assertResultSetInstance( $resultset );
	$aggs = $resultset->getAggregations();
	'@phan-var array[][][] $aggs';

	$facets = [
		'language' => [],
		'group' => []
	];

	foreach ( $aggs as $name => $aggregation ) {
		foreach ( $aggregation['buckets'] as [ 'key' => $bucketKey, 'doc_count' => $count ] ) {
			$facets[$name][$bucketKey] = $count;
		}
	}

	return $facets;
}
666 | |
/**
 * @inheritDoc
 *
 * Returns the total hit count reported by the result set.
 */
public function getTotalHits( $resultset ): int {
	$this->assertResultSetInstance( $resultset );
	$totalHits = $resultset->getTotalHits();

	return $totalHits;
}
672 | |
/**
 * @inheritDoc
 *
 * Returns the data of every result. When a highlight exists, 'content' is
 * replaced by the first highlighted fragment, taken from the first of the
 * prefix, case-sensitive and plain content fields that has one.
 */
public function getDocuments( $resultset ): array {
	$this->assertResultSetInstance( $resultset );

	// Highlight fields in order of preference
	$highlightFields = [ 'content.prefix_complete', 'content.case_sensitive', 'content' ];

	$documents = [];
	foreach ( $resultset->getResults() as $hit ) {
		$row = $hit->getData();
		$highlighted = $hit->getHighlights();

		foreach ( $highlightFields as $field ) {
			if ( isset( $highlighted[$field][0] ) ) {
				$row['content'] = $highlighted[$field][0];
				break;
			}
		}

		$documents[] = $row;
	}

	return $documents;
}
692 | |
/**
 * Delete the documents matching a query, allowing version conflicts.
 *
 * Any failure is logged to the TTM server log channel and rethrown as a
 * RuntimeException whose message includes the string form of the original
 * exception.
 *
 * TODO: Elastica\Index::deleteByQuery() ? was removed
 * in 2.x and returned in 5.x.
 *
 * @throws RuntimeException
 */
private function deleteByQuery( Index $sourceIndex, Query $query ): void {
	try {
		MWElasticUtils::deleteByQuery( $sourceIndex, $query, /* $allowConflicts = */ true );
	} catch ( Exception $failure ) {
		$logger = LoggerFactory::getInstance( LogNames::ELASTIC_SEARCH_TTMSERVER );
		$logger->error(
			'Problem encountered during deletion.',
			[ 'exception' => $failure ]
		);

		throw new RuntimeException( "Problem encountered during deletion.\n" . $failure );
	}
}
711 | |
/**
 * Ask the server for its version number.
 *
 * @return string Value of version.number in the response to a request on
 *  the root path
 * @throws RuntimeException If the request fails or the response carries no
 *  version number
 */
private function getElasticsearchVersion(): string {
	$response = $this->getClient()->request( '' );
	if ( !$response->isOK() ) {
		throw new RuntimeException( "Cannot fetch elasticsearch version: " . $response->getError() );
	}

	$version = $response->getData()['version']['number'] ?? null;
	if ( $version === null ) {
		throw new RuntimeException( 'Unable to determine elasticsearch version, aborting.' );
	}

	return $version;
}
726 | |
/**
 * Ensure the server runs a supported Elasticsearch version, i.e. one whose
 * version number starts with "6.8" or "7.".
 *
 * @throws RuntimeException If the version is not supported or cannot be fetched
 */
private function checkElasticsearchVersion(): void {
	$version = $this->getElasticsearchVersion();
	$supported = str_starts_with( $version, '6.8' ) || str_starts_with( $version, '7.' );

	if ( !$supported ) {
		throw new RuntimeException( "Only Elasticsearch 6.8.x and 7.x are supported. Your version: $version." );
	}
}
733 | |
/**
 * Guard for the methods that receive an untyped result set.
 *
 * @param mixed $resultSet
 * @throws RuntimeException If $resultSet is not an Elastica ResultSet
 */
private function assertResultSetInstance( $resultSet ): void {
	if ( !( $resultSet instanceof ResultSet ) ) {
		throw new RuntimeException(
			"Expected resultset to be an instance of " . ResultSet::class
		);
	}
}
744 | } |
745 | |
// The translation memory configuration ($wgTranslateTranslationServices) refers to this class
// by the name ElasticSearchTTMServer, so an alias with that spelling is registered here.
// See: https://www.mediawiki.org/wiki/Help:Extension:Translate/Translation_memories#Configuration
class_alias( ElasticSearchTtmServer::class, "ElasticSearchTTMServer" );