Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 380 |
|
0.00% |
0 / 19 |
CRAP | |
0.00% |
0 / 1 |
ForceSearchIndex | |
0.00% |
0 / 373 |
|
0.00% |
0 / 19 |
8190 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 78 |
|
0.00% |
0 / 1 |
702 | |||
buildPageIdBatches | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
72 | |||
buildUpdateFlags | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
waitForQueueToShrink | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
waitForQueueToDrain | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
42 | |||
calculateIndexingRate | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
simpleCheckIndexes | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
getDeletesIterator | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
12 | |||
getIdsIterator | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getUpdatesByDateIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
getUpdatesByIdIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
12 | |||
attachTimestampConditions | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
attachPageConditions | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
wrapDecodeResults | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
decidePage | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
56 | |||
buildChunks | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
42 | |||
getUpdatesInQueue | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
createUpdater | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use BatchRowIterator; |
6 | use CirrusSearch\BuildDocument\BuildDocument; |
7 | use CirrusSearch\Iterator\CallbackIterator; |
8 | use CirrusSearch\Job; |
9 | use CirrusSearch\SearchConfig; |
10 | use CirrusSearch\Updater; |
11 | use IDBAccessObject; |
12 | use MediaWiki\Logger\LoggerFactory; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Title\Title; |
15 | use MediaWiki\Utils\MWTimestamp; |
16 | use MediaWiki\WikiMap\WikiMap; |
17 | use Throwable; |
18 | use UnexpectedValueException; |
19 | use Wikimedia\Rdbms\IDatabase; |
20 | use WikiPage; |
21 | |
22 | /** |
23 | * Force reindexing change to the wiki. |
24 | * |
25 | * This program is free software; you can redistribute it and/or modify |
26 | * it under the terms of the GNU General Public License as published by |
27 | * the Free Software Foundation; either version 2 of the License, or |
28 | * (at your option) any later version. |
29 | * |
30 | * This program is distributed in the hope that it will be useful, |
31 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
32 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
33 | * GNU General Public License for more details. |
34 | * |
35 | * You should have received a copy of the GNU General Public License along |
36 | * with this program; if not, write to the Free Software Foundation, Inc., |
37 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
38 | * http://www.gnu.org/copyleft/gpl.html |
39 | */ |
40 | |
// Locate the MediaWiki installation. Prefer MW_INSTALL_PATH from the
// environment; getenv() returns false (not '') when unset, so only that
// triggers the fallback to the conventional extension directory layout.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
// Core maintenance framework first, then CirrusSearch's own Maintenance base.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
47 | |
/**
 * Maintenance script that forces (re)indexing of pages into the CirrusSearch
 * search indexes. Depending on the options it indexes updates/creates (by
 * page id, id range, or date range), indexes deletes, or indexes pages into
 * the archive. Work is performed in-process or, with --queue, pushed to the
 * job queue as cirrusSearchMassIndex jobs.
 */
class ForceSearchIndex extends Maintenance {
	/** Minimum interval, in seconds, between job queue length checks. */
	private const SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS = 3;
	/** @var MWTimestamp|null Start (exclusive) of the date range, from --from */
	public $fromDate = null;
	/** @var MWTimestamp|null End (inclusive) of the date range, from --to */
	public $toDate = null;
	/** @var string|null Highest page_id to index, from --toId */
	public $toId = null;
	/** @var bool True when indexing updates/creates; false for deletes/archive */
	public $indexUpdates;
	/** @var bool True when only indexing into the archive, from --archive */
	public $archive;
	/** @var string Maximum number of pages to process, from --limit */
	public $limit;
	/** @var string Truthy when work should go to the job queue, from --queue */
	public $queue;
	/** @var int|null Queue size that pauses job insertion, from --maxJobs */
	public $maxJobs;
	/** @var int|null Queue size to shrink back to before resuming, from --pauseForJobs */
	public $pauseForJobs;
	/** @var int|null Only index pages in this namespace, from --namespace */
	public $namespace;
	/** @var string[] Content models to skip, from --excludeContentTypes */
	public $excludeContentTypes;
	/** @var float Unix timestamp of the most recent job queue length check */
	public $lastJobQueueCheckTime = 0;

	/**
	 * @var bool true if the script is run with --ids
	 */
	private $runWithIds;

	/**
	 * @var int[] list of page ids to reindex when --ids is used
	 */
	private $pageIds;

	/**
	 * Declare all command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( "Force indexing some pages. Setting --from or --to will switch "
			. "from page id based indexing to "
			. "date based indexing which uses less efficient queries and follows redirects.\n\n"
			. "Note: All froms are _exclusive_ and all tos are _inclusive_.\n"
			. "Note 2: Setting fromId and toId use the efficient query so those are ok.\n"
			. "Note 3: Operates on all clusters unless --cluster is provided.\n"
		);
		$this->setBatchSize( 10 );
		$this->addOption( 'from', 'Start date of reindex in YYYY-mm-ddTHH:mm:ssZ (exc. Defaults ' .
			'to 0 epoch.', false, true );
		$this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ. Defaults to now.',
			false, true );
		$this->addOption( 'fromId', 'Start indexing at a specific page_id. ' .
			'Not useful with --deletes.', false, true );
		$this->addOption( 'toId', 'Stop indexing at a specific page_id. ' .
			'Not useful with --deletes or --from or --to.', false, true );
		$this->addOption( 'ids', 'List of page ids (comma separated) to reindex. ' .
			'Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
		$this->addOption( 'deletes',
			'If this is set then just index deletes, not updates or creates.', false );
		$this->addOption( 'archive',
			'Don\'t delete pages, only index them into the archive.', false, false );
		$this->addOption( 'limit',
			'Maximum number of pages to process before exiting the script. Default to unlimited.',
			false, true );
		$this->addOption( 'buildChunks', 'Instead of running the script spit out commands that ' .
			'can be farmed out to different processes or machines to rebuild the index. Works ' .
			'with fromId and toId, not from and to. If specified as a number then chunks no ' .
			'larger than that size are spat out. If specified as a number followed by the word ' .
			'"total" without a space between them then that many chunks will be spat out sized ' .
			'to cover the entire wiki.', false, true );
		$this->addOption( 'queue', 'Rather than perform the indexes in process add them to the ' .
			'job queue. Ignored for delete.' );
		$this->addOption( 'maxJobs', 'If there are more than this many index jobs in the queue ' .
			'then pause before adding more. This is only checked every ' .
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS .
			' seconds. Not meaningful without --queue.', false, true );
		$this->addOption( 'pauseForJobs', 'If paused adding jobs then wait for there to be less ' .
			'than this many before starting again. Defaults to the value specified for ' .
			'--maxJobs. Not meaningful without --queue.', false, true );
		$this->addOption( 'indexOnSkip', 'When skipping either parsing or links send the document' .
			' as an index. This replaces the contents of the index for that entry with the entry' .
			' built from a skipped process. Without this if the entry does not exist then it will' .
			' be skipped entirely. Only set this when running the first pass of building the' .
			' index. Otherwise, don\'t tempt fate by indexing half complete documents.' );
		$this->addOption( 'forceParse', '(deprecated)' );
		$this->addOption( 'skipParse',
			'Skip parsing the page. This is really only good for running the second half ' .
			'of the two phase index build. If this is specified then the default batch size ' .
			'is actually 50.' );
		$this->addOption( 'skipLinks',
			'Skip looking for links to the page (counting and finding redirects). Use ' .
			'this with --indexOnSkip for the first half of the two phase index build.' );
		$this->addOption( 'namespace', 'Only index pages in this given namespace', false, true );
		$this->addOption( 'excludeContentTypes', 'Exclude pages of the specified content types. ' .
			'These must be a comma separated list of strings such as "wikitext" or "json" ' .
			'matching the CONTENT_MODEL_* constants.', false, true, false );
		$this->addOption( 'useDbIndex',
			'Use specific index when fetching IDs from the database.', false, true, false );
	}

	/**
	 * Entry point. Verifies the indexes exist, parses options, picks the
	 * appropriate batch iterator (ids / id range / date range / deletes) and
	 * processes each batch either in-process or via the job queue.
	 *
	 * @return bool|null Null when --buildChunks was given (commands are
	 *  printed instead of indexing), true otherwise.
	 */
	public function execute() {
		$this->disablePoolCountersAndLogging();
		$wiki = sprintf( "[%20s]", WikiMap::getCurrentWikiId() );

		// Make sure we've actually got indices to populate
		if ( !$this->simpleCheckIndexes() ) {
			$this->fatalError(
				"$wiki index(es) do not exist. Did you forget to run updateSearchIndexConfig?"
			);
		}

		$this->indexUpdates = !$this->getOption( 'deletes', false );
		// We need to check ids options early otherwise hasOption may return
		// true even if the user did not set the option on the commandline
		if ( $this->hasOption( 'ids' ) ) {
			$this->runWithIds = true;
			$this->pageIds = $this->buildPageIdBatches();
		}

		if ( $this->getOption( 'from' ) !== null || $this->getOption( 'to' ) !== null ) {
			// 0 is falsy so MWTimestamp makes that `now`. '00' is epoch 0.
			$this->fromDate = new MWTimestamp( $this->getOption( 'from', '00' ) );
			$this->toDate = new MWTimestamp( $this->getOption( 'to', false ) );
		}
		$this->toId = $this->getOption( 'toId' );
		$this->archive = (bool)$this->getOption( 'archive', false );
		if ( $this->archive ) {
			// If we're indexing only for archive, this implies deletes
			$this->indexUpdates = false;
		}
		$this->limit = $this->getOption( 'limit' );
		$buildChunks = $this->getOption( 'buildChunks' );
		if ( $buildChunks !== null ) {
			// --buildChunks only prints commands for other processes; no indexing here.
			$this->buildChunks( $buildChunks );
			return null;
		}
		$this->queue = $this->getOption( 'queue' );
		$this->maxJobs = $this->getOption( 'maxJobs' )
			? intval( $this->getOption( 'maxJobs' ) )
			: null;
		$this->pauseForJobs = $this->getOption( 'pauseForJobs' ) ?
			intval( $this->getOption( 'pauseForJobs' ) ) : $this->maxJobs;
		$updateFlags = $this->buildUpdateFlags();

		// Queued and delete/archive runs are cheap per row, so use bigger batches
		// unless the user chose a batch size explicitly.
		if ( !$this->getOption( 'batch-size' ) &&
			( $this->getOption( 'queue' ) || !$this->indexUpdates )
		) {
			$this->setBatchSize( 100 );
		}

		$this->namespace = $this->hasOption( 'namespace' ) ?
			intval( $this->getOption( 'namespace' ) ) : null;

		$this->excludeContentTypes = array_filter( array_map(
			'trim',
			explode( ',', $this->getOption( 'excludeContentTypes', '' ) )
		) );

		// Verb used in progress output, matching the mode we run in.
		$operationName = $this->indexUpdates
			? ( $this->queue ? 'Queued' : 'Indexed' )
			: ( $this->archive ? 'Archived' : 'Deleted' );

		$operationStartTime = microtime( true );
		$completed = 0;
		$rate = 0;

		// Pick the batch iterator matching the requested mode.
		if ( $this->runWithIds ) {
			$it = $this->getIdsIterator();
			// @phan-suppress-next-line PhanImpossibleTypeComparison
		} elseif ( $this->indexUpdates && $this->fromDate === null ) {
			$it = $this->getUpdatesByIdIterator();
		} elseif ( $this->indexUpdates ) {
			$it = $this->getUpdatesByDateIterator();
		} else {
			$it = $this->getDeletesIterator();
		}
		$jobQueueGroup = MediaWikiServices::getInstance()->getJobQueueGroup();

		foreach ( $it as $batch ) {
			if ( $this->indexUpdates ) {
				$size = count( $batch['updates'] );
				// Drop null entries; decidePage() keeps them so counts line up.
				$updates = array_filter( $batch['updates'] );
				if ( $this->queue ) {
					$this->waitForQueueToShrink( $wiki );
					$jobQueueGroup->push( Job\MassIndex::build(
						$updates, $updateFlags, $this->getOption( 'cluster' )
					) );
				} else {
					// Update size with the actual number of updated documents.
					$updater = $this->createUpdater();
					$size = $updater->updatePages( $updates, $updateFlags );
				}
			} else {
				$size = count( $batch['titlesToDelete'] );
				$updater = $this->createUpdater();
				$updater->archivePages( $batch['archive'] );
				if ( !$this->archive ) {
					$updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] );
				}
			}

			$completed += $size;
			$rate = $this->calculateIndexingRate( $completed, $operationStartTime );

			$this->output(
				"$wiki $operationName $size pages ending at {$batch['endingAt']} at $rate/second\n"
			);
			if ( $this->limit !== null && $completed > $this->limit ) {
				break;
			}
		}
		$this->output( "$operationName a total of {$completed} pages at $rate/second\n" );
		$this->waitForQueueToDrain( $wiki );

		return true;
	}

	/**
	 * Parse and validate the --ids option. Fatals if --ids is combined with
	 * any option from the id-range/date-range/delete family, or if any id is
	 * not a positive integer.
	 *
	 * @return int[] Unique page ids to reindex
	 */
	private function buildPageIdBatches() {
		if ( !$this->indexUpdates || $this->hasOption( 'limit' )
			|| $this->hasOption( 'from' ) || $this->hasOption( 'to' )
			|| $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' )
		) {
			$this->fatalError(
				'--ids cannot be used with deletes/archive/from/to/fromId/toId/limit'
			);
		}

		$pageIds = array_map(
			function ( $pageId ) {
				$pageId = trim( $pageId );
				if ( !ctype_digit( $pageId ) ) {
					$this->fatalError( "Invalid page id provided in --ids, got '$pageId', " .
						"expected a positive integer" );
				}
				return intval( $pageId );
			},
			explode( ',', $this->getOption( 'ids' ) )
		);
		return array_unique( $pageIds, SORT_REGULAR );
	}

	/**
	 * Translate the skip/index options into BuildDocument flags. As a side
	 * effect, lowers the default batch size to 50 when --skipParse is set
	 * and no explicit batch size was given.
	 *
	 * @return int Bitfield of BuildDocument::* flags
	 */
	private function buildUpdateFlags() {
		$updateFlags = 0;
		if ( $this->getOption( 'indexOnSkip' ) ) {
			$updateFlags |= BuildDocument::INDEX_ON_SKIP;
		}
		if ( $this->getOption( 'skipParse' ) ) {
			$updateFlags |= BuildDocument::SKIP_PARSE;
			if ( !$this->getOption( 'batch-size' ) ) {
				$this->setBatchSize( 50 );
			}
		}
		if ( $this->getOption( 'skipLinks' ) ) {
			$updateFlags |= BuildDocument::SKIP_LINKS;
		}

		return $updateFlags;
	}

	/**
	 * If the job queue has grown past --maxJobs, block until it has shrunk
	 * below --pauseForJobs. Queue length lookups are rate limited to one
	 * every SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS seconds.
	 *
	 * @param string $wiki Wiki id prefix used in progress output
	 */
	private function waitForQueueToShrink( $wiki ) {
		$now = microtime( true );
		// Too soon since the last check: skip this poll entirely.
		if ( $now - $this->lastJobQueueCheckTime <=
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS
		) {
			return;
		}

		$this->lastJobQueueCheckTime = $now;
		$queueSize = $this->getUpdatesInQueue();
		if ( $this->maxJobs === null || $this->maxJobs >= $queueSize ) {
			return;
		}

		do {
			$this->output(
				"$wiki Waiting while job queue shrinks: $this->pauseForJobs > $queueSize\n"
			);
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
			$queueSize = $this->getUpdatesInQueue();
		} while ( $this->pauseForJobs < $queueSize );
	}

	/**
	 * Block until the cirrusSearchMassIndex queue is empty. Gives up early
	 * if the queue size increases, which suggests another script is adding
	 * jobs and will do its own waiting. No-op unless --queue was used.
	 *
	 * @param string $wiki Wiki id prefix used in progress output
	 */
	private function waitForQueueToDrain( $wiki ) {
		if ( !$this->queue ) {
			return;
		}

		$lastQueueSizeForOurJob = PHP_INT_MAX;
		$waitStartTime = microtime( true );
		$this->output( "Waiting for jobs to drain from the queue\n" );
		while ( true ) {
			$queueSizeForOurJob = $this->getUpdatesInQueue();
			if ( $queueSizeForOurJob === 0 ) {
				return;
			}
			// If the count went up then some other process is inserting jobs;
			// stop waiting here and let that process wait for the queue instead.
			if ( $queueSizeForOurJob > $lastQueueSizeForOurJob ) {
				$this->output( "Queue size went up. Another script is likely adding jobs " .
					"and it'll wait for them to empty.\n" );
				return;
			}
			if ( microtime( true ) - $waitStartTime > 120 ) {
				// Wait at least two full minutes before we check if the job count went down.
				// Less than that and we might be seeing lag from redis's counts.
				$lastQueueSizeForOurJob = $queueSizeForOurJob;
			}
			$this->output( "$wiki $queueSizeForOurJob jobs left on the queue.\n" );
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
		}
	}

	/**
	 * Compute the average pages-per-second rate since the operation started.
	 * Rates below one page per second keep one decimal place; faster rates
	 * are rounded to whole pages.
	 *
	 * @param int $completed Pages processed so far
	 * @param float $operationStartTime Unix timestamp when processing began
	 *
	 * @return float Pages processed per second
	 */
	private function calculateIndexingRate( $completed, $operationStartTime ) {
		$rate = $completed / ( microtime( true ) - $operationStartTime );

		if ( $rate < 1 ) {
			return round( $rate, 1 );
		}

		return round( $rate );
	}

	/**
	 * Do some simple sanity checking to make sure we've got indexes to populate.
	 * Note this isn't nearly as robust as updateSearchIndexConfig is, but it's
	 * not designed to be.
	 *
	 * @return bool
	 */
	private function simpleCheckIndexes() {
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );

		// Top-level alias needs to exist
		if ( !$this->getConnection()->getIndex( $indexBaseName )->exists() ) {
			return false;
		}

		// Now check all index types to see if they exist
		foreach ( $this->getConnection()->getAllIndexSuffixes() as $indexSuffix ) {
			// If the alias for this type doesn't exist, fail
			if ( !$this->getConnection()->getIndex( $indexBaseName, $indexSuffix )->exists() ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Build an iterator over batches of page-deletion log entries, yielding
	 * for each batch the titles/doc ids to delete, the archive entries, and
	 * the timestamp the batch ends at.
	 *
	 * @return CallbackIterator
	 */
	protected function getDeletesIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$it = new BatchRowIterator(
			$dbr,
			'logging',
			[ 'log_timestamp' ],
			$this->getBatchSize()
		);

		$this->attachPageConditions( $dbr, $it, 'log' );
		$this->attachTimestampConditions( $dbr, $it, 'log' );
		$it->addConditions( [
			'log_type' => 'delete',
			'log_action' => 'delete',
			'EXISTS(select * from archive where ar_title = log_title and ar_namespace = log_namespace)',
			// Prior to 2010 the logging table contains nulls. As the docs in elasticsearch use the page id
			// as the document id we cannot handle these old rows.
			$dbr->expr( 'log_page', '!=', null ),
		] );

		$it->setFetchColumns( [ 'log_timestamp', 'log_namespace', 'log_title', 'log_page' ] );

		$it->setCaller( __METHOD__ );

		return new CallbackIterator( $it, function ( $batch ) {
			$titlesToDelete = [];
			$docIdsToDelete = [];
			$archive = [];
			foreach ( $batch as $row ) {
				$title = Title::makeTitle( $row->log_namespace, $row->log_title );
				$id = $this->getSearchConfig()->makeId( $row->log_page );
				$titlesToDelete[] = $title;
				$docIdsToDelete[] = $id;
				$archive[] = [
					'title' => $title,
					'page' => $id,
				];
			}

			return [
				'titlesToDelete' => $titlesToDelete,
				'docIdsToDelete' => $docIdsToDelete,
				'archive' => $archive,
				// $row is the last row of the foreach above; unset when the batch was empty.
				'endingAt' => isset( $row )
					? ( new MWTimestamp( $row->log_timestamp ) )->getTimestamp( TS_ISO_8601 )
					: 'unknown',
			];
		} );
	}

	/**
	 * Build an iterator over batches of pages restricted to the explicit
	 * list of page ids given via --ids.
	 *
	 * @return CallbackIterator
	 */
	protected function getIdsIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->addConditions( [ 'page_id' => $this->pageIds ] );
		$it->setCaller( __METHOD__ );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Build an iterator over batches of pages whose latest revision falls in
	 * the configured [fromDate, toDate] window, ordered by revision
	 * timestamp then page id.
	 *
	 * @return CallbackIterator
	 */
	protected function getUpdatesByDateIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator(
			$dbr,
			array_merge( $pageQuery['tables'], [ 'revision' ] ),
			[ 'rev_timestamp', 'page_id' ],
			$this->getBatchSize()
		);
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		// Only join against the latest revision of each page.
		$it->addJoinConditions( [
			'revision' => [ 'JOIN', [ 'rev_page = page_id', 'rev_id = page_latest' ] ]
		] );
		$it->setCaller( __METHOD__ );

		$this->attachTimestampConditions( $dbr, $it, 'rev' );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'rev_timestamp' );
	}

	/**
	 * Build an iterator over batches of pages in the id range given by
	 * --fromId (inclusive) and --toId (inclusive), ordered by page id.
	 *
	 * @return CallbackIterator
	 */
	protected function getUpdatesByIdIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->setCaller( __METHOD__ );
		$fromId = $this->getOption( 'fromId', 0 );
		if ( $fromId > 0 ) {
			$it->addConditions( [
				$dbr->expr( 'page_id', '>=', $fromId ),
			] );
		}
		if ( $this->toId ) {
			$it->addConditions( [
				$dbr->expr( 'page_id', '<=', $this->toId ),
			] );
		}

		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Constrain an iterator to the configured date window, when one was set.
	 *
	 * @param IDatabase $dbr Replica connection used to build expressions
	 * @param BatchRowIterator $it Iterator to attach conditions to
	 * @param string $columnPrefix Table column prefix, e.g. 'rev' or 'log'
	 */
	private function attachTimestampConditions(
		IDatabase $dbr, BatchRowIterator $it, $columnPrefix
	) {
		// When initializing we guarantee that if either fromDate or toDate are provided
		// the other has a sane default value.
		if ( $this->fromDate !== null ) {
			$it->addConditions( [
				$dbr->expr( "{$columnPrefix}_timestamp", '>=', $dbr->timestamp( $this->fromDate ) ),
				$dbr->expr( "{$columnPrefix}_timestamp", '<=', $dbr->timestamp( $this->toDate ) ),
			] );
		}
	}

	/**
	 * Constrain an iterator by namespace and content model filters and,
	 * when --useDbIndex was given, force a specific database index.
	 *
	 * @param IDatabase $dbr Replica connection used to build expressions
	 * @param BatchRowIterator $it Iterator to attach conditions to
	 * @param string $columnPrefix Table column prefix, e.g. 'page' or 'log'
	 */
	private function attachPageConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		if ( $this->namespace !== null ) {
			$it->addConditions( [
				"{$columnPrefix}_namespace" => $this->namespace,
			] );
		}
		if ( $this->excludeContentTypes ) {
			$it->addConditions( [
				$dbr->expr( "{$columnPrefix}_content_model", '!=', $this->excludeContentTypes ),
			] );
		}
		if ( $this->hasOption( 'useDbIndex' ) ) {
			$index = $this->getOption( 'useDbIndex' );
			$it->addOptions( [ 'USE INDEX' => $index ] );
		}
	}

	/**
	 * Wrap a row iterator so each batch is decoded into WikiPage objects
	 * (via decidePage()) plus an 'endingAt' progress marker taken from the
	 * given column of the batch's last row.
	 *
	 * @param BatchRowIterator $it
	 * @param string $endingAtColumn Either 'rev_timestamp' or 'page_id'
	 * @return CallbackIterator
	 */
	private function wrapDecodeResults( BatchRowIterator $it, $endingAtColumn ) {
		return new CallbackIterator( $it, function ( $batch ) use ( $endingAtColumn ) {
			// Build the updater outside the loop because it stores the redirects it hits.
			// Don't build it at the top level so those are stored when it is freed.
			$updater = $this->createUpdater();

			$pages = [];
			$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
			foreach ( $batch as $row ) {
				// No need to call Updater::traceRedirects here because we know this is a valid page
				// because it is in the database.
				$page = $wikiPageFactory->newFromRow( $row, IDBAccessObject::READ_LATEST );

				// null pages still get attached to keep the counts the same. They will be filtered
				// later on.
				$pages[] = $this->decidePage( $updater, $page );
			}

			// $row is the last row of the foreach above; unset when the batch was empty.
			if ( isset( $row ) ) {
				if ( $endingAtColumn === 'rev_timestamp' ) {
					$ts = new MWTimestamp( $row->rev_timestamp );
					$endingAt = $ts->getTimestamp( TS_ISO_8601 );
				} elseif ( $endingAtColumn === 'page_id' ) {
					$endingAt = $row->page_id;
				} else {
					throw new UnexpectedValueException( 'Unknown $endingAtColumn: ' . $endingAtColumn );
				}
			} else {
				$endingAt = 'unknown';
			}

			return [
				'updates' => $pages,
				'endingAt' => $endingAt,
			];
		} );
	}

	/**
	 * Determine the actual page in the index that needs to be updated, based on a
	 * source page.
	 *
	 * @param Updater $updater
	 * @param WikiPage $page
	 * @return WikiPage|null WikiPage to be updated, or null if none.
	 */
	private function decidePage( Updater $updater, WikiPage $page ) {
		try {
			$content = $page->getContent();
		} catch ( Throwable $ex ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				"Error deserializing content, skipping page: {pageId}",
				[ 'pageId' => $page->getTitle()->getArticleID() ]
			);
			return null;
		}

		if ( $content === null ) {
			// Skip pages without content. Pages have no content because their latest revision
			// as loaded by the query above doesn't exist.
			$this->output(
				'Skipping page with no content: ' . $page->getTitle()->getArticleID() . "\n"
			);
			return null;
		}

		if ( !$content->isRedirect() ) {
			return $page;
		}

		if ( $this->toDate === null ) {
			// Looks like we accidentally picked up a redirect when we were indexing by id and thus
			// trying to ignore redirects! Just ignore it! We would filter them out at the db
			// level but that is slow for large wikis.
			return null;
		}

		// We found a redirect. Great. Since we can't index special pages and redirects to special
		// pages are totally possible, as well as fun stuff like redirect loops, we need to use
		// Updater's redirect tracing logic which is very complete. Also, it returns null on
		// self redirects. Great!
		[ $page, ] = $updater->traceRedirects( $page->getTitle() );

		if ( $page != null &&
			Title::makeTitleSafe( $page->getTitle()->getNamespace(), $page->getTitle()->getText() ) === null
		) {
			// The title cannot be rebuilt from its ns_prefix + text.
			// It happens if an invalid title is present in the DB
			// We may prefer to not index them as they are hardly viewable
			$this->output( 'Skipping page with invalid title: ' . $page->getTitle()->getPrefixedText() );
			return null;
		}

		return $page;
	}

	/**
	 * @param string|int $buildChunks If specified as a number then chunks no
	 *  larger than that size are spat out. If specified as a number followed
	 *  by the word "total" without a space between them then that many chunks
	 *  will be spat out sized to cover the entire wiki.
	 */
	private function buildChunks( $buildChunks ) {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		// Fill in missing range bounds from the extremes of the page table.
		if ( $this->toId === null ) {
			$this->toId = $dbr->newSelectQueryBuilder()
				->select( 'MAX(page_id)' )
				->from( 'page' )
				->caller( __METHOD__ )
				->fetchField();
			if ( $this->toId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		$fromId = $this->getOption( 'fromId' );
		if ( $fromId === null ) {
			// fromId is exclusive, hence the - 1 below the real minimum.
			$fromId = $dbr->newSelectQueryBuilder()
				->select( 'MIN(page_id) - 1' )
				->from( 'page' )
				->caller( __METHOD__ )
				->fetchField();
			if ( $fromId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		if ( $fromId === $this->toId ) {
			$this->fatalError(
				"Couldn't find any pages to index. fromId = $fromId = $this->toId = toId."
			);
		}
		$builder = new \CirrusSearch\Maintenance\ChunkBuilder();
		$builder->build( $this->mSelf, $this->getParameters()->getOptions(), $buildChunks, $fromId, $this->toId );
	}

	/**
	 * Get the number of cirrusSearchMassIndex jobs in the queue.
	 * @return int length
	 */
	private function getUpdatesInQueue() {
		return MediaWikiServices::getInstance()->getJobQueueGroup()->get( 'cirrusSearchMassIndex' )->getSize();
	}

	/**
	 * Build an Updater for the configured cluster (all clusters when
	 * --cluster is absent).
	 *
	 * @return Updater
	 */
	private function createUpdater() {
		return Updater::build( $this->getSearchConfig(), $this->getOption( 'cluster', null ) );
	}
}
706 | |
// Boilerplate: hand the class to the maintenance runner, which executes it
// only when this file is invoked directly from the command line.
$maintClass = ForceSearchIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;