Code Coverage

|                            | Lines         | Functions and Methods | CRAP | Classes and Traits |
|----------------------------|---------------|-----------------------|------|--------------------|
| Total                      | 0.00% (0/378) | 0.00% (0/19)          |      | 0.00% (0/1)        |
| ForceSearchIndex           | 0.00% (0/371) | 0.00% (0/19)          | 8372 | 0.00% (0/1)        |
| __construct                | 0.00% (0/61)  | 0.00% (0/1)           | 2    |                    |
| execute                    | 0.00% (0/78)  | 0.00% (0/1)           | 702  |                    |
| buildPageIdBatches         | 0.00% (0/17)  | 0.00% (0/1)           | 72   |                    |
| buildUpdateFlags           | 0.00% (0/12)  | 0.00% (0/1)           | 42   |                    |
| waitForQueueToShrink       | 0.00% (0/14)  | 0.00% (0/1)           | 20   |                    |
| waitForQueueToDrain        | 0.00% (0/17)  | 0.00% (0/1)           | 42   |                    |
| calculateIndexingRate      | 0.00% (0/4)   | 0.00% (0/1)           | 6    |                    |
| simpleCheckIndexes         | 0.00% (0/7)   | 0.00% (0/1)           | 20   |                    |
| getDeletesIterator         | 0.00% (0/39)  | 0.00% (0/1)           | 12   |                    |
| getIdsIterator             | 0.00% (0/9)   | 0.00% (0/1)           | 2    |                    |
| getUpdatesByDateIterator   | 0.00% (0/17)  | 0.00% (0/1)           | 2    |                    |
| getUpdatesByIdIterator     | 0.00% (0/17)  | 0.00% (0/1)           | 12   |                    |
| attachTimestampConditions  | 0.00% (0/7)   | 0.00% (0/1)           | 6    |                    |
| attachPageConditions       | 0.00% (0/12)  | 0.00% (0/1)           | 20   |                    |
| wrapDecodeResults          | 0.00% (0/20)  | 0.00% (0/1)           | 30   |                    |
| decidePage                 | 0.00% (0/22)  | 0.00% (0/1)           | 56   |                    |
| buildChunks                | 0.00% (0/16)  | 0.00% (0/1)           | 42   |                    |
| getUpdatesInQueue          | 0.00% (0/1)   | 0.00% (0/1)           | 2    |                    |
| createUpdater              | 0.00% (0/1)   | 0.00% (0/1)           | 2    |                    |
<?php

namespace CirrusSearch\Maintenance;

use BatchRowIterator;
use CirrusSearch\BuildDocument\BuildDocument;
use CirrusSearch\Iterator\CallbackIterator;
use CirrusSearch\Job;
use CirrusSearch\SearchConfig;
use CirrusSearch\Updater;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MWException;
use MWTimestamp;
use Title;
use WikiMap;
use Wikimedia\Rdbms\IDatabase;
use WikiPage;

/**
 * Force reindexing changes to the wiki.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';

class ForceSearchIndex extends Maintenance {
	private const SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS = 3;
	/** @var MWTimestamp|null */
	public $fromDate = null;
	/** @var MWTimestamp|null */
	public $toDate = null;
	public $toId = null;
	public $indexUpdates;
	public $archive;
	public $limit;
	public $queue;
	public $maxJobs;
	public $pauseForJobs;
	public $namespace;
	public $excludeContentTypes;
	public $lastJobQueueCheckTime = 0;

	/**
	 * @var bool true if the script is run with --ids
	 */
	private $runWithIds;

	/**
	 * @var int[] list of page ids to reindex when --ids is used
	 */
	private $pageIds;

	public function __construct() {
		parent::__construct();
		$this->addDescription( "Force indexing some pages. Setting --from or --to will switch "
			. "from page id based indexing to "
			. "date based indexing, which uses less efficient queries and follows redirects.\n\n"
			. "Note: All froms are _exclusive_ and all tos are _inclusive_.\n"
			. "Note 2: Setting fromId and toId uses the efficient query, so those are ok.\n"
			. "Note 3: Operates on all clusters unless --cluster is provided.\n"
		);
		$this->setBatchSize( 10 );
		$this->addOption( 'from', 'Start date of reindex in YYYY-mm-ddTHH:mm:ssZ (exclusive). ' .
			'Defaults to epoch 0.', false, true );
		$this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ. Defaults to now.',
			false, true );
		$this->addOption( 'fromId', 'Start indexing at a specific page_id. ' .
			'Not useful with --deletes.', false, true );
		$this->addOption( 'toId', 'Stop indexing at a specific page_id. ' .
			'Not useful with --deletes or --from or --to.', false, true );
		$this->addOption( 'ids', 'List of page ids (comma separated) to reindex. ' .
			'Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
		$this->addOption( 'deletes',
			'If this is set then just index deletes, not updates or creates.', false );
		$this->addOption( 'archive',
			'Don\'t delete pages, only index them into the archive.', false, false );
		$this->addOption( 'limit',
			'Maximum number of pages to process before exiting the script. Defaults to unlimited.',
			false, true );
		$this->addOption( 'buildChunks', 'Instead of running the script spit out commands that ' .
			'can be farmed out to different processes or machines to rebuild the index. Works ' .
			'with fromId and toId, not from and to. If specified as a number then chunks no ' .
			'larger than that size are spat out. If specified as a number followed by the word ' .
			'"total" without a space between them then that many chunks will be spat out sized ' .
			'to cover the entire wiki.', false, true );
		$this->addOption( 'queue', 'Rather than performing the indexing in process, add the ' .
			'updates to the job queue. Ignored for deletes.' );
		$this->addOption( 'maxJobs', 'If there are more than this many index jobs in the queue ' .
			'then pause before adding more. This is only checked every ' .
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS .
			' seconds. Not meaningful without --queue.', false, true );
		$this->addOption( 'pauseForJobs', 'If paused adding jobs then wait for there to be less ' .
			'than this many before starting again. Defaults to the value specified for ' .
			'--maxJobs. Not meaningful without --queue.', false, true );
		$this->addOption( 'indexOnSkip', 'When skipping either parsing or links send the document' .
			' as an index. This replaces the contents of the index for that entry with the entry' .
			' built from a skipped process. Without this if the entry does not exist then it will' .
			' be skipped entirely. Only set this when running the first pass of building the' .
			' index. Otherwise, don\'t tempt fate by indexing half complete documents.' );
		$this->addOption( 'forceParse',
			'Bypass ParserCache and do a fresh parse of pages from the Content.' );
		$this->addOption( 'skipParse',
			'Skip parsing the page. This is really only good for running the second half ' .
			'of the two phase index build. If this is specified then the default batch size ' .
			'is actually 50.' );
		$this->addOption( 'skipLinks',
			'Skip looking for links to the page (counting and finding redirects). Use ' .
			'this with --indexOnSkip for the first half of the two phase index build.' );
		$this->addOption( 'namespace', 'Only index pages in this given namespace', false, true );
		$this->addOption( 'excludeContentTypes', 'Exclude pages of the specified content types. ' .
			'These must be a comma separated list of strings such as "wikitext" or "json" ' .
			'matching the CONTENT_MODEL_* constants.', false, true, false );
		$this->addOption( 'useDbIndex',
			'Use specific index when fetching IDs from the database.', false, true, false );
	}
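
	// Illustrative invocations of the modes described above (script path and values
	// are assumptions, not taken from this file):
	//
	//   php ForceSearchIndex.php --fromId 0 --toId 500000              # efficient id-based pass
	//   php ForceSearchIndex.php --from 2001-01-01T00:00:00Z --queue   # date-based, follows redirects
	//   php ForceSearchIndex.php --ids 5,105,73                        # specific pages only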

	public function execute() {
		$this->disablePoolCountersAndLogging();
		$wiki = sprintf( "[%20s]", WikiMap::getCurrentWikiId() );

		// Make sure we've actually got indices to populate
		if ( !$this->simpleCheckIndexes() ) {
			$this->fatalError(
				"$wiki index(es) do not exist. Did you forget to run updateSearchIndexConfig?"
			);
		}

		$this->indexUpdates = !$this->getOption( 'deletes', false );
		// We need to check the ids option early; otherwise hasOption may return
		// true even if the user did not set the option on the command line.
		if ( $this->hasOption( 'ids' ) ) {
			$this->runWithIds = true;
			$this->pageIds = $this->buildPageIdBatches();
		}

		if ( $this->getOption( 'from' ) !== null || $this->getOption( 'to' ) !== null ) {
			// 0 is falsy so MWTimestamp makes that `now`. '00' is epoch 0.
			$this->fromDate = new MWTimestamp( $this->getOption( 'from', '00' ) );
			$this->toDate = new MWTimestamp( $this->getOption( 'to', false ) );
		}
		$this->toId = $this->getOption( 'toId' );
		$this->archive = (bool)$this->getOption( 'archive', false );
		if ( $this->archive ) {
			// If we're indexing only for archive, this implies deletes
			$this->indexUpdates = false;
		}
		$this->limit = $this->getOption( 'limit' );
		$buildChunks = $this->getOption( 'buildChunks' );
		if ( $buildChunks !== null ) {
			$this->buildChunks( $buildChunks );
			return null;
		}
		$this->queue = $this->getOption( 'queue' );
		$this->maxJobs = $this->getOption( 'maxJobs' )
			? intval( $this->getOption( 'maxJobs' ) )
			: null;
		$this->pauseForJobs = $this->getOption( 'pauseForJobs' ) ?
			intval( $this->getOption( 'pauseForJobs' ) ) : $this->maxJobs;
		$updateFlags = $this->buildUpdateFlags();

		if ( !$this->getOption( 'batch-size' ) &&
			( $this->getOption( 'queue' ) || !$this->indexUpdates )
		) {
			$this->setBatchSize( 100 );
		}

		$this->namespace = $this->hasOption( 'namespace' ) ?
			intval( $this->getOption( 'namespace' ) ) : null;

		$this->excludeContentTypes = array_filter( array_map(
			'trim',
			explode( ',', $this->getOption( 'excludeContentTypes', '' ) )
		) );

		$operationName = $this->indexUpdates
			? ( $this->queue ? 'Queued' : 'Indexed' )
			: ( $this->archive ? 'Archived' : 'Deleted' );

		$operationStartTime = microtime( true );
		$completed = 0;
		$rate = 0;

		if ( $this->runWithIds ) {
			$it = $this->getIdsIterator();
			// @phan-suppress-next-line PhanImpossibleTypeComparison
		} elseif ( $this->indexUpdates && $this->fromDate === null ) {
			$it = $this->getUpdatesByIdIterator();
		} elseif ( $this->indexUpdates ) {
			$it = $this->getUpdatesByDateIterator();
		} else {
			$it = $this->getDeletesIterator();
		}
		$jobQueueGroup = MediaWikiServices::getInstance()->getJobQueueGroup();

		foreach ( $it as $batch ) {
			if ( $this->indexUpdates ) {
				$size = count( $batch['updates'] );
				$updates = array_filter( $batch['updates'] );
				if ( $this->queue ) {
					$this->waitForQueueToShrink( $wiki );
					$jobQueueGroup->push( Job\MassIndex::build(
						$updates, $updateFlags, $this->getOption( 'cluster' )
					) );
				} else {
					// Update size with the actual number of updated documents.
					$updater = $this->createUpdater();
					$size = $updater->updatePages( $updates, $updateFlags );
				}
			} else {
				$size = count( $batch['titlesToDelete'] );
				$updater = $this->createUpdater();
				$updater->archivePages( $batch['archive'] );
				if ( !$this->archive ) {
					$updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] );
				}
			}

			$completed += $size;
			$rate = $this->calculateIndexingRate( $completed, $operationStartTime );

			$this->output(
				"$wiki $operationName $size pages ending at {$batch['endingAt']} at $rate/second\n"
			);
			if ( $this->limit !== null && $completed > $this->limit ) {
				break;
			}
		}
		$this->output( "$operationName a total of {$completed} pages at $rate/second\n" );
		$this->waitForQueueToDrain( $wiki );

		return true;
	}

	private function buildPageIdBatches() {
		if ( !$this->indexUpdates || $this->hasOption( 'limit' )
			|| $this->hasOption( 'from' ) || $this->hasOption( 'to' )
			|| $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' )
		) {
			$this->fatalError(
				'--ids cannot be used with deletes/archive/from/to/fromId/toId/limit'
			);
		}

		$pageIds = array_map(
			function ( $pageId ) {
				$pageId = trim( $pageId );
				if ( !ctype_digit( $pageId ) ) {
					$this->fatalError( "Invalid page id provided in --ids, got '$pageId', " .
						"expected a positive integer" );
				}
				return intval( $pageId );
			},
			explode( ',', $this->getOption( 'ids' ) )
		);
		return array_unique( $pageIds, SORT_REGULAR );
	}
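
	// Worked example of the parsing above (input is illustrative): --ids "9, 4,4,7"
	// trims each entry and deduplicates, yielding [ 9, 4, 7 ]; a non-digit entry
	// such as "4x" aborts with the fatal error message above.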

	private function buildUpdateFlags() {
		$updateFlags = 0;
		if ( $this->getOption( 'indexOnSkip' ) ) {
			$updateFlags |= BuildDocument::INDEX_ON_SKIP;
		}
		if ( $this->getOption( 'skipParse' ) ) {
			$updateFlags |= BuildDocument::SKIP_PARSE;
			if ( !$this->getOption( 'batch-size' ) ) {
				$this->setBatchSize( 50 );
			}
		}
		if ( $this->getOption( 'skipLinks' ) ) {
			$updateFlags |= BuildDocument::SKIP_LINKS;
		}

		if ( $this->getOption( 'forceParse' ) ) {
			$updateFlags |= BuildDocument::FORCE_PARSE;
		}

		return $updateFlags;
	}
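
	// A sketch of how the flags combine (a bitwise OR of BuildDocument constants):
	// passing --skipLinks --indexOnSkip yields BuildDocument::SKIP_LINKS |
	// BuildDocument::INDEX_ON_SKIP, the combination the --skipLinks help text above
	// suggests for the first pass of the two phase index build.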

	private function waitForQueueToShrink( $wiki ) {
		$now = microtime( true );
		if ( $now - $this->lastJobQueueCheckTime <=
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS
		) {
			return;
		}

		$this->lastJobQueueCheckTime = $now;
		$queueSize = $this->getUpdatesInQueue();
		if ( $this->maxJobs === null || $this->maxJobs >= $queueSize ) {
			return;
		}

		do {
			$this->output(
				"$wiki Waiting while job queue shrinks: $this->pauseForJobs > $queueSize\n"
			);
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
			$queueSize = $this->getUpdatesInQueue();
		} while ( $this->pauseForJobs < $queueSize );
	}
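
	// Worked example of the hysteresis above (numbers are illustrative): with
	// --maxJobs 10000 --pauseForJobs 5000, pushing pauses once the queue exceeds
	// 10000 jobs and resumes only after it drains to 5000 or fewer.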

	private function waitForQueueToDrain( $wiki ) {
		if ( !$this->queue ) {
			return;
		}

		$lastQueueSizeForOurJob = PHP_INT_MAX;
		$waitStartTime = microtime( true );
		$this->output( "Waiting for jobs to drain from the queue\n" );
		while ( true ) {
			$queueSizeForOurJob = $this->getUpdatesInQueue();
			if ( $queueSizeForOurJob === 0 ) {
				return;
			}
			// If the queue grew since the last recorded size, another script is
			// probably adding jobs; let that script do the waiting instead.
			if ( $queueSizeForOurJob > $lastQueueSizeForOurJob ) {
				$this->output( "Queue size went up. Another script is likely adding jobs " .
					"and it'll wait for them to empty.\n" );
				return;
			}
			if ( microtime( true ) - $waitStartTime > 120 ) {
				// Wait at least two full minutes before we start comparing queue sizes.
				// Less than that and we might be seeing lag from Redis's counts.
				$lastQueueSizeForOurJob = $queueSizeForOurJob;
			}
			$this->output( "$wiki $queueSizeForOurJob jobs left on the queue.\n" );
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
		}
	}

	/**
	 * @param int $completed
	 * @param float $operationStartTime
	 *
	 * @return float
	 */
	private function calculateIndexingRate( $completed, $operationStartTime ) {
		$rate = $completed / ( microtime( true ) - $operationStartTime );

		if ( $rate < 1 ) {
			return round( $rate, 1 );
		}

		return round( $rate );
	}

	/**
	 * Do some simple sanity checking to make sure we've got indexes to populate.
	 * Note this isn't nearly as robust as updateSearchIndexConfig is, but it's
	 * not designed to be.
	 *
	 * @return bool
	 */
	private function simpleCheckIndexes() {
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );

		// Top-level alias needs to exist
		if ( !$this->getConnection()->getIndex( $indexBaseName )->exists() ) {
			return false;
		}

		// Now check all index types to see if they exist
		foreach ( $this->getConnection()->getAllIndexSuffixes() as $indexSuffix ) {
			// If the alias for this type doesn't exist, fail
			if ( !$this->getConnection()->getIndex( $indexBaseName, $indexSuffix )->exists() ) {
				return false;
			}
		}

		return true;
	}

	protected function getDeletesIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$it = new BatchRowIterator(
			$dbr,
			'logging',
			[ 'log_timestamp' ],
			$this->getBatchSize()
		);

		$this->attachPageConditions( $dbr, $it, 'log' );
		$this->attachTimestampConditions( $dbr, $it, 'log' );
		$it->addConditions( [
			'log_type' => 'delete',
			'log_action' => 'delete',
			'EXISTS(select * from archive where ar_title = log_title and ar_namespace = log_namespace)',
			// Prior to 2010 the logging table contains nulls. As the docs in elasticsearch use the page id
			// as the document id we cannot handle these old rows.
			'log_page IS NOT NULL',
		] );

		$it->setFetchColumns( [ 'log_timestamp', 'log_namespace', 'log_title', 'log_page' ] );

		$it->setCaller( __METHOD__ );

		return new CallbackIterator( $it, function ( $batch ) {
			$titlesToDelete = [];
			$docIdsToDelete = [];
			$archive = [];
			foreach ( $batch as $row ) {
				$title = Title::makeTitle( $row->log_namespace, $row->log_title );
				$id = $this->getSearchConfig()->makeId( $row->log_page );
				$titlesToDelete[] = $title;
				$docIdsToDelete[] = $id;
				$archive[] = [
					'title' => $title,
					'page' => $id,
				];
			}

			return [
				'titlesToDelete' => $titlesToDelete,
				'docIdsToDelete' => $docIdsToDelete,
				'archive' => $archive,
				'endingAt' => isset( $row )
					? ( new MWTimestamp( $row->log_timestamp ) )->getTimestamp( TS_ISO_8601 )
					: 'unknown',
			];
		} );
	}

	protected function getIdsIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->addConditions( [ 'page_id' => $this->pageIds ] );
		$it->setCaller( __METHOD__ );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	protected function getUpdatesByDateIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator(
			$dbr,
			array_merge( $pageQuery['tables'], [ 'revision' ] ),
			[ 'rev_timestamp', 'page_id' ],
			$this->getBatchSize()
		);
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->addJoinConditions( [
			'revision' => [ 'JOIN', [ 'rev_page = page_id', 'rev_id = page_latest' ] ]
		] );
		$it->setCaller( __METHOD__ );

		$this->attachTimestampConditions( $dbr, $it, 'rev' );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'rev_timestamp' );
	}

	protected function getUpdatesByIdIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->setCaller( __METHOD__ );
		$fromId = $this->getOption( 'fromId', 0 );
		if ( $fromId > 0 ) {
			$it->addConditions( [
				'page_id >= ' . $dbr->addQuotes( $fromId ),
			] );
		}
		if ( $this->toId ) {
			$it->addConditions( [
				'page_id <= ' . $dbr->addQuotes( $this->toId ),
			] );
		}

		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	private function attachTimestampConditions(
		IDatabase $dbr, BatchRowIterator $it, $columnPrefix
	) {
		// When initializing we guarantee that if either fromDate or toDate are provided
		// the other has a sane default value.
		if ( $this->fromDate !== null ) {
			$it->addConditions( [
				"{$columnPrefix}_timestamp >= " .
					$dbr->addQuotes( $dbr->timestamp( $this->fromDate ) ),
				"{$columnPrefix}_timestamp <= " .
					$dbr->addQuotes( $dbr->timestamp( $this->toDate ) ),
			] );
		}
	}

	private function attachPageConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		if ( $this->namespace !== null ) {
			$it->addConditions( [
				"{$columnPrefix}_namespace" => $this->namespace,
			] );
		}
		if ( $this->excludeContentTypes ) {
			$list = $dbr->makeList( $this->excludeContentTypes, LIST_COMMA );
			$it->addConditions( [
				"{$columnPrefix}_content_model NOT IN ($list)",
			] );
		}
		if ( $this->hasOption( 'useDbIndex' ) ) {
			$index = $this->getOption( 'useDbIndex' );
			$it->addOptions( [ 'USE INDEX' => $index ] );
		}
	}

	/**
	 * @param BatchRowIterator $it
	 * @param string $endingAtColumn
	 * @return CallbackIterator
	 */
	private function wrapDecodeResults( BatchRowIterator $it, $endingAtColumn ) {
		return new CallbackIterator( $it, function ( $batch ) use ( $endingAtColumn ) {
			// Build the updater outside the loop because it caches the redirects it hits.
			// Don't build it at the top level, so that cache is freed after each batch.
			$updater = $this->createUpdater();

			$pages = [];
			$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
			foreach ( $batch as $row ) {
				// No need to call Updater::traceRedirects here; we know this is a valid
				// page because it is in the database.
				$page = $wikiPageFactory->newFromRow( $row, WikiPage::READ_LATEST );

				// null pages still get attached to keep the counts the same. They will be filtered
				// later on.
				$pages[] = $this->decidePage( $updater, $page );
			}

			if ( isset( $row ) ) {
				if ( $endingAtColumn === 'rev_timestamp' ) {
					$ts = new MWTimestamp( $row->rev_timestamp );
					$endingAt = $ts->getTimestamp( TS_ISO_8601 );
				} elseif ( $endingAtColumn === 'page_id' ) {
					$endingAt = $row->page_id;
				} else {
					throw new \MWException( 'Unknown $endingAtColumn: ' . $endingAtColumn );
				}
			} else {
				$endingAt = 'unknown';
			}

			return [
				'updates' => $pages,
				'endingAt' => $endingAt,
			];
		} );
	}

	/**
	 * Determine the actual page in the index that needs to be updated, based on a
	 * source page.
	 *
	 * @param Updater $updater
	 * @param WikiPage $page
	 * @return WikiPage|null WikiPage to be updated, or null if none.
	 */
	private function decidePage( Updater $updater, WikiPage $page ) {
		try {
			$content = $page->getContent();
		} catch ( MWException $ex ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				"Error deserializing content, skipping page: {pageId}",
				[ 'pageId' => $page->getTitle()->getArticleID() ]
			);
			return null;
		}

		if ( $content === null ) {
			// Skip pages without content. A page has no content when its latest revision,
			// as loaded by the query above, doesn't exist.
			$this->output(
				'Skipping page with no content: ' . $page->getTitle()->getArticleID() . "\n"
			);
			return null;
		}

		if ( !$content->isRedirect() ) {
			return $page;
		}

		if ( $this->toDate === null ) {
			// Looks like we accidentally picked up a redirect when we were indexing by id and thus
			// trying to ignore redirects! Just ignore it! We would filter them out at the db
			// level but that is slow for large wikis.
			return null;
		}

		// We found a redirect. Great. Since we can't index special pages and redirects to special
		// pages are totally possible, as well as fun stuff like redirect loops, we need to use
		// Updater's redirect tracing logic which is very complete. Also, it returns null on
		// self redirects. Great!
		list( $page, ) = $updater->traceRedirects( $page->getTitle() );

		if ( $page != null &&
			Title::makeTitleSafe( $page->getTitle()->getNamespace(), $page->getTitle()->getText() ) === null
		) {
			// The title cannot be rebuilt from its ns_prefix + text. This happens when an
			// invalid title is present in the DB; we prefer not to index those pages as
			// they are hardly viewable.
			$this->output(
				'Skipping page with invalid title: ' . $page->getTitle()->getPrefixedText() . "\n"
			);
			return null;
		}

		return $page;
	}

	/**
	 * @param string|int $buildChunks If specified as a number then chunks no
	 *  larger than that size are spat out. If specified as a number followed
	 *  by the word "total" without a space between them then that many chunks
	 *  will be spat out sized to cover the entire wiki.
	 */
	private function buildChunks( $buildChunks ) {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		if ( $this->toId === null ) {
			$this->toId = $dbr->selectField( 'page', 'MAX(page_id)', [], __METHOD__ );
			if ( $this->toId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		$fromId = $this->getOption( 'fromId' );
		if ( $fromId === null ) {
			$fromId = $dbr->selectField( 'page', 'MIN(page_id) - 1', [], __METHOD__ );
			if ( $fromId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		if ( $fromId === $this->toId ) {
			$this->fatalError(
				"Couldn't find any pages to index: fromId ($fromId) equals toId ({$this->toId})."
			);
		}
		$builder = new \CirrusSearch\Maintenance\ChunkBuilder();
		$builder->build( $this->mSelf, $this->mOptions, $buildChunks, $fromId, $this->toId );
	}
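
	// Shape of the two --buildChunks formats per the docblock above (exact command
	// text comes from ChunkBuilder and is an assumption here): "--buildChunks 50000"
	// yields one command per 50000-id chunk between fromId and toId, while
	// "--buildChunks 10total" yields exactly 10 chunks sized to span the whole
	// page_id range.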

	/**
	 * Get the number of cirrusSearchMassIndex jobs in the queue.
	 * @return int length
	 */
	private function getUpdatesInQueue() {
		return MediaWikiServices::getInstance()->getJobQueueGroup()
			->get( 'cirrusSearchMassIndex' )->getSize();
	}

	/**
	 * @return Updater
	 */
	private function createUpdater() {
		return Updater::build( $this->getSearchConfig(), $this->getOption( 'cluster', null ) );
	}
}

$maintClass = ForceSearchIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;