Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 372 |
|
0.00% |
0 / 19 |
CRAP | |
0.00% |
0 / 1 |
ForceSearchIndex | |
0.00% |
0 / 365 |
|
0.00% |
0 / 19 |
8190 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 78 |
|
0.00% |
0 / 1 |
702 | |||
buildPageIdBatches | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
72 | |||
buildUpdateFlags | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
waitForQueueToShrink | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
waitForQueueToDrain | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
42 | |||
calculateIndexingRate | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
simpleCheckIndexes | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
getDeletesIterator | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
12 | |||
getIdsIterator | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getUpdatesByDateIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
getUpdatesByIdIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
12 | |||
attachTimestampConditions | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
attachPageConditions | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
wrapDecodeResults | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
decidePage | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
56 | |||
buildChunks | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
42 | |||
getUpdatesInQueue | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
createUpdater | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use BatchRowIterator; |
6 | use CirrusSearch\BuildDocument\BuildDocument; |
7 | use CirrusSearch\Iterator\CallbackIterator; |
8 | use CirrusSearch\Job; |
9 | use CirrusSearch\SearchConfig; |
10 | use CirrusSearch\Updater; |
11 | use IDBAccessObject; |
12 | use MediaWiki\Logger\LoggerFactory; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Title\Title; |
15 | use MediaWiki\Utils\MWTimestamp; |
16 | use MediaWiki\WikiMap\WikiMap; |
17 | use Throwable; |
18 | use UnexpectedValueException; |
19 | use Wikimedia\Rdbms\IDatabase; |
20 | use WikiPage; |
21 | |
22 | /** |
23 | * Force reindexing of changes to the wiki. |
24 | * |
25 | * This program is free software; you can redistribute it and/or modify |
26 | * it under the terms of the GNU General Public License as published by |
27 | * the Free Software Foundation; either version 2 of the License, or |
28 | * (at your option) any later version. |
29 | * |
30 | * This program is distributed in the hope that it will be useful, |
31 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
32 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
33 | * GNU General Public License for more details. |
34 | * |
35 | * You should have received a copy of the GNU General Public License along |
36 | * with this program; if not, write to the Free Software Foundation, Inc., |
37 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
38 | * http://www.gnu.org/copyleft/gpl.html |
39 | */ |
40 | |
// Resolve the MediaWiki install path: prefer the MW_INSTALL_PATH environment
// variable and fall back to three directories above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = $IP === false ? __DIR__ . '/../../..' : $IP;

// Load the core maintenance base first, then the CirrusSearch-specific one
// that ForceSearchIndex extends.
require_once $IP . '/maintenance/Maintenance.php';
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
47 | |
/**
 * Maintenance script that pushes pages (or page deletions) into the search
 * index, either in process or through the job queue.
 *
 * Pages are selected by explicit id list (--ids), by page id range
 * (--fromId/--toId, the default), by latest revision timestamp (--from/--to)
 * or, for --deletes/--archive, from the deletion log.
 */
class ForceSearchIndex extends Maintenance {
	/** Seconds to wait between two look-ups of the job queue length. */
	private const SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS = 3;
	/** @var MWTimestamp|null lower timestamp bound, only set when --from or --to is given */
	public $fromDate = null;
	/** @var MWTimestamp|null upper timestamp bound, only set when --from or --to is given */
	public $toDate = null;
	/** @var int|string|null value of --toId, or MAX(page_id) when building chunks */
	public $toId = null;
	/** @var bool true when indexing updates, false when processing deletes/archive */
	public $indexUpdates;
	/** @var bool true when run with --archive */
	public $archive;
	/** @var int|string|null value of --limit, null means unlimited */
	public $limit;
	/** @var mixed value of --queue; truthy means updates are pushed to the job queue */
	public $queue;
	/** @var int|null value of --maxJobs */
	public $maxJobs;
	/** @var int|null value of --pauseForJobs, defaults to $maxJobs */
	public $pauseForJobs;
	/** @var int|null value of --namespace */
	public $namespace;
	/** @var string[] */
	public $excludeContentTypes;
	/** @var float|int microtime() of the last job queue length check */
	public $lastJobQueueCheckTime = 0;

	/**
	 * @var bool true if the script is run with --ids
	 */
	private $runWithIds;

	/**
	 * @var int[] list of page ids to reindex when --ids is used
	 */
	private $pageIds;

	/**
	 * Declare the script description, the default batch size and all
	 * command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( "Force indexing some pages. Setting --from or --to will switch "
			. "from page id based indexing to "
			. "date based indexing which uses less efficient queries and follows redirects.\n\n"
			. "Note: All froms are _exclusive_ and all tos are _inclusive_.\n"
			. "Note 2: Setting fromId and toId use the efficient query so those are ok.\n"
			. "Note 3: Operates on all clusters unless --cluster is provided.\n"
		);
		$this->setBatchSize( 10 );
		$this->addOption( 'from', 'Start date of reindex in YYYY-mm-ddTHH:mm:ssZ (exc. Defaults ' .
			'to 0 epoch.', false, true );
		$this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ. Defaults to now.',
			false, true );
		$this->addOption( 'fromId', 'Start indexing at a specific page_id. ' .
			'Not useful with --deletes.', false, true );
		$this->addOption( 'toId', 'Stop indexing at a specific page_id. ' .
			'Not useful with --deletes or --from or --to.', false, true );
		$this->addOption( 'ids', 'List of page ids (comma separated) to reindex. ' .
			'Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
		$this->addOption( 'deletes',
			'If this is set then just index deletes, not updates or creates.', false );
		$this->addOption( 'archive',
			'Don\'t delete pages, only index them into the archive.', false, false );
		$this->addOption( 'limit',
			'Maximum number of pages to process before exiting the script. Default to unlimited.',
			false, true );
		$this->addOption( 'buildChunks', 'Instead of running the script spit out commands that ' .
			'can be farmed out to different processes or machines to rebuild the index. Works ' .
			'with fromId and toId, not from and to. If specified as a number then chunks no ' .
			'larger than that size are spat out. If specified as a number followed by the word ' .
			'"total" without a space between them then that many chunks will be spat out sized ' .
			'to cover the entire wiki.', false, true );
		$this->addOption( 'queue', 'Rather than perform the indexes in process add them to the ' .
			'job queue. Ignored for delete.' );
		$this->addOption( 'maxJobs', 'If there are more than this many index jobs in the queue ' .
			'then pause before adding more. This is only checked every ' .
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS .
			' seconds. Not meaningful without --queue.', false, true );
		$this->addOption( 'pauseForJobs', 'If paused adding jobs then wait for there to be less ' .
			'than this many before starting again. Defaults to the value specified for ' .
			'--maxJobs. Not meaningful without --queue.', false, true );
		$this->addOption( 'indexOnSkip', 'When skipping either parsing or links send the document' .
			' as an index. This replaces the contents of the index for that entry with the entry' .
			' built from a skipped process. Without this if the entry does not exist then it will' .
			' be skipped entirely. Only set this when running the first pass of building the' .
			' index. Otherwise, don\'t tempt fate by indexing half complete documents.' );
		$this->addOption( 'forceParse', '(deprecated)' );
		$this->addOption( 'skipParse',
			'Skip parsing the page. This is really only good for running the second half ' .
			'of the two phase index build. If this is specified then the default batch size ' .
			'is actually 50.' );
		$this->addOption( 'skipLinks',
			'Skip looking for links to the page (counting and finding redirects). Use ' .
			'this with --indexOnSkip for the first half of the two phase index build.' );
		$this->addOption( 'namespace', 'Only index pages in this given namespace', false, true );
		$this->addOption( 'excludeContentTypes', 'Exclude pages of the specified content types. ' .
			'These must be a comma separated list of strings such as "wikitext" or "json" ' .
			'matching the CONTENT_MODEL_* constants.', false, true, false );
		$this->addOption( 'useDbIndex',
			'Use specific index when fetching IDs from the database.', false, true, false );
	}

	/**
	 * Read the options, pick the matching batch iterator and index, queue,
	 * archive or delete every batch it yields.
	 *
	 * @return bool|null null when only chunk commands were printed
	 *  (--buildChunks), true after a regular run.
	 */
	public function execute() {
		$this->disablePoolCountersAndLogging();
		$wiki = sprintf( "[%20s]", WikiMap::getCurrentWikiId() );

		// Make sure we've actually got indices to populate
		if ( !$this->simpleCheckIndexes() ) {
			$this->fatalError(
				"$wiki index(es) do not exist. Did you forget to run updateSearchIndexConfig?"
			);
		}

		$this->indexUpdates = !$this->getOption( 'deletes', false );
		// We need to check ids options early otherwise hasOption may return
		// true even if the user did not set the option on the commandline
		if ( $this->hasOption( 'ids' ) ) {
			$this->runWithIds = true;
			$this->pageIds = $this->buildPageIdBatches();
		}

		if ( $this->getOption( 'from' ) !== null || $this->getOption( 'to' ) !== null ) {
			// 0 is falsy so MWTimestamp makes that `now`. '00' is epoch 0.
			$this->fromDate = new MWTimestamp( $this->getOption( 'from', '00' ) );
			$this->toDate = new MWTimestamp( $this->getOption( 'to', false ) );
		}
		$this->toId = $this->getOption( 'toId' );
		$this->archive = (bool)$this->getOption( 'archive', false );
		if ( $this->archive ) {
			// If we're indexing only for archive, this implies deletes
			$this->indexUpdates = false;
		}
		$this->limit = $this->getOption( 'limit' );
		$buildChunks = $this->getOption( 'buildChunks' );
		if ( $buildChunks !== null ) {
			$this->buildChunks( $buildChunks );
			return null;
		}
		$this->queue = $this->getOption( 'queue' );
		$this->maxJobs = $this->getOption( 'maxJobs' )
			? intval( $this->getOption( 'maxJobs' ) )
			: null;
		$this->pauseForJobs = $this->getOption( 'pauseForJobs' ) ?
			intval( $this->getOption( 'pauseForJobs' ) ) : $this->maxJobs;
		$updateFlags = $this->buildUpdateFlags();

		// Queueing and deleting are cheap per page, so use bigger batches
		// unless the user chose a batch size explicitly.
		if ( !$this->getOption( 'batch-size' ) &&
			( $this->getOption( 'queue' ) || !$this->indexUpdates )
		) {
			$this->setBatchSize( 100 );
		}

		$this->namespace = $this->hasOption( 'namespace' ) ?
			intval( $this->getOption( 'namespace' ) ) : null;

		$this->excludeContentTypes = array_filter( array_map(
			'trim',
			explode( ',', $this->getOption( 'excludeContentTypes', '' ) )
		) );

		$operationName = $this->indexUpdates
			? ( $this->queue ? 'Queued' : 'Indexed' )
			: ( $this->archive ? 'Archived' : 'Deleted' );

		$operationStartTime = microtime( true );
		$completed = 0;
		$rate = 0;

		if ( $this->runWithIds ) {
			$it = $this->getIdsIterator();
			// @phan-suppress-next-line PhanImpossibleTypeComparison
		} elseif ( $this->indexUpdates && $this->fromDate === null ) {
			$it = $this->getUpdatesByIdIterator();
		} elseif ( $this->indexUpdates ) {
			$it = $this->getUpdatesByDateIterator();
		} else {
			$it = $this->getDeletesIterator();
		}
		$jobQueueGroup = MediaWikiServices::getInstance()->getJobQueueGroup();

		foreach ( $it as $batch ) {
			if ( $this->indexUpdates ) {
				// Count skipped (null) pages too; they are filtered before sending.
				$size = count( $batch['updates'] );
				$updates = array_filter( $batch['updates'] );
				if ( $this->queue ) {
					$this->waitForQueueToShrink( $wiki );
					$jobQueueGroup->push( Job\MassIndex::build(
						$updates, $updateFlags, $this->getOption( 'cluster' )
					) );
				} else {
					// Update size with the actual number of updated documents.
					$updater = $this->createUpdater();
					$size = $updater->updatePages( $updates, $updateFlags );
				}
			} else {
				$size = count( $batch['titlesToDelete'] );
				$updater = $this->createUpdater();
				$updater->archivePages( $batch['archive'] );
				if ( !$this->archive ) {
					$updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] );
				}
			}

			$completed += $size;
			$rate = $this->calculateIndexingRate( $completed, $operationStartTime );

			$this->output(
				"$wiki $operationName $size pages ending at {$batch['endingAt']} at $rate/second\n"
			);
			// --limit is a maximum: stop as soon as it is reached, not only
			// once it has been exceeded. The check runs per batch, so the
			// last batch may still overshoot.
			if ( $this->limit !== null && $completed >= (int)$this->limit ) {
				break;
			}
		}
		$this->output( "$operationName a total of {$completed} pages at $rate/second\n" );
		$this->waitForQueueToDrain( $wiki );

		return true;
	}

	/**
	 * Validate and parse the comma separated --ids option.
	 *
	 * Aborts via fatalError() when --ids is combined with an incompatible
	 * option or when an entry is not made of digits only.
	 *
	 * @return int[] de-duplicated page ids (original array keys preserved)
	 */
	private function buildPageIdBatches() {
		// --archive is tested with hasOption() because $this->indexUpdates
		// only reflects --deletes at the point this method is called.
		if ( !$this->indexUpdates || $this->hasOption( 'archive' )
			|| $this->hasOption( 'limit' )
			|| $this->hasOption( 'from' ) || $this->hasOption( 'to' )
			|| $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' )
		) {
			$this->fatalError(
				'--ids cannot be used with deletes/archive/from/to/fromId/toId/limit'
			);
		}

		$pageIds = array_map(
			function ( $pageId ) {
				$pageId = trim( $pageId );
				if ( !ctype_digit( $pageId ) ) {
					$this->fatalError( "Invalid page id provided in --ids, got '$pageId', " .
						"expected a positive integer" );
				}
				return intval( $pageId );
			},
			explode( ',', $this->getOption( 'ids' ) )
		);
		return array_unique( $pageIds, SORT_REGULAR );
	}

	/**
	 * Translate --indexOnSkip, --skipParse and --skipLinks into the
	 * BuildDocument flag bitmask.
	 *
	 * Side effect: with --skipParse and no explicit --batch-size the batch
	 * size is set to 50.
	 *
	 * @return int
	 */
	private function buildUpdateFlags() {
		$updateFlags = 0;
		if ( $this->getOption( 'indexOnSkip' ) ) {
			$updateFlags |= BuildDocument::INDEX_ON_SKIP;
		}
		if ( $this->getOption( 'skipParse' ) ) {
			$updateFlags |= BuildDocument::SKIP_PARSE;
			if ( !$this->getOption( 'batch-size' ) ) {
				$this->setBatchSize( 50 );
			}
		}
		if ( $this->getOption( 'skipLinks' ) ) {
			$updateFlags |= BuildDocument::SKIP_LINKS;
		}

		return $updateFlags;
	}

	/**
	 * Block while the job queue holds more than --maxJobs jobs, until it is
	 * down to --pauseForJobs or fewer.
	 *
	 * The queue length is looked up at most once every
	 * SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS seconds; calls in between
	 * return immediately.
	 *
	 * @param string $wiki wiki id prefix used in output lines
	 */
	private function waitForQueueToShrink( $wiki ) {
		$now = microtime( true );
		$sinceLastCheck = $now - $this->lastJobQueueCheckTime;
		if ( $sinceLastCheck <= self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS ) {
			return;
		}

		$this->lastJobQueueCheckTime = $now;
		$queueSize = $this->getUpdatesInQueue();
		if ( $this->maxJobs === null || $this->maxJobs >= $queueSize ) {
			return;
		}

		do {
			$this->output(
				"$wiki Waiting while job queue shrinks: $this->pauseForJobs > $queueSize\n"
			);
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
			$queueSize = $this->getUpdatesInQueue();
		} while ( $this->pauseForJobs < $queueSize );
	}

	/**
	 * When running with --queue, wait until the mass index job queue is empty.
	 *
	 * Gives up early if the queue is seen growing, since that means
	 * something else keeps adding jobs.
	 *
	 * @param string $wiki wiki id prefix used in output lines
	 */
	private function waitForQueueToDrain( $wiki ) {
		if ( !$this->queue ) {
			return;
		}

		$lastQueueSizeForOurJob = PHP_INT_MAX;
		$waitStartTime = microtime( true );
		$this->output( "Waiting for jobs to drain from the queue\n" );
		while ( true ) {
			$queueSizeForOurJob = $this->getUpdatesInQueue();
			if ( $queueSizeForOurJob === 0 ) {
				return;
			}
			// A queue bigger than the last recorded size means somebody else
			// is adding jobs; waiting for those is their business, not ours.
			if ( $queueSizeForOurJob > $lastQueueSizeForOurJob ) {
				$this->output( "Queue size went up. Another script is likely adding jobs " .
					"and it'll wait for them to empty.\n" );
				return;
			}
			if ( microtime( true ) - $waitStartTime > 120 ) {
				// Wait at least two full minutes before we check if the job count went down.
				// Less than that and we might be seeing lag from redis's counts.
				$lastQueueSizeForOurJob = $queueSizeForOurJob;
			}
			$this->output( "$wiki $queueSizeForOurJob jobs left on the queue.\n" );
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
		}
	}

	/**
	 * Pages per second since $operationStartTime, rounded to one decimal
	 * below 1/second and to a whole number otherwise.
	 *
	 * @param int $completed
	 * @param double $operationStartTime
	 *
	 * @return double 0.0 when no time has elapsed
	 */
	private function calculateIndexingRate( $completed, $operationStartTime ) {
		$elapsed = microtime( true ) - $operationStartTime;
		if ( $elapsed <= 0 ) {
			// Avoid a DivisionByZeroError when the clock has not advanced.
			return 0.0;
		}

		$rate = $completed / $elapsed;

		return $rate < 1 ? round( $rate, 1 ) : round( $rate );
	}

	/**
	 * Do some simple sanity checking to make sure we've got indexes to populate.
	 * Note this isn't nearly as robust as updateSearchIndexConfig is, but it's
	 * not designed to be.
	 *
	 * @return bool
	 */
	private function simpleCheckIndexes() {
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );

		// Top-level alias needs to exist
		if ( !$this->getConnection()->getIndex( $indexBaseName )->exists() ) {
			return false;
		}

		// Now check all index types to see if they exist
		foreach ( $this->getConnection()->getAllIndexSuffixes() as $indexSuffix ) {
			// If the alias for this type doesn't exist, fail
			if ( !$this->getConnection()->getIndex( $indexBaseName, $indexSuffix )->exists() ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Iterate deletion log entries in batches ordered by log_timestamp.
	 *
	 * @return CallbackIterator yields arrays with the keys 'titlesToDelete',
	 *  'docIdsToDelete', 'archive' and 'endingAt'
	 */
	protected function getDeletesIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$it = new BatchRowIterator(
			$dbr,
			'logging',
			[ 'log_timestamp' ],
			$this->getBatchSize()
		);

		$this->attachPageConditions( $dbr, $it, 'log' );
		$this->attachTimestampConditions( $dbr, $it, 'log' );
		$it->addConditions( [
			'log_type' => 'delete',
			'log_action' => 'delete',
			'EXISTS(select * from archive where ar_title = log_title and ar_namespace = log_namespace)',
			// Prior to 2010 the logging table contains nulls. As the docs in elasticsearch use the page id
			// as the document id we cannot handle these old rows.
			$dbr->expr( 'log_page', '!=', null ),
		] );

		$it->setFetchColumns( [ 'log_timestamp', 'log_namespace', 'log_title', 'log_page' ] );

		$it->setCaller( __METHOD__ );

		return new CallbackIterator( $it, function ( $batch ) {
			$titlesToDelete = [];
			$docIdsToDelete = [];
			$archive = [];
			foreach ( $batch as $row ) {
				$title = Title::makeTitle( $row->log_namespace, $row->log_title );
				$id = $this->getSearchConfig()->makeId( $row->log_page );
				$titlesToDelete[] = $title;
				$docIdsToDelete[] = $id;
				$archive[] = [
					'title' => $title,
					'page' => $id,
				];
			}

			return [
				'titlesToDelete' => $titlesToDelete,
				'docIdsToDelete' => $docIdsToDelete,
				'archive' => $archive,
				// $row is the last row of the batch, unset for an empty batch
				'endingAt' => isset( $row )
					? ( new MWTimestamp( $row->log_timestamp ) )->getTimestamp( TS_ISO_8601 )
					: 'unknown',
			];
		} );
	}

	/**
	 * Iterate the pages listed in --ids, in batches ordered by page_id.
	 *
	 * @return CallbackIterator yields arrays with the keys 'updates' and 'endingAt'
	 */
	protected function getIdsIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->addConditions( [ 'page_id' => $this->pageIds ] );
		$it->setCaller( __METHOD__ );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Iterate pages by the timestamp of their latest revision, in batches
	 * ordered by (rev_timestamp, page_id).
	 *
	 * @return CallbackIterator yields arrays with the keys 'updates' and 'endingAt'
	 */
	protected function getUpdatesByDateIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator(
			$dbr,
			array_merge( $pageQuery['tables'], [ 'revision' ] ),
			[ 'rev_timestamp', 'page_id' ],
			$this->getBatchSize()
		);
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		// Only the latest revision of each page is joined in
		$it->addJoinConditions( [
			'revision' => [ 'JOIN', [ 'rev_page = page_id', 'rev_id = page_latest' ] ]
		] );
		$it->setCaller( __METHOD__ );

		$this->attachTimestampConditions( $dbr, $it, 'rev' );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'rev_timestamp' );
	}

	/**
	 * Iterate pages by page id, optionally bounded by --fromId and --toId.
	 *
	 * @return CallbackIterator yields arrays with the keys 'updates' and 'endingAt'
	 */
	protected function getUpdatesByIdIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->setCaller( __METHOD__ );
		$fromId = $this->getOption( 'fromId', 0 );
		if ( $fromId > 0 ) {
			// NOTE(review): this bound is inclusive although the script
			// description says all froms are exclusive - confirm which one
			// is intended before changing either.
			$it->addConditions( [
				$dbr->expr( 'page_id', '>=', $fromId ),
			] );
		}
		if ( $this->toId ) {
			$it->addConditions( [
				$dbr->expr( 'page_id', '<=', $this->toId ),
			] );
		}

		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Restrict the iterator to the [fromDate, toDate] timestamp range, if one
	 * was requested.
	 *
	 * @param IDatabase $dbr
	 * @param BatchRowIterator $it
	 * @param string $columnPrefix prefix of the `_timestamp` column, e.g. 'rev' or 'log'
	 */
	private function attachTimestampConditions(
		IDatabase $dbr, BatchRowIterator $it, $columnPrefix
	) {
		// When initializing we guarantee that if either fromDate or toDate are provided
		// the other has a sane default value.
		if ( $this->fromDate === null ) {
			return;
		}

		$column = "{$columnPrefix}_timestamp";
		$it->addConditions( [
			$dbr->expr( $column, '>=', $dbr->timestamp( $this->fromDate ) ),
			$dbr->expr( $column, '<=', $dbr->timestamp( $this->toDate ) ),
		] );
	}

	/**
	 * Apply the --namespace, --excludeContentTypes and --useDbIndex options
	 * to the iterator.
	 *
	 * @param IDatabase $dbr
	 * @param BatchRowIterator $it
	 * @param string $columnPrefix prefix of the `_namespace` and
	 *  `_content_model` columns, e.g. 'page' or 'log'
	 */
	private function attachPageConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		if ( $this->namespace !== null ) {
			$it->addConditions( [
				"{$columnPrefix}_namespace" => $this->namespace,
			] );
		}
		if ( $this->excludeContentTypes ) {
			$it->addConditions( [
				$dbr->expr( "{$columnPrefix}_content_model", '!=', $this->excludeContentTypes ),
			] );
		}
		if ( $this->hasOption( 'useDbIndex' ) ) {
			$index = $this->getOption( 'useDbIndex' );
			$it->addOptions( [ 'USE INDEX' => $index ] );
		}
	}

	/**
	 * Wrap a page row iterator so that each batch of rows becomes a batch of
	 * WikiPage objects (or null for skipped pages) plus an 'endingAt' marker.
	 *
	 * @param BatchRowIterator $it
	 * @param string $endingAtColumn either 'rev_timestamp' or 'page_id'
	 * @return CallbackIterator
	 */
	private function wrapDecodeResults( BatchRowIterator $it, $endingAtColumn ) {
		return new CallbackIterator( $it, function ( $batch ) use ( $endingAtColumn ) {
			// Build the updater outside the loop because it stores the redirects it hits.
			// Don't build it at the top level so those are stored when it is freed.
			$updater = $this->createUpdater();

			$pages = [];
			$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
			foreach ( $batch as $row ) {
				// No need to call Updater::traceRedirects here because we know this is a valid page
				// because it is in the database.
				$page = $wikiPageFactory->newFromRow( $row, IDBAccessObject::READ_LATEST );

				// null pages still get attached to keep the counts the same. They will be filtered
				// later on.
				$pages[] = $this->decidePage( $updater, $page );
			}

			// $row is the last row of the batch, unset for an empty batch
			if ( !isset( $row ) ) {
				$endingAt = 'unknown';
			} elseif ( $endingAtColumn === 'rev_timestamp' ) {
				$ts = new MWTimestamp( $row->rev_timestamp );
				$endingAt = $ts->getTimestamp( TS_ISO_8601 );
			} elseif ( $endingAtColumn === 'page_id' ) {
				$endingAt = $row->page_id;
			} else {
				throw new UnexpectedValueException( 'Unknown $endingAtColumn: ' . $endingAtColumn );
			}

			return [
				'updates' => $pages,
				'endingAt' => $endingAt,
			];
		} );
	}

	/**
	 * Determine the actual page in the index that needs to be updated, based on a
	 * source page.
	 *
	 * @param Updater $updater
	 * @param WikiPage $page
	 * @return WikiPage|null WikiPage to be updated, or null if none.
	 */
	private function decidePage( Updater $updater, WikiPage $page ) {
		try {
			$content = $page->getContent();
		} catch ( Throwable $ex ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				"Error deserializing content, skipping page: {pageId}",
				[
					'pageId' => $page->getTitle()->getArticleID(),
					// Keep the cause with the log entry instead of dropping it
					'exception' => $ex,
				]
			);
			return null;
		}

		if ( $content === null ) {
			// Skip pages without content. Pages have no content because their latest revision
			// as loaded by the query above doesn't exist.
			$this->output(
				'Skipping page with no content: ' . $page->getTitle()->getArticleID() . "\n"
			);
			return null;
		}

		if ( !$content->isRedirect() ) {
			return $page;
		}

		if ( $this->toDate === null ) {
			// Looks like we accidentally picked up a redirect when we were indexing by id and thus
			// trying to ignore redirects! Just ignore it! We would filter them out at the db
			// level but that is slow for large wikis.
			return null;
		}

		// We found a redirect. Great. Since we can't index special pages and redirects to special
		// pages are totally possible, as well as fun stuff like redirect loops, we need to use
		// Updater's redirect tracing logic which is very complete. Also, it returns null on
		// self redirects. Great!
		[ $page, ] = $updater->traceRedirects( $page->getTitle() );

		if ( $page instanceof WikiPage &&
			Title::makeTitleSafe( $page->getTitle()->getNamespace(), $page->getTitle()->getText() ) === null
		) {
			// The title cannot be rebuilt from its ns_prefix + text.
			// It happens if an invalid title is present in the DB
			// We may prefer to not index them as they are hardly viewable
			$this->output(
				'Skipping page with invalid title: ' . $page->getTitle()->getPrefixedText() . "\n"
			);
			return null;
		}

		return $page;
	}

	/**
	 * Print the commands that split the page id range into chunks instead of
	 * indexing anything.
	 *
	 * The range defaults to MIN(page_id) - 1 .. MAX(page_id) when --fromId or
	 * --toId are not given.
	 *
	 * @param string|int $buildChunks If specified as a number then chunks no
	 *  larger than that size are spat out. If specified as a number followed
	 *  by the word "total" without a space between them then that many chunks
	 *  will be spat out sized to cover the entire wiki.
	 */
	private function buildChunks( $buildChunks ) {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		if ( $this->toId === null ) {
			$this->toId = $dbr->selectField( 'page', 'MAX(page_id)', [], __METHOD__ );
			if ( $this->toId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		$fromId = $this->getOption( 'fromId' );
		if ( $fromId === null ) {
			$fromId = $dbr->selectField( 'page', 'MIN(page_id) - 1', [], __METHOD__ );
			if ( $fromId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		// Compare as integers: one side may be a command line string while
		// the other comes from the database, so === on the raw values can
		// miss equal ids.
		if ( (int)$fromId === (int)$this->toId ) {
			$this->fatalError(
				"Couldn't find any pages to index. fromId = $fromId = $this->toId = toId."
			);
		}
		$builder = new \CirrusSearch\Maintenance\ChunkBuilder();
		$builder->build( $this->mSelf, $this->mOptions, $buildChunks, $fromId, $this->toId );
	}

	/**
	 * Get the number of cirrusSearchMassIndex jobs in the queue.
	 * @return int length
	 */
	private function getUpdatesInQueue() {
		$jobQueueGroup = MediaWikiServices::getInstance()->getJobQueueGroup();

		return $jobQueueGroup->get( 'cirrusSearchMassIndex' )->getSize();
	}

	/**
	 * Build an Updater for the configured search config and the --cluster
	 * option (null when not given).
	 *
	 * @return Updater
	 */
	private function createUpdater() {
		return Updater::build( $this->getSearchConfig(), $this->getOption( 'cluster', null ) );
	}
}
677 | |
// Name the maintenance class for the runner, then include the file behind
// RUN_MAINTENANCE_IF_MAIN (a constant defined outside this file).
$maintClass = \CirrusSearch\Maintenance\ForceSearchIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;