Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 380 |
|
0.00% |
0 / 19 |
CRAP | |
0.00% |
0 / 1 |
ForceSearchIndex | |
0.00% |
0 / 373 |
|
0.00% |
0 / 19 |
8190 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 78 |
|
0.00% |
0 / 1 |
702 | |||
buildPageIdBatches | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
72 | |||
buildUpdateFlags | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
waitForQueueToShrink | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
waitForQueueToDrain | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
42 | |||
calculateIndexingRate | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
simpleCheckIndexes | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
getDeletesIterator | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
12 | |||
getIdsIterator | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getUpdatesByDateIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
getUpdatesByIdIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
12 | |||
attachTimestampConditions | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
attachPageConditions | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
wrapDecodeResults | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
decidePage | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
56 | |||
buildChunks | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
42 | |||
getUpdatesInQueue | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
createUpdater | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use BatchRowIterator; |
6 | use CirrusSearch\BuildDocument\BuildDocument; |
7 | use CirrusSearch\Iterator\CallbackIterator; |
8 | use CirrusSearch\Job; |
9 | use CirrusSearch\SearchConfig; |
10 | use CirrusSearch\Updater; |
11 | use IDBAccessObject; |
12 | use MediaWiki\Logger\LoggerFactory; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Title\Title; |
15 | use MediaWiki\Utils\MWTimestamp; |
16 | use MediaWiki\WikiMap\WikiMap; |
17 | use Throwable; |
18 | use UnexpectedValueException; |
19 | use Wikimedia\Rdbms\IDatabase; |
20 | use WikiPage; |
21 | |
22 | /** |
23 | * Force reindexing change to the wiki. |
24 | * |
25 | * This program is free software; you can redistribute it and/or modify |
26 | * it under the terms of the GNU General Public License as published by |
27 | * the Free Software Foundation; either version 2 of the License, or |
28 | * (at your option) any later version. |
29 | * |
30 | * This program is distributed in the hope that it will be useful, |
31 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
32 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
33 | * GNU General Public License for more details. |
34 | * |
35 | * You should have received a copy of the GNU General Public License along |
36 | * with this program; if not, write to the Free Software Foundation, Inc., |
37 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
38 | * http://www.gnu.org/copyleft/gpl.html |
39 | */ |
40 | |
// Locate the MediaWiki installation. Prefer MW_INSTALL_PATH from the
// environment; getenv() returns false (not '') when unset, so only that
// triggers the fallback to the conventional extension directory layout.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
// Core maintenance framework first, then CirrusSearch's own Maintenance base.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
47 | |
/**
 * Maintenance script that forces (re)indexing of pages into the CirrusSearch
 * search indexes. Depending on the options it indexes updates/creates (by
 * page id, id range, or date range), indexes deletes, or indexes pages into
 * the archive. Work is performed in-process or, with --queue, pushed to the
 * job queue as cirrusSearchMassIndex jobs.
 */
class ForceSearchIndex extends Maintenance {
	/** Minimum interval, in seconds, between job queue length checks. */
	private const SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS = 3;
	/** @var MWTimestamp|null Start (exclusive) of the date range, from --from */
	public $fromDate = null;
	/** @var MWTimestamp|null End (inclusive) of the date range, from --to */
	public $toDate = null;
	/** @var string|null Highest page_id to index, from --toId */
	public $toId = null;
	/** @var bool True when indexing updates/creates; false for deletes/archive */
	public $indexUpdates;
	/** @var bool True when only indexing into the archive, from --archive */
	public $archive;
	/** @var string Maximum number of pages to process, from --limit */
	public $limit;
	/** @var string Truthy when work should go to the job queue, from --queue */
	public $queue;
	/** @var int|null Queue size that pauses job insertion, from --maxJobs */
	public $maxJobs;
	/** @var int|null Queue size to shrink back to before resuming, from --pauseForJobs */
	public $pauseForJobs;
	/** @var int|null Only index pages in this namespace, from --namespace */
	public $namespace;
	/** @var string[] Content models to skip, from --excludeContentTypes */
	public $excludeContentTypes;
	/** @var float Unix timestamp of the most recent job queue length check */
	public $lastJobQueueCheckTime = 0;

	/**
	 * @var bool true if the script is run with --ids
	 */
	private $runWithIds;

	/**
	 * @var int[] list of page ids to reindex when --ids is used
	 */
	private $pageIds;

	/**
	 * Declare all command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( "Force indexing some pages. Setting --from or --to will switch "
			. "from page id based indexing to "
			. "date based indexing which uses less efficient queries and follows redirects.\n\n"
			. "Note: All froms are _exclusive_ and all tos are _inclusive_.\n"
			. "Note 2: Setting fromId and toId use the efficient query so those are ok.\n"
			. "Note 3: Operates on all clusters unless --cluster is provided.\n"
		);
		$this->setBatchSize( 10 );
		$this->addOption( 'from', 'Start date of reindex in YYYY-mm-ddTHH:mm:ssZ (exc. Defaults ' .
			'to 0 epoch.', false, true );
		$this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ. Defaults to now.',
			false, true );
		$this->addOption( 'fromId', 'Start indexing at a specific page_id. ' .
			'Not useful with --deletes.', false, true );
		$this->addOption( 'toId', 'Stop indexing at a specific page_id. ' .
			'Not useful with --deletes or --from or --to.', false, true );
		$this->addOption( 'ids', 'List of page ids (comma separated) to reindex. ' .
			'Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
		$this->addOption( 'deletes',
			'If this is set then just index deletes, not updates or creates.', false );
		$this->addOption( 'archive',
			'Don\'t delete pages, only index them into the archive.', false, false );
		$this->addOption( 'limit',
			'Maximum number of pages to process before exiting the script. Default to unlimited.',
			false, true );
		$this->addOption( 'buildChunks', 'Instead of running the script spit out commands that ' .
			'can be farmed out to different processes or machines to rebuild the index. Works ' .
			'with fromId and toId, not from and to. If specified as a number then chunks no ' .
			'larger than that size are spat out. If specified as a number followed by the word ' .
			'"total" without a space between them then that many chunks will be spat out sized ' .
			'to cover the entire wiki.', false, true );
		$this->addOption( 'queue', 'Rather than perform the indexes in process add them to the ' .
			'job queue. Ignored for delete.' );
		$this->addOption( 'maxJobs', 'If there are more than this many index jobs in the queue ' .
			'then pause before adding more. This is only checked every ' .
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS .
			' seconds. Not meaningful without --queue.', false, true );
		$this->addOption( 'pauseForJobs', 'If paused adding jobs then wait for there to be less ' .
			'than this many before starting again. Defaults to the value specified for ' .
			'--maxJobs. Not meaningful without --queue.', false, true );
		$this->addOption( 'indexOnSkip', 'When skipping either parsing or links send the document' .
			' as an index. This replaces the contents of the index for that entry with the entry' .
			' built from a skipped process. Without this if the entry does not exist then it will' .
			' be skipped entirely. Only set this when running the first pass of building the' .
			' index. Otherwise, don\'t tempt fate by indexing half complete documents.' );
		$this->addOption( 'forceParse', '(deprecated)' );
		$this->addOption( 'skipParse',
			'Skip parsing the page. This is really only good for running the second half ' .
			'of the two phase index build. If this is specified then the default batch size ' .
			'is actually 50.' );
		$this->addOption( 'skipLinks',
			'Skip looking for links to the page (counting and finding redirects). Use ' .
			'this with --indexOnSkip for the first half of the two phase index build.' );
		$this->addOption( 'namespace', 'Only index pages in this given namespace', false, true );
		$this->addOption( 'excludeContentTypes', 'Exclude pages of the specified content types. ' .
			'These must be a comma separated list of strings such as "wikitext" or "json" ' .
			'matching the CONTENT_MODEL_* constants.', false, true, false );
		$this->addOption( 'useDbIndex',
			'Use specific index when fetching IDs from the database.', false, true, false );
	}

	/**
	 * Entry point. Verifies the indexes exist, parses options, picks the
	 * appropriate batch iterator (ids / id range / date range / deletes) and
	 * processes each batch either in-process or via the job queue.
	 *
	 * @return bool|null Null when --buildChunks was given (commands are
	 *  printed instead of indexing), true otherwise.
	 */
	public function execute() {
		$this->disablePoolCountersAndLogging();
		$wiki = sprintf( "[%20s]", WikiMap::getCurrentWikiId() );

		// Make sure we've actually got indices to populate
		if ( !$this->simpleCheckIndexes() ) {
			$this->fatalError(
				"$wiki index(es) do not exist. Did you forget to run updateSearchIndexConfig?"
			);
		}

		$this->indexUpdates = !$this->getOption( 'deletes', false );
		// We need to check ids options early otherwise hasOption may return
		// true even if the user did not set the option on the commandline
		if ( $this->hasOption( 'ids' ) ) {
			$this->runWithIds = true;
			$this->pageIds = $this->buildPageIdBatches();
		}

		if ( $this->getOption( 'from' ) !== null || $this->getOption( 'to' ) !== null ) {
			// 0 is falsy so MWTimestamp makes that `now`. '00' is epoch 0.
			$this->fromDate = new MWTimestamp( $this->getOption( 'from', '00' ) );
			$this->toDate = new MWTimestamp( $this->getOption( 'to', false ) );
		}
		$this->toId = $this->getOption( 'toId' );
		$this->archive = (bool)$this->getOption( 'archive', false );
		if ( $this->archive ) {
			// If we're indexing only for archive, this implies deletes
			$this->indexUpdates = false;
		}
		$this->limit = $this->getOption( 'limit' );
		$buildChunks = $this->getOption( 'buildChunks' );
		if ( $buildChunks !== null ) {
			// --buildChunks only prints commands for other processes; no indexing here.
			$this->buildChunks( $buildChunks );
			return null;
		}
		$this->queue = $this->getOption( 'queue' );
		$this->maxJobs = $this->getOption( 'maxJobs' )
			? intval( $this->getOption( 'maxJobs' ) )
			: null;
		$this->pauseForJobs = $this->getOption( 'pauseForJobs' ) ?
			intval( $this->getOption( 'pauseForJobs' ) ) : $this->maxJobs;
		$updateFlags = $this->buildUpdateFlags();

		// Queued and delete/archive runs are cheap per row, so use bigger batches
		// unless the user chose a batch size explicitly.
		if ( !$this->getOption( 'batch-size' ) &&
			( $this->getOption( 'queue' ) || !$this->indexUpdates )
		) {
			$this->setBatchSize( 100 );
		}

		$this->namespace = $this->hasOption( 'namespace' ) ?
			intval( $this->getOption( 'namespace' ) ) : null;

		$this->excludeContentTypes = array_filter( array_map(
			'trim',
			explode( ',', $this->getOption( 'excludeContentTypes', '' ) )
		) );

		// Verb used in progress output, matching the mode we run in.
		$operationName = $this->indexUpdates
			? ( $this->queue ? 'Queued' : 'Indexed' )
			: ( $this->archive ? 'Archived' : 'Deleted' );

		$operationStartTime = microtime( true );
		$completed = 0;
		$rate = 0;

		// Pick the batch iterator matching the requested mode.
		if ( $this->runWithIds ) {
			$it = $this->getIdsIterator();
			// @phan-suppress-next-line PhanImpossibleTypeComparison
		} elseif ( $this->indexUpdates && $this->fromDate === null ) {
			$it = $this->getUpdatesByIdIterator();
		} elseif ( $this->indexUpdates ) {
			$it = $this->getUpdatesByDateIterator();
		} else {
			$it = $this->getDeletesIterator();
		}
		$jobQueueGroup = MediaWikiServices::getInstance()->getJobQueueGroup();

		foreach ( $it as $batch ) {
			if ( $this->indexUpdates ) {
				$size = count( $batch['updates'] );
				// Drop null entries; decidePage() keeps them so counts line up.
				$updates = array_filter( $batch['updates'] );
				if ( $this->queue ) {
					$this->waitForQueueToShrink( $wiki );
					$jobQueueGroup->push( Job\MassIndex::build(
						$updates, $updateFlags, $this->getOption( 'cluster' )
					) );
				} else {
					// Update size with the actual number of updated documents.
					$updater = $this->createUpdater();
					$size = $updater->updatePages( $updates, $updateFlags );
				}
			} else {
				$size = count( $batch['titlesToDelete'] );
				$updater = $this->createUpdater();
				$updater->archivePages( $batch['archive'] );
				if ( !$this->archive ) {
					$updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] );
				}
			}

			$completed += $size;
			$rate = $this->calculateIndexingRate( $completed, $operationStartTime );

			$this->output(
				"$wiki $operationName $size pages ending at {$batch['endingAt']} at $rate/second\n"
			);
			if ( $this->limit !== null && $completed > $this->limit ) {
				break;
			}
		}
		$this->output( "$operationName a total of {$completed} pages at $rate/second\n" );
		$this->waitForQueueToDrain( $wiki );

		return true;
	}

	/**
	 * Parse and validate the --ids option. Fatals if --ids is combined with
	 * any option from the id-range/date-range/delete family, or if any id is
	 * not a positive integer.
	 *
	 * @return int[] Unique page ids to reindex
	 */
	private function buildPageIdBatches() {
		if ( !$this->indexUpdates || $this->hasOption( 'limit' )
			|| $this->hasOption( 'from' ) || $this->hasOption( 'to' )
			|| $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' )
		) {
			$this->fatalError(
				'--ids cannot be used with deletes/archive/from/to/fromId/toId/limit'
			);
		}

		$pageIds = array_map(
			function ( $pageId ) {
				$pageId = trim( $pageId );
				if ( !ctype_digit( $pageId ) ) {
					$this->fatalError( "Invalid page id provided in --ids, got '$pageId', " .
						"expected a positive integer" );
				}
				return intval( $pageId );
			},
			explode( ',', $this->getOption( 'ids' ) )
		);
		return array_unique( $pageIds, SORT_REGULAR );
	}

	/**
	 * Translate the skip/index options into BuildDocument flags. As a side
	 * effect, lowers the default batch size to 50 when --skipParse is set
	 * and no explicit batch size was given.
	 *
	 * @return int Bitfield of BuildDocument::* flags
	 */
	private function buildUpdateFlags() {
		$updateFlags = 0;
		if ( $this->getOption( 'indexOnSkip' ) ) {
			$updateFlags |= BuildDocument::INDEX_ON_SKIP;
		}
		if ( $this->getOption( 'skipParse' ) ) {
			$updateFlags |= BuildDocument::SKIP_PARSE;
			if ( !$this->getOption( 'batch-size' ) ) {
				$this->setBatchSize( 50 );
			}
		}
		if ( $this->getOption( 'skipLinks' ) ) {
			$updateFlags |= BuildDocument::SKIP_LINKS;
		}

		return $updateFlags;
	}

	/**
	 * If the job queue has grown past --maxJobs, block until it has shrunk
	 * below --pauseForJobs. Queue length lookups are rate limited to one
	 * every SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS seconds.
	 *
	 * @param string $wiki Wiki id prefix used in progress output
	 */
	private function waitForQueueToShrink( $wiki ) {
		$now = microtime( true );
		// Too soon since the last check: skip this poll entirely.
		if ( $now - $this->lastJobQueueCheckTime <=
			self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS
		) {
			return;
		}

		$this->lastJobQueueCheckTime = $now;
		$queueSize = $this->getUpdatesInQueue();
		if ( $this->maxJobs === null || $this->maxJobs >= $queueSize ) {
			return;
		}

		do {
			$this->output(
				"$wiki Waiting while job queue shrinks: $this->pauseForJobs > $queueSize\n"
			);
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
			$queueSize = $this->getUpdatesInQueue();
		} while ( $this->pauseForJobs < $queueSize );
	}

	/**
	 * Block until the cirrusSearchMassIndex queue is empty. Gives up early
	 * if the queue size increases, which suggests another script is adding
	 * jobs and will do its own waiting. No-op unless --queue was used.
	 *
	 * @param string $wiki Wiki id prefix used in progress output
	 */
	private function waitForQueueToDrain( $wiki ) {
		if ( !$this->queue ) {
			return;
		}

		$lastQueueSizeForOurJob = PHP_INT_MAX;
		$waitStartTime = microtime( true );
		$this->output( "Waiting for jobs to drain from the queue\n" );
		while ( true ) {
			$queueSizeForOurJob = $this->getUpdatesInQueue();
			if ( $queueSizeForOurJob === 0 ) {
				return;
			}
			// If the count went up then some other process is inserting jobs;
			// stop waiting here and let that process wait for the queue instead.
			if ( $queueSizeForOurJob > $lastQueueSizeForOurJob ) {
				$this->output( "Queue size went up. Another script is likely adding jobs " .
					"and it'll wait for them to empty.\n" );
				return;
			}
			if ( microtime( true ) - $waitStartTime > 120 ) {
				// Wait at least two full minutes before we check if the job count went down.
				// Less than that and we might be seeing lag from redis's counts.
				$lastQueueSizeForOurJob = $queueSizeForOurJob;
			}
			$this->output( "$wiki $queueSizeForOurJob jobs left on the queue.\n" );
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
		}
	}

	/**
	 * Compute the average pages-per-second rate since the operation started.
	 * Rates below one page per second keep one decimal place; faster rates
	 * are rounded to whole pages.
	 *
	 * @param int $completed Pages processed so far
	 * @param float $operationStartTime Unix timestamp when processing began
	 *
	 * @return float Pages processed per second
	 */
	private function calculateIndexingRate( $completed, $operationStartTime ) {
		$rate = $completed / ( microtime( true ) - $operationStartTime );

		if ( $rate < 1 ) {
			return round( $rate, 1 );
		}

		return round( $rate );
	}

	/**
	 * Do some simple sanity checking to make sure we've got indexes to populate.
	 * Note this isn't nearly as robust as updateSearchIndexConfig is, but it's
	 * not designed to be.
	 *
	 * @return bool
	 */
	private function simpleCheckIndexes() {
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );

		// Top-level alias needs to exist
		if ( !$this->getConnection()->getIndex( $indexBaseName )->exists() ) {
			return false;
		}

		// Now check all index types to see if they exist
		foreach ( $this->getConnection()->getAllIndexSuffixes() as $indexSuffix ) {
			// If the alias for this type doesn't exist, fail
			if ( !$this->getConnection()->getIndex( $indexBaseName, $indexSuffix )->exists() ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Build an iterator over batches of page-deletion log entries, yielding
	 * for each batch the titles/doc ids to delete, the archive entries, and
	 * the timestamp the batch ends at.
	 *
	 * @return CallbackIterator
	 */
	protected function getDeletesIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$it = new BatchRowIterator(
			$dbr,
			'logging',
			[ 'log_timestamp' ],
			$this->getBatchSize()
		);

		$this->attachPageConditions( $dbr, $it, 'log' );
		$this->attachTimestampConditions( $dbr, $it, 'log' );
		$it->addConditions( [
			'log_type' => 'delete',
			'log_action' => 'delete',
			'EXISTS(select * from archive where ar_title = log_title and ar_namespace = log_namespace)',
			// Prior to 2010 the logging table contains nulls. As the docs in elasticsearch use the page id
			// as the document id we cannot handle these old rows.
			$dbr->expr( 'log_page', '!=', null ),
		] );

		$it->setFetchColumns( [ 'log_timestamp', 'log_namespace', 'log_title', 'log_page' ] );

		$it->setCaller( __METHOD__ );

		return new CallbackIterator( $it, function ( $batch ) {
			$titlesToDelete = [];
			$docIdsToDelete = [];
			$archive = [];
			foreach ( $batch as $row ) {
				$title = Title::makeTitle( $row->log_namespace, $row->log_title );
				$id = $this->getSearchConfig()->makeId( $row->log_page );
				$titlesToDelete[] = $title;
				$docIdsToDelete[] = $id;
				$archive[] = [
					'title' => $title,
					'page' => $id,
				];
			}

			return [
				'titlesToDelete' => $titlesToDelete,
				'docIdsToDelete' => $docIdsToDelete,
				'archive' => $archive,
				// $row is the last row of the foreach above; unset when the batch was empty.
				'endingAt' => isset( $row )
					? ( new MWTimestamp( $row->log_timestamp ) )->getTimestamp( TS_ISO_8601 )
					: 'unknown',
			];
		} );
	}

	/**
	 * Build an iterator over batches of pages restricted to the explicit
	 * list of page ids given via --ids.
	 *
	 * @return CallbackIterator
	 */
	protected function getIdsIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->addConditions( [ 'page_id' => $this->pageIds ] );
		$it->setCaller( __METHOD__ );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Build an iterator over batches of pages whose latest revision falls in
	 * the configured [fromDate, toDate] window, ordered by revision
	 * timestamp then page id.
	 *
	 * @return CallbackIterator
	 */
	protected function getUpdatesByDateIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator(
			$dbr,
			array_merge( $pageQuery['tables'], [ 'revision' ] ),
			[ 'rev_timestamp', 'page_id' ],
			$this->getBatchSize()
		);
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		// Only join against the latest revision of each page.
		$it->addJoinConditions( [
			'revision' => [ 'JOIN', [ 'rev_page = page_id', 'rev_id = page_latest' ] ]
		] );
		$it->setCaller( __METHOD__ );

		$this->attachTimestampConditions( $dbr, $it, 'rev' );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'rev_timestamp' );
	}

	/**
	 * Build an iterator over batches of pages in the id range given by
	 * --fromId (inclusive) and --toId (inclusive), ordered by page id.
	 *
	 * @return CallbackIterator
	 */
	protected function getUpdatesByIdIterator() {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$pageQuery = WikiPage::getQueryInfo();
		$it = new BatchRowIterator( $dbr, $pageQuery['tables'], 'page_id', $this->getBatchSize() );
		$it->setFetchColumns( $pageQuery['fields'] );
		$it->addJoinConditions( $pageQuery['joins'] );
		$it->setCaller( __METHOD__ );
		$fromId = $this->getOption( 'fromId', 0 );
		if ( $fromId > 0 ) {
			$it->addConditions( [
				$dbr->expr( 'page_id', '>=', $fromId ),
			] );
		}
		if ( $this->toId ) {
			$it->addConditions( [
				$dbr->expr( 'page_id', '<=', $this->toId ),
			] );
		}

		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Constrain an iterator to the configured date window, when one was set.
	 *
	 * @param IDatabase $dbr Replica connection used to build expressions
	 * @param BatchRowIterator $it Iterator to attach conditions to
	 * @param string $columnPrefix Table column prefix, e.g. 'rev' or 'log'
	 */
	private function attachTimestampConditions(
		IDatabase $dbr, BatchRowIterator $it, $columnPrefix
	) {
		// When initializing we guarantee that if either fromDate or toDate are provided
		// the other has a sane default value.
		if ( $this->fromDate !== null ) {
			$it->addConditions( [
				$dbr->expr( "{$columnPrefix}_timestamp", '>=', $dbr->timestamp( $this->fromDate ) ),
				$dbr->expr( "{$columnPrefix}_timestamp", '<=', $dbr->timestamp( $this->toDate ) ),
			] );
		}
	}

	/**
	 * Constrain an iterator by namespace and content model filters and,
	 * when --useDbIndex was given, force a specific database index.
	 *
	 * @param IDatabase $dbr Replica connection used to build expressions
	 * @param BatchRowIterator $it Iterator to attach conditions to
	 * @param string $columnPrefix Table column prefix, e.g. 'page' or 'log'
	 */
	private function attachPageConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		if ( $this->namespace !== null ) {
			$it->addConditions( [
				"{$columnPrefix}_namespace" => $this->namespace,
			] );
		}
		if ( $this->excludeContentTypes ) {
			$it->addConditions( [
				$dbr->expr( "{$columnPrefix}_content_model", '!=', $this->excludeContentTypes ),
			] );
		}
		if ( $this->hasOption( 'useDbIndex' ) ) {
			$index = $this->getOption( 'useDbIndex' );
			$it->addOptions( [ 'USE INDEX' => $index ] );
		}
	}

	/**
	 * Wrap a row iterator so each batch is decoded into WikiPage objects
	 * (via decidePage()) plus an 'endingAt' progress marker taken from the
	 * given column of the batch's last row.
	 *
	 * @param BatchRowIterator $it
	 * @param string $endingAtColumn Either 'rev_timestamp' or 'page_id'
	 * @return CallbackIterator
	 */
	private function wrapDecodeResults( BatchRowIterator $it, $endingAtColumn ) {
		return new CallbackIterator( $it, function ( $batch ) use ( $endingAtColumn ) {
			// Build the updater outside the loop because it stores the redirects it hits.
			// Don't build it at the top level so those are stored when it is freed.
			$updater = $this->createUpdater();

			$pages = [];
			$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
			foreach ( $batch as $row ) {
				// No need to call Updater::traceRedirects here because we know this is a valid page
				// because it is in the database.
				$page = $wikiPageFactory->newFromRow( $row, IDBAccessObject::READ_LATEST );

				// null pages still get attached to keep the counts the same. They will be filtered
				// later on.
				$pages[] = $this->decidePage( $updater, $page );
			}

			// $row is the last row of the foreach above; unset when the batch was empty.
			if ( isset( $row ) ) {
				if ( $endingAtColumn === 'rev_timestamp' ) {
					$ts = new MWTimestamp( $row->rev_timestamp );
					$endingAt = $ts->getTimestamp( TS_ISO_8601 );
				} elseif ( $endingAtColumn === 'page_id' ) {
					$endingAt = $row->page_id;
				} else {
					throw new UnexpectedValueException( 'Unknown $endingAtColumn: ' . $endingAtColumn );
				}
			} else {
				$endingAt = 'unknown';
			}

			return [
				'updates' => $pages,
				'endingAt' => $endingAt,
			];
		} );
	}

	/**
	 * Determine the actual page in the index that needs to be updated, based on a
	 * source page.
	 *
	 * @param Updater $updater
	 * @param WikiPage $page
	 * @return WikiPage|null WikiPage to be updated, or null if none.
	 */
	private function decidePage( Updater $updater, WikiPage $page ) {
		try {
			$content = $page->getContent();
		} catch ( Throwable $ex ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				"Error deserializing content, skipping page: {pageId}",
				[ 'pageId' => $page->getTitle()->getArticleID() ]
			);
			return null;
		}

		if ( $content === null ) {
			// Skip pages without content. Pages have no content because their latest revision
			// as loaded by the query above doesn't exist.
			$this->output(
				'Skipping page with no content: ' . $page->getTitle()->getArticleID() . "\n"
			);
			return null;
		}

		if ( !$content->isRedirect() ) {
			return $page;
		}

		if ( $this->toDate === null ) {
			// Looks like we accidentally picked up a redirect when we were indexing by id and thus
			// trying to ignore redirects! Just ignore it! We would filter them out at the db
			// level but that is slow for large wikis.
			return null;
		}

		// We found a redirect. Great. Since we can't index special pages and redirects to special
		// pages are totally possible, as well as fun stuff like redirect loops, we need to use
		// Updater's redirect tracing logic which is very complete. Also, it returns null on
		// self redirects. Great!
		[ $page, ] = $updater->traceRedirects( $page->getTitle() );

		if ( $page != null &&
			Title::makeTitleSafe( $page->getTitle()->getNamespace(), $page->getTitle()->getText() ) === null
		) {
			// The title cannot be rebuilt from its ns_prefix + text.
			// It happens if an invalid title is present in the DB
			// We may prefer to not index them as they are hardly viewable
			$this->output( 'Skipping page with invalid title: ' . $page->getTitle()->getPrefixedText() );
			return null;
		}

		return $page;
	}

	/**
	 * @param string|int $buildChunks If specified as a number then chunks no
	 *  larger than that size are spat out. If specified as a number followed
	 *  by the word "total" without a space between them then that many chunks
	 *  will be spat out sized to cover the entire wiki.
	 */
	private function buildChunks( $buildChunks ) {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		// Fill in missing range bounds from the extremes of the page table.
		if ( $this->toId === null ) {
			$this->toId = $dbr->newSelectQueryBuilder()
				->select( 'MAX(page_id)' )
				->from( 'page' )
				->caller( __METHOD__ )
				->fetchField();
			if ( $this->toId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		$fromId = $this->getOption( 'fromId' );
		if ( $fromId === null ) {
			// fromId is exclusive, hence the - 1 below the real minimum.
			$fromId = $dbr->newSelectQueryBuilder()
				->select( 'MIN(page_id) - 1' )
				->from( 'page' )
				->caller( __METHOD__ )
				->fetchField();
			if ( $fromId === false ) {
				$this->fatalError( "Couldn't find any pages to index." );
			}
		}
		if ( $fromId === $this->toId ) {
			$this->fatalError(
				"Couldn't find any pages to index. fromId = $fromId = $this->toId = toId."
			);
		}
		$builder = new \CirrusSearch\Maintenance\ChunkBuilder();
		$builder->build( $this->mSelf, $this->getParameters()->getOptions(), $buildChunks, $fromId, $this->toId );
	}

	/**
	 * Get the number of cirrusSearchMassIndex jobs in the queue.
	 * @return int length
	 */
	private function getUpdatesInQueue() {
		return MediaWikiServices::getInstance()->getJobQueueGroup()->get( 'cirrusSearchMassIndex' )->getSize();
	}

	/**
	 * Build an Updater for the configured cluster (all clusters when
	 * --cluster is absent).
	 *
	 * @return Updater
	 */
	private function createUpdater() {
		return Updater::build( $this->getSearchConfig(), $this->getOption( 'cluster', null ) );
	}
}
706 | |
// Boilerplate: hand the class to the maintenance runner, which executes it
// only when this file is invoked directly from the command line.
$maintClass = ForceSearchIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;