MediaWiki master
refreshLinks.php
Go to the documentation of this file.
1<?php
24
25// @codeCoverageIgnoreStart
26require_once __DIR__ . '/Maintenance.php';
27// @codeCoverageIgnoreEnd
28
/** Number of processed pages between progress reports (and replication waits). */
private const REPORTING_INTERVAL = 100;

/**
 * Register the script description, its command-line options, the optional
 * positional "start" argument and the default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Refresh link tables' );

	// The only option that also has a short name (-v).
	$this->addOption( 'verbose', 'Output information about link refresh progress', false, false, 'v' );

	// Optional switches that take no value.
	$switches = [
		'dfn-only' => 'Delete links from nonexistent articles only',
		'new-only' => 'Only affect articles with just a single edit',
		'redirects-only' => 'Only fix redirects, not all links',
		'touched-only' => 'Only fix pages that have been touched after last update',
	];
	foreach ( $switches as $name => $description ) {
		$this->addOption( $name, $description );
	}

	// Optional options that require a value.
	$valueOptions = [
		'e' => 'Last page id to refresh',
		'dfn-chunk-size' => 'Maximum number of existent IDs to check per query, default 100,000',
		'namespace' => 'Only fix pages in this namespace',
		'category' => 'Only fix pages in this category',
		'tracking-category' => 'Only fix pages in this tracking category',
		'before-timestamp' => 'Only fix pages that were last updated before this timestamp',
	];
	foreach ( $valueOptions as $name => $description ) {
		$this->addOption( $name, $description, false, true );
	}

	$this->addArg( 'start', 'Page_id to start from, default 1', false );
	$this->setBatchSize( 100 );
}
56
/**
 * Script entry point: read the command-line input, build the base query on
 * the page table and run the requested refresh mode.
 *
 * Modes, in order of precedence:
 *  - --dfn-only: only delete link-table rows that belong to nonexistent pages.
 *  - --category / --tracking-category: refresh the pages of that category.
 *  - otherwise: refresh the selected pages (narrowed by --new-only or
 *    --touched-only, limited to redirects by --redirects-only) and then,
 *    unless --namespace was given, delete rows of nonexistent pages.
 *
 * --namespace and --before-timestamp narrow the page query in every mode
 * except --dfn-only.
 */
public function execute() {
	// Note that there is a difference between not specifying the start
	// and end IDs and using the minimum and maximum values from the page
	// table. In the latter case, deleteLinksFromNonexistent() will not
	// delete entries for nonexistent IDs that fall outside the range.
	// A missing, zero or non-numeric value becomes null (no bound).
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100_000 );
	$batchSize = $this->getBatchSize();

	// deleteLinksFromNonexistent() finds the next chunk by skipping
	// $dfnChunkSize existing page IDs; with a size below 1 the chunk start
	// never advances and the script would loop forever.
	if ( $dfnChunkSize < 1 ) {
		$this->fatalError( "The dfn-chunk-size option must be a positive integer\n" );
	}
	// The batching loops continue for as long as a batch comes back full,
	// so a batch size below 1 could never signal that the work is done.
	if ( $batchSize < 1 ) {
		$this->fatalError( "The batch size must be a positive integer\n" );
	}

	if ( $this->hasOption( 'dfn-only' ) ) {
		$this->deleteLinksFromNonexistent( $start, $end, $batchSize, $dfnChunkSize );
		return;
	}

	// Base query shared by all refresh modes; each mode adds its own conditions.
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	$builder = $dbr->newSelectQueryBuilder()
		->from( 'page' )
		->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
		->limit( $batchSize );

	if ( $this->hasOption( 'namespace' ) ) {
		$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
	}

	if ( $this->hasOption( 'before-timestamp' ) ) {
		// Pages whose links were never updated (NULL) also qualify.
		$builder->andWhere(
			$dbr->expr( 'page_links_updated', '<', $this->getOption( 'before-timestamp' ) )
				->or( 'page_links_updated', '=', null )
		);
	}

	if ( $this->hasOption( 'category' ) ) {
		$category = $this->getOption( 'category' );
		$title = Title::makeTitleSafe( NS_CATEGORY, $category );
		if ( !$title ) {
			$this->fatalError( "'$category' is an invalid category name!\n" );
		}
		$this->refreshCategory( $builder, $title );
	} elseif ( $this->hasOption( 'tracking-category' ) ) {
		// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
		$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
	} else {
		$new = $this->hasOption( 'new-only' );
		$redir = $this->hasOption( 'redirects-only' );
		$touched = $this->hasOption( 'touched-only' );
		$what = $redir ? 'redirects' : 'links';
		if ( $new ) {
			// --new-only takes precedence: --touched-only is not applied here.
			$builder->andWhere( [ 'page_is_new' => 1 ] );
			$this->output( "Refreshing $what from new pages...\n" );
		} else {
			if ( $touched ) {
				// Links never updated, or page touched after the last links update.
				$builder->andWhere( [
					$dbr->expr( 'page_links_updated', '=', null )
						->orExpr( new RawSQLExpression( 'page_touched > page_links_updated' ) ),
				] );
			}
			$this->output( "Refreshing $what from pages...\n" );
		}
		$this->doRefreshLinks( $builder, $redir );
		if ( !$this->hasOption( 'namespace' ) ) {
			$this->deleteLinksFromNonexistent( $start, $end, $batchSize, $dfnChunkSize );
		}
	}
}
121
/**
 * Walk the pages selected by $builder in batches and refresh each of them.
 *
 * Every page gets its redirect data fixed; unless $redirectsOnly is set,
 * its secondary link data is refreshed as well. Batches are paged by
 * ordering on $indexFields and continuing after the last row seen.
 *
 * @param SelectQueryBuilder $builder Base query; it is cloned for every batch
 * @param bool $redirectsOnly Only fix redirects, skip the links refresh
 * @param array $indexFields Fields used for ordering and for continuing
 */
private function doRefreshLinks(
	SelectQueryBuilder $builder,
	bool $redirectsOnly = false,
	array $indexFields = [ 'page_id' ]
) {
	// Give extensions a chance to optimize settings
	$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

	$estimate = $builder->caller( __METHOD__ )->estimateRowCount();
	$this->output( "Estimated page count: $estimate\n" );

	// page_id is always selected; the index fields are selected too so the
	// last row of a batch can seed the next one.
	$fields = $indexFields;
	if ( !in_array( 'page_id', $fields ) ) {
		array_unshift( $fields, 'page_id' );
	}
	$cursor = array_fill_keys( $indexFields, 0 );
	$processed = 0;

	$beVerbose = $this->hasOption( 'verbose' );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	do {
		$continueCond = $dbr->buildComparison( '>', $cursor );
		$batch = ( clone $builder )
			->select( $fields )
			->andWhere( [ $continueCond ] )
			->orderBy( $indexFields )
			->caller( __METHOD__ )
			->fetchResultSet();
		$batchRows = $batch->numRows();

		if ( $beVerbose ) {
			$this->output( "Refreshing links for $batchRows pages\n" );
		}

		$lastRow = null;
		foreach ( $batch as $row ) {
			$processed++;
			if ( $processed % self::REPORTING_INTERVAL === 0 ) {
				$this->output( "$processed\n" );
				$this->waitForReplication();
			}
			if ( $beVerbose ) {
				$this->output( "Refreshing links for page ID {$row->page_id}\n" );
			}
			self::fixRedirect( $this, $row->page_id );
			if ( !$redirectsOnly ) {
				self::fixLinksFromArticle( $row->page_id );
			}
			$lastRow = $row;
		}

		// Remember where this batch ended so the next query continues after it.
		if ( $lastRow !== null ) {
			foreach ( $indexFields as $field ) {
				$cursor[$field] = $lastRow->$field;
			}
		}

		// A batch that is not full means there is nothing left to fetch.
	} while ( $batchRows == $this->getBatchSize() );
}
178
/**
 * Bring the redirect table entry and the page_is_redirect flag of one page
 * in line with the page's current content.
 *
 * Does nothing when no page with the given ID can be loaded.
 *
 * @param Maintenance $maint Script instance supplying services and the DB handle
 * @param int $id The page ID to check
 */
public static function fixRedirect( Maintenance $maint, $id ) {
	$page = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );
	if ( $page === null ) {
		// In case the page just got deleted.
		return;
	}

	// A page without content is handled like a page that is not a redirect.
	$content = $page->getContent( RevisionRecord::RAW );
	$target = $content !== null ? $content->getRedirectTarget() : null;
	$isRedirect = $target !== null;

	$dbw = $maint->getDB( DB_PRIMARY );
	if ( $isRedirect ) {
		$page->insertRedirectEntry( $target );
	} else {
		// Not a redirect: delete any redirect table entry for it.
		$dbw->newDeleteQueryBuilder()
			->deleteFrom( 'redirect' )
			->where( [ 'rd_from' => $id ] )
			->caller( __METHOD__ )->execute();
	}

	// Update the page table to be sure it is in a consistent state
	$update = $dbw->newUpdateQueryBuilder()
		->update( 'page' )
		->set( [ 'page_is_redirect' => $isRedirect ? 1 : 0 ] )
		->where( [ 'page_id' => $id ] )
		->caller( __METHOD__ );
	$update->execute();
	$maint->getServiceContainer()->getLinkWriteDuplicator()->duplicate( $update );
}
230
/**
 * Run the secondary data updates of one page and execute them right away.
 *
 * Does nothing when no page with the given ID can be loaded.
 *
 * @param int $id The page ID to refresh
 */
public static function fixLinksFromArticle( $id ) {
	$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromID( $id );
	if ( $page === null ) {
		// In case the page just got deleted.
		return;
	}

	// Defer updates to post-send but then immediately execute deferred updates;
	// this is the simplest way to run all updates immediately (including updates
	// scheduled by other updates).
	$updateOptions = [
		'defer' => DeferredUpdates::POSTSEND,
		'causeAction' => 'refresh-links-maintenance',
		'recursive' => false,
	];
	$page->doSecondaryDataUpdates( $updateOptions );
	DeferredUpdates::doUpdates();
}
254
/**
 * Delete links-table rows that belong to nonexistent pages, working through
 * the page ID range one chunk at a time.
 *
 * Chunk boundaries are derived from existing page IDs: each chunk ends just
 * before the page ID found $chunkSize existing pages after the chunk start.
 *
 * @param int|null $start First page ID to check, or null for no lower bound
 * @param int|null $end Last page ID to check, or null for no upper bound
 * @param int $batchSize Maximum number of page IDs to delete per query
 * @param int $chunkSize Maximum number of existing page IDs per chunk
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100_000
) {
	$this->waitForReplication();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	$chunkStart = $start;
	while ( true ) {
		// Find the start of the next chunk. This is based only
		// on existent page_ids.
		$following = $dbr->newSelectQueryBuilder()
			->select( 'page_id' )
			->from( 'page' )
			->where( [ self::intervalCond( $dbr, 'page_id', $chunkStart, $end ) ] )
			->orderBy( 'page_id' )
			->offset( $chunkSize )
			->caller( __METHOD__ )->fetchField();
		$isLastChunk = $following === false;

		// The current chunk ends one before the next chunk's start. This
		// limits the rows scanned in dfnCheckInterval(), per query, to at
		// most the sum of the chunk size and deletion batch size. The last
		// chunk covers all page_ids up to $end.
		$chunkEnd = $isLastChunk ? $end : $following - 1;

		$lowerLabel = $chunkStart !== null ? "[$chunkStart" : '(-INF';
		$upperLabel = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
		$this->output( " Checking interval $lowerLabel, $upperLabel\n" );
		$this->dfnCheckInterval( $chunkStart, $chunkEnd, $batchSize );

		if ( $isLastChunk ) {
			break;
		}
		$chunkStart = $following;
	}
}
303
/**
 * Delete, from every links table, the rows whose source page ID lies in the
 * given interval and has no matching row in the page table.
 *
 * For each table the orphaned IDs are read from a replica in batches,
 * deleted on the primary, and replication is awaited after every batch.
 *
 * @see wfRefreshLinks() and deleteLinksFromNonexistent() for the callers' view
 * @param int|null $start First page ID to check, or null for no lower bound
 * @param int|null $end Last page ID to check, or null for no upper bound
 * @param int $batchSize Maximum number of page IDs to delete per query
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	$linksTables = [
		// table name => page_id field
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	// table name => virtual domain used to obtain its connections; tables
	// not listed here use the default (false).
	$domains = [
		'categorylinks' => CategoryLinksTable::VIRTUAL_DOMAIN,
		'externallinks' => ExternalLinksTable::VIRTUAL_DOMAIN,
		'imagelinks' => ImageLinksTable::VIRTUAL_DOMAIN,
		'iwlinks' => InterwikiLinksTable::VIRTUAL_DOMAIN,
		'langlinks' => LangLinksTable::VIRTUAL_DOMAIN,
		'pagelinks' => PageLinksTable::VIRTUAL_DOMAIN,
		'templatelinks' => TemplateLinksTable::VIRTUAL_DOMAIN,
	];

	foreach ( $linksTables as $table => $field ) {
		$domain = $domains[$table] ?? false;
		$dbw = $this->getServiceContainer()->getConnectionProvider()->getPrimaryDatabase( $domain );
		$dbr = $this->getServiceContainer()->getConnectionProvider()->getReplicaDatabase( $domain, 'vslow' );

		$this->output( " $table: 0" );
		$tableStart = $start;
		$counter = 0;
		do {
			// Source IDs in the interval that have no page row (LEFT JOIN
			// leaves page_id NULL for them).
			$ids = $dbr->newSelectQueryBuilder()
				->select( $field )
				->distinct()
				->from( $table )
				->leftJoin( 'page', null, "$field = page_id" )
				->where( self::intervalCond( $dbr, $field, $tableStart, $end ) )
				->andWhere( [ 'page_id' => null ] )
				->orderBy( $field )
				->limit( $batchSize )
				->caller( __METHOD__ )->fetchFieldValues();

			$numIds = count( $ids );
			if ( $numIds ) {
				$counter += $numIds;
				$dbw->newDeleteQueryBuilder()
					->deleteFrom( $table )
					->where( [ $field => $ids ] )
					->caller( __METHOD__ )->execute();
				$this->output( ", $counter" );
				// IDs are ordered, so continue right after the largest one.
				$tableStart = $ids[$numIds - 1] + 1;
				$this->waitForReplication();
			}

			// Continue only after a full batch. The explicit $numIds > 0 check
			// guarantees termination even for a batch size below 1: an empty
			// result leaves $tableStart unchanged, so repeating the query
			// could never make progress.
		} while (
			$numIds > 0 &&
			$numIds >= $batchSize &&
			( $end === null || $tableStart <= $end )
		);

		$this->output( " deleted.\n" );
	}
}
371
/**
 * Build a condition restricting $var to the closed interval [$start, $end].
 *
 * A null bound leaves that side open. With both bounds null the condition
 * only requires $var to be non-null.
 *
 * @param IReadableDatabase $db Database used to build the expression
 * @param string $var Field name
 * @param mixed $start Lower bound (inclusive), or null
 * @param mixed $end Upper bound (inclusive), or null
 * @return mixed Expression as returned by $db->expr()
 */
private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
	$hasStart = $start !== null;
	$hasEnd = $end !== null;

	if ( $hasStart && $hasEnd ) {
		return $db->expr( $var, '>=', $start )->and( $var, '<=', $end );
	}
	if ( $hasStart ) {
		return $db->expr( $var, '>=', $start );
	}
	if ( $hasEnd ) {
		return $db->expr( $var, '<=', $end );
	}

	// No bounds at all.
	return $db->expr( $var, '!=', null );
}
395
/**
 * Refresh the pages of every category that the given tracking category key
 * resolves to.
 *
 * @param SelectQueryBuilder $builder Base page query
 * @param string $category Tracking category key
 */
private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ) {
	$targets = $this->getPossibleCategories( $category );

	if ( !$targets ) {
		// Output to stderr but don't bail out.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	foreach ( $targets as $target ) {
		// refreshCategory() adds joins and conditions to the builder it is
		// given, so every category works on its own copy.
		$this->refreshCategory( clone $builder, $target );
	}
}
414
/**
 * Refresh the links of all pages that are members of the given category.
 *
 * The builder is extended in place with the category joins and conditions.
 * Batches are ordered by cl_timestamp, then cl_from.
 *
 * @param SelectQueryBuilder $builder Base page query; modified by this call
 * @param LinkTarget $category Category to process
 */
private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ) {
	$categoryText = $category->getText();
	$this->output( "Refreshing pages in category '$categoryText'...\n" );

	$builder->join( 'categorylinks', null, 'page_id=cl_from' );
	$builder->join( 'linktarget', null, 'lt_id=cl_target_id' );
	$builder->andWhere( [
		'lt_title' => $category->getDBkey(),
		'lt_namespace' => NS_CATEGORY,
	] );

	$this->doRefreshLinks( $builder, false, [ 'cl_timestamp', 'cl_from' ] );
}
429
/**
 * Look up the categories registered under a tracking category key.
 *
 * Terminates the script with an error when the key is unknown.
 *
 * @param string $categoryKey Tracking category key
 * @return mixed The 'cats' entry stored for that key
 */
private function getPossibleCategories( $categoryKey ) {
	$known = $this->getServiceContainer()->getTrackingCategories()->getTrackingCategories();
	if ( !isset( $known[$categoryKey] ) ) {
		// fatalError() terminates the script, so execution stops here.
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}
	return $known[$categoryKey]['cats'];
}
443}
444
445// @codeCoverageIgnoreStart
446$maintClass = RefreshLinks::class;
447require_once RUN_MAINTENANCE_IF_MAIN;
448// @codeCoverageIgnoreEnd
const NS_CATEGORY
Definition Defines.php:65
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28
Defer callable updates to run later in the PHP process.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getArg( $argId=0, $default=null)
Get an argument.
getBatchSize()
Returns batch size.
output( $out, $channel=null)
Throw some output to the user.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
waitForReplication()
Wait for replica DB servers to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Service locator for MediaWiki core services.
Page revision base class.
Represents a title within MediaWiki.
Definition Title.php:69
join( $table, $alias=null, $conds=[])
Inner join a table or group of tables.
Raw SQL expression to be used in query builders.
Build SELECT queries with a fluent interface.
andWhere( $conds)
Add conditions to the query.
caller( $fname)
Set the method name to be included in an SQL comment.
from( $table, $alias=null)
Add a single table to the SELECT query.
Represents the target of a wiki link.
getDBkey()
Get the main part of the link target, in canonical database form.
A database connection without write operations.
expr(string $field, string $op, $value)
See Expression::__construct()