MediaWiki 1.42.1
refreshLinks.php
Go to the documentation of this file.
1<?php
28
29require_once __DIR__ . '/Maintenance.php';
30
/** Number of processed pages between progress reports and replication waits in doRefreshLinks(). */
37 private const REPORTING_INTERVAL = 100;
38
/**
 * Declare the script description, its options, its single positional
 * argument and the default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Refresh link tables' );

	// The only option that has a short alias.
	$this->addOption( 'verbose', 'Output information about link refresh progress', false, false, 'v' );

	// Plain switches: optional, no value.
	$switches = [
		'dfn-only' => 'Delete links from nonexistent articles only',
		'new-only' => 'Only affect articles with just a single edit',
		'redirects-only' => 'Only fix redirects, not all links',
		'touched-only' => 'Only fix pages that have been touched after last update',
	];
	foreach ( $switches as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText );
	}

	// Optional options that take a value.
	$valueOptions = [
		'e' => 'Last page id to refresh',
		'dfn-chunk-size' => 'Maximum number of existent IDs to check per query, default 100,000',
		'namespace' => 'Only fix pages in this namespace',
		'category' => 'Only fix pages in this category',
		'tracking-category' => 'Only fix pages in this tracking category',
		'before-timestamp' => 'Only fix pages that were last updated before this timestamp',
	];
	foreach ( $valueOptions as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}

	$this->addArg( 'start', 'Page_id to start from, default 1', false );
	$this->setBatchSize( 100 );
}
58
/**
 * Run the script.
 *
 * With --dfn-only, only link rows from nonexistent pages are deleted.
 * Otherwise a page query is built from the start/end IDs and the
 * --namespace / --before-timestamp filters, and is then used to refresh
 * either the pages of one category (--category), the pages of a tracking
 * category (--tracking-category), or all selected pages. In the last case
 * --new-only, --touched-only and --redirects-only narrow the work further,
 * and link rows from nonexistent pages are cleaned up afterwards unless
 * --namespace was given.
 */
public function execute() {
	// Note that there is a difference between not specifying the start
	// and end IDs and using the minimum and maximum values from the page
	// table. In the latter case, deleteLinksFromNonexistent() will not
	// delete entries for nonexistent IDs that fall outside the range.
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100_000 );

	if ( $this->hasOption( 'dfn-only' ) ) {
		$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		return;
	}

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	$builder = $dbr->newSelectQueryBuilder()
		->from( 'page' )
		->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
		->limit( $this->getBatchSize() );

	if ( $this->hasOption( 'namespace' ) ) {
		$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
	}

	if ( $this->hasOption( 'before-timestamp' ) ) {
		// Validate the user-supplied timestamp and convert it to the format
		// the database stores, instead of comparing the raw string as typed.
		$rawTimestamp = $this->getOption( 'before-timestamp' );
		$timestamp = wfTimestamp( TS_MW, $rawTimestamp );
		if ( $timestamp === false ) {
			$this->fatalError( "'$rawTimestamp' is not a valid timestamp!\n" );
		}
		$builder->andWhere(
			$dbr->expr( 'page_links_updated', '<', $dbr->timestamp( $timestamp ) )
				->or( 'page_links_updated', '=', null )
		);
	}

	if ( $this->hasOption( 'category' ) ) {
		$category = $this->getOption( 'category' );
		$title = Title::makeTitleSafe( NS_CATEGORY, $category );
		if ( !$title ) {
			$this->fatalError( "'$category' is an invalid category name!\n" );
		}
		$this->refreshCategory( $builder, $title );
	} elseif ( $this->hasOption( 'tracking-category' ) ) {
		// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
		$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
	} else {
		$new = $this->hasOption( 'new-only' );
		$redir = $this->hasOption( 'redirects-only' );
		$touched = $this->hasOption( 'touched-only' );
		$what = $redir ? 'redirects' : 'links';
		if ( $new ) {
			$builder->andWhere( [ 'page_is_new' => 1 ] );
			$this->output( "Refreshing $what from new pages...\n" );
		} else {
			if ( $touched ) {
				$builder->andWhere( [
					'page_touched > page_links_updated OR page_links_updated IS NULL',
				] );
			}
			$this->output( "Refreshing $what from pages...\n" );
		}
		$this->doRefreshLinks( $builder, $redir );
		// The cleanup of rows from nonexistent pages is skipped for
		// namespace-restricted runs.
		if ( !$this->hasOption( 'namespace' ) ) {
			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		}
	}
}
122
/**
 * Walk the pages selected by $builder in batches and refresh each one.
 *
 * Each batch continues after the index values of the last row of the
 * previous batch; the loop ends when a batch returns a row count that
 * differs from the configured batch size.
 *
 * @param SelectQueryBuilder $builder Query selecting the pages to process
 * @param bool $redirectsOnly If true, only fixRedirect() is run for each page
 * @param string[] $indexFields Fields used for ordering and for paging between batches
 */
private function doRefreshLinks(
	SelectQueryBuilder $builder,
	bool $redirectsOnly = false,
	array $indexFields = [ 'page_id' ]
) {
	// Give extensions a chance to optimize settings
	$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

	$estimateCount = $builder->estimateRowCount();
	$this->output( "Estimated page count: $estimateCount\n" );

	$processed = 0;
	$cursor = array_fill_keys( $indexFields, 0 );

	// page_id is always selected because it identifies the page to refresh.
	if ( in_array( 'page_id', $indexFields ) ) {
		$columns = $indexFields;
	} else {
		$columns = [ 'page_id', ...$indexFields ];
	}

	$isVerbose = $this->hasOption( 'verbose' );
	$replica = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	while ( true ) {
		$afterCursor = $replica->buildComparison( '>', $cursor );
		$rows = ( clone $builder )
			->select( $columns )
			->andWhere( [ $afterCursor ] )
			->orderBy( $indexFields )
			->caller( __METHOD__ )
			->fetchResultSet();
		$rowCount = $rows->numRows();

		if ( $isVerbose ) {
			$this->output( "Refreshing links for {$rowCount} pages\n" );
		}

		foreach ( $rows as $pageRow ) {
			$processed++;
			if ( $processed % self::REPORTING_INTERVAL === 0 ) {
				$this->output( "$processed\n" );
				$this->waitForReplication();
			}
			if ( $isVerbose ) {
				$this->output( "Refreshing links for page ID {$pageRow->page_id}\n" );
			}
			self::fixRedirect( $this, $pageRow->page_id );
			if ( !$redirectsOnly ) {
				self::fixLinksFromArticle( $pageRow->page_id );
			}
		}

		// Remember where this batch ended so the next one continues after it.
		if ( $rowCount ) {
			$rows->seek( $rowCount - 1 );
			$lastRow = $rows->current();
			foreach ( $indexFields as $indexField ) {
				$cursor[$indexField] = $lastRow->$indexField;
			}
		}

		if ( $rowCount != $this->getBatchSize() ) {
			break;
		}
	}
}
179
/**
 * Bring the redirect table entry and the page_is_redirect flag of one page
 * in line with the page's current raw content.
 *
 * Does nothing if no page with the given ID can be loaded.
 *
 * @param Maintenance $maint Script instance supplying services and the database connection
 * @param int $id The page ID to check
 */
public static function fixRedirect( Maintenance $maint, $id ) {
	$wikiPage = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );
	if ( $wikiPage === null ) {
		// The page may have been deleted in the meantime.
		return;
	}

	$rawContent = $wikiPage->getContent( RevisionRecord::RAW );
	$target = $rawContent !== null ? $rawContent->getRedirectTarget() : null;

	$primary = $maint->getDB( DB_PRIMARY );
	if ( $target !== null ) {
		$wikiPage->insertRedirectEntry( $target );
		$isRedirect = 1;
	} else {
		// Not a redirect: remove any redirect table entry for it.
		$primary->newDeleteQueryBuilder()
			->deleteFrom( 'redirect' )
			->where( [ 'rd_from' => $id ] )
			->caller( __METHOD__ )
			->execute();
		$isRedirect = 0;
	}

	// Update the page table to be sure it is in a consistent state.
	$primary->update(
		'page',
		[ 'page_is_redirect' => $isRedirect ],
		[ 'page_id' => $id ],
		__METHOD__
	);
}
226
/**
 * Run the secondary data updates of a single page and execute them
 * immediately.
 *
 * Does nothing if no page with the given ID can be loaded.
 *
 * @param int $id The page ID
 */
public static function fixLinksFromArticle( $id ) {
	$wikiPage = MediaWikiServices::getInstance()
		->getWikiPageFactory()
		->newFromID( $id );

	if ( $wikiPage === null ) {
		// The page may have been deleted in the meantime.
		return;
	}

	// Schedule the updates as post-send deferred updates and then flush the
	// deferred update queue right away; this is the simplest way to run all
	// updates immediately (including updates scheduled by other updates).
	$updateOptions = [
		'defer' => DeferredUpdates::POSTSEND,
		'causeAction' => 'refresh-links-maintenance',
		'recursive' => false,
	];
	$wikiPage->doSecondaryDataUpdates( $updateOptions );
	DeferredUpdates::doUpdates();
}
250
/**
 * Delete rows from the link tables whose source page ID has no row in the
 * page table.
 *
 * The requested ID range is walked in chunks. Chunk boundaries are found by
 * skipping $chunkSize existing page IDs, and each chunk is handed to
 * dfnCheckInterval().
 *
 * @param int|null $start First page ID to check, or null for no lower bound
 * @param int|null $end Last page ID to check, or null for no upper bound
 * @param int $batchSize Batch size passed on to dfnCheckInterval()
 * @param int $chunkSize Number of existing page IDs per chunk; must be at least 1
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100_000
) {
	// With a chunk size below 1 the boundary query (OFFSET 0) keeps returning
	// the current start ID, so the loop below would never advance.
	if ( $chunkSize < 1 ) {
		$this->fatalError( "The chunk size must be a positive integer.\n" );
	}

	$this->waitForReplication();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	do {
		// Find the start of the next chunk. This is based only
		// on existent page_ids.
		$nextStart = $dbr->newSelectQueryBuilder()
			->select( 'page_id' )
			->from( 'page' )
			->where( [ self::intervalCond( $dbr, 'page_id', $start, $end ) ] )
			->orderBy( 'page_id' )
			->offset( $chunkSize )
			->caller( __METHOD__ )->fetchField();

		if ( $nextStart !== false ) {
			// To find the end of the current chunk, subtract one.
			// This will serve to limit the number of rows scanned in
			// dfnCheckInterval(), per query, to at most the sum of
			// the chunk size and deletion batch size.
			$chunkEnd = $nextStart - 1;
		} else {
			// This is the last chunk. Check all page_ids up to $end.
			$chunkEnd = $end;
		}

		$fmtStart = $start !== null ? "[$start" : '(-INF';
		$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
		$this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
		$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

		$start = $nextStart;

	} while ( $nextStart !== false );
}
299
/**
 * For every link table, find source page IDs in the given interval that
 * have no matching page row and delete their rows, one batch at a time.
 *
 * @param int|null $start First page ID to check, or null for no lower bound
 * @param int|null $end Last page ID to check, or null for no upper bound
 * @param int $batchSize Maximum number of distinct page IDs handled per delete query
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	$primary = $this->getPrimaryDB();
	$replica = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// table name => field holding the source page ID
	$fromFieldByTable = [
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	foreach ( $fromFieldByTable as $tableName => $fromField ) {
		$this->output( "    $tableName: 0" );
		$tableStart = $start;
		$deletedTotal = 0;

		while ( true ) {
			// Left join against page and keep only the IDs without a page row.
			$orphanIds = $replica->newSelectQueryBuilder()
				->select( $fromField )
				->distinct()
				->from( $tableName )
				->leftJoin( 'page', null, "$fromField = page_id" )
				->where( self::intervalCond( $replica, $fromField, $tableStart, $end ) )
				->andWhere( [ 'page_id' => null ] )
				->orderBy( $fromField )
				->limit( $batchSize )
				->caller( __METHOD__ )
				->fetchFieldValues();

			$found = count( $orphanIds );
			if ( $found > 0 ) {
				$deletedTotal += $found;
				$primary->newDeleteQueryBuilder()
					->deleteFrom( $tableName )
					->where( [ $fromField => $orphanIds ] )
					->caller( __METHOD__ )
					->execute();
				$this->output( ", $deletedTotal" );
				// Continue right after the highest ID just handled.
				$tableStart = $orphanIds[$found - 1] + 1;
				$this->waitForReplication();
			}

			// A short batch means there is nothing left in this interval.
			if ( $found < $batchSize ) {
				break;
			}
			// Stop once the next start lies beyond the end of the interval.
			if ( !( $end === null || $tableStart <= $end ) ) {
				break;
			}
		}

		$this->output( " deleted.\n" );
	}
}
356
/**
 * Build a SQL condition restricting $var to the interval [$start, $end].
 *
 * A null bound leaves that side open; with both bounds null the condition
 * is "$var IS NOT NULL".
 *
 * @param IReadableDatabase $db Connection used to quote the bounds
 * @param string $var Field name
 * @param mixed $start Lower bound (inclusive), or null
 * @param mixed $end Upper bound (inclusive), or null
 * @return string SQL condition
 */
private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
	$hasStart = $start !== null;
	$hasEnd = $end !== null;

	if ( $hasStart && $hasEnd ) {
		$lower = $db->addQuotes( $start );
		$upper = $db->addQuotes( $end );
		return "$var BETWEEN " . $lower . ' AND ' . $upper;
	}
	if ( $hasStart ) {
		return "$var >= " . $db->addQuotes( $start );
	}
	if ( $hasEnd ) {
		return "$var <= " . $db->addQuotes( $end );
	}
	return "$var IS NOT NULL";
}
380
/**
 * Refresh the pages in every category that belongs to the given tracking
 * category key.
 *
 * @param SelectQueryBuilder $builder Base page query; a clone is used for each category
 * @param string $category Tracking category key
 */
private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ) {
	$targets = $this->getPossibleCategories( $category );

	if ( !$targets ) {
		// Reported on the error channel, but the script carries on; with
		// nothing to iterate over, the loop below is a no-op.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	foreach ( $targets as $target ) {
		$perCategoryBuilder = clone $builder;
		$this->refreshCategory( $perCategoryBuilder, $target );
	}
}
399
/**
 * Refresh all pages that are members of the given category.
 *
 * The query is joined against categorylinks and paged by
 * (cl_timestamp, cl_from).
 *
 * @param SelectQueryBuilder $builder Base page query; it is modified in place
 * @param LinkTarget $category The category whose member pages are refreshed
 */
private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ) {
	$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

	$builder->join( 'categorylinks', null, 'page_id=cl_from' );
	$builder->andWhere( [ 'cl_to' => $category->getDBkey() ] );

	$pagingFields = [ 'cl_timestamp', 'cl_from' ];
	$this->doRefreshLinks( $builder, false, $pagingFields );
}
413
/**
 * Look up the 'cats' entry registered for a tracking category key.
 *
 * Terminates the script with an error if the key is unknown.
 *
 * @param string $categoryKey Tracking category key
 * @return mixed The 'cats' value stored for the key
 */
private function getPossibleCategories( $categoryKey ) {
	$registry = $this->getServiceContainer()->getTrackingCategories()->getTrackingCategories();
	$entry = $registry[$categoryKey] ?? null;
	if ( $entry !== null ) {
		return $entry['cats'];
	}
	$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
}
427}
428
// Record RefreshLinks as the maintenance class, then load the file named by
// the RUN_MAINTENANCE_IF_MAIN constant.
429$maintClass = RefreshLinks::class;
430require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
const NS_CATEGORY
Definition Defines.php:78
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
error( $err, $die=0)
Throw an error to the user.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
getArg( $argId=0, $default=null)
Get an argument.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Defer callable updates to run later in the PHP process.
Service locator for MediaWiki core services.
Page revision base class.
Represents a title within MediaWiki.
Definition Title.php:78
join( $table, $alias=null, $conds=[])
Inner join a table or group of tables.
Build SELECT queries with a fluent interface.
estimateRowCount()
Estimate the number of rows in dataset.
andWhere( $conds)
Add conditions to the query.
from( $table, $alias=null)
Add a single table to the SELECT query.
Represents the target of a wiki link.
getDBkey()
Get the main part of the link target, in canonical database form.
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
A database connection without write operations.
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28