MediaWiki  master
refreshLinks.php
Go to the documentation of this file.
1 <?php
27 
28 require_once __DIR__ . '/Maintenance.php';
29 
/**
 * Maintenance script that refreshes the link tables.
 *
 * Depending on the options given it re-runs the redirect and secondary data
 * updates for a range of pages (optionally restricted by namespace, category,
 * tracking category or timestamp) and removes link table rows whose source
 * page no longer exists.
 */
class RefreshLinks extends Maintenance {
	/** Number of processed pages between progress reports and replication waits */
	private const REPORTING_INTERVAL = 100;

	/**
	 * Declare the script description, its options, the optional "start"
	 * argument and the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Refresh link tables' );
		$this->addOption( 'verbose', 'Output information about link refresh progress', false, false, 'v' );
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'touched-only', 'Only fix pages that have been touched after last update' );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
			'query, default 100000', false, true );
		$this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
		$this->addOption( 'category', 'Only fix pages in this category', false, true );
		$this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
		$this->addOption( 'before-timestamp', 'Only fix pages that were last updated before this timestamp',
			false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Entry point: build the base page query from the command line options
	 * and dispatch to the matching refresh mode.
	 */
	public function execute() {
		// Note that there is a difference between not specifying the start
		// and end IDs and using the minimum and maximum values from the page
		// table. In the latter case, deleteLinksFromNonexistent() will not
		// delete entries for nonexistent IDs that fall outside the range.
		$start = (int)$this->getArg( 0 ) ?: null;
		$end = (int)$this->getOption( 'e' ) ?: null;
		$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );

		if ( $this->hasOption( 'dfn-only' ) ) {
			// Only clean up rows from nonexistent pages; no refreshing at all.
			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
			return;
		}

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$builder = $dbr->newSelectQueryBuilder()
			->from( 'page' )
			->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
			->limit( $this->getBatchSize() );

		if ( $this->hasOption( 'namespace' ) ) {
			$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
		}

		if ( $this->hasOption( 'before-timestamp' ) ) {
			// Pages that never had their links updated (NULL) are included as well.
			$timeCond = $dbr->buildComparison( '<', [
				'page_links_updated' => $this->getOption( 'before-timestamp' )
			] );
			$builder->andWhere( [ "$timeCond OR page_links_updated IS NULL" ] );
		}

		if ( $this->hasOption( 'category' ) ) {
			$category = $this->getOption( 'category' );
			$title = Title::makeTitleSafe( NS_CATEGORY, $category );
			if ( !$title ) {
				$this->fatalError( "'$category' is an invalid category name!\n" );
			}
			$this->refreshCategory( $builder, $title );
		} elseif ( $this->hasOption( 'tracking-category' ) ) {
			// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
			$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
		} else {
			$new = $this->hasOption( 'new-only' );
			$redir = $this->hasOption( 'redirects-only' );
			$oldRedir = $this->hasOption( 'old-redirects-only' );
			$touched = $this->hasOption( 'touched-only' );
			$what = $redir ? 'redirects' : 'links';
			if ( $oldRedir ) {
				// Redirect pages that have no row in the redirect table.
				$builder->leftJoin( 'redirect', null, 'page_id=rd_from' )
					->andWhere( [
						'page_is_redirect' => 1,
						'rd_from' => null,
					] );
				$this->output( "Refreshing old redirects from $start...\n" );
			} elseif ( $new ) {
				$builder->andWhere( [ 'page_is_new' => 1 ] );
				$this->output( "Refreshing $what from new pages...\n" );
			} else {
				if ( $touched ) {
					$builder->andWhere( [
						'page_touched > page_links_updated OR page_links_updated IS NULL',
					] );
				}
				$this->output( "Refreshing $what from pages...\n" );
			}
			$this->doRefreshLinks( $builder, $redir || $oldRedir );
			if ( !$this->hasOption( 'namespace' ) ) {
				$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
			}
		}
	}

	/**
	 * Do the actual link refreshing, paging through the query in batches.
	 *
	 * The builder is cloned for every batch and restricted to rows whose
	 * index fields compare greater than those of the last row of the
	 * previous batch.
	 *
	 * @param SelectQueryBuilder $builder Base query selecting the pages to process
	 * @param bool $redirectsOnly Only fix redirects, skip the full links update
	 * @param array $indexFields Fields used for ordering and batch continuation
	 */
	private function doRefreshLinks(
		SelectQueryBuilder $builder,
		bool $redirectsOnly = false,
		array $indexFields = [ 'page_id' ]
	) {
		// Give extensions a chance to optimize settings
		$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

		$estimateCount = $builder->estimateRowCount();
		$this->output( "Estimated page count: $estimateCount\n" );

		$i = 0;
		$lastIndexes = array_fill_keys( $indexFields, 0 );
		// page_id is always needed to process a row, even when it is not an index field.
		$selectFields = in_array( 'page_id', $indexFields, true )
			? $indexFields : [ 'page_id', ...$indexFields ];
		$verbose = $this->hasOption( 'verbose' );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		do {
			$batchCond = $dbr->buildComparison( '>', $lastIndexes );
			$res = ( clone $builder )->select( $selectFields )
				->andWhere( [ $batchCond ] )
				->orderBy( $indexFields )
				->caller( __METHOD__ )->fetchResultSet();

			if ( $verbose ) {
				$this->output( "Refreshing links for {$res->numRows()} pages\n" );
			}

			foreach ( $res as $row ) {
				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
					$this->output( "$i\n" );
					$this->waitForReplication();
				}
				if ( $verbose ) {
					$this->output( "Refreshing links for page ID {$row->page_id}\n" );
				}
				self::fixRedirect( $this, $row->page_id );
				if ( !$redirectsOnly ) {
					self::fixLinksFromArticle( $row->page_id );
				}
			}
			if ( $res->numRows() ) {
				// Remember the index values of the last row as the continuation point.
				$res->seek( $res->numRows() - 1 );
				foreach ( $indexFields as $field ) {
					$lastIndexes[$field] = $res->current()->$field;
				}
			}

			// A short batch means there are no more rows to fetch.
		} while ( $res->numRows() == $this->getBatchSize() );
	}

	/**
	 * Update the redirect table entry and the page_is_redirect flag of a page.
	 *
	 * The redirect target is taken from the page's current content rather
	 * than from the existing "redirect" table row.
	 *
	 * @param Maintenance $maint Script providing the service container and DB handles
	 * @param int $id The page ID to check
	 */
	public static function fixRedirect( Maintenance $maint, $id ) {
		$page = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );

		// In case the page just got deleted.
		if ( $page === null ) {
			return;
		}

		$rt = null;
		$content = $page->getContent( RevisionRecord::RAW );
		if ( $content !== null ) {
			$rt = $content->getRedirectTarget();
		}

		$dbw = $maint->getDB( DB_PRIMARY );
		if ( $rt === null ) {
			// The page is not a redirect
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
			$fieldValue = 0;
		} else {
			$page->insertRedirectEntry( $rt );
			$fieldValue = 1;
		}

		// Update the page table to be sure it is in a consistent state
		$dbw->update( 'page', [ 'page_is_redirect' => $fieldValue ],
			[ 'page_id' => $id ], __METHOD__ );
	}

	/**
	 * Run the (non-recursive) secondary data updates for the given page.
	 *
	 * @param int $id The page ID to refresh
	 */
	public static function fixLinksFromArticle( $id ) {
		$services = MediaWikiServices::getInstance();
		$page = $services->getWikiPageFactory()->newFromID( $id );

		// In case the page just got deleted.
		if ( $page === null ) {
			return;
		}

		// Defer updates to post-send but then immediately execute deferred updates;
		// this is the simplest way to run all updates immediately (including updates
		// scheduled by other updates).
		$page->doSecondaryDataUpdates( [
			'defer' => DeferredUpdates::POSTSEND,
			'causeAction' => 'refresh-links-maintenance',
			'recursive' => false,
		] );
		// Without this call the updates scheduled above would only be queued
		// and never executed by this method.
		DeferredUpdates::doUpdates();
	}

	/**
	 * Remove link table rows that refer to nonexistent source pages.
	 *
	 * The page_id range is walked in chunks containing at most $chunkSize
	 * existing pages; each chunk is handed to dfnCheckInterval().
	 *
	 * @param int|null $start First page_id to check (inclusive), or null for no lower bound
	 * @param int|null $end Last page_id to check (inclusive), or null for no upper bound
	 * @param int $batchSize Maximum number of IDs per select/delete batch
	 * @param int $chunkSize Maximum number of existent page IDs per chunk
	 */
	private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
		$chunkSize = 100000
	) {
		$this->waitForReplication();
		$this->output( "Deleting illegal entries from the links tables...\n" );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		do {
			// Find the start of the next chunk. This is based only
			// on existent page_ids.
			$nextStart = $dbr->newSelectQueryBuilder()
				->select( 'page_id' )
				->from( 'page' )
				->where( [ self::intervalCond( $dbr, 'page_id', $start, $end ) ] )
				->orderBy( 'page_id' )
				->offset( $chunkSize )
				->caller( __METHOD__ )->fetchField();

			if ( $nextStart !== false ) {
				// To find the end of the current chunk, subtract one.
				// This will serve to limit the number of rows scanned in
				// dfnCheckInterval(), per query, to at most the sum of
				// the chunk size and deletion batch size.
				$chunkEnd = $nextStart - 1;
			} else {
				// This is the last chunk. Check all page_ids up to $end.
				$chunkEnd = $end;
			}

			$fmtStart = $start !== null ? "[$start" : '(-INF';
			$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
			$this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
			$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

			$start = $nextStart;

		} while ( $nextStart !== false );
	}

	/**
	 * Delete, from every links table, the rows in the given interval whose
	 * source page ID has no matching row in the page table.
	 *
	 * @see RefreshLinks::deleteLinksFromNonexistent()
	 * @param int|null $start First page_id to check (inclusive), or null for no lower bound
	 * @param int|null $end Last page_id to check (inclusive), or null for no upper bound
	 * @param int $batchSize Maximum number of IDs per select/delete batch
	 */
	private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
		$dbw = $this->getDB( DB_PRIMARY );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		$linksTables = [
			// table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
			'iwlinks' => 'iwl_from',
			'langlinks' => 'll_from',
			'redirect' => 'rd_from',
			'page_props' => 'pp_page',
		];

		foreach ( $linksTables as $table => $field ) {
			$this->output( "    $table: 0" );
			$tableStart = $start;
			$counter = 0;
			do {
				// IDs present in the links table but absent from the page table.
				$ids = $dbr->newSelectQueryBuilder()
					->select( $field )
					->distinct()
					->from( $table )
					->leftJoin( 'page', null, "$field = page_id" )
					->where( self::intervalCond( $dbr, $field, $tableStart, $end ) )
					->andWhere( [ 'page_id' => null ] )
					->orderBy( $field )
					->limit( $batchSize )
					->caller( __METHOD__ )->fetchFieldValues();

				$numIds = count( $ids );
				if ( $numIds ) {
					$counter += $numIds;
					$dbw->delete( $table, [ $field => $ids ], __METHOD__ );
					$this->output( ", $counter" );
					// Continue right after the highest ID handled in this batch.
					$tableStart = $ids[$numIds - 1] + 1;
					$this->waitForReplication();
				}

			} while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );

			$this->output( " deleted.\n" );
		}
	}

	/**
	 * Build a SQL condition restricting $var to the inclusive interval
	 * [$start, $end]; a null bound leaves that side open.
	 *
	 * With both bounds null the condition is "$var IS NOT NULL".
	 *
	 * @param IReadableDatabase $db Database handle used to quote the bounds
	 * @param string $var Field name
	 * @param mixed $start First value to include or null
	 * @param mixed $end Last value to include or null
	 * @return string SQL condition
	 */
	private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
		if ( $start === null && $end === null ) {
			return "$var IS NOT NULL";
		} elseif ( $end === null ) {
			return "$var >= " . $db->addQuotes( $start );
		} elseif ( $start === null ) {
			return "$var <= " . $db->addQuotes( $end );
		} else {
			return "$var BETWEEN " . $db->addQuotes( $start ) . ' AND ' . $db->addQuotes( $end );
		}
	}

	/**
	 * Refresh the pages in every category registered for a tracking
	 * category key.
	 *
	 * @param SelectQueryBuilder $builder Base page query; cloned per category
	 * @param string $category Tracking category key
	 */
	private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ) {
		$cats = $this->getPossibleCategories( $category );

		if ( !$cats ) {
			$this->error( "Tracking category '$category' is disabled\n" );
			// Output to stderr but don't bail out.
		}

		foreach ( $cats as $cat ) {
			$this->refreshCategory( clone $builder, $cat );
		}
	}

	/**
	 * Refresh links for all pages that are members of the given category.
	 *
	 * @param SelectQueryBuilder $builder Base page query; a categorylinks join is added to it
	 * @param LinkTarget $category Category whose member pages are refreshed
	 */
	private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ) {
		$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

		$builder->join( 'categorylinks', null, 'page_id=cl_from' )
			->andWhere( [ 'cl_to' => $category->getDBkey() ] );
		$this->doRefreshLinks( $builder, false, [ 'cl_timestamp', 'cl_from' ] );
	}

	/**
	 * Look up the categories registered under a tracking category key.
	 *
	 * Terminates the script via fatalError() when the key is unknown.
	 *
	 * @param string $categoryKey Tracking category key
	 * @return array The 'cats' entry for the key; presumably a list of
	 *  LinkTarget objects, since the caller passes each element to refreshCategory()
	 */
	private function getPossibleCategories( $categoryKey ) {
		$cats = $this->getServiceContainer()->getTrackingCategories()->getTrackingCategories();
		if ( isset( $cats[$categoryKey] ) ) {
			return $cats[$categoryKey]['cats'];
		}
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}
}
430 
// Name this script's class in $maintClass, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (a constant defined outside this file).
// NOTE(review): presumably that file reads $maintClass and runs the script
// when this file is the entry point — confirm against Maintenance.php.
431 $maintClass = RefreshLinks::class;
432 require_once RUN_MAINTENANCE_IF_MAIN;
const NS_CATEGORY
Definition: Defines.php:78
static doUpdates( $stage=self::ALL)
Consume and execute all pending updates.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:66
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
error( $err, $die=0)
Throw an error to the user.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
getHookRunner()
Get a HookRunner for running core hooks.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
getArg( $argId=0, $default=null)
Get an argument.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Service locator for MediaWiki core services.
Page revision base class.
Represents a title within MediaWiki.
Definition: Title.php:76
join( $table, $alias=null, $conds=[])
Inner join a table or group of tables.
leftJoin( $table, $alias=null, $conds=[])
Left join a table or group of tables.
Build SELECT queries with a fluent interface.
estimateRowCount()
Estimate the number of rows in dataset.
andWhere( $conds)
Add conditions to the query.
from( $table, $alias=null)
Add a single table to the SELECT query.
Represents the target of a wiki link.
Definition: LinkTarget.php:30
getDBkey()
Get the main part of the link target, in canonical database form.
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
A database connection without write operations.
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28
$content
Definition: router.php:76