MediaWiki  master
refreshLinks.php
Go to the documentation of this file.
1 <?php
26 
27 require_once __DIR__ . '/Maintenance.php';
28 
34 class RefreshLinks extends Maintenance {
35  const REPORTING_INTERVAL = 100;
36 
38  protected $namespace = false;
39 
/**
 * Register the script description, its command-line options, the optional
 * positional "start" argument and the default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Refresh link tables' );

	// Plain switches: they take no value.
	$switches = [
		'dfn-only' => 'Delete links from nonexistent articles only',
		'new-only' => 'Only affect articles with just a single edit',
		'redirects-only' => 'Only fix redirects, not all links',
		'old-redirects-only' => 'Only fix redirects with no redirect table entry',
	];
	foreach ( $switches as $name => $description ) {
		$this->addOption( $name, $description );
	}

	// Optional options that expect a value on the command line.
	$valueOptions = [
		'e' => 'Last page id to refresh',
		'dfn-chunk-size' => 'Maximum number of existent IDs to check per ' .
			'query, default 100000',
		'namespace' => 'Only fix pages in this namespace',
		'category' => 'Only fix pages in this category',
		'tracking-category' => 'Only fix pages in this tracking category',
	];
	foreach ( $valueOptions as $name => $description ) {
		$this->addOption( $name, $description, false, true );
	}

	$this->addArg( 'start', 'Page_id to start from, default 1', false );
	$this->setBatchSize( 100 );
}
56 
/**
 * Entry point: read the command-line options and dispatch to the matching
 * refresh mode (category, tracking category, "dfn-only" cleanup, or the
 * regular refresh followed by a cleanup).
 */
public function execute() {
	// Leaving the start/end IDs unspecified is not the same as passing the
	// minimum and maximum page IDs: with explicit bounds,
	// deleteLinksFromNonexistent() will not delete entries for nonexistent
	// IDs that fall outside the range.
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );

	$ns = $this->getOption( 'namespace' );
	$this->namespace = $ns === null ? false : (int)$ns;

	// --category takes precedence over every other mode.
	$category = $this->getOption( 'category', false );
	if ( $category !== false ) {
		$title = Title::makeTitleSafe( NS_CATEGORY, $category );
		if ( !$title ) {
			$this->fatalError( "'$category' is an invalid category name!\n" );
		}
		$this->refreshCategory( $title );

		return;
	}

	$trackingCategory = $this->getOption( 'tracking-category', false );
	if ( $trackingCategory !== false ) {
		$this->refreshTrackingCategory( $trackingCategory );

		return;
	}

	if ( $this->hasOption( 'dfn-only' ) ) {
		// Cleanup only, restricted to the requested ID range.
		$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );

		return;
	}

	$newOnly = $this->hasOption( 'new-only' );
	$redirectsOnly = $this->hasOption( 'redirects-only' );
	$oldRedirectsOnly = $this->hasOption( 'old-redirects-only' );
	$this->doRefreshLinks( $start, $newOnly, $end, $redirectsOnly, $oldRedirectsOnly );
	// The batch size is read only now, after doRefreshLinks() has run its init hook.
	$this->deleteLinksFromNonexistent( null, null, $this->getBatchSize(), $dfnChunkSize );
}
89 
/**
 * Build the query condition restricting results to the selected namespace.
 *
 * @return array [ 'page_namespace' => int ] when a namespace was selected,
 *  otherwise an empty array
 */
private function namespaceCond() {
	if ( $this->namespace === false ) {
		return [];
	}

	return [ 'page_namespace' => $this->namespace ];
}
95 
/**
 * Refresh redirect and/or links table data for a range of page IDs.
 *
 * Three modes are supported: only redirects that lack a redirect table row,
 * only pages flagged as new, or every page ID from $start to $end.
 *
 * @param int|null $start First page_id to process; null means 1
 * @param bool $newOnly Only process pages with page_is_new = 1
 * @param int|null $end Last page_id to process; when empty in full-range mode,
 *  the highest ID found in the page and redirect tables is used
 * @param bool $redirectsOnly Only fix redirects, not all links
 * @param bool $oldRedirectsOnly Only fix pages marked as redirects that have
 *  no redirect table entry
 */
private function doRefreshLinks( $start, $newOnly = false,
	$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
) {
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	$start = $start === null ? 1 : $start;

	// Give extensions a chance to optimize settings
	Hooks::run( 'MaintenanceRefreshLinksInit', [ $this ] );

	if ( $oldRedirectsOnly ) {
		// Pages flagged as redirects for which the LEFT JOIN finds no redirect row.
		$conds = [
			"page_is_redirect=1",
			"rd_from IS NULL",
			self::intervalCond( $dbr, 'page_id', $start, $end ),
		];
		$conds += $this->namespaceCond();

		$rows = $dbr->select(
			[ 'page', 'redirect' ],
			'page_id',
			$conds,
			__METHOD__,
			[],
			[ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
		);
		$total = $rows->numRows();
		$this->output( "Refreshing $total old redirects from $start...\n" );

		$done = 0;
		foreach ( $rows as $row ) {
			$done++;
			if ( $done % self::REPORTING_INTERVAL === 0 ) {
				// Periodic progress report, then let the replicas catch up.
				$this->output( "$done\n" );
				wfWaitForSlaves();
			}
			$this->fixRedirect( $row->page_id );
		}

		return;
	}

	if ( $newOnly ) {
		$what = $redirectsOnly ? "redirects" : "links";
		$this->output( "Refreshing $what from " );

		$conds = [
			'page_is_new' => 1,
			self::intervalCond( $dbr, 'page_id', $start, $end ),
		];
		$conds += $this->namespaceCond();

		$rows = $dbr->select( 'page', [ 'page_id' ], $conds, __METHOD__ );
		$total = $rows->numRows();
		$this->output( "$total new articles...\n" );

		$done = 0;
		foreach ( $rows as $row ) {
			$done++;
			if ( $done % self::REPORTING_INTERVAL === 0 ) {
				$this->output( "$done\n" );
				wfWaitForSlaves();
			}
			if ( $redirectsOnly ) {
				$this->fixRedirect( $row->page_id );
			} else {
				self::fixLinksFromArticle( $row->page_id, $this->namespace );
			}
		}

		return;
	}

	// Full-range mode: walk every ID, whether or not a page exists for it.
	if ( !$end ) {
		$end = max(
			$dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ ),
			$dbr->selectField( 'redirect', 'max(rd_from)', '', __METHOD__ )
		);
	}

	// First pass: redirect table.
	$this->output( "Refreshing redirects table.\n" );
	$this->output( "Starting from page_id $start of $end.\n" );
	for ( $pageId = $start; $pageId <= $end; $pageId++ ) {
		if ( $pageId % self::REPORTING_INTERVAL === 0 ) {
			$this->output( "$pageId\n" );
			wfWaitForSlaves();
		}
		$this->fixRedirect( $pageId );
	}

	if ( $redirectsOnly ) {
		return;
	}

	// Second pass: links tables.
	$this->output( "Refreshing links tables.\n" );
	$this->output( "Starting from page_id $start of $end.\n" );
	for ( $pageId = $start; $pageId <= $end; $pageId++ ) {
		if ( $pageId % self::REPORTING_INTERVAL === 0 ) {
			$this->output( "$pageId\n" );
			wfWaitForSlaves();
		}
		self::fixLinksFromArticle( $pageId, $this->namespace );
	}
}
203 
/**
 * Bring the redirect table row and the page_is_redirect flag of one page in
 * line with the page's current content.
 *
 * If no page exists for the ID, any redirect table row for it is deleted.
 * Pages outside the selected namespace are left untouched.
 *
 * @param int $id The page ID to check
 */
private function fixRedirect( $id ) {
	$page = WikiPage::newFromID( $id );
	$dbw = $this->getDB( DB_MASTER );

	if ( $page === null ) {
		// No such page (any more): drop whatever redirect row is left for it.
		$dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );

		return;
	}

	$wrongNamespace = $this->namespace !== false
		&& !$page->getTitle()->inNamespace( $this->namespace );
	if ( $wrongNamespace ) {
		return;
	}

	$content = $page->getContent( Revision::RAW );
	$target = $content !== null ? $content->getUltimateRedirectTarget() : null;

	if ( $target !== null ) {
		$page->insertRedirectEntry( $target );
		$isRedirect = 1;
	} else {
		// Not a redirect: make sure no redirect row remains for it.
		$dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
		$isRedirect = 0;
	}

	// Keep the page table flag in a consistent state with the redirect table.
	$dbw->update(
		'page',
		[ 'page_is_redirect' => $isRedirect ],
		[ 'page_id' => $id ],
		__METHOD__
	);
}
253 
/**
 * Run the secondary data updates (links tables) for a single page.
 *
 * Does nothing when no page exists for the ID, or when $ns is given and the
 * page is not in that namespace. The link cache is cleared on every call.
 *
 * @param int $id The page ID
 * @param int|bool $ns Only act on the page if it is in this namespace;
 *  false to accept any namespace
 */
public static function fixLinksFromArticle( $id, $ns = false ) {
	$page = WikiPage::newFromID( $id );

	MediaWikiServices::getInstance()->getLinkCache()->clear();

	if ( $page === null ) {
		return;
	} elseif ( $ns !== false
		&& !$page->getTitle()->inNamespace( $ns ) ) {
		return;
	}

	// Defer updates to post-send but then immediately execute deferred updates;
	// this is the simplest way to run all updates immediately (including updates
	// scheduled by other updates).
	$page->doSecondaryDataUpdates( [
		'defer' => DeferredUpdates::POSTSEND,
		'recursive' => false,
	] );
	// Flush the queue now: without this call the updates above are only
	// scheduled, which contradicts the stated intent of running them at once.
	DeferredUpdates::doUpdates();
}
280 
/**
 * Remove links table entries whose source page no longer exists, working
 * through the page_id range in chunks.
 *
 * Chunk boundaries are derived from existing page IDs only; each chunk is
 * handed to dfnCheckInterval().
 *
 * @param int|null $start Page_id to start from; null for no lower bound
 * @param int|null $end Page_id to stop at; null for no upper bound
 * @param int $batchSize Maximum number of IDs deleted per query
 * @param int $chunkSize Maximum number of existent page IDs covered by one chunk
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100000
) {
	wfWaitForSlaves();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	while ( true ) {
		// The existing page_id found $chunkSize rows further on marks the
		// beginning of the next chunk.
		$conds = [ self::intervalCond( $dbr, 'page_id', $start, $end ) ];
		$conds += $this->namespaceCond();
		$nextStart = $dbr->selectField(
			'page',
			'page_id',
			$conds,
			__METHOD__,
			[ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
		);

		// The current chunk ends one ID before the next chunk begins, which
		// bounds the rows dfnCheckInterval() scans per query to at most the
		// chunk size plus the deletion batch size. When there is no further
		// chunk, check everything up to $end.
		$chunkEnd = $nextStart === false ? $end : $nextStart - 1;

		$fmtStart = $start === null ? '(-INF' : "[$start";
		$fmtChunkEnd = $chunkEnd === null ? 'INF)' : "$chunkEnd]";
		$this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
		$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

		if ( $nextStart === false ) {
			break;
		}
		$start = $nextStart;
	}
}
330 
/**
 * For every links-style table, delete the rows in the given ID interval whose
 * source page ID is not present in the page table.
 *
 * @see RefreshLinks::deleteLinksFromNonexistent()
 * @param int|null $start Page_id to start from; null for no lower bound
 * @param int|null $end Page_id to stop at; null for no upper bound
 * @param int $batchSize Maximum number of distinct IDs deleted per query
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	$dbw = $this->getDB( DB_MASTER );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// table name => field holding the source page ID
	$linksTables = [
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	// Subquery listing every existing page ID; its text is the same for all tables.
	$existingPageIds = $dbr->selectSQLText( 'page', 'page_id' );

	foreach ( $linksTables as $table => $field ) {
		$this->output( " $table: 0" );
		$cursor = $start;
		$deleted = 0;

		do {
			$orphanIds = $dbr->selectFieldValues(
				$table,
				$field,
				[
					self::intervalCond( $dbr, $field, $cursor, $end ),
					"$field NOT IN ({$existingPageIds})",
				],
				__METHOD__,
				[ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
			);

			$found = count( $orphanIds );
			if ( $found ) {
				$deleted += $found;
				$dbw->delete( $table, [ $field => $orphanIds ], __METHOD__ );
				$this->output( ", $deleted" );
				// Resume just past the highest ID handled in this batch.
				$cursor = $orphanIds[$found - 1] + 1;
				wfWaitForSlaves();
			}

			// A full batch means more orphans may remain within the interval.
		} while ( $found >= $batchSize && ( $end === null || $cursor <= $end ) );

		$this->output( " deleted.\n" );
	}
}
383 
/**
 * Build an SQL condition limiting a field to the closed interval
 * [$start, $end]. A null bound leaves that side open; with both bounds null
 * the condition only requires the field to be non-NULL.
 *
 * @param IDatabase $db Database handle used to quote the bounds
 * @param string $var Field name
 * @param mixed $start Lower bound, or null for none
 * @param mixed $end Upper bound, or null for none
 * @return string SQL condition
 */
private static function intervalCond( IDatabase $db, $var, $start, $end ) {
	$hasStart = $start !== null;
	$hasEnd = $end !== null;

	if ( $hasStart && $hasEnd ) {
		return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
	}
	if ( $hasStart ) {
		return "$var >= {$db->addQuotes( $start )}";
	}
	if ( $hasEnd ) {
		return "$var <= {$db->addQuotes( $end )}";
	}

	return "$var IS NOT NULL";
}
407 
/**
 * Refresh the links of all pages in every category that the given tracking
 * category key resolves to.
 *
 * @param string $category Tracking category key
 */
private function refreshTrackingCategory( $category ) {
	$categoryTitles = $this->getPossibleCategories( $category );

	if ( !$categoryTitles ) {
		// Report on stderr only; with nothing to iterate the loop below is a no-op.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	foreach ( $categoryTitles as $categoryTitle ) {
		$this->refreshCategory( $categoryTitle );
	}
}
425 
/**
 * Refresh the links of every page that is a member of the given category,
 * fetching the members in batches ordered by (cl_timestamp, cl_from).
 *
 * @param Title $category The category whose member pages are refreshed
 */
private function refreshCategory( Title $category ) {
	$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

	$dbr = $this->getDB( DB_REPLICA );
	$baseConds = [
		'page_id=cl_from',
		'cl_to' => $category->getDBkey(),
	];
	// Adds the page_namespace restriction when a namespace was selected.
	$baseConds += $this->namespaceCond();

	$processed = 0;
	$timestamp = '';
	$lastId = 0;
	do {
		// Continue strictly after the last (cl_timestamp, cl_from) pair seen.
		$quotedTimestamp = $dbr->addQuotes( $timestamp );
		$batchConds = $baseConds;
		$batchConds[] = "(cl_timestamp > $quotedTimestamp " .
			"OR (cl_timestamp = $quotedTimestamp AND cl_from > $lastId))";

		$rows = $dbr->select(
			[ 'page', 'categorylinks' ],
			[ 'page_id', 'cl_timestamp' ],
			$batchConds,
			__METHOD__,
			[
				'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
				'LIMIT' => $this->getBatchSize(),
			]
		);

		foreach ( $rows as $row ) {
			$processed++;
			if ( $processed % self::REPORTING_INTERVAL === 0 ) {
				$this->output( "$processed\n" );
				wfWaitForSlaves();
			}
			$lastId = $row->page_id;
			$timestamp = $row->cl_timestamp;
			self::fixLinksFromArticle( $row->page_id );
		}

		// A full batch means there may be more members to fetch.
	} while ( $rows->numRows() == $this->getBatchSize() );
}
473 
/**
 * Look up the categories registered under a tracking category key.
 *
 * Calls fatalError() when the key is unknown.
 *
 * @param string $categoryKey Tracking category key
 * @return Title[] The 'cats' entry of the matching tracking category
 */
private function getPossibleCategories( $categoryKey ) {
	$tracker = new TrackingCategories( $this->getConfig() );
	$known = $tracker->getTrackingCategories();

	if ( !isset( $known[$categoryKey] ) ) {
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}

	return $known[$categoryKey]['cats'];
}
488 }
489 
// Name the class to run before handing control to the maintenance runner.
// NOTE(review): the runner included via RUN_MAINTENANCE_IF_MAIN is expected to
// read $maintClass; confirm against the Maintenance documentation.
$maintClass = RefreshLinks::class;
require_once RUN_MAINTENANCE_IF_MAIN;
getArg( $argId=0, $default=null)
Get an argument.
error( $err, $die=0)
Throw an error to the user.
processing should stop and the error should be shown to the user * false
Definition: hooks.txt:187
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
getOption( $name, $default=null)
Get an option, or return the default.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: maintenance.txt:39
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
setBatchSize( $s=0)
Set the batch size.
hasOption( $name)
Checks to see if a particular option exists.
require_once RUN_MAINTENANCE_IF_MAIN
Definition: maintenance.txt:50
const DB_MASTER
Definition: defines.php:26
getDBkey()
Get the main part with underscores.
Definition: Title.php:1019
$res
Definition: database.txt:21
wfWaitForSlaves( $ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
addDescription( $text)
Set the description text.
const NS_CATEGORY
Definition: Defines.php:74
const REPORTING_INTERVAL
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:767
addArg( $arg, $description, $required=true)
Add some args that are needed.
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:912
getDB( $db, $groups=[], $wiki=false)
Returns a database to be used by current maintenance script.
output( $out, $channel=null)
Throw some output to the user.
const RAW
Definition: Revision.php:56
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:618
static newFromID( $id, $from='fromdb')
Constructor from a page id.
Definition: WikiPage.php:176
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
static doUpdates( $mode='run', $stage=self::ALL)
Do any deferred updates and clear the list.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
you have access to all of the normal MediaWiki so you can get a DB use the etc For full docs on the Maintenance class
Definition: maintenance.txt:52
getBatchSize()
Returns batch size.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
const DB_REPLICA
Definition: defines.php:25
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
$content
Definition: pageupdater.txt:72
This class performs some operations related to tracking categories, such as creating a list of all such categories.
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:200