MediaWiki REL1_30
refreshLinks.php
Go to the documentation of this file.
1<?php
25
26require_once __DIR__ . '/Maintenance.php';
27
/** Number of processed items between two progress lines printed by the batch loops. */
34 const REPORTING_INTERVAL = 100;
35
/** @var int|bool Namespace number to restrict processing to, or false for no namespace restriction. */
37 protected $namespace = false;
38
/**
 * Declare the script description, every supported command-line option,
 * the optional "start" argument, and a default batch size of 100.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Refresh link tables' );

	// Plain switches: present or absent, no value attached.
	$switches = [
		'dfn-only' => 'Delete links from nonexistent articles only',
		'new-only' => 'Only affect articles with just a single edit',
		'redirects-only' => 'Only fix redirects, not all links',
		'old-redirects-only' => 'Only fix redirects with no redirect table entry',
	];
	foreach ( $switches as $switchName => $helpText ) {
		$this->addOption( $switchName, $helpText );
	}

	// Optional settings that carry a value (required = false, withArg = true).
	$valueOptions = [
		'e' => 'Last page id to refresh',
		'dfn-chunk-size' => 'Maximum number of existent IDs to check per ' .
			'query, default 100000',
		'namespace' => 'Only fix pages in this namespace',
		'category' => 'Only fix pages in this category',
		'tracking-category' => 'Only fix pages in this tracking category',
	];
	foreach ( $valueOptions as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}

	$this->addArg( 'start', 'Page_id to start from, default 1', false );
	$this->setBatchSize( 100 );
}
55
/**
 * Script entry point: read the command-line settings and dispatch to the
 * matching refresh mode.
 *
 * Modes, in order of precedence: --category, --tracking-category,
 * --dfn-only, and finally the regular refresh followed by a cleanup of
 * link rows that belong to nonexistent pages.
 */
public function execute() {
	// Note that there is a difference between not specifying the start
	// and end IDs and using the minimum and maximum values from the page
	// table. In the latter case, deleteLinksFromNonexistent() will not
	// delete entries for nonexistent IDs that fall outside the range.
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );

	// false means "no namespace restriction"
	$nsOption = $this->getOption( 'namespace' );
	$this->namespace = ( $nsOption === null ) ? false : (int)$nsOption;

	$categoryName = $this->getOption( 'category', false );
	if ( $categoryName !== false ) {
		$categoryTitle = Title::makeTitleSafe( NS_CATEGORY, $categoryName );
		if ( !$categoryTitle ) {
			$this->error( "'$categoryName' is an invalid category name!\n", true );
		}
		$this->refreshCategory( $categoryTitle );

		return;
	}

	// Only looked up once the --category mode has been ruled out
	$trackingKey = $this->getOption( 'tracking-category', false );
	if ( $trackingKey !== false ) {
		$this->refreshTrackingCategory( $trackingKey );

		return;
	}

	if ( $this->hasOption( 'dfn-only' ) ) {
		$this->deleteLinksFromNonexistent( $start, $end, $this->mBatchSize, $dfnChunkSize );

		return;
	}

	$newOnly = $this->hasOption( 'new-only' );
	$redirectsOnly = $this->hasOption( 'redirects-only' );
	$oldRedirectsOnly = $this->hasOption( 'old-redirects-only' );
	$this->doRefreshLinks( $start, $newOnly, $end, $redirectsOnly, $oldRedirectsOnly );
	// The cleanup after a regular refresh always covers the full ID range
	$this->deleteLinksFromNonexistent( null, null, $this->mBatchSize, $dfnChunkSize );
}
88
/**
 * Build the query condition for the configured namespace restriction.
 *
 * @return array [ 'page_namespace' => <namespace> ] when a namespace is
 *  set, otherwise an empty array
 */
private function namespaceCond() {
	if ( $this->namespace === false ) {
		return [];
	}

	return [ 'page_namespace' => $this->namespace ];
}
94
/**
 * Do the actual link refreshing.
 *
 * Three modes exist: old redirects only (redirect pages without a row in
 * the redirect table), new pages only, or every page ID from $start up
 * to $end.
 *
 * @param int|null $start Page_id to start from, null meaning 1
 * @param bool $newOnly Only process pages flagged page_is_new
 * @param int|null $end Page_id to stop at; a falsy value means the highest
 *  ID found in the page and redirect tables (full-range mode only)
 * @param bool $redirectsOnly Only fix the redirect table, not all links
 * @param bool $oldRedirectsOnly Only fix redirects that have no redirect
 *  table entry
 */
private function doRefreshLinks( $start, $newOnly = false,
	$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
) {
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	if ( $start === null ) {
		$start = 1;
	}

	// Give extensions a chance to optimize settings
	Hooks::run( 'MaintenanceRefreshLinksInit', [ $this ] );

	$what = $redirectsOnly ? "redirects" : "links";

	if ( $oldRedirectsOnly ) {
		# This entire code path is cut-and-pasted from below. Hurrah.

		$conds = [
			"page_is_redirect=1",
			"rd_from IS NULL",
			self::intervalCond( $dbr, 'page_id', $start, $end ),
		] + $this->namespaceCond();

		$res = $dbr->select(
			[ 'page', 'redirect' ],
			'page_id',
			$conds,
			__METHOD__,
			[],
			[ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
		);
		$num = $res->numRows();
		$this->output( "Refreshing $num old redirects from $start...\n" );

		$i = 0;

		foreach ( $res as $row ) {
			if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
				$this->output( "$i\n" );
				// Let the replica DBs catch up before writing more
				wfWaitForSlaves();
			}
			$this->fixRedirect( $row->page_id );
		}
	} elseif ( $newOnly ) {
		$this->output( "Refreshing $what from " );
		$res = $dbr->select( 'page',
			[ 'page_id' ],
			[
				'page_is_new' => 1,
				self::intervalCond( $dbr, 'page_id', $start, $end ),
			] + $this->namespaceCond(),
			__METHOD__
		);
		$num = $res->numRows();
		$this->output( "$num new articles...\n" );

		$i = 0;
		foreach ( $res as $row ) {
			if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
				$this->output( "$i\n" );
				// Let the replica DBs catch up before writing more
				wfWaitForSlaves();
			}
			if ( $redirectsOnly ) {
				$this->fixRedirect( $row->page_id );
			} else {
				self::fixLinksFromArticle( $row->page_id, $this->namespace );
			}
		}
	} else {
		if ( !$end ) {
			// Pass the caller name so these queries are attributed to this
			// method instead of to the database layer's default.
			$maxPage = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
			$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', '', __METHOD__ );
			$end = max( $maxPage, $maxRD );
		}
		$this->output( "Refreshing redirects table.\n" );
		$this->output( "Starting from page_id $start of $end.\n" );

		for ( $id = $start; $id <= $end; $id++ ) {
			if ( !( $id % self::REPORTING_INTERVAL ) ) {
				$this->output( "$id\n" );
				// Let the replica DBs catch up before writing more
				wfWaitForSlaves();
			}
			$this->fixRedirect( $id );
		}

		if ( !$redirectsOnly ) {
			$this->output( "Refreshing links tables.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {
				if ( !( $id % self::REPORTING_INTERVAL ) ) {
					$this->output( "$id\n" );
					// Let the replica DBs catch up before writing more
					wfWaitForSlaves();
				}
				self::fixLinksFromArticle( $id, $this->namespace );
			}
		}
	}
}
202
/**
 * Bring the redirect table row and the page_is_redirect flag of one page
 * in line with the page's current content.
 *
 * A nonexistent page only gets its redirect row deleted. A page outside
 * the configured namespace is left untouched.
 *
 * @param int $id The page ID to check
 */
private function fixRedirect( $id ) {
	$page = WikiPage::newFromID( $id );
	$dbw = $this->getDB( DB_MASTER );
	$redirectRowCond = [ 'rd_from' => $id ];

	if ( $page === null ) {
		// This page doesn't exist (any more)
		// Delete any redirect table entry for it
		$dbw->delete( 'redirect', $redirectRowCond, __METHOD__ );

		return;
	}

	if ( $this->namespace !== false
		&& !$page->getTitle()->inNamespace( $this->namespace )
	) {
		return;
	}

	$content = $page->getContent( Revision::RAW );
	$target = ( $content !== null ) ? $content->getUltimateRedirectTarget() : null;

	if ( $target !== null ) {
		$page->insertRedirectEntry( $target );
		$isRedirect = 1;
	} else {
		// The page is not a redirect
		// Delete any redirect table entry for it
		$dbw->delete( 'redirect', $redirectRowCond, __METHOD__ );
		$isRedirect = 0;
	}

	// Update the page table to be sure it is in a consistent state
	$dbw->update(
		'page',
		[ 'page_is_redirect' => $isRedirect ],
		[ 'page_id' => $id ],
		__METHOD__
	);
}
252
/**
 * Run the secondary data updates of one page's current content.
 *
 * Does nothing for a nonexistent page, a page outside $ns, or a page
 * without content. The link cache is cleared on every call.
 *
 * @param int $id The page_id
 * @param int|bool $ns Only fix links if the page is in this namespace;
 *  false skips the namespace check
 */
public static function fixLinksFromArticle( $id, $ns = false ) {
	$page = WikiPage::newFromID( $id );

	LinkCache::singleton()->clear();

	if ( $page === null
		|| ( $ns !== false && !$page->getTitle()->inNamespace( $ns ) )
	) {
		return;
	}

	$content = $page->getContent( Revision::RAW );
	if ( $content !== null ) {
		$secondaryUpdates = $content->getSecondaryDataUpdates(
			$page->getTitle(), /* $old = */ null, /* $recursive = */ false );
		foreach ( $secondaryUpdates as $pendingUpdate ) {
			// Queue each update and run the queue right away
			DeferredUpdates::addUpdate( $pendingUpdate );
			DeferredUpdates::doUpdates();
		}
	}
}
282
/**
 * Remove link-table rows whose source page ID does not exist, working
 * through the ID range one chunk at a time.
 *
 * Chunk boundaries are derived from existent page IDs only; each chunk is
 * then handed to dfnCheckInterval().
 *
 * @param int|null $start Page_id to start from, null for no lower bound
 * @param int|null $end Page_id to stop at, null for no upper bound
 * @param int $batchSize Maximum number of IDs deleted per query
 * @param int $chunkSize Maximum number of existent IDs covered by one
 *  chunk; values below 1 are treated as 1
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100000
) {
	// With an offset of 0 the "next start" query returns the current start
	// row again and the loop below would never advance, so enforce a
	// minimum of one page per chunk.
	$chunkSize = max( 1, (int)$chunkSize );

	// Let the replica DBs catch up with earlier writes before reading
	wfWaitForSlaves();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	do {
		// Find the start of the next chunk. This is based only
		// on existent page_ids.
		$nextStart = $dbr->selectField(
			'page',
			'page_id',
			[ self::intervalCond( $dbr, 'page_id', $start, $end ) ]
			+ $this->namespaceCond(),
			__METHOD__,
			[ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
		);

		if ( $nextStart !== false ) {
			// To find the end of the current chunk, subtract one.
			// This will serve to limit the number of rows scanned in
			// dfnCheckInterval(), per query, to at most the sum of
			// the chunk size and deletion batch size.
			$chunkEnd = $nextStart - 1;
		} else {
			// This is the last chunk. Check all page_ids up to $end.
			$chunkEnd = $end;
		}

		$fmtStart = $start !== null ? "[$start" : '(-INF';
		$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
		$this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
		$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

		$start = $nextStart;

	} while ( $nextStart !== false );
}
332
/**
 * Delete, from every links table, the rows in the given ID interval whose
 * source page ID has no row in the page table.
 *
 * @see RefreshLinks::deleteLinksFromNonexistent()
 * @param int|null $start Page_id to start from, null for no lower bound
 * @param int|null $end Page_id to stop at, null for no upper bound
 * @param int $batchSize Maximum number of distinct IDs deleted per query;
 *  values below 1 are treated as 1
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	// A batch size of 0 would fetch nothing while still satisfying the
	// "$numIds >= $batchSize" loop condition forever.
	$batchSize = max( 1, (int)$batchSize );

	$dbw = $this->getDB( DB_MASTER );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	$linksTables = [ // table name => page_id field
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	foreach ( $linksTables as $table => $field ) {
		$this->output( "    $table: 0" );
		$tableStart = $start;
		$counter = 0;
		do {
			// Next batch of distinct source IDs that have no page row
			$ids = $dbr->selectFieldValues(
				$table,
				$field,
				[
					self::intervalCond( $dbr, $field, $tableStart, $end ),
					"$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id' )})",
				],
				__METHOD__,
				[ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
			);

			$numIds = count( $ids );
			if ( $numIds ) {
				$counter += $numIds;
				$dbw->delete( $table, [ $field => $ids ], __METHOD__ );
				$this->output( ", $counter" );
				// Resume just past the highest ID handled in this batch
				$tableStart = $ids[$numIds - 1] + 1;
				// Let the replica DBs catch up before the next batch
				wfWaitForSlaves();
			}

		} while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );

		$this->output( " deleted.\n" );
	}
}
385
/**
 * Build an SQL condition that restricts a field to a closed interval.
 *
 * Either bound may be null, meaning unbounded on that side. With both
 * bounds null the condition is "$var IS NOT NULL".
 *
 * @param IDatabase $db Database handle used to quote the bounds
 * @param string $var Field name
 * @param mixed $start First value to include, or null
 * @param mixed $end Last value to include, or null
 * @return string SQL condition
 */
private static function intervalCond( IDatabase $db, $var, $start, $end ) {
	$hasLowerBound = $start !== null;
	$hasUpperBound = $end !== null;

	if ( $hasLowerBound && $hasUpperBound ) {
		return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
	}
	if ( $hasLowerBound ) {
		return "$var >= {$db->addQuotes( $start )}";
	}
	if ( $hasUpperBound ) {
		return "$var <= {$db->addQuotes( $end )}";
	}

	return "$var IS NOT NULL";
}
409
/**
 * Refresh the links of every page in each category that belongs to the
 * given tracking category.
 *
 * @param string $category Tracking category key
 */
private function refreshTrackingCategory( $category ) {
	$categoryTitles = $this->getPossibleCategories( $category );

	if ( !$categoryTitles ) {
		// Output to stderr but don't bail out of the script;
		// there is simply nothing to refresh.
		$this->error( "Tracking category '$category' is disabled\n" );

		return;
	}

	foreach ( $categoryTitles as $categoryTitle ) {
		$this->refreshCategory( $categoryTitle );
	}
}
427
/**
 * Refresh the links of all pages in one category, in batches.
 *
 * Pages are paged through in ( cl_timestamp, cl_from ) order, each query
 * resuming after the last row of the previous batch. When a namespace
 * restriction is configured it is applied in the query.
 *
 * @param Title $category The category whose member pages are refreshed
 */
private function refreshCategory( Title $category ) {
	$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

	$dbr = $this->getDB( DB_REPLICA );
	$conds = [
		'page_id=cl_from',
		'cl_to' => $category->getDBkey(),
	];
	if ( $this->namespace !== false ) {
		$conds['page_namespace'] = $this->namespace;
	}

	$i = 0;
	$timestamp = '';
	$lastId = 0;
	do {
		$finalConds = $conds;
		$quotedTimestamp = $dbr->addQuotes( $timestamp );
		// Continue strictly after the last ( timestamp, page ID ) pair seen
		$finalConds[] =
			"(cl_timestamp > $quotedTimestamp " .
			"OR (cl_timestamp = $quotedTimestamp AND cl_from > $lastId))";
		$res = $dbr->select( [ 'page', 'categorylinks' ],
			[ 'page_id', 'cl_timestamp' ],
			$finalConds,
			__METHOD__,
			[
				'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
				'LIMIT' => $this->mBatchSize,
			]
		);

		foreach ( $res as $row ) {
			if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
				$this->output( "$i\n" );
				// Let the replica DBs catch up before writing more
				wfWaitForSlaves();
			}
			// Cast: this value is interpolated unquoted into the next query
			$lastId = (int)$row->page_id;
			$timestamp = $row->cl_timestamp;
			self::fixLinksFromArticle( $row->page_id );
		}

	} while ( $res->numRows() == $this->mBatchSize );
}
475
/**
 * Look up the categories registered for a tracking category key.
 *
 * An unknown key is reported through error() with the die flag set.
 *
 * @param string $categoryKey The tracking category key
 * @return Title[]|null The 'cats' entry registered for the key
 */
private function getPossibleCategories( $categoryKey ) {
	$tracker = new TrackingCategories( $this->getConfig() );
	$known = $tracker->getTrackingCategories();

	if ( !isset( $known[$categoryKey] ) ) {
		$this->error( "Unknown tracking category {$categoryKey}\n", true );

		return null;
	}

	return $known[$categoryKey]['cats'];
}
490}
491
// Name the class for the maintenance runner, then hand over control;
// the runner only executes the script when this file is the entry point.
$maintClass = RefreshLinks::class;
require_once RUN_MAINTENANCE_IF_MAIN;
wfWaitForSlaves( $ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true)
Add some args that are needed.
getDB( $db, $groups=[], $wiki=false)
Returns a database to be used by current maintenance script.
hasOption( $name)
Checks to see if a particular param exists.
getArg( $argId=0, $default=null)
Get an argument.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Set the batch size.
const RAW
Definition Revision.php:100
Represents a title within MediaWiki.
Definition Title.php:39
getDBkey()
Get the main part with underscores.
Definition Title.php:955
This class performs some operations related to tracking categories, such as creating a list of all su...
static newFromID( $id, $from='fromdb')
Constructor from a page id.
Definition WikiPage.php:159
if(! $regexes) $dbr
Definition cleanup.php:94
$res
Definition database.txt:21
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling output() to send it all. It could be easily changed to send incrementally if that becomes useful
do that in ParserLimitReportFormat instead use this to modify the parameters of the image all existing parser cache entries will be invalid To avoid you ll need to handle that somehow(e.g. with the RejectParserCacheValue hook) because MediaWiki won 't do it for you. & $defaults error
Definition hooks.txt:2581
const NS_CATEGORY
Definition Defines.php:79
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:40
require_once RUN_MAINTENANCE_IF_MAIN
const DB_REPLICA
Definition defines.php:25
const DB_MASTER
Definition defines.php:26