Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 200 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
RefreshLinks | |
0.00% |
0 / 200 |
|
0.00% |
0 / 11 |
2450 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 42 |
|
0.00% |
0 / 1 |
182 | |||
doRefreshLinks | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
90 | |||
fixRedirect | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
20 | |||
fixLinksFromArticle | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
deleteLinksFromNonexistent | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
dfnCheckInterval | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
30 | |||
intervalCond | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
30 | |||
refreshTrackingCategory | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
refreshCategory | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getPossibleCategories | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | |
use MediaWiki\Deferred\DeferredUpdates;
use MediaWiki\Linker\LinkTarget;
use MediaWiki\Maintenance\Maintenance;
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Title\Title;
use Wikimedia\Rdbms\IExpression;
use Wikimedia\Rdbms\IReadableDatabase;
use Wikimedia\Rdbms\RawSQLExpression;
use Wikimedia\Rdbms\SelectQueryBuilder;
30 | |
31 | // @codeCoverageIgnoreStart |
32 | require_once __DIR__ . '/Maintenance.php'; |
33 | // @codeCoverageIgnoreEnd |
34 | |
35 | /** |
36 | * Refresh link tables. |
37 | * |
38 | * @ingroup Maintenance |
39 | */ |
40 | class RefreshLinks extends Maintenance { |
41 | private const REPORTING_INTERVAL = 100; |
42 | |
/**
 * Register the script description, its options, the optional "start"
 * argument and a default batch size of 100.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Refresh link tables' );

	// The only option with a short alias.
	$this->addOption( 'verbose', 'Output information about link refresh progress', false, false, 'v' );

	// Plain switches: optional, no value.
	$switches = [
		'dfn-only' => 'Delete links from nonexistent articles only',
		'new-only' => 'Only affect articles with just a single edit',
		'redirects-only' => 'Only fix redirects, not all links',
		'touched-only' => 'Only fix pages that have been touched after last update',
	];
	foreach ( $switches as $optionName => $help ) {
		$this->addOption( $optionName, $help );
	}

	// Optional options that take a value.
	$valueOptions = [
		'e' => 'Last page id to refresh',
		'dfn-chunk-size' => 'Maximum number of existent IDs to check per query, default 100,000',
		'namespace' => 'Only fix pages in this namespace',
		'category' => 'Only fix pages in this category',
		'tracking-category' => 'Only fix pages in this tracking category',
		'before-timestamp' => 'Only fix pages that were last updated before this timestamp',
	];
	foreach ( $valueOptions as $optionName => $help ) {
		$this->addOption( $optionName, $help, false, true );
	}

	$this->addArg( 'start', 'Page_id to start from, default 1', false );
	$this->setBatchSize( 100 );
}
62 | |
/**
 * Run the refresh according to the command-line options.
 *
 * - With --dfn-only, only deleteLinksFromNonexistent() is run.
 * - Otherwise a query over the page table is built for the requested page ID
 *   range, optionally narrowed by --namespace and --before-timestamp.
 * - With --category or --tracking-category, the pages in that category (or
 *   in the categories behind that tracking category key) are refreshed.
 * - Without either, the query is further narrowed by --new-only or
 *   --touched-only, the matching pages are refreshed (redirects only when
 *   --redirects-only is set), and deleteLinksFromNonexistent() runs
 *   afterwards unless --namespace was given.
 */
public function execute() {
	// Note that there is a difference between not specifying the start
	// and end IDs and using the minimum and maximum values from the page
	// table. In the latter case, deleteLinksFromNonexistent() will not
	// delete entries for nonexistent IDs that fall outside the range.
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100_000 );

	if ( $this->hasOption( 'dfn-only' ) ) {
		$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		return;
	}

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	$builder = $dbr->newSelectQueryBuilder()
		->from( 'page' )
		->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
		->limit( $this->getBatchSize() );

	if ( $this->hasOption( 'namespace' ) ) {
		$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
	}

	if ( $this->hasOption( 'before-timestamp' ) ) {
		$builder->andWhere(
			$dbr->expr( 'page_links_updated', '<', $this->getOption( 'before-timestamp' ) )
				->or( 'page_links_updated', '=', null )
		);
	}

	if ( $this->hasOption( 'category' ) ) {
		$category = $this->getOption( 'category' );
		$title = Title::makeTitleSafe( NS_CATEGORY, $category );
		if ( !$title ) {
			$this->fatalError( "'$category' is an invalid category name!\n" );
		}
		$this->refreshCategory( $builder, $title );
	} elseif ( $this->hasOption( 'tracking-category' ) ) {
		// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
		$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
	} else {
		$new = $this->hasOption( 'new-only' );
		$redir = $this->hasOption( 'redirects-only' );
		$touched = $this->hasOption( 'touched-only' );
		$what = $redir ? 'redirects' : 'links';
		if ( $new ) {
			$builder->andWhere( [ 'page_is_new' => 1 ] );
			$this->output( "Refreshing $what from new pages...\n" );
		} else {
			if ( $touched ) {
				// page_links_updated is a column on the right-hand side here, not a
				// value. expr() treats its third argument as a value and quotes it,
				// which would compare page_touched against the literal string
				// 'page_links_updated'. The column-to-column comparison therefore
				// has to be given as raw SQL.
				$builder->andWhere( [
					$dbr->expr( 'page_links_updated', '=', null )
						->orExpr( new RawSQLExpression( 'page_touched > page_links_updated' ) ),
				] );
			}
			$this->output( "Refreshing $what from pages...\n" );
		}
		$this->doRefreshLinks( $builder, $redir );
		if ( !$this->hasOption( 'namespace' ) ) {
			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		}
	}
}
127 | |
/**
 * Walk the pages matched by the query in batches and refresh each of them.
 *
 * Every batch is fetched from a clone of $builder, ordered by $indexFields
 * and restricted to rows that sort after the last row of the previous
 * batch. Iteration stops once a batch comes back with a row count different
 * from the batch size.
 *
 * @param SelectQueryBuilder $builder Query selecting the pages to process
 * @param bool $redirectsOnly Only fix redirects
 * @param array $indexFields Fields used to order and page through the result
 */
private function doRefreshLinks(
	SelectQueryBuilder $builder,
	bool $redirectsOnly = false,
	array $indexFields = [ 'page_id' ]
) {
	// Let extensions adjust settings before the work starts
	$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

	$estimate = $builder->caller( __METHOD__ )->estimateRowCount();
	$this->output( "Estimated page count: $estimate\n" );

	$processed = 0;
	// Position of the last row handled so far, one entry per index field
	$cursor = array_fill_keys( $indexFields, 0 );
	// page_id is always needed, even when it is not one of the index fields
	if ( in_array( 'page_id', $indexFields ) ) {
		$fields = $indexFields;
	} else {
		$fields = [ 'page_id', ...$indexFields ];
	}
	$beVerbose = $this->hasOption( 'verbose' );
	$replica = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	do {
		$afterCursor = $replica->buildComparison( '>', $cursor );
		$rows = ( clone $builder )
			->select( $fields )
			->andWhere( [ $afterCursor ] )
			->orderBy( $indexFields )
			->caller( __METHOD__ )
			->fetchResultSet();
		$rowCount = $rows->numRows();

		if ( $beVerbose ) {
			$this->output( "Refreshing links for {$rowCount} pages\n" );
		}

		foreach ( $rows as $row ) {
			$processed++;
			if ( $processed % self::REPORTING_INTERVAL === 0 ) {
				$this->output( "$processed\n" );
				$this->waitForReplication();
			}
			$pageId = $row->page_id;
			if ( $beVerbose ) {
				$this->output( "Refreshing links for page ID {$pageId}\n" );
			}
			self::fixRedirect( $this, $pageId );
			if ( !$redirectsOnly ) {
				self::fixLinksFromArticle( $pageId );
			}
		}

		if ( $rowCount ) {
			// Remember where this batch ended so the next one continues after it
			$rows->seek( $rowCount - 1 );
			$lastRow = $rows->current();
			foreach ( $indexFields as $indexField ) {
				$cursor[$indexField] = $lastRow->$indexField;
			}
		}
	} while ( $rowCount == $this->getBatchSize() );
}
184 | |
/**
 * Update the redirect entry for a given page.
 *
 * This method bypasses the "redirect" table to get the redirect target,
 * and reads it from the page's content instead. This makes sure the
 * redirect target is up to date and valid, which is particularly useful
 * after modifying namespaces, when an entry in the "redirect" table may
 * point to an invalid page.
 *
 * If the content has no redirect target, any "redirect" row for the page
 * is deleted; otherwise the redirect entry is (re)inserted. In both cases
 * page.page_is_redirect is set to match.
 *
 * @internal
 * @param Maintenance $maint
 * @param int $id The page ID to check
 */
public static function fixRedirect( Maintenance $maint, $id ) {
	$wikiPage = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );
	if ( $wikiPage === null ) {
		// The page may have just been deleted.
		return;
	}

	$rawContent = $wikiPage->getContent( RevisionRecord::RAW );
	$target = $rawContent !== null ? $rawContent->getRedirectTarget() : null;
	$isRedirect = $target !== null;

	$dbw = $maint->getDB( DB_PRIMARY );
	if ( $isRedirect ) {
		$wikiPage->insertRedirectEntry( $target );
	} else {
		// Not a redirect: drop any redirect table entry for it
		$dbw->newDeleteQueryBuilder()
			->deleteFrom( 'redirect' )
			->where( [ 'rd_from' => $id ] )
			->caller( __METHOD__ )
			->execute();
	}

	// Keep the page table in a consistent state with the redirect table
	$dbw->newUpdateQueryBuilder()
		->update( 'page' )
		->set( [ 'page_is_redirect' => $isRedirect ? 1 : 0 ] )
		->where( [ 'page_id' => $id ] )
		->caller( __METHOD__ )
		->execute();
}
235 | |
/**
 * Run the secondary data updates (LinksUpdate) for a given page_id.
 *
 * Does nothing if no page with that ID can be loaded.
 *
 * @param int $id The page_id
 */
public static function fixLinksFromArticle( $id ) {
	$wikiPage = MediaWikiServices::getInstance()
		->getWikiPageFactory()
		->newFromID( $id );
	if ( $wikiPage === null ) {
		// The page may have just been deleted.
		return;
	}

	// Schedule the updates as post-send deferred updates and then run the
	// deferred updates right away. That is the simplest way to execute all
	// of them immediately, including updates scheduled by other updates.
	$updateOptions = [
		'defer' => DeferredUpdates::POSTSEND,
		'causeAction' => 'refresh-links-maintenance',
		'recursive' => false,
	];
	$wikiPage->doSecondaryDataUpdates( $updateOptions );
	DeferredUpdates::doUpdates();
}
259 | |
/**
 * Removes rows that refer to nonexistent source pages from the pagelinks,
 * imagelinks, categorylinks, templatelinks, externallinks, iwlinks,
 * langlinks, redirect and page_props tables.
 *
 * The page ID range is processed in chunks holding at most $chunkSize
 * existing pages each; every chunk is handed to dfnCheckInterval().
 *
 * @param int|null $start Page_id to start from
 * @param int|null $end Page_id to stop at
 * @param int $batchSize The size of deletion batches
 * @param int $chunkSize Maximum number of existent IDs to check per query;
 *  must be at least 1
 *
 * @author Merlijn van Deen <valhallasw@arctus.nl>
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100_000
) {
	// With an offset below 1 the "next chunk start" query would return the
	// current start again, so the loop below would never advance.
	if ( $chunkSize < 1 ) {
		$this->fatalError( "The chunk size must be a positive integer\n" );
	}

	$this->waitForReplication();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	do {
		// Find the start of the next chunk. This is based only
		// on existent page_ids.
		$nextStart = $dbr->newSelectQueryBuilder()
			->select( 'page_id' )
			->from( 'page' )
			->where( [ self::intervalCond( $dbr, 'page_id', $start, $end ) ] )
			->orderBy( 'page_id' )
			->offset( $chunkSize )
			->caller( __METHOD__ )->fetchField();

		if ( $nextStart !== false ) {
			// Keep the boundary an integer, as the int|null parameters of
			// dfnCheckInterval() and intervalCond() expect.
			$nextStart = (int)$nextStart;
			// To find the end of the current chunk, subtract one.
			// This will serve to limit the number of rows scanned in
			// dfnCheckInterval(), per query, to at most the sum of
			// the chunk size and deletion batch size.
			$chunkEnd = $nextStart - 1;
		} else {
			// This is the last chunk. Check all page_ids up to $end.
			$chunkEnd = $end;
		}

		$fmtStart = $start !== null ? "[$start" : '(-INF';
		$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
		$this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
		$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

		$start = $nextStart;

	} while ( $nextStart !== false );
}
308 | |
/**
 * Delete, for one page ID interval, the rows of each links table whose
 * source page ID has no matching row in the page table.
 *
 * Orphaned IDs are looked up on a replica in batches of $batchSize and
 * deleted on the primary, waiting for replication after every batch.
 *
 * @see RefreshLinks::deleteLinksFromNonexistent()
 * @param int|null $start Page_id to start from
 * @param int|null $end Page_id to stop at
 * @param int $batchSize The size of deletion batches
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// table name => field holding the source page_id
	$sourceFieldByTable = [
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	foreach ( $sourceFieldByTable as $table => $field ) {
		$this->output( " $table: 0" );
		$cursor = $start;
		$deleted = 0;

		while ( true ) {
			// Source IDs in the interval that have no page row
			$orphans = $dbr->newSelectQueryBuilder()
				->select( $field )
				->distinct()
				->from( $table )
				->leftJoin( 'page', null, "$field = page_id" )
				->where( self::intervalCond( $dbr, $field, $cursor, $end ) )
				->andWhere( [ 'page_id' => null ] )
				->orderBy( $field )
				->limit( $batchSize )
				->caller( __METHOD__ )
				->fetchFieldValues();
			$found = count( $orphans );

			if ( $found ) {
				$deleted += $found;
				$dbw->newDeleteQueryBuilder()
					->deleteFrom( $table )
					->where( [ $field => $orphans ] )
					->caller( __METHOD__ )
					->execute();
				$this->output( ", $deleted" );
				// Continue right after the highest ID just removed
				$cursor = $orphans[$found - 1] + 1;
				$this->waitForReplication();
			}

			// A short batch means nothing is left for this table
			if ( $found < $batchSize ) {
				break;
			}
			// Stop once the cursor has moved past a bounded interval
			if ( $end !== null && $cursor > $end ) {
				break;
			}
		}

		$this->output( " deleted.\n" );
	}
}
365 | |
/**
 * Build a SQL expression restricting a field to a closed interval.
 *
 * A null $start or $end leaves that side unbounded. When both are null,
 * the expression only requires the field to be non-null.
 *
 * @param IReadableDatabase $db
 * @param string $var Field name
 * @param mixed $start First value to include or null
 * @param mixed $end Last value to include or null
 * @return IExpression
 */
private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
	$hasLowerBound = $start !== null;
	$hasUpperBound = $end !== null;

	if ( $hasLowerBound && $hasUpperBound ) {
		$lower = $db->expr( $var, '>=', $start );
		return $lower->and( $var, '<=', $end );
	}
	if ( $hasLowerBound ) {
		return $db->expr( $var, '>=', $start );
	}
	if ( $hasUpperBound ) {
		return $db->expr( $var, '<=', $end );
	}

	// Unbounded on both sides
	return $db->expr( $var, '!=', null );
}
389 | |
/**
 * Refreshes links for pages in a tracking category.
 *
 * Each category returned for the key is refreshed with its own copy of
 * the query builder. If none is returned, an error is reported and
 * nothing is refreshed.
 *
 * @param SelectQueryBuilder $builder
 * @param string $category Category key
 */
private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ) {
	$targets = $this->getPossibleCategories( $category );

	if ( !$targets ) {
		// Report on stderr, but keep going rather than bailing out.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	foreach ( $targets as $target ) {
		$perCategoryBuilder = clone $builder;
		$this->refreshCategory( $perCategoryBuilder, $target );
	}
}
408 | |
/**
 * Refreshes links for the pages that are members of a category.
 *
 * The query is joined against categorylinks and limited to the given
 * category; results are paged by cl_timestamp and cl_from.
 *
 * @param SelectQueryBuilder $builder
 * @param LinkTarget $category
 */
private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ) {
	$displayName = $category->getText();
	$this->output( "Refreshing pages in category '{$displayName}'...\n" );

	$builder->join( 'categorylinks', null, 'page_id=cl_from' );
	$builder->andWhere( [ 'cl_to' => $category->getDBkey() ] );

	$pagingFields = [ 'cl_timestamp', 'cl_from' ];
	$this->doRefreshLinks( $builder, false, $pagingFields );
}
422 | |
/**
 * Returns the list of possible categories for a given tracking category key.
 *
 * An unknown key is reported through fatalError().
 *
 * @param string $categoryKey
 * @return LinkTarget[]
 */
private function getPossibleCategories( $categoryKey ) {
	$trackingCategories = $this->getServiceContainer()
		->getTrackingCategories()
		->getTrackingCategories();

	if ( !isset( $trackingCategories[$categoryKey] ) ) {
		// NOTE(review): relies on fatalError() not returning — confirm against Maintenance
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}

	return $trackingCategories[$categoryKey]['cats'];
}
436 | } |
437 | |
438 | // @codeCoverageIgnoreStart |
439 | $maintClass = RefreshLinks::class; |
440 | require_once RUN_MAINTENANCE_IF_MAIN; |
441 | // @codeCoverageIgnoreEnd |