Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 202 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
RefreshLinks | |
0.00% |
0 / 199 |
|
0.00% |
0 / 11 |
2450 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 41 |
|
0.00% |
0 / 1 |
182 | |||
doRefreshLinks | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
90 | |||
fixRedirect | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
20 | |||
fixLinksFromArticle | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
deleteLinksFromNonexistent | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
dfnCheckInterval | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
30 | |||
intervalCond | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
30 | |||
refreshTrackingCategory | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
refreshCategory | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getPossibleCategories | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | |
21 | use MediaWiki\Deferred\DeferredUpdates; |
22 | use MediaWiki\Linker\LinkTarget; |
23 | use MediaWiki\MediaWikiServices; |
24 | use MediaWiki\Revision\RevisionRecord; |
25 | use MediaWiki\Title\Title; |
26 | use Wikimedia\Rdbms\IReadableDatabase; |
27 | use Wikimedia\Rdbms\SelectQueryBuilder; |
28 | |
29 | require_once __DIR__ . '/Maintenance.php'; |
30 | |
31 | /** |
32 | * Refresh link tables. |
33 | * |
34 | * @ingroup Maintenance |
35 | */ |
36 | class RefreshLinks extends Maintenance { |
37 | private const REPORTING_INTERVAL = 100; |
38 | |
/**
 * Register the script description, its command-line options and arguments,
 * and set the default batch size to 100.
 */
public function __construct() {
	parent::__construct();

	// Readable aliases for the positional addOption()/addArg() flags.
	$optional = false;
	$isFlag = false;
	$takesValue = true;

	$this->addDescription( 'Refresh link tables' );

	// Boolean switches.
	$this->addOption( 'verbose', 'Output information about link refresh progress', $optional, $isFlag, 'v' );
	$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
	$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
	$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
	$this->addOption( 'touched-only', 'Only fix pages that have been touched after last update' );

	// Options that carry a value.
	$this->addOption( 'e', 'Last page id to refresh', $optional, $takesValue );
	$this->addOption(
		'dfn-chunk-size',
		'Maximum number of existent IDs to check per query, default 100,000',
		$optional,
		$takesValue
	);
	$this->addOption( 'namespace', 'Only fix pages in this namespace', $optional, $takesValue );
	$this->addOption( 'category', 'Only fix pages in this category', $optional, $takesValue );
	$this->addOption(
		'tracking-category',
		'Only fix pages in this tracking category',
		$optional,
		$takesValue
	);
	$this->addOption(
		'before-timestamp',
		'Only fix pages that were last updated before this timestamp',
		$optional,
		$takesValue
	);

	$this->addArg( 'start', 'Page_id to start from, default 1', $optional );
	$this->setBatchSize( 100 );
}
58 | |
/**
 * Script entry point.
 *
 * Reads the page-id range and size options, then runs exactly one mode:
 *  - --dfn-only: only delete link rows whose source page no longer exists;
 *  - --category: refresh links for pages in the given category;
 *  - --tracking-category: refresh links for pages in the categories behind
 *    the given tracking category key;
 *  - otherwise: refresh links (or only redirects) for pages in the range,
 *    optionally restricted by --new-only / --touched-only, followed by a
 *    cleanup of rows from nonexistent pages unless --namespace was given.
 *
 * --namespace and --before-timestamp narrow the page selection in every
 * mode except --dfn-only.
 */
public function execute() {
	// Note that there is a difference between not specifying the start
	// and end IDs and using the minimum and maximum values from the page
	// table. In the latter case, deleteLinksFromNonexistent() will not
	// delete entries for nonexistent IDs that fall outside the range.
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100_000 );
	$batchSize = (int)$this->getBatchSize();

	// Reject sizes below 1 up front. The batching loops stop only when a
	// query returns fewer rows than the batch size, so a batch size of 0
	// would never let them finish; a chunk size of 0 makes the chunk
	// lookup in deleteLinksFromNonexistent() return the current start id
	// again, so that loop would never advance either.
	if ( $batchSize < 1 ) {
		$this->fatalError( "The batch size must be a positive integer\n" );
	}
	if ( $dfnChunkSize < 1 ) {
		$this->fatalError( "The value of --dfn-chunk-size must be a positive integer\n" );
	}

	if ( $this->hasOption( 'dfn-only' ) ) {
		$this->deleteLinksFromNonexistent( $start, $end, $batchSize, $dfnChunkSize );
		return;
	}

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	$builder = $dbr->newSelectQueryBuilder()
		->from( 'page' )
		->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
		->limit( $batchSize );

	if ( $this->hasOption( 'namespace' ) ) {
		$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
	}

	if ( $this->hasOption( 'before-timestamp' ) ) {
		// Pages that have never had their links updated (NULL) are included too.
		$builder->andWhere(
			$dbr->expr( 'page_links_updated', '<', $this->getOption( 'before-timestamp' ) )
				->or( 'page_links_updated', '=', null )
		);
	}

	if ( $this->hasOption( 'category' ) ) {
		$category = $this->getOption( 'category' );
		$title = Title::makeTitleSafe( NS_CATEGORY, $category );
		if ( !$title ) {
			$this->fatalError( "'$category' is an invalid category name!\n" );
		}
		$this->refreshCategory( $builder, $title );
	} elseif ( $this->hasOption( 'tracking-category' ) ) {
		// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
		$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
	} else {
		$new = $this->hasOption( 'new-only' );
		$redir = $this->hasOption( 'redirects-only' );
		$touched = $this->hasOption( 'touched-only' );
		$what = $redir ? 'redirects' : 'links';
		if ( $new ) {
			$builder->andWhere( [ 'page_is_new' => 1 ] );
			$this->output( "Refreshing $what from new pages...\n" );
		} else {
			// --touched-only is only honoured when --new-only is absent.
			if ( $touched ) {
				$builder->andWhere( [
					'page_touched > page_links_updated OR page_links_updated IS NULL',
				] );
			}
			$this->output( "Refreshing $what from pages...\n" );
		}
		$this->doRefreshLinks( $builder, $redir );
		// The cleanup works on page-id ranges only, so it is skipped when
		// the run was restricted to a single namespace.
		if ( !$this->hasOption( 'namespace' ) ) {
			$this->deleteLinksFromNonexistent( $start, $end, $batchSize, $dfnChunkSize );
		}
	}
}
122 | |
/**
 * Do the actual link refreshing.
 *
 * Pages are fetched from a replica in batches, ordered by $indexFields and
 * resumed after the last row of the previous batch. For every page the
 * redirect entry is fixed and, unless $redirectsOnly is set, its secondary
 * data updates are re-run. Progress is printed, and replication is waited
 * for, every REPORTING_INTERVAL pages.
 *
 * @param SelectQueryBuilder $builder Query selecting the pages to refresh;
 *  it is cloned for each batch and never modified here
 * @param bool $redirectsOnly Only fix redirects
 * @param array $indexFields Fields used both for ordering and for the
 *  "greater than the last row seen" batch condition; page_id is added to
 *  the selected fields when it is not among them
 */
private function doRefreshLinks(
	SelectQueryBuilder $builder,
	bool $redirectsOnly = false,
	array $indexFields = [ 'page_id' ]
) {
	// Give extensions a chance to optimize settings
	$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

	$estimateCount = $builder->estimateRowCount();
	$this->output( "Estimated page count: $estimateCount\n" );

	// Read the batch size only after the hook has run and apply it as the
	// LIMIT of every batch query below, so the query limit and the loop's
	// termination test can never disagree.
	$batchSize = (int)$this->getBatchSize();

	$i = 0;
	$lastIndexes = array_fill_keys( $indexFields, 0 );
	$selectFields = in_array( 'page_id', $indexFields, true )
		? $indexFields : [ 'page_id', ...$indexFields ];
	$verbose = $this->hasOption( 'verbose' );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	do {
		$batchCond = $dbr->buildComparison( '>', $lastIndexes );
		$res = ( clone $builder )->select( $selectFields )
			->andWhere( [ $batchCond ] )
			->orderBy( $indexFields )
			->limit( $batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		$numRows = $res->numRows();

		if ( $verbose ) {
			$this->output( "Refreshing links for {$numRows} pages\n" );
		}

		foreach ( $res as $row ) {
			if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
				$this->output( "$i\n" );
				$this->waitForReplication();
			}
			if ( $verbose ) {
				$this->output( "Refreshing links for page ID {$row->page_id}\n" );
			}
			self::fixRedirect( $this, $row->page_id );
			if ( !$redirectsOnly ) {
				self::fixLinksFromArticle( $row->page_id );
			}
		}

		if ( $numRows > 0 ) {
			// Remember the index values of the last row; the next batch
			// starts strictly after them.
			$res->seek( $numRows - 1 );
			foreach ( $indexFields as $field ) {
				$lastIndexes[$field] = $res->current()->$field;
			}
		}

		// A full batch means there may be more rows. An empty batch always
		// ends the loop, even for a non-positive batch size.
	} while ( $numRows > 0 && $numRows === $batchSize );
}
179 | |
/**
 * Update the redirect entry for a given page.
 *
 * This method bypasses the "redirect" table when determining the redirect
 * target and reads it from the page's current content instead, so that the
 * stored target is known to be up to date and valid. This is particularly
 * useful after namespaces have been modified, to make sure the entry in
 * the "redirect" table points to the correct page and not to an invalid
 * one.
 *
 * The page_is_redirect flag of the page row is rewritten to match.
 *
 * @internal
 * @param Maintenance $maint
 * @param int $id The page ID to check
 */
public static function fixRedirect( Maintenance $maint, $id ) {
	$wikiPage = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );
	if ( $wikiPage === null ) {
		// The page was deleted in the meantime; nothing to fix.
		return;
	}

	$pageContent = $wikiPage->getContent( RevisionRecord::RAW );
	$target = $pageContent !== null ? $pageContent->getRedirectTarget() : null;
	$isRedirect = $target !== null;

	$dbw = $maint->getDB( DB_PRIMARY );
	if ( $isRedirect ) {
		$wikiPage->insertRedirectEntry( $target );
	} else {
		// Not a redirect: remove any redirect table row left over for it.
		$dbw->newDeleteQueryBuilder()
			->deleteFrom( 'redirect' )
			->where( [ 'rd_from' => $id ] )
			->caller( __METHOD__ )
			->execute();
	}

	// Keep the page table in a consistent state with the redirect table.
	$dbw->newUpdateQueryBuilder()
		->update( 'page' )
		->set( [ 'page_is_redirect' => $isRedirect ? 1 : 0 ] )
		->where( [ 'page_id' => $id ] )
		->caller( __METHOD__ )
		->execute();
}
230 | |
/**
 * Re-run the secondary data updates (LinksUpdate and friends) for one page.
 *
 * Does nothing when no page with the given ID exists.
 *
 * @param int $id The page_id
 */
public static function fixLinksFromArticle( $id ) {
	$wikiPage = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromID( $id );

	// A null page means it was deleted after it had been selected.
	if ( $wikiPage !== null ) {
		// Schedule the updates as post-send and then flush the deferred
		// queue right away: the simplest way to run everything immediately,
		// including updates that are scheduled by other updates.
		$updateOptions = [
			'defer' => DeferredUpdates::POSTSEND,
			'causeAction' => 'refresh-links-maintenance',
			'recursive' => false,
		];
		$wikiPage->doSecondaryDataUpdates( $updateOptions );
		DeferredUpdates::doUpdates();
	}
}
254 | |
/**
 * Delete rows that originate from nonexistent pages from the link tables
 * handled by dfnCheckInterval().
 *
 * The page-id range is walked in chunks that each span at most $chunkSize
 * existing pages; every chunk is handed to dfnCheckInterval().
 *
 * @param int|null $start Page_id to start from
 * @param int|null $end Page_id to stop at
 * @param int $batchSize The size of deletion batches
 * @param int $chunkSize Maximum number of existent IDs to check per query
 *
 * @author Merlijn van Deen <valhallasw@arctus.nl>
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100_000
) {
	$this->waitForReplication();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	while ( true ) {
		// The next chunk begins at the existing page_id found $chunkSize
		// rows after the current start; false means no such row exists.
		$nextStart = $dbr->newSelectQueryBuilder()
			->select( 'page_id' )
			->from( 'page' )
			->where( [ self::intervalCond( $dbr, 'page_id', $start, $end ) ] )
			->orderBy( 'page_id' )
			->offset( $chunkSize )
			->caller( __METHOD__ )
			->fetchField();
		$isLastChunk = $nextStart === false;

		// The current chunk ends just before the next one starts, which
		// bounds the rows scanned per query in dfnCheckInterval() by the
		// sum of the chunk size and the deletion batch size. The last
		// chunk simply runs up to $end.
		$chunkEnd = $isLastChunk ? $end : $nextStart - 1;

		$fmtStart = $start === null ? '(-INF' : "[$start";
		$fmtChunkEnd = $chunkEnd === null ? 'INF)' : "$chunkEnd]";
		$this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
		$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

		if ( $isLastChunk ) {
			break;
		}
		$start = $nextStart;
	}
}
303 | |
/**
 * Delete, for one page-id interval, all rows in the link tables whose
 * source page has no matching row in the page table.
 *
 * Orphaned IDs are looked up on a replica in batches and deleted on the
 * primary; replication is waited for after each deletion batch.
 *
 * @see RefreshLinks::deleteLinksFromNonexistent()
 * @param int|null $start Page_id to start from
 * @param int|null $end Page_id to stop at
 * @param int $batchSize The size of deletion batches
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// table name => field holding the source page_id
	$linksTables = [
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	foreach ( $linksTables as $table => $field ) {
		$this->output( " $table: 0" );
		$counter = 0;
		$tableStart = $start;

		while ( true ) {
			// Distinct source IDs in range that have no page row.
			$ids = $dbr->newSelectQueryBuilder()
				->select( $field )
				->distinct()
				->from( $table )
				->leftJoin( 'page', null, "$field = page_id" )
				->where( self::intervalCond( $dbr, $field, $tableStart, $end ) )
				->andWhere( [ 'page_id' => null ] )
				->orderBy( $field )
				->limit( $batchSize )
				->caller( __METHOD__ )
				->fetchFieldValues();
			$numIds = count( $ids );

			if ( $numIds > 0 ) {
				$counter += $numIds;
				$dbw->newDeleteQueryBuilder()
					->deleteFrom( $table )
					->where( [ $field => $ids ] )
					->caller( __METHOD__ )
					->execute();
				$this->output( ", $counter" );
				// Resume right after the highest ID just deleted.
				$tableStart = $ids[$numIds - 1] + 1;
				$this->waitForReplication();
			}

			// A short batch means the interval is exhausted for this table.
			if ( $numIds < $batchSize ) {
				break;
			}
			// So does moving past the upper bound of the interval.
			if ( $end !== null && $tableStart > $end ) {
				break;
			}
		}

		$this->output( " deleted.\n" );
	}
}
360 | |
/**
 * Build a SQL condition restricting a field to a closed interval.
 *
 * Either bound may be null, which drops that side of the interval; with
 * both bounds null the condition only requires the field to be non-NULL.
 *
 * @param IReadableDatabase $db Used to quote the bound values
 * @param string $var Field name
 * @param mixed $start First value to include or null
 * @param mixed $end Last value to include or null
 * @return string
 */
private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
	$hasLower = $start !== null;
	$hasUpper = $end !== null;

	if ( $hasLower && $hasUpper ) {
		return "$var BETWEEN " . $db->addQuotes( $start ) . ' AND ' . $db->addQuotes( $end );
	}
	if ( $hasLower ) {
		return "$var >= " . $db->addQuotes( $start );
	}
	if ( $hasUpper ) {
		return "$var <= " . $db->addQuotes( $end );
	}

	return "$var IS NOT NULL";
}
384 | |
/**
 * Refreshes links for pages in a tracking category
 *
 * Every category returned for the key is refreshed with its own copy of
 * the query builder.
 *
 * @param SelectQueryBuilder $builder
 * @param string $category Category key
 */
private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ) {
	$targets = $this->getPossibleCategories( $category );

	if ( !$targets ) {
		// Report on stderr only; this is deliberately not fatal.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	foreach ( $targets as $target ) {
		// Clone so the conditions added for one category do not leak into
		// the query used for the next one.
		$this->refreshCategory( clone $builder, $target );
	}
}
403 | |
/**
 * Refreshes links for the pages that are members of a category
 *
 * The given builder is modified: it is joined against categorylinks and
 * restricted to the category's DB key.
 *
 * @param SelectQueryBuilder $builder
 * @param LinkTarget $category
 */
private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ) {
	$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

	$builder->join( 'categorylinks', null, 'page_id=cl_from' );
	$builder->andWhere( [ 'cl_to' => $category->getDBkey() ] );

	// Batch by (cl_timestamp, cl_from) rather than by page_id.
	$batchIndex = [ 'cl_timestamp', 'cl_from' ];
	$this->doRefreshLinks( $builder, false, $batchIndex );
}
417 | |
/**
 * Look up the categories registered under a tracking category key.
 *
 * Aborts the script with a fatal error when the key is unknown.
 *
 * @param string $categoryKey
 * @return LinkTarget[]
 */
private function getPossibleCategories( $categoryKey ) {
	$known = $this->getServiceContainer()->getTrackingCategories()->getTrackingCategories();
	$entry = $known[$categoryKey] ?? null;

	if ( $entry === null ) {
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}

	return $entry['cats'];
}
431 | } |
432 | |
433 | $maintClass = RefreshLinks::class; |
434 | require_once RUN_MAINTENANCE_IF_MAIN; |