Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
8.10% |
17 / 210 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
| RefreshLinks | |
8.10% |
17 / 210 |
|
0.00% |
0 / 11 |
1912.83 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
| execute | |
0.00% |
0 / 42 |
|
0.00% |
0 / 1 |
182 | |||
| doRefreshLinks | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
90 | |||
| fixRedirect | |
73.91% |
17 / 23 |
|
0.00% |
0 / 1 |
4.28 | |||
| fixLinksFromArticle | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
| deleteLinksFromNonexistent | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
| dfnCheckInterval | |
0.00% |
0 / 48 |
|
0.00% |
0 / 1 |
30 | |||
| intervalCond | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
30 | |||
| refreshTrackingCategory | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| refreshCategory | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| getPossibleCategories | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | * @file |
| 5 | */ |
| 6 | |
| 7 | use MediaWiki\Deferred\DeferredUpdates; |
| 8 | use MediaWiki\Deferred\LinksUpdate\CategoryLinksTable; |
| 9 | use MediaWiki\Deferred\LinksUpdate\ExternalLinksTable; |
| 10 | use MediaWiki\Deferred\LinksUpdate\ImageLinksTable; |
| 11 | use MediaWiki\Deferred\LinksUpdate\InterwikiLinksTable; |
| 12 | use MediaWiki\Deferred\LinksUpdate\PageLinksTable; |
| 13 | use MediaWiki\Deferred\LinksUpdate\TemplateLinksTable; |
| 14 | use MediaWiki\Linker\LinkTarget; |
| 15 | use MediaWiki\Maintenance\Maintenance; |
| 16 | use MediaWiki\MediaWikiServices; |
| 17 | use MediaWiki\Revision\RevisionRecord; |
| 18 | use MediaWiki\Title\Title; |
| 19 | use Wikimedia\Rdbms\IExpression; |
| 20 | use Wikimedia\Rdbms\IReadableDatabase; |
| 21 | use Wikimedia\Rdbms\RawSQLExpression; |
| 22 | use Wikimedia\Rdbms\SelectQueryBuilder; |
| 23 | |
| 24 | // @codeCoverageIgnoreStart |
| 25 | require_once __DIR__ . '/Maintenance.php'; |
| 26 | // @codeCoverageIgnoreEnd |
| 27 | |
/**
 * Refresh link tables.
 *
 * Maintenance script that re-runs the link updates for pages selected by
 * page ID range, namespace, category, tracking category or last link update
 * timestamp, and that deletes rows from the link tables whose source page
 * no longer exists.
 *
 * @ingroup Maintenance
 */
class RefreshLinks extends Maintenance {
	/** Number of processed pages between progress reports and replication waits */
	private const REPORTING_INTERVAL = 100;

	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Refresh link tables' );
		$this->addOption( 'verbose', 'Output information about link refresh progress', false, false, 'v' );
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'touched-only', 'Only fix pages that have been touched after last update' );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
			'query, default 100,000', false, true );
		$this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
		$this->addOption( 'category', 'Only fix pages in this category', false, true );
		$this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
		$this->addOption( 'before-timestamp', 'Only fix pages that were last updated before this timestamp',
			false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Select the pages to process according to the command line options and
	 * refresh their link tables.
	 *
	 * With --dfn-only, only the cleanup of rows belonging to nonexistent pages
	 * is run. With --category or --tracking-category, only the pages in those
	 * categories are refreshed and no cleanup is run. Otherwise the selected
	 * pages are refreshed and, unless --namespace was given, the cleanup runs
	 * afterwards.
	 */
	public function execute() {
		// Note that there is a difference between not specifying the start
		// and end IDs and using the minimum and maximum values from the page
		// table. In the latter case, deleteLinksFromNonexistent() will not
		// delete entries for nonexistent IDs that fall outside the range.
		$start = (int)$this->getArg( 0 ) ?: null;
		$end = (int)$this->getOption( 'e' ) ?: null;
		$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100_000 );
		if ( $dfnChunkSize < 1 ) {
			// With a chunk size of zero (which is also what a non-numeric value
			// is cast to), deleteLinksFromNonexistent() would compute the same
			// chunk boundary on every iteration and never terminate.
			$this->fatalError( "--dfn-chunk-size must be a positive integer\n" );
		}

		if ( $this->hasOption( 'dfn-only' ) ) {
			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
			return;
		}

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		$builder = $dbr->newSelectQueryBuilder()
			->from( 'page' )
			->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
			->limit( $this->getBatchSize() );

		if ( $this->hasOption( 'namespace' ) ) {
			$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
		}

		if ( $this->hasOption( 'before-timestamp' ) ) {
			$rawTimestamp = $this->getOption( 'before-timestamp' );
			// Validate the user input, then convert it to the format the
			// database uses for timestamp fields, so that the comparison below
			// does not depend on the format the user happened to type.
			$beforeTimestamp = wfTimestamp( TS_MW, $rawTimestamp );
			if ( $beforeTimestamp === false ) {
				$this->fatalError( "'$rawTimestamp' is not a valid timestamp!\n" );
			}
			$builder->andWhere(
				$dbr->expr( 'page_links_updated', '<', $dbr->timestamp( $beforeTimestamp ) )
					->or( 'page_links_updated', '=', null )
			);
		}

		if ( $this->hasOption( 'category' ) ) {
			$category = $this->getOption( 'category' );
			$title = Title::makeTitleSafe( NS_CATEGORY, $category );
			if ( !$title ) {
				$this->fatalError( "'$category' is an invalid category name!\n" );
			}
			$this->refreshCategory( $builder, $title );
		} elseif ( $this->hasOption( 'tracking-category' ) ) {
			// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
			$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
		} else {
			$new = $this->hasOption( 'new-only' );
			$redir = $this->hasOption( 'redirects-only' );
			$touched = $this->hasOption( 'touched-only' );
			$what = $redir ? 'redirects' : 'links';
			if ( $new ) {
				$builder->andWhere( [ 'page_is_new' => 1 ] );
				$this->output( "Refreshing $what from new pages...\n" );
			} else {
				if ( $touched ) {
					// Pages never refreshed, or touched since their last refresh
					$builder->andWhere( [
						$dbr->expr( 'page_links_updated', '=', null )
							->orExpr( new RawSQLExpression( 'page_touched > page_links_updated' ) ),
					] );
				}
				$this->output( "Refreshing $what from pages...\n" );
			}
			$this->doRefreshLinks( $builder, $redir );
			if ( !$this->hasOption( 'namespace' ) ) {
				$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
			}
		}
	}

	/**
	 * Do the actual link refreshing.
	 *
	 * Runs the query in batches, ordered by $indexFields, and resumes each
	 * batch after the index values of the last row of the previous one.
	 *
	 * @param SelectQueryBuilder $builder Query selecting the pages; it is cloned
	 *  for every batch, so the passed instance is only used as a template
	 * @param bool $redirectsOnly Only fix redirects
	 * @param string[] $indexFields Fields used to order and to page through the result
	 */
	private function doRefreshLinks(
		SelectQueryBuilder $builder,
		bool $redirectsOnly = false,
		array $indexFields = [ 'page_id' ]
	): void {
		// Give extensions a chance to optimize settings
		$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

		$estimateCount = $builder->caller( __METHOD__ )->estimateRowCount();
		$this->output( "Estimated page count: $estimateCount\n" );

		$i = 0;
		$lastIndexes = array_fill_keys( $indexFields, 0 );
		// page_id is always needed to refresh the page, even when it is not
		// one of the fields used for paging.
		$selectFields = in_array( 'page_id', $indexFields, true )
			? $indexFields : [ 'page_id', ...$indexFields ];
		$verbose = $this->hasOption( 'verbose' );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		do {
			$batchCond = $dbr->buildComparison( '>', $lastIndexes );
			$res = ( clone $builder )->select( $selectFields )
				->andWhere( [ $batchCond ] )
				->orderBy( $indexFields )
				->caller( __METHOD__ )->fetchResultSet();

			if ( $verbose ) {
				$this->output( "Refreshing links for {$res->numRows()} pages\n" );
			}

			foreach ( $res as $row ) {
				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
					$this->output( "$i\n" );
					$this->waitForReplication();
				}
				if ( $verbose ) {
					$this->output( "Refreshing links for page ID {$row->page_id}\n" );
				}
				self::fixRedirect( $this, $row->page_id );
				if ( !$redirectsOnly ) {
					self::fixLinksFromArticle( $row->page_id );
				}
			}
			if ( $res->numRows() ) {
				// Remember where this batch ended so the next one starts after it
				$res->seek( $res->numRows() - 1 );
				foreach ( $indexFields as $field ) {
					$lastIndexes[$field] = $res->current()->$field;
				}
			}

			// A batch that is not full means there are no more rows to fetch
		} while ( $res->numRows() == $this->getBatchSize() );
	}

	/**
	 * Update the redirect entry for a given page.
	 *
	 * This method bypasses the "redirect" table to get the redirect target,
	 * and parses the page's content to fetch it. This makes sure that
	 * the redirect target is up to date and valid.
	 * This is particularly useful when modifying namespaces to be sure the
	 * entry in the "redirect" table points to the correct page and not to an
	 * invalid one.
	 *
	 * @internal
	 * @param Maintenance $maint
	 * @param int $id The page ID to check
	 */
	public static function fixRedirect( Maintenance $maint, $id ) {
		$page = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );

		// In case the page just got deleted.
		if ( $page === null ) {
			return;
		}

		$rt = null;
		$content = $page->getContent( RevisionRecord::RAW );
		if ( $content !== null ) {
			$rt = $content->getRedirectTarget();
		}

		$dbw = $maint->getDB( DB_PRIMARY );
		if ( $rt === null ) {
			// The page is not a redirect
			// Delete any redirect table entry for it
			$dbw->newDeleteQueryBuilder()
				->deleteFrom( 'redirect' )
				->where( [ 'rd_from' => $id ] )
				->caller( __METHOD__ )->execute();
			$fieldValue = 0;
		} else {
			$page->insertRedirectEntry( $rt );
			$fieldValue = 1;
		}

		// Update the page table to be sure it is in a consistent state
		$update = $dbw->newUpdateQueryBuilder()
			->update( 'page' )
			->set( [ 'page_is_redirect' => $fieldValue ] )
			->where( [ 'page_id' => $id ] )
			->caller( __METHOD__ );
		$update->execute();
		$maint->getServiceContainer()->getLinkWriteDuplicator()->duplicate( $update );
	}

	/**
	 * Run LinksUpdate for all links on a given page_id
	 *
	 * Does nothing when no page with that ID exists.
	 *
	 * @param int $id The page_id
	 */
	public static function fixLinksFromArticle( $id ) {
		$services = MediaWikiServices::getInstance();
		$page = $services->getWikiPageFactory()->newFromID( $id );

		// In case the page just got deleted.
		if ( $page === null ) {
			return;
		}

		// Defer updates to post-send but then immediately execute deferred updates;
		// this is the simplest way to run all updates immediately (including updates
		// scheduled by other updates).
		$page->doSecondaryDataUpdates( [
			'defer' => DeferredUpdates::POSTSEND,
			'causeAction' => 'refresh-links-maintenance',
			'recursive' => false,
		] );
		DeferredUpdates::doUpdates();
	}

	/**
	 * Removes entries that belong to nonexistent pages from the pagelinks,
	 * imagelinks, categorylinks, templatelinks, externallinks, iwlinks,
	 * langlinks, redirect and page_props tables.
	 *
	 * The ID range is processed in chunks that each cover at most $chunkSize
	 * existing pages.
	 *
	 * @param int|null $start Page_id to start from
	 * @param int|null $end Page_id to stop at
	 * @param int $batchSize The size of deletion batches
	 * @param int $chunkSize Maximum number of existent IDs to check per query;
	 *  must be at least 1, otherwise the chunk boundary never advances
	 *
	 * @author Merlijn van Deen <valhallasw@arctus.nl>
	 */
	private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
		$chunkSize = 100_000
	): void {
		$this->waitForReplication();
		$this->output( "Deleting illegal entries from the links tables...\n" );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		do {
			// Find the start of the next chunk. This is based only
			// on existent page_ids.
			$nextStart = $dbr->newSelectQueryBuilder()
				->select( 'page_id' )
				->from( 'page' )
				->where( [ self::intervalCond( $dbr, 'page_id', $start, $end ) ] )
				->orderBy( 'page_id' )
				->offset( $chunkSize )
				->caller( __METHOD__ )->fetchField();

			if ( $nextStart !== false ) {
				// Normalize to an integer so it is not reused as a string
				// in arithmetic and in the next query's condition.
				$nextStart = (int)$nextStart;
				// To find the end of the current chunk, subtract one.
				// This will serve to limit the number of rows scanned in
				// dfnCheckInterval(), per query, to at most the sum of
				// the chunk size and deletion batch size.
				$chunkEnd = $nextStart - 1;
			} else {
				// This is the last chunk. Check all page_ids up to $end.
				$chunkEnd = $end;
			}

			$fmtStart = $start !== null ? "[$start" : '(-INF';
			$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
			$this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
			$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

			$start = $nextStart;

		} while ( $nextStart !== false );
	}

	/**
	 * For each link table, delete the rows in the given ID interval whose
	 * source page has no row in the page table, in batches of $batchSize IDs.
	 *
	 * @see RefreshLinks::deleteLinksFromNonexistent()
	 * @param int|null $start Page_id to start from
	 * @param int|null $end Page_id to stop at
	 * @param int $batchSize The size of deletion batches
	 */
	private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ): void {
		$linksTables = [
			// table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
			'iwlinks' => 'iwl_from',
			'langlinks' => 'll_from',
			'redirect' => 'rd_from',
			'page_props' => 'pp_page',
		];

		// table name => virtual domain; tables not listed here are requested
		// with a domain of false
		$domains = [
			'categorylinks' => CategoryLinksTable::VIRTUAL_DOMAIN,
			'externallinks' => ExternalLinksTable::VIRTUAL_DOMAIN,
			'imagelinks' => ImageLinksTable::VIRTUAL_DOMAIN,
			'iwlinks' => InterwikiLinksTable::VIRTUAL_DOMAIN,
			'pagelinks' => PageLinksTable::VIRTUAL_DOMAIN,
			'templatelinks' => TemplateLinksTable::VIRTUAL_DOMAIN,
		];

		$connectionProvider = $this->getServiceContainer()->getConnectionProvider();

		foreach ( $linksTables as $table => $field ) {
			$domain = $domains[$table] ?? false;
			$dbw = $connectionProvider->getPrimaryDatabase( $domain );
			$dbr = $connectionProvider->getReplicaDatabase( $domain, 'vslow' );

			$this->output( " $table: 0" );
			$tableStart = $start;
			$counter = 0;
			do {
				// Source page IDs in the interval that have no matching page row
				$ids = $dbr->newSelectQueryBuilder()
					->select( $field )
					->distinct()
					->from( $table )
					->leftJoin( 'page', null, "$field = page_id" )
					->where( self::intervalCond( $dbr, $field, $tableStart, $end ) )
					->andWhere( [ 'page_id' => null ] )
					->orderBy( $field )
					->limit( $batchSize )
					->caller( __METHOD__ )->fetchFieldValues();

				$numIds = count( $ids );
				if ( $numIds ) {
					$counter += $numIds;
					$dbw->newDeleteQueryBuilder()
						->deleteFrom( $table )
						->where( [ $field => $ids ] )
						->caller( __METHOD__ )->execute();
					$this->output( ", $counter" );
					// Continue after the highest ID that was just deleted
					$tableStart = (int)$ids[$numIds - 1] + 1;
					$this->waitForReplication();
				}

				// Stop once a batch is not full or the end of the interval is passed
			} while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );

			$this->output( " deleted.\n" );
		}
	}

	/**
	 * Build a SQL expression for a closed interval.
	 *
	 * By specifying a null $start or $end, it is also possible to create
	 * half-bounded or unbounded intervals using this function. The unbounded
	 * case is expressed as "$var is not null".
	 *
	 * @param IReadableDatabase $db
	 * @param string $var Field name
	 * @param mixed $start First value to include or null
	 * @param mixed $end Last value to include or null
	 * @return IExpression
	 */
	private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
		if ( $start === null && $end === null ) {
			return $db->expr( $var, '!=', null );
		} elseif ( $end === null ) {
			return $db->expr( $var, '>=', $start );
		} elseif ( $start === null ) {
			return $db->expr( $var, '<=', $end );
		} else {
			return $db->expr( $var, '>=', $start )->and( $var, '<=', $end );
		}
	}

	/**
	 * Refreshes links for pages in a tracking category
	 *
	 * @param SelectQueryBuilder $builder Cloned for each category, so the
	 *  passed instance is not modified
	 * @param string $category Category key
	 */
	private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ): void {
		$cats = $this->getPossibleCategories( $category );

		if ( !$cats ) {
			$this->error( "Tracking category '$category' is disabled\n" );
			// Output to stderr but don't bail out.
		}

		foreach ( $cats as $cat ) {
			$this->refreshCategory( clone $builder, $cat );
		}
	}

	/**
	 * Refreshes links for the pages in a category
	 *
	 * @param SelectQueryBuilder $builder Modified in place: the joins and
	 *  conditions that restrict the query to the category are added to it
	 * @param LinkTarget $category
	 */
	private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ): void {
		$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

		$builder->join( 'categorylinks', null, 'page_id=cl_from' )
			->join( 'linktarget', null, 'lt_id=cl_target_id' )
			->andWhere( [ 'lt_title' => $category->getDBkey(), 'lt_namespace' => NS_CATEGORY ] );
		$this->doRefreshLinks( $builder, false, [ 'cl_timestamp', 'cl_from' ] );
	}

	/**
	 * Returns a list of possible categories for a given tracking category key
	 *
	 * Aborts the script with a fatal error when the key is unknown.
	 *
	 * @param string $categoryKey
	 * @return LinkTarget[]
	 */
	private function getPossibleCategories( $categoryKey ) {
		$cats = $this->getServiceContainer()->getTrackingCategories()->getTrackingCategories();
		if ( isset( $cats[$categoryKey] ) ) {
			return $cats[$categoryKey]['cats'];
		}
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}
}
| 442 | |
| 443 | // @codeCoverageIgnoreStart |
| 444 | $maintClass = RefreshLinks::class; |
| 445 | require_once RUN_MAINTENANCE_IF_MAIN; |
| 446 | // @codeCoverageIgnoreEnd |