Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 161 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
| CleanupInvalidDbKeys | |
0.00% |
0 / 161 |
|
0.00% |
0 / 6 |
1482 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| execute | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
| outputStatus | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| writeToReport | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| cleanupTable | |
0.00% |
0 / 135 |
|
0.00% |
0 / 1 |
812 | |||
| makeValidTitle | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Cleans up invalid titles in various tables. |
| 4 | * |
| 5 | * @license GPL-2.0-or-later |
| 6 | * @file |
| 7 | * @ingroup Maintenance |
| 8 | */ |
| 9 | |
| 10 | // @codeCoverageIgnoreStart |
| 11 | require_once __DIR__ . '/Maintenance.php'; |
| 12 | // @codeCoverageIgnoreEnd |
| 13 | |
| 14 | use MediaWiki\Deferred\LinksUpdate\CategoryLinksTable; |
| 15 | use MediaWiki\Deferred\LinksUpdate\ImageLinksTable; |
| 16 | use MediaWiki\Deferred\LinksUpdate\PageLinksTable; |
| 17 | use MediaWiki\Deferred\LinksUpdate\TemplateLinksTable; |
| 18 | use MediaWiki\Maintenance\Maintenance; |
| 19 | use MediaWiki\Title\TitleValue; |
| 20 | use MediaWiki\WikiMap\WikiMap; |
| 21 | use Wikimedia\Rdbms\IExpression; |
| 22 | use Wikimedia\Rdbms\LikeValue; |
| 23 | |
/**
 * Maintenance script that cleans up invalid titles in various tables.
 *
 * For every configured table the script reports rows whose title field starts
 * or ends with an underscore, or contains a space, CR, LF or tab. With --fix the
 * rows are then renamed, deleted, or have the links of their source page
 * refreshed, depending on the table. A single run looks at no more than one
 * batch of rows per table, so it may need to be run repeatedly.
 *
 * @since 1.29
 * @ingroup Maintenance
 */
class CleanupInvalidDbKeys extends Maintenance {
	/**
	 * @var array[] List of tables to clean up, and the field prefix for that table.
	 *
	 * Each entry is [ table name, field prefix ], optionally followed by named
	 * keys that cleanupTable() reads:
	 *  - 'idField', 'nsField', 'titleField': used instead of the defaults
	 *    "{prefix}_id", "{prefix}_namespace" and "{prefix}_title". The value goes
	 *    straight into the SELECT field list, so the numeric values below are
	 *    presumably selected as constants (protected_titles has no ID field).
	 *  - 'virtualDomain': domain handed to the connection provider for this table.
	 */
	protected static $tables = [
		// Data tables
		[ 'page', 'page' ],
		[ 'redirect', 'rd', 'idField' => 'rd_from' ],
		[ 'archive', 'ar' ],
		[ 'logging', 'log' ],
		[ 'protected_titles', 'pt', 'idField' => 0 ],
		[ 'category', 'cat', 'nsField' => 14 ],
		[ 'recentchanges', 'rc' ],
		[ 'watchlist', 'wl' ],
		// The querycache tables' qc(c)_title and qcc_titletwo may contain titles,
		// but also usernames or other things like that, so we leave them alone

		// Links tables
		[ 'pagelinks', 'pl', 'idField' => 'pl_from', 'virtualDomain' => PageLinksTable::VIRTUAL_DOMAIN ],
		[ 'templatelinks', 'tl', 'idField' => 'tl_from', 'virtualDomain' => TemplateLinksTable::VIRTUAL_DOMAIN ],
		[ 'categorylinks', 'cl', 'idField' => 'cl_from', 'virtualDomain' => CategoryLinksTable::VIRTUAL_DOMAIN ],
		[ 'imagelinks', 'il', 'idField' => 'il_from', 'virtualDomain' => ImageLinksTable::VIRTUAL_DOMAIN ],
	];

	/**
	 * Registers the script description, the --fix and --table options, and a
	 * default batch size of 500.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( <<<'TEXT'
This script cleans up the title fields in various tables to remove entries that
will be rejected by the constructor of TitleValue. This constructor throws an
exception when invalid data is encountered, which will not normally occur on
regular page views, but can happen on query special pages.

The script targets titles matching the regular expression /^_|[ \r\n\t]|_$/.
Because any foreign key relationships involving these titles will already be
broken, the titles are corrected to a valid version or the rows are deleted
entirely, depending on the table.

The script runs with the expectation that STDOUT is redirected to a file.
TEXT
		);
		$this->addOption( 'fix', 'Actually clean up invalid titles. If this parameter is ' .
			'not specified, the script will report invalid titles but not clean them up.',
			false, false );
		$this->addOption( 'table', 'The table(s) to process. This option can be specified ' .
			'more than once (e.g. -t category -t watchlist). If not specified, all available ' .
			'tables will be processed. Available tables are: ' .
			implode( ', ', array_column( static::$tables, 0 ) ), false, true, 't', true );

		$this->setBatchSize( 500 );
	}

	/**
	 * Runs cleanupTable() for every configured table, or only for the tables
	 * named with --table, then prints a completion message.
	 */
	public function execute() {
		$tablesToProcess = $this->getOption( 'table' );
		foreach ( static::$tables as $tableParams ) {
			// No --table option means "process everything". Table names are
			// strings on both sides, so compare strictly.
			if ( !$tablesToProcess || in_array( $tableParams[0], $tablesToProcess, true ) ) {
				$this->cleanupTable( $tableParams );
			}
		}

		$this->outputStatus( 'Done!' );
		if ( $this->hasOption( 'fix' ) ) {
			$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
			$this->outputStatus( " Cleaned up invalid DB keys on $dbDomain!\n" );
		}
	}

	/**
	 * Prints progress text to both STDERR (verbatim) and STDOUT (trimmed and
	 * prefixed with "*** "). The script expects STDOUT to be redirected to a
	 * report file, so the STDERR copy is what the operator sees live.
	 *
	 * @param string $str Text to write to both places
	 * @param string|null $channel Ignored
	 */
	protected function outputStatus( $str, $channel = null ) {
		// Make it easier to find progress lines in the STDOUT log. Whitespace-only
		// text is kept out of the report; compare against '' so that a status of
		// "0" is not mistaken for empty.
		$trimmed = trim( $str );
		if ( $trimmed !== '' ) {
			fwrite( STDOUT, '*** ' . $trimmed . "\n" );
		}
		fwrite( STDERR, $str );
	}

	/**
	 * Prints text to STDOUT. Used for logging output.
	 *
	 * @param string $str Text to write
	 */
	protected function writeToReport( $str ) {
		fwrite( STDOUT, $str );
	}

	/**
	 * Identifies, and optionally cleans up, invalid titles.
	 *
	 * Selects up to one batch of rows with an invalid title from a replica and
	 * lists them in the report. Without --fix it only describes what would be
	 * done. With --fix the action depends on the table:
	 *  - page, redirect: nothing is changed, the operator is told to repair by hand
	 *  - archive, logging: the title is rewritten with makeValidTitle()
	 *  - recentchanges, watchlist, category, protected_titles: the rows are deleted
	 *  - links tables: links of the source page are refreshed, or the row is
	 *    deleted when the source page does not exist
	 *
	 * @param array $tableParams A child array of self::$tables
	 */
	protected function cleanupTable( $tableParams ) {
		[ $table, $prefix ] = $tableParams;
		$idField = $tableParams['idField'] ?? "{$prefix}_id";
		$nsField = $tableParams['nsField'] ?? "{$prefix}_namespace";
		$titleField = $tableParams['titleField'] ?? "{$prefix}_title";

		$this->outputStatus( "Looking for invalid $titleField entries in $table...\n" );

		$services = $this->getServiceContainer();

		// Do all the select queries on the replicas, as they are slow (they use
		// unanchored LIKEs). Naturally this could cause problems if rows are
		// modified after selecting and before deleting/updating, but working on
		// the hypothesis that invalid rows will be old and in all likelihood
		// unreferenced, we should be fine to do it like this.
		if ( isset( $tableParams['virtualDomain'] ) ) {
			$dbr = $services->getConnectionProvider()->getReplicaDatabase(
				$tableParams['virtualDomain'],
				'vslow'
			);
		} else {
			$dbr = $this->getDB( DB_REPLICA, 'vslow' );
		}

		// Tables known to LinksMigration supply their own namespace/title fields
		// and the tables and joins needed to read them.
		$linksMigration = $services->getLinksMigration();
		$joinConds = [];
		$tables = [ $table ];
		if ( isset( $linksMigration::$mapping[$table] ) ) {
			[ $nsField, $titleField ] = $linksMigration->getTitleFields( $table );
			$queryInfo = $linksMigration->getQueryInfo( $table );
			$joinConds = $queryInfo['joins'];
			$tables = $queryInfo['tables'];
		}

		// Find all TitleValue-invalid titles.
		$percent = $dbr->anyString();
		// The REGEXP operator is not cross-DBMS, so we have to use lots of LIKEs
		$likeExpr = $dbr
			->expr( $titleField, IExpression::LIKE, new LikeValue( $percent, ' ', $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\r", $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\n", $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\t", $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( '_', $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, '_' ) );
		$res = $dbr->newSelectQueryBuilder()
			->select( [
				'id' => $idField,
				'ns' => $nsField,
				'title' => $titleField,
			] )
			->tables( $tables )
			->where( $likeExpr )
			->joinConds( $joinConds )
			->limit( $this->getBatchSize() )
			->caller( __METHOD__ )
			->fetchResultSet();

		$this->outputStatus( "Number of invalid rows: " . $res->numRows() . "\n" );
		if ( !$res->numRows() ) {
			$this->outputStatus( "\n" );
			return;
		}

		// Write a table of titles to the report file. Also keep a list of the found
		// IDs, as we might need it later for DB updates
		$this->writeToReport( sprintf( "%10s | ns | dbkey\n", $idField ) );
		$ids = [];
		foreach ( $res as $row ) {
			$this->writeToReport( sprintf( "%10d | %3d | %s\n", $row->id, $row->ns, $row->title ) );
			$ids[] = $row->id;
		}

		// If we're doing a dry run, output the new titles we would use for the UPDATE
		// queries (if relevant), and finish
		if ( !$this->hasOption( 'fix' ) ) {
			if ( $table === 'logging' || $table === 'archive' ) {
				$this->writeToReport( "The following updates would be run with the --fix flag:\n" );
				foreach ( $res as $row ) {
					$newTitle = self::makeValidTitle( $row->title );
					$this->writeToReport(
						"$idField={$row->id}: update '{$row->title}' to '$newTitle'\n" );
				}
			}

			// --fix changes nothing for page and redirect, so don't suggest it there
			if ( $table !== 'page' && $table !== 'redirect' ) {
				$this->outputStatus( "Run with --fix to clean up these rows\n" );
			}
			$this->outputStatus( "\n" );
			return;
		}

		// Fix the bad data, using different logic for the various tables
		if ( isset( $tableParams['virtualDomain'] ) ) {
			$dbw = $services->getConnectionProvider()->getPrimaryDatabase(
				$tableParams['virtualDomain']
			);
		} else {
			$dbw = $this->getPrimaryDB();
		}

		switch ( $table ) {
			case 'page':
			case 'redirect':
				// This shouldn't happen on production wikis, and we already have a script
				// to handle 'page' rows anyway, so just notify the user and let them decide
				// what to do next.
				$this->outputStatus( <<<TEXT
IMPORTANT: This script does not fix invalid entries in the $table table.
Consider repairing these rows, and rows in related tables, by hand.
You may like to run, or borrow logic from, the cleanupTitles.php script.

TEXT
				);
				break;

			case 'archive':
			case 'logging':
				// Rename the title to a corrected equivalent. Any foreign key relationships
				// to the page_title field are already broken, so this will just make sure
				// users can still access the log entries/deleted revisions from the interface
				// using a valid page title.
				$this->outputStatus(
					"Updating these rows, setting $titleField to the closest valid DB key...\n" );
				$affectedRowCount = 0;
				foreach ( $res as $row ) {
					$newTitle = self::makeValidTitle( $row->title );
					$this->writeToReport(
						"$idField={$row->id}: updating '{$row->title}' to '$newTitle'\n" );

					$dbw->newUpdateQueryBuilder()
						->update( $table )
						->set( [ $titleField => $newTitle ] )
						->where( [ $idField => $row->id ] )
						->caller( __METHOD__ )
						->execute();
					$affectedRowCount += $dbw->affectedRows();
				}
				$this->waitForReplication();
				$this->outputStatus( "Updated $affectedRowCount rows on $table.\n" );

				break;

			case 'recentchanges':
			case 'watchlist':
			case 'category':
				// Since these broken titles can't exist, there's really nothing to watch,
				// nothing can be categorised in them, and they can't have been changed
				// recently, so we can just remove these rows.
				$this->outputStatus( "Deleting invalid $table rows...\n" );
				$dbw->newDeleteQueryBuilder()
					->deleteFrom( $table )
					->where( [ $idField => $ids ] )
					->caller( __METHOD__ )->execute();
				// Read the count right after the DELETE, as the other cases do, so that
				// nothing waitForReplication() might run on this connection can come
				// between the query and its row count.
				$affectedRowCount = $dbw->affectedRows();
				$this->waitForReplication();
				$this->outputStatus( "Deleted $affectedRowCount rows from $table.\n" );
				break;

			case 'protected_titles':
				// Since these broken titles can't exist, there's really nothing to protect,
				// so we can just remove these rows. Made more complicated by this table
				// not having an ID field
				$this->outputStatus( "Deleting invalid $table rows...\n" );
				$affectedRowCount = 0;
				foreach ( $res as $row ) {
					$dbw->newDeleteQueryBuilder()
						->deleteFrom( $table )
						->where( [ $nsField => $row->ns, $titleField => $row->title ] )
						->caller( __METHOD__ )->execute();
					$affectedRowCount += $dbw->affectedRows();
				}
				$this->waitForReplication();
				$this->outputStatus( "Deleted $affectedRowCount rows from $table.\n" );
				break;

			case 'pagelinks':
			case 'templatelinks':
			case 'categorylinks':
			case 'imagelinks':
				// Update links tables for each page where these bogus links are supposedly
				// located. If the invalid rows don't go away after these jobs go through,
				// they're probably being added by a buggy hook.
				$this->outputStatus( "Queueing link update jobs for the pages in $idField...\n" );
				$wikiPageFactory = $services->getWikiPageFactory();
				foreach ( $res as $row ) {
					if ( $wikiPageFactory->newFromID( $row->id ) ) {
						RefreshLinks::fixLinksFromArticle( $row->id );
					} else {
						// This link entry points to a nonexistent page, so just get rid of it.
						// NOTE(review): by this script's own description, the TitleValue
						// constructor rejects the kind of title selected here, so this may
						// throw before the delete runs - confirm and rework if so.
						$dbw->newDeleteQueryBuilder()
							->deleteFrom( $table )
							->where( [ $idField => $row->id ] )
							->andWhere( $linksMigration->getLinksConditions(
								$table,
								new TitleValue( (int)$row->ns, $row->title )
							) )
							->caller( __METHOD__ )->execute();
					}
				}
				$this->waitForReplication();
				$this->outputStatus( "Link update jobs have been added to the job queue.\n" );
				break;
		}

		$this->outputStatus( "\n" );
	}

	/**
	 * Fix possible validation issues in the given title (DB key).
	 *
	 * Spaces and tabs become underscores, CR and LF are removed, and any
	 * underscores left at either end are stripped. The result is an empty string
	 * when the input consists only of such characters.
	 *
	 * @param string $invalidTitle
	 * @return string
	 */
	protected static function makeValidTitle( $invalidTitle ) {
		// Translate whitespace before trimming: a leading or trailing space or tab
		// turns into an underscore, which has to be stripped as well. Trimming
		// first would leave titles such as " Foo" as the still-invalid "_Foo".
		return trim(
			strtr( $invalidTitle, [ ' ' => '_', "\r" => '', "\n" => '', "\t" => '_' ] ),
			'_'
		);
	}
}
| 340 | |
| 341 | // @codeCoverageIgnoreStart
| 342 | $maintClass = CleanupInvalidDbKeys::class; // name of the script class defined in this file
| 343 | require_once RUN_MAINTENANCE_IF_MAIN; // NOTE(review): presumably runs $maintClass when this file is the entry script; the constant is defined outside this file - confirm
| 344 | // @codeCoverageIgnoreEnd