Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 267 |
|
0.00% |
0 / 8 |
CRAP | |
0.00% |
0 / 1 |
| UpdateCollation | |
0.00% |
0 / 267 |
|
0.00% |
0 / 8 |
2652 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
2 | |||
| init | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
12 | |||
| execute | |
0.00% |
0 / 58 |
|
0.00% |
0 / 1 |
182 | |||
| updateBatch | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
72 | |||
| copyBatch | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
30 | |||
| updateSortKeySizeHistogram | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
| showSortKeySizeHistogram | |
0.00% |
0 / 40 |
|
0.00% |
0 / 1 |
110 | |||
| runNormalizationMigration | |
0.00% |
0 / 47 |
|
0.00% |
0 / 1 |
72 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Find all rows in the categorylinks table whose collation is out-of-date |
| 4 | * (collation_name != $wgCategoryCollation) and repopulate cl_sortkey |
| 5 | * using the page title and cl_sortkey_prefix. |
| 6 | * |
| 7 | * @license GPL-2.0-or-later |
| 8 | * @file |
| 9 | * @ingroup Maintenance |
| 10 | * @author Aryeh Gregor (Simetrical) |
| 11 | */ |
| 12 | |
| 13 | // @codeCoverageIgnoreStart |
| 14 | require_once __DIR__ . '/Maintenance.php'; |
| 15 | // @codeCoverageIgnoreEnd |
| 16 | |
| 17 | use MediaWiki\Logger\LoggerFactory; |
| 18 | use MediaWiki\MainConfigNames; |
| 19 | use MediaWiki\Maintenance\Maintenance; |
| 20 | use MediaWiki\Storage\NameTableStore; |
| 21 | use MediaWiki\Title\NamespaceInfo; |
| 22 | use MediaWiki\Title\Title; |
| 23 | use Wikimedia\Rdbms\IMaintainableDatabase; |
| 24 | use Wikimedia\Rdbms\IReadableDatabase; |
| 25 | use Wikimedia\Rdbms\IResultWrapper; |
| 26 | |
| 27 | /** |
| 28 | * Maintenance script that will find all rows in the categorylinks table |
| 29 | * whose collation is out-of-date. |
| 30 | * |
| 31 | * @ingroup Maintenance |
| 32 | */ |
| 33 | class UpdateCollation extends Maintenance { |
| 34 | /** @var int[] */ |
| 35 | public $sizeHistogram = []; |
| 36 | |
| 37 | /** @var int */ |
| 38 | private $numRowsProcessed = 0; |
| 39 | |
| 40 | /** @var bool */ |
| 41 | private $force; |
| 42 | |
| 43 | /** @var bool */ |
| 44 | private $dryRun; |
| 45 | |
| 46 | /** @var bool */ |
| 47 | private $verboseStats; |
| 48 | |
| 49 | /** @var Collation */ |
| 50 | private $collation; |
| 51 | |
| 52 | /** @var string */ |
| 53 | private $collationName; |
| 54 | |
| 55 | /** @var string|null */ |
| 56 | private $targetTable; |
| 57 | |
| 58 | private bool $normalization = false; |
| 59 | |
| 60 | /** @var IReadableDatabase */ |
| 61 | private $dbr; |
| 62 | |
| 63 | /** @var IMaintainableDatabase */ |
| 64 | private $dbw; |
| 65 | |
| 66 | private NamespaceInfo $namespaceInfo; |
| 67 | private NameTableStore $collationNameStore; |
| 68 | |
/**
 * Register the script description, set the batch size to 100 and declare
 * every command line option the script accepts.
 */
public function __construct() {
	parent::__construct();

	$summary = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (collation_name is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $summary );

	$this->setBatchSize( 100 );

	// Row selection
	$this->addOption(
		'force',
		'Run on all rows, even if the collation is supposed to be up-to-date.',
		false,
		false,
		'f'
	);
	$this->addOption(
		'previous-collation',
		'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
			'especially if your categorylinks table is large. This will only update rows ' .
			'with that collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.',
		false,
		true
	);

	// Target selection
	$this->addOption(
		'target-collation',
		'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
			'Usually you should not use this, you should just update $wgCategoryCollation ' .
			'in LocalSettings.php.',
		false,
		true
	);
	$this->addOption(
		'target-table',
		'Copy rows from categorylinks into the specified table instead of updating them in place.',
		false,
		true
	);

	// Operating modes
	$this->addOption(
		'only-migrate-normalization',
		'Only backfill cl_collation_id field from cl_collation',
		false
	);
	$this->addOption(
		'remote',
		'Use Shellbox to calculate the new sort keys remotely.'
	);
	$this->addOption(
		'dry-run',
		"Don't actually change the collations, just compile statistics."
	);
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
| 102 | |
/**
 * Get services and initialise member variables.
 *
 * The collation name comes from --target-collation when given, otherwise
 * from the CategoryCollation config setting. With --remote the Collation
 * object is built for the "remote-" prefixed name, while $this->collationName
 * keeps the unprefixed name.
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->collationNameStore = new NameTableStore(
		$services->getDBLoadBalancer(),
		$services->getMainWANObjectCache(),
		LoggerFactory::getInstance( 'SecondaryDataUpdate' ),
		'collation',
		'collation_id',
		'collation_name'
	);

	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	$realCollationName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $realCollationName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	// Cast the flags explicitly so the properties really hold booleans, as
	// their declarations say, instead of the raw option value.
	$this->force = (bool)$this->getOption( 'force' );
	$this->dryRun = (bool)$this->getOption( 'dry-run' );
	$this->verboseStats = (bool)$this->getOption( 'verbose-stats' );
	$this->normalization = (bool)$this->getOption( 'only-migrate-normalization', false );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getReplicaDB();
	$this->targetTable = $this->getOption( 'target-table' );
}
| 142 | |
/**
 * Run the script.
 *
 * With --only-migrate-normalization only the cl_collation_id backfill is
 * run. Otherwise categorylinks rows are walked in cl_from ranges of one
 * batch size each, up to the highest page_id, and every non-empty batch is
 * either updated in place or copied into the target table.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->normalization ) {
		$this->runNormalizationMigration();
		return;
	}

	// A dry run only compiles statistics, so it must not create the target
	// table. copyBatch() never touches that table in dry-run mode.
	if ( $this->targetTable
		&& !$this->dryRun
		&& !$this->dbw->tableExists( $this->targetTable, __METHOD__ )
	) {
		$this->output( "Creating table {$this->targetTable}\n" );
		$this->dbw->query(
			'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
			' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
			__METHOD__
		);
	}

	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['collation_name'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds[] = $this->dbr->expr( 'collation_name', '!=', $this->collationName );
		}
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The database type does not change between batches, so
	// decide this once.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_target_id', 'cl_sortkey_prefix', 'cl_sortkey', $clType,
				'cl_timestamp', 'collation_name', 'page_namespace', 'page_title'
			] )
			->from( 'categorylinks' )
			->join( 'collation', null, 'cl_collation_id = collation_id' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue += $batchSize;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
| 229 | |
/**
 * Update a set of rows in the categorylinks table.
 *
 * For each row the sort key is recomputed with the target collation and
 * truncated to 230 bytes. In dry-run mode nothing is written; the method only
 * counts the rows whose stored sort key differs from the recomputed one.
 * Otherwise all updates of the batch run inside one transaction round.
 *
 * @param IResultWrapper $res Rows selected by execute()
 */
private function updateBatch( IResultWrapper $res ) {
	if ( !$this->dryRun ) {
		$this->beginTransactionRound( __METHOD__ );
	}

	// The collation name is the same for every row, so its ID is resolved
	// once, on the first row that is actually written.
	$collationId = null;

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		if ( !$row->collation_name ) {
			// This is an old-style row, so the sortkey needs to be
			// converted.
			if ( $row->cl_sortkey === $title->getText()
				|| $row->cl_sortkey === $title->getPrefixedText()
			) {
				$prefix = '';
			} else {
				// Custom sortkey, so use it as a prefix
				$prefix = $row->cl_sortkey;
			}
		} else {
			$prefix = $row->cl_sortkey_prefix;
		}

		// cl_type will be wrong for lots of pages if cl_collation is 0,
		// so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
		$newSortKey = $this->collation->getSortKey(
			$title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $newSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $newSortKey, 0, 230 );

		if ( $this->dryRun ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count
			// changes in other fields, if any, those usually only happen when upgrading old
			// MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$collationId ??= $this->collationNameStore->acquireId( $this->collationName );
		$this->dbw->newUpdateQueryBuilder()
			->update( 'categorylinks' )
			->set( [
				'cl_sortkey' => $newSortKey,
				'cl_sortkey_prefix' => $prefix,
				'cl_collation_id' => $collationId,
				'cl_type' => $type,
				'cl_timestamp = cl_timestamp',
			] )
			->where( [ 'cl_from' => $row->cl_from, 'cl_target_id' => $row->cl_target_id ] )
			->caller( __METHOD__ )
			->execute();
		$this->numRowsProcessed++;
	}

	if ( !$this->dryRun ) {
		$this->commitTransactionRound( __METHOD__ );
	}
}
| 287 | |
/**
 * Copy a set of rows to the target table.
 *
 * Sort keys for the whole batch are requested in one getSortKeys() call and
 * each is truncated to 230 bytes before being stored. In dry-run mode the
 * rows are only counted: nothing is inserted and no collation ID is acquired.
 *
 * @param IResultWrapper $res Rows selected by execute()
 * @throws RuntimeException If no sort key was returned for one of the rows
 */
private function copyBatch( IResultWrapper $res ) {
	$sortKeyInputs = [];
	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		$sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $sortKeyInputs );

	$rowsToInsert = [];
	$numRows = 0;
	// Resolved lazily, and only when rows are really going to be written, so
	// that a dry run never calls acquireId().
	$collationId = null;

	foreach ( $res as $i => $row ) {
		if ( !isset( $sortKeys[$i] ) ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		$newSortKey = $sortKeys[$i];
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $newSortKey );
		$numRows++;

		if ( $this->dryRun ) {
			continue;
		}

		$collationId ??= $this->collationNameStore->acquireId( $this->collationName );
		$rowsToInsert[] = [
			'cl_from' => $row->cl_from,
			'cl_target_id' => $row->cl_target_id,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $newSortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation_id' => $collationId,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += $numRows;
		return;
	}

	$this->beginTransactionRound( __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $rowsToInsert )
		->caller( __METHOD__ )->execute();
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransactionRound( __METHOD__ );
}
| 332 | |
/**
 * Record the byte length of a sort key in the size histogram.
 *
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key whose length is counted
 */
private function updateSortKeySizeHistogram( string $key ) {
	if ( $this->verboseStats ) {
		$bytes = strlen( $key );
		$this->sizeHistogram[$bytes] = ( $this->sizeHistogram[$bytes] ?? 0 ) + 1;
	}
}
| 346 | |
/**
 * Show the verbose statistics.
 *
 * Prints the raw number of sort keys for every length from 0 to the longest
 * one seen, followed by a 20-bin bar chart of the same data. Prints nothing
 * if no sizes were recorded or the longest key has length 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}

	$numBins = 20;
	$maxBarWidth = 60;

	// Exclusive upper boundary of each bin. The last bin always ends just
	// past the longest key so that every length falls into some bin.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$rawValues = [];
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		$val = $this->sizeHistogram[$i] ?? 0;
		$rawValues[] = $val;

		// First bin whose upper boundary lies above this length; the last
		// bin catches everything else.
		$bin = $numBins - 1;
		for ( $j = 0; $j < $numBins - 1; $j++ ) {
			if ( $coarseBoundaries[$j] > $i ) {
				$bin = $j;
				break;
			}
		}
		$coarseHistogram[$bin] += $val;
	}
	$raw = implode( ', ', $rawValues );

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one key was recorded and every key lands in a bin, so the
	// largest bin is never empty.
	$maxBinVal = max( $coarseHistogram );
	$prevBoundary = 0;
	foreach ( $coarseHistogram as $coarseIndex => $val ) {
		$boundary = $coarseBoundaries[$coarseIndex];
		// Scale each bar against the largest bin. A single integer factor
		// of 60 / max would truncate to 0 as soon as a bin holds more than
		// 60 keys, leaving every bar empty. Rounding up keeps non-empty
		// bins visible.
		$barLength = (int)ceil( $maxBarWidth * $val / $maxBinVal );
		$this->output(
			sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', $barLength )
			)
		);
		$prevBoundary = $boundary;
	}
}
| 403 | |
/**
 * Backfill categorylinks.cl_collation_id from the cl_collation name column.
 *
 * Returns early, with a message, if cl_collation no longer exists, or if
 * the cl_collation_id column or the collation table has not been created
 * yet. Otherwise rows are walked in cl_from ranges of one batch size each.
 * For every distinct collation name that still has rows with
 * cl_collation_id = 0 in the range, the ID is acquired and written to those
 * rows. In dry-run mode nothing is written.
 */
private function runNormalizationMigration() {
	if ( !$this->dbw->fieldExists( 'categorylinks', 'cl_collation', __METHOD__ ) ) {
		$this->output( "The cl_collation column appears to already be normalized. Skipping.\n" );
		return;
	}
	if ( !$this->dbw->fieldExists( 'categorylinks', 'cl_collation_id', __METHOD__ ) ) {
		$this->output( "The cl_collation_id column doesn't exist. Run update.php to create it.\n" );
		return;
	}
	if ( !$this->dbw->tableExists( 'collation', __METHOD__ ) ) {
		$this->output( "The collation table doesn't exist. Run update.php to create it.\n" );
		return;
	}

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();
	$batchValue = 0;
	$batchSize = $this->getBatchSize();

	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );
		$batchEnd = $batchValue + $batchSize;

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [ 'cl_collation' ] )
			->distinct()
			->from( 'categorylinks' )
			->where( [ 'cl_collation_id' => 0 ] )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchEnd )
			)
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() && !$this->dryRun ) {
			foreach ( $res as $row ) {
				$collationName = $row->cl_collation;
				$collationId = $this->collationNameStore->acquireId( $collationName );
				$this->dbw->newUpdateQueryBuilder()
					->update( 'categorylinks' )
					->set( [ 'cl_collation_id' => $collationId ] )
					// Only touch rows that are still unmigrated, matching
					// the SELECT above. Rows that already carry an ID must
					// not have it overwritten from cl_collation.
					->where( [
						'cl_collation' => $collationName,
						'cl_collation_id' => 0,
					] )
					->andWhere(
						$this->dbw->expr( 'cl_from', '>=', $batchValue )
							->and( 'cl_from', '<', $batchEnd )
					)
					->caller( __METHOD__ )->execute();
				$this->numRowsProcessed += $this->dbw->affectedRows();
			}

			$this->waitForReplication();
		}
		$batchValue = $batchEnd;

		$this->output( "{$this->numRowsProcessed} done.\n" );
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}
}
| 467 | } |
| 468 | |
| 469 | // @codeCoverageIgnoreStart |
| 470 | $maintClass = UpdateCollation::class; |
| 471 | require_once RUN_MAINTENANCE_IF_MAIN; |
| 472 | // @codeCoverageIgnoreEnd |