// Sort key size histogram: byte length of a sort key => number of keys seen with that length.
35 public $sizeHistogram = [];
// Running row counter that is printed in the progress messages.
38 private $numRowsProcessed = 0;
// Value of the 'verbose-stats' option; gates histogram collection and output.
47 private $verboseStats;
// Collation name rows are migrated to: the 'target-collation' option if given,
// otherwise the CategoryCollation configuration value.
53 private $collationName;
// Value of the 'only-migrate-normalization' option; when set, runNormalizationMigration() is invoked.
58 private bool $normalization =
false;
// Fragment of the constructor (its signature line is not part of this excerpt):
// sets the script description and registers the command-line options.
// NOTE(review): some argument lines of the addOption() calls below are missing from this excerpt.
70 parent::__construct();
73This script will find all rows in the categorylinks table whose collation is
75repopulate cl_sortkey
using the page title and cl_sortkey_prefix. If all
76collations are up-to-date, it will
do nothing.
81 $this->
addOption(
'force',
'Run on all rows, even if the collation is ' .
82 'supposed to be up-to-date.',
false,
false,
'f' );
// Optional filter: only rows that currently carry this collation name are selected.
83 $this->
addOption(
'previous-collation',
'Set the previous value of ' .
84 '$wgCategoryCollation here to speed up this script, especially if your ' .
85 'categorylinks table is large. This will only update rows with that ' .
86 'collation, though, so it may miss out-of-date rows with a different, ' .
87 'even older collation.',
false,
true );
// Overrides the configured CategoryCollation value for this run.
88 $this->
addOption(
'target-collation',
'Set this to the new collation type to ' .
89 'use instead of $wgCategoryCollation. Usually you should not use this, ' .
90 'you should just update $wgCategoryCollation in LocalSettings.php.',
92 $this->
addOption(
'target-table',
'Copy rows from categorylinks into the ' .
93 'specified table instead of updating them in place.',
false,
true );
// Switches the script to the cl_collation -> cl_collation_id backfill.
94 $this->
addOption(
'only-migrate-normalization',
'Only backfill cl_collation_id ' .
95 'field from cl_collation',
false );
96 $this->
addOption(
'remote',
'Use Shellbox to calculate the new sort keys ' .
98 $this->
addOption(
'dry-run',
'Don\'t actually change the collations, just ' .
99 'compile statistics.' );
// Enables the sort key size histogram.
100 $this->
addOption(
'verbose-stats',
'Show more statistics.' );
/**
 * Prepare the script state: fetch services, resolve the target collation
 * name, build the collation object and copy the command-line options into
 * properties.
 *
 * NOTE(review): several lines of this method (for example the start of the
 * object construction around the load balancer / cache / logger arguments,
 * the branch conditions and the closing braces) are not part of this excerpt.
 */
106 private function init() {
107 $services = $this->getServiceContainer();
108 $this->namespaceInfo = $services->getNamespaceInfo();
110 $this->getServiceContainer()->getDBLoadBalancer(),
111 $this->getServiceContainer()->getMainWANObjectCache(),
112 LoggerFactory::getInstance(
'SecondaryDataUpdate' ),
// An explicit 'target-collation' option takes precedence over the configured CategoryCollation.
118 if ( $this->hasOption(
'target-collation' ) ) {
119 $this->collationName = $this->getOption(
'target-collation' );
121 $this->collationName = $this->
getConfig()->get( MainConfigNames::CategoryCollation );
// The name handed to the collation factory gets a 'remote-' prefix in one branch.
// NOTE(review): the branch condition is not visible here; presumably the 'remote' option — confirm.
124 $realCollationName =
'remote-' . $this->collationName;
126 $realCollationName = $this->collationName;
128 $this->collation = $services->getCollationFactory()->makeCollation( $realCollationName );
// The result of this call is discarded. NOTE(review): presumably a smoke test so that a
// broken collation fails here, before any rows are touched — confirm.
132 $this->collation->getSortKey(
'MediaWiki' );
134 $this->force = $this->
getOption(
'force' );
135 $this->dryRun = $this->
getOption(
'dry-run' );
136 $this->verboseStats = $this->
getOption(
'verbose-stats' );
139 $this->targetTable = $this->
getOption(
'target-table' );
140 $this->normalization = $this->
getOption(
'only-migrate-normalization',
false );
// Fragment of the script's main routine; its signature line is not part of this
// excerpt (presumably execute() — confirm). It walks categorylinks in cl_from
// ranges and hands each batch to copyBatch() or updateBatch().
// Normalization-only mode: delegate to runNormalizationMigration().
147 if ( $this->normalization ) {
148 $this->runNormalizationMigration();
// With a target table, create it with the same structure as categorylinks if it does not exist yet.
152 if ( $this->targetTable ) {
153 if ( !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
154 $this->
output(
"Creating table {$this->targetTable}\n" );
156 'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
157 ' LIKE ' . $this->dbw->tableName(
'categorylinks' ),
// Collation filter, applied only when neither 'force' nor a target table is in effect:
// either rows with exactly the given previous collation, or rows whose collation
// differs from the target one.
163 $collationConds = [];
164 if ( !$this->force && !$this->targetTable ) {
165 if ( $this->
hasOption(
'previous-collation' ) ) {
166 $collationConds[
'collation_name'] = $this->
getOption(
'previous-collation' );
168 $collationConds[] = $this->dbr->expr(
'collation_name',
'!=', $this->collationName );
// The highest page_id is the upper bound of the batch loop (see the while condition below).
171 $maxPageId = (int)$this->dbr->newSelectQueryBuilder()
172 ->select(
'MAX(page_id)' )
174 ->caller( __METHOD__ )->fetchField();
// Each iteration covers the cl_from range [$batchValue, $batchValue + batch size).
177 $this->
output(
"Selecting next $batchSize pages from cl_from = $batchValue... " );
// On MySQL, cl_type is additionally selected in numeric form (cl_type+0).
181 if ( $this->dbw->getType() ===
'mysql' ) {
182 $clType =
'cl_type+0 AS "cl_type_numeric"';
186 $res = $this->dbw->newSelectQueryBuilder()
188 'cl_from',
'cl_target_id',
'cl_sortkey_prefix',
'cl_sortkey', $clType,
189 'cl_timestamp',
'collation_name',
'page_namespace',
'page_title'
191 ->from(
'categorylinks' )
192 ->join(
'collation',
null,
'cl_collation_id = collation_id' )
194 ->straightJoin(
'page',
null,
'cl_from = page_id' )
195 ->where( $collationConds )
197 $this->dbw->expr(
'cl_from',
'>=', $batchValue )
198 ->and(
'cl_from',
'<', $batchValue + $this->getBatchSize() )
200 ->orderBy(
'cl_from' )
201 ->caller( __METHOD__ )->fetchResultSet();
202 $this->
output(
"processing... " );
// Copy into the target table when one was requested, otherwise update in place.
205 if ( $this->targetTable ) {
206 $this->copyBatch( $res );
208 $this->updateBatch( $res );
// Per-batch progress line; the wording differs for dry runs.
213 if ( $this->dryRun ) {
214 $this->
output(
"{$this->numRowsProcessed} rows would be updated so far.\n" );
216 $this->
output(
"{$this->numRowsProcessed} done.\n" );
218 }
while ( $maxPageId >= $batchValue );
220 if ( !$this->dryRun ) {
221 $this->
output(
"{$this->numRowsProcessed} rows processed\n" );
// The histogram is only printed when verbose statistics were requested.
224 if ( $this->verboseStats ) {
226 $this->showSortKeySizeHistogram();
// Fragment of the in-place update path; its signature line is not part of this
// excerpt (presumably updateBatch( $res ) — confirm). For every row of the batch
// it recomputes the sort key and, unless this is a dry run, writes it back to
// categorylinks together with the prefix and the target collation id.
234 if ( !$this->dryRun ) {
235 $this->beginTransactionRound( __METHOD__ );
237 foreach ( $res as $row ) {
238 $title = Title::newFromRow( $row );
// Rows without a collation name take the old-style branch below.
239 if ( !$row->collation_name ) {
240 # This is an old-style row, so the sortkey needs to be
242 if ( $row->cl_sortkey === $title->getText()
243 || $row->cl_sortkey === $title->getPrefixedText()
247 # Custom sortkey, so use it as a prefix
248 $prefix = $row->cl_sortkey;
251 $prefix = $row->cl_sortkey_prefix;
253 # cl_type will be wrong for lots of pages if cl_collation is 0,
254 # so let's update it while we're here.
255 $type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
256 $newSortKey = $this->collation->getSortKey(
257 $title->getCategorySortkey( $prefix ) );
// The histogram records the full key length, i.e. before the truncation below.
258 $this->updateSortKeySizeHistogram( $newSortKey );
// Keep only the first 230 bytes. NOTE(review): presumably the size limit of the cl_sortkey column — confirm.
260 $newSortKey = substr( $newSortKey, 0, 230 );
262 if ( $this->dryRun ) {
// Dry run: the boolean comparison adds 1 only when the stored sort key would change.
265 $this->numRowsProcessed += ( $row->cl_sortkey !== $newSortKey );
267 $collationId = $this->collationNameStore->acquireId( $this->collationName );
268 $this->dbw->newUpdateQueryBuilder()
269 ->update(
'categorylinks' )
271 'cl_sortkey' => $newSortKey,
272 'cl_sortkey_prefix' => $prefix,
273 'cl_collation_id' => $collationId,
// Self-assignment of the timestamp. NOTE(review): presumably keeps the database from
// auto-updating cl_timestamp on this write — confirm.
275 'cl_timestamp = cl_timestamp',
277 ->where( [
'cl_from' => $row->cl_from,
'cl_target_id' => $row->cl_target_id ] )
278 ->caller( __METHOD__ )
280 $this->numRowsProcessed++;
283 if ( !$this->dryRun ) {
// Fragment of the copy path; its signature line is not part of this excerpt
// (presumably copyBatch( $res ) — confirm). It computes the sort keys of the
// whole batch with a single getSortKeys() call and then inserts the rebuilt rows
// into the target table.
// First pass: collect one sort key input per row.
293 foreach ( $res as $row ) {
294 $title = Title::newFromRow( $row );
295 $sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
297 $sortKeys = $this->collation->getSortKeys( $sortKeyInputs );
// Second pass: the iteration key $i of the result set is used as the index into $sortKeys.
299 foreach ( $res as $i => $row ) {
300 if ( !isset( $sortKeys[$i] ) ) {
301 throw new RuntimeException(
'Unable to get sort key' );
303 $newSortKey = $sortKeys[$i];
// The histogram records the full key length, i.e. before the truncation below.
304 $this->updateSortKeySizeHistogram( $newSortKey );
// Keep only the first 230 bytes. NOTE(review): presumably the size limit of the cl_sortkey column — confirm.
306 $newSortKey = substr( $newSortKey, 0, 230 );
307 $type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
308 $collationId = $this->collationNameStore->acquireId( $this->collationName );
310 'cl_from' => $row->cl_from,
311 'cl_target_id' => $row->cl_target_id,
312 'cl_sortkey' => $newSortKey,
313 'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
314 'cl_collation_id' => $collationId,
316 'cl_timestamp' => $row->cl_timestamp
// Dry run: count the rows that were prepared for insertion.
319 if ( $this->dryRun ) {
320 $this->numRowsProcessed += count( $rowsToInsert );
323 $this->dbw->newInsertQueryBuilder()
324 ->insertInto( $this->targetTable )
326 ->rows( $rowsToInsert )
327 ->caller( __METHOD__ )->execute();
// Real run: count what the database reports as affected by the insert.
328 $this->numRowsProcessed += $this->dbw->affectedRows();
/**
 * Count one sort key in $this->sizeHistogram, keyed by the key's length.
 *
 * NOTE(review): the body of the verboseStats guard is not part of this
 * excerpt; presumably it returns early so that nothing is recorded when
 * verbose statistics are off — confirm.
 *
 * @param string $key Sort key whose length is recorded
 */
336 private function updateSortKeySizeHistogram(
string $key ) {
337 if ( !$this->verboseStats ) {
// strlen() counts bytes, so the histogram is keyed by byte length.
340 $length = strlen( $key );
// First key of this length: start its counter at zero before incrementing.
341 if ( !isset( $this->sizeHistogram[$length] ) ) {
342 $this->sizeHistogram[$length] = 0;
344 $this->sizeHistogram[$length]++;
/**
 * Print the collected sort key size statistics: first the raw per-length
 * counts, then a coarse histogram of $numBins bins drawn with '*' bars.
 *
 * NOTE(review): several lines of this method are not part of this excerpt
 * (among them the initialisation of $numBins, $boundary, $raw and
 * $prevBoundary, and the bodies of the two guards at the top).
 */
350 private function showSortKeySizeHistogram() {
351 if ( !$this->sizeHistogram ) {
354 $maxLength = max( array_keys( $this->sizeHistogram ) );
355 if ( $maxLength === 0 ) {
359 $coarseHistogram = array_fill( 0, $numBins, 0 );
360 $coarseBoundaries = [];
// Bin upper boundaries: steps of $maxLength / $numBins, rounded. The last bin ends at
// $maxLength + 1 so that the longest key is included.
362 for ( $i = 0; $i < $numBins - 1; $i++ ) {
363 $boundary += $maxLength / $numBins;
364 $coarseBoundaries[$i] = round( $boundary );
366 $coarseBoundaries[$numBins - 1] = $maxLength + 1;
// Add the count of every length 0..$maxLength to a bin whose upper boundary lies above it.
368 for ( $i = 0; $i <= $maxLength; $i++ ) {
372 $val = $this->sizeHistogram[$i] ?? 0;
373 for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
375 if ( $coarseBoundaries[$coarseIndex] > $i ) {
376 $coarseHistogram[$coarseIndex] += $val;
// $coarseIndex equal to $numBins - 1 after the inner loop means the loop ran to completion,
// so the count goes into the last bin. NOTE(review): this relies on the inner loop being
// left early on a match; that statement is not visible in this excerpt — confirm.
380 if ( $coarseIndex === ( $numBins - 1 ) ) {
381 $coarseHistogram[$coarseIndex] += $val;
386 $this->
output(
"Sort key size histogram\nRaw data: $raw\n\n" );
388 $maxBinVal = max( $coarseHistogram );
// NOTE(review): the (int) cast truncates 60 / $maxBinVal, so $scale becomes 0 as soon as
// the fullest bin holds more than 60 keys, and str_repeat() below then prints empty bars
// for every bin. Suggested fix: keep $scale as a float here and cast the product instead,
// i.e. str_repeat( '*', (int)( $scale * $val ) ).
389 $scale = (int)( 60 / $maxBinVal );
391 for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
392 $val = $coarseHistogram[$coarseIndex] ?? 0;
394 $boundary = $coarseBoundaries[$coarseIndex];
// Row label is the inclusive length range "<previous boundary>-<boundary - 1>: ".
396 sprintf(
"%-10s %-10d |%s\n",
397 $prevBoundary .
'-' . ( $boundary - 1 ) .
': ',
399 str_repeat(
'*', $scale * $val )
402 $prevBoundary = $boundary;
/**
 * Backfill categorylinks.cl_collation_id from the cl_collation column,
 * working through the table in cl_from ranges.
 *
 * Three schema checks come first; each prints a message when the expected
 * column or table is missing. NOTE(review): the statements that follow those
 * messages (presumably early returns), the loop header and the batch
 * increment are not part of this excerpt — confirm.
 */
406 private function runNormalizationMigration() {
// No cl_collation column left: there is nothing to copy from.
407 if ( !$this->dbw->fieldExists(
'categorylinks',
'cl_collation', __METHOD__ ) ) {
408 $this->
output(
"The cl_collation column appears to already be normalized. Skipping.\n" );
411 if ( !$this->dbw->fieldExists(
'categorylinks',
'cl_collation_id', __METHOD__ ) ) {
412 $this->
output(
"The cl_collation_id column doesn't exist. Run update.php to create it.\n" );
415 if ( !$this->dbw->tableExists(
'collation', __METHOD__ ) ) {
416 $this->
output(
"The collation table doesn't exist. Run update.php to create it.\n" );
// The highest page_id is the upper bound of the batch loop (see the while condition below).
420 $maxPageId = (int)$this->dbr->newSelectQueryBuilder()
421 ->select(
'MAX(page_id)' )
423 ->caller( __METHOD__ )->fetchField();
428 $this->
output(
"Selecting next $batchSize pages from cl_from = $batchValue... " );
// Only rows in the current cl_from range whose cl_collation_id is still 0 are selected.
430 $res = $this->dbw->newSelectQueryBuilder()
431 ->select( [
'cl_collation' ] )
433 ->from(
'categorylinks' )
434 ->where( [
'cl_collation_id' => 0 ] )
436 $this->dbw->expr(
'cl_from',
'>=', $batchValue )
437 ->and(
'cl_from',
'<', $batchValue + $this->getBatchSize() )
439 ->caller( __METHOD__ )->fetchResultSet();
440 $this->
output(
"processing... " );
// In a dry run nothing is written and, in the visible code, the row counter is not advanced.
442 if ( $res->
numRows() && !$this->dryRun ) {
443 foreach ( $res as $row ) {
444 $collationName = $row->cl_collation;
445 $collationId = $this->collationNameStore->acquireId( $collationName );
// The update matches on the collation name within the same cl_from range, so a single
// statement can cover several rows at once.
446 $this->dbw->newUpdateQueryBuilder()
447 ->update(
'categorylinks' )
448 ->set( [
'cl_collation_id' => $collationId ] )
449 ->where( [
'cl_collation' => $collationName ] )
451 $this->dbw->expr(
'cl_from',
'>=', $batchValue )
452 ->and(
'cl_from',
'<', $batchValue + $this->getBatchSize() )
454 ->caller( __METHOD__ )->execute();
455 $this->numRowsProcessed += $this->dbw->affectedRows();
462 $this->
output(
"{$this->numRowsProcessed} done.\n" );
463 }
while ( $maxPageId >= $batchValue );
465 if ( !$this->dryRun ) {
466 $this->
output(
"{$this->numRowsProcessed} rows processed\n" );