MediaWiki master
updateCollation.php
Go to the documentation of this file.
1<?php
13// @codeCoverageIgnoreStart
14require_once __DIR__ . '/Maintenance.php';
15// @codeCoverageIgnoreEnd
16
26
/** Histogram of generated sort keys: byte length of the key => number of keys with that length. Only filled when verbose stats are enabled. */
35 public $sizeHistogram = [];
36
/** Running row counter reported in the progress output (in dry-run mode of the in-place update: rows whose sort key would change). */
38 private $numRowsProcessed = 0;
39
/** Value of the 'force' option; when set, rows are not filtered by their current collation. */
41 private $force;
42
/** Value of the 'dry-run' option; when set, no rows are updated or inserted. */
44 private $dryRun;
45
/** Value of the 'verbose-stats' option; enables the sort key size histogram. */
47 private $verboseStats;
48
/** Collation object obtained from the collation factory in init(); used to compute sort keys. */
50 private $collation;
51
/** Name of the collation being migrated to ('target-collation' option, else the CategoryCollation setting). */
53 private $collationName;
54
/** Value of the 'target-table' option; when set, rows are copied into that table instead of updated in place. */
56 private $targetTable;
57
/** Whether 'only-migrate-normalization' was requested (only backfill cl_collation_id). */
58 private bool $normalization = false;
59
/** Replica database handle, set in init(). */
61 private $dbr;
62
/** Primary database handle, set in init(). */
64 private $dbw;
65
/** Namespace helper used to derive cl_type from the page namespace; set in init(). */
66 private NamespaceInfo $namespaceInfo;
/** Name table store for the 'collation' table (collation_id <=> collation_name); set in init(). */
67 private NameTableStore $collationNameStore;
68
/**
 * Register the script description, the default batch size and every
 * command-line option this script understands.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (collation_name is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );
	$this->setBatchSize( 100 );

	// Positional argument lists for addOption(), in registration order:
	// name, description[, required[, withArg[, shortName]]].
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.',
			false,
			false,
			'f'
		],
		[
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
				'especially if your categorylinks table is large. This will only update rows ' .
				'with that collation, though, so it may miss out-of-date rows with a different, ' .
				'even older collation.',
			false,
			true
		],
		[
			'target-collation',
			'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
				'Usually you should not use this, you should just update ' .
				'$wgCategoryCollation in LocalSettings.php.',
			false,
			true
		],
		[
			'target-table',
			'Copy rows from categorylinks into the specified table ' .
				'instead of updating them in place.',
			false,
			true
		],
		[
			'only-migrate-normalization',
			'Only backfill cl_collation_id field from cl_collation',
			false
		],
		[
			'remote',
			'Use Shellbox to calculate the new sort keys remotely.'
		],
		[
			'dry-run',
			"Don't actually change the collations, just compile statistics."
		],
		[
			'verbose-stats',
			'Show more statistics.'
		],
	];
	foreach ( $optionSpecs as $optionArgs ) {
		$this->addOption( ...$optionArgs );
	}
}
102
/**
 * Resolve services, the target collation, the database handles and the
 * option values into object state. Called at the start of execute().
 */
private function init() {
	$services = $this->getServiceContainer();

	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->collationNameStore = new NameTableStore(
		$services->getDBLoadBalancer(),
		$services->getMainWANObjectCache(),
		LoggerFactory::getInstance( 'SecondaryDataUpdate' ),
		'collation',
		'collation_id',
		'collation_name'
	);

	// An explicit --target-collation wins over the configured collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" flavour of the
	// collation; the stored collation name itself stays unprefixed.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );
	$this->targetTable = $this->getOption( 'target-table' );
	$this->normalization = $this->getOption( 'only-migrate-normalization', false );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getReplicaDB();
}
142
/**
 * Run the script.
 *
 * With --only-migrate-normalization only the cl_collation_id backfill is
 * run. Otherwise categorylinks rows are walked in cl_from windows of one
 * batch size each, from 0 up to the highest page_id, and every non-empty
 * window is either updated in place or copied into the target table.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->normalization ) {
		$this->runNormalizationMigration();
		return;
	}

	if ( $this->targetTable ) {
		$this->ensureTargetTableExists();
	}

	// Unless every row is to be processed, restrict the scan to rows whose
	// collation is the stated previous one, or simply not the target one.
	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['collation_name'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds[] = $this->dbr->expr( 'collation_name', '!=', $this->collationName );
		}
	}

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	// cl_type must be selected as a number for proper paging because
	// enums suck. The expression does not change between batches, so it
	// is computed once here.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_target_id', 'cl_sortkey_prefix', 'cl_sortkey', $clType,
				'cl_timestamp', 'collation_name', 'page_namespace', 'page_title'
			] )
			->from( 'categorylinks' )
			->join( 'collation', null, 'cl_collation_id = collation_id' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue += $batchSize;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}

/**
 * Create the target table as a structural copy of categorylinks if it does
 * not exist yet. In dry-run mode nothing is created; the pending creation
 * is only reported, since the dry-run copy path never writes to the table.
 */
private function ensureTargetTableExists(): void {
	if ( $this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		return;
	}
	if ( $this->dryRun ) {
		$this->output( "Would create table {$this->targetTable}\n" );
		return;
	}
	$this->output( "Creating table {$this->targetTable}\n" );
	$this->dbw->query(
		'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
		' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
		__METHOD__
	);
}
229
/**
 * Recompute the sort key of every row in the batch and, unless this is a
 * dry run, write it back to categorylinks together with the sort key
 * prefix, the target collation id and the link type.
 *
 * In dry-run mode only rows whose sort key would change are counted.
 *
 * @param IResultWrapper $res Rows selected by execute()
 */
private function updateBatch( IResultWrapper $res ) {
	if ( !$this->dryRun ) {
		$this->beginTransactionRound( __METHOD__ );
	}
	// The target collation id is the same for every row; it is looked up
	// lazily on the first row that is actually written.
	$collationId = null;
	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		if ( !$row->collation_name ) {
			# This is an old-style row, so the sortkey needs to be
			# converted.
			if ( $row->cl_sortkey === $title->getText()
				|| $row->cl_sortkey === $title->getPrefixedText()
			) {
				$prefix = '';
			} else {
				# Custom sortkey, so use it as a prefix
				$prefix = $row->cl_sortkey;
			}
		} else {
			$prefix = $row->cl_sortkey_prefix;
		}
		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
		$newSortKey = $this->collation->getSortKey(
			$title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated key length.
		$this->updateSortKeySizeHistogram( $newSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $newSortKey, 0, 230 );

		if ( $this->dryRun ) {
			// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
			// other fields, if any, those usually only happen when upgrading old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$collationId ??= $this->collationNameStore->acquireId( $this->collationName );
		$this->dbw->newUpdateQueryBuilder()
			->update( 'categorylinks' )
			->set( [
				'cl_sortkey' => $newSortKey,
				'cl_sortkey_prefix' => $prefix,
				'cl_collation_id' => $collationId,
				'cl_type' => $type,
				// Assign the column to itself so the stored timestamp is kept.
				'cl_timestamp = cl_timestamp',
			] )
			->where( [ 'cl_from' => $row->cl_from, 'cl_target_id' => $row->cl_target_id ] )
			->caller( __METHOD__ )
			->execute();
		$this->numRowsProcessed++;
	}
	if ( !$this->dryRun ) {
		$this->commitTransactionRound( __METHOD__ );
	}
}
287
/**
 * Compute the sort keys for a whole batch in one call and insert the
 * resulting rows into the target table.
 *
 * In dry-run mode the keys are still computed (so the histogram and the
 * sanity check work) but nothing is written: no rows are inserted and the
 * collation id is not acquired.
 *
 * @param IResultWrapper $res Rows selected by execute()
 * @throws RuntimeException If no sort key was returned for a row
 */
private function copyBatch( IResultWrapper $res ) {
	$sortKeyInputs = [];
	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		$sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $sortKeyInputs );

	// Same for every row; acquired at most once, and only when writing.
	$collationId = null;
	$rowsToInsert = [];
	foreach ( $res as $i => $row ) {
		if ( !isset( $sortKeys[$i] ) ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		$newSortKey = $sortKeys[$i];
		// The histogram records the untruncated key length.
		$this->updateSortKeySizeHistogram( $newSortKey );
		if ( $this->dryRun ) {
			continue;
		}
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $newSortKey, 0, 230 );
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
		$collationId ??= $this->collationNameStore->acquireId( $this->collationName );
		$rowsToInsert[] = [
			'cl_from' => $row->cl_from,
			'cl_target_id' => $row->cl_target_id,
			'cl_sortkey' => $newSortKey,
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation_id' => $collationId,
			'cl_type' => $type,
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		// Every selected row would have been offered for insertion.
		$this->numRowsProcessed += $res->numRows();
		return;
	}

	$this->beginTransactionRound( __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $rowsToInsert )
		->caller( __METHOD__ )->execute();
	// Count what the insert reports as affected rather than what was
	// offered: the insert is issued with ignore().
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransactionRound( __METHOD__ );
}
332
/**
 * Count one sort key in the size histogram, bucketed by its byte length.
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key to record
 */
private function updateSortKeySizeHistogram( string $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
346
/**
 * Print the sort key size histogram: first the raw per-length counts,
 * then a 20-bin text bar chart whose fullest bin is 60 characters wide.
 *
 * Prints nothing if no sizes were recorded or the longest key is empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	// Exclusive upper bound of each bin; the last bin ends just past the
	// longest recorded key so that it always catches the maximum.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Put the count into the first bin whose upper bound lies above $i.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The search above ran off its end: the length belongs to the last bin.
		if ( $coarseIndex === ( $numBins - 1 ) ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	// Keep the scale fractional. Truncating it to an integer made it 0 as
	// soon as any bin held more than 60 keys, which blanked every bar.
	$scale = $maxBinVal > 0 ? 60 / $maxBinVal : 0;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		// Non-empty bins always get at least one star so they stay visible
		// next to much fuller bins.
		$barLength = $val > 0 ? max( 1, (int)round( $scale * $val ) ) : 0;
		$this->output(
			sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', $barLength )
			)
		);
		$prevBoundary = $boundary;
	}
}
405
/**
 * Backfill categorylinks.cl_collation_id from the cl_collation name column.
 *
 * Walks cl_from in windows of one batch size each. For every distinct
 * cl_collation value still having cl_collation_id = 0 in the window, the
 * name's id is acquired and written to exactly those rows. Rows that
 * already carry an id are left alone. Nothing is written in dry-run mode.
 */
private function runNormalizationMigration() {
	if ( !$this->dbw->fieldExists( 'categorylinks', 'cl_collation', __METHOD__ ) ) {
		$this->output( "The cl_collation column appears to already be normalized. Skipping.\n" );
		return;
	}
	if ( !$this->dbw->fieldExists( 'categorylinks', 'cl_collation_id', __METHOD__ ) ) {
		$this->output( "The cl_collation_id column doesn't exist. Run update.php to create it.\n" );
		return;
	}
	if ( !$this->dbw->tableExists( 'collation', __METHOD__ ) ) {
		$this->output( "The collation table doesn't exist. Run update.php to create it.\n" );
		return;
	}

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();
	$batchValue = 0;
	$batchSize = $this->getBatchSize();

	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );
		$batchEnd = $batchValue + $batchSize;

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [ 'cl_collation' ] )
			->distinct()
			->from( 'categorylinks' )
			->where( [ 'cl_collation_id' => 0 ] )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchEnd )
			)
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() && !$this->dryRun ) {
			foreach ( $res as $row ) {
				$collationName = $row->cl_collation;
				$collationId = $this->collationNameStore->acquireId( $collationName );
				$this->dbw->newUpdateQueryBuilder()
					->update( 'categorylinks' )
					->set( [ 'cl_collation_id' => $collationId ] )
					// Same cl_collation_id = 0 filter as the SELECT above, so
					// rows in this window that already have an id are not
					// overwritten with the id of their old collation name.
					->where( [ 'cl_collation' => $collationName, 'cl_collation_id' => 0 ] )
					->andWhere(
						$this->dbw->expr( 'cl_from', '>=', $batchValue )
							->and( 'cl_from', '<', $batchEnd )
					)
					->caller( __METHOD__ )->execute();
				$this->numRowsProcessed += $this->dbw->affectedRows();
			}

			$this->waitForReplication();
		}
		$batchValue = $batchEnd;

		$this->output( "{$this->numRowsProcessed} done.\n" );
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}
}
469}
470
471// @codeCoverageIgnoreStart
472$maintClass = UpdateCollation::class;
473require_once RUN_MAINTENANCE_IF_MAIN;
474// @codeCoverageIgnoreEnd
const DB_PRIMARY
Definition defines.php:28
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
output( $out, $channel=null)
Throw some output to the user.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
waitForReplication()
Wait for replica DB servers to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
commitTransactionRound( $fname)
Commit a transactional batch of DB operations and wait for replica DB servers to catch up.
getReplicaDB(string|false $virtualDomain=false)
beginTransactionRound( $fname)
Start a transactional batch of DB operations.
addDescription( $text)
Set the description text.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
Represents a title within MediaWiki.
Definition Title.php:69
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Advanced database interface for IDatabase handles that include maintenance methods.
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.
numRows()
Get the number of rows in a result object.