MediaWiki master
updateCollation.php
Go to the documentation of this file.
1<?php
14// @codeCoverageIgnoreStart
15require_once __DIR__ . '/Maintenance.php';
16// @codeCoverageIgnoreEnd
17
28
/**
 * Histogram of computed sort key sizes: byte length of the (untruncated) sort
 * key => number of keys seen with that length. Only filled when verbose
 * statistics are enabled.
 */
37 public $sizeHistogram = [];
38
/** Running count of rows updated or inserted (in dry-run mode: rows that would be affected). */
40 private $numRowsProcessed = 0;
41
/** Value of the --force option: process rows even if their collation looks up-to-date. */
43 private $force;
44
/** Value of the --dry-run option: compile statistics without writing rows. */
46 private $dryRun;
47
/** Value of the --verbose-stats option: collect and print the sort key size histogram. */
49 private $verboseStats;
50
/**
 * Collation object built by the collation factory in init() for the target
 * collation (or its "remote-" variant when --remote is given); used to compute sort keys.
 */
52 private $collation;
53
/** Name of the target collation: --target-collation if given, else the CategoryCollation config value. */
55 private $collationName;
56
/** Value of --target-table; when set, rows are copied into this table instead of updated in place. */
58 private $targetTable;
59
/** Source table name from --table (defaults to 'categorylinks'). */
61 private $table;
62
/** Whether --only-migrate-normalization was requested (only backfill cl_collation_id). */
63 private bool $normalization = false;
64
/** Replica database connection, obtained from getReplicaDB() in init(). */
66 private $dbr;
67
/** Primary database connection, obtained from getDB( DB_PRIMARY ) in init(). */
69 private $dbw;
70
/** Used to derive cl_type from a page's namespace. */
71 private NamespaceInfo $namespaceInfo;
/** Name table store over the 'collation' table, mapping collation_name to collation_id. */
72 private NameTableStore $collationNameStore;
73
/**
 * Set up the script: description, default batch size (100) and all
 * supported command-line options.
 */
public function __construct() {
	parent::__construct();

	// Nowdoc: the text is literal, so "$wgCategoryCollation" needs no escaping.
	$description = <<<'TEXT'
This script will find all rows in the categorylinks table whose collation is
out-of-date (collation_name is not the same as $wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	$this->setBatchSize( 100 );

	// Flag with a short alias (-f).
	$this->addOption(
		'force',
		'Run on all rows, even if the collation is supposed to be up-to-date.',
		false,
		false,
		'f'
	);

	// Options that take a value.
	$this->addOption(
		'previous-collation',
		'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
			'especially if your categorylinks table is large. This will only update ' .
			'rows with that collation, though, so it may miss out-of-date rows with ' .
			'a different, even older collation.',
		false,
		true
	);
	$this->addOption(
		'target-collation',
		'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
			'Usually you should not use this, you should just update ' .
			'$wgCategoryCollation in LocalSettings.php.',
		false,
		true
	);
	$this->addOption(
		'table',
		'Table relative to which updates are generated. This table will be updated ' .
			'in place, unless --target-table is set. Defaults to categorylinks.',
		false,
		true
	);
	$this->addOption(
		'target-table',
		'Copy rows from table into the specified table instead of updating them in place.',
		false,
		true
	);

	// Plain flags.
	$this->addOption(
		'only-migrate-normalization',
		'Only backfill cl_collation_id field from cl_collation',
		false
	);
	$this->addOption( 'remote', 'Use Shellbox to calculate the new sort keys remotely.' );
	$this->addOption(
		'dry-run',
		"Don't actually change the collations, just compile statistics."
	);
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
110
/**
 * Resolve services, the target collation, database handles and option
 * values into instance state. Called once at the start of execute().
 *
 * The collation is exercised immediately so that a misconfigured collation
 * fails here rather than after rows have been rewritten.
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();
	// Store over the "collation" table, mapping collation_name <-> collation_id.
	$this->collationNameStore = new NameTableStore(
		$services->getDBLoadBalancer(),
		$services->getMainWANObjectCache(),
		LoggerFactory::getInstance( 'SecondaryDataUpdate' ),
		'collation',
		'collation_id',
		'collation_name'
	);

	// --target-collation overrides the configured category collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" variant of the
	// collation, but rows are still tagged with the plain collation name.
	$realCollationName = ( $this->hasOption( 'remote' ) ? 'remote-' : '' ) . $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $realCollationName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	// Flags carry no argument, so store real booleans rather than whatever
	// raw value getOption() yields (matters for the bool-typed $normalization).
	$this->force = $this->hasOption( 'force' );
	$this->dryRun = $this->hasOption( 'dry-run' );
	$this->verboseStats = $this->hasOption( 'verbose-stats' );
	$this->normalization = $this->hasOption( 'only-migrate-normalization' );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getReplicaDB();
	$this->table = $this->getOption( 'table', 'categorylinks' );
	$this->targetTable = $this->getOption( 'target-table' );
}
151
/**
 * Do the actual work.
 *
 * In --only-migrate-normalization mode this only delegates to
 * runNormalizationMigration(). Otherwise it walks the source table in
 * cl_from ranges of one batch size each, from 0 up to the highest page_id,
 * and either updates the rows in place or copies them into the target table.
 * With --verbose-stats the sort key size histogram is printed at the end.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->normalization ) {
		$this->runNormalizationMigration();
		return;
	}

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		if ( $this->dryRun ) {
			// A dry run must not alter the schema. copyBatch() never writes to
			// the target table in this mode, so the table is not needed.
			$this->output( "Would create table {$this->targetTable}\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( $this->table ),
				__METHOD__
			);
		}
	}

	// Restrict to out-of-date rows unless every row must be processed
	// (--force, or copying into a separate target table).
	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['collation_name'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds[] = $this->dbr->expr( 'collation_name', '!=', $this->collationName );
		}
	}

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	// On MySQL, select cl_type as a number rather than as its enum label.
	// The database type cannot change mid-run, so decide once.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		// Half-open cl_from range [batchValue, batchEnd) handled by this iteration.
		$batchEnd = $batchValue + $batchSize;
		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_target_id', 'cl_sortkey_prefix', 'cl_sortkey', $clType,
				'cl_timestamp', 'collation_name', 'page_namespace', 'page_title'
			] )
			->from( $this->table )
			->join( 'collation', null, 'cl_collation_id = collation_id' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchEnd )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue = $batchEnd;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
238
/**
 * Recompute the sort key of every row in a batch and write it back to the
 * source table in place, inside a single transaction round.
 *
 * In dry-run mode nothing is written; only rows whose sort key would change
 * are counted.
 *
 * @param IResultWrapper $res Rows as selected by execute(): category link
 *  fields plus collation_name, page_namespace and page_title
 */
private function updateBatch( IResultWrapper $res ) {
	$fname = __METHOD__;
	$writing = !$this->dryRun;

	if ( $writing ) {
		$this->beginTransactionRound( $fname );
	}

	foreach ( $res as $link ) {
		$pageTitle = Title::newFromRow( $link );

		if ( $link->collation_name ) {
			$prefix = $link->cl_sortkey_prefix;
		} elseif ( $link->cl_sortkey === $pageTitle->getText()
			|| $link->cl_sortkey === $pageTitle->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no custom prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, so use it as a prefix
			$prefix = $link->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$linkType = $this->namespaceInfo->getCategoryLinkType( $link->page_namespace );

		$fullKey = $this->collation->getSortKey( $pageTitle->getCategorySortkey( $prefix ) );
		// The histogram records the size before truncation.
		$this->updateSortKeySizeHistogram( $fullKey );
		// Truncate to 230 bytes to avoid DB error
		$storedKey = substr( $fullKey, 0, 230 );

		if ( !$writing ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count
			// changes in other fields, if any, those usually only happen when upgrading
			// old MediaWikis.)
			if ( $link->cl_sortkey !== $storedKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$collationId = $this->collationNameStore->acquireId( $this->collationName );
		$newValues = [
			'cl_sortkey' => $storedKey,
			'cl_sortkey_prefix' => $prefix,
			'cl_collation_id' => $collationId,
			'cl_type' => $linkType,
			// Explicitly assign the timestamp column to itself.
			'cl_timestamp = cl_timestamp',
		];
		$rowKey = [
			'cl_from' => $link->cl_from,
			'cl_target_id' => $link->cl_target_id,
		];
		$this->dbw->newUpdateQueryBuilder()
			->update( $this->table )
			->set( $newValues )
			->where( $rowKey )
			->caller( $fname )
			->execute();
		$this->numRowsProcessed++;
	}

	if ( $writing ) {
		$this->commitTransactionRound( $fname );
	}
}
296
/**
 * Recompute the sort keys of a batch in one getSortKeys() call and insert
 * the resulting rows into the target table, ignoring rows already present.
 *
 * In dry-run mode the sort keys are still computed (so the histogram is
 * filled), but nothing is written: no collation ID is acquired and no rows
 * are inserted; every row of the batch is counted as processed.
 *
 * @param IResultWrapper $res Rows as selected by execute(): category link
 *  fields plus page_namespace and page_title
 * @throws RuntimeException If the collation returns no sort key for a row
 */
private function copyBatch( IResultWrapper $res ) {
	$sortKeyInputs = [];
	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		$sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $sortKeyInputs );

	$rowsToInsert = [];
	$numRows = 0;
	// Acquired lazily, once per batch, and only when rows are really written:
	// acquiring an ID may itself write to the collation name table.
	$collationId = null;
	foreach ( $res as $i => $row ) {
		if ( !isset( $sortKeys[$i] ) ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		$newSortKey = $sortKeys[$i];
		// The histogram records the size before truncation.
		$this->updateSortKeySizeHistogram( $newSortKey );
		$numRows++;
		if ( $this->dryRun ) {
			continue;
		}

		$collationId ??= $this->collationNameStore->acquireId( $this->collationName );
		$rowsToInsert[] = [
			'cl_from' => $row->cl_from,
			'cl_target_id' => $row->cl_target_id,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $newSortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation_id' => $collationId,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += $numRows;
		return;
	}

	$this->beginTransactionRound( __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $rowsToInsert )
		->caller( __METHOD__ )->execute();
	// Rows skipped by IGNORE are not counted.
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransactionRound( __METHOD__ );
}
341
/**
 * Record the byte length of one sort key in the size histogram.
 * Does nothing unless verbose statistics are enabled.
 *
 * @param string $key Sort key whose length is to be counted
 */
private function updateSortKeySizeHistogram( string $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
355
/**
 * Print the sort key size histogram: first the raw count for every length
 * from 0 to the longest key, then a 20-bin bar chart whose longest bar is
 * about 60 characters wide.
 *
 * Prints nothing if no sizes were recorded or the longest recorded key is empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}
	$numBins = 20;
	$maxBarWidth = 60;

	// Exclusive upper bound of each bin. The last bin always ends just past
	// the longest key so that every length lands in some bin.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$rawValues = [];
	for ( $length = 0; $length <= $maxLength; $length++ ) {
		$val = $this->sizeHistogram[$length] ?? 0;
		$rawValues[] = $val;

		// First bin whose upper bound exceeds this length; otherwise the last bin.
		$bin = $numBins - 1;
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $length ) {
				$bin = $coarseIndex;
				break;
			}
		}
		$coarseHistogram[$bin] += $val;
	}
	$raw = implode( ', ', $rawValues );

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	// Keep the scale fractional: truncating it to an integer turned it into 0
	// (and thus every bar into an empty string) once a bin held more than
	// $maxBarWidth keys.
	$scale = $maxBinVal > 0 ? $maxBarWidth / $maxBinVal : 0;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output(
			sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', (int)round( $scale * $val ) )
			)
		);
		$prevBoundary = $boundary;
	}
}
412
/**
 * Backfill cl_collation_id from the legacy cl_collation column.
 *
 * Bails out with a message if the legacy column is gone, or if the new
 * column or the collation table does not exist yet. Otherwise walks the
 * source table in cl_from ranges of one batch size each: for every distinct
 * collation name found on rows with cl_collation_id = 0 in the range, the
 * rows of that range carrying the name are given the matching collation ID.
 * In dry-run mode the rows are only selected, never updated.
 */
private function runNormalizationMigration() {
	$fname = __METHOD__;
	$sourceTable = $this->table;

	if ( !$this->dbw->fieldExists( $sourceTable, 'cl_collation', $fname ) ) {
		$this->output( "The cl_collation column appears to already be normalized. Skipping.\n" );
		return;
	}
	if ( !$this->dbw->fieldExists( $sourceTable, 'cl_collation_id', $fname ) ) {
		$this->output( "The cl_collation_id column doesn't exist. Run update.php to create it.\n" );
		return;
	}
	if ( !$this->dbw->tableExists( 'collation', $fname ) ) {
		$this->output( "The collation table doesn't exist. Run update.php to create it.\n" );
		return;
	}

	$lastPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( $fname )->fetchField();
	$batchSize = $this->getBatchSize();
	$batchStart = 0;

	// Always runs at least one batch, then continues while the start of the
	// next range does not exceed the highest page ID.
	while ( true ) {
		$batchEnd = $batchStart + $batchSize;
		$this->output( "Selecting next $batchSize pages from cl_from = $batchStart... " );

		$pendingNames = $this->dbw->newSelectQueryBuilder()
			->select( [ 'cl_collation' ] )
			->distinct()
			->from( $sourceTable )
			->where( [ 'cl_collation_id' => 0 ] )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchStart )
					->and( 'cl_from', '<', $batchEnd )
			)
			->caller( $fname )->fetchResultSet();
		$this->output( "processing... " );

		if ( $pendingNames->numRows() && !$this->dryRun ) {
			foreach ( $pendingNames as $nameRow ) {
				$legacyName = $nameRow->cl_collation;
				$id = $this->collationNameStore->acquireId( $legacyName );
				$this->dbw->newUpdateQueryBuilder()
					->update( $sourceTable )
					->set( [ 'cl_collation_id' => $id ] )
					->where( [ 'cl_collation' => $legacyName ] )
					->andWhere(
						$this->dbw->expr( 'cl_from', '>=', $batchStart )
							->and( 'cl_from', '<', $batchEnd )
					)
					->caller( $fname )->execute();
				$this->numRowsProcessed += $this->dbw->affectedRows();
			}

			$this->waitForReplication();
		}

		$batchStart = $batchEnd;
		$this->output( "{$this->numRowsProcessed} done.\n" );

		if ( $lastPageId < $batchStart ) {
			break;
		}
	}

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}
}
476}
477
478// @codeCoverageIgnoreStart
// Name the class implementing this script, then include the maintenance runner
// (presumably it only runs the script when this file is the entry point - confirm).
479$maintClass = UpdateCollation::class;
480require_once RUN_MAINTENANCE_IF_MAIN;
481// @codeCoverageIgnoreEnd
const DB_PRIMARY
Definition defines.php:28
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
output( $out, $channel=null)
Throw some output to the user.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
waitForReplication()
Wait for replica DB servers to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
commitTransactionRound( $fname)
Commit a transactional batch of DB operations and wait for replica DB servers to catch up.
getReplicaDB(string|false $virtualDomain=false)
beginTransactionRound( $fname)
Start a transactional batch of DB operations.
addDescription( $text)
Set the description text.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Represents a title within MediaWiki.
Definition Title.php:69
Maintenance script that will find all rows in the configured source table (default: categorylinks) whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Advanced database interface for IDatabase handles that include maintenance methods.
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.
numRows()
Get the number of rows in a result object.