MediaWiki master
updateCollation.php
Go to the documentation of this file.
1<?php
27// @codeCoverageIgnoreStart
28require_once __DIR__ . '/Maintenance.php';
29// @codeCoverageIgnoreEnd
30
40
/** Map of sort key length in bytes (strlen) => number of keys seen with that length; only filled when verbose stats are enabled. */
49 public $sizeHistogram = [];
50
/** Running count of rows updated or inserted so far (in dry-run mode: rows that would be changed). */
52 private $numRowsProcessed = 0;
53
/** Value of the --force option; when set, execute() applies no collation filter to the selected rows. */
55 private $force;
56
/** Value of the --dry-run option; when set, no rows are written. */
58 private $dryRun;
59
/** Value of the --verbose-stats option; enables the sort key size histogram. */
61 private $verboseStats;
62
/** Collation object created by the collation factory in init(); used to compute the new sort keys. */
64 private $collation;
65
/** Name of the target collation: --target-collation if given, otherwise the CategoryCollation config setting. */
67 private $collationName;
68
/** Value of the --target-table option; when set, rows are copied into this table instead of being updated in place. */
70 private $targetTable;
71
/** Set from the --only-migrate-normalization option; when true, execute() only runs runNormalizationMigration(). */
72 private bool $normalization = false;
73
/** Replica database handle (getReplicaDB()). */
75 private $dbr;
76
/** Primary database handle (getPrimaryDB()); used for the batch selects and for all writes. */
78 private $dbw;
79
/** NamespaceInfo service; used to derive cl_type from the page namespace. */
81 private $namespaceInfo;
82
/**
 * Set the script description, the default batch size and the
 * command-line options understood by this script.
 */
public function __construct() {
	parent::__construct();

	// Nowdoc: the text contains a literal "$wgCategoryCollation", so nothing has to be escaped.
	$this->addDescription( <<<'TEXT'
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as $wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
	);

	$this->setBatchSize( 100 );

	// Flag with a short alias (-f)
	$this->addOption(
		'force',
		'Run on all rows, even if the collation is supposed to be up-to-date.',
		false,
		false,
		'f'
	);

	// Options that take an argument
	$this->addOption(
		'previous-collation',
		'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
			'especially if your categorylinks table is large. ' .
			'This will only update rows with that collation, though, ' .
			'so it may miss out-of-date rows with a different, even older collation.',
		false,
		true
	);
	$this->addOption(
		'target-collation',
		'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
			'Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
		false,
		true
	);
	$this->addOption(
		'target-table',
		'Copy rows from categorylinks into the specified table instead of updating them in place.',
		false,
		true
	);

	// Plain flags
	$this->addOption(
		'only-migrate-normalization',
		'Only backfill cl_collation_id field from cl_collation',
		false
	);
	$this->addOption(
		'remote',
		'Use Shellbox to calculate the new sort keys remotely.'
	);
	$this->addOption(
		'dry-run',
		"Don't actually change the collations, just compile statistics."
	);
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
116
/**
 * Resolve the target collation and cache the command-line options,
 * services and database handles used by the rest of the script.
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();

	// --target-collation takes precedence over the configured category collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" prefixed collation;
	// the unprefixed name is still what gets stored in cl_collation.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Sanity check: constructing the collation can succeed even though
	// computing a sort key throws, which would break all category pages.
	// Fail here, before any row is touched.
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );

	$this->dbw = $this->getPrimaryDB();
	$this->dbr = $this->getReplicaDB();

	$this->targetTable = $this->getOption( 'target-table' );
	$this->normalization = $this->getOption( 'only-migrate-normalization', false );
}
148
/**
 * Recompute category sort keys, walking categorylinks in ranges of cl_from.
 *
 * With --only-migrate-normalization the work is delegated to
 * runNormalizationMigration(). Otherwise every range of page IDs is either
 * updated in place (updateBatch()) or copied into the --target-table
 * (copyBatch()). With --verbose-stats a sort key size histogram is printed
 * at the end.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->normalization ) {
		$this->runNormalizationMigration();
		return;
	}

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		if ( $this->dryRun ) {
			// A dry run must not alter the schema. copyBatch() never writes to
			// the target table in this mode, so the table is not needed.
			$this->output( "Would create table {$this->targetTable}\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
				__METHOD__
			);
		}
	}

	// Unless forced (or copying into a target table), restrict the selection
	// to rows whose stored collation is out of date.
	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds[] = $this->dbr->expr( 'cl_collation', '!=', $this->collationName );
		}
	}

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	// cl_type must be selected as a number for proper paging because
	// enums suck. The database type cannot change mid-run, so decide once.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			] )
			->from( 'categorylinks' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue += $batchSize;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
235
/**
 * Recompute the sort key of every row in the batch and write it back to
 * categorylinks inside a single transaction. In dry-run mode nothing is
 * written; only rows whose sort key would change are counted.
 *
 * @param IResultWrapper $res Rows selected by execute()
 */
private function updateBatch( IResultWrapper $res ) {
	$writeChanges = !$this->dryRun;
	if ( $writeChanges ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey === $title->getText()
			|| $row->cl_sortkey === $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, so use it as a prefix.
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so it is refreshed along with the sort key.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$storedSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$writeChanges ) {
			// Count the row only if the sortkey would change. (Changes in other
			// fields are not counted; those usually only happen when upgrading
			// old MediaWikis.)
			if ( $row->cl_sortkey !== $storedSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$this->dbw->newUpdateQueryBuilder()
			->update( 'categorylinks' )
			->set( [
				'cl_sortkey' => $storedSortKey,
				'cl_sortkey_prefix' => $prefix,
				'cl_collation' => $this->collationName,
				'cl_type' => $type,
				'cl_timestamp = cl_timestamp',
			] )
			->where( [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ] )
			->caller( __METHOD__ )
			->execute();
		$this->numRowsProcessed++;
	}

	if ( $writeChanges ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
292
/**
 * Compute the sort keys for the whole batch with one getSortKeys() call and
 * insert the resulting rows into the target table. In dry-run mode the rows
 * are only counted.
 *
 * @param IResultWrapper $res Rows selected by execute()
 * @throws RuntimeException If the collation returns no sort key for a row
 */
private function copyBatch( IResultWrapper $res ) {
	// First pass: gather the input of every row so the collation is called once.
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$computedKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair each row with the sort key at the same position.
	$newRows = [];
	foreach ( $res as $position => $row ) {
		$sortKey = $computedKeys[$position] ?? null;
		if ( $sortKey === null ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $sortKey );

		$newRows[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $sortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $newRows );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $newRows )
		->caller( __METHOD__ )->execute();
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
336
/**
 * Record the byte length of a sort key in the size histogram.
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key whose length is recorded
 */
private function updateSortKeySizeHistogram( string $key ) {
	if ( $this->verboseStats ) {
		$byteLength = strlen( $key );
		$this->sizeHistogram[$byteLength] = ( $this->sizeHistogram[$byteLength] ?? 0 ) + 1;
	}
}
350
/**
 * Print the collected sort key sizes: first the raw per-length counts, then
 * a 20-bin histogram whose bars are scaled so that the fullest bin is about
 * 60 characters wide.
 *
 * Prints nothing if no sizes were recorded or the longest key has length 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}

	// Upper (exclusive) boundary of each bin; the last bin ends just past the
	// longest key so that every length falls into some bin.
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$rawCounts = [];
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		$val = $this->sizeHistogram[$i] ?? 0;
		$rawCounts[] = $val;

		// First bin whose upper boundary lies above this length; lengths not
		// covered by any earlier bin go to the last one.
		$bin = $numBins - 1;
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$bin = $coarseIndex;
				break;
			}
		}
		$coarseHistogram[$bin] += $val;
	}
	$raw = implode( ', ', $rawCounts );

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	// Keep the scale fractional. Truncating 60 / $maxBinVal to an integer
	// yields 0 as soon as a bin holds more than 60 keys, which made every
	// bar empty. The guard avoids a division by zero if all counts are 0.
	$scale = $maxBinVal > 0 ? 60 / $maxBinVal : 0;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output(
			sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', (int)round( $scale * $val ) )
			)
		);
		$prevBoundary = $boundary;
	}
}
409
/**
 * Backfill categorylinks.cl_collation_id from cl_collation, one range of
 * cl_from values at a time.
 *
 * For every distinct collation name that still has rows with
 * cl_collation_id = 0 in the current range, an ID is acquired from the
 * name table store and written to those rows. In dry-run mode the ranges
 * are scanned but nothing is written.
 */
private function runNormalizationMigration() {
	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();
	$batchValue = 0;
	$batchSize = $this->getBatchSize();

	// Store constructed over table 'collation' with columns 'collation_id'
	// and 'collation_name'; acquireId() supplies the ID for a collation name.
	$collationNameStore = new NameTableStore(
		$this->getServiceContainer()->getDBLoadBalancer(),
		$this->getServiceContainer()->getMainWANObjectCache(),
		LoggerFactory::getInstance( 'SecondaryDataUpdate' ),
		'collation',
		'collation_id',
		'collation_name'
	);
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		// No ORDER BY here: ordering a SELECT DISTINCT by a column that is not
		// in the select list is rejected by PostgreSQL and by MySQL with
		// ONLY_FULL_GROUP_BY, and the order of the names is irrelevant.
		$res = $this->dbw->newSelectQueryBuilder()
			->select( [ 'cl_collation' ] )
			->distinct()
			->from( 'categorylinks' )
			->where( [ 'cl_collation_id' => 0 ] )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() && !$this->dryRun ) {
			foreach ( $res as $row ) {
				$collationName = $row->cl_collation;
				$collationId = $collationNameStore->acquireId( $collationName );
				// Touch only rows that still lack an ID, mirroring the SELECT
				// above, so the counter reflects rows actually backfilled.
				$this->dbw->newUpdateQueryBuilder()
					->update( 'categorylinks' )
					->set( [ 'cl_collation_id' => $collationId ] )
					->where( [
						'cl_collation' => $collationName,
						'cl_collation_id' => 0,
					] )
					->andWhere(
						$this->dbw->expr( 'cl_from', '>=', $batchValue )
							->and( 'cl_from', '<', $batchValue + $batchSize )
					)
					->caller( __METHOD__ )->execute();
				$this->numRowsProcessed += $this->dbw->affectedRows();
			}
		}
		$batchValue += $batchSize;

		$this->output( "{$this->numRowsProcessed} done.\n" );
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}
}
467}
468
469// @codeCoverageIgnoreStart
470$maintClass = UpdateCollation::class;
471require_once RUN_MAINTENANCE_IF_MAIN;
472// @codeCoverageIgnoreEnd
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
output( $out, $channel=null)
Throw some output to the user.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DB servers to catch up.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB handle.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Represents a title within MediaWiki.
Definition Title.php:78
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Interface to a relational database.
Definition IDatabase.php:45
Advanced database interface for IDatabase handles that include maintenance methods.
Result wrapper for grabbing data queried from an IDatabase object.
numRows()
Get the number of rows in a result object.