MediaWiki REL1_40
updateCollation.php
Go to the documentation of this file.
1<?php
27require_once __DIR__ . '/Maintenance.php';
28
36
/** Histogram of sort key sizes: byte length (strlen) of a generated key => number of keys seen with that length. Only filled when verbose stats are enabled. */
45 public $sizeHistogram = [];
46
/** Running count of rows updated/copied (or, in dry-run mode, rows that would change). */
48 private $numRowsProcessed = 0;
49
/** Value of the 'dry-run' option; when truthy no rows are written. */
51 private $dryRun;
52
/** Value of the 'force' option; when truthy the collation filter is not applied. */
54 private $force;
55
/** Value of the 'verbose-stats' option; enables the sort key size histogram. */
57 private $verboseStats;
58
/** Collation object built by the collation factory in init(); used to compute sort keys. */
60 private $collation;
61
/** Collation name written to cl_collation: the 'target-collation' option, or the CategoryCollation config value. */
63 private $collationName;
64
/** Value of the 'target-table' option; when set, rows are copied into this table instead of updated in place. */
66 private $targetTable;
67
/** Database handle obtained with DB_REPLICA; used for the row count estimates. */
69 private $dbr;
70
/** Database handle obtained with DB_PRIMARY; used for batch selection and all writes. */
72 private $dbw;
73
/** Load balancer factory service fetched in init(). */
75 private $lbFactory;
76
/** NamespaceInfo service; used to derive cl_type from a page namespace. */
78 private $namespaceInfo;
79
/**
 * Register the script description, the default batch size (100) and every
 * command-line option this script accepts.
 */
public function __construct() {
	parent::__construct();

	$summary = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $summary );
	$this->setBatchSize( 100 );

	// The only option with a short alias (-f); it takes no argument.
	$this->addOption(
		'force',
		'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.',
		false,
		false,
		'f'
	);

	// Optional options that take an argument, registered in their original order.
	$optionsWithValue = [
		'previous-collation' => 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.',
		'target-collation' => 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
		'target-table' => 'Copy rows from categorylinks into the ' .
			'specified table instead of updating them in place.',
	];
	foreach ( $optionsWithValue as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}

	// Plain switches that rely on addOption()'s defaults.
	$switches = [
		'remote' => 'Use Shellbox to calculate the new sort keys ' .
			'remotely.',
		'dry-run' => 'Don\'t actually change the collations, just ' .
			'compile statistics.',
		'verbose-stats' => 'Show more statistics.',
	];
	foreach ( $switches as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText );
	}
}
111
/**
 * Fetch the services this script needs, build the collation object and
 * copy the command-line options onto the instance.
 *
 * Called once from execute() before any work is done.
 */
private function init() {
	$services = MediaWikiServices::getInstance();
	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->lbFactory = $services->getDBLoadBalancerFactory();

	// An explicit --target-collation wins over the configured collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" variant, while the
	// plain name is still what gets stored in cl_collation.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getDB( DB_REPLICA );

	$this->targetTable = $this->getOption( 'target-table' );
}
143
/**
 * Do the actual work: walk categorylinks in batches and either rewrite the
 * sort keys in place or copy the rows into the target table.
 *
 * Steps:
 *  - make sure the target table exists (only when --target-table is used);
 *  - unless --force or --target-table is given, count the out-of-date rows
 *    and stop early when there are none;
 *  - page through categorylinks joined with page, handing each batch to
 *    copyBatch() or updateBatch(), until a batch comes back short;
 *  - optionally print the sort key size histogram.
 *
 * In dry-run mode nothing is written, and that includes the creation of the
 * target table.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->targetTable
		&& !$this->dbw->tableExists( $this->targetTable, __METHOD__ )
	) {
		if ( $this->dryRun ) {
			// A dry run must not write anything, schema changes included.
			// copyBatch() never touches the table in dry-run mode, so it is
			// safe to leave it missing.
			$this->output( "Table {$this->targetTable} would be created.\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
				__METHOD__
			);
		}
	}

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => $batchSize,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [
				0 => 'cl_collation != ' . $this->dbr->addQuotes( $this->collationName )
			];
		}

		$count = $this->dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $this->dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $this->dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
	}

	// cl_type must be selected as a number for proper paging, because range
	// comparisons on MySQL enum columns do not behave like string comparisons.
	// The database type cannot change between batches, so decide this once.
	if ( $this->dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}

	$batchConds = [];
	do {
		$this->output( "Selecting next $batchSize rows..." );

		$res = $this->dbw->select(
			[ 'categorylinks', 'page' ],
			[
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
			// The last row of this batch is the paging cursor for the next one.
			$res->seek( $res->numRows() - 1 );
			$lastRow = $res->fetchObject();
			$batchConds = [ $this->getBatchCondition( $lastRow, $this->dbw ) ];
		}

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
		// A short (or empty) batch means the end of the table was reached.
	} while ( $res->numRows() == $batchSize );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
261
/**
 * Build the paging condition that selects the rows sorting strictly after
 * the given row, using the same columns execute() orders by.
 *
 * @param object $row Last row of the previous batch. On MySQL it must carry
 *   the cl_type_numeric alias selected by execute().
 * @param object $dbw Database handle providing getType() and buildComparison()
 *   (presumably an IDatabase - confirm against callers)
 * @return mixed Whatever $dbw->buildComparison() returns for the '>' operator
 */
private function getBatchCondition( $row, $dbw ) {
	// cl_collation only takes part in the ordering when no previous
	// collation was supplied.
	$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		array_unshift( $fields, 'cl_collation' );
	}

	$isMysql = $dbw->getType() === 'mysql';
	$conds = [];
	foreach ( $fields as $field ) {
		// Range conditions with enums are weird in mysql
		// This must be a numeric literal, or it won't work.
		$conds[$field] = ( $isMysql && $field === 'cl_type' )
			? intval( $row->cl_type_numeric )
			: $row->$field;
	}

	return $dbw->buildComparison( '>', $conds );
}
289
/**
 * Recompute the sort key of every row in the batch and write it back to
 * categorylinks, all inside one transaction.
 *
 * In dry-run mode nothing is written and no transaction is opened; the
 * processed-row counter then only counts rows whose sort key would change.
 *
 * @param iterable $res Rows selected by execute() (categorylinks joined with page)
 */
private function updateBatch( $res ) {
	$writing = !$this->dryRun;
	if ( $writing ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey == $title->getText()
			|| $row->cl_sortkey == $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, use it as a prefix
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$writing ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count
			// changes in other fields, if any, those usually only happen when upgrading
			// old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$newValues = [
			'cl_sortkey' => $newSortKey,
			'cl_sortkey_prefix' => $prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $type,
			'cl_timestamp = cl_timestamp',
		];
		$rowKey = [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ];
		$this->dbw->update( 'categorylinks', $newValues, $rowKey, __METHOD__ );
		$this->numRowsProcessed++;
	}

	if ( $writing ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
348
/**
 * Compute the new sort keys for a whole batch in a single getSortKeys()
 * call and insert the resulting rows into the target table.
 *
 * In dry-run mode the rows are built and counted but not inserted.
 *
 * @param iterable $res Rows selected by execute() (categorylinks joined with page)
 * @throws RuntimeException If the collation returned no sort key for a row
 */
private function copyBatch( $res ) {
	// First pass: gather the inputs so the collation can process them in bulk.
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair every row with the sort key at the same index.
	$rowsToInsert = [];
	foreach ( $res as $i => $row ) {
		$fullSortKey = $sortKeys[$i] ?? null;
		if ( $fullSortKey === null ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$storedSortKey = substr( $fullSortKey, 0, 230 );
		$linkType = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$rowsToInsert[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			'cl_sortkey' => $storedSortKey,
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $linkType,
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $rowsToInsert );

		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	// The insert is issued with the IGNORE option, so only rows actually
	// inserted are counted via affectedRows().
	$this->dbw->insert( $this->targetTable, $rowsToInsert, __METHOD__, [ 'IGNORE' ] );
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
390
/**
 * Record the byte length of one sort key in the size histogram.
 *
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key as returned by the collation, before truncation
 */
private function updateSortKeySizeHistogram( $key ) {
	if ( $this->verboseStats ) {
		$bytes = strlen( $key );
		$this->sizeHistogram[$bytes] = ( $this->sizeHistogram[$bytes] ?? 0 ) + 1;
	}
}
406
/**
 * Print the sort key size histogram gathered by updateSortKeySizeHistogram().
 *
 * Output consists of the raw per-length counts (lengths 0..max) followed by
 * a 20-bin text chart whose longest bar is 60 asterisks wide. Nothing is
 * printed when no keys were recorded or when the longest key has length 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$maxBarWidth = 60;
	$coarseHistogram = array_fill( 0, $numBins, 0 );

	// Exclusive upper bound of every bin; the last bin always ends just
	// past the longest key so nothing falls outside the chart.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper bound lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran to completion without a match: the length belongs
		// to the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	// Keep the scale fractional. Truncating it to an integer here made it 0
	// as soon as the fullest bin held more than 60 keys, which blanked every
	// bar. sizeHistogram is public, so also guard against an all-zero table.
	$scale = $maxBinVal > 0 ? $maxBarWidth / $maxBinVal : 0;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// Round per bar so the fullest bin is exactly $maxBarWidth wide.
			str_repeat( '*', (int)round( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
462}
463
// Name the maintenance class defined in this file, then include the file
// behind RUN_MAINTENANCE_IF_MAIN (presumably what runs the script when this
// file is the entry point - confirm in Maintenance.php).
464$maintClass = UpdateCollation::class;
465require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Represents a title within MediaWiki.
Definition Title.php:82
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:36
Advanced database interface for IDatabase handles that include maintenance methods.
getType()
Get the RDBMS type of the server (e.g. "mysql", "sqlite").
Result wrapper for grabbing data queried from an IDatabase object.
buildComparison(string $op, array $conds)
Build a condition comparing multiple values, for use with indexes that cover multiple fields,...
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28