MediaWiki REL1_39
updateCollation.php
Go to the documentation of this file.
1<?php
27require_once __DIR__ . '/Maintenance.php';
28
35
/**
 * @var int[] Map of sort key length in bytes (strlen) to the number of sort keys
 * of that length seen so far. Only filled in when verbose stats are enabled.
 */
44 public $sizeHistogram = [];
45
/** @var int Running total of rows counted by updateBatch() and copyBatch() */
47 private $numRowsProcessed = 0;
48
/** @var mixed Value of the 'dry-run' option; when truthy no rows are written */
50 private $dryRun;
51
/** @var mixed Value of the 'force' option; when truthy rows are not filtered by cl_collation */
53 private $force;
54
/** @var mixed Value of the 'verbose-stats' option; enables the sort key size histogram */
56 private $verboseStats;
57
/** @var object Collation returned by the collation factory; used for getSortKey()/getSortKeys() */
59 private $collation;
60
/**
 * @var string Collation name written to cl_collation: the 'target-collation'
 * option if given, otherwise the CategoryCollation config setting.
 */
62 private $collationName;
63
/** @var string|null Value of the 'target-table' option; when set, rows are copied there */
65 private $targetTable;
66
/** @var object Handle from getDB( DB_REPLICA ); used for the row count estimates */
68 private $dbr;
69
/** @var object Handle from getDB( DB_PRIMARY ); used for batch selects and all writes */
71 private $dbw;
72
/**
 * @var object Result of getDBLoadBalancerFactory(); assigned in init() and not
 * read anywhere else in the code visible here.
 */
74 private $lbFactory;
75
/** @var object Namespace info service; used for getCategoryLinkType() */
77 private $namespaceInfo;
78
/**
 * Declare the script description, the default batch size and every
 * command-line option this script understands.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	$this->setBatchSize( 100 );

	// Each entry is: [ name, description, takes an argument?, short name or false ].
	// None of the options is required.
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.',
			false,
			'f',
		],
		[
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this ' .
				'script, especially if your categorylinks table is large. This will ' .
				'only update rows with that collation, though, so it may miss ' .
				'out-of-date rows with a different, even older collation.',
			true,
			false,
		],
		[
			'target-collation',
			'Set this to the new collation type to use instead of ' .
				'$wgCategoryCollation. Usually you should not use this, you should ' .
				'just update $wgCategoryCollation in LocalSettings.php.',
			true,
			false,
		],
		[
			'target-table',
			'Copy rows from categorylinks into the specified table instead of ' .
				'updating them in place.',
			true,
			false,
		],
		[
			'remote',
			'Use Shellbox to calculate the new sort keys remotely.',
			false,
			false,
		],
		[
			'dry-run',
			"Don't actually change the collations, just compile statistics.",
			false,
			false,
		],
		[
			'verbose-stats',
			'Show more statistics.',
			false,
			false,
		],
	];
	foreach ( $optionSpecs as $spec ) {
		$this->addOption( $spec[0], $spec[1], false, $spec[2], $spec[3] );
	}
}
110
/**
 * Resolve services, the collation to use, the parsed option values and the
 * database handles. Called once at the start of execute().
 */
private function init() {
	$services = MediaWikiServices::getInstance();
	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->lbFactory = $services->getDBLoadBalancerFactory();

	// An explicit --target-collation wins over the configured category collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" variant of the collation,
	// while cl_collation still receives the plain name.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );
	$this->targetTable = $this->getOption( 'target-table' );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getDB( DB_REPLICA );
}
142
/**
 * Do the actual work.
 *
 * Pages through categorylinks joined to page in batches of the configured
 * batch size. Each batch is either rewritten in place (updateBatch) or copied
 * into the target table (copyBatch), and the next batch resumes after the last
 * row of the previous one. With --dry-run nothing is written, including the
 * target table itself; only counts and, with --verbose-stats, the sort key
 * size histogram are reported.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	// Create the target table on demand. A dry run never inserts into it, so it
	// must not leave a newly created table behind either.
	if ( $this->targetTable && !$this->dryRun
		&& !$this->dbw->tableExists( $this->targetTable, __METHOD__ )
	) {
		$this->output( "Creating table {$this->targetTable}\n" );
		$this->dbw->query(
			'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
			' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
			__METHOD__
		);
	}

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => $batchSize,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	// Only restrict by collation for an in-place, non-forced run; a forced run or
	// a copy to a target table goes over every row.
	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [
				0 => 'cl_collation != ' . $this->dbr->addQuotes( $this->collationName )
			];
		}

		$count = $this->dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $this->dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $this->dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
	}

	// On mysql, cl_type is additionally selected as a number: getBatchCondition()
	// needs a numeric value to build a working range condition on the enum column.
	// The database type cannot change between batches, so decide this once.
	if ( $this->dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}

	$batchConds = [];
	do {
		$this->output( "Selecting next $batchSize rows..." );

		$res = $this->dbw->select(
			[ 'categorylinks', 'page' ],
			[
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
			// Remember where this batch ended so the next select resumes after it.
			$res->seek( $res->numRows() - 1 );
			$lastRow = $res->fetchObject();
			$batchConds = [ $this->getBatchCondition( $lastRow, $this->dbw ) ];
		}

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
		// A short batch means there is nothing left to select.
	} while ( $res->numRows() == $batchSize );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
260
/**
 * Build the SQL condition that selects the rows sorting after $row in the
 * order used by execute().
 *
 * For ordered fields f1, f2, ... the result has the form
 * "f1 > v1 OR (f1 = v1 AND f2 > v2) OR (f1 = v1 AND f2 = v2 AND f3 > v3) ...".
 *
 * @param stdClass $row Last row of the previous batch
 * @param object $dbw Database handle; used for getType() and addQuotes()
 * @return string Raw SQL condition
 */
private function getBatchCondition( $row, $dbw ) {
	$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		// Without --previous-collation the query is ordered by cl_collation first.
		array_unshift( $fields, 'cl_collation' );
	}

	$equalities = [];
	$clauses = [];
	foreach ( $fields as $field ) {
		if ( $field === 'cl_type' && $dbw->getType() === 'mysql' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$encValue = (int)$row->cl_type_numeric;
		} else {
			$encValue = $dbw->addQuotes( $row->$field );
		}

		$inequality = "$field > $encValue";
		if ( $equalities ) {
			// All earlier fields are equal and this one is greater.
			$clauses[] = '(' . implode( ' AND ', $equalities ) . " AND $inequality)";
		} else {
			$clauses[] = $inequality;
		}
		$equalities[] = "$field = $encValue";
	}

	return implode( ' OR ', $clauses );
}
300
/**
 * Recompute the sort key of every row in a batch and write it back to
 * categorylinks, all within one transaction. In dry-run mode nothing is
 * written and only rows whose sort key would change are counted.
 *
 * @param iterable $res Rows selected by execute()
 */
private function updateBatch( $res ) {
	$writing = !$this->dryRun;
	if ( $writing ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey == $title->getText()
			|| $row->cl_sortkey == $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, use it as a prefix
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the length before truncation.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$writing ) {
			// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
			// other fields, if any, those usually only happen when upgrading old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$newValues = [
			'cl_sortkey' => $newSortKey,
			'cl_sortkey_prefix' => $prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $type,
			'cl_timestamp = cl_timestamp',
		];
		$rowKey = [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ];
		$this->dbw->update( 'categorylinks', $newValues, $rowKey, __METHOD__ );
		$this->numRowsProcessed++;
	}

	if ( $writing ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
359
/**
 * Compute the sort keys for a whole batch in one getSortKeys() call and insert
 * the resulting rows into the target table. In dry-run mode the rows are only
 * counted.
 *
 * @param iterable $res Rows selected by execute()
 * @throws MWException If no sort key came back for one of the rows
 */
private function copyBatch( $res ) {
	// First pass: gather the input strings so they can be collated together.
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair every row with its sort key, matched by position.
	$newRows = [];
	foreach ( $res as $index => $row ) {
		if ( !isset( $sortKeys[$index] ) ) {
			throw new MWException( 'Unable to get sort key' );
		}
		$fullSortKey = $sortKeys[$index];
		// The histogram records the length before truncation.
		$this->updateSortKeySizeHistogram( $fullSortKey );

		$newRows[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $fullSortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $newRows );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->insert( $this->targetTable, $newRows, __METHOD__, [ 'IGNORE' ] );
	// With IGNORE, count what the database reports as affected rather than
	// the number of rows submitted.
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
401
/**
 * Record the byte length of a sort key in the size histogram. Does nothing
 * unless verbose stats are enabled.
 *
 * @param string $key Sort key as returned by the collation, before truncation
 */
private function updateSortKeySizeHistogram( $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
417
/**
 * Print the collected sort key sizes, first as raw comma-separated counts per
 * length and then as a 20-bin text histogram.
 *
 * Bars are scaled so that the fullest bin is 60 characters wide. The scaling
 * is done per bar rather than with a single integer factor, because an
 * integer factor truncates to zero once a bin holds more than 60 keys and
 * every bar then prints empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$maxBarWidth = 60;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	// Exclusive upper bound of each bin; the last bin takes everything up to
	// and including the maximum length.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Add the count to the first bin whose upper bound lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran to completion, so the length belongs in the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// Never below 1, so the division below is safe even if every count is zero.
	$maxBinVal = max( 1, max( $coarseHistogram ) );
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		// Rounded up so that any non-empty bin shows at least one star.
		$barWidth = (int)ceil( $val * $maxBarWidth / $maxBinVal );
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			str_repeat( '*', $barWidth ) ) );
		$prevBoundary = $boundary;
	}
}
473}
474
// Store this script's class name in $maintClass, then include the file named by
// RUN_MAINTENANCE_IF_MAIN. NOTE(review): that constant is defined outside this
// file; presumably the included file reads $maintClass and runs the script, confirm.
475$maintClass = UpdateCollation::class;
476require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
MediaWiki exception.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
static newFromRow( $row)
Make a Title object from a DB row.
Definition Title.php:573
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:39
getType()
Get the RDBMS type of the server (e.g. "mysql", "sqlite").
Advanced database interface for IDatabase handles that include maintenance methods.
Result wrapper for grabbing data queried from an IDatabase object.
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28