MediaWiki  master
updateCollation.php
Go to the documentation of this file.
1 <?php
27 require_once __DIR__ . '/Maintenance.php';
28 
36 
/**
 * Histogram of generated sort key sizes: maps a sort key's byte length
 * (strlen) to the number of keys seen with that length. Only populated
 * when verbose statistics are enabled.
 */
45  public $sizeHistogram = [];
46 
/**
 * Running total reported in progress output. In normal mode it counts rows
 * updated or inserted; in dry-run mode it counts rows that would change.
 */
48  private $numRowsProcessed = 0;
49 
/** Value of the 'dry-run' option: when set, no rows are written. */
51  private $dryRun;
52 
/** Value of the 'force' option: when set, rows are not filtered by collation. */
54  private $force;
55 
/** Value of the 'verbose-stats' option: enables the sort key size histogram. */
57  private $verboseStats;
58 
/**
 * Collation object obtained from the collation factory in init(); used to
 * compute sort keys via getSortKey() / getSortKeys().
 */
60  private $collation;
61 
/**
 * Name of the target collation: the 'target-collation' option if given,
 * otherwise the CategoryCollation configuration setting. Written to
 * cl_collation for every processed row.
 */
63  private $collationName;
64 
/**
 * Value of the 'target-table' option: when set, rows are copied into this
 * table instead of being updated in place.
 */
66  private $targetTable;
67 
/** Database handle obtained with DB_REPLICA; used for row count estimates. */
69  private $dbr;
70 
/** Database handle obtained with DB_PRIMARY; used for batch selects and all writes. */
72  private $dbw;
73 
/**
 * Load balancer factory taken from the service container in init().
 * NOTE(review): not referenced by any other code visible in this file.
 */
75  private $lbFactory;
76 
/** NamespaceInfo service; used to derive cl_type from a page's namespace. */
78  private $namespaceInfo;
79 
/**
 * Register the script description, the default batch size and all
 * command line options.
 */
public function __construct() {
	parent::__construct();

	// Nowdoc: no interpolation, so the variable name needs no escaping.
	$description = <<<'TEXT'
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as $wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	$this->setBatchSize( 100 );

	// Each entry is the exact argument list for addOption():
	// name, description[, required, withArg[, shortName]].
	// Registration order is kept because it is the order options are added in.
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.',
			false,
			false,
			'f',
		],
		[
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
				'especially if your categorylinks table is large. ' .
				'This will only update rows with that collation, though, ' .
				'so it may miss out-of-date rows with a different, even older collation.',
			false,
			true,
		],
		[
			'target-collation',
			'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
				'Usually you should not use this, ' .
				'you should just update $wgCategoryCollation in LocalSettings.php.',
			false,
			true,
		],
		[
			'target-table',
			'Copy rows from categorylinks into the specified table ' .
				'instead of updating them in place.',
			false,
			true,
		],
		[
			'remote',
			'Use Shellbox to calculate the new sort keys remotely.',
		],
		[
			'dry-run',
			"Don't actually change the collations, just compile statistics.",
		],
		[
			'verbose-stats',
			'Show more statistics.',
		],
	];
	foreach ( $optionSpecs as $spec ) {
		$this->addOption( ...$spec );
	}
}
111 
/**
 * Resolve services, the target collation, database handles and option
 * values into member variables. Called once at the start of execute().
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->lbFactory = $services->getDBLoadBalancerFactory();

	// An explicit --target-collation wins over the configured collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" variant, while
	// $this->collationName (what gets stored in cl_collation) stays unprefixed.
	$factoryCollationName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryCollationName );

	// Sanity check: constructing the collation can succeed even when
	// computing a sort key would throw (which would break all category
	// pages), so compute one now and let any exception surface early.
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );
	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getDB( DB_REPLICA );
	$this->targetTable = $this->getOption( 'target-table' );
}
143 
/**
 * Do the actual work: page through categorylinks in batches and either
 * update the rows in place or copy them into the target table.
 *
 * Steps:
 *  - initialise services and options;
 *  - create the target table if one was requested and it does not exist
 *    (skipped in dry-run mode so that a dry run never modifies the database);
 *  - unless --force or --target-table is used, count the out-of-date rows
 *    and exit early when there are none;
 *  - process batches ordered by the paging key until a short batch is seen;
 *  - optionally print the sort key size histogram.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		if ( $this->dryRun ) {
			// A dry run must not change the database. copyBatch() never
			// touches the target table in dry-run mode, so the table is
			// not needed for the statistics to be compiled.
			$this->output( "Would create table {$this->targetTable}\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
				__METHOD__
			);
		}
	}

	$hasPreviousCollation = $this->hasOption( 'previous-collation' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	// So cl_collation is left out of the ordering when it is pinned by
	// an equality condition; getBatchCondition() pages on the same fields.
	$orderBy = $hasPreviousCollation
		? 'cl_to, cl_type, cl_from'
		: 'cl_collation, cl_to, cl_type, cl_from';
	$options = [
		'LIMIT' => $batchSize,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $hasPreviousCollation ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [
				0 => 'cl_collation != ' . $this->dbr->addQuotes( $this->collationName )
			];
		}

		$count = $this->dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $this->dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		// Loose comparison on purpose: the count may come back from the
		// database layer as a numeric string.
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $this->dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The database type cannot change between batches, so
	// decide once instead of on every iteration.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';

	$batchConds = [];
	do {
		$this->output( "Selecting next $batchSize rows..." );

		$res = $this->dbw->select(
			[ 'categorylinks', 'page' ],
			[
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
			// Continue the next batch strictly after the last row of this one.
			$res->seek( $res->numRows() - 1 );
			$lastRow = $res->fetchObject();
			$batchConds = [ $this->getBatchCondition( $lastRow, $this->dbw ) ];
		}

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
		// A batch shorter than the limit means the table has been exhausted.
	} while ( $res->numRows() == $batchSize );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
261 
/**
 * Build the SQL condition that selects rows strictly after the given row
 * in the paging order used by execute().
 *
 * @param stdClass $row Last row of the previous batch
 * @param IDatabase $dbw Database handle used to build the comparison
 * @return string Condition as returned by buildComparison()
 */
private function getBatchCondition( $row, $dbw ) {
	// Must list the same fields, in the same order, as the ORDER BY in execute().
	$pagingFields = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		array_unshift( $pagingFields, 'cl_collation' );
	}

	$isMysql = $dbw->getType() === 'mysql';
	$lastValues = [];
	foreach ( $pagingFields as $name ) {
		// Range conditions with enums are weird in mysql: the value must be
		// a numeric literal, so use the numeric alias selected by execute().
		$lastValues[$name] = ( $isMysql && $name === 'cl_type' )
			? (int)$row->cl_type_numeric
			: $row->$name;
	}

	return $dbw->buildComparison( '>', $lastValues );
}
289 
/**
 * Recompute the sort key of every row in the batch and, unless this is a
 * dry run, write it back to categorylinks inside a single transaction.
 *
 * @param IResultWrapper $res Rows selected by execute()
 */
private function updateBatch( $res ) {
	$shouldWrite = !$this->dryRun;
	if ( $shouldWrite ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey == $title->getText()
			|| $row->cl_sortkey == $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, use it as a prefix
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated size.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$shouldWrite ) {
			// Count the row only if the sortkey was changed. (Note that this
			// doesn't count changes in other fields, if any, those usually
			// only happen when upgrading old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$this->dbw->update(
			'categorylinks',
			[
				'cl_sortkey' => $newSortKey,
				'cl_sortkey_prefix' => $prefix,
				'cl_collation' => $this->collationName,
				'cl_type' => $type,
				'cl_timestamp = cl_timestamp',
			],
			[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
			__METHOD__
		);
		$this->numRowsProcessed++;
	}

	if ( $shouldWrite ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
348 
/**
 * Compute sort keys for the whole batch in one getSortKeys() call and,
 * unless this is a dry run, insert the resulting rows into the target table.
 *
 * @param IResultWrapper $res Rows selected by execute()
 * @throws RuntimeException If the collation returned no key for a row
 */
private function copyBatch( $res ) {
	// First pass: collect the inputs so the keys can be computed in bulk.
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$computedKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: the result's row index lines up with the input index.
	$batch = [];
	foreach ( $res as $index => $row ) {
		$fullSortKey = $computedKeys[$index] ?? null;
		if ( $fullSortKey === null ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		// The histogram records the untruncated size.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		$batch[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $fullSortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( !$this->dryRun ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
		$this->dbw->insert( $this->targetTable, $batch, __METHOD__, [ 'IGNORE' ] );
		// With IGNORE, only rows actually inserted are counted.
		$this->numRowsProcessed += $this->dbw->affectedRows();
		$this->commitTransaction( $this->dbw, __METHOD__ );

		return;
	}

	$this->numRowsProcessed += count( $batch );
}
390 
/**
 * Record the byte length of a sort key in the size histogram.
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key (before truncation)
 */
private function updateSortKeySizeHistogram( $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
406 
/**
 * Print the sort key size histogram: first the raw per-length counts,
 * then a 20-bin text bar chart scaled so the fullest bin is 60 characters.
 *
 * Prints nothing if no sizes were recorded or every recorded key was empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}

	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );

	// Exclusive upper bound of each bin. The last bound is one past the
	// largest length so that every length falls into some bin.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		// round() returns a float; keep the boundaries as integers.
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$rawCounts = [];
	for ( $length = 0; $length <= $maxLength; $length++ ) {
		$val = $this->sizeHistogram[$length] ?? 0;
		$rawCounts[] = $val;
		// Add the count to the first bin whose upper bound exceeds the length.
		foreach ( $coarseBoundaries as $coarseIndex => $upperBound ) {
			if ( $upperBound > $length ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
	}
	$raw = implode( ', ', $rawCounts );

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// Cannot be zero here: the histogram is non-empty, every recorded count
	// is at least 1, and every recorded length was assigned to a bin above.
	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		// str_repeat() takes an int; passing a fractional float is
		// deprecated since PHP 8.1, so truncate explicitly.
		$barLength = (int)( $scale * $val );
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			str_repeat( '*', $barLength ) ) );
		$prevBoundary = $boundary;
	}
}
462 }
463 
464 $maintClass = UpdateCollation::class;
465 require_once RUN_MAINTENANCE_IF_MAIN;
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:66
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Represents a title within MediaWiki.
Definition: Title.php:76
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:36
Advanced database interface for IDatabase handles that include maintenance methods.
getType()
Get the RDBMS type of the server (e.g. "mysql", "sqlite").
Result wrapper for grabbing data queried from an IDatabase object.
buildComparison(string $op, array $conds)
Build a condition comparing multiple values, for use with indexes that cover multiple fields,...
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28
$maintClass