MediaWiki  master
updateCollation.php
Go to the documentation of this file.
1 <?php
27 require_once __DIR__ . '/Maintenance.php';
28 
35 
/** @var int[] Histogram of generated sort key sizes: byte length => number of keys */
public $sizeHistogram = [];

/** @var int Rows updated or copied so far (in dry-run mode: rows that would be) */
private $numRowsProcessed = 0;

/** @var bool|null Value of the --dry-run option */
private $dryRun;

/** @var bool|null Value of the --force option */
private $force;

/** @var bool|null Value of the --verbose-stats option */
private $verboseStats;

/** @var Collation Collation object built by getCollationFactory()->makeCollation() */
private $collation;

/** @var string Collation name written to cl_collation (--target-collation or the wiki config) */
private $collationName;

/** @var string|null Value of the --target-table option, if given */
private $targetTable;

/** @var IDatabase Replica handle, used for the row count estimates */
private $dbr;

/** @var IDatabase Primary handle, used for batch selects and all writes */
private $dbw;

/** @var object Factory returned by getDBLoadBalancerFactory() */
private $lbFactory;

/** @var object Service returned by getNamespaceInfo(), used to derive cl_type */
private $namespaceInfo;
78 
/**
 * Set the script description, the default batch size (100) and register
 * all command-line options.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );
	$this->setBatchSize( 100 );

	// One entry per option, in registration order. Each entry is the
	// positional argument list handed to addOption():
	// name, description[, required, withArg[, shortName]].
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is ' .
				'supposed to be up-to-date.',
			false,
			false,
			'f',
		],
		[
			'previous-collation',
			'Set the previous value of ' .
				'$wgCategoryCollation here to speed up this script, especially if your ' .
				'categorylinks table is large. This will only update rows with that ' .
				'collation, though, so it may miss out-of-date rows with a different, ' .
				'even older collation.',
			false,
			true,
		],
		[
			'target-collation',
			'Set this to the new collation type to ' .
				'use instead of $wgCategoryCollation. Usually you should not use this, ' .
				'you should just update $wgCategoryCollation in LocalSettings.php.',
			false,
			true,
		],
		[
			'target-table',
			'Copy rows from categorylinks into the ' .
				'specified table instead of updating them in place.',
			false,
			true,
		],
		[
			'remote',
			'Use Shellbox to calculate the new sort keys ' .
				'remotely.',
		],
		[
			'dry-run',
			"Don't actually change the collations, just " .
				'compile statistics.',
		],
		[
			'verbose-stats',
			'Show more statistics.',
		],
	];
	foreach ( $optionSpecs as $spec ) {
		$this->addOption( ...$spec );
	}
}
110 
/**
 * Populate the member variables used by the rest of the script: services,
 * the target collation, the option flags and the database handles.
 */
private function init() {
	$services = MediaWikiServices::getInstance();
	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->lbFactory = $services->getDBLoadBalancerFactory();

	// --target-collation overrides the configured category collation
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the collation is requested under a "remote-" prefixed
	// name; the name stored in $this->collationName stays unprefixed.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages.
	// Fail here, before any database handle is requested.
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getDB( DB_REPLICA );

	$this->targetTable = $this->getOption( 'target-table' );
}
142 
/**
 * Run the collation update.
 *
 * Rows of categorylinks (joined with page) are read from the primary
 * database in batches of the configured batch size, paging on the ORDER BY
 * columns. Each batch is either rewritten in place (updateBatch) or copied
 * into the --target-table (copyBatch). Progress is printed after every
 * batch, and the sort key size histogram is printed at the end when
 * --verbose-stats is set.
 *
 * In dry-run mode nothing is written: rows are only counted, and a missing
 * target table is reported instead of being created.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		if ( $this->dryRun ) {
			// A dry run only compiles statistics, so it must not run DDL
			// on the primary either. copyBatch() never touches the target
			// table in dry-run mode, so the table is not needed.
			$this->output( "Table {$this->targetTable} does not exist, it would be created.\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
				__METHOD__
			);
		}
	}

	$hasPreviousCollation = $this->hasOption( 'previous-collation' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	// The column list must stay in sync with getBatchCondition().
	if ( $hasPreviousCollation ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => $batchSize,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	// Restrict to out-of-date rows unless every row is wanted (--force or
	// copying into a target table).
	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $hasPreviousCollation ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [
				0 => 'cl_collation != ' . $this->dbr->addQuotes( $this->collationName )
			];
		}

		$count = $this->dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $this->dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $this->dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The database type cannot change between batches, so
	// this is decided once, outside the loop.
	if ( $this->dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}

	$batchConds = [];
	do {
		$this->output( "Selecting next $batchSize rows..." );

		$res = $this->dbw->select(
			[ 'categorylinks', 'page' ],
			[
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
			// The next batch starts strictly after the last row of this one
			$res->seek( $res->numRows() - 1 );
			$lastRow = $res->fetchObject();
			$batchConds = [ $this->getBatchCondition( $lastRow, $this->dbw ) ];
		}

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
		// A short batch means the result set is exhausted
	} while ( $res->numRows() == $batchSize );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
260 
/**
 * Build the SQL condition selecting the rows that sort strictly after
 * $row in the paging order used by execute().
 *
 * For paging fields f1..fn with row values v1..vn the result has the form
 *   f1 > v1 OR (f1 = v1 AND f2 > v2) OR (f1 = v1 AND f2 = v2 AND f3 > v3) ...
 *
 * @param object $row Last row of the previous batch; on MySQL it must
 *   carry the cl_type_numeric alias selected by execute()
 * @param IDatabase $dbw Handle used for quoting and database type detection
 * @return string Raw SQL condition
 */
private function getBatchCondition( $row, $dbw ) {
	$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		array_unshift( $fields, 'cl_collation' );
	}
	$isMysql = $dbw->getType() === 'mysql';

	// Equality tests for all fields handled so far
	$equalities = [];
	// One "sorts later" alternative per field
	$alternatives = [];
	foreach ( $fields as $field ) {
		if ( $isMysql && $field === 'cl_type' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$encValue = (int)$row->cl_type_numeric;
		} else {
			$encValue = $dbw->addQuotes( $row->$field );
		}

		$greater = "$field > $encValue";
		$alternatives[] = $equalities
			? '(' . implode( ' AND ', $equalities ) . " AND $greater)"
			: $greater;
		$equalities[] = "$field = $encValue";
	}

	return implode( ' OR ', $alternatives );
}
300 
/**
 * Recompute the sort key of every row in the batch and write it back to
 * categorylinks in place, inside one transaction per batch.
 *
 * In dry-run mode nothing is written; only rows whose sort key would
 * change are counted.
 *
 * @param iterable $res Batch of rows selected by execute()
 */
private function updateBatch( $res ) {
	$writing = !$this->dryRun;
	if ( $writing ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey == $title->getText()
			|| $row->cl_sortkey == $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, use it as a prefix
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated size
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$writing ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count
			// changes in other fields, if any, those usually only happen when upgrading
			// old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$newValues = [
			'cl_sortkey' => $newSortKey,
			'cl_sortkey_prefix' => $prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $type,
			'cl_timestamp = cl_timestamp',
		];
		$rowKey = [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ];
		$this->dbw->update( 'categorylinks', $newValues, $rowKey, __METHOD__ );
		$this->numRowsProcessed++;
	}

	if ( $writing ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
359 
/**
 * Compute new sort keys for the whole batch in one getSortKeys() call and
 * insert the resulting rows into the target table.
 *
 * In dry-run mode the rows are built and counted but not inserted.
 *
 * @param iterable $res Batch of rows selected by execute()
 * @throws MWException If no sort key came back for one of the rows
 */
private function copyBatch( $res ) {
	// First pass: collect the sort key inputs, in row order
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair every row with its sort key by position
	$newRows = [];
	foreach ( $res as $index => $row ) {
		$fullSortKey = $sortKeys[$index] ?? null;
		if ( $fullSortKey === null ) {
			throw new MWException( 'Unable to get sort key' );
		}
		// The histogram records the untruncated size
		$this->updateSortKeySizeHistogram( $fullSortKey );

		$newRows[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $fullSortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $newRows );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->insert( $this->targetTable, $newRows, __METHOD__, [ 'IGNORE' ] );
	// With IGNORE, count what the database reports as affected rather
	// than the number of rows submitted
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
401 
/**
 * Record the byte length of one sort key in the size histogram.
 * Does nothing unless --verbose-stats was given.
 *
 * @param string $key Sort key, before truncation
 */
private function updateSortKeySizeHistogram( $key ) {
	if ( $this->verboseStats ) {
		$bytes = strlen( $key );
		$this->sizeHistogram[$bytes] = ( $this->sizeHistogram[$bytes] ?? 0 ) + 1;
	}
}
417 
/**
 * Print the sort key size statistics gathered by
 * updateSortKeySizeHistogram().
 *
 * Output is the raw per-length counts (length 0 up to the largest length
 * seen) followed by a 20-bin bar chart whose longest bar is 60 characters.
 * Prints nothing when no sizes were recorded or the largest size is 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );

	// $coarseBoundaries[$i] is the exclusive upper bound of bin $i. Bounds
	// are stored as integers so they print and compare as whole numbers.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	// The last bin takes everything up to and including the maximum
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper bound is above this length
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran off the end without a match: use the last bin
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	// Guard the division: $sizeHistogram is public and could in principle
	// hold only zero counts.
	$scale = $maxBinVal > 0 ? 60 / $maxBinVal : 0;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		// str_repeat() takes an int count. Passing the fractional product
		// directly is deprecated since PHP 8.1, so truncate explicitly
		// (the same result the implicit conversion used to give).
		$barLength = (int)( $scale * $val );
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			str_repeat( '*', $barLength ) ) );
		$prevBoundary = $boundary;
	}
}
473 }
474 
475 $maintClass = UpdateCollation::class;
476 require_once RUN_MAINTENANCE_IF_MAIN;
MediaWiki exception.
Definition: MWException.php:29
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:66
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
static newFromRow( $row)
Make a Title object from a DB row.
Definition: Title.php:573
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:39
getType()
Get the RDBMS type of the server (e.g. "mysql").
Advanced database interface for IDatabase handles that include maintenance methods.
Result wrapper for grabbing data queried from an IDatabase object.
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28
$maintClass