MediaWiki  master
updateCollation.php
Go to the documentation of this file.
1 <?php
27 require_once __DIR__ . '/Maintenance.php';
28 
34 
/** @var int[] Number of sort keys seen so far, indexed by sort key length in bytes; only filled when verbose stats are on */
43  public $sizeHistogram = [];
44 
/** @var int Running total of rows updated or copied (on a dry run: rows that would be changed) */
46  private $numRowsProcessed = 0;
47 
/** @var bool Value of the --dry-run option: compile statistics without writing */
49  private $dryRun;
50 
/** @var bool Value of the --force option: process rows even if their collation looks current */
52  private $force;
53 
/** @var bool Value of the --verbose-stats option: collect and print the sort key size histogram */
55  private $verboseStats;
56 
/** @var Collation Collation object used to compute the new sort keys */
58  private $collation;
59 
/** @var string Collation name written to cl_collation (from --target-collation or the CategoryCollation setting) */
61  private $collationName;
62 
/** @var string|null Value of --target-table; when set, rows are copied there instead of updated in place */
64  private $targetTable;
65 
/** @var IDatabase Replica connection (DB_REPLICA), used for the row count estimate */
67  private $dbr;
68 
/** @var IMaintainableDatabase Primary connection (DB_PRIMARY), used for the batch selects and all writes */
70  private $dbw;
71 
/** @var LBFactory Assigned in init(); not read anywhere else in the visible code */
73  private $lbFactory;
74 
/** @var NamespaceInfo Used to derive cl_type from a page's namespace */
76  private $namespaceInfo;
77 
/**
 * Declare the script description, the default batch size and the
 * command-line options understood by this script.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );
	$this->setBatchSize( 100 );

	// Registered in this order; 'withArg' marks options that take a value,
	// 'short' is the one-letter alias (false for none).
	$optionSpecs = [
		'force' => [
			'help' => 'Run on all rows, even if the collation is supposed to be up-to-date.',
			'withArg' => false,
			'short' => 'f',
		],
		'previous-collation' => [
			'help' => 'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
				'especially if your categorylinks table is large. ' .
				'This will only update rows with that collation, though, ' .
				'so it may miss out-of-date rows with a different, even older collation.',
			'withArg' => true,
			'short' => false,
		],
		'target-collation' => [
			'help' => 'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
				'Usually you should not use this, ' .
				'you should just update $wgCategoryCollation in LocalSettings.php.',
			'withArg' => true,
			'short' => false,
		],
		'target-table' => [
			'help' => 'Copy rows from categorylinks into the specified table ' .
				'instead of updating them in place.',
			'withArg' => true,
			'short' => false,
		],
		'remote' => [
			'help' => 'Use Shellbox to calculate the new sort keys remotely.',
			'withArg' => false,
			'short' => false,
		],
		'dry-run' => [
			'help' => "Don't actually change the collations, just compile statistics.",
			'withArg' => false,
			'short' => false,
		],
		'verbose-stats' => [
			'help' => 'Show more statistics.',
			'withArg' => false,
			'short' => false,
		],
	];
	foreach ( $optionSpecs as $name => $spec ) {
		// None of the options is required, hence the constant third argument.
		$this->addOption( $name, $spec['help'], false, $spec['withArg'], $spec['short'] );
	}
}
109 
/**
 * Get services and initialise member variables.
 *
 * Also computes one sort key up front, so that a collation which can be
 * constructed but fails when used is caught before any row is touched.
 */
private function init() {
	$services = MediaWikiServices::getInstance();
	$this->namespaceInfo = $services->getNamespaceInfo();
	$this->lbFactory = $services->getDBLoadBalancerFactory();

	// An explicit --target-collation takes precedence over the configured collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( 'CategoryCollation' );

	// With --remote the factory is asked for the "remote-" variant of the
	// collation; cl_collation still receives the unprefixed name.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );

	$this->dbw = $this->getDB( DB_PRIMARY );
	$this->dbr = $this->getDB( DB_REPLICA );

	$this->targetTable = $this->getOption( 'target-table' );
}
141 
/**
 * Do the actual work: select the affected categorylinks rows batch by batch
 * and either rewrite them in place or, with --target-table, copy them into
 * the target table (which is created first if it does not exist).
 *
 * Unless --force or --target-table is used, the number of affected rows is
 * reported up front and the script returns early when there are none.
 * Batches are paged with a "sorts after the last row seen" condition, see
 * getBatchCondition().
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		$this->output( "Creating table {$this->targetTable}\n" );
		$this->dbw->query(
			'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
			' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
			__METHOD__
		);
	}

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	$orderBy = $this->hasOption( 'previous-collation' )
		? 'cl_to, cl_type, cl_from'
		: 'cl_collation, cl_to, cl_type, cl_from';
	$options = [
		'LIMIT' => $batchSize,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [
				0 => 'cl_collation != ' . $this->dbr->addQuotes( $this->collationName )
			];
		}

		$count = $this->dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $this->dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $this->dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The expression does not change between batches, so it
	// is built once here.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';
	$fields = [
		'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
		'cl_sortkey', $clType, 'cl_timestamp',
		'page_namespace', 'page_title'
	];

	$batchConds = [];
	do {
		$this->output( "Selecting next $batchSize rows..." );

		$res = $this->dbw->select(
			[ 'categorylinks', 'page' ],
			$fields,
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		$numRows = $res->numRows();
		if ( $numRows ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
			// Continue after the last row of this batch on the next iteration.
			$res->seek( $numRows - 1 );
			$lastRow = $res->fetchObject();
			$batchConds = [ $this->getBatchCondition( $lastRow, $this->dbw ) ];
		}

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
		// A full batch means there may be more rows. An empty batch always ends
		// the loop: without the "> 0" check a batch size of 0 would compare equal
		// to an empty result forever, since the paging condition never advances.
	} while ( $numRows > 0 && $numRows == $batchSize );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
259 
/**
 * Return an SQL expression selecting rows which sort above the given row,
 * assuming an ordering of cl_collation, cl_to, cl_type, cl_from
 * (cl_collation is left out when --previous-collation is in use).
 *
 * For fields a, b, c the result has the shape
 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C)
 *
 * @param object $row Last row of the previous batch; on mysql it must carry cl_type_numeric
 * @param IDatabase $dbw Connection used for quoting and for detecting the DB type
 * @return string
 */
private function getBatchCondition( $row, $dbw ) {
	$fields = $this->hasOption( 'previous-collation' )
		? [ 'cl_to', 'cl_type', 'cl_from' ]
		: [ 'cl_collation', 'cl_to', 'cl_type', 'cl_from' ];
	$isMysql = $dbw->getType() === 'mysql';

	// Equality tests for the fields already handled (the shared "prefix").
	$equalities = [];
	// One alternative per field, OR-ed together at the end.
	$alternatives = [];
	foreach ( $fields as $field ) {
		if ( $isMysql && $field === 'cl_type' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$encValue = intval( $row->cl_type_numeric );
		} else {
			$encValue = $dbw->addQuotes( $row->$field );
		}

		$greater = "$field > $encValue";
		$alternatives[] = $equalities
			? '(' . implode( ' AND ', $equalities ) . " AND $greater)"
			: $greater;
		$equalities[] = "$field = $encValue";
	}

	return implode( ' OR ', $alternatives );
}
298 
/**
 * Update a set of rows in the categorylinks table.
 *
 * For every row the sort key is recomputed with the target collation. Unless
 * this is a dry run, the row is rewritten (sort key, prefix, collation name
 * and type) inside one transaction per batch. On a dry run only the rows
 * whose sort key would change are counted.
 *
 * @param IResultWrapper $res Rows selected by execute(): categorylinks joined with page
 */
private function updateBatch( $res ) {
	if ( !$this->dryRun ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}
	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		if ( !$row->cl_collation ) {
			// This is an old-style row, so the sortkey needs to be converted.
			// Compare strictly: with == PHP compares numeric-looking strings as
			// numbers ("007" == "7"), which would mistake a custom sortkey for
			// the default one and silently drop it.
			if ( $row->cl_sortkey === $title->getText()
				|| $row->cl_sortkey === $title->getPrefixedText()
			) {
				$prefix = '';
			} else {
				// Custom sortkey, use it as a prefix
				$prefix = $row->cl_sortkey;
			}
		} else {
			$prefix = $row->cl_sortkey_prefix;
		}
		// cl_type will be wrong for lots of pages if cl_collation is 0,
		// so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
		$newSortKey = $this->collation->getSortKey(
			$title->getCategorySortkey( $prefix ) );
		// The histogram records the full, untruncated key length.
		$this->updateSortKeySizeHistogram( $newSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $newSortKey, 0, 230 );

		if ( $this->dryRun ) {
			// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
			// other fields, if any, those usually only happen when upgrading old MediaWikis.)
			$this->numRowsProcessed += (int)( $row->cl_sortkey !== $newSortKey );
		} else {
			$this->dbw->update(
				'categorylinks',
				[
					'cl_sortkey' => $newSortKey,
					'cl_sortkey_prefix' => $prefix,
					'cl_collation' => $this->collationName,
					'cl_type' => $type,
					// Assign the timestamp to itself so the update leaves it unchanged
					'cl_timestamp = cl_timestamp',
				],
				[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
				__METHOD__
			);
			$this->numRowsProcessed++;
		}
	}
	if ( !$this->dryRun ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
357 
/**
 * Copy a set of rows to the target table.
 *
 * The sort keys of the whole batch are requested from the collation in a
 * single getSortKeys() call; the rows are then inserted with IGNORE inside
 * one transaction. On a dry run the rows are only counted.
 *
 * @param IResultWrapper $res Rows selected by execute(): categorylinks joined with page
 * @throws MWException If the collation returned no sort key for one of the rows
 */
private function copyBatch( $res ) {
	// First pass: collect the sort key inputs, in result order.
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$computedKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair each row with its sort key by position.
	$newRows = [];
	foreach ( $res as $index => $row ) {
		if ( !isset( $computedKeys[$index] ) ) {
			throw new MWException( 'Unable to get sort key' );
		}
		$fullKey = $computedKeys[$index];
		// The histogram records the untruncated length.
		$this->updateSortKeySizeHistogram( $fullKey );
		$newRows[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $fullKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $newRows );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->insert( $this->targetTable, $newRows, __METHOD__, [ 'IGNORE' ] );
	// Count what was actually inserted rather than what was attempted.
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
399 
/**
 * Update the verbose statistics: count one more sort key of this byte length.
 * Does nothing unless verbose stats are enabled.
 *
 * @param string $key Sort key whose length is recorded
 */
private function updateSortKeySizeHistogram( $key ) {
	if ( $this->verboseStats ) {
		$bytes = strlen( $key );
		$this->sizeHistogram[$bytes] = ( $this->sizeHistogram[$bytes] ?? 0 ) + 1;
	}
}
415 
/**
 * Show the verbose statistics.
 *
 * Prints the raw per-length counts followed by a 20-bin bar chart in which
 * the fullest bin is drawn 60 characters wide. Prints nothing when no sort
 * key was recorded or all recorded keys were empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}

	// Exclusive upper bounds of the coarse bins: evenly spaced, with the
	// last one placed just past the longest key so that every length fits.
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		// round() returns a float; keep the boundaries integral
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Add to the first bin whose upper bound exceeds this length. The
		// last bound is $maxLength + 1, so a bin is always found.
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one key was counted, so the fullest bin is never empty.
	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex];
		$boundary = $coarseBoundaries[$coarseIndex];
		// str_repeat() takes an int; truncate explicitly, since passing a
		// fractional float implicitly is deprecated as of PHP 8.1.
		$barLength = (int)( $scale * $val );
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			str_repeat( '*', $barLength ) ) );
		$prevBoundary = $boundary;
	}
}
469 }
470 
// Name the class that implements this script, then include the file named by
// RUN_MAINTENANCE_IF_MAIN. That constant is defined outside this file
// (presumably via Maintenance.php); the included file is expected to run
// $maintClass - TODO confirm against Maintenance.php.
471 $maintClass = UpdateCollation::class;
472 require_once RUN_MAINTENANCE_IF_MAIN;
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:203
UpdateCollation\$dbr
IDatabase $dbr
Definition: updateCollation.php:67
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:329
UpdateCollation\$force
bool $force
Definition: updateCollation.php:52
UpdateCollation\init
init()
Get services and initialise member variables.
Definition: updateCollation.php:113
UpdateCollation\copyBatch
copyBatch( $res)
Copy a set of rows to the target table.
Definition: updateCollation.php:363
$maintClass
$maintClass
Definition: updateCollation.php:471
UpdateCollation\$verboseStats
bool $verboseStats
Definition: updateCollation.php:55
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:59
$res
$res
Definition: testCompression.php:57
UpdateCollation\__construct
__construct()
Default constructor.
Definition: updateCollation.php:78
Collation
Definition: Collation.php:30
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
UpdateCollation\updateBatch
updateBatch( $res)
Update a set of rows in the categorylinks table.
Definition: updateCollation.php:304
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1406
MWException
MediaWiki exception.
Definition: MWException.php:29
Maintenance\getConfig
getConfig()
Definition: Maintenance.php:598
Wikimedia\Rdbms\IResultWrapper
Result wrapper for grabbing data queried from an IDatabase object.
Definition: IResultWrapper.php:26
UpdateCollation\getBatchCondition
getBatchCondition( $row, $dbw)
Return an SQL expression selecting rows which sort above the given row, assuming an ordering of cl_collation, cl_to, cl_type, cl_from.
Definition: updateCollation.php:267
Title\newFromRow
static newFromRow( $row)
Make a Title object from a DB row.
Definition: Title.php:577
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:249
UpdateCollation\$dryRun
bool $dryRun
Definition: updateCollation.php:49
$title
$title
Definition: testCompression.php:38
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
UpdateCollation\execute
execute()
Do the actual work.
Definition: updateCollation.php:142
UpdateCollation\updateSortKeySizeHistogram
updateSortKeySizeHistogram( $key)
Update the verbose statistics.
Definition: updateCollation.php:405
UpdateCollation\$collationName
string $collationName
Definition: updateCollation.php:61
DB_PRIMARY
const DB_PRIMARY
Definition: defines.php:27
UpdateCollation\showSortKeySizeHistogram
showSortKeySizeHistogram()
Show the verbose statistics.
Definition: updateCollation.php:419
UpdateCollation\$sizeHistogram
int[] $sizeHistogram
Definition: updateCollation.php:43
UpdateCollation\$collation
Collation $collation
Definition: updateCollation.php:58
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1421
UpdateCollation\$lbFactory
LBFactory $lbFactory
Definition: updateCollation.php:73
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1375
UpdateCollation\$dbw
IMaintainableDatabase $dbw
Definition: updateCollation.php:70
UpdateCollation\$targetTable
string|null $targetTable
Definition: updateCollation.php:64
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:286
Wikimedia\Rdbms\IDatabase\addQuotes
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
Wikimedia\Rdbms\LBFactory
An interface for generating database load balancers.
Definition: LBFactory.php:42
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:368
UpdateCollation\$numRowsProcessed
int $numRowsProcessed
Definition: updateCollation.php:46
Wikimedia\Rdbms\IDatabase\getType
getType()
Get the RDBMS type of the server (e.g. "mysql").
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:435
$wgCategoryCollation
$wgCategoryCollation
Specify how category names should be sorted, when listed on a category page.
Definition: DefaultSettings.php:8856
NamespaceInfo
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Definition: NamespaceInfo.php:35
UpdateCollation\$namespaceInfo
NamespaceInfo $namespaceInfo
Definition: updateCollation.php:76
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option was set.
Definition: Maintenance.php:271
Wikimedia\Rdbms\IMaintainableDatabase
Advanced database interface for IDatabase handles that include maintenance methods.
Definition: IMaintainableDatabase.php:38
Maintenance\setBatchSize
setBatchSize( $s=0)
Definition: Maintenance.php:375
UpdateCollation
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
Definition: updateCollation.php:41
$type
$type
Definition: testCompression.php:52