MediaWiki  master
updateCollation.php
Go to the documentation of this file.
1 <?php
27 require_once __DIR__ . '/Maintenance.php';
28 
31 
39  private const BATCH_SIZE = 100; // Number of rows to process in one batch
40  private const SYNC_INTERVAL = 5; // Wait for replica DBs after this many batches
41 
42  public $sizeHistogram = [];
43 
/**
 * Register the script description and its command line options.
 *
 * Options: --force (-f), --previous-collation <name>,
 * --target-collation <name>, --dry-run and --verbose-stats.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( <<<'TEXT'
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as $wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
	);

	// Help texts are assembled up front so the option registrations below
	// read as a plain list.
	$forceHelp = 'Run on all rows, even if the collation is supposed to be ' .
		'up-to-date.';
	$previousCollationHelp = 'Set the previous value of $wgCategoryCollation here to ' .
		'speed up this script, especially if your categorylinks table is large. ' .
		'This will only update rows with that collation, though, so it may miss ' .
		'out-of-date rows with a different, even older collation.';
	$targetCollationHelp = 'Set this to the new collation type to use instead of ' .
		'$wgCategoryCollation. Usually you should not use this, you should just ' .
		'update $wgCategoryCollation in LocalSettings.php.';
	$dryRunHelp = "Don't actually change the collations, just compile statistics.";

	// Flag without argument, short name -f.
	$this->addOption( 'force', $forceHelp, false, false, 'f' );
	// The two collation options each take an argument.
	$this->addOption( 'previous-collation', $previousCollationHelp, false, true );
	$this->addOption( 'target-collation', $targetCollationHelp, false, true );
	// Plain flags.
	$this->addOption( 'dry-run', $dryRunHelp );
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
70 
/**
 * Recompute cl_sortkey and related fields for categorylinks rows.
 *
 * Unless --force is given, only rows whose cl_collation differs from the
 * target collation (or equals --previous-collation, when given) are
 * selected. Rows are read from the master joined against page, in batches
 * of self::BATCH_SIZE, and paged with getBatchCondition(). Without
 * --dry-run each batch is rewritten inside its own transaction; with
 * --dry-run only the rows whose sort key would change are counted.
 * With --verbose-stats a histogram of the new sort key sizes is printed
 * at the end.
 */
public function execute() {
	$dbw = $this->getDB( DB_MASTER );
	$dbr = $this->getDB( DB_REPLICA );
	$force = $this->getOption( 'force' );
	$dryRun = $this->getOption( 'dry-run' );
	$verboseStats = $this->getOption( 'verbose-stats' );
	if ( $this->hasOption( 'target-collation' ) ) {
		$collationName = $this->getOption( 'target-collation' );
		$collation = Collation::factory( $collationName );
	} else {
		$collationName = $this->getConfig()->get( 'CategoryCollation' );
		$collation = Collation::singleton();
	}

	// Collation sanity check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$collation->getFirstLetter( 'MediaWiki' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => self::BATCH_SIZE,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$force ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [ 0 =>
				'cl_collation != ' . $dbw->addQuotes( $collationName )
			];
		}

		$count = $dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		// Loose comparison on purpose: the count may come back from the
		// database as a numeric string.
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
		MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
	}

	// cl_type must be selected as a number for proper paging because
	// range comparisons on enums do not behave as expected in mysql.
	// The database type does not change between batches, so decide once.
	if ( $dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}
	// Looked up once instead of once per row.
	$namespaceInfo = MediaWikiServices::getInstance()->getNamespaceInfo();

	$count = 0;
	$batchConds = [];
	do {
		$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

		$res = $dbw->select(
			[ 'categorylinks', 'page' ],
			[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType,
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( !$dryRun ) {
			$this->beginTransaction( $dbw, __METHOD__ );
		}
		$lastRow = null;
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted.
				# Strict comparison: with == two different numeric-looking
				# strings (e.g. "1e3" and "1000") compare equal, which would
				# silently discard a custom sortkey.
				if ( $row->cl_sortkey === $title->getText()
					|| $row->cl_sortkey === $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $namespaceInfo->getCategoryLinkType( $title->getNamespace() );
			$newSortKey = $collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			if ( $verboseStats ) {
				$this->updateSortKeySizeHistogram( $newSortKey );
			}

			if ( $dryRun ) {
				// Count the row only if the sortkey was changed. (Note that this
				// doesn't count changes in other fields, if any, those usually
				// only happen when upgrading old MediaWikis.)
				if ( $row->cl_sortkey !== $newSortKey ) {
					$count++;
				}
			} else {
				$dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $collationName,
						'cl_type' => $type,
						// Self-assignment, so this update leaves cl_timestamp as it is.
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$count++;
			}
			$lastRow = $row;
		}
		// Only the last row of the batch matters for paging, so build the
		// condition once per batch rather than once per row.
		if ( $lastRow !== null ) {
			$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
		}
		if ( !$dryRun ) {
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		if ( $dryRun ) {
			$this->output( "$count rows would be updated so far.\n" );
		} else {
			$this->output( "$count done.\n" );
		}
		// A short batch means there is nothing left to select.
	} while ( $res->numRows() == self::BATCH_SIZE );

	if ( !$dryRun ) {
		$this->output( "$count rows processed\n" );
	}

	if ( $verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
237 
/**
 * Build the SQL condition matching every row that sorts after $row.
 *
 * The ordering is cl_to, cl_type, cl_from, preceded by cl_collation unless
 * --previous-collation was given. The result has the shape
 * "a > x OR (a = x AND b > y) OR (a = x AND b = y AND c > z) ...".
 *
 * @param object $row Row as selected in execute(); on mysql it must carry
 *  cl_type_numeric
 * @param object $dbw Database handle; only getType() and addQuotes() are used
 * @return string Unparenthesised SQL boolean expression
 */
private function getBatchCondition( $row, $dbw ) {
	$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		array_unshift( $fields, 'cl_collation' );
	}
	$isMysql = $dbw->getType() === 'mysql';

	// Equalities on the fields already handled, and the OR-ed alternatives.
	$matched = [];
	$alternatives = [];
	foreach ( $fields as $field ) {
		if ( $isMysql && $field === 'cl_type' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$encValue = (int)$row->cl_type_numeric;
		} else {
			$encValue = $dbw->addQuotes( $row->$field );
		}

		$greater = "$field > $encValue";
		if ( $matched ) {
			$alternatives[] = '(' . implode( ' AND ', $matched ) . " AND $greater)";
		} else {
			// The leading field needs no equality prefix and no parentheses.
			$alternatives[] = $greater;
		}
		$matched[] = "$field = $encValue";
	}

	return implode( ' OR ', $alternatives );
}
276 
/**
 * Count one sort key in the size histogram, keyed by its length in bytes.
 *
 * @param string $key Sort key to record
 */
private function updateSortKeySizeHistogram( $key ) {
	$bytes = strlen( $key );
	// A length seen for the first time starts from zero.
	$this->sizeHistogram[$bytes] = ( $this->sizeHistogram[$bytes] ?? 0 ) + 1;
}
284 
/**
 * Print the sort key size histogram collected in $this->sizeHistogram.
 *
 * Outputs the raw count for every length from 0 to the longest key seen,
 * then a bar chart in which the lengths are grouped into 20 bins and the
 * fullest bin is drawn 60 characters wide. Prints nothing when no sort key
 * was recorded or when every recorded key was empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		// Nothing was recorded (no rows were processed). max() on an empty
		// array is an error, so bail out before calling it.
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	// Exclusive upper bound of each bin; the last bin ends just past the
	// longest key.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Add the count to the first bin whose upper bound exceeds this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran to completion without a match: use the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one key was recorded, so the fullest bin is never empty.
	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// str_repeat() wants an integer; truncate explicitly rather
			// than passing a fractional float.
			str_repeat( '*', (int)( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
332 }
333 
334 $maintClass = UpdateCollation::class;
335 require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:38
UpdateCollation\SYNC_INTERVAL
const SYNC_INTERVAL
Definition: updateCollation.php:40
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:154
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:327
$maintClass
$maintClass
Definition: updateCollation.php:334
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:55
$res
$res
Definition: testCompression.php:57
UpdateCollation\__construct
__construct()
Default constructor.
Definition: updateCollation.php:44
UpdateCollation\BATCH_SIZE
const BATCH_SIZE
Definition: updateCollation.php:39
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
$dbr
$dbr
Definition: testCompression.php:54
Collation\singleton
static singleton()
Definition: Collation.php:37
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1397
Maintenance\getConfig
getConfig()
Definition: Maintenance.php:596
UpdateCollation\getBatchCondition
getBatchCondition( $row, $dbw)
Return an SQL expression selecting rows which sort above the given row, assuming an ordering of cl_collation, cl_to, cl_type, cl_from.
Definition: updateCollation.php:245
Title\newFromRow
static newFromRow( $row)
Make a Title object from a DB row.
Definition: Title.php:524
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:245
$title
$title
Definition: testCompression.php:38
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
DB_MASTER
const DB_MASTER
Definition: defines.php:26
UpdateCollation\execute
execute()
Do the actual work.
Definition: updateCollation.php:71
UpdateCollation\updateSortKeySizeHistogram
updateSortKeySizeHistogram( $key)
Definition: updateCollation.php:277
UpdateCollation\showSortKeySizeHistogram
showSortKeySizeHistogram()
Definition: updateCollation.php:285
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1412
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1366
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:281
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:434
$wgCategoryCollation
$wgCategoryCollation
Specify how category names should be sorted, when listed on a category page.
Definition: DefaultSettings.php:8108
Collation\factory
static factory( $collationName)
Definition: Collation.php:51
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option exists.
Definition: Maintenance.php:266
UpdateCollation\$sizeHistogram
$sizeHistogram
Definition: updateCollation.php:42
UpdateCollation
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
Definition: updateCollation.php:38
$type
$type
Definition: testCompression.php:52