MediaWiki  master
updateCollation.php
Go to the documentation of this file.
1 <?php
27 require_once __DIR__ . '/Maintenance.php';
28 
31 
39  private const BATCH_SIZE = 100; // Number of rows to process in one batch
40 
/**
 * Histogram of generated sort key sizes: maps a sort key length, as
 * measured by strlen(), to the number of sort keys seen with that length.
 * It is filled by updateSortKeySizeHistogram() and rendered by
 * showSortKeySizeHistogram().
 */
41  public $sizeHistogram = [];
42 
/**
 * Set the script description and register its command-line options.
 *
 * Options are declared in a table and registered in order; each entry is
 * the positional argument list passed to addOption().
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	$optionTable = [
		[
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.',
			false,
			false,
			'f',
		],
		[
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this ' .
				'script, especially if your categorylinks table is large. This will ' .
				'only update rows with that collation, though, so it may miss ' .
				'out-of-date rows with a different, even older collation.',
			false,
			true,
		],
		[
			'target-collation',
			'Set this to the new collation type to use instead of ' .
				'$wgCategoryCollation. Usually you should not use this, you should ' .
				'just update $wgCategoryCollation in LocalSettings.php.',
			false,
			true,
		],
		[
			'dry-run',
			"Don't actually change the collations, just compile statistics.",
		],
		[
			'verbose-stats',
			'Show more statistics.',
		],
	];

	foreach ( $optionTable as $optionArgs ) {
		$this->addOption( ...$optionArgs );
	}
}
69 
/**
 * Recompute sort keys for categorylinks rows, one batch at a time.
 *
 * Unless --force is given, only rows whose cl_collation differs from the
 * target collation (or equals --previous-collation, when supplied) are
 * selected. For every selected row the sort key is regenerated from the
 * page title and the sort key prefix, and cl_type is refreshed from the
 * page namespace. With --dry-run nothing is written; the script only counts
 * rows whose sort key would change. With --verbose-stats a histogram of the
 * generated sort key sizes is printed at the end.
 */
public function execute() {
	$services = MediaWikiServices::getInstance();
	$dbw = $this->getDB( DB_PRIMARY );
	$dbr = $this->getDB( DB_REPLICA );
	$force = $this->getOption( 'force' );
	$dryRun = $this->getOption( 'dry-run' );
	$verboseStats = $this->getOption( 'verbose-stats' );
	$hasPreviousCollation = $this->hasOption( 'previous-collation' );

	if ( $this->hasOption( 'target-collation' ) ) {
		$collationName = $this->getOption( 'target-collation' );
	} else {
		$collationName = $this->getConfig()->get( 'CategoryCollation' );
	}
	$collation = $services->getCollationFactory()->makeCollation( $collationName );

	// Collation sanity check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$collation->getFirstLetter( 'MediaWiki' );

	// On (at least some older versions of) MySQL, the query filesorts when
	// there is an equality condition (but not an inequality condition) on
	// cl_collation in the WHERE clause while cl_collation is also the first
	// item of the ORDER BY. So leave it out of the ordering in that case.
	if ( $hasPreviousCollation ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => self::BATCH_SIZE,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$force ) {
		if ( $hasPreviousCollation ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [ 0 =>
				'cl_collation != ' . $dbw->addQuotes( $collationName )
			];
		}

		$count = $dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			// Normalise the result to an integer so the comparison below
			// does not depend on loose comparison against a string or false.
			$count = (int)$dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count === 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
		$services->getDBLoadBalancerFactory()->waitForReplication();
	}

	// cl_type must be selected as a number for proper paging on MySQL,
	// because range comparisons on enum columns do not behave like string
	// comparisons there. This does not change between batches.
	if ( $dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}
	$namespaceInfo = $services->getNamespaceInfo();

	$count = 0;
	$batchConds = [];
	do {
		$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

		$res = $dbw->select(
			[ 'categorylinks', 'page' ],
			[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType,
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( !$dryRun ) {
			$this->beginTransaction( $dbw, __METHOD__ );
		}
		$lastRow = null;
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted. Compare strictly: with loose comparison,
				# numeric-looking strings such as "0123" and "123" would be
				# considered equal and a custom sortkey would be discarded.
				if ( $row->cl_sortkey === $title->getText()
					|| $row->cl_sortkey === $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $namespaceInfo->getCategoryLinkType( $title->getNamespace() );
			$newSortKey = $collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			if ( $verboseStats ) {
				$this->updateSortKeySizeHistogram( $newSortKey );
			}

			if ( $dryRun ) {
				// Count the row only if the sortkey would change. (Note that this
				// doesn't count changes in other fields, if any, those usually
				// only happen when upgrading old MediaWikis.)
				if ( $row->cl_sortkey !== $newSortKey ) {
					$count++;
				}
			} else {
				$dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $collationName,
						'cl_type' => $type,
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$count++;
			}
			$lastRow = $row;
		}
		// Only the last row of the batch determines where the next batch
		// starts, so build the paging condition once per batch.
		if ( $lastRow !== null ) {
			$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
		}
		if ( !$dryRun ) {
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		if ( $dryRun ) {
			$this->output( "$count rows would be updated so far.\n" );
		} else {
			$this->output( "$count done.\n" );
		}
		// A short batch means there is nothing left to select.
	} while ( $res->numRows() == self::BATCH_SIZE );

	if ( !$dryRun ) {
		$this->output( "$count rows processed\n" );
	}

	if ( $verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
235 
/**
 * Build an SQL expression matching the rows that sort after the given row.
 *
 * The ordering considered is cl_to, cl_type, cl_from, preceded by
 * cl_collation unless the 'previous-collation' option was given. The result
 * has the form
 *   f1 > v1 OR (f1 = v1 AND f2 > v2) OR (f1 = v1 AND f2 = v2 AND f3 > v3) ...
 *
 * @param object $row Result row exposing the sort fields as properties; on
 *   MySQL it must also carry cl_type_numeric
 * @param object $dbw Database handle providing getType() and addQuotes()
 * @return string SQL condition
 */
private function getBatchCondition( $row, $dbw ) {
	$sortFields = $this->hasOption( 'previous-collation' )
		? [ 'cl_to', 'cl_type', 'cl_from' ]
		: [ 'cl_collation', 'cl_to', 'cl_type', 'cl_from' ];

	// Equalities on all fields handled so far (the "tie" prefix).
	$tiedOn = [];
	// One alternative per field: tied on everything before it, greater on it.
	$alternatives = [];
	foreach ( $sortFields as $name ) {
		if ( $name === 'cl_type' && $dbw->getType() === 'mysql' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$literal = intval( $row->cl_type_numeric );
		} else {
			$literal = $dbw->addQuotes( $row->$name );
		}

		$greater = "$name > $literal";
		if ( $tiedOn === [] ) {
			$alternatives[] = $greater;
		} else {
			$alternatives[] = '(' . implode( ' AND ', $tiedOn ) . " AND $greater)";
		}
		$tiedOn[] = "$name = $literal";
	}

	return implode( ' OR ', $alternatives );
}
274 
/**
 * Record one sort key in the size histogram.
 *
 * @param string $key Sort key whose length in bytes (strlen) is counted
 */
private function updateSortKeySizeHistogram( $key ) {
	$size = strlen( $key );
	$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
}
282 
/**
 * Print the sort key size histogram collected in $this->sizeHistogram.
 *
 * Output consists of the raw per-length counts (lengths 0..max) followed by
 * a text bar chart that groups the lengths into 20 bins, scaled so the
 * fullest bin is 60 characters wide. Nothing is printed when no sizes were
 * recorded or when the largest recorded size is 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}

	// Exclusive upper boundary of each bin; the last bin always ends just
	// past the largest recorded length so that every length falls in a bin.
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$rawCounts = [];
	for ( $length = 0; $length <= $maxLength; $length++ ) {
		$val = $this->sizeHistogram[$length] ?? 0;
		$rawCounts[] = $val;
		// Add the count to the first bin whose upper boundary exceeds this
		// length; the final boundary guarantees that a bin is found.
		foreach ( $coarseBoundaries as $coarseIndex => $upper ) {
			if ( $upper > $length ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
	}
	$raw = implode( ', ', $rawCounts );

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	if ( $maxBinVal <= 0 ) {
		// Only zero counts were recorded; there is nothing to scale or draw.
		return;
	}
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// str_repeat() takes an int; cast explicitly because passing a
			// fractional float implicitly is deprecated since PHP 8.1.
			str_repeat( '*', (int)( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
333 }
334 
// Store this script's class name in $maintClass, then load the file named by
// RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably
// it reads $maintClass and runs the script when invoked directly — confirm.
335 $maintClass = UpdateCollation::class;
336 require_once RUN_MAINTENANCE_IF_MAIN;
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:193
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:329
$maintClass
$maintClass
Definition: updateCollation.php:335
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:59
$res
$res
Definition: testCompression.php:57
UpdateCollation\__construct
__construct()
Default constructor.
Definition: updateCollation.php:43
UpdateCollation\BATCH_SIZE
const BATCH_SIZE
Definition: updateCollation.php:39
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
$dbr
$dbr
Definition: testCompression.php:54
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1393
Maintenance\getConfig
getConfig()
Definition: Maintenance.php:598
UpdateCollation\getBatchCondition
getBatchCondition( $row, $dbw)
Return an SQL expression selecting rows which sort above the given row, assuming an ordering of cl_co...
Definition: updateCollation.php:243
Title\newFromRow
static newFromRow( $row)
Make a Title object from a DB row.
Definition: Title.php:580
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:249
$title
$title
Definition: testCompression.php:38
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
UpdateCollation\execute
execute()
Do the actual work.
Definition: updateCollation.php:70
UpdateCollation\updateSortKeySizeHistogram
updateSortKeySizeHistogram( $key)
Definition: updateCollation.php:275
DB_PRIMARY
const DB_PRIMARY
Definition: defines.php:27
UpdateCollation\showSortKeySizeHistogram
showSortKeySizeHistogram()
Definition: updateCollation.php:283
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1408
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1362
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:286
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:435
$wgCategoryCollation
$wgCategoryCollation
Specify how category names should be sorted, when listed on a category page.
Definition: DefaultSettings.php:8857
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option was set.
Definition: Maintenance.php:271
UpdateCollation\$sizeHistogram
$sizeHistogram
Definition: updateCollation.php:41
UpdateCollation
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
Definition: updateCollation.php:38
$type
$type
Definition: testCompression.php:52