MediaWiki  master
updateCollation.php
Go to the documentation of this file.
1 <?php
27 require_once __DIR__ . '/Maintenance.php';
28 
31 
39  const BATCH_SIZE = 100; // Number of rows to process in one batch
40  const SYNC_INTERVAL = 5; // Wait for replica DBs after this many batches. NOTE(review): not referenced by any code visible in this file -- confirm whether it is still needed
41 
42  public $sizeHistogram = []; // Map of sort key length in bytes => number of sort keys seen with that length; filled by updateSortKeySizeHistogram()
43 
/**
 * Set the script description and register the command-line options.
 *
 * The description quotes the currently configured 'CategoryCollation'
 * value so the help text shows which collation counts as up-to-date.
 */
public function __construct() {
	parent::__construct();

	$currentCollation = $this->getConfig()->get( 'CategoryCollation' );
	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '{$currentCollation}') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix. If all collations are
up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	// Each entry is the argument list for one addOption() call:
	// name, description[, required, withArg[, shortName]].
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is ' .
				'supposed to be up-to-date.',
			false,
			false,
			'f',
		],
		[
			'previous-collation',
			'Set the previous value of ' .
				'$wgCategoryCollation here to speed up this script, especially if your ' .
				'categorylinks table is large. This will only update rows with that ' .
				'collation, though, so it may miss out-of-date rows with a different, ' .
				'even older collation.',
			false,
			true,
		],
		[
			'target-collation',
			'Set this to the new collation type to ' .
				'use instead of $wgCategoryCollation. Usually you should not use this, ' .
				'you should just update $wgCategoryCollation in LocalSettings.php.',
			false,
			true,
		],
		[
			'dry-run',
			'Don\'t actually change the collations, just ' .
				'compile statistics.',
		],
		[
			'verbose-stats',
			'Show more statistics.',
		],
	];
	foreach ( $optionSpecs as $optionArgs ) {
		$this->addOption( ...$optionArgs );
	}
}
71 
/**
 * Recompute the sort key of categorylinks rows, BATCH_SIZE rows at a time.
 *
 * Unless --force is given, only rows whose cl_collation differs from the
 * target collation (or equals --previous-collation, when that option is
 * set) are selected, and a row count is printed up front; the script
 * returns early when that count is zero.
 *
 * For every selected row the sort key prefix is determined, the sort key
 * is recomputed with the target collation, and cl_sortkey,
 * cl_sortkey_prefix, cl_collation and cl_type are rewritten. With
 * --dry-run nothing is written; rows whose sort key would change are
 * counted instead. With --verbose-stats a histogram of the new sort key
 * lengths is printed at the end.
 */
public function execute() {
	$dbw = $this->getDB( DB_MASTER );
	$dbr = $this->getDB( DB_REPLICA );
	$force = $this->getOption( 'force' );
	$dryRun = $this->getOption( 'dry-run' );
	$verboseStats = $this->getOption( 'verbose-stats' );
	if ( $this->hasOption( 'target-collation' ) ) {
		$collationName = $this->getOption( 'target-collation' );
		$collation = Collation::factory( $collationName );
	} else {
		$collationName = $this->getConfig()->get( 'CategoryCollation' );
		$collation = Collation::singleton();
	}

	// Collation sanity check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$collation->getFirstLetter( 'MediaWiki' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => self::BATCH_SIZE,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$force ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [ 0 =>
				'cl_collation != ' . $dbw->addQuotes( $collationName )
			];
		}

		$count = $dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
		wfWaitForSlaves();
	}

	// The values below do not change between batches, so compute them once.

	// cl_type must be selected as a number for proper paging on MySQL,
	// because range comparisons on enum columns do not behave like
	// comparisons on their string values there.
	if ( $dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}
	$fields = [
		'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
		'cl_sortkey', $clType,
		'page_namespace', 'page_title'
	];
	$namespaceInfo = MediaWikiServices::getInstance()->getNamespaceInfo();

	$count = 0;
	$batchConds = [];
	do {
		$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

		$res = $dbw->select(
			[ 'categorylinks', 'page' ],
			$fields,
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( !$dryRun ) {
			$this->beginTransaction( $dbw, __METHOD__ );
		}
		// Last row handled in this batch; the next batch resumes after it.
		$lastRow = null;
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted.
				if ( $row->cl_sortkey == $title->getText()
					|| $row->cl_sortkey == $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $namespaceInfo->getCategoryLinkType( $title->getNamespace() );
			$newSortKey = $collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			if ( $verboseStats ) {
				$this->updateSortKeySizeHistogram( $newSortKey );
			}

			if ( $dryRun ) {
				// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
				// other fields, if any, those usually only happen when upgrading old MediaWikis.)
				$count += (int)( $row->cl_sortkey !== $newSortKey );
			} else {
				$dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $collationName,
						'cl_type' => $type,
						// Assign the column to itself so the stored timestamp is kept.
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$count++;
			}
			$lastRow = $row;
		}
		// Only the final row of the batch determines where the next batch
		// starts, so build the paging condition once instead of per row.
		// An empty batch leaves the previous condition untouched.
		if ( $lastRow !== null ) {
			$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
		}
		if ( !$dryRun ) {
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		if ( $dryRun ) {
			$this->output( "$count rows would be updated so far.\n" );
		} else {
			$this->output( "$count done.\n" );
		}
		// A short batch means the result set is exhausted.
	} while ( $res->numRows() == self::BATCH_SIZE );

	if ( !$dryRun ) {
		$this->output( "$count rows processed\n" );
	}

	if ( $verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
238 
/**
 * Build an SQL expression selecting the rows that sort after the given
 * row, for the same column ordering that execute() uses in its ORDER BY:
 * (cl_collation,) cl_to, cl_type, cl_from. cl_collation is left out when
 * the 'previous-collation' option is set.
 *
 * The result has the form
 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C) ...
 *
 * @param object $row Result row; on MySQL it must carry cl_type_numeric
 * @param mixed $dbw Database handle providing getType() and addQuotes()
 * @return string SQL condition
 */
private function getBatchCondition( $row, $dbw ) {
	$columns = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		array_unshift( $columns, 'cl_collation' );
	}

	// Equalities on the columns already handled ("a = A", "b = B", ...).
	$matchedSoFar = [];
	// One alternative per column; they are OR-ed together at the end.
	$alternatives = [];
	foreach ( $columns as $column ) {
		if ( $column === 'cl_type' && $dbw->getType() === 'mysql' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$sqlValue = intval( $row->cl_type_numeric );
		} else {
			$sqlValue = $dbw->addQuotes( $row->$column );
		}

		$greater = "$column > $sqlValue";
		if ( $matchedSoFar ) {
			$alternatives[] = '(' . implode( ' AND ', $matchedSoFar ) . " AND $greater)";
		} else {
			// Leading column: a bare inequality without parentheses.
			$alternatives[] = $greater;
		}
		$matchedSoFar[] = "$column = $sqlValue";
	}

	return implode( ' OR ', $alternatives );
}
277 
/**
 * Tally one sort key in $this->sizeHistogram, keyed by its length in bytes.
 *
 * @param string $key Sort key to count
 */
private function updateSortKeySizeHistogram( $key ) {
	$byteLength = strlen( $key );
	$seenSoFar = $this->sizeHistogram[$byteLength] ?? 0;
	$this->sizeHistogram[$byteLength] = $seenSoFar + 1;
}
285 
/**
 * Print the sort key length statistics gathered in $this->sizeHistogram.
 *
 * Output consists of the raw per-length counts (lengths 0..max, comma
 * separated) followed by a 20-bin text histogram in which the fullest bin
 * is drawn with 60 asterisks. Nothing is printed when no sort keys were
 * tallied or when the longest tallied key has length 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		// Nothing was tallied (e.g. no rows were processed). max() on an
		// empty array emits a warning on PHP 7 and throws on PHP 8.
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	// $coarseBoundaries[$i] is the exclusive upper length bound of bin $i.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	// The last bin takes everything up to and including the maximum length.
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Add the count to the first bin whose upper bound exceeds this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The search loop ran to completion, so the length belongs to the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one count is positive here, so $maxBinVal is never zero.
	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// Truncate explicitly: passing a fractional float where an int
			// is expected is deprecated since PHP 8.1.
			str_repeat( '*', (int)( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
333 }
334 
335 $maintClass = UpdateCollation::class; // Name of this script's class; presumably read by the file required on the next line -- TODO confirm
336 require_once RUN_MAINTENANCE_IF_MAIN; // Constant defined in Maintenance.php
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
$maintClass
getOption( $name, $default=null)
Get an option, or return the default.
static singleton()
Definition: Collation.php:36
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:86
hasOption( $name)
Checks to see if a particular option exists.
static newFromRow( $row)
Make a Title object from a DB row.
Definition: Title.php:516
const DB_MASTER
Definition: defines.php:26
getBatchCondition( $row, $dbw)
Return an SQL expression selecting rows which sort above the given row, assuming an ordering of cl_co...
wfWaitForSlaves( $ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
addDescription( $text)
Set the description text.
output( $out, $channel=null)
Throw some output to the user.
static factory( $collationName)
Definition: Collation.php:50
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date...
const DB_REPLICA
Definition: defines.php:25
updateSortKeySizeHistogram( $key)
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.