MediaWiki master
updateCollation.php
Go to the documentation of this file.
1<?php
27// @codeCoverageIgnoreStart
28require_once __DIR__ . '/Maintenance.php';
29// @codeCoverageIgnoreEnd
30
37
/** Histogram of computed sort key sizes: byte length => number of keys seen. Only filled when verbose stats are enabled. */
46 public $sizeHistogram = [];
47
/** Running count of rows updated or inserted; in a dry run, the rows that would have been. */
49 private $numRowsProcessed = 0;
50
/** Value of the 'force' option: process rows even if their collation already matches. */
52 private $force;
53
/** Value of the 'dry-run' option: compute sort keys and statistics without writing rows. */
55 private $dryRun;
56
/** Value of the 'verbose-stats' option: collect and print the sort key size histogram. */
58 private $verboseStats;
59
/** Collation object obtained from the collation factory; used to compute the new sort keys. */
61 private $collation;
62
/** Name of the target collation, as written to cl_collation (never carries the 'remote-' prefix). */
64 private $collationName;
65
/** Value of the 'target-table' option; when set, rows are copied there instead of updated in place. */
67 private $targetTable;
68
/** Replica database handle (from getReplicaDB()). */
70 private $dbr;
71
/** Primary database handle (from getPrimaryDB()); used for the batch reads and all writes. */
73 private $dbw;
74
/** NamespaceInfo service; used to derive cl_type from a page's namespace. */
76 private $namespaceInfo;
77
/**
 * Register the script description, the default batch size (100) and
 * all command line options understood by this script.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	$this->setBatchSize( 100 );

	// Each entry is an argument list for addOption():
	// name, description[, required, takes an argument[, short name]].
	// Entries are registered in the order listed here.
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.',
			false, false, 'f'
		],
		[
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this ' .
				'script, especially if your categorylinks table is large. This will only ' .
				'update rows with that collation, though, so it may miss out-of-date rows ' .
				'with a different, even older collation.',
			false, true
		],
		[
			'target-collation',
			'Set this to the new collation type to use instead of ' .
				'$wgCategoryCollation. Usually you should not use this, you should ' .
				'just update $wgCategoryCollation in LocalSettings.php.',
			false, true
		],
		[
			'target-table',
			'Copy rows from categorylinks into the specified table instead of ' .
				'updating them in place.',
			false, true
		],
		[
			'remote',
			'Use Shellbox to calculate the new sort keys remotely.'
		],
		[
			'dry-run',
			"Don't actually change the collations, just compile statistics."
		],
		[
			'verbose-stats',
			'Show more statistics.'
		],
	];
	foreach ( $optionSpecs as $spec ) {
		$this->addOption( ...$spec );
	}
}
109
/**
 * Resolve the target collation, verify that it can actually produce a
 * sort key, and copy the command line options and database handles
 * into the instance fields used by the rest of the script.
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();

	// An explicit --target-collation wins over the configured collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the 'remote-' variant, while
	// $this->collationName keeps the plain name.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );
	$this->dbw = $this->getPrimaryDB();
	$this->dbr = $this->getReplicaDB();
	$this->targetTable = $this->getOption( 'target-table' );
}
140
/**
 * Walk the categorylinks table in cl_from ranges of one batch size each
 * and, per range, either rewrite the matching rows in place or copy
 * them into the target table.
 *
 * Unless --force or --target-table is given, only rows whose
 * cl_collation differs from the target collation (or equals
 * --previous-collation, when supplied) are selected.
 *
 * In a dry run nothing is written: a missing target table is only
 * reported, not created.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		if ( $this->dryRun ) {
			// A dry run must not write to the database. copyBatch() only
			// counts rows in this mode, so the table is not needed.
			$this->output( "Would create table {$this->targetTable}\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
				__METHOD__
			);
		}
	}

	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds[] = $this->dbr->expr( 'cl_collation', '!=', $this->collationName );
		}
	}

	// The page ID upper bound tells the loop when every cl_from range
	// has been visited.
	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	// cl_type must be selected as a number for proper paging because
	// it is an enum on MySQL. The expression does not depend on the
	// batch, so it is built once.
	if ( $this->dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			] )
			->from( 'categorylinks' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue += $batchSize;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
222
/**
 * Recompute the sort key for every row of a batch and write it back to
 * categorylinks. In a dry run nothing is written; only rows whose sort
 * key would change are counted.
 *
 * @param IResultWrapper $res Rows selected by execute()
 */
private function updateBatch( IResultWrapper $res ) {
	$writing = !$this->dryRun;
	if ( $writing ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey === $title->getText()
			|| $row->cl_sortkey === $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, so use it as a prefix
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$linkType = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated size.
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$storedSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$writing ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count
			// changes in other fields, if any, those usually only happen when upgrading
			// old MediaWikis.)
			if ( $row->cl_sortkey !== $storedSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$this->dbw->newUpdateQueryBuilder()
			->update( 'categorylinks' )
			->set( [
				'cl_sortkey' => $storedSortKey,
				'cl_sortkey_prefix' => $prefix,
				'cl_collation' => $this->collationName,
				'cl_type' => $linkType,
				'cl_timestamp = cl_timestamp',
			] )
			->where( [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ] )
			->caller( __METHOD__ )
			->execute();
		$this->numRowsProcessed++;
	}

	if ( $writing ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
279
/**
 * Compute the sort keys for a whole batch in one getSortKeys() call
 * and insert the resulting rows into the target table. In a dry run
 * the rows are only counted.
 *
 * @param IResultWrapper $res Rows selected by execute()
 * @throws RuntimeException If no sort key came back for one of the rows
 */
private function copyBatch( IResultWrapper $res ) {
	// First pass: gather the inputs so the collation can process them together.
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$computed = $this->collation->getSortKeys( $inputs );

	// Second pass: pair each row with the sort key at the same position.
	$newRows = [];
	foreach ( $res as $index => $row ) {
		if ( !isset( $computed[$index] ) ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		// The histogram records the untruncated size.
		$this->updateSortKeySizeHistogram( $computed[$index] );
		$newRows[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $computed[$index], 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $newRows );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $newRows )
		->caller( __METHOD__ )->execute();
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
323
/**
 * Record the byte length of one sort key in the size histogram.
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key whose length is recorded
 */
private function updateSortKeySizeHistogram( string $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
337
/**
 * Print the statistics gathered by updateSortKeySizeHistogram(): first
 * the raw count for every length from 0 to the longest key seen, then
 * a 20-bin text histogram whose longest bar is 60 characters wide.
 *
 * Prints nothing if no sizes were recorded or if every recorded sort
 * key was empty.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}
	$numBins = 20;
	$maxBarWidth = 60;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	// $coarseBoundaries[$i] is the exclusive upper bound of bin $i.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	// The last bin always extends just past the longest key.
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper bound lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran to completion without a match: the length belongs
		// to the last bin.
		if ( $coarseIndex === ( $numBins - 1 ) ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one key of length $maxLength was counted, so the fullest
	// bin is never empty and the division is safe.
	$maxBinVal = max( $coarseHistogram );
	// Keep the scale fractional. Truncating it to an integer made it 0,
	// and every bar empty, as soon as one bin held more than
	// $maxBarWidth keys.
	$scale = $maxBarWidth / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output(
			sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', (int)round( $scale * $val ) )
			)
		);
		$prevBoundary = $boundary;
	}
}
396}
397
398// @codeCoverageIgnoreStart
// Name this script's class in $maintClass, then load the file named by
// RUN_MAINTENANCE_IF_MAIN (presumably the runner that executes the class
// when this file is invoked directly; confirm against Maintenance.php).
399$maintClass = UpdateCollation::class;
400require_once RUN_MAINTENANCE_IF_MAIN;
401// @codeCoverageIgnoreEnd
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Represents a title within MediaWiki.
Definition Title.php:78
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Interface to a relational database.
Definition IDatabase.php:48
Advanced database interface for IDatabase handles that include maintenance methods.
Result wrapper for grabbing data queried from an IDatabase object.
numRows()
Get the number of rows in a result object.