MediaWiki master
updateCollation.php
Go to the documentation of this file.
1<?php
27require_once __DIR__ . '/Maintenance.php';
28
35
/** @var int[] Histogram of generated sort key sizes: key length in bytes => number of keys */
public $sizeHistogram = [];

/** @var int Rows processed so far (in a dry run: rows whose sort key would change or be copied) */
private $numRowsProcessed = 0;

/** Value of the 'force' option: run on all rows even if their collation looks up to date */
private $force;

/** Value of the 'dry-run' option: only compile statistics, do not write */
private $dryRun;

/** Value of the 'verbose-stats' option: collect and print the sort key size histogram */
private $verboseStats;

/** Collation object produced by the collation factory; used to compute the new sort keys */
private $collation;

/** @var string Collation name written to cl_collation ('target-collation' option or the configured default) */
private $collationName;

/** Value of the 'target-table' option: table to copy rows into, or null to update in place */
private $targetTable;

/** Replica database handle, as returned by getReplicaDB() */
private $dbr;

/** Primary database handle, as returned by getPrimaryDB() */
private $dbw;

/** Namespace info service; used to derive cl_type from the page namespace */
private $namespaceInfo;
75
/**
 * Register the script description, the default batch size (100) and every
 * command-line option understood by this script.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
	);

	$this->setBatchSize( 100 );

	// Flag with the short alias -f.
	$forceHelp = 'Run on all rows, even if the collation is supposed to be up-to-date.';
	$this->addOption( 'force', $forceHelp, false, false, 'f' );

	// The following three options each take an argument.
	$previousCollationHelp =
		'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
		'especially if your categorylinks table is large. ' .
		'This will only update rows with that collation, though, ' .
		'so it may miss out-of-date rows with a different, even older collation.';
	$this->addOption( 'previous-collation', $previousCollationHelp, false, true );

	$targetCollationHelp =
		'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
		'Usually you should not use this, ' .
		'you should just update $wgCategoryCollation in LocalSettings.php.';
	$this->addOption( 'target-collation', $targetCollationHelp, false, true );

	$targetTableHelp =
		'Copy rows from categorylinks into the specified table ' .
		'instead of updating them in place.';
	$this->addOption( 'target-table', $targetTableHelp, false, true );

	// Plain flags.
	$this->addOption( 'remote', 'Use Shellbox to calculate the new sort keys remotely.' );
	$this->addOption( 'dry-run', "Don't actually change the collations, just compile statistics." );
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
107
/**
 * Resolve services and command-line options into instance state.
 *
 * Builds the collation object (using the "remote-" prefixed variant when
 * the 'remote' option is given) and exercises it once so that a collation
 * that cannot produce sort keys fails here rather than mid-run.
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();

	// An explicit --target-collation wins over the configured collation.
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// The name handed to the factory may differ from the one stored in the
	// database: with --remote the factory is asked for the "remote-" variant.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );

	$this->dbw = $this->getPrimaryDB();
	$this->dbr = $this->getReplicaDB();

	$this->targetTable = $this->getOption( 'target-table' );
}
138
/**
 * Walk the categorylinks table in cl_from ranges of one batch size each and
 * either update the rows in place or copy them into the target table.
 *
 * Unless 'force' or 'target-table' is given, only rows whose cl_collation
 * differs from the target collation (or equals 'previous-collation', when
 * that option is set) are selected.
 *
 * In dry-run mode nothing is written: the target table is not created and
 * the batch handlers only count rows.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
		if ( $this->dryRun ) {
			// A dry run must not alter the schema. copyBatch() never touches
			// the target table in dry-run mode, so it is safe to go on without it.
			$this->output( "Would create table {$this->targetTable}\n" );
		} else {
			$this->output( "Creating table {$this->targetTable}\n" );
			$this->dbw->query(
				'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
				' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
				__METHOD__
			);
		}
	}

	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [
				0 => $this->dbr->expr( 'cl_collation', '!=', $this->collationName )
			];
		}
	}

	// Batches are ranges of page ids, so the loop ends once the range start
	// has passed the highest existing page id.
	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	// cl_type must be selected as a number for proper paging because
	// enums suck. The expression does not change between batches, so it is
	// computed once.
	if ( $this->dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			] )
			->from( 'categorylinks' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue += $batchSize;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
222
/**
 * Recompute the sort key of every row in the batch and update the
 * categorylinks table in place, inside a single transaction.
 *
 * In dry-run mode nothing is written; instead the processed-row counter is
 * incremented for each row whose (truncated) sort key would change.
 *
 * @param IResultWrapper $res Rows selected by execute(): categorylinks
 *  fields joined with page_namespace and page_title
 */
private function updateBatch( $res ) {
	if ( !$this->dryRun ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}
	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );
		if ( !$row->cl_collation ) {
			# This is an old-style row, so the sortkey needs to be
			# converted.
			# Strict comparison on purpose: with == two numeric strings such
			# as "1e3" and "1000" compare equal, which would silently drop a
			# custom sortkey.
			if ( $row->cl_sortkey === $title->getText()
				|| $row->cl_sortkey === $title->getPrefixedText()
			) {
				$prefix = '';
			} else {
				# Custom sortkey, use it as a prefix
				$prefix = $row->cl_sortkey;
			}
		} else {
			$prefix = $row->cl_sortkey_prefix;
		}
		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
		$newSortKey = $this->collation->getSortKey(
			$title->getCategorySortkey( $prefix ) );
		// The histogram records the full key size, before truncation.
		$this->updateSortKeySizeHistogram( $newSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $newSortKey, 0, 230 );

		if ( $this->dryRun ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count changes in
			// other fields, if any, those usually only happen when upgrading old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
		} else {
			$this->dbw->update(
				'categorylinks',
				[
					'cl_sortkey' => $newSortKey,
					'cl_sortkey_prefix' => $prefix,
					'cl_collation' => $this->collationName,
					'cl_type' => $type,
					// Raw SQL self-assignment of the timestamp column.
					'cl_timestamp = cl_timestamp',
				],
				[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
				__METHOD__
			);
			$this->numRowsProcessed++;
		}
	}
	if ( !$this->dryRun ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
281
/**
 * Compute new sort keys for a whole batch in one collation call and insert
 * the resulting rows into the target table.
 *
 * In dry-run mode the rows are only counted. Otherwise they are inserted
 * with the "ignore" flag inside one transaction, and the counter grows by
 * the number of rows the database reports as affected.
 *
 * @param IResultWrapper $res Rows selected by execute()
 * @throws RuntimeException If the collation returns no sort key for a row
 */
private function copyBatch( $res ) {
	// First pass: gather the collation inputs so the keys can be computed in bulk.
	$inputs = [];
	foreach ( $res as $sourceRow ) {
		$inputs[] = Title::newFromRow( $sourceRow )
			->getCategorySortkey( $sourceRow->cl_sortkey_prefix );
	}
	$computedKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair each source row with its computed key by position.
	$batch = [];
	foreach ( $res as $position => $sourceRow ) {
		if ( !isset( $computedKeys[$position] ) ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		$fullKey = $computedKeys[$position];
		// The histogram records the full key size, before truncation.
		$this->updateSortKeySizeHistogram( $fullKey );
		$batch[] = [
			'cl_from' => $sourceRow->cl_from,
			'cl_to' => $sourceRow->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $fullKey, 0, 230 ),
			'cl_sortkey_prefix' => $sourceRow->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $sourceRow->page_namespace ),
			'cl_timestamp' => $sourceRow->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $batch );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $batch )
		->caller( __METHOD__ )->execute();
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
327
/**
 * Record the byte length of one sort key in the size histogram.
 *
 * Does nothing unless verbose statistics were requested.
 *
 * @param string $key Sort key whose strlen() is counted
 */
private function updateSortKeySizeHistogram( $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		// Missing buckets start at zero.
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
343
/**
 * Print the sort key size histogram: first the raw per-length counts, then
 * a 20-bin bar chart whose longest bar is 60 asterisks wide.
 *
 * Prints nothing when no sizes were recorded or when every recorded key
 * was empty (maximum length 0).
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	// Exclusive upper bound of each bin; the last bin ends just past the
	// maximum length so that it catches everything that is left.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper bound lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran to completion without a match: use the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one key of non-zero length was recorded (checked above), so
	// the largest bin is never empty and the division is safe.
	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex];
		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
		$boundary = $coarseBoundaries[$coarseIndex];
		// $scale * $val is usually fractional; cast explicitly because
		// passing such a float to str_repeat()'s int parameter is
		// deprecated as of PHP 8.1.
		$barWidth = (int)( $scale * $val );
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			str_repeat( '*', $barWidth ) ) );
		$prevBoundary = $boundary;
	}
}
399}
400
401$maintClass = UpdateCollation::class;
402require_once RUN_MAINTENANCE_IF_MAIN;
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
Represents a title within MediaWiki.
Definition Title.php:78
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
$wgCategoryCollation
Config variable stub for the CategoryCollation setting, for use by phpdoc and IDEs.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:36
Advanced database interface for IDatabase handles that include maintenance methods.
Result wrapper for grabbing data queried from an IDatabase object.