MediaWiki REL1_35
updateCollation.php
Go to the documentation of this file.
1<?php
27require_once __DIR__ . '/Maintenance.php';
28
31
39 private const BATCH_SIZE = 100; // Number of rows to process in one batch
40 private const SYNC_INTERVAL = 5; // Wait for replica DBs after this many batches
41
42 public $sizeHistogram = [];
43
/**
 * Register the script description and its command-line options.
 *
 * Options: --force (-f), --previous-collation <value>,
 * --target-collation <value>, --dry-run and --verbose-stats.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	// Flag with the short alias -f; takes no argument.
	$forceHelp = 'Run on all rows, even if the collation is ' .
		'supposed to be up-to-date.';
	$this->addOption( 'force', $forceHelp, false, false, 'f' );

	// Optional, takes an argument.
	$previousHelp = 'Set the previous value of ' .
		'$wgCategoryCollation here to speed up this script, especially if your ' .
		'categorylinks table is large. This will only update rows with that ' .
		'collation, though, so it may miss out-of-date rows with a different, ' .
		'even older collation.';
	$this->addOption( 'previous-collation', $previousHelp, false, true );

	// Optional, takes an argument.
	$targetHelp = 'Set this to the new collation type to ' .
		'use instead of $wgCategoryCollation. Usually you should not use this, ' .
		'you should just update $wgCategoryCollation in LocalSettings.php.';
	$this->addOption( 'target-collation', $targetHelp, false, true );

	$dryRunHelp = 'Don\'t actually change the collations, just ' .
		'compile statistics.';
	$this->addOption( 'dry-run', $dryRunHelp );

	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
70
/**
 * Recompute cl_sortkey (and refresh cl_sortkey_prefix, cl_collation and
 * cl_type) for categorylinks rows, working in batches of self::BATCH_SIZE.
 *
 * Unless --force is given, only rows whose cl_collation differs from the
 * target collation (or equals --previous-collation, when supplied) are
 * selected. With --dry-run nothing is written and only rows whose sort key
 * would change are counted. With --verbose-stats the lengths of the new
 * sort keys are collected and a histogram is printed at the end.
 */
public function execute() {
	$dbw = $this->getDB( DB_MASTER );
	$dbr = $this->getDB( DB_REPLICA );
	$force = $this->getOption( 'force' );
	$dryRun = $this->getOption( 'dry-run' );
	$verboseStats = $this->getOption( 'verbose-stats' );
	if ( $this->hasOption( 'target-collation' ) ) {
		$collationName = $this->getOption( 'target-collation' );
		$collation = Collation::factory( $collationName );
	} else {
		$collationName = $this->getConfig()->get( 'CategoryCollation' );
		$collation = Collation::singleton();
	}

	// Collation sanity check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$collation->getFirstLetter( 'MediaWiki' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => self::BATCH_SIZE,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$force ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [ 0 =>
				'cl_collation != ' . $dbw->addQuotes( $collationName )
			];
		}

		$count = $dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
		MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The expression is the same for every batch, so build it once.
	if ( $dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}
	$namespaceInfo = MediaWikiServices::getInstance()->getNamespaceInfo();

	$count = 0;
	$batchConds = [];
	do {
		$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

		$res = $dbw->select(
			[ 'categorylinks', 'page' ],
			[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType,
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( !$dryRun ) {
			$this->beginTransaction( $dbw, __METHOD__ );
		}
		$lastRow = null;
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted.
				if ( $row->cl_sortkey == $title->getText()
					|| $row->cl_sortkey == $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $namespaceInfo->getCategoryLinkType( $title->getNamespace() );
			$newSortKey = $collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			if ( $verboseStats ) {
				$this->updateSortKeySizeHistogram( $newSortKey );
			}

			if ( $dryRun ) {
				// Count the row only if the sortkey was changed. (Note that this doesn't count
				// changes in other fields, if any, those usually only happen when upgrading
				// old MediaWikis.)
				if ( $row->cl_sortkey !== $newSortKey ) {
					$count++;
				}
			} else {
				$dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $collationName,
						'cl_type' => $type,
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$count++;
			}
			$lastRow = $row;
		}
		if ( !$dryRun ) {
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		// Only the last row of the batch determines where the next batch
		// starts, so the paging condition is built once per batch. An empty
		// batch leaves the previous condition untouched.
		if ( $lastRow !== null ) {
			$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
		}

		if ( $dryRun ) {
			$this->output( "$count rows would be updated so far.\n" );
		} else {
			$this->output( "$count done.\n" );
		}
	} while ( $res->numRows() == self::BATCH_SIZE );

	if ( !$dryRun ) {
		$this->output( "$count rows processed\n" );
	}

	if ( $verboseStats ) {
		$this->output( "\n" );
		// Print the statistics gathered by updateSortKeySizeHistogram();
		// skip the call when no row was seen, since there is nothing to show.
		if ( $this->sizeHistogram ) {
			$this->showSortKeySizeHistogram();
		}
	}
}
237
/**
 * Build an SQL condition selecting the rows that sort after $row, for the
 * column order used when paging through categorylinks:
 * (cl_collation,) cl_to, cl_type, cl_from. cl_collation is left out when
 * the previous-collation option is set.
 *
 * The result has the form
 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C) ...
 *
 * @param object $row Result row of the batch query in execute(); on mysql
 *   it must carry the cl_type_numeric alias
 * @param object $dbw Database handle providing getType() and addQuotes()
 * @return string
 */
private function getBatchCondition( $row, $dbw ) {
	$columns = $this->hasOption( 'previous-collation' )
		? [ 'cl_to', 'cl_type', 'cl_from' ]
		: [ 'cl_collation', 'cl_to', 'cl_type', 'cl_from' ];

	// One OR-alternative per column.
	$alternatives = [];
	// Equality tests on all columns handled so far.
	$pinned = [];

	foreach ( $columns as $column ) {
		if ( $dbw->getType() === 'mysql' && $column === 'cl_type' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$literal = intval( $row->cl_type_numeric );
		} else {
			$literal = $dbw->addQuotes( $row->$column );
		}

		$greater = "$column > $literal";
		if ( $pinned ) {
			$alternatives[] = '(' . implode( ' AND ', $pinned ) . " AND $greater)";
		} else {
			$alternatives[] = $greater;
		}
		$pinned[] = "$column = $literal";
	}

	return implode( ' OR ', $alternatives );
}
276
/**
 * Record one sort key in the size histogram, bucketed by its byte length.
 *
 * @param string $key Sort key to account for
 */
private function updateSortKeySizeHistogram( $key ) {
	$byteCount = strlen( $key );
	// Start the bucket at zero on first sight, then count this key.
	$this->sizeHistogram[$byteCount] = ( $this->sizeHistogram[$byteCount] ?? 0 ) + 1;
}
284
/**
 * Print the sort key length statistics collected in $this->sizeHistogram.
 *
 * Outputs the raw per-length counts as a comma-separated list, followed by
 * a 20-bin bar chart whose longest bar is 60 characters wide. Prints
 * nothing when no lengths were recorded or when the largest recorded
 * length is zero.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		// max() on an empty array is an error (ValueError on PHP 8).
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$coarseBoundaries = [];
	$boundary = 0;
	// Exclusive upper bounds of the first $numBins - 1 bins, evenly spaced.
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	// The last bin takes everything up to and including $maxLength.
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper bound lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// No earlier bin matched, so the length belongs to the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// str_repeat() wants an integer count; truncate explicitly
			// instead of relying on implicit float-to-int conversion.
			str_repeat( '*', (int)( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
332}
333
334$maintClass = UpdateCollation::class;
335require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
$wgCategoryCollation
Specify how category names should be sorted, when listed on a category page.
const RUN_MAINTENANCE_IF_MAIN
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
updateSortKeySizeHistogram( $key)
getBatchCondition( $row, $dbw)
Return an SQL expression selecting rows which sort above the given row, assuming an ordering of cl_co...
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
const DB_REPLICA
Definition defines.php:25
const DB_MASTER
Definition defines.php:29