MediaWiki REL1_37
updateCollation.php
Go to the documentation of this file.
1<?php
27require_once __DIR__ . '/Maintenance.php';
28
31
39 private const BATCH_SIZE = 100; // Number of rows to process in one batch
40
41 public $sizeHistogram = [];
42
/**
 * Set up the script: register its description and every command-line
 * option it understands.
 */
public function __construct() {
	parent::__construct();

	$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT;
	$this->addDescription( $description );

	// --force / -f: flag without an argument.
	$forceHelp = 'Run on all rows, even if the collation is supposed to be up-to-date.';
	$this->addOption( 'force', $forceHelp, false, false, 'f' );

	// --previous-collation <name>: optional, takes an argument.
	$previousHelp = 'Set the previous value of $wgCategoryCollation here to speed up this script, '
		. 'especially if your categorylinks table is large. '
		. 'This will only update rows with that collation, though, '
		. 'so it may miss out-of-date rows with a different, even older collation.';
	$this->addOption( 'previous-collation', $previousHelp, false, true );

	// --target-collation <name>: optional, takes an argument.
	$targetHelp = 'Set this to the new collation type to use instead of $wgCategoryCollation. '
		. 'Usually you should not use this, '
		. 'you should just update $wgCategoryCollation in LocalSettings.php.';
	$this->addOption( 'target-collation', $targetHelp, false, true );

	// Plain flags.
	$this->addOption( 'dry-run', "Don't actually change the collations, just compile statistics." );
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
69
/**
 * Do the actual work: recompute the sort key of categorylinks rows whose
 * collation is out of date.
 *
 * Rows are read from the primary database in batches of self::BATCH_SIZE,
 * paged on the ORDER BY columns, and each batch is written inside its own
 * transaction. With --dry-run nothing is written and only the rows whose
 * sort key would change are counted. With --verbose-stats a histogram of
 * the new sort key lengths is printed once all batches are done.
 */
public function execute() {
	$dbw = $this->getDB( DB_PRIMARY );
	$dbr = $this->getDB( DB_REPLICA );
	$force = $this->getOption( 'force' );
	$dryRun = $this->getOption( 'dry-run' );
	$verboseStats = $this->getOption( 'verbose-stats' );
	if ( $this->hasOption( 'target-collation' ) ) {
		$collationName = $this->getOption( 'target-collation' );
	} else {
		$collationName = $this->getConfig()->get( 'CategoryCollation' );
	}
	$services = MediaWikiServices::getInstance();
	$collation = $services->getCollationFactory()->makeCollation( $collationName );

	// Collation sanity check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$collation->getFirstLetter( 'MediaWiki' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => self::BATCH_SIZE,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$force ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [ 0 =>
				'cl_collation != ' . $dbw->addQuotes( $collationName )
			];
		}

		$count = $dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
		$services->getDBLoadBalancerFactory()->waitForReplication();
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The database type cannot change between batches, so
	// this is decided once up front.
	if ( $dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}
	$namespaceInfo = $services->getNamespaceInfo();

	$count = 0;
	$batchConds = [];
	do {
		$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

		$res = $dbw->select(
			[ 'categorylinks', 'page' ],
			[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType,
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( !$dryRun ) {
			$this->beginTransaction( $dbw, __METHOD__ );
		}
		$lastRow = null;
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted.
				if ( $row->cl_sortkey == $title->getText()
					|| $row->cl_sortkey == $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $namespaceInfo->getCategoryLinkType( $title->getNamespace() );
			$newSortKey = $collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			if ( $verboseStats ) {
				$this->updateSortKeySizeHistogram( $newSortKey );
			}

			if ( $dryRun ) {
				// Count the row only if the sortkey was changed. (Note that this doesn't count
				// changes in other fields, if any, those usually only happen when upgrading old
				// MediaWikis.)
				if ( $row->cl_sortkey !== $newSortKey ) {
					$count++;
				}
			} else {
				$dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $collationName,
						'cl_type' => $type,
						// Keep the existing timestamp rather than letting it be refreshed.
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$count++;
			}
			$lastRow = $row;
		}
		if ( !$dryRun ) {
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		// Only the last row of a batch determines where the next batch starts.
		if ( $lastRow !== null ) {
			$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
		}

		if ( $dryRun ) {
			$this->output( "$count rows would be updated so far.\n" );
		} else {
			$this->output( "$count done.\n" );
		}
	} while ( $res->numRows() == self::BATCH_SIZE );

	if ( !$dryRun ) {
		$this->output( "$count rows processed\n" );
	}

	if ( $verboseStats ) {
		$this->output( "\n" );
		// Print the statistics gathered by updateSortKeySizeHistogram().
		$this->showSortKeySizeHistogram();
	}
}
235
/**
 * Build the SQL paging condition that matches every row sorting strictly
 * after $row in the ordering used by the batch query.
 *
 * The result has the shape
 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C) ...
 * over the ordering columns, where cl_collation is only part of the
 * ordering when --previous-collation was not given.
 *
 * @param stdClass $row Last row of the previous batch
 * @param IDatabase $dbw Database handle used for quoting
 * @return string SQL condition
 */
private function getBatchCondition( $row, $dbw ) {
	$columns = [ 'cl_to', 'cl_type', 'cl_from' ];
	if ( !$this->hasOption( 'previous-collation' ) ) {
		array_unshift( $columns, 'cl_collation' );
	}

	$alternatives = [];
	$matchedSoFar = [];
	foreach ( $columns as $column ) {
		if ( $column === 'cl_type' && $dbw->getType() === 'mysql' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$sqlValue = intval( $row->cl_type_numeric );
		} else {
			$sqlValue = $dbw->addQuotes( $row->$column );
		}

		$greater = "$column > $sqlValue";
		if ( $matchedSoFar === [] ) {
			// The leading column stands alone, without parentheses.
			$alternatives[] = $greater;
		} else {
			$alternatives[] = '(' . implode( ' AND ', $matchedSoFar ) . " AND $greater)";
		}
		$matchedSoFar[] = "$column = $sqlValue";
	}

	return implode( ' OR ', $alternatives );
}
274
/**
 * Record one sort key in the size histogram, bucketed by its byte length.
 *
 * @param string $key Sort key to account for
 */
private function updateSortKeySizeHistogram( $key ) {
	$bytes = strlen( $key );
	$this->sizeHistogram[$bytes] = ( $this->sizeHistogram[$bytes] ?? 0 ) + 1;
}
282
/**
 * Print the sort key size histogram collected by
 * updateSortKeySizeHistogram().
 *
 * Output is the raw per-length counts followed by a 20-bin bar chart in
 * which the fullest bin is drawn 60 characters wide. Nothing is printed
 * when no sizes were recorded or when the largest recorded size is 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		// round() returns a float; keep the bin boundaries as integers.
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	// The last bin is open-ended up to and including $maxLength.
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper boundary lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The search ran off the end without a match: use the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	// $sizeHistogram is public, so it may hold only zero counts; avoid
	// dividing by zero in that case and simply draw empty bars.
	$scale = $maxBinVal > 0 ? 60 / $maxBinVal : 0;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// str_repeat() needs an int; the explicit cast truncates the
			// fractional bar length instead of relying on an implicit
			// float-to-int conversion, which PHP 8.1 deprecates.
			str_repeat( '*', (int)( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
333}
334
335$maintClass = UpdateCollation::class;
336require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
$wgCategoryCollation
Specify how category names should be sorted, when listed on a category page.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Maintenance script that will find all rows in the categorylinks table whose collation is out-of-date.
execute()
Do the actual work.
__construct()
Default constructor.
updateSortKeySizeHistogram( $key)
getBatchCondition( $row, $dbw)
Return an SQL expression selecting rows which sort above the given row, assuming an ordering of cl_collation, cl_to, cl_type, cl_from.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
const DB_REPLICA
Definition defines.php:25
const DB_PRIMARY
Definition defines.php:27