Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 207 |
|
0.00% |
0 / 7 |
CRAP | |
0.00% |
0 / 1 |
UpdateCollation | |
0.00% |
0 / 204 |
|
0.00% |
0 / 7 |
1806 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
2 | |||
init | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
12 | |||
execute | |
0.00% |
0 / 55 |
|
0.00% |
0 / 1 |
156 | |||
updateBatch | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
72 | |||
copyBatch | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
30 | |||
updateSortKeySizeHistogram | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
showSortKeySizeHistogram | |
0.00% |
0 / 40 |
|
0.00% |
0 / 1 |
110 |
1 | <?php |
2 | /** |
3 | * Find all rows in the categorylinks table whose collation is out-of-date |
4 | * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey |
5 | * using the page title and cl_sortkey_prefix. |
6 | * |
7 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or |
10 | * (at your option) any later version. |
11 | * |
12 | * This program is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | * GNU General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU General Public License along |
18 | * with this program; if not, write to the Free Software Foundation, Inc., |
19 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
20 | * http://www.gnu.org/copyleft/gpl.html |
21 | * |
22 | * @file |
23 | * @ingroup Maintenance |
24 | * @author Aryeh Gregor (Simetrical) |
25 | */ |
26 | |
27 | require_once __DIR__ . '/Maintenance.php'; |
28 | |
29 | use MediaWiki\MainConfigNames; |
30 | use MediaWiki\Title\NamespaceInfo; |
31 | use MediaWiki\Title\Title; |
32 | use Wikimedia\Rdbms\IDatabase; |
33 | use Wikimedia\Rdbms\IMaintainableDatabase; |
34 | use Wikimedia\Rdbms\IResultWrapper; |
35 | |
36 | /** |
37 | * Maintenance script that will find all rows in the categorylinks table |
38 | * whose collation is out-of-date. |
39 | * |
40 | * @ingroup Maintenance |
41 | */ |
42 | class UpdateCollation extends Maintenance { |
43 | /** @var int[] Number of generated sort keys per byte length (length => count); only filled when verbose stats are enabled */ |
44 | public $sizeHistogram = []; |
45 | |
46 | /** @var int Running total of rows updated or copied; for an in-place dry run, rows whose sort key would change */ |
47 | private $numRowsProcessed = 0; |
48 | |
49 | /** @var bool Value of the --force option: visit all rows instead of only rows with an out-of-date cl_collation */ |
50 | private $force; |
51 | |
52 | /** @var bool Value of the --dry-run option: compute sort keys and counts without writing any rows */ |
53 | private $dryRun; |
54 | |
55 | /** @var bool Value of the --verbose-stats option: collect and print the sort key size histogram */ |
56 | private $verboseStats; |
57 | |
58 | /** @var Collation Collation object, built by the collation factory in init(), that generates the new sort keys */ |
59 | private $collation; |
60 | |
61 | /** @var string Collation name stored in cl_collation; taken from --target-collation or the CategoryCollation setting */ |
62 | private $collationName; |
63 | |
64 | /** @var string|null Value of --target-table: table that rows are copied into, or null to update categorylinks in place */ |
65 | private $targetTable; |
66 | |
67 | /** @var IDatabase Replica connection; used for the MAX(page_id) lookup and to build the cl_collation condition */ |
68 | private $dbr; |
69 | |
70 | /** @var IMaintainableDatabase Primary connection; used for the batch selects, table creation and all row writes */ |
71 | private $dbw; |
72 | |
73 | /** @var NamespaceInfo Used to derive the cl_type value from a row's page_namespace */ |
74 | private $namespaceInfo; |
75 | |
/**
 * Register the script description, the default batch size and every
 * command line option this script understands.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( <<<'TEXT'
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as $wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
	);

	$this->setBatchSize( 100 );

	// Each entry is an argument list for addOption(), in registration order:
	// name, description[, required, withArg[, shortName]].
	$optionSpecs = [
		[
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.',
			false,
			false,
			'f',
		],
		[
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this script, ' .
				'especially if your categorylinks table is large. ' .
				'This will only update rows with that collation, though, ' .
				'so it may miss out-of-date rows with a different, even older collation.',
			false,
			true,
		],
		[
			'target-collation',
			'Set this to the new collation type to use instead of $wgCategoryCollation. ' .
				'Usually you should not use this, ' .
				'you should just update $wgCategoryCollation in LocalSettings.php.',
			false,
			true,
		],
		[
			'target-table',
			'Copy rows from categorylinks into the specified table ' .
				'instead of updating them in place.',
			false,
			true,
		],
		[
			'remote',
			'Use Shellbox to calculate the new sort keys remotely.',
		],
		[
			'dry-run',
			"Don't actually change the collations, just compile statistics.",
		],
		[
			'verbose-stats',
			'Show more statistics.',
		],
	];
	foreach ( $optionSpecs as $spec ) {
		$this->addOption( ...$spec );
	}
}
107 | |
/**
 * Look up the required services and fill the member variables from the
 * command line options and site configuration.
 */
private function init() {
	$services = $this->getServiceContainer();
	$this->namespaceInfo = $services->getNamespaceInfo();

	// --target-collation overrides the configured category collation
	$this->collationName = $this->hasOption( 'target-collation' )
		? $this->getOption( 'target-collation' )
		: $this->getConfig()->get( MainConfigNames::CategoryCollation );

	// With --remote the factory is asked for the "remote-" variant, while the
	// plain name is what ends up in cl_collation.
	$factoryName = $this->hasOption( 'remote' )
		? 'remote-' . $this->collationName
		: $this->collationName;
	$this->collation = $services->getCollationFactory()->makeCollation( $factoryName );

	// Collation check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$this->collation->getSortKey( 'MediaWiki' );

	$this->force = $this->getOption( 'force' );
	$this->dryRun = $this->getOption( 'dry-run' );
	$this->verboseStats = $this->getOption( 'verbose-stats' );

	$this->dbw = $this->getPrimaryDB();
	$this->dbr = $this->getReplicaDB();
	$this->targetTable = $this->getOption( 'target-table' );
}
138 | |
/**
 * Run the script.
 *
 * Walks the categorylinks table in cl_from ranges of one batch size each,
 * from 0 up to the highest page_id, and hands every non-empty batch to
 * copyBatch() (when a target table was given) or updateBatch() (otherwise).
 * Progress is printed after each batch; the sort key size histogram is
 * printed at the end when verbose stats were requested.
 */
public function execute() {
	$this->init();
	$batchSize = $this->getBatchSize();

	// A dry run only compiles statistics and copyBatch() inserts nothing in
	// that mode, so the target table is created only for a real run.
	if ( $this->targetTable
		&& !$this->dryRun
		&& !$this->dbw->tableExists( $this->targetTable, __METHOD__ )
	) {
		$this->output( "Creating table {$this->targetTable}\n" );
		$this->dbw->query(
			'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
			' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
			__METHOD__
		);
	}

	// Unless all rows must be visited (--force, or copying to a target table),
	// restrict the scan to rows whose collation is out-of-date.
	$collationConds = [];
	if ( !$this->force && !$this->targetTable ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds[] = $this->dbr->expr( 'cl_collation', '!=', $this->collationName );
		}
	}

	$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
		->select( 'MAX(page_id)' )
		->from( 'page' )
		->caller( __METHOD__ )->fetchField();

	// cl_type must be selected as a number for proper paging because
	// enums suck. The expression does not depend on the batch, so it is
	// built once up front.
	$clType = $this->dbw->getType() === 'mysql'
		? 'cl_type+0 AS "cl_type_numeric"'
		: 'cl_type';

	$batchValue = 0;
	do {
		$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

		$res = $this->dbw->newSelectQueryBuilder()
			->select( [
				'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType, 'cl_timestamp',
				'page_namespace', 'page_title'
			] )
			->from( 'categorylinks' )
			// per T58041
			->straightJoin( 'page', null, 'cl_from = page_id' )
			->where( $collationConds )
			->andWhere(
				$this->dbw->expr( 'cl_from', '>=', $batchValue )
					->and( 'cl_from', '<', $batchValue + $batchSize )
			)
			->orderBy( 'cl_from' )
			->caller( __METHOD__ )->fetchResultSet();
		$this->output( "processing... " );

		if ( $res->numRows() ) {
			if ( $this->targetTable ) {
				$this->copyBatch( $res );
			} else {
				$this->updateBatch( $res );
			}
		}
		$batchValue += $batchSize;

		if ( $this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
		} else {
			$this->output( "{$this->numRowsProcessed} done.\n" );
		}
	} while ( $maxPageId >= $batchValue );

	if ( !$this->dryRun ) {
		$this->output( "{$this->numRowsProcessed} rows processed\n" );
	}

	if ( $this->verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
220 | |
/**
 * Recompute the sort key of every row in the batch and write it back to
 * the categorylinks table, all inside a single transaction.
 *
 * In a dry run nothing is written; only rows whose sort key would change
 * are counted.
 *
 * @param IResultWrapper $res Batch of joined categorylinks/page rows
 */
private function updateBatch( IResultWrapper $res ) {
	$writing = !$this->dryRun;
	if ( $writing ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );
	}

	foreach ( $res as $row ) {
		$title = Title::newFromRow( $row );

		if ( $row->cl_collation ) {
			$prefix = $row->cl_sortkey_prefix;
		} elseif ( $row->cl_sortkey === $title->getText()
			|| $row->cl_sortkey === $title->getPrefixedText()
		) {
			# Old-style row whose sortkey is just the title: no prefix.
			$prefix = '';
		} else {
			# Old-style row with a custom sortkey, so use it as a prefix
			$prefix = $row->cl_sortkey;
		}

		# cl_type will be wrong for lots of pages if cl_collation is 0,
		# so let's update it while we're here.
		$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );

		$fullSortKey = $this->collation->getSortKey( $title->getCategorySortkey( $prefix ) );
		// The histogram records the untruncated size
		$this->updateSortKeySizeHistogram( $fullSortKey );
		// Truncate to 230 bytes to avoid DB error
		$newSortKey = substr( $fullSortKey, 0, 230 );

		if ( !$writing ) {
			// Count the row only if the sortkey was changed. (Note that this doesn't count
			// changes in other fields, if any, those usually only happen when upgrading
			// old MediaWikis.)
			if ( $row->cl_sortkey !== $newSortKey ) {
				$this->numRowsProcessed++;
			}
			continue;
		}

		$this->dbw->newUpdateQueryBuilder()
			->update( 'categorylinks' )
			->set( [
				'cl_sortkey' => $newSortKey,
				'cl_sortkey_prefix' => $prefix,
				'cl_collation' => $this->collationName,
				'cl_type' => $type,
				'cl_timestamp = cl_timestamp',
			] )
			->where( [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ] )
			->caller( __METHOD__ )
			->execute();
		$this->numRowsProcessed++;
	}

	if ( $writing ) {
		$this->commitTransaction( $this->dbw, __METHOD__ );
	}
}
277 | |
/**
 * Insert the rows of a batch, with freshly computed sort keys, into the
 * target table.
 *
 * The sort keys for the whole batch are requested from the collation in a
 * single getSortKeys() call. In a dry run the rows are only counted.
 *
 * @param IResultWrapper $res Batch of joined categorylinks/page rows
 * @throws RuntimeException If the collation returned no sort key for a row
 */
private function copyBatch( IResultWrapper $res ) {
	// First pass: gather the sort key inputs in result order
	$inputs = [];
	foreach ( $res as $row ) {
		$inputs[] = Title::newFromRow( $row )
			->getCategorySortkey( $row->cl_sortkey_prefix );
	}
	$sortKeys = $this->collation->getSortKeys( $inputs );

	// Second pass: pair each row with the sort key at the same position
	$rowsToInsert = [];
	foreach ( $res as $i => $row ) {
		$fullSortKey = $sortKeys[$i] ?? null;
		if ( $fullSortKey === null ) {
			throw new RuntimeException( 'Unable to get sort key' );
		}
		// The histogram records the untruncated size
		$this->updateSortKeySizeHistogram( $fullSortKey );

		$rowsToInsert[] = [
			'cl_from' => $row->cl_from,
			'cl_to' => $row->cl_to,
			// Truncate to 230 bytes to avoid DB error
			'cl_sortkey' => substr( $fullSortKey, 0, 230 ),
			'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
			'cl_collation' => $this->collationName,
			'cl_type' => $this->namespaceInfo->getCategoryLinkType( $row->page_namespace ),
			'cl_timestamp' => $row->cl_timestamp
		];
	}

	if ( $this->dryRun ) {
		$this->numRowsProcessed += count( $rowsToInsert );
		return;
	}

	$this->beginTransaction( $this->dbw, __METHOD__ );
	$this->dbw->newInsertQueryBuilder()
		->insertInto( $this->targetTable )
		->ignore()
		->rows( $rowsToInsert )
		->caller( __METHOD__ )->execute();
	// Rows skipped by the IGNORE clause are not counted
	$this->numRowsProcessed += $this->dbw->affectedRows();
	$this->commitTransaction( $this->dbw, __METHOD__ );
}
321 | |
/**
 * Record the byte length of a sort key in the size histogram.
 *
 * Does nothing unless verbose stats are enabled.
 *
 * @param string $key Sort key, before truncation
 */
private function updateSortKeySizeHistogram( string $key ) {
	if ( $this->verboseStats ) {
		$size = strlen( $key );
		$this->sizeHistogram[$size] = ( $this->sizeHistogram[$size] ?? 0 ) + 1;
	}
}
335 | |
/**
 * Print the sort key size statistics collected by
 * updateSortKeySizeHistogram().
 *
 * Outputs the raw per-length counts, followed by a 20-bin histogram whose
 * fullest bin is drawn as a bar of 60 asterisks and whose other bars are
 * scaled proportionally. Prints nothing when no keys were recorded or when
 * the longest recorded key has length 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength === 0 ) {
		return;
	}

	$numBins = 20;
	$maxBarWidth = 60;

	// Exclusive upper bound of every bin; the last bin always ends just past
	// the longest key so that every length falls into some bin.
	$coarseBoundaries = [];
	$boundary = 0;
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = (int)round( $boundary );
	}
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;

	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$rawCounts = [];
	for ( $length = 0; $length <= $maxLength; $length++ ) {
		$val = $this->sizeHistogram[$length] ?? 0;
		$rawCounts[] = $val;
		// Add the count to the first bin whose upper bound lies above this length
		foreach ( $coarseBoundaries as $coarseIndex => $upperBound ) {
			if ( $upperBound > $length ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
	}
	$raw = implode( ', ', $rawCounts );

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	// At least one key of length $maxLength was recorded, so this is > 0.
	$maxBinVal = max( $coarseHistogram );
	$prevBoundary = 0;
	foreach ( $coarseHistogram as $coarseIndex => $val ) {
		$boundary = $coarseBoundaries[$coarseIndex];
		// Scale each bar individually in integer arithmetic. A single
		// pre-truncated factor (int)( 60 / $maxBinVal ) becomes 0 as soon as
		// a bin holds more than 60 keys, which would hide every bar.
		$barLength = intdiv( $maxBarWidth * $val, $maxBinVal );
		$this->output(
			sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', $barLength )
			)
		);
		$prevBoundary = $boundary;
	}
}
394 | } |
395 | // NOTE(review): $maintClass is presumably read by the file RUN_MAINTENANCE_IF_MAIN points to, so that this class runs when the script is invoked directly - confirm in Maintenance.php. |
396 | $maintClass = UpdateCollation::class; |
397 | require_once RUN_MAINTENANCE_IF_MAIN; |