Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 204 |
|
0.00% |
0 / 7 |
CRAP | |
0.00% |
0 / 1 |
UpdateCollation | |
0.00% |
0 / 204 |
|
0.00% |
0 / 7 |
1806 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
2 | |||
init | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
12 | |||
execute | |
0.00% |
0 / 55 |
|
0.00% |
0 / 1 |
156 | |||
updateBatch | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
72 | |||
copyBatch | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
30 | |||
updateSortKeySizeHistogram | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
showSortKeySizeHistogram | |
0.00% |
0 / 40 |
|
0.00% |
0 / 1 |
110 |
1 | <?php |
2 | /** |
3 | * Find all rows in the categorylinks table whose collation is out-of-date |
4 | * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey |
5 | * using the page title and cl_sortkey_prefix. |
6 | * |
7 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or |
10 | * (at your option) any later version. |
11 | * |
12 | * This program is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | * GNU General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU General Public License along |
18 | * with this program; if not, write to the Free Software Foundation, Inc., |
19 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
20 | * http://www.gnu.org/copyleft/gpl.html |
21 | * |
22 | * @file |
23 | * @ingroup Maintenance |
24 | * @author Aryeh Gregor (Simetrical) |
25 | */ |
26 | |
27 | // @codeCoverageIgnoreStart |
28 | require_once __DIR__ . '/Maintenance.php'; |
29 | // @codeCoverageIgnoreEnd |
30 | |
31 | use MediaWiki\MainConfigNames; |
32 | use MediaWiki\Maintenance\Maintenance; |
33 | use MediaWiki\Title\NamespaceInfo; |
34 | use MediaWiki\Title\Title; |
35 | use Wikimedia\Rdbms\IDatabase; |
36 | use Wikimedia\Rdbms\IMaintainableDatabase; |
37 | use Wikimedia\Rdbms\IResultWrapper; |
38 | |
39 | /** |
40 | * Maintenance script that will find all rows in the categorylinks table |
41 | * whose collation is out-of-date. |
42 | * |
43 | * @ingroup Maintenance |
44 | */ |
class UpdateCollation extends Maintenance {
	/** Sort keys are cut to this many bytes before being written, to avoid a DB error. */
	private const MAX_SORTKEY_BYTES = 230;

	/** Number of bins in the coarse sort key size histogram. */
	private const HISTOGRAM_BINS = 20;

	/** Length, in characters, of the bar drawn for the fullest histogram bin. */
	private const HISTOGRAM_BAR_WIDTH = 60;

	/** @var int[] Number of sort keys seen, keyed by untruncated sort key length in bytes */
	public $sizeHistogram = [];

	/** @var int Running count of rows updated, copied, or (in dry-run mode) that would change */
	private $numRowsProcessed = 0;

	/** @var bool Whether --force was given */
	private $force;

	/** @var bool Whether --dry-run was given */
	private $dryRun;

	/** @var bool Whether --verbose-stats was given */
	private $verboseStats;

	/** @var Collation Collation object used to compute the new sort keys */
	private $collation;

	/** @var string Collation name written to cl_collation */
	private $collationName;

	/** @var string|null Value of --target-table, or null to update categorylinks in place */
	private $targetTable;

	/** @var IDatabase */
	private $dbr;

	/** @var IMaintainableDatabase */
	private $dbw;

	/** @var NamespaceInfo */
	private $namespaceInfo;

	/**
	 * Register the script description, the default batch size and the
	 * command line options.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
		);

		$this->setBatchSize( 100 );
		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.', false, false, 'f' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'target-table', 'Copy rows from categorylinks into the ' .
			'specified table instead of updating them in place.', false, true );
		$this->addOption( 'remote', 'Use Shellbox to calculate the new sort keys ' .
			'remotely.' );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Get services and initialise member variables from the command line
	 * options and site configuration.
	 */
	private function init(): void {
		$services = $this->getServiceContainer();
		$this->namespaceInfo = $services->getNamespaceInfo();

		// --target-collation takes precedence over the configured collation.
		$this->collationName = $this->hasOption( 'target-collation' )
			? $this->getOption( 'target-collation' )
			: $this->getConfig()->get( MainConfigNames::CategoryCollation );

		// With --remote the collation object is built under a "remote-" prefixed
		// name, while the plain name is still what gets written to cl_collation.
		$realCollationName = $this->hasOption( 'remote' )
			? 'remote-' . $this->collationName
			: $this->collationName;
		$this->collation = $services->getCollationFactory()->makeCollation( $realCollationName );

		// Collation check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$this->collation->getSortKey( 'MediaWiki' );

		// Cast so that the flags really are booleans, as documented on the
		// properties, instead of whatever getOption() hands back.
		$this->force = (bool)$this->getOption( 'force', false );
		$this->dryRun = (bool)$this->getOption( 'dry-run', false );
		$this->verboseStats = (bool)$this->getOption( 'verbose-stats', false );
		$this->dbw = $this->getPrimaryDB();
		$this->dbr = $this->getReplicaDB();
		$this->targetTable = $this->getOption( 'target-table' );
	}

	/**
	 * Walk over categorylinks in cl_from ranges of one batch size each and
	 * either update the rows in place or copy them into the target table.
	 */
	public function execute() {
		$this->init();

		// A non-positive batch size would never advance the paging cursor below.
		$batchSize = (int)$this->getBatchSize();
		if ( $batchSize < 1 ) {
			$this->fatalError( 'Batch size must be a positive integer.' );
		}

		if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
			if ( $this->dryRun ) {
				// A dry run must not issue DDL; copyBatch() never touches the table in this mode.
				$this->output( "Would create table {$this->targetTable}\n" );
			} else {
				$this->output( "Creating table {$this->targetTable}\n" );
				$this->dbw->query(
					'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
					' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
					__METHOD__
				);
			}
		}

		// Restrict to out-of-date rows unless every row is wanted (--force, or
		// when copying to a target table).
		$collationConds = [];
		if ( !$this->force && !$this->targetTable ) {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
			} else {
				$collationConds[] = $this->dbr->expr( 'cl_collation', '!=', $this->collationName );
			}
		}

		$maxPageId = (int)$this->dbr->newSelectQueryBuilder()
			->select( 'MAX(page_id)' )
			->from( 'page' )
			->caller( __METHOD__ )->fetchField();

		// cl_type must be selected as a number for proper paging because
		// enums suck.
		$clType = $this->dbw->getType() === 'mysql'
			? 'cl_type+0 AS "cl_type_numeric"'
			: 'cl_type';

		$batchValue = 0;
		do {
			$batchEnd = $batchValue + $batchSize;
			$this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );

			$res = $this->dbw->newSelectQueryBuilder()
				->select( [
					'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', $clType, 'cl_timestamp',
					'page_namespace', 'page_title'
				] )
				->from( 'categorylinks' )
				// per T58041
				->straightJoin( 'page', null, 'cl_from = page_id' )
				->where( $collationConds )
				->andWhere(
					$this->dbw->expr( 'cl_from', '>=', $batchValue )
						->and( 'cl_from', '<', $batchEnd )
				)
				->orderBy( 'cl_from' )
				->caller( __METHOD__ )->fetchResultSet();
			$this->output( "processing... " );

			if ( $res->numRows() ) {
				if ( $this->targetTable ) {
					$this->copyBatch( $res );
				} else {
					$this->updateBatch( $res );
				}
			}
			$batchValue = $batchEnd;

			if ( $this->dryRun ) {
				$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
			} else {
				$this->output( "{$this->numRowsProcessed} done.\n" );
			}
		} while ( $maxPageId >= $batchValue );

		if ( !$this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows processed\n" );
		}

		if ( $this->verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Update a set of rows in the categorylinks table
	 *
	 * In dry-run mode nothing is written; only rows whose sort key would
	 * change are counted.
	 *
	 * @param IResultWrapper $res Rows as selected by execute()
	 */
	private function updateBatch( IResultWrapper $res ): void {
		if ( !$this->dryRun ) {
			$this->beginTransaction( $this->dbw, __METHOD__ );
		}
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted.
				if ( $row->cl_sortkey === $title->getText()
					|| $row->cl_sortkey === $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, so use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
			$newSortKey = $this->collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			// The histogram records the untruncated length.
			$this->updateSortKeySizeHistogram( $newSortKey );
			// Truncate to 230 bytes to avoid DB error
			$newSortKey = substr( $newSortKey, 0, self::MAX_SORTKEY_BYTES );

			if ( $this->dryRun ) {
				// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
				// other fields, if any, those usually only happen when upgrading old MediaWikis.)
				$this->numRowsProcessed += (int)( $row->cl_sortkey !== $newSortKey );
			} else {
				$this->dbw->newUpdateQueryBuilder()
					->update( 'categorylinks' )
					->set( [
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $this->collationName,
						'cl_type' => $type,
						// NOTE(review): presumably a self-assignment to keep the
						// existing timestamp from being refreshed - confirm.
						'cl_timestamp = cl_timestamp',
					] )
					->where( [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ] )
					->caller( __METHOD__ )
					->execute();
				$this->numRowsProcessed++;
			}
		}
		if ( !$this->dryRun ) {
			$this->commitTransaction( $this->dbw, __METHOD__ );
		}
	}

	/**
	 * Copy a set of rows to the target table
	 *
	 * Sort keys for the whole batch are requested from the collation in a
	 * single getSortKeys() call. In dry-run mode the rows are built and
	 * counted but not inserted.
	 *
	 * @param IResultWrapper $res Rows as selected by execute()
	 * @throws RuntimeException If the collation returns no sort key for a row
	 */
	private function copyBatch( IResultWrapper $res ): void {
		$sortKeyInputs = [];
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			$sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
		}
		$sortKeys = $this->collation->getSortKeys( $sortKeyInputs );

		$rowsToInsert = [];
		foreach ( $res as $i => $row ) {
			if ( !isset( $sortKeys[$i] ) ) {
				throw new RuntimeException( 'Unable to get sort key' );
			}
			$newSortKey = $sortKeys[$i];
			// The histogram records the untruncated length.
			$this->updateSortKeySizeHistogram( $newSortKey );
			// Truncate to 230 bytes to avoid DB error
			$newSortKey = substr( $newSortKey, 0, self::MAX_SORTKEY_BYTES );
			$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
			$rowsToInsert[] = [
				'cl_from' => $row->cl_from,
				'cl_to' => $row->cl_to,
				'cl_sortkey' => $newSortKey,
				'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
				'cl_collation' => $this->collationName,
				'cl_type' => $type,
				'cl_timestamp' => $row->cl_timestamp
			];
		}

		if ( $this->dryRun ) {
			$this->numRowsProcessed += count( $rowsToInsert );
		} else {
			$this->beginTransaction( $this->dbw, __METHOD__ );
			$this->dbw->newInsertQueryBuilder()
				->insertInto( $this->targetTable )
				->ignore()
				->rows( $rowsToInsert )
				->caller( __METHOD__ )->execute();
			// Count what was actually inserted; the insert is issued with ignore().
			$this->numRowsProcessed += $this->dbw->affectedRows();
			$this->commitTransaction( $this->dbw, __METHOD__ );
		}
	}

	/**
	 * Update the verbose statistics
	 *
	 * Does nothing unless --verbose-stats is in effect.
	 *
	 * @param string $key Sort key whose length in bytes is to be recorded
	 */
	private function updateSortKeySizeHistogram( string $key ): void {
		if ( !$this->verboseStats ) {
			return;
		}
		$length = strlen( $key );
		$this->sizeHistogram[$length] = ( $this->sizeHistogram[$length] ?? 0 ) + 1;
	}

	/**
	 * Show the verbose statistics
	 *
	 * Prints the raw per-length counts, followed by a coarse histogram with
	 * one bar of asterisks per bin. Prints nothing when no sort keys were
	 * recorded or when all of them were empty.
	 */
	private function showSortKeySizeHistogram(): void {
		if ( !$this->sizeHistogram ) {
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength === 0 ) {
			return;
		}

		$numBins = self::HISTOGRAM_BINS;
		$lastBin = $numBins - 1;
		$coarseHistogram = array_fill( 0, $numBins, 0 );

		// Exclusive upper boundary of each bin; the last bin always ends just
		// past the longest key so that nothing falls outside.
		$coarseBoundaries = [];
		$boundary = 0;
		for ( $i = 0; $i < $lastBin; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = (int)round( $boundary );
		}
		$coarseBoundaries[$lastBin] = $maxLength + 1;

		$rawCounts = [];
		for ( $length = 0; $length <= $maxLength; $length++ ) {
			$count = $this->sizeHistogram[$length] ?? 0;
			$rawCounts[] = $count;

			// First bin whose upper boundary lies above this length,
			// falling back to the last bin.
			$bin = $lastBin;
			for ( $candidate = 0; $candidate < $lastBin; $candidate++ ) {
				if ( $coarseBoundaries[$candidate] > $length ) {
					$bin = $candidate;
					break;
				}
			}
			$coarseHistogram[$bin] += $count;
		}
		$raw = implode( ', ', $rawCounts );

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		$prevBoundary = 0;
		foreach ( $coarseHistogram as $coarseIndex => $val ) {
			$boundary = $coarseBoundaries[$coarseIndex];
			// Scale each bar individually. A single truncated integer factor
			// becomes 0 as soon as a bin holds more rows than the bar width,
			// which blanks the whole chart. Rounding up keeps every non-empty
			// bin visible.
			$barLength = $maxBinVal > 0
				? (int)ceil( self::HISTOGRAM_BAR_WIDTH * $val / $maxBinVal )
				: 0;
			$this->output(
				sprintf( "%-10s %-10d |%s\n",
					$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
					$val,
					str_repeat( '*', $barLength )
				)
			);
			$prevBoundary = $boundary;
		}
	}
}
398 | |
399 | // @codeCoverageIgnoreStart |
// Script bootstrap: name the maintenance class, then include the runner. NOTE(review): presumably the runner only executes it when this file is the entry script - confirm against Maintenance.php.
400 | $maintClass = UpdateCollation::class; |
401 | require_once RUN_MAINTENANCE_IF_MAIN; |
402 | // @codeCoverageIgnoreEnd |