Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 267
0.00% covered (danger)
0.00%
0 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
UpdateCollation
0.00% covered (danger)
0.00%
0 / 267
0.00% covered (danger)
0.00%
0 / 8
2652
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 25
0.00% covered (danger)
0.00%
0 / 1
2
 init
0.00% covered (danger)
0.00%
0 / 25
0.00% covered (danger)
0.00%
0 / 1
12
 execute
0.00% covered (danger)
0.00%
0 / 58
0.00% covered (danger)
0.00%
0 / 1
182
 updateBatch
0.00% covered (danger)
0.00%
0 / 33
0.00% covered (danger)
0.00%
0 / 1
72
 copyBatch
0.00% covered (danger)
0.00%
0 / 33
0.00% covered (danger)
0.00%
0 / 1
30
 updateSortKeySizeHistogram
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
12
 showSortKeySizeHistogram
0.00% covered (danger)
0.00%
0 / 40
0.00% covered (danger)
0.00%
0 / 1
110
 runNormalizationMigration
0.00% covered (danger)
0.00%
0 / 47
0.00% covered (danger)
0.00%
0 / 1
72
1<?php
2/**
3 * Find all rows in the categorylinks table whose collation is out-of-date
4 * (collation_name != $wgCategoryCollation) and repopulate cl_sortkey
5 * using the page title and cl_sortkey_prefix.
6 *
7 * @license GPL-2.0-or-later
8 * @file
9 * @ingroup Maintenance
10 * @author Aryeh Gregor (Simetrical)
11 */
12
13// @codeCoverageIgnoreStart
14require_once __DIR__ . '/Maintenance.php';
15// @codeCoverageIgnoreEnd
16
17use MediaWiki\Logger\LoggerFactory;
18use MediaWiki\MainConfigNames;
19use MediaWiki\Maintenance\Maintenance;
20use MediaWiki\Storage\NameTableStore;
21use MediaWiki\Title\NamespaceInfo;
22use MediaWiki\Title\Title;
23use Wikimedia\Rdbms\IMaintainableDatabase;
24use Wikimedia\Rdbms\IReadableDatabase;
25use Wikimedia\Rdbms\IResultWrapper;
26
27/**
28 * Maintenance script that will find all rows in the categorylinks table
29 * whose collation is out-of-date.
30 *
31 * @ingroup Maintenance
32 */
33class UpdateCollation extends Maintenance {
34    /** @var int[] */
35    public $sizeHistogram = [];
36
37    /** @var int */
38    private $numRowsProcessed = 0;
39
40    /** @var bool */
41    private $force;
42
43    /** @var bool */
44    private $dryRun;
45
46    /** @var bool */
47    private $verboseStats;
48
49    /** @var Collation */
50    private $collation;
51
52    /** @var string */
53    private $collationName;
54
55    /** @var string|null */
56    private $targetTable;
57
58    private bool $normalization = false;
59
60    /** @var IReadableDatabase */
61    private $dbr;
62
63    /** @var IMaintainableDatabase */
64    private $dbw;
65
66    private NamespaceInfo $namespaceInfo;
67    private NameTableStore $collationNameStore;
68
69    public function __construct() {
70        parent::__construct();
71
72        $this->addDescription( <<<TEXT
73This script will find all rows in the categorylinks table whose collation is
74out-of-date (collation_name is not the same as \$wgCategoryCollation) and
75repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
76collations are up-to-date, it will do nothing.
77TEXT
78        );
79
80        $this->setBatchSize( 100 );
81        $this->addOption( 'force', 'Run on all rows, even if the collation is ' .
82            'supposed to be up-to-date.', false, false, 'f' );
83        $this->addOption( 'previous-collation', 'Set the previous value of ' .
84            '$wgCategoryCollation here to speed up this script, especially if your ' .
85            'categorylinks table is large. This will only update rows with that ' .
86            'collation, though, so it may miss out-of-date rows with a different, ' .
87            'even older collation.', false, true );
88        $this->addOption( 'target-collation', 'Set this to the new collation type to ' .
89            'use instead of $wgCategoryCollation. Usually you should not use this, ' .
90            'you should just update $wgCategoryCollation in LocalSettings.php.',
91            false, true );
92        $this->addOption( 'target-table', 'Copy rows from categorylinks into the ' .
93            'specified table instead of updating them in place.', false, true );
94        $this->addOption( 'only-migrate-normalization', 'Only backfill cl_collation_id ' .
95            'field from cl_collation', false );
96        $this->addOption( 'remote', 'Use Shellbox to calculate the new sort keys ' .
97            'remotely.' );
98        $this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
99            'compile statistics.' );
100        $this->addOption( 'verbose-stats', 'Show more statistics.' );
101    }
102
103    /**
104     * Get services and initialise member variables
105     */
106    private function init() {
107        $services = $this->getServiceContainer();
108        $this->namespaceInfo = $services->getNamespaceInfo();
109        $this->collationNameStore = new NameTableStore(
110            $this->getServiceContainer()->getDBLoadBalancer(),
111            $this->getServiceContainer()->getMainWANObjectCache(),
112            LoggerFactory::getInstance( 'SecondaryDataUpdate' ),
113            'collation',
114            'collation_id',
115            'collation_name'
116        );
117
118        if ( $this->hasOption( 'target-collation' ) ) {
119            $this->collationName = $this->getOption( 'target-collation' );
120        } else {
121            $this->collationName = $this->getConfig()->get( MainConfigNames::CategoryCollation );
122        }
123        if ( $this->hasOption( 'remote' ) ) {
124            $realCollationName = 'remote-' . $this->collationName;
125        } else {
126            $realCollationName = $this->collationName;
127        }
128        $this->collation = $services->getCollationFactory()->makeCollation( $realCollationName );
129
130        // Collation check: in some cases the constructor will work,
131        // but this will raise an exception, breaking all category pages
132        $this->collation->getSortKey( 'MediaWiki' );
133
134        $this->force = $this->getOption( 'force' );
135        $this->dryRun = $this->getOption( 'dry-run' );
136        $this->verboseStats = $this->getOption( 'verbose-stats' );
137        $this->dbw = $this->getDB( DB_PRIMARY );
138        $this->dbr = $this->getReplicaDB();
139        $this->targetTable = $this->getOption( 'target-table' );
140        $this->normalization = $this->getOption( 'only-migrate-normalization', false );
141    }
142
143    public function execute() {
144        $this->init();
145        $batchSize = $this->getBatchSize();
146
147        if ( $this->normalization ) {
148            $this->runNormalizationMigration();
149            return;
150        }
151
152        if ( $this->targetTable ) {
153            if ( !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
154                $this->output( "Creating table {$this->targetTable}\n" );
155                $this->dbw->query(
156                    'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
157                    ' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
158                    __METHOD__
159                );
160            }
161        }
162
163        $collationConds = [];
164        if ( !$this->force && !$this->targetTable ) {
165            if ( $this->hasOption( 'previous-collation' ) ) {
166                $collationConds['collation_name'] = $this->getOption( 'previous-collation' );
167            } else {
168                $collationConds[] = $this->dbr->expr( 'collation_name', '!=', $this->collationName );
169            }
170        }
171        $maxPageId = (int)$this->dbr->newSelectQueryBuilder()
172            ->select( 'MAX(page_id)' )
173            ->from( 'page' )
174            ->caller( __METHOD__ )->fetchField();
175        $batchValue = 0;
176        do {
177            $this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );
178
179            // cl_type must be selected as a number for proper paging because
180            // enums suck.
181            if ( $this->dbw->getType() === 'mysql' ) {
182                $clType = 'cl_type+0 AS "cl_type_numeric"';
183            } else {
184                $clType = 'cl_type';
185            }
186            $res = $this->dbw->newSelectQueryBuilder()
187                ->select( [
188                    'cl_from', 'cl_target_id', 'cl_sortkey_prefix', 'cl_sortkey', $clType,
189                    'cl_timestamp', 'collation_name', 'page_namespace', 'page_title'
190                ] )
191                ->from( 'categorylinks' )
192                ->join( 'collation', null, 'cl_collation_id = collation_id' )
193                // per T58041
194                ->straightJoin( 'page', null, 'cl_from = page_id' )
195                ->where( $collationConds )
196                ->andWhere(
197                    $this->dbw->expr( 'cl_from', '>=', $batchValue )
198                        ->and( 'cl_from', '<', $batchValue + $this->getBatchSize() )
199                )
200                ->orderBy( 'cl_from' )
201                ->caller( __METHOD__ )->fetchResultSet();
202            $this->output( "processing... " );
203
204            if ( $res->numRows() ) {
205                if ( $this->targetTable ) {
206                    $this->copyBatch( $res );
207                } else {
208                    $this->updateBatch( $res );
209                }
210            }
211            $batchValue += $this->getBatchSize();
212
213            if ( $this->dryRun ) {
214                $this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
215            } else {
216                $this->output( "{$this->numRowsProcessed} done.\n" );
217            }
218        } while ( $maxPageId >= $batchValue );
219
220        if ( !$this->dryRun ) {
221            $this->output( "{$this->numRowsProcessed} rows processed\n" );
222        }
223
224        if ( $this->verboseStats ) {
225            $this->output( "\n" );
226            $this->showSortKeySizeHistogram();
227        }
228    }
229
230    /**
231     * Update a set of rows in the categorylinks table
232     */
233    private function updateBatch( IResultWrapper $res ) {
234        if ( !$this->dryRun ) {
235            $this->beginTransactionRound( __METHOD__ );
236        }
237        foreach ( $res as $row ) {
238            $title = Title::newFromRow( $row );
239            if ( !$row->collation_name ) {
240                # This is an old-style row, so the sortkey needs to be
241                # converted.
242                if ( $row->cl_sortkey === $title->getText()
243                    || $row->cl_sortkey === $title->getPrefixedText()
244                ) {
245                    $prefix = '';
246                } else {
247                    # Custom sortkey, so use it as a prefix
248                    $prefix = $row->cl_sortkey;
249                }
250            } else {
251                $prefix = $row->cl_sortkey_prefix;
252            }
253            # cl_type will be wrong for lots of pages if cl_collation is 0,
254            # so let's update it while we're here.
255            $type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
256            $newSortKey = $this->collation->getSortKey(
257                $title->getCategorySortkey( $prefix ) );
258            $this->updateSortKeySizeHistogram( $newSortKey );
259            // Truncate to 230 bytes to avoid DB error
260            $newSortKey = substr( $newSortKey, 0, 230 );
261
262            if ( $this->dryRun ) {
263                // Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
264                // other fields, if any, those usually only happen when upgrading old MediaWikis.)
265                $this->numRowsProcessed += ( $row->cl_sortkey !== $newSortKey );
266            } else {
267                $collationId = $this->collationNameStore->acquireId( $this->collationName );
268                $this->dbw->newUpdateQueryBuilder()
269                    ->update( 'categorylinks' )
270                    ->set( [
271                        'cl_sortkey' => $newSortKey,
272                        'cl_sortkey_prefix' => $prefix,
273                        'cl_collation_id' => $collationId,
274                        'cl_type' => $type,
275                        'cl_timestamp = cl_timestamp',
276                    ] )
277                    ->where( [ 'cl_from' => $row->cl_from, 'cl_target_id' => $row->cl_target_id ] )
278                    ->caller( __METHOD__ )
279                    ->execute();
280                $this->numRowsProcessed++;
281            }
282        }
283        if ( !$this->dryRun ) {
284            $this->commitTransactionRound( __METHOD__ );
285        }
286    }
287
288    /**
289     * Copy a set of rows to the target table
290     */
291    private function copyBatch( IResultWrapper $res ) {
292        $sortKeyInputs = [];
293        foreach ( $res as $row ) {
294            $title = Title::newFromRow( $row );
295            $sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
296        }
297        $sortKeys = $this->collation->getSortKeys( $sortKeyInputs );
298        $rowsToInsert = [];
299        foreach ( $res as $i => $row ) {
300            if ( !isset( $sortKeys[$i] ) ) {
301                throw new RuntimeException( 'Unable to get sort key' );
302            }
303            $newSortKey = $sortKeys[$i];
304            $this->updateSortKeySizeHistogram( $newSortKey );
305            // Truncate to 230 bytes to avoid DB error
306            $newSortKey = substr( $newSortKey, 0, 230 );
307            $type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
308            $collationId = $this->collationNameStore->acquireId( $this->collationName );
309            $rowsToInsert[] = [
310                'cl_from' => $row->cl_from,
311                'cl_target_id' => $row->cl_target_id,
312                'cl_sortkey' => $newSortKey,
313                'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
314                'cl_collation_id' => $collationId,
315                'cl_type' => $type,
316                'cl_timestamp' => $row->cl_timestamp
317            ];
318        }
319        if ( $this->dryRun ) {
320            $this->numRowsProcessed += count( $rowsToInsert );
321        } else {
322            $this->beginTransactionRound( __METHOD__ );
323            $this->dbw->newInsertQueryBuilder()
324                ->insertInto( $this->targetTable )
325                ->ignore()
326                ->rows( $rowsToInsert )
327                ->caller( __METHOD__ )->execute();
328            $this->numRowsProcessed += $this->dbw->affectedRows();
329            $this->commitTransactionRound( __METHOD__ );
330        }
331    }
332
333    /**
334     * Update the verbose statistics
335     */
336    private function updateSortKeySizeHistogram( string $key ) {
337        if ( !$this->verboseStats ) {
338            return;
339        }
340        $length = strlen( $key );
341        if ( !isset( $this->sizeHistogram[$length] ) ) {
342            $this->sizeHistogram[$length] = 0;
343        }
344        $this->sizeHistogram[$length]++;
345    }
346
347    /**
348     * Show the verbose statistics
349     */
350    private function showSortKeySizeHistogram() {
351        if ( !$this->sizeHistogram ) {
352            return;
353        }
354        $maxLength = max( array_keys( $this->sizeHistogram ) );
355        if ( $maxLength === 0 ) {
356            return;
357        }
358        $numBins = 20;
359        $coarseHistogram = array_fill( 0, $numBins, 0 );
360        $coarseBoundaries = [];
361        $boundary = 0;
362        for ( $i = 0; $i < $numBins - 1; $i++ ) {
363            $boundary += $maxLength / $numBins;
364            $coarseBoundaries[$i] = round( $boundary );
365        }
366        $coarseBoundaries[$numBins - 1] = $maxLength + 1;
367        $raw = '';
368        for ( $i = 0; $i <= $maxLength; $i++ ) {
369            if ( $raw !== '' ) {
370                $raw .= ', ';
371            }
372            $val = $this->sizeHistogram[$i] ?? 0;
373            for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
374                if ( $coarseBoundaries[$coarseIndex] > $i ) {
375                    $coarseHistogram[$coarseIndex] += $val;
376                    break;
377                }
378            }
379            if ( $coarseIndex === ( $numBins - 1 ) ) {
380                $coarseHistogram[$coarseIndex] += $val;
381            }
382            $raw .= $val;
383        }
384
385        $this->output( "Sort key size histogram\nRaw data: $raw\n\n" );
386
387        $maxBinVal = max( $coarseHistogram );
388        $scale = (int)( 60 / $maxBinVal );
389        $prevBoundary = 0;
390        for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
391            $val = $coarseHistogram[$coarseIndex] ?? 0;
392            $boundary = $coarseBoundaries[$coarseIndex];
393            $this->output(
394                sprintf( "%-10s %-10d |%s\n",
395                    $prevBoundary . '-' . ( $boundary - 1 ) . ': ',
396                    $val,
397                    str_repeat( '*', $scale * $val )
398                )
399            );
400            $prevBoundary = $boundary;
401        }
402    }
403
404    private function runNormalizationMigration() {
405        if ( !$this->dbw->fieldExists( 'categorylinks', 'cl_collation', __METHOD__ ) ) {
406            $this->output( "The cl_collation column appears to already be normalized. Skipping.\n" );
407            return;
408        }
409        if ( !$this->dbw->fieldExists( 'categorylinks', 'cl_collation_id', __METHOD__ ) ) {
410            $this->output( "The cl_collation_id column doesn't exist. Run update.php to create it.\n" );
411            return;
412        }
413        if ( !$this->dbw->tableExists( 'collation', __METHOD__ ) ) {
414            $this->output( "The collation table doesn't exist. Run update.php to create it.\n" );
415            return;
416        }
417
418        $maxPageId = (int)$this->dbr->newSelectQueryBuilder()
419            ->select( 'MAX(page_id)' )
420            ->from( 'page' )
421            ->caller( __METHOD__ )->fetchField();
422        $batchValue = 0;
423        $batchSize = $this->getBatchSize();
424
425        do {
426            $this->output( "Selecting next $batchSize pages from cl_from = $batchValue... " );
427
428            $res = $this->dbw->newSelectQueryBuilder()
429                ->select( [ 'cl_collation' ] )
430                ->distinct()
431                ->from( 'categorylinks' )
432                ->where( [ 'cl_collation_id' => 0 ] )
433                ->andWhere(
434                    $this->dbw->expr( 'cl_from', '>=', $batchValue )
435                        ->and( 'cl_from', '<', $batchValue + $this->getBatchSize() )
436                )
437                ->caller( __METHOD__ )->fetchResultSet();
438            $this->output( "processing... " );
439
440            if ( $res->numRows() && !$this->dryRun ) {
441                foreach ( $res as $row ) {
442                    $collationName = $row->cl_collation;
443                    $collationId = $this->collationNameStore->acquireId( $collationName );
444                    $this->dbw->newUpdateQueryBuilder()
445                        ->update( 'categorylinks' )
446                        ->set( [ 'cl_collation_id' => $collationId ] )
447                        ->where( [ 'cl_collation' => $collationName ] )
448                        ->andWhere(
449                            $this->dbw->expr( 'cl_from', '>=', $batchValue )
450                                ->and( 'cl_from', '<', $batchValue + $this->getBatchSize() )
451                        )
452                        ->caller( __METHOD__ )->execute();
453                    $this->numRowsProcessed += $this->dbw->affectedRows();
454                }
455
456                $this->waitForReplication();
457            }
458            $batchValue += $this->getBatchSize();
459
460            $this->output( "{$this->numRowsProcessed} done.\n" );
461        } while ( $maxPageId >= $batchValue );
462
463        if ( !$this->dryRun ) {
464            $this->output( "{$this->numRowsProcessed} rows processed\n" );
465        }
466    }
467}
468
469// @codeCoverageIgnoreStart
470$maintClass = UpdateCollation::class;
471require_once RUN_MAINTENANCE_IF_MAIN;
472// @codeCoverageIgnoreEnd