MediaWiki master
recountCategories.php
Go to the documentation of this file.
1<?php
24// @codeCoverageIgnoreStart
25require_once __DIR__ . '/Maintenance.php';
26// @codeCoverageIgnoreEnd
27
/**
 * Exclusive lower bound on cat_id for the next batch. execute() resets it
 * from the "begin" option before each mode, and doWork() advances it to the
 * last cat_id returned by its batch query.
 */
36 private $minimumId;
37
/**
 * Registers the script description, its command-line options and the
 * default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( <<<'TEXT'
This script refreshes the category membership counts stored in the category
table. As time passes, these counts often drift from the actual number of
category members. The script identifies rows where the value in the category
table does not match the number of categorylinks rows for that category, and
updates the category table accordingly.

To fully refresh the data in the category table, you need to run this script
for all three modes. Alternatively, just one mode can be run if required.
TEXT
	);
	// The third and fourth arguments are $required and $withArg.
	$this->addOption(
		'mode',
		'(REQUIRED) Which category count column to recompute: "pages", "subcats", "files" or "all".',
		true,
		true
	);
	$this->addOption(
		'begin',
		'Only recount categories with cat_id greater than the given value',
		false,
		true
	);
	$this->addOption(
		'throttle',
		'Wait this many milliseconds after each batch. Default: 0',
		false,
		true
	);

	// execute() runs the cleanup whenever "pages" is among the processed
	// modes, which is the case for both --mode pages and --mode all.
	$this->addOption(
		'skip-cleanup',
		'Skip running cleanupEmptyCategories if the "pages" or "all" mode is selected',
		false,
		false
	);

	$this->setBatchSize( 500 );
}
79
/**
 * Recounts the requested count column(s) batch by batch, then either runs
 * cleanupEmptyCategories or, with --skip-cleanup, tells the user to run it,
 * whenever the "pages" counts were part of this run.
 */
public function execute() {
	$originalMode = $this->getOption( 'mode' );
	// Strict comparison: only the exact option strings are acceptable.
	if ( !in_array( $originalMode, [ 'pages', 'subcats', 'files', 'all' ], true ) ) {
		$this->fatalError( 'Please specify a valid mode: one of "pages", "subcats", "files" or "all".' );
	}

	if ( $originalMode === 'all' ) {
		$modes = [ 'pages', 'subcats', 'files' ];
	} else {
		$modes = [ $originalMode ];
	}

	// Resolve the throttle once, up front. The raw option is a string, so a
	// non-numeric or negative value would otherwise only fail inside the loop,
	// after the first batch had already been written. Such values mean "no wait".
	$throttle = $this->getOption( 'throttle', 0 );
	$throttleMicroseconds = is_numeric( $throttle ) ? max( 0, (int)( $throttle * 1000 ) ) : 0;

	foreach ( $modes as $mode ) {
		$this->output( "Starting to recount {$mode} counts.\n" );
		// Every mode starts again from the user-supplied lower bound.
		$this->minimumId = intval( $this->getOption( 'begin', 0 ) );

		// do the work, batch by batch
		$affectedRows = 0;
		// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
		while ( ( $result = $this->doWork( $mode ) ) !== false ) {
			$affectedRows += $result;
			if ( $throttleMicroseconds > 0 ) {
				usleep( $throttleMicroseconds );
			}
		}

		$this->output( "Updated the {$mode} counts of $affectedRows categories.\n" );
	}

	// Finished
	$this->output( "Done!\n" );
	if ( $originalMode !== 'all' ) {
		$this->output( "Now run the script using the other --mode options if you haven't already.\n" );
	}

	if ( in_array( 'pages', $modes, true ) ) {
		if ( $this->hasOption( 'skip-cleanup' ) ) {
			$this->output(
				"Also run 'php cleanupEmptyCategories.php --mode remove' to remove empty,\n" .
				"nonexistent categories from the category table.\n\n" );
		} else {
			$this->output( "Running cleanupEmptyCategories.php\n" );
			$cleanup = $this->runChild( CleanupEmptyCategories::class );
			'@phan-var CleanupEmptyCategories $cleanup';
			// Pass no options into the child because of a parameter collision between "mode", which
			// both scripts use but set to different values. We'll just use the defaults.
			$cleanup->loadParamsAndArgs( $this->mSelf, [], [] );
			// Force execution because we want to run it regardless of whether it's been run before.
			$cleanup->setForce( true );
			$cleanup->execute();
		}
	}
}
131
/**
 * Processes one batch: finds categories above the current minimum cat_id
 * whose stored count differs from the number of matching categorylinks
 * rows, then rewrites those counts on the primary database.
 *
 * @param string $mode Count column suffix: "pages", "subcats" or "files"
 * @return int|false Number of category rows changed in this batch, or false
 *  when the batch query finds no more drifted rows
 */
protected function doWork( $mode ) {
	$this->output( "Finding up to {$this->getBatchSize()} drifted rows " .
		"greater than cat_id {$this->minimumId}...\n" );

	$dbr = $this->getDB( DB_REPLICA, 'vslow' );
	// Correlated subquery counting the links that point at the outer category
	// row. "pages" counts every link type, so it gets no cl_type filter.
	$queryBuilder = $dbr->newSelectQueryBuilder()
		->select( 'COUNT(*)' )
		->from( 'categorylinks' )
		->where( 'cl_to = cat_title' );
	if ( $mode === 'subcats' ) {
		$queryBuilder->andWhere( [ 'cl_type' => 'subcat' ] );
	} elseif ( $mode === 'files' ) {
		$queryBuilder->andWhere( [ 'cl_type' => 'file' ] );
	}

	$countingSubquery = $queryBuilder->caller( __METHOD__ )->getSQL();

	// First, let's find out which categories have drifted and need to be updated.
	// The query counts the categorylinks for each category on the replica DB,
	// but this data can't be used for updating the master, so we don't include it
	// in the results.
	// The explicit ordering matters: the last id of the batch becomes the
	// lower bound of the next one, so without it a LIMITed result in arbitrary
	// order could skip drifted rows with smaller ids.
	$idsToUpdate = $dbr->newSelectQueryBuilder()
		->select( 'cat_id' )
		->from( 'category' )
		->where( [ $dbr->expr( 'cat_id', '>', (int)$this->minimumId ), "cat_{$mode} != ($countingSubquery)" ] )
		->orderBy( 'cat_id' )
		->limit( $this->getBatchSize() )
		->caller( __METHOD__ )->fetchFieldValues();
	if ( !$idsToUpdate ) {
		return false;
	}
	$this->output( "Updating cat_{$mode} field on " .
		count( $idsToUpdate ) . " rows...\n" );

	// In the next batch, start where this query left off. The rows selected
	// in this iteration shouldn't be selected again after being updated, but
	// we still keep track of where we are up to, as extra protection against
	// infinite loops.
	$this->minimumId = (int)end( $idsToUpdate );

	// Now, on master, find the correct counts for these categories.
	$dbw = $this->getPrimaryDB();
	$res = $dbw->newSelectQueryBuilder()
		->select( [ 'cat_id', 'count' => "($countingSubquery)" ] )
		->from( 'category' )
		->where( [ 'cat_id' => $idsToUpdate ] )
		->caller( __METHOD__ )->fetchResultSet();

	// Update the category counts on the rows we just identified.
	// This logic is equivalent to Category::refreshCounts, except here, we
	// don't remove rows when cat_pages is zero and the category description page
	// doesn't exist - instead we print a suggestion to run
	// cleanupEmptyCategories.php.
	$affectedRows = 0;
	foreach ( $res as $row ) {
		// The != condition skips rows whose count already matches on the
		// primary, so only rows that really change are counted.
		$dbw->newUpdateQueryBuilder()
			->update( 'category' )
			->set( [ "cat_{$mode}" => $row->count ] )
			->where( [
				'cat_id' => $row->cat_id,
				$dbw->expr( "cat_{$mode}", '!=', (int)$row->count ),
			] )
			->caller( __METHOD__ )
			->execute();
		$affectedRows += $dbw->affectedRows();
	}

	$this->waitForReplication();

	return $affectedRows;
}
202}
203
// @codeCoverageIgnoreStart
// Name this script's class, then include the file behind RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): that constant is defined outside this file; presumably the
// included file runs $maintClass when this script is the entry point - confirm.
$maintClass = RecountCategories::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd
run()
Run the job.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
runChild( $maintClass, $classFile=null)
Returns an instance of the given maintenance script, with all of the current arguments passed to it.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Maintenance script that refreshes category membership counts in the category table.
__construct()
Default constructor.
execute()
Do the actual work.
const DB_REPLICA
Definition defines.php:26