Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 155 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
CleanupInvalidDbKeys | |
0.00% |
0 / 155 |
|
0.00% |
0 / 6 |
1406 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
outputStatus | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
writeToReport | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cleanupTable | |
0.00% |
0 / 129 |
|
0.00% |
0 / 1 |
756 | |||
makeValidTitle | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * Cleans up invalid titles in various tables. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup Maintenance |
22 | */ |
23 | |
24 | // @codeCoverageIgnoreStart |
25 | require_once __DIR__ . '/Maintenance.php'; |
26 | // @codeCoverageIgnoreEnd |
27 | |
28 | use MediaWiki\Title\Title; |
29 | use MediaWiki\WikiMap\WikiMap; |
30 | use Wikimedia\Rdbms\IExpression; |
31 | use Wikimedia\Rdbms\LikeValue; |
32 | |
/**
 * Maintenance script that cleans up invalid titles in various tables.
 *
 * For every configured table it selects one batch of rows whose title field
 * contains whitespace or has a leading/trailing underscore, writes them to the
 * report (STDOUT) and, when --fix is given, repairs or removes them using a
 * table-specific strategy.
 *
 * @since 1.29
 * @ingroup Maintenance
 */
class CleanupInvalidDbKeys extends Maintenance {
	/**
	 * @var array[] List of tables to clean up, and the field prefix for that table.
	 *  Each entry is [ table name, field prefix ] plus optional 'idField',
	 *  'nsField' and 'titleField' overrides for tables whose fields do not follow
	 *  the "{prefix}_id" / "{prefix}_namespace" / "{prefix}_title" pattern.
	 *  Integer overrides (0, 14, 6) are handed to the SELECT as-is; presumably they
	 *  are selected as constants for tables lacking such a column.
	 */
	protected static $tables = [
		// Data tables
		[ 'page', 'page' ],
		[ 'redirect', 'rd', 'idField' => 'rd_from' ],
		[ 'archive', 'ar' ],
		[ 'logging', 'log' ],
		[ 'protected_titles', 'pt', 'idField' => 0 ],
		[ 'category', 'cat', 'nsField' => 14 ],
		[ 'recentchanges', 'rc' ],
		[ 'watchlist', 'wl' ],
		// The querycache tables' qc(c)_title and qcc_titletwo may contain titles,
		// but also usernames or other things like that, so we leave them alone

		// Links tables
		[ 'pagelinks', 'pl', 'idField' => 'pl_from' ],
		[ 'templatelinks', 'tl', 'idField' => 'tl_from' ],
		[ 'categorylinks', 'cl', 'idField' => 'cl_from', 'nsField' => 14, 'titleField' => 'cl_to' ],
		[ 'imagelinks', 'il', 'idField' => 'il_from', 'nsField' => 6, 'titleField' => 'il_to' ],
	];

	/**
	 * Registers the script description, the --fix and --table options, and the
	 * batch size (500) used to limit each table's SELECT.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( <<<'TEXT'
			This script cleans up the title fields in various tables to remove entries that
			will be rejected by the constructor of TitleValue. This constructor throws an
			exception when invalid data is encountered, which will not normally occur on
			regular page views, but can happen on query special pages.

			The script targets titles matching the regular expression /^_|[ \r\n\t]|_$/.
			Because any foreign key relationships involving these titles will already be
			broken, the titles are corrected to a valid version or the rows are deleted
			entirely, depending on the table.

			The script runs with the expectation that STDOUT is redirected to a file.
			TEXT
		);
		$this->addOption( 'fix', 'Actually clean up invalid titles. If this parameter is ' .
			'not specified, the script will report invalid titles but not clean them up.',
			false, false );
		$this->addOption( 'table', 'The table(s) to process. This option can be specified ' .
			'more than once (e.g. -t category -t watchlist). If not specified, all available ' .
			'tables will be processed. Available tables are: ' .
			implode( ', ', array_column( static::$tables, 0 ) ), false, true, 't', true );

		$this->setBatchSize( 500 );
	}

	/**
	 * Processes every configured table, or only those named with --table, and
	 * prints a completion line.
	 */
	public function execute() {
		$tablesToProcess = $this->getOption( 'table' );
		foreach ( static::$tables as $tableParams ) {
			if ( !$tablesToProcess || in_array( $tableParams[0], $tablesToProcess, true ) ) {
				$this->cleanupTable( $tableParams );
			}
		}

		$this->outputStatus( 'Done!' );
		if ( $this->hasOption( 'fix' ) ) {
			$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
			$this->outputStatus( " Cleaned up invalid DB keys on $dbDomain!\n" );
		} else {
			// 'Done!' was written to STDERR without a newline so that the --fix
			// message could continue the same line; terminate it on dry runs.
			$this->outputStatus( "\n" );
		}
	}

	/**
	 * Prints progress text to both STDOUT and STDERR.
	 *
	 * STDERR receives the text verbatim. STDOUT receives the trimmed text prefixed
	 * with '*** ' (nothing is written for whitespace-only text), so that progress
	 * lines are easy to find when STDOUT is redirected to a report file.
	 *
	 * @param string $str Text to write to both places
	 * @param string|null $channel Ignored
	 */
	protected function outputStatus( $str, $channel = null ) {
		$trimmed = trim( $str );
		// Compare against '' explicitly: a bare truthiness test would drop "0"
		if ( $trimmed !== '' ) {
			// Make it easier to find progress lines in the STDOUT log
			fwrite( STDOUT, '*** ' . $trimmed . "\n" );
		}
		fwrite( STDERR, $str );
	}

	/**
	 * Prints text to STDOUT. Used for logging output.
	 *
	 * @param string $str Text to write
	 */
	protected function writeToReport( $str ) {
		fwrite( STDOUT, $str );
	}

	/**
	 * Identifies, and optionally cleans up, invalid titles.
	 *
	 * Only one batch (see getBatchSize()) of invalid rows is selected per call;
	 * there is no loop over further batches, so a table with more invalid rows
	 * than the batch size needs several runs.
	 *
	 * @param array $tableParams A child array of self::$tables
	 */
	protected function cleanupTable( $tableParams ) {
		[ $table, $prefix ] = $tableParams;
		$idField = $tableParams['idField'] ?? "{$prefix}_id";
		$nsField = $tableParams['nsField'] ?? "{$prefix}_namespace";
		$titleField = $tableParams['titleField'] ?? "{$prefix}_title";

		$this->outputStatus( "Looking for invalid $titleField entries in $table...\n" );

		// Do all the select queries on the replicas, as they are slow (they use
		// unanchored LIKEs). Naturally this could cause problems if rows are
		// modified after selecting and before deleting/updating, but working on
		// the hypothesis that invalid rows will be old and in all likelihood
		// unreferenced, we should be fine to do it like this.
		$dbr = $this->getDB( DB_REPLICA, 'vslow' );
		$linksMigration = $this->getServiceContainer()->getLinksMigration();
		$joinConds = [];
		$tables = [ $table ];
		if ( isset( $linksMigration::$mapping[$table] ) ) {
			// Tables handled by LinksMigration supply their own title fields and joins
			[ $nsField, $titleField ] = $linksMigration->getTitleFields( $table );
			$queryInfo = $linksMigration->getQueryInfo( $table );
			$joinConds = $queryInfo['joins'];
			$tables = $queryInfo['tables'];
		}

		// Find all TitleValue-invalid titles.
		$percent = $dbr->anyString();
		// The REGEXP operator is not cross-DBMS, so we have to use lots of LIKEs
		$likeExpr = $dbr
			->expr( $titleField, IExpression::LIKE, new LikeValue( $percent, ' ', $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\r", $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\n", $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\t", $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( '_', $percent ) )
			->or( $titleField, IExpression::LIKE, new LikeValue( $percent, '_' ) );
		$res = $dbr->newSelectQueryBuilder()
			->select( [
				'id' => $idField,
				'ns' => $nsField,
				'title' => $titleField,
			] )
			->tables( $tables )
			->where( $likeExpr )
			->joinConds( $joinConds )
			->limit( $this->getBatchSize() )
			->caller( __METHOD__ )
			->fetchResultSet();

		$this->outputStatus( "Number of invalid rows: " . $res->numRows() . "\n" );
		if ( !$res->numRows() ) {
			$this->outputStatus( "\n" );
			return;
		}

		// Write a table of titles to the report file. Also keep a list of the found
		// IDs, as we might need it later for DB updates
		$this->writeToReport( sprintf( "%10s | ns | dbkey\n", $idField ) );
		$ids = [];
		foreach ( $res as $row ) {
			$this->writeToReport( sprintf( "%10d | %3d | %s\n", $row->id, $row->ns, $row->title ) );
			$ids[] = $row->id;
		}

		// If we're doing a dry run, output the new titles we would use for the UPDATE
		// queries (if relevant), and finish
		if ( !$this->hasOption( 'fix' ) ) {
			if ( $table === 'logging' || $table === 'archive' ) {
				$this->writeToReport( "The following updates would be run with the --fix flag:\n" );
				foreach ( $res as $row ) {
					$newTitle = self::makeValidTitle( $row->title );
					$this->writeToReport(
						"$idField={$row->id}: update '{$row->title}' to '$newTitle'\n" );
				}
			}

			// --fix does nothing for page/redirect, so don't suggest it there
			if ( $table !== 'page' && $table !== 'redirect' ) {
				$this->outputStatus( "Run with --fix to clean up these rows\n" );
			}
			$this->outputStatus( "\n" );
			return;
		}

		// Fix the bad data, using different logic for the various tables
		$dbw = $this->getPrimaryDB();
		switch ( $table ) {
			case 'page':
			case 'redirect':
				// This shouldn't happen on production wikis, and we already have a script
				// to handle 'page' rows anyway, so just notify the user and let them decide
				// what to do next.
				$this->outputStatus( <<<TEXT
					IMPORTANT: This script does not fix invalid entries in the $table table.
					Consider repairing these rows, and rows in related tables, by hand.
					You may like to run, or borrow logic from, the cleanupTitles.php script.

					TEXT
				);
				break;

			case 'archive':
			case 'logging':
				$this->renameInvalidTitles( $dbw, $res, $table, $idField, $titleField );
				break;

			case 'recentchanges':
			case 'watchlist':
			case 'category':
				$this->deleteRowsById( $dbw, $table, $idField, $ids );
				break;

			case 'protected_titles':
				$this->deleteRowsByTitle( $dbw, $res, $table, $nsField, $titleField );
				break;

			case 'pagelinks':
			case 'templatelinks':
			case 'categorylinks':
			case 'imagelinks':
				$this->refreshOrDeleteLinks( $dbw, $res, $table, $idField, $nsField, $titleField );
				break;
		}

		$this->outputStatus( "\n" );
	}

	/**
	 * Renames each invalid title to a corrected equivalent (archive, logging).
	 *
	 * Any foreign key relationships to the page_title field are already broken,
	 * so this will just make sure users can still access the log entries/deleted
	 * revisions from the interface using a valid page title.
	 *
	 * @param \Wikimedia\Rdbms\IDatabase $dbw Primary database connection
	 * @param iterable $res Rows with ->id and ->title
	 * @param string $table
	 * @param string $idField
	 * @param string $titleField
	 */
	private function renameInvalidTitles( $dbw, $res, $table, $idField, $titleField ) {
		$this->outputStatus(
			"Updating these rows, setting $titleField to the closest valid DB key...\n" );
		$affectedRowCount = 0;
		foreach ( $res as $row ) {
			$newTitle = self::makeValidTitle( $row->title );
			$this->writeToReport(
				"$idField={$row->id}: updating '{$row->title}' to '$newTitle'\n" );

			$dbw->newUpdateQueryBuilder()
				->update( $table )
				->set( [ $titleField => $newTitle ] )
				->where( [ $idField => $row->id ] )
				->caller( __METHOD__ )
				->execute();
			$affectedRowCount += $dbw->affectedRows();
		}
		$this->waitForReplication();
		$this->outputStatus( "Updated $affectedRowCount rows on $table.\n" );
	}

	/**
	 * Deletes the invalid rows by ID (recentchanges, watchlist, category).
	 *
	 * Since these broken titles can't exist, there's really nothing to watch,
	 * nothing can be categorised in them, and they can't have been changed
	 * recently, so we can just remove these rows.
	 *
	 * @param \Wikimedia\Rdbms\IDatabase $dbw Primary database connection
	 * @param string $table
	 * @param string $idField
	 * @param array $ids IDs of the rows to delete
	 */
	private function deleteRowsById( $dbw, $table, $idField, array $ids ) {
		$this->outputStatus( "Deleting invalid $table rows...\n" );
		$dbw->newDeleteQueryBuilder()
			->deleteFrom( $table )
			->where( [ $idField => $ids ] )
			->caller( __METHOD__ )->execute();
		// Read the count immediately after the DELETE, as the other strategies do,
		// so that nothing run while waiting for replication can affect it.
		$deletedRowCount = $dbw->affectedRows();
		$this->waitForReplication();
		$this->outputStatus( 'Deleted ' . $deletedRowCount . " rows from $table.\n" );
	}

	/**
	 * Deletes the invalid rows one by one, matching on namespace and title
	 * (protected_titles).
	 *
	 * Since these broken titles can't exist, there's really nothing to protect,
	 * so we can just remove these rows. Made more complicated by this table
	 * not having an ID field.
	 *
	 * @param \Wikimedia\Rdbms\IDatabase $dbw Primary database connection
	 * @param iterable $res Rows with ->ns and ->title
	 * @param string $table
	 * @param string $nsField
	 * @param string $titleField
	 */
	private function deleteRowsByTitle( $dbw, $res, $table, $nsField, $titleField ) {
		$this->outputStatus( "Deleting invalid $table rows...\n" );
		$affectedRowCount = 0;
		foreach ( $res as $row ) {
			$dbw->newDeleteQueryBuilder()
				->deleteFrom( $table )
				->where( [ $nsField => $row->ns, $titleField => $row->title ] )
				->caller( __METHOD__ )->execute();
			$affectedRowCount += $dbw->affectedRows();
		}
		$this->waitForReplication();
		$this->outputStatus( "Deleted $affectedRowCount rows from $table.\n" );
	}

	/**
	 * Handles invalid rows in the links tables.
	 *
	 * Update links tables for each page where these bogus links are supposedly
	 * located. If the invalid rows don't go away after these jobs go through,
	 * they're probably being added by a buggy hook. Rows whose source page does
	 * not exist are deleted directly.
	 *
	 * @param \Wikimedia\Rdbms\IDatabase $dbw Primary database connection
	 * @param iterable $res Rows with ->id (source page ID), ->ns and ->title
	 * @param string $table
	 * @param string $idField
	 * @param string|int $nsField Namespace column, or an integer literal when the
	 *  table has no namespace column
	 * @param string $titleField
	 */
	private function refreshOrDeleteLinks( $dbw, $res, $table, $idField, $nsField, $titleField ) {
		$this->outputStatus( "Queueing link update jobs for the pages in $idField...\n" );
		$services = $this->getServiceContainer();
		$linksMigration = $services->getLinksMigration();
		$wikiPageFactory = $services->getWikiPageFactory();
		$usesLinksMigration = isset( $linksMigration::$mapping[$table] );

		// A page may own several invalid links; refresh each page only once
		$refreshedPageIds = [];
		foreach ( $res as $row ) {
			if ( isset( $refreshedPageIds[$row->id] ) ) {
				continue;
			}
			if ( $wikiPageFactory->newFromID( $row->id ) ) {
				RefreshLinks::fixLinksFromArticle( $row->id );
				$refreshedPageIds[$row->id] = true;
				continue;
			}

			if ( $usesLinksMigration ) {
				$conds = $linksMigration->getLinksConditions(
					$table,
					Title::makeTitle( (int)$row->ns, $row->title )
				);
			} else {
				$conds = [ $titleField => $row->title ];
				// For categorylinks/imagelinks $nsField is an integer literal, not
				// a column; using it as an array key would produce an integer-keyed
				// condition entry instead of a "column = value" comparison.
				if ( is_string( $nsField ) ) {
					$conds[$nsField] = $row->ns;
				}
			}
			// This link entry points to a nonexistent page, so just get rid of it
			$dbw->newDeleteQueryBuilder()
				->deleteFrom( $table )
				->where( array_merge( [ $idField => $row->id ], $conds ) )
				->caller( __METHOD__ )->execute();
		}
		$this->waitForReplication();
		$this->outputStatus( "Link update jobs have been added to the job queue.\n" );
	}

	/**
	 * Fix possible validation issues in the given title (DB key).
	 *
	 * Spaces and tabs become underscores and CR/LF are removed; leading and
	 * trailing underscores are stripped afterwards, so underscores produced by
	 * the replacement at either end are removed too (" Foo" gives "Foo").
	 *
	 * @param string $invalidTitle
	 * @return string
	 */
	protected static function makeValidTitle( $invalidTitle ) {
		$normalized = strtr(
			$invalidTitle,
			[ ' ' => '_', "\r" => '', "\n" => '', "\t" => '_' ]
		);
		return trim( $normalized, '_' );
	}
}
339 | |
// @codeCoverageIgnoreStart
// Script bootstrap: names this class in $maintClass and then includes the file
// identified by RUN_MAINTENANCE_IF_MAIN. That constant is defined outside this
// file; presumably the included file runs the script only when this file is the
// entry point -- confirm against the Maintenance framework.
$maintClass = CleanupInvalidDbKeys::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd