Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 158 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
CleanupInvalidDbKeys | |
0.00% |
0 / 155 |
|
0.00% |
0 / 6 |
1406 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
outputStatus | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
writeToReport | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cleanupTable | |
0.00% |
0 / 129 |
|
0.00% |
0 / 1 |
756 | |||
makeValidTitle | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * Cleans up invalid titles in various tables. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup Maintenance |
22 | */ |
23 | |
24 | require_once __DIR__ . '/Maintenance.php'; |
25 | |
26 | use MediaWiki\Title\Title; |
27 | use MediaWiki\WikiMap\WikiMap; |
28 | use Wikimedia\Rdbms\IExpression; |
29 | use Wikimedia\Rdbms\LikeValue; |
30 | |
31 | /** |
32 | * Maintenance script that cleans up invalid titles in various tables. |
33 | * |
34 | * @since 1.29 |
35 | * @ingroup Maintenance |
36 | */ |
37 | class CleanupInvalidDbKeys extends Maintenance { |
/**
 * @var array[] List of tables to clean up, and the field prefix for that table.
 *
 * Each entry is [ table name, field prefix ]. cleanupTable() derives the
 * "{prefix}_id", "{prefix}_namespace" and "{prefix}_title" expressions from the
 * prefix unless the entry overrides them with one of these optional keys:
 *  - 'idField': expression selected as the row ID (0 for protected_titles,
 *    which has no ID field)
 *  - 'nsField': expression selected as the namespace; an integer here is a
 *    constant (presumably 14 = Category and 6 = File namespace numbers;
 *    confirm against the namespace constants)
 *  - 'titleField': column holding the DB key
 */
protected static $tables = [
	// Data tables
	[ 'page', 'page' ],
	[ 'redirect', 'rd', 'idField' => 'rd_from' ],
	[ 'archive', 'ar' ],
	[ 'logging', 'log' ],
	[ 'protected_titles', 'pt', 'idField' => 0 ],
	[ 'category', 'cat', 'nsField' => 14 ],
	[ 'recentchanges', 'rc' ],
	[ 'watchlist', 'wl' ],
	// The querycache tables' qc(c)_title and qcc_titletwo may contain titles,
	// but also usernames or other things like that, so we leave them alone

	// Links tables
	[ 'pagelinks', 'pl', 'idField' => 'pl_from' ],
	[ 'templatelinks', 'tl', 'idField' => 'tl_from' ],
	[ 'categorylinks', 'cl', 'idField' => 'cl_from', 'nsField' => 14, 'titleField' => 'cl_to' ],
	[ 'imagelinks', 'il', 'idField' => 'il_from', 'nsField' => 6, 'titleField' => 'il_to' ],
];
58 | |
/**
 * Registers the script description, the 'fix' and 'table' options, and a
 * batch size of 500.
 */
public function __construct() {
	parent::__construct();

	$description = <<<'TEXT'
This script cleans up the title fields in various tables to remove entries that
will be rejected by the constructor of TitleValue. This constructor throws an
exception when invalid data is encountered, which will not normally occur on
regular page views, but can happen on query special pages.

The script targets titles matching the regular expression /^_|[ \r\n\t]|_$/.
Because any foreign key relationships involving these titles will already be
broken, the titles are corrected to a valid version or the rows are deleted
entirely, depending on the table.

The script runs with the expectation that STDOUT is redirected to a file.
TEXT;
	$this->addDescription( $description );

	// Without this flag the script only reports what it finds
	$fixHelp = 'Actually clean up invalid titles. If this parameter is not specified, '
		. 'the script will report invalid titles but not clean them up.';
	$this->addOption( 'fix', $fixHelp, false, false );

	// The help text lists the first element (table name) of every $tables entry
	$tableNames = implode( ', ', array_column( static::$tables, 0 ) );
	$tableHelp = 'The table(s) to process. This option can be specified more than once '
		. '(e.g. -t category -t watchlist). If not specified, all available tables will '
		. 'be processed. Available tables are: ' . $tableNames;
	$this->addOption( 'table', $tableHelp, false, true, 't', true );

	$this->setBatchSize( 500 );
}
85 | |
/**
 * Runs cleanupTable() for every entry of static::$tables, or only for the
 * tables named with the 'table' option when that option was given, then
 * prints a completion message.
 */
public function execute() {
	$tablesToProcess = $this->getOption( 'table' );
	foreach ( static::$tables as $tableParams ) {
		// No 'table' option means "process everything"
		if ( !$tablesToProcess || in_array( $tableParams[0], $tablesToProcess, true ) ) {
			$this->cleanupTable( $tableParams );
		}
	}

	$this->outputStatus( 'Done!' );
	if ( $this->hasOption( 'fix' ) ) {
		$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
		$this->outputStatus( " Cleaned up invalid DB keys on $dbDomain!\n" );
	} else {
		// 'Done!' went to STDERR without a line break; terminate that line so a
		// dry run does not leave the terminal on an unfinished line. A blank
		// status adds nothing to the STDOUT report.
		$this->outputStatus( "\n" );
	}
}
100 | |
/**
 * Writes a progress message to both STDOUT and STDERR.
 *
 * STDERR always receives $str unchanged. STDOUT receives the trimmed text
 * prefixed with '*** ' and followed by a newline, unless the trimmed text is
 * empty, so that progress lines are easy to find in a redirected report.
 *
 * @param string $str Text to write to both places
 * @param string|null $channel Ignored
 */
protected function outputStatus( $str, $channel = null ) {
	$trimmed = trim( $str );
	// Compare with '' explicitly: a plain truthiness test would also treat the
	// string "0" as empty and drop it from the STDOUT log
	if ( $trimmed !== '' ) {
		fwrite( STDOUT, '*** ' . $trimmed . "\n" );
	}
	fwrite( STDERR, $str );
}
115 | |
/**
 * Appends raw text to the report, which is written to STDOUT.
 *
 * @param string $str Text to write
 */
protected function writeToReport( $str ) {
	$report = STDOUT;
	fwrite( $report, $str );
}
124 | |
/**
 * Identifies, and optionally cleans up, invalid titles in a single table.
 *
 * One batch of at most getBatchSize() rows is selected and listed in the
 * report. Without the 'fix' option nothing is modified; with it, the rows
 * are updated, deleted, or refreshed depending on the table.
 *
 * @param array $tableParams A child array of self::$tables
 */
protected function cleanupTable( $tableParams ) {
	[ $table, $prefix ] = $tableParams;
	$idField = $tableParams['idField'] ?? "{$prefix}_id";
	$nsField = $tableParams['nsField'] ?? "{$prefix}_namespace";
	$titleField = $tableParams['titleField'] ?? "{$prefix}_title";

	$this->outputStatus( "Looking for invalid $titleField entries in $table...\n" );

	// Do all the select queries on the replicas, as they are slow (they use
	// unanchored LIKEs). Naturally this could cause problems if rows are
	// modified after selecting and before deleting/updating, but working on
	// the hypothesis that invalid rows will be old and in all likelihood
	// unreferenced, we should be fine to do it like this.
	$dbr = $this->getDB( DB_REPLICA, 'vslow' );
	$services = $this->getServiceContainer();
	$linksMigration = $services->getLinksMigration();
	// Tables known to LinksMigration take their title fields, table list and
	// join conditions from it instead of from the prefix-based defaults
	$usesLinksMigration = isset( $linksMigration::$mapping[$table] );
	$joinConds = [];
	$tables = [ $table ];
	if ( $usesLinksMigration ) {
		[ $nsField, $titleField ] = $linksMigration->getTitleFields( $table );
		$queryInfo = $linksMigration->getQueryInfo( $table );
		$joinConds = $queryInfo['joins'];
		$tables = $queryInfo['tables'];
	}

	// Find all TitleValue-invalid titles.
	$percent = $dbr->anyString();
	// The REGEXP operator is not cross-DBMS, so we have to use lots of LIKEs
	$likeExpr = $dbr
		->expr( $titleField, IExpression::LIKE, new LikeValue( $percent, ' ', $percent ) )
		->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\r", $percent ) )
		->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\n", $percent ) )
		->or( $titleField, IExpression::LIKE, new LikeValue( $percent, "\t", $percent ) )
		->or( $titleField, IExpression::LIKE, new LikeValue( '_', $percent ) )
		->or( $titleField, IExpression::LIKE, new LikeValue( $percent, '_' ) );
	$res = $dbr->newSelectQueryBuilder()
		->select( [
			'id' => $idField,
			'ns' => $nsField,
			'title' => $titleField,
		] )
		->tables( $tables )
		->where( $likeExpr )
		->joinConds( $joinConds )
		->limit( $this->getBatchSize() )
		->caller( __METHOD__ )
		->fetchResultSet();

	$this->outputStatus( "Number of invalid rows: " . $res->numRows() . "\n" );
	if ( !$res->numRows() ) {
		$this->outputStatus( "\n" );
		return;
	}

	// Write a table of titles to the report file. Also keep a list of the found
	// IDs, as we might need it later for DB updates
	$this->writeToReport( sprintf( "%10s | ns | dbkey\n", $idField ) );
	$ids = [];
	foreach ( $res as $row ) {
		$this->writeToReport( sprintf( "%10d | %3d | %s\n", $row->id, $row->ns, $row->title ) );
		$ids[] = $row->id;
	}

	// If we're doing a dry run, output the new titles we would use for the UPDATE
	// queries (if relevant), and finish
	if ( !$this->hasOption( 'fix' ) ) {
		if ( $table === 'logging' || $table === 'archive' ) {
			$this->writeToReport( "The following updates would be run with the --fix flag:\n" );
			foreach ( $res as $row ) {
				$newTitle = self::makeValidTitle( $row->title );
				$this->writeToReport(
					"$idField={$row->id}: update '{$row->title}' to '$newTitle'\n" );
			}
		}

		// 'page' and 'redirect' rows are never fixed by this script, so do not
		// suggest --fix for them
		if ( $table !== 'page' && $table !== 'redirect' ) {
			$this->outputStatus( "Run with --fix to clean up these rows\n" );
		}
		$this->outputStatus( "\n" );
		return;
	}

	// Fix the bad data, using different logic for the various tables
	$dbw = $this->getPrimaryDB();
	switch ( $table ) {
		case 'page':
		case 'redirect':
			// This shouldn't happen on production wikis, and we already have a script
			// to handle 'page' rows anyway, so just notify the user and let them decide
			// what to do next.
			$this->outputStatus( <<<TEXT
IMPORTANT: This script does not fix invalid entries in the $table table.
Consider repairing these rows, and rows in related tables, by hand.
You may like to run, or borrow logic from, the cleanupTitles.php script.

TEXT
			);
			break;

		case 'archive':
		case 'logging':
			// Rename the title to a corrected equivalent. Any foreign key relationships
			// to the page_title field are already broken, so this will just make sure
			// users can still access the log entries/deleted revisions from the interface
			// using a valid page title.
			$this->outputStatus(
				"Updating these rows, setting $titleField to the closest valid DB key...\n" );
			$affectedRowCount = 0;
			foreach ( $res as $row ) {
				$newTitle = self::makeValidTitle( $row->title );
				$this->writeToReport(
					"$idField={$row->id}: updating '{$row->title}' to '$newTitle'\n" );

				$dbw->newUpdateQueryBuilder()
					->update( $table )
					->set( [ $titleField => $newTitle ] )
					->where( [ $idField => $row->id ] )
					->caller( __METHOD__ )
					->execute();
				$affectedRowCount += $dbw->affectedRows();
			}
			$this->waitForReplication();
			$this->outputStatus( "Updated $affectedRowCount rows on $table.\n" );

			break;

		case 'recentchanges':
		case 'watchlist':
		case 'category':
			// Since these broken titles can't exist, there's really nothing to watch,
			// nothing can be categorised in them, and they can't have been changed
			// recently, so we can just remove these rows.
			$this->outputStatus( "Deleting invalid $table rows...\n" );
			$dbw->newDeleteQueryBuilder()
				->deleteFrom( $table )
				->where( [ $idField => $ids ] )
				->caller( __METHOD__ )->execute();
			// Read the count immediately after the DELETE, as the other branches do:
			// affectedRows() describes the most recent query, and waiting for
			// replication may issue further queries on this connection.
			$affectedRowCount = $dbw->affectedRows();
			$this->waitForReplication();
			$this->outputStatus( "Deleted $affectedRowCount rows from $table.\n" );
			break;

		case 'protected_titles':
			// Since these broken titles can't exist, there's really nothing to protect,
			// so we can just remove these rows. Made more complicated by this table
			// not having an ID field
			$this->outputStatus( "Deleting invalid $table rows...\n" );
			$affectedRowCount = 0;
			foreach ( $res as $row ) {
				$dbw->newDeleteQueryBuilder()
					->deleteFrom( $table )
					->where( [ $nsField => $row->ns, $titleField => $row->title ] )
					->caller( __METHOD__ )->execute();
				$affectedRowCount += $dbw->affectedRows();
			}
			$this->waitForReplication();
			$this->outputStatus( "Deleted $affectedRowCount rows from $table.\n" );
			break;

		case 'pagelinks':
		case 'templatelinks':
		case 'categorylinks':
		case 'imagelinks':
			// Update links tables for each page where these bogus links are supposedly
			// located. If the invalid rows don't go away after these jobs go through,
			// they're probably being added by a buggy hook.
			$this->outputStatus( "Queueing link update jobs for the pages in $idField...\n" );
			$wikiPageFactory = $services->getWikiPageFactory();
			foreach ( $res as $row ) {
				$wp = $wikiPageFactory->newFromID( $row->id );
				if ( $wp ) {
					RefreshLinks::fixLinksFromArticle( $row->id );
				} else {
					if ( $usesLinksMigration ) {
						$conds = $linksMigration->getLinksConditions(
							$table,
							Title::makeTitle( $row->ns, $row->title )
						);
					} else {
						$conds = [ $nsField => $row->ns, $titleField => $row->title ];
					}
					// This link entry points to a nonexistent page, so just get rid of it
					$dbw->newDeleteQueryBuilder()
						->deleteFrom( $table )
						->where( array_merge( [ $idField => $row->id ], $conds ) )
						->caller( __METHOD__ )->execute();
				}
			}
			$this->waitForReplication();
			$this->outputStatus( "Link update jobs have been added to the job queue.\n" );
			break;
	}

	$this->outputStatus( "\n" );
}
325 | |
/**
 * Fix possible validation issues in the given title (DB key).
 *
 * Spaces and tabs become underscores, carriage returns and line feeds are
 * removed, and leading/trailing underscores are then stripped. The result
 * can be an empty string if the input held nothing else.
 *
 * @param string $invalidTitle
 * @return string
 */
protected static function makeValidTitle( $invalidTitle ) {
	// Convert whitespace before trimming: a leading or trailing space/tab turns
	// into an underscore, which must then be stripped too, otherwise the result
	// would still start or end with '_' and remain invalid
	$dbKey = strtr(
		$invalidTitle,
		[ ' ' => '_', "\r" => '', "\n" => '', "\t" => '_' ]
	);
	return trim( $dbKey, '_' );
}
336 | } |
337 | |
// Name this script's class in $maintClass, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (defined outside this file; presumably it runs
// $maintClass when this script is the entry point - confirm in Maintenance.php)
$maintClass = CleanupInvalidDbKeys::class;
require_once RUN_MAINTENANCE_IF_MAIN;