Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
78.72% |
111 / 141 |
|
16.67% |
1 / 6 |
CRAP | |
0.00% |
0 / 1 |
TitleCleanup | |
78.72% |
111 / 141 |
|
16.67% |
1 / 6 |
70.19 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
5.20 | |||
processRow | |
89.47% |
17 / 19 |
|
0.00% |
0 / 1 |
8.07 | |||
fileExists | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
moveIllegalPage | |
75.44% |
43 / 57 |
|
0.00% |
0 / 1 |
21.28 | |||
moveInconsistentPage | |
89.36% |
42 / 47 |
|
0.00% |
0 / 1 |
16.31 |
1 | <?php |
2 | /** |
3 | * Clean up broken, unparseable titles. |
4 | * |
5 | * Copyright © 2005 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @author Brooke Vibber <bvibber@wikimedia.org> |
25 | * @ingroup Maintenance |
26 | */ |
27 | |
28 | use MediaWiki\Title\Title; |
29 | use Wikimedia\Rdbms\IDBAccessObject; |
30 | |
31 | // @codeCoverageIgnoreStart |
32 | require_once __DIR__ . '/TableCleanup.php'; |
33 | // @codeCoverageIgnoreEnd |
34 | |
/**
 * Maintenance script to clean up broken, unparseable titles.
 *
 * processRow() checks whether a page row's stored (namespace, title) pair
 * survives a round trip through Title parsing. Rows that do not are renamed,
 * either to the cleaned-up form of their own title or, failing that, to a
 * title under "<prefix>/" in the main namespace.
 *
 * @ingroup Maintenance
 */
class TitleCleanup extends TableCleanup {

	/** Value of the 'prefix' option with a trailing slash appended; set in execute() */
	private string $prefix;

	/**
	 * Register the script description, the 'prefix' option and the batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Script to clean up broken, unparseable titles' );
		$this->addOption( 'prefix', "Broken pages will be renamed to titles with " .
			"<prefix> prepended before the article name. Defaults to 'Broken'", false, true );
		$this->setBatchSize( 1000 );
	}

	/**
	 * Validate the configured prefix, then hand over to the parent implementation.
	 *
	 * @inheritDoc
	 */
	public function execute() {
		$this->prefix = $this->getOption( 'prefix', 'Broken' ) . "/";
		// Make sure the prefix itself is a valid title now
		// rather than spewing errors for every page being cleaned up
		// if it's not (We assume below that concatenating the prefix to a title leaves it in NS0)
		// The trailing slash above ensures that concatenating the title to something
		// can't turn it into a namespace or interwiki
		$title = Title::newFromText( $this->prefix );
		if ( !$title || !$title->canExist() || $title->getInterwiki() || $title->getNamespace() !== 0 ) {
			$this->fatalError( "Invalid prefix {$this->prefix}. Must be a valid mainspace title." );
		}
		parent::execute();
	}

	/**
	 * Check one page row and rename the page if its stored title is broken.
	 *
	 * A row is considered fine when the normalized display name parses to a
	 * Title that can exist and whose namespace and DB key equal the stored
	 * values. Otherwise the row is reported and passed to moveIllegalPage()
	 * (unparseable title) or moveInconsistentPage() (parses to something else).
	 * File-namespace rows with a matching image record are only reported.
	 *
	 * @param stdClass $row Row exposing page_id, page_namespace and page_title
	 */
	protected function processRow( $row ) {
		$display = Title::makeName( $row->page_namespace, $row->page_title );
		$verified = $this->getServiceContainer()->getContentLanguage()->normalize( $display );
		$title = Title::newFromText( $verified );

		if ( $title !== null
			&& $title->canExist()
			&& $title->getNamespace() == $row->page_namespace
			&& $title->getDBkey() === $row->page_title
		) {
			// all is fine
			$this->progress( 0 );

			return;
		}

		if ( $row->page_namespace == NS_FILE && $this->fileExists( $row->page_title ) ) {
			// Left untouched here; the message points the operator at cleanupImages.php
			$this->output( "file $row->page_title needs cleanup, please run cleanupImages.php.\n" );
			$this->progress( 0 );
		} elseif ( $title === null ) {
			$this->output( "page $row->page_id ($display) is illegal.\n" );
			$this->moveIllegalPage( $row );
			$this->progress( 1 );
		} else {
			$this->output( "page $row->page_id ($display) doesn't match self.\n" );
			$this->moveInconsistentPage( $row, $title );
			$this->progress( 1 );
		}
	}

	/**
	 * Tell whether the image table has a record for the given name.
	 *
	 * @param string $name Value to match against img_name
	 * @return bool True if a matching image row was found
	 */
	protected function fileExists( $name ) {
		// XXX: Doesn't actually check for file existence, just presence of image record.
		// This is reasonable, since cleanupImages.php only iterates over the image table.
		$dbr = $this->getReplicaDB();
		// Only presence matters, so fetch a single column instead of the whole row
		$row = $dbr->newSelectQueryBuilder()
			->select( 'img_name' )
			->from( 'image' )
			->where( [ 'img_name' => $name ] )
			->caller( __METHOD__ )
			->fetchRow();

		return $row !== false;
	}

	/**
	 * Rename a page whose stored title cannot be parsed at all.
	 *
	 * Destination candidates are tried in this order:
	 *  1. for rows in namespace 1, the talk page of the legalized title when
	 *     that title parses to a non-talk page and the talk page does not exist;
	 *  2. "<prefix><Namespace>:<legalized title>" (no namespace part for
	 *     main-namespace rows);
	 *  3. the same with everything outside [A-Za-z0-9_:-] hex-escaped;
	 *  4. "<prefix>id:<page_id>".
	 * If even the last candidate is invalid or taken, an error is reported and
	 * the row is left unchanged. In dry-run mode nothing is written.
	 *
	 * @param stdClass $row Row exposing page_id, page_namespace and page_title
	 */
	protected function moveIllegalPage( $row ) {
		$legalChars = Title::legalChars();
		$legalizedUnprefixed = preg_replace_callback( "/([^$legalChars])/",
			[ $this, 'hexChar' ],
			$row->page_title );
		if ( $legalizedUnprefixed === '.' ) {
			$legalizedUnprefixed = '(dot)';
		}
		if ( $legalizedUnprefixed === '_' ) {
			$legalizedUnprefixed = '(space)';
		}
		$sourceNs = (int)$row->page_namespace;

		// Last-resort destination, used when no legalized form is usable
		$fallback = $this->prefix . 'id:' . $row->page_id;

		$title = null;
		// Try to move "Talk:Project:Foo" -> "Project talk:Foo"
		if ( $sourceNs === NS_TALK ) {
			$subjectTitle = Title::newFromText( $legalizedUnprefixed );
			if ( $subjectTitle && !$subjectTitle->isTalkPage() ) {
				$talkTitle = $subjectTitle->getTalkPageIfDefined();
				if ( $talkTitle !== null && !$talkTitle->exists() ) {
					$title = $talkTitle;
				}
			}
		}

		if ( $title === null ) {
			// Not a talk page or that didn't work
			// move any other broken pages to the main namespace so they can be found together
			if ( $sourceNs !== NS_MAIN ) {
				$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();
				$namespaceName = $namespaceInfo->getCanonicalName( $sourceNs );
				if ( $namespaceName === false ) {
					$namespaceName = "NS$sourceNs"; // Fallback for unknown namespaces
				}
				$legalizedUnprefixed = "$namespaceName:$legalizedUnprefixed";
			}
			$title = Title::newFromText( $this->prefix . $legalizedUnprefixed );
		}

		if ( $title === null ) {
			// It's still not a valid title, try again with a much smaller
			// allowed character set. This will mangle any titles with non-ASCII
			// characters, but if we don't do this the result will be
			// falling back to the Broken/id:foo failsafe below which is worse
			$legalizedUnprefixed = preg_replace_callback( '!([^A-Za-z0-9_:\\-])!',
				[ $this, 'hexChar' ],
				$legalizedUnprefixed
			);
			$title = Title::newFromText( $this->prefix . $legalizedUnprefixed );
		}

		if ( $title === null ) {
			// Oh well, we tried
			$legalized = $this->prefix . $legalizedUnprefixed;
			$this->output( "Couldn't legalize; form '$legalized' still invalid; using '$fallback'\n" );
			$title = Title::newFromText( $fallback );
		} elseif ( $title->exists( IDBAccessObject::READ_LATEST ) ) {
			$conflict = $title->getDBkey();
			$this->output( "Legalized for '$conflict' exists; using '$fallback'\n" );
			$title = Title::newFromText( $fallback );
		}

		if ( !$title || $title->exists( IDBAccessObject::READ_LATEST ) ) {
			// This can happen in corner cases like if numbers are made not valid
			// title characters using the (deprecated) $wgLegalTitleChars or
			// a 'Broken/id:foo' title already exists
			// $title may be null here, so never dereference it unconditionally
			$destName = $title ? $title->getText() : $fallback;
			$this->error( "Destination page $destName is invalid or already exists, skipping." );
			return;
		}

		// Read the namespace off the final title: a separately tracked value would
		// go stale when the talk-page candidate is replaced by the fallback above
		$ns = $title->getNamespace();
		$dest = $title->getDBkey();
		if ( $this->dryrun ) {
			$this->output( "DRY RUN: would rename $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
		} else {
			$this->output( "renaming $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
			$this->getPrimaryDB()
				->newUpdateQueryBuilder()
				->update( 'page' )
				->set( [ 'page_title' => $dest, 'page_namespace' => $ns ] )
				->where( [ 'page_id' => $row->page_id ] )
				->caller( __METHOD__ )->execute();
		}
	}

	/**
	 * Rename a page whose stored title parses, but not to itself.
	 *
	 * If the parsed title is free and can exist, the page is simply moved
	 * there. Otherwise a destination under the prefix is built, with
	 * "<prefix>id: <page_id>" as the last resort; if that is also invalid or
	 * taken, an error is reported and the row is left unchanged. In dry-run
	 * mode nothing is written.
	 *
	 * @param stdClass $row Row exposing page_id, page_namespace and page_title
	 * @param Title $title Title obtained by parsing the row's display name
	 */
	protected function moveInconsistentPage( $row, Title $title ) {
		$titleImpossible = $title->getInterwiki() || !$title->canExist();
		if ( $title->exists( IDBAccessObject::READ_LATEST ) || $titleImpossible ) {
			if ( $titleImpossible ) {
				$prior = $title->getPrefixedDBkey();
			} else {
				$prior = $title->getDBkey();
			}

			$ns = (int)$row->page_namespace;
			// If a page is saved in the main namespace with a namespace prefix then try to move it into
			// that namespace. If there's no conflict then it will succeed. Otherwise it will hit the condition
			// } else if ($ns !== 0) { and be moved to Broken/Namespace:Title
			// whereas without this check it would just go to Broken/Title
			if ( $ns === 0 ) {
				$ns = $title->getNamespace();
			}

			// Old cleanupTitles could move articles there. See T25147.
			// or a page could be stored as (0, "Special:Foo") in which case the $titleImpossible
			// condition would be true and we've already added a prefix so pretend we're in mainspace
			// and don't add another
			if ( $ns < 0 ) {
				$ns = 0;
			}

			// Namespace which no longer exists. Put the page in the main namespace
			// since we don't have any idea of the old namespace name. See T70501.
			// We build the new title ourself rather than relying on getDBKey() because
			// that will return Special:BadTitle
			$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();
			if ( !$namespaceInfo->exists( $ns ) ) {
				$clean = "{$this->prefix}NS$ns:$row->page_title";
				$ns = 0;
			} elseif ( !$titleImpossible && !$title->exists( IDBAccessObject::READ_LATEST ) ) {
				// Looks like the current title, after cleaning it up, is valid and available
				$clean = $prior;
			} elseif ( $ns !== 0 ) {
				// Put all broken pages in the main namespace so that they can be found via Special:PrefixIndex
				$nsName = $namespaceInfo->getCanonicalName( $ns );
				$clean = "{$this->prefix}$nsName:{$prior}";
				$ns = 0;
			} else {
				$clean = $this->prefix . $prior;
			}
			$verified = Title::makeTitleSafe( $ns, $clean );
			if ( !$verified || $verified->exists( IDBAccessObject::READ_LATEST ) ) {
				$lastResort = "{$this->prefix}id: {$row->page_id}";
				$this->output( "Couldn't legalize; form '$clean' exists; using '$lastResort'\n" );
				$verified = Title::makeTitleSafe( $ns, $lastResort );
				if ( !$verified || $verified->exists( IDBAccessObject::READ_LATEST ) ) {
					// This can happen in corner cases like if numbers are made not valid
					// title characters using the (deprecated) $wgLegalTitleChars or
					// a 'Broken/id:foo' title already exists
					$this->error( "Destination page $lastResort invalid or already exists." );
					return;
				}
			}
			$title = $verified;
		}

		// Namespace and DB key always come from the title finally settled on
		$ns = $title->getNamespace();
		$dest = $title->getDBkey();

		if ( $this->dryrun ) {
			$this->output( "DRY RUN: would rename $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
		} else {
			$this->output( "renaming $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
			$this->getPrimaryDB()
				->newUpdateQueryBuilder()
				->update( 'page' )
				->set( [
					'page_namespace' => $ns,
					'page_title' => $dest
				] )
				->where( [ 'page_id' => $row->page_id ] )
				->caller( __METHOD__ )->execute();
			// Drop cached link lookups now that the page row has changed
			$this->getServiceContainer()->getLinkCache()->clear();
		}
	}
}
300 | |
301 | // @codeCoverageIgnoreStart |
// NOTE(review): presumably the file required below reads $maintClass to decide which
// maintenance class to run when this script is the entry point — confirm against the framework.
302 | $maintClass = TitleCleanup::class; |
303 | require_once RUN_MAINTENANCE_IF_MAIN; |
304 | // @codeCoverageIgnoreEnd |