MediaWiki master
cleanupTitles.php
Go to the documentation of this file.
1<?php
17
18// @codeCoverageIgnoreStart
19require_once __DIR__ . '/TableCleanup.php';
20// @codeCoverageIgnoreEnd
21
28
/** Title prefix, including its trailing slash, prepended to renamed pages; assigned in execute(). */
private string $prefix;

/**
 * Register the script description, the optional "prefix" argument and the batch size.
 */
public function __construct() {
	parent::__construct();

	$prefixHelp = 'Broken pages will be renamed to titles with <prefix> prepended before the article name.'
		. " Defaults to 'Broken'";

	$this->addDescription( 'Script to clean up broken, unparseable titles' );
	// Optional option (not required) that takes a value.
	$this->addOption( 'prefix', $prefixHelp, false, true );
	$this->setBatchSize( 1000 );
}
38
/**
 * Validate the rename prefix once up front, then run the parent cleanup.
 *
 * Aborts through fatalError() when the prefix (with its trailing slash) is not
 * a valid main-namespace title.
 */
public function execute() {
	// The trailing slash ensures that concatenating a title to the prefix
	// can't turn the prefix into a namespace or interwiki.
	$this->prefix = $this->getOption( 'prefix', 'Broken' ) . "/";

	// Check the prefix itself now rather than spewing errors for every page
	// being cleaned up. The move methods assume that prefix + title stays in NS0.
	$candidate = Title::newFromText( $this->prefix );
	$isMainspaceTitle = $candidate
		&& $candidate->canExist()
		&& !$candidate->getInterwiki()
		&& $candidate->getNamespace() === 0;

	if ( !$isMainspaceTitle ) {
		$this->fatalError( "Invalid prefix {$this->prefix}. Must be a valid mainspace title." );
	}

	parent::execute();
}
55
/**
 * Check one page row and rename the page if its stored title is broken.
 *
 * A row is fine when its normalized display name parses back to a title that
 * can exist and has the same namespace and DB key as the row.
 *
 * @param object $row Page row; page_id, page_namespace and page_title are read
 */
protected function processRow( $row ) {
	$display = Title::makeName( $row->page_namespace, $row->page_title );
	$normalized = $this->getServiceContainer()->getContentLanguage()->normalize( $display );
	$parsed = Title::newFromText( $normalized );

	$roundTrips = $parsed !== null
		&& $parsed->canExist()
		&& $parsed->getNamespace() == $row->page_namespace
		&& $parsed->getDBkey() === $row->page_title;

	if ( $roundTrips ) {
		// all is fine
		$this->progress( 0 );

		return;
	}

	if ( $row->page_namespace == NS_FILE && $this->fileExists( $row->page_title ) ) {
		// Pages backed by an image/file record are left to cleanupImages.php.
		$this->output( "file $row->page_title needs cleanup, please run cleanupImages.php.\n" );
		$this->progress( 0 );

		return;
	}

	if ( $parsed === null ) {
		// The stored title cannot be parsed at all.
		$this->output( "page $row->page_id ($display) is illegal.\n" );
		$this->moveIllegalPage( $row );
		$this->progress( 1 );

		return;
	}

	// The title parses, but not back to the stored (namespace, DB key) pair.
	$this->output( "page $row->page_id ($display) doesn't match self.\n" );
	$this->moveInconsistentPage( $row, $parsed );
	$this->progress( 1 );
}
88
/**
 * Tell whether an image/file record with the given name is present.
 *
 * XXX: Doesn't actually check for file existence, just presence of image/file record.
 * This is reasonable, since cleanupImages.php only iterates over the image/file table.
 *
 * @param string $name Name to look up (img_name or file_name, depending on the schema stage)
 * @return bool True when a matching row was found
 */
protected function fileExists( $name ) {
	$dbr = $this->getReplicaDB();
	$stage = $this->getServiceContainer()->getMainConfig()->get(
		MainConfigNames::FileSchemaMigrationStage
	);

	// Choose the table and lookup conditions from the migration stage: the
	// 'image' table while the old schema is still read, otherwise 'file'
	// restricted to rows with file_deleted = 0.
	if ( $stage & SCHEMA_COMPAT_READ_OLD ) {
		$table = 'image';
		$conds = [ 'img_name' => $name ];
	} else {
		$table = 'file';
		$conds = [
			'file_name' => $name,
			'file_deleted' => 0,
		];
	}

	$found = $dbr->newSelectQueryBuilder()
		->select( '*' )
		->from( $table )
		->where( $conds )
		->caller( __METHOD__ )
		->fetchRow();

	return $found !== false;
}
121
/**
 * Rename a page whose stored title cannot be parsed into a Title at all.
 *
 * Characters outside Title::legalChars() are replaced through hexChar(). A row
 * stored in namespace 1 is first tried at the talk page of the title its text
 * parses to (e.g. "Talk:Project:Foo" -> "Project talk:Foo"). Anything else is
 * moved under the configured prefix in the main namespace, with
 * "<prefix>id:<page_id>" as the last resort. If even that destination is
 * invalid or taken, an error is reported and the row is left untouched.
 *
 * @param object $row Page row; page_id, page_namespace and page_title are read
 */
protected function moveIllegalPage( $row ) {
	$legalChars = Title::legalChars();
	$legalizedUnprefixed = preg_replace_callback(
		"/([^$legalChars])/",
		$this->hexChar( ... ),
		$row->page_title
	);
	// Titles consisting only of "." or "_" get a readable stand-in.
	if ( $legalizedUnprefixed === '.' ) {
		$legalizedUnprefixed = '(dot)';
	}
	if ( $legalizedUnprefixed === '_' ) {
		$legalizedUnprefixed = '(space)';
	}
	$ns = (int)$row->page_namespace;

	// Last-resort destination, also used for the error message when no valid
	// destination Title could be built.
	$failsafe = $this->prefix . 'id:' . $row->page_id;

	$title = null;
	// Try to move "Talk:Project:Foo" -> "Project talk:Foo"
	if ( $ns === 1 ) {
		$subjectTitle = Title::newFromText( $legalizedUnprefixed );
		if ( $subjectTitle && !$subjectTitle->isTalkPage() ) {
			$talkTitle = $subjectTitle->getTalkPageIfDefined();
			if ( $talkTitle !== null && !$talkTitle->exists() ) {
				$title = $talkTitle;
			}
		}
	}

	if ( $title === null ) {
		// Not a talk page or that didn't work
		// move any other broken pages to the main namespace so they can be found together
		if ( $ns !== 0 ) {
			$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();
			$namespaceName = $namespaceInfo->getCanonicalName( $ns );
			if ( $namespaceName === false ) {
				$namespaceName = "NS$ns"; // Fallback for unknown namespaces
			}
			$legalizedUnprefixed = "$namespaceName:$legalizedUnprefixed";
		}
		$title = Title::newFromText( $this->prefix . $legalizedUnprefixed );
	}

	if ( $title === null ) {
		// It's still not a valid title, try again with a much smaller
		// allowed character set. This will mangle any titles with non-ASCII
		// characters, but if we don't do this the result will be
		// falling back to the Broken/id:foo failsafe below which is worse
		$legalizedUnprefixed = preg_replace_callback(
			'!([^A-Za-z0-9_:\\-])!',
			$this->hexChar( ... ),
			$legalizedUnprefixed
		);
		$title = Title::newFromText( $this->prefix . $legalizedUnprefixed );
	}

	if ( $title === null ) {
		// Oh well, we tried
		$legalized = $this->prefix . $legalizedUnprefixed;
		$this->output( "Couldn't legalize; form '$legalized' still invalid; using '$failsafe'\n" );
		$title = Title::newFromText( $failsafe );
	} elseif ( $title->exists( IDBAccessObject::READ_LATEST ) ) {
		$conflict = $title->getDBKey();
		$this->output( "Legalized for '$conflict' exists; using '$failsafe'\n" );
		$title = Title::newFromText( $failsafe );
	}

	if ( !$title || $title->exists( IDBAccessObject::READ_LATEST ) ) {
		// This can happen in corner cases like if numbers are made not valid
		// title characters using the (deprecated) $wgLegalTitleChars or
		// a 'Broken/id:foo' title already exists.
		// $title may be null here, so never dereference it unconditionally.
		$shown = $title ? $title->getText() : $failsafe;
		$this->error( "Destination page $shown is invalid or already exists, skipping." );
		return;
	}

	// Take the namespace from the destination that was actually verified.
	// Otherwise a talk-page attempt that is later replaced by the failsafe
	// would be written with the talk namespace but the failsafe's DB key.
	$ns = $title->getNamespace();
	$dest = $title->getDBkey();

	$change = "$row->page_id ($row->page_namespace,'$row->page_title') to ($ns,'$dest')\n";
	if ( $this->dryrun ) {
		$this->output( "DRY RUN: would rename $change" );
		return;
	}

	$this->output( "renaming $change" );
	$this->getPrimaryDB()
		->newUpdateQueryBuilder()
		->update( 'page' )
		->set( [ 'page_title' => $dest, 'page_namespace' => $ns ] )
		->where( [ 'page_id' => $row->page_id ] )
		->caller( __METHOD__ )->execute();
}
214
/**
 * Rename a page whose stored title parses, but not back to itself.
 *
 * When the parsed title is free and can exist, the page is simply renamed to
 * it. Otherwise a replacement under the configured prefix in the main
 * namespace is built, falling back to "<prefix>id: <page_id>". If that is
 * also invalid or taken, an error is reported and nothing is changed.
 *
 * @param object $row Page row; page_id, page_namespace and page_title are read
 * @param Title $title Title obtained by parsing the row's normalized display name
 */
protected function moveInconsistentPage( $row, Title $title ) {
	$titleImpossible = $title->getInterwiki() || !$title->canExist();

	if ( $title->exists( IDBAccessObject::READ_LATEST ) || $titleImpossible ) {
		$prior = $titleImpossible ? $title->getPrefixedDBkey() : $title->getDBkey();

		# If a page is saved in the main namespace with a namespace prefix then try to
		# move it into that namespace. On conflict it ends up as Broken/Namespace:Title
		# below, whereas without this it would just go to Broken/Title.
		$ns = (int)$row->page_namespace;
		if ( $ns === 0 ) {
			$ns = $title->getNamespace();
		}

		# Old cleanupTitles could move articles into negative namespaces. See T25147.
		# A page could also be stored as (0, "Special:Foo"); then $titleImpossible is
		# true and $prior already carries the prefix, so pretend we're in mainspace
		# and don't add another.
		$ns = max( 0, $ns );

		$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();
		if ( !$namespaceInfo->exists( $ns ) ) {
			# Namespace which no longer exists. Put the page in the main namespace
			# since we don't have any idea of the old namespace name. See T70501.
			# The new title is built by hand rather than from getDBKey() because
			# that would return Special:BadTitle.
			$clean = "{$this->prefix}NS$ns:$row->page_title";
			$ns = 0;
		} elseif ( !$titleImpossible && !$title->exists( IDBAccessObject::READ_LATEST ) ) {
			// The cleaned-up title is valid and available. This repeats the enclosing
			// check, so it can only be taken if the existence result changed meanwhile.
			$clean = $prior;
		} elseif ( $ns !== 0 ) {
			// Put all broken pages in the main namespace so that they can be found via Special:PrefixIndex
			$nsName = $namespaceInfo->getCanonicalName( $ns );
			$clean = "{$this->prefix}$nsName:{$prior}";
			$ns = 0;
		} else {
			$clean = $this->prefix . $prior;
		}

		$verified = Title::makeTitleSafe( $ns, $clean );
		if ( !$verified || $verified->exists( IDBAccessObject::READ_LATEST ) ) {
			$lastResort = "{$this->prefix}id: {$row->page_id}";
			$this->output( "Couldn't legalize; form '$clean' exists; using '$lastResort'\n" );
			$verified = Title::makeTitleSafe( $ns, $lastResort );
			if ( !$verified || $verified->exists( IDBAccessObject::READ_LATEST ) ) {
				// This can happen in corner cases like if numbers are made not valid
				// title characters using the (deprecated) $wgLegalTitleChars or
				// a 'Broken/id:foo' title already exists
				$this->error( "Destination page $lastResort invalid or already exists." );
				return;
			}
		}
		$title = $verified;
	}

	// The destination namespace and key always come from the final Title object.
	$ns = $title->getNamespace();
	$dest = $title->getDBkey();

	$change = "$row->page_id ($row->page_namespace,'$row->page_title') to ($ns,'$dest')\n";
	if ( $this->dryrun ) {
		$this->output( "DRY RUN: would rename $change" );
		return;
	}

	$this->output( "renaming $change" );
	$this->getPrimaryDB()
		->newUpdateQueryBuilder()
		->update( 'page' )
		->set( [
			'page_namespace' => $ns,
			'page_title' => $dest
		] )
		->where( [ 'page_id' => $row->page_id ] )
		->caller( __METHOD__ )->execute();
	// Clear the link cache after the page row has been changed.
	$this->getServiceContainer()->getLinkCache()->clear();
}
301}
302
303// @codeCoverageIgnoreStart
// Name TitleCleanup as this file's maintenance class, then include the runner file
// (presumably it only runs the script when this file is the entry point; confirm).
304$maintClass = TitleCleanup::class;
305require_once RUN_MAINTENANCE_IF_MAIN;
306// @codeCoverageIgnoreEnd
const NS_FILE
Definition Defines.php:57
const SCHEMA_COMPAT_READ_OLD
Definition Defines.php:294
A class containing constants representing the names of configuration variables.
output( $out, $channel=null)
Throw some output to the user.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
getReplicaDB(string|false $virtualDomain=false)
error( $err, $die=0)
Throw an error to the user.
getServiceContainer()
Returns the main service container.
getPrimaryDB(string|false $virtualDomain=false)
addDescription( $text)
Set the description text.
Represents a title within MediaWiki.
Definition Title.php:69
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1205
exists( $flags=0)
Check if page exists.
Definition Title.php:3129
getNamespace()
Get the namespace index, i.e. one of the NS_xxxx constants.
Definition Title.php:1037
getInterwiki()
Get the interwiki prefix.
Definition Title.php:938
getDBkey()
Get the main part with underscores.
Definition Title.php:1028
getPrefixedDBkey()
Get the prefixed database key form.
Definition Title.php:1845
Generic class to cleanup a database table.
progress( $updated)
hexChar( $matches)
Maintenance script to clean up broken, unparseable titles.
moveIllegalPage( $row)
moveInconsistentPage( $row, Title $title)
__construct()
Default constructor.
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: True for success,...
fileExists( $name)
$maintClass
Interface for database access objects.