MediaWiki master
TitleParser.php
Go to the documentation of this file.
1<?php
10namespace MediaWiki\Title;
11
12use InvalidArgumentException;
13use LogicException;
18use Wikimedia\IPUtils;
19
/** Language object consulted for namespace-name lookup, lower-casing and first-letter capitalisation. */
private Language $language;

/** Lookup service used to decide whether a prefix is a valid interwiki prefix. */
private InterwikiLookup $interwikiLookup;

/** Namespace metadata service (existence, canonical names, capitalisation rules). */
private NamespaceInfo $nsInfo;

/** Interwiki prefixes that are treated as pointing back at the local wiki. */
private array $localInterwikis;

/**
 * Factory invoked as ( $errorMessage, $titleText, $errorMessageParameters ) to build
 * the exception object that is thrown when a title is rejected.
 *
 * @var callable
 */
private $createMalformedTitleException;
45
/**
 * Store the collaborating services and install the default exception factory.
 *
 * @param Language $language Used for namespace-name lookup and case handling
 * @param InterwikiLookup $interwikiLookup Used to recognise interwiki prefixes
 * @param NamespaceInfo $nsInfo Used for namespace existence, canonical names and capitalisation
 * @param string[] $localInterwikis Interwiki prefixes that refer to the local wiki;
 *   stored in an array-typed property and compared case-insensitively while parsing
 */
public function __construct(
	Language $language,
	InterwikiLookup $interwikiLookup,
	NamespaceInfo $nsInfo,
	$localInterwikis
) {
	$this->language = $language;
	$this->interwikiLookup = $interwikiLookup;
	$this->nsInfo = $nsInfo;
	$this->localInterwikis = $localInterwikis;

	// Default callback is to return a real MalformedTitleException,
	// callback signature matches constructor
	$this->createMalformedTitleException = static function (
		$errorMessage,
		$titleText = null,
		$errorMessageParameters = []
	) {
		return new MalformedTitleException( $errorMessage, $titleText, $errorMessageParameters );
	};
}
74
/**
 * Replace the factory that builds the exception thrown for rejected titles.
 *
 * Only usable while the MW_PHPUNIT_TEST constant is defined.
 *
 * @param callable $callback Replacement factory; it is invoked with
 *   ( $errorMessage, $titleText[, $errorMessageParameters] )
 * @throws LogicException When MW_PHPUNIT_TEST is not defined
 */
public function overrideCreateMalformedTitleExceptionCallback( callable $callback ) {
	$inTestRun = defined( 'MW_PHPUNIT_TEST' );

	// @codeCoverageIgnoreStart
	if ( !$inTestRun ) {
		$complaint = __METHOD__ . ' can only be used in tests';
		throw new LogicException( $complaint );
	}
	// @codeCoverageIgnoreEnd

	$this->createMalformedTitleException = $callback;
}
87
/**
 * Parses the given text and constructs a TitleValue.
 *
 * @param string $text Title text to parse
 * @param int $defaultNamespace Namespace to assume when the text carries no prefix
 * @return TitleValue
 */
public function parseTitle( $text, $defaultNamespace = NS_MAIN ) {
	// Character references such as &eacute; &#257; or &#x3017; are decoded
	// and normalized first (T16952).
	$normalized = Sanitizer::decodeCharReferencesAndNormalize( $text );

	// NOTE: delegating to splitTitleString() is a kludge that lets this class
	// share parsing code with the old Title class; it should be refactored away.
	[
		'namespace' => $namespace,
		'dbkey' => $dbkey,
		'fragment' => $fragment,
		'interwiki' => $interwiki,
	] = $this->splitTitleString( $normalized, $defaultNamespace );

	return new TitleValue( $namespace, $dbkey, $fragment, $interwiki );
}
113
/**
 * Given a namespace and title, return a TitleValue if valid, or null if invalid.
 *
 * The pieces are glued into a full title string, which is then run through
 * splitTitleString() so that the usual validation and normalisation apply.
 *
 * @param int $namespace Namespace index; must be known to NamespaceInfo
 * @param string $text Title text without namespace prefix
 * @param string $fragment Fragment without the leading '#'; empty for none
 * @param string $interwiki Interwiki prefix; empty for none
 * @return TitleValue|null Null when the namespace is unknown or the title is malformed
 */
public function makeTitleValueSafe( $namespace, $text, $fragment = '', $interwiki = '' ) {
	if ( !$this->nsInfo->exists( $namespace ) ) {
		return null;
	}

	// Start from the bare text and wrap it with whichever prefixes/suffixes apply.
	$assembled = $text;

	$nsName = $this->nsInfo->getCanonicalName( $namespace );
	if ( $nsName != '' ) {
		$assembled = "$nsName:$text";
	}

	if ( strval( $interwiki ) != '' ) {
		$assembled = "$interwiki:$assembled";
	}

	if ( strval( $fragment ) != '' ) {
		$assembled .= '#' . $fragment;
	}

	try {
		$split = $this->splitTitleString( $assembled );
	} catch ( MalformedTitleException ) {
		return null;
	}

	return new TitleValue(
		$split['namespace'],
		$split['dbkey'],
		$split['fragment'],
		$split['interwiki']
	);
}
147
/**
 * Validates, normalizes and splits a title string.
 *
 * The text is turned into database-key form (underscores instead of spaces,
 * bidi marks stripped, whitespace collapsed), namespace and interwiki prefixes
 * are peeled off, the fragment is separated, and the remaining key is checked
 * against the various title restrictions.
 *
 * @param string $text Title text; character references are NOT decoded here
 * @param int $defaultNamespace Namespace used when the text has no namespace prefix
 * @return array Map with the keys:
 *   - 'interwiki' (string): lower-cased interwiki prefix, or '' for local titles
 *   - 'local_interwiki' (bool): true if a prefix pointing at the local wiki was stripped
 *   - 'fragment' (string): fragment with underscores turned back into spaces, or ''
 *   - 'namespace' (int): namespace index
 *   - 'dbkey' (string): normalized database key
 * @throws MalformedTitleException On any validation failure (more precisely: whatever
 *   the configured createMalformedTitleException callback returns is thrown)
 */
public function splitTitleString( $text, $defaultNamespace = NS_MAIN ) {
	$dbkey = str_replace( ' ', '_', $text );

	# Initialisation
	$parts = [
		'interwiki' => '',
		'local_interwiki' => false,
		'fragment' => '',
		'namespace' => (int)$defaultNamespace,
		'dbkey' => $dbkey,
	];

	# Strip Unicode bidi override characters.
	# Sometimes they slip into cut-n-pasted page titles, where the
	# override chars get included in list displays.
	$dbkey = preg_replace( '/[\x{200E}\x{200F}\x{202A}-\x{202E}]+/u', '', $dbkey );

	if ( $dbkey === null ) {
		# Regex had an error. Most likely this is caused by invalid UTF-8
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-utf8', $text );
		throw $exception;
	}

	# Clean up whitespace
	$dbkey = preg_replace(
		'/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u',
		'_',
		$dbkey
	);
	$dbkey = trim( $dbkey, '_' );

	if ( str_contains( $dbkey, \UtfNormal\Constants::UTF8_REPLACEMENT ) ) {
		# Contained illegal UTF-8 sequences or forbidden Unicode chars.
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-utf8', $text );
		throw $exception;
	}

	$parts['dbkey'] = $dbkey;

	# Initial colon indicates main namespace rather than specified default
	# but should not create invalid {ns,title} pairs such as {0,Project:Foo}
	if ( $dbkey !== '' && $dbkey[0] == ':' ) {
		$parts['namespace'] = NS_MAIN;
		$dbkey = substr( $dbkey, 1 ); # remove the colon but continue processing
		$dbkey = trim( $dbkey, '_' ); # remove any subsequent whitespace
	}

	if ( $dbkey == '' ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-empty', $text );
		throw $exception;
	}

	# Namespace or interwiki prefix
	$prefixRegexp = "/^(.+?)_*:_*(.*)$/S";
	# The loop body runs once unless a local interwiki prefix is stripped,
	# in which case "continue 2" restarts the prefix split on the remainder.
	do {
		$m = [];
		if ( preg_match( $prefixRegexp, $dbkey, $m ) ) {
			$p = $m[1];
			$ns = $this->language->getNsIndex( $p );
			if ( $ns !== false ) {
				# Ordinary namespace
				$dbkey = $m[2];
				$parts['namespace'] = $ns;
				# For Talk:X pages, check if X has a "namespace" prefix
				if ( $ns === NS_TALK && preg_match( $prefixRegexp, $dbkey, $x ) ) {
					if ( $this->language->getNsIndex( $x[1] ) ) {
						# Disallow Talk:File:x type titles...
						$exception = ( $this->createMalformedTitleException )(
							'title-invalid-talk-namespace',
							$text
						);
						throw $exception;
					} elseif ( $this->interwikiLookup->isValidInterwiki( $x[1] ) ) {
						# Disallow Talk:Interwiki:x type titles...
						$exception = ( $this->createMalformedTitleException )(
							'title-invalid-talk-interwiki',
							$text
						);
						throw $exception;
					}
				}
			} elseif ( $this->interwikiLookup->isValidInterwiki( $p ) ) {
				# Interwiki link
				$dbkey = $m[2];
				$parts['interwiki'] = $this->language->lc( $p );

				# Redundant interwiki prefix to the local wiki
				foreach ( $this->localInterwikis as $localIW ) {
					if ( strcasecmp( $parts['interwiki'], $localIW ) == 0 ) {
						if ( $dbkey == '' ) {
							# Empty self-links should point to the Main Page, to ensure
							# compatibility with cross-wiki transclusions and the like.
							$mainPage = Title::newMainPage();
							return [
								'interwiki' => $mainPage->getInterwiki(),
								'local_interwiki' => true,
								'fragment' => $mainPage->getFragment(),
								'namespace' => $mainPage->getNamespace(),
								'dbkey' => $mainPage->getDBkey(),
							];
						}
						$parts['interwiki'] = '';
						# local interwikis should behave like initial-colon links
						$parts['local_interwiki'] = true;

						# Do another namespace split...
						continue 2;
					}
				}

				# If there's an initial colon after the interwiki, that also
				# resets the default namespace
				if ( $dbkey !== '' && $dbkey[0] == ':' ) {
					$parts['namespace'] = NS_MAIN;
					$dbkey = substr( $dbkey, 1 );
					$dbkey = trim( $dbkey, '_' );
				}
			}
			# If there's no recognized interwiki or namespace,
			# then let the colon expression be part of the title.
		}
		break;
	} while ( true );

	$fragment = strstr( $dbkey, '#' );
	if ( $fragment !== false ) {
		$parts['fragment'] = str_replace( '_', ' ', substr( $fragment, 1 ) );
		$dbkey = substr( $dbkey, 0, strlen( $dbkey ) - strlen( $fragment ) );
		# remove whitespace again: prevents "Foo_bar_#"
		# becoming "Foo_bar_"
		$dbkey = rtrim( $dbkey, "_" );
	}

	# Reject illegal characters.
	$rxTc = self::getTitleInvalidRegex();
	$matches = [];
	if ( preg_match( $rxTc, $dbkey, $matches ) ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-characters', $text, [ $matches[0] ] );
		throw $exception;
	}

	# Pages with "/./" or "/../" appearing in the URLs will often be un-
	# reachable due to the way web browsers deal with 'relative' URLs.
	# Also, they conflict with subpage syntax. Forbid them explicitly.
	if (
		str_contains( $dbkey, '.' ) &&
		(
			$dbkey === '.' || $dbkey === '..' ||
			str_starts_with( $dbkey, './' ) ||
			str_starts_with( $dbkey, '../' ) ||
			str_contains( $dbkey, '/./' ) ||
			str_contains( $dbkey, '/../' ) ||
			str_ends_with( $dbkey, '/.' ) ||
			str_ends_with( $dbkey, '/..' )
		)
	) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-relative', $text );
		throw $exception;
	}

	# Magic tilde sequences? Nu-uh!
	if ( str_contains( $dbkey, '~~~' ) ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-magic-tilde', $text );
		throw $exception;
	}

	# Limit the size of titles to 255 bytes. This is typically the size of the
	# underlying database field. We make an exception for special pages, which
	# don't need to be stored in the database, and may edge over 255 bytes due
	# to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
	$maxLength = ( $parts['namespace'] !== NS_SPECIAL ) ? 255 : 512;
	if ( strlen( $dbkey ) > $maxLength ) {
		$exception = ( $this->createMalformedTitleException )(
			'title-invalid-too-long',
			$text,
			[ Message::numParam( $maxLength ), Message::numParam( strlen( $dbkey ) ) ]
		);
		throw $exception;
	}

	# Normally, all wiki links are forced to have an initial capital letter so [[foo]]
	# and [[Foo]] point to the same place. Don't force it for interwikis, since the
	# other site might be case-sensitive.
	if ( $parts['interwiki'] === '' && $this->nsInfo->isCapitalized( $parts['namespace'] ) ) {
		$dbkey = $this->language->ucfirst( $dbkey );
	}

	# Can't make a link to a namespace alone... "empty" local links can only be
	# self-links with a fragment identifier.
	if ( $dbkey == '' && $parts['interwiki'] === '' && $parts['namespace'] !== NS_MAIN ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-empty', $text );
		throw $exception;
	}

	// Allow IPv6 usernames to start with '::' by canonicalizing IPv6 titles.
	// IP names are not allowed for accounts, and can only be referring to
	// edits from the IP. Given '::' abbreviations and caps/lowercaps,
	// there are numerous ways to present the same IP. Having sp:contribs scan
	// them all is silly and having some show the edits and others not is
	// inconsistent. Same for talk/userpages. Keep them normalized instead.
	if ( $dbkey !== '' && ( $parts['namespace'] === NS_USER || $parts['namespace'] === NS_USER_TALK ) ) {
		$dbkey = IPUtils::sanitizeIP( $dbkey );
		// IPUtils::sanitizeIP return null only for bad input
		'@phan-var string $dbkey';
	}

	// Any remaining initial :s are illegal.
	if ( $dbkey !== '' && $dbkey[0] == ':' ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-leading-colon', $text );
		throw $exception;
	}

	// Fill fields
	$parts['dbkey'] = $dbkey;

	// Check to ensure that the return value can be used to construct a TitleValue.
	// All issues should in theory be caught above, this is here to enforce consistency.
	try {
		TitleValue::assertValidSpec(
			$parts['namespace'],
			$parts['dbkey'],
			$parts['fragment'],
			$parts['interwiki']
		);
	} catch ( InvalidArgumentException $ex ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid', $text, [ $ex->getMessage() ] );
		throw $exception;
	}

	return $parts;
}
400
/**
 * Returns a simple regex that will match on characters and sequences invalid in titles.
 *
 * The pattern is assembled on first use and then kept in a function-local
 * static, so later calls return the same string.
 *
 * @return string PCRE pattern, delimiters and modifiers included
 */
public static function getTitleInvalidRegex() {
	static $cachedPattern = false;

	if ( $cachedPattern ) {
		return $cachedPattern;
	}

	# Anything matching one of these alternatives makes a title illegal.
	$forbidden = [
		# Any character not allowed is forbidden...
		'[^' . Title::legalChars() . ']',
		# URL percent encoding sequences interfere with the ability
		# to round-trip titles -- you can't link to them consistently.
		'%[0-9A-Fa-f]{2}',
		# XML/HTML character references produce similar issues.
		'&[A-Za-z0-9\x80-\xff]+;',
	];

	$cachedPattern = '/' . implode( '|', $forbidden ) . '/S';

	return $cachedPattern;
}
426}
427
// Register the un-namespaced name 'TitleParser' as an alias of this class.
class_alias( TitleParser::class, 'TitleParser' );
const NS_USER
Definition Defines.php:53
const NS_MAIN
Definition Defines.php:51
const NS_SPECIAL
Definition Defines.php:40
const NS_TALK
Definition Defines.php:52
const NS_USER_TALK
Definition Defines.php:54
Base class for language-specific code.
Definition Language.php:68
The Message class deals with fetching and processing of interface message into a variety of formats.
Definition Message.php:144
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:32
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
A title parser service for MediaWiki.
makeTitleValueSafe( $namespace, $text, $fragment='', $interwiki='')
Given a namespace and title, return a TitleValue if valid, or null if invalid.
splitTitleString( $text, $defaultNamespace=NS_MAIN)
Validates, normalizes and splits a title string.
__construct(Language $language, InterwikiLookup $interwikiLookup, NamespaceInfo $nsInfo, $localInterwikis)
parseTitle( $text, $defaultNamespace=NS_MAIN)
Parses the given text and constructs a TitleValue.
static getTitleInvalidRegex()
Returns a simple regex that will match on characters and sequences invalid in titles.
overrideCreateMalformedTitleExceptionCallback(callable $callback)
Represents the target of a wiki link.
static assertValidSpec( $namespace, $title, $fragment='', $interwiki='')
Assert that the given parameters could be used to construct a TitleValue object.
static newMainPage(?MessageLocalizer $localizer=null)
Create a new Title for the Main Page.
Definition Title.php:676
static legalChars()
Get a regex character class describing the legal characters in a link.
Definition Title.php:709
Service interface for looking up Interwiki records.