MediaWiki master
TitleParser.php
Go to the documentation of this file.
1<?php
24namespace MediaWiki\Title;
25
26use InvalidArgumentException;
27use LogicException;
32use Wikimedia\IPUtils;
33
/** Language object; used below for namespace-name lookup (getNsIndex), lowercasing (lc) and first-letter capitalization (ucfirst). */
44 private Language $language;
/** Interwiki lookup service; used below to test whether a prefix is a valid interwiki (isValidInterwiki). */
45 private InterwikiLookup $interwikiLookup;
/** Namespace information service; used below for exists(), getCanonicalName() and isCapitalized(). */
46 private NamespaceInfo $nsInfo;
47
/** Interwiki prefixes that point back at the local wiki; compared case-insensitively against parsed prefixes. Presumably a list of strings -- confirm against callers. */
49 private array $localInterwikis;
50
/**
 * Factory invoked as ( $errorMessage, $titleText [, $errorMessageParameters] ) whose
 * return value is thrown when a title is rejected. Untyped because it holds a callable.
 */
58 private $createMalformedTitleException;
59
/**
 * @param Language $language Used to resolve namespace names and to adjust letter case
 * @param InterwikiLookup $interwikiLookup Used to recognise interwiki prefixes
 * @param NamespaceInfo $nsInfo Used for namespace existence, canonical names and capitalization rules
 * @param array $localInterwikis Interwiki prefixes that refer to the local wiki
 *   (presumably a list of strings -- confirm against callers)
 */
public function __construct(
	Language $language,
	InterwikiLookup $interwikiLookup,
	NamespaceInfo $nsInfo,
	$localInterwikis
) {
	$this->language = $language;
	$this->interwikiLookup = $interwikiLookup;
	$this->nsInfo = $nsInfo;
	$this->localInterwikis = $localInterwikis;

	// Default callback is to return a real MalformedTitleException,
	// callback signature matches constructor
	$this->createMalformedTitleException = static function (
		$errorMessage,
		$titleText = null,
		$errorMessageParameters = []
	) {
		return new MalformedTitleException( $errorMessage, $titleText, $errorMessageParameters );
	};
}
88
/**
 * Replace the factory that builds the exception thrown for malformed titles.
 *
 * Only usable while the MW_PHPUNIT_TEST constant is defined.
 *
 * @param callable $callback Invoked with the error message key, the title text and
 *   (optionally) an array of message parameters; its return value is thrown.
 * @throws LogicException If MW_PHPUNIT_TEST is not defined
 */
public function overrideCreateMalformedTitleExceptionCallback( callable $callback ) {
	// @codeCoverageIgnoreStart
	$insideTestRun = defined( 'MW_PHPUNIT_TEST' );
	if ( $insideTestRun === false ) {
		throw new LogicException( __METHOD__ . ' can only be used in tests' );
	}
	// @codeCoverageIgnoreEnd

	$this->createMalformedTitleException = $callback;
}
101
/**
 * Parses the given text and constructs a TitleValue.
 *
 * Character references in the input are decoded and normalized first; the result
 * is then validated and split by splitTitleString(). Any exception raised by
 * splitTitleString() propagates to the caller.
 *
 * @param string $text Title text to parse
 * @param int $defaultNamespace Namespace to assume when the text carries no prefix
 * @return TitleValue
 */
public function parseTitle( $text, $defaultNamespace = NS_MAIN ) {
	// Turn things like &eacute; &#257; or &#x3017; into normalized text (T16952)
	$normalized = Sanitizer::decodeCharReferencesAndNormalize( $text );

	// NOTE: going through splitTitleString() is an ugly kludge that lets this
	// class share its parsing code with the old Title class. The parser code
	// should be refactored to avoid this.
	[
		'namespace' => $namespace,
		'dbkey' => $dbkey,
		'fragment' => $fragment,
		'interwiki' => $interwiki,
	] = $this->splitTitleString( $normalized, $defaultNamespace );

	return new TitleValue( $namespace, $dbkey, $fragment, $interwiki );
}
127
/**
 * Given a namespace and title, return a TitleValue if valid, or null if invalid.
 *
 * The pieces are reassembled into "interwiki:Namespace:text#fragment" form and run
 * through splitTitleString(), so the result is normalized the same way as parsed text.
 *
 * @param int $namespace Namespace index; null is returned when it does not exist
 * @param string $text Title text without namespace prefix
 * @param string $fragment Optional fragment, without the leading '#'
 * @param string $interwiki Optional interwiki prefix
 * @return TitleValue|null Null when the namespace is unknown or the title is malformed
 */
public function makeTitleValueSafe( $namespace, $text, $fragment = '', $interwiki = '' ) {
	if ( !$this->nsInfo->exists( $namespace ) ) {
		return null;
	}

	// Prefix with the canonical namespace name, unless that name is empty
	$nsName = $this->nsInfo->getCanonicalName( $namespace );
	$assembled = $text;
	if ( $nsName != '' ) {
		$assembled = "$nsName:$text";
	}

	$interwikiPrefix = strval( $interwiki );
	if ( $interwikiPrefix != '' ) {
		$assembled = "$interwikiPrefix:$assembled";
	}

	$fragmentText = strval( $fragment );
	if ( $fragmentText != '' ) {
		$assembled = $assembled . '#' . $fragmentText;
	}

	try {
		$split = $this->splitTitleString( $assembled );
	} catch ( MalformedTitleException $e ) {
		// Invalid titles are reported as null rather than as an exception
		return null;
	}

	return new TitleValue(
		$split['namespace'],
		$split['dbkey'],
		$split['fragment'],
		$split['interwiki']
	);
}
161
/**
 * Validates, normalizes and splits a title string.
 *
 * Strips bidi override characters, collapses whitespace to underscores, peels off
 * interwiki and namespace prefixes, separates the fragment, rejects forbidden
 * characters and sequences, enforces the length limit, and applies first-letter
 * capitalization and IP normalization where applicable.
 *
 * Every rejection goes through the createMalformedTitleException callback; by
 * default that produces a MalformedTitleException.
 *
 * @param string $text Title text to split
 * @param int $defaultNamespace Namespace to use when the text has no namespace prefix
 * @return array Map with the keys 'interwiki', 'local_interwiki', 'fragment',
 *   'namespace' and 'dbkey'
 * @throws MalformedTitleException If the text cannot be turned into a valid title
 */
public function splitTitleString( $text, $defaultNamespace = NS_MAIN ) {
	$dbkey = str_replace( ' ', '_', $text );

	# Initialisation
	$parts = [
		'interwiki' => '',
		'local_interwiki' => false,
		'fragment' => '',
		'namespace' => (int)$defaultNamespace,
		'dbkey' => $dbkey,
	];

	# Strip Unicode bidi override characters.
	# Sometimes they slip into cut-n-pasted page titles, where the
	# override chars get included in list displays.
	$dbkey = preg_replace( '/[\x{200E}\x{200F}\x{202A}-\x{202E}]+/u', '', $dbkey );

	if ( $dbkey === null ) {
		# Regex had an error. Most likely this is caused by invalid UTF-8
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-utf8', $text );
		throw $exception;
	}

	# Clean up whitespace
	$dbkey = preg_replace(
		'/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u',
		'_',
		$dbkey
	);
	$dbkey = trim( $dbkey, '_' );

	if ( str_contains( $dbkey, \UtfNormal\Constants::UTF8_REPLACEMENT ) ) {
		# Contained illegal UTF-8 sequences or forbidden Unicode chars.
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-utf8', $text );
		throw $exception;
	}

	$parts['dbkey'] = $dbkey;

	# Initial colon indicates main namespace rather than specified default
	# but should not create invalid {ns,title} pairs such as {0,Project:Foo}
	if ( $dbkey !== '' && $dbkey[0] == ':' ) {
		$parts['namespace'] = NS_MAIN;
		$dbkey = substr( $dbkey, 1 ); # remove the colon but continue processing
		$dbkey = trim( $dbkey, '_' ); # remove any subsequent whitespace
	}

	if ( $dbkey == '' ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-empty', $text );
		throw $exception;
	}

	# Namespace or interwiki prefix
	$prefixRegexp = "/^(.+?)_*:_*(.*)$/S";
	# The loop only repeats via "continue 2" below (local interwiki prefix stripped);
	# every other path falls through to the "break" at the bottom.
	do {
		$m = [];
		if ( preg_match( $prefixRegexp, $dbkey, $m ) ) {
			$p = $m[1];
			$ns = $this->language->getNsIndex( $p );
			if ( $ns !== false ) {
				# Ordinary namespace
				$dbkey = $m[2];
				$parts['namespace'] = $ns;
				# For Talk:X pages, check if X has a "namespace" prefix
				if ( $ns === NS_TALK && preg_match( $prefixRegexp, $dbkey, $x ) ) {
					if ( $this->language->getNsIndex( $x[1] ) ) {
						# Disallow Talk:File:x type titles...
						$exception = ( $this->createMalformedTitleException )(
							'title-invalid-talk-namespace',
							$text
						);
						throw $exception;
					} elseif ( $this->interwikiLookup->isValidInterwiki( $x[1] ) ) {
						# Disallow Talk:Interwiki:x type titles...
						$exception = ( $this->createMalformedTitleException )(
							'title-invalid-talk-interwiki',
							$text
						);
						throw $exception;
					}
				}
			} elseif ( $this->interwikiLookup->isValidInterwiki( $p ) ) {
				# Interwiki link
				$dbkey = $m[2];
				$parts['interwiki'] = $this->language->lc( $p );

				# Redundant interwiki prefix to the local wiki
				foreach ( $this->localInterwikis as $localIW ) {
					if ( strcasecmp( $parts['interwiki'], $localIW ) == 0 ) {
						if ( $dbkey == '' ) {
							# Empty self-links should point to the Main Page, to ensure
							# compatibility with cross-wiki transclusions and the like.
							$mainPage = Title::newMainPage();
							return [
								'interwiki' => $mainPage->getInterwiki(),
								'local_interwiki' => true,
								'fragment' => $mainPage->getFragment(),
								'namespace' => $mainPage->getNamespace(),
								'dbkey' => $mainPage->getDBkey(),
							];
						}
						$parts['interwiki'] = '';
						# local interwikis should behave like initial-colon links
						$parts['local_interwiki'] = true;

						# Do another namespace split...
						continue 2;
					}
				}

				# If there's an initial colon after the interwiki, that also
				# resets the default namespace
				if ( $dbkey !== '' && $dbkey[0] == ':' ) {
					$parts['namespace'] = NS_MAIN;
					$dbkey = substr( $dbkey, 1 );
					$dbkey = trim( $dbkey, '_' );
				}
			}
			# If there's no recognized interwiki or namespace,
			# then let the colon expression be part of the title.
		}
		break;
	} while ( true );

	$fragment = strstr( $dbkey, '#' );
	if ( $fragment !== false ) {
		$parts['fragment'] = str_replace( '_', ' ', substr( $fragment, 1 ) );
		$dbkey = substr( $dbkey, 0, strlen( $dbkey ) - strlen( $fragment ) );
		# remove whitespace again: prevents "Foo_bar_#"
		# becoming "Foo_bar_"
		$dbkey = rtrim( $dbkey, "_" );
	}

	# Reject illegal characters.
	$rxTc = self::getTitleInvalidRegex();
	$matches = [];
	if ( preg_match( $rxTc, $dbkey, $matches ) ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-characters', $text, [ $matches[0] ] );
		throw $exception;
	}

	# Pages with "/./" or "/../" appearing in the URLs will often be un-
	# reachable due to the way web browsers deal with 'relative' URLs.
	# Also, they conflict with subpage syntax. Forbid them explicitly.
	if (
		str_contains( $dbkey, '.' ) &&
		(
			$dbkey === '.' || $dbkey === '..' ||
			str_starts_with( $dbkey, './' ) ||
			str_starts_with( $dbkey, '../' ) ||
			str_contains( $dbkey, '/./' ) ||
			str_contains( $dbkey, '/../' ) ||
			str_ends_with( $dbkey, '/.' ) ||
			str_ends_with( $dbkey, '/..' )
		)
	) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-relative', $text );
		throw $exception;
	}

	# Magic tilde sequences? Nu-uh!
	if ( str_contains( $dbkey, '~~~' ) ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-magic-tilde', $text );
		throw $exception;
	}

	# Limit the size of titles to 255 bytes. This is typically the size of the
	# underlying database field. We make an exception for special pages, which
	# don't need to be stored in the database, and may edge over 255 bytes due
	# to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
	$maxLength = ( $parts['namespace'] !== NS_SPECIAL ) ? 255 : 512;
	if ( strlen( $dbkey ) > $maxLength ) {
		$exception = ( $this->createMalformedTitleException )(
			'title-invalid-too-long',
			$text,
			[ Message::numParam( $maxLength ), Message::numParam( strlen( $dbkey ) ) ]
		);
		throw $exception;
	}

	# Normally, all wiki links are forced to have an initial capital letter so [[foo]]
	# and [[Foo]] point to the same place. Don't force it for interwikis, since the
	# other site might be case-sensitive.
	if ( $parts['interwiki'] === '' && $this->nsInfo->isCapitalized( $parts['namespace'] ) ) {
		$dbkey = $this->language->ucfirst( $dbkey );
	}

	# Can't make a link to a namespace alone... "empty" local links can only be
	# self-links with a fragment identifier.
	if ( $dbkey == '' && $parts['interwiki'] === '' && $parts['namespace'] !== NS_MAIN ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-empty', $text );
		throw $exception;
	}

	// Allow IPv6 usernames to start with '::' by canonicalizing IPv6 titles.
	// IP names are not allowed for accounts, and can only be referring to
	// edits from the IP. Given '::' abbreviations and caps/lowercaps,
	// there are numerous ways to present the same IP. Having sp:contribs scan
	// them all is silly and having some show the edits and others not is
	// inconsistent. Same for talk/userpages. Keep them normalized instead.
	if ( $dbkey !== '' && ( $parts['namespace'] === NS_USER || $parts['namespace'] === NS_USER_TALK ) ) {
		$dbkey = IPUtils::sanitizeIP( $dbkey );
		// IPUtils::sanitizeIP return null only for bad input
		'@phan-var string $dbkey';
	}

	// Any remaining initial :s are illegal.
	if ( $dbkey !== '' && $dbkey[0] == ':' ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid-leading-colon', $text );
		throw $exception;
	}

	// Fill fields
	$parts['dbkey'] = $dbkey;

	// Check to ensure that the return value can be used to construct a TitleValue.
	// All issues should in theory be caught above, this is here to enforce consistency.
	try {
		TitleValue::assertValidSpec(
			$parts['namespace'],
			$parts['dbkey'],
			$parts['fragment'],
			$parts['interwiki']
		);
	} catch ( InvalidArgumentException $ex ) {
		$exception = ( $this->createMalformedTitleException )( 'title-invalid', $text, [ $ex->getMessage() ] );
		throw $exception;
	}

	return $parts;
}
414
/**
 * Returns a simple regex that will match on characters and sequences invalid in titles.
 *
 * The pattern is assembled on the first call and cached in a static variable.
 * A title that matches it is held to be illegal.
 *
 * @return string Regex string, including delimiters and modifiers
 */
public static function getTitleInvalidRegex() {
	static $cachedRegex = false;
	if ( $cachedRegex ) {
		return $cachedRegex;
	}

	$forbiddenAlternatives = [
		# Any character not allowed is forbidden...
		'[^' . Title::legalChars() . ']',
		# URL percent encoding sequences interfere with the ability
		# to round-trip titles -- you can't link to them consistently.
		'%[0-9A-Fa-f]{2}',
		# XML/HTML character references produce similar issues.
		'&[A-Za-z0-9\x80-\xff]+;',
	];
	$cachedRegex = '/' . implode( '|', $forbiddenAlternatives ) . '/S';

	return $cachedRegex;
}
440}
441
443class_alias( TitleParser::class, 'TitleParser' );
const NS_USER
Definition Defines.php:67
const NS_MAIN
Definition Defines.php:65
const NS_SPECIAL
Definition Defines.php:54
const NS_TALK
Definition Defines.php:66
const NS_USER_TALK
Definition Defines.php:68
Base class for language-specific code.
Definition Language.php:82
The Message class deals with fetching and processing of interface message into a variety of formats.
Definition Message.php:155
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them based on index.
A title parser service for MediaWiki.
makeTitleValueSafe( $namespace, $text, $fragment='', $interwiki='')
Given a namespace and title, return a TitleValue if valid, or null if invalid.
splitTitleString( $text, $defaultNamespace=NS_MAIN)
Validates, normalizes and splits a title string.
__construct(Language $language, InterwikiLookup $interwikiLookup, NamespaceInfo $nsInfo, $localInterwikis)
parseTitle( $text, $defaultNamespace=NS_MAIN)
Parses the given text and constructs a TitleValue.
static getTitleInvalidRegex()
Returns a simple regex that will match on characters and sequences invalid in titles.
overrideCreateMalformedTitleExceptionCallback(callable $callback)
Represents the target of a wiki link.
static assertValidSpec( $namespace, $title, $fragment='', $interwiki='')
Assert that the given parameters could be used to construct a TitleValue object.
static newMainPage(?MessageLocalizer $localizer=null)
Create a new Title for the Main Page.
Definition Title.php:679
static legalChars()
Get a regex character class describing the legal characters in a link.
Definition Title.php:712
Service interface for looking up Interwiki records.