MediaWiki 1.41.2
Sanitizer.php
Go to the documentation of this file.
1<?php
27namespace MediaWiki\Parser;
28
29use InvalidArgumentException;
30use LogicException;
34use StringUtils;
35use UnexpectedValueException;
36use Wikimedia\RemexHtml\HTMLData;
37use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
41
46class Sanitizer {
/**
 * Pattern shared by normalizeCharReferences() and decodeCharReferences().
 * Capture groups: 1 = named entity including its trailing semicolon,
 * 2 = decimal code point, 3 = hexadecimal code point, 4 = a bare ampersand.
 * The pattern uses the /x modifier, so the line breaks and indentation
 * inside the literal are not part of the match.
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|(&)/x';

/**
 * Splits the text that follows a "<" into: optional leading slash, tag name,
 * attribute text, closing brace (">" or "/>") and the trailing text up to
 * the next "<". Used by internalRemoveHtmlTags().
 */
private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Matches attribute values that look like a javascript: or vbscript: URI,
 * either at the start, after whitespace, or after the end of a CSS comment.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/** Matches XML namespace declaration attribute names such as "xmlns:foo". */
private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/** Index into $wgFragmentMode of the primary ID escaping mode. */
public const ID_PRIMARY = 0;

/** Index into $wgFragmentMode of the fallback ID escaping mode, which may be unset. */
public const ID_FALLBACK = 1;

/**
 * MediaWiki-specific aliases for the "rlm;" entity, keyed by the alias name
 * (including the trailing semicolon).
 */
private const MW_ENTITY_ALIASES = [
	'רלמ;' => 'rlm;',
	'رلم;' => 'rlm;',
];
100
/** @var string|null Lazily built pattern returned by getAttribsRegex() */
private static $attribsRegex;

/**
 * Build (once) and return the pattern that matches one attribute inside a tag.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 * The pattern uses the /x modifier, so the layout whitespace and the "#"
 * comment inside the literal are ignored by PCRE.
 *
 * @return string PCRE pattern with delimiters and modifiers
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// HTML whitespace: tab, LF, FF, CR, space
	$spaceChars = '\x09\x0a\x0c\x0d\x20';
	$space = "[{$spaceChars}]";
	$attrib = "[^{$spaceChars}\/>=]";
	// A name may start with "=", but may not contain one afterwards
	$attribFirst = "(?:{$attrib}|=)";

	self::$attribsRegex =
		"/({$attribFirst}{$attrib}*)
			($space*=$space*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$space|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
131
/** @var string|null Lazily built pattern returned by getAttribNameRegex() */
private static $attribNameRegex;

/**
 * Build (once) and return the pattern that a whole attribute name must match:
 * a first character that is ":", "_", a letter or a number, followed by any
 * run of those characters plus "." and "-".
 *
 * @return string PCRE pattern with delimiters and modifiers
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$firstChar = "[:_\p{L}\p{N}]";
	$laterChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

	return self::$attribNameRegex;
}
149
/**
 * Return the tag classification tables used when filtering HTML.
 *
 * The base tables are kept in static variables and rebuilt whenever the
 * value of $wgAllowImageTag differs from the one they were built with.
 * The result of a call without extra/removed tags is cached as well.
 *
 * @param string[] $extratags Extra tag names to add to 'htmlpairs' and 'htmlelements'
 * @param string[] $removetags Tag names to remove from 'htmlelements'
 * @return array[] Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed' and
 *   'htmlelements'; every value is an array of the form tag name => true
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	static $commonCase, $staticInitialised;
	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $isCommonCase && $commonCase && $staticInitialised === $wgAllowImageTag ) {
		return $commonCase;
	}

	// The static tables are tied to the global config state, so that a
	// changed global (as happens in tests) triggers a rebuild.
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
		// Lists are turned into name => true maps for faster lookup
		$toSet = static function ( array $names ) {
			return array_fill_keys( $names, true );
		};

		# Tags that must be closed
		$pairs = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# Tags that can be self-closed. For tags not also in $singleOnly,
		# a self-closed tag is emitted as an open-tag/close-tag pair.
		$single = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];
		# Elements that cannot have close tags. This is also the list of
		# tags for which the HTML 5 parsing algorithm requires the
		# self-closing flag to be acknowledged, i.e. <br/> is not a
		# parse error only for this list.
		$singleOnly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];
		# Tags that can be nested
		$nest = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		# Can only appear inside a table
		$tableOnly = [
			'td', 'th', 'tr',
		];
		# Tags used by lists
		$listContainers = [
			'ul', 'ol',
		];
		# Tags that can appear in a list
		$listItems = [
			'li',
		];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$single[] = 'img';
			$singleOnly[] = 'img';
		}

		$htmlsingleallowed = $toSet( array_unique( array_merge( $single, $tableOnly ) ) );
		$htmlelementsStatic = $toSet( array_unique( array_merge( $single, $pairs, $nest ) ) );
		$htmlpairsStatic = $toSet( $pairs );
		$htmlsingle = $toSet( $single );
		$htmlsingleonly = $toSet( $singleOnly );
		$htmlnest = $toSet( $nest );
		$tabletags = $toSet( $tableOnly );
		$htmllist = $toSet( $listContainers );
		$listtags = $toSet( $listItems );

		$staticInitialised = $globalContext;
	}

	# Fold the caller-supplied tags into the pair and element tables
	$extraSet = array_fill_keys( $extratags, true );
	$removeSet = array_fill_keys( $removetags, true );

	$result = [
		// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
		'htmlpairs' => array_merge( $extraSet, $htmlpairsStatic ),
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
		'htmlelements' => array_diff_key( array_merge( $extraSet, $htmlelementsStatic ), $removeSet ),
	];
	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
254
/**
 * Deprecated entry point that forwards all arguments unchanged to
 * internalRemoveHtmlTags() after emitting a deprecation notice.
 *
 * @deprecated since 1.38
 * @param string $text Text to clean
 * @param callable|null $processCallback Optional callback run on each allowed tag's attribute text
 * @param array|bool $args Extra argument passed to the callback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string Cleaned text
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );
	return self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
}
293
/**
 * Escape every tag that is not on the allowed list and clean the attributes
 * of the tags that are. HTML comments are removed first.
 *
 * The text is split on "<"; each piece is either recognised as an allowed
 * tag and re-emitted with sanitized attributes, or emitted with its "<"
 * turned into "&lt;". Every ">" outside a re-emitted tag becomes "&gt;".
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback Optional callback; called with the
 *   attribute text (by reference) and $args for each allowed tag
 * @param array|bool $args Extra argument passed to the callback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string Cleaned text
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$selfClosable = $tagData['htmlsingle'];
	$voidOnly = $tagData['htmlsingleonly'];
	$allowedElements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$pieces = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $pieces ) );

	# this might be possible using remex tidy itself
	foreach ( $pieces as $piece ) {
		if ( !preg_match( self::ELEMENT_BITS_REGEX, $piece, $regs ) ) {
			// Not tag-shaped at all: neutralise the "<" that was split off
			$text .= '&lt;' . str_replace( '>', '&gt;', $piece );
			continue;
		}

		[ /* $qbar */, $slash, $tagName, $params, $brace, $rest ] = $regs;
		$tagName = strtolower( $tagName );

		if ( !isset( $allowedElements[$tagName] ) ) {
			$text .= '&lt;' . str_replace( '>', '&gt;', $piece );
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		$mayBeSelfClosed = isset( $selfClosable[$tagName] ) || isset( $voidOnly[$tagName] );
		if ( $brace == '/>' && !$mayBeSelfClosed ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $tagName );
		$newparams = self::fixTagAttributes( $params, $tagName );

		if ( !$tagIsValid ) {
			$text .= '&lt;' . str_replace( '>', '&gt;', $piece );
			continue;
		}

		if ( $brace === '/>' && !isset( $voidOnly[$tagName] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$tagName>";
		}

		$rest = str_replace( '>', '&gt;', $rest );
		$text .= "<$slash$tagName$newparams$brace$rest";
	}
	return $text;
}
375
/**
 * Remove disallowed tags from $text by tokenizing it with RemexHtml and
 * passing the tokens through a RemexRemoveTagHandler before serializing.
 *
 * @param string $text Text to clean
 * @param array $options Recognised keys:
 *   - 'extraTags': additional tag names to allow
 *   - 'removeTags': tag names to disallow
 *   - 'attrCallback' (internal): callback handed to the tag handler
 *   - 'attrCallbackArgs' (internal): arguments for that callback
 * @return string The serialized result
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$tagData = self::getRecognizedTagData(
		$options['extraTags'] ?? [],
		$options['removeTags'] ?? []
	);
	// These options are @internal:
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// Normalizing first means a reference without its trailing semicolon
	// is not treated as an entity: in wikitext "clean&copy" does *not*
	// contain one.
	$text = self::normalizeCharReferences( $text );

	// Use RemexHtml to tokenize $text and remove the barred tags
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$remover = new RemexRemoveTagHandler(
		new RemexDispatcher( $treeBuilder ), $text, $tagData,
		$attrCallback, $attrCallbackArgs
	);
	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
436
/**
 * Strip every terminated HTML comment from the text.
 *
 * When a comment, together with the spaces around it, sits between two
 * newlines, the whole run is collapsed to a single newline. Otherwise only
 * the comment itself is removed. Scanning stops at the first comment that
 * has no closing "-->".
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Point just past the closing "-->"
		$close += 3;

		# Widen the span over the spaces on both sides of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$betweenNewlines = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";
		if ( $betweenNewlines ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	return $text;
}
479
/**
 * Check that a tag carries the attributes it needs to be emitted at all.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute; <meta> additionally needs content and <link> needs href.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case tag name
 * @return bool True when the tag may be kept
 */
private static function validateTag( $params, $element ) {
	$attrs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attrs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attrs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attrs['href'] ) ) {
		return false;
	}

	return true;
}
514
/**
 * Filter an attribute array against the attributes allowed for $element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name used to look up the allowed attributes
 * @return array Filtered and sanitized attribute name => value
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
534
/**
 * Filter and sanitize an attribute array against a set of allowed names.
 *
 * Besides the allowed names, "xmlns:*" declarations and unreserved
 * "data-*" attributes are accepted. Values are cleaned or the attribute is
 * dropped depending on its name (style, id, id reference lists, RDFa and
 * microdata properties, URL-valued attributes, tabindex).
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as keys; a sequential list
 *   of names is still accepted but deprecated
 * @return array Attribute name => sanitized value
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes holding a space-separated list of element ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	# NOTE: even though elements using href/src are not allowed directly, supply
	# validation code that can be used by tag hook handlers, etc
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isAccepted = preg_match( '/^data-[^:]*$/i', $attribute )
			|| array_key_exists( $attribute, $allowed );
		if ( !$isAccepted || self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		// Drop any URL attribute not using an allowed protocol.
		// NOTE: this also drops all relative URLs
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $attribute === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
651
/**
 * Tell whether an attribute name starts with one of the reserved
 * "data-ooui", "data-mw" or "data-parsoid" prefixes (case-insensitive).
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
669
/**
 * Merge two attribute arrays; values from $b win over those from $a.
 *
 * The 'class' attribute is special: when both arrays hold different string
 * values for it, the class names of both are combined, with duplicates
 * removed, into one space-separated string.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothHaveClass = isset( $a['class'] ) && isset( $b['class'] );
	if ( !$bothHaveClass
		|| !is_string( $a['class'] )
		|| !is_string( $b['class'] )
		|| $a['class'] === $b['class']
	) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
		-1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
692
/**
 * Normalize CSS into a form where a simple keyword check is meaningful:
 * character references and CSS escape sequences are decoded and comments
 * are removed.
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string Normalized CSS
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		// The /x modifier makes the layout whitespace and the "#"
		// comments below invisible to the regex engine.
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	return $value;
}
752
/**
 * Normalize a CSS value and replace it by a CSS comment when it contains
 * control characters, the Unicode replacement character, or one of the
 * blacklisted constructs (expression, filter:, accelerator:, the -o-link
 * family, url(), image(), image-set(), or attr() with a url type).
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string The normalized CSS, or a "/* ... *​/"-style marker comment
 *   ('/* invalid control char *​/' or '/* insecure input *​/') when rejected
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	}

	// Each "\(" must be an escaped literal parenthesis; an unescaped one
	// would open a group that is never closed and the pattern would fail
	// to compile, letting every value through.
	$insecure = preg_match(
		'! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		!ix', $value );
	if ( $insecure ) {
		return '/* insecure input */';
	}

	return $value;
}
794
/**
 * preg_replace_callback handler for the escape-sequence pattern built in
 * normalizeCss().
 *
 * @param string[] $matches 1 = line continuation, 2 = hex code point,
 *   3 = single escaped character; all empty for a trailing backslash
 * @return string Replacement text: nothing for a line continuation, a
 *   normalized "\<hex> " escape for newline, quotes and backslash, and the
 *   plain decoded character otherwise
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$char = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	// These characters need to be escaped in strings; re-emit them as a
	// clean escape sequence to avoid parsing errors by clients.
	// Everything else is an unnecessary escape and is decoded.
	$mustStayEscaped = in_array( $char, [ "\n", '"', "'", '\\' ], true );

	return $mustStayEscaped
		? '\\' . dechex( ord( $char ) ) . ' '
		: $char;
}
821
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string:
 * decode it, keep only what is allowed for $element, and re-encode it.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string, or the attributes with a leading space
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$allowed = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $allowed );
	}

	return self::safeEncodeTagAttributes( $allowed );
}
857
/**
 * Encode a value for use inside a quoted HTML attribute, escaping the HTML
 * special characters (both quote styles included) as well as newline,
 * carriage return and tab.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
public static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$whitespaceRefs = [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	];

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
879
/**
 * Replace the plain space that French typography puts next to certain
 * punctuation by $space: the space before ? : ; ! % » › (unless that
 * character is followed by a word character) and the space after « ‹.
 *
 * @param string $text
 * @param string $space Replacement for the space; inserted literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// $space is used as a preg_replace() replacement, where "$" and "\"
	// introduce backreferences. Prefix each of them with a backslash
	// (unless one is already there) so that $space comes out literally.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
900
/**
 * Encode an attribute value for HTML and additionally armor it against
 * later wikitext processing, by replacing characters and keywords that
 * wikitext treats specially with character references.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$armored = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Stupid hack: encode the colon of anything that matches a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = static function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColon, $armored );
}
939
/**
 * Escape an ID for use in an id attribute, using the escaping mode that
 * $wgFragmentMode holds at index $mode.
 *
 * @param string $id
 * @param int $mode One of the ID_* constants
 * @return string|bool The escaped ID, or false when $mode is not the
 *   primary mode and is not configured
 * @throws UnexpectedValueException When the primary mode is not configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
969
/**
 * Escape an ID for use as a URL fragment, using the primary escaping mode
 * from $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When the primary mode is not configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
995
/**
 * Escape an ID for use as the fragment of a URL, using the escaping mode
 * configured in $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the variable below would be an undefined
	// local and the mode passed on would always be null.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1012
/**
 * Escape an ID for use in a URL: apply escapeIdInternal() and, in 'html5'
 * mode, additionally turn the "%" of anything that looks like a
 * percent-encoded byte into "%25".
 *
 * @param string $id
 * @param string $mode Escaping mode, see escapeIdInternal()
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1029
/**
 * Escape an ID according to the given mode, after truncating it to 1024
 * characters.
 *
 * @param string $id
 * @param string $mode 'html5' (HTML whitespace becomes "_") or 'legacy'
 *   (spaces become "_", then URL-encoding with "%3A" restored to ":" and
 *   every other "%" turned into ".")
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			$id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
			break;
		case 'legacy':
			// This corresponds to 'noninitial' mode of the former escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			$id = urlencode( str_replace( ' ', '_', $id ) );
			$id = strtr( $id, $replace );
			break;
		default:
			// Both the mode and the method name are quoted in the message
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}

	return $id;
}
1066
/**
 * Escape every ID in a whitespace-separated list of IDs, as used by
 * attributes such as aria-labelledby.
 *
 * @param string $referenceString Whitespace-separated list of IDs
 * @return string The escaped IDs joined by single spaces; an empty string
 *   when the list holds no IDs
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# Merge the array back to a space delimited list string
	return implode( ' ', $escaped );
}
1089
/**
 * Turn a string into something usable as a CSS class name: a leading
 * digit or hyphen, control characters, spaces, ASCII punctuation and
 * non-breaking spaces become underscores, runs of underscores are
 * collapsed, and trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$escaped = preg_replace( $uglyChars, '_', $class );

	// ...and kill underscores in ugly places
	$escaped = preg_replace( '/_+/', '_', $escaped );

	return rtrim( $escaped, '_' );
}
1108
/**
 * Escape text for HTML while keeping the meaning of character references
 * already in it: the references are decoded first and the result is then
 * HTML-escaped as a whole.
 *
 * @param string $html Text that may contain character references
 * @return string Escaped HTML
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1126
/**
 * Parse the attribute text of a tag into an array of attribute name =>
 * decoded value. Names are lower-cased; attributes whose name contains
 * unacceptable characters are skipped; a later attribute of the same name
 * replaces an earlier one.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$sets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$nameRegex = self::getAttribNameRegex();
	$attribs = [];
	foreach ( $sets as $set ) {
		$name = strtolower( $set[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( $nameRegex, $name ) ) {
			continue;
		}

		// Normalize whitespace, then decode character references
		$rawValue = self::getTagAttributeCallback( $set );
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );
		$attribs[$name] = self::decodeCharReferences( $collapsed );
	}
	return $attribs;
}
1169
/**
 * Build an attribute string from an attribute array, encoding every value
 * with safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value
 * @return string Empty string for an empty array, otherwise the
 *   name="value" pairs separated by spaces, with a leading space
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $name => $value ) {
		$pairs[] = htmlspecialchars( $name, ENT_COMPAT )
			. '="' . self::safeEncodeAttribute( $value ) . '"';
	}

	if ( $pairs === [] ) {
		return '';
	}
	return ' ' . implode( ' ', $pairs );
}
1187
/**
 * Pick the attribute value out of one match set of the attribute pattern.
 *
 * @param array $set Match set; index 5 holds an unquoted value, 4 a
 *   single-quoted one, 3 a double-quoted one, 2 the whole "= value" part
 * @return string The value, or an empty string for an attribute without one
 * @throws LogicException When a "= value" part matched but no value group did
 */
private static function getTagAttributeCallback( $set ) {
	// Check unquoted, then single-quoted, then double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1214
/**
 * Collapse every run of spaces, tabs, carriage returns and newlines into a
 * single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1225
/**
 * Collapse every run of spaces and underscores in a section name into a
 * single space and trim the result.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
1237
/**
 * Rewrite every character reference and bare ampersand in the text by
 * passing each match of CHAR_REFS_REGEX through
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$handler = [ self::class, 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1259
/**
 * preg_replace_callback handler for normalizeCharReferences().
 *
 * @param string[] $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal code point, 3 = hexadecimal code point
 * @return string The normalized reference, or the HTML-escaped original
 *   match when it is a bare ampersand or an invalid numeric reference
 */
private static function normalizeCharReferencesCallback( $matches ) {
	// Later groups are only present when earlier ones did not match, so
	// they must be probed in this order.
	if ( $matches[1] !== '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] !== '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] !== '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1279
/**
 * Normalize a named entity.
 *
 * MediaWiki aliases are replaced by the entity they stand for; lt, gt, amp
 * and quot are kept by name; other known entities become decimal numeric
 * references, one per code point; unknown names get their ampersand escaped.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string
 */
private static function normalizeEntity( $name ) {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toNumericRef = static function ( $m ) {
		return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
	};
	return preg_replace_callback( '/./Ssu', $toNumericRef,
		HTMLData::$namedEntityTranslations[$name] );
}
1306
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The digits of the reference
 * @return string|null "&#N;" for a valid code point, null otherwise
 */
private static function decCharReference( $codepoint ) {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$point = intval( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1321
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint The hex digits of the reference
 * @return string|null "&#xH;" with lower-case digits for a valid code
 *   point, null otherwise
 */
private static function hexCharReference( $codepoint ) {
	# hexdec() will return a float (not an int) if $codepoint is too
	# long, so protect against that. The largest valid codepoint is
	# 0x10FFFF.
	$significantDigits = strlen( ltrim( $codepoint, '0' ) );
	if ( $significantDigits > 6 ) {
		return null;
	}

	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1340
/**
 * Tell whether a code point is allowed in a character reference: tab, line
 * feed, U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0x7e ) {
		return true;
	}
	if ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1358
/**
 * Decode the character references in the text by passing each match of
 * CHAR_REFS_REGEX through decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = [ self::class, 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1372
/**
 * Decode the character references in the text like decodeCharReferences()
 * and, when at least one match was processed, pass the result through the
 * content language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$count
	);

	if ( !$count ) {
		return $text;
	}

	// Fully qualified so that the class resolves from this namespace
	// whether or not the file imports it.
	return \MediaWiki\MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
}
1398
/**
 * preg_replace_callback handler for decodeCharReferences().
 *
 * @param string[] $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal code point, 3 = hexadecimal code point
 * @return string The decoded text; a bare ampersand is returned unchanged
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] !== '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] === '' ) {
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	# hexdec will return a float if the string is too long (!) so
	# check the length of the string first.
	$hexDigits = $matches[3];
	if ( strlen( ltrim( $hexDigits, '0' ) ) > 6 ) {
		// Invalid character reference.
		return \UtfNormal\Constants::UTF8_REPLACEMENT;
	}
	return self::decodeChar( hexdec( $hexDigits ) );
}
1420
/**
 * Convert a code point to its UTF-8 encoding, or to the Unicode
 * replacement character when validateCodepoint() rejects it.
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1435
/**
 * Decode a named entity to the text it stands for.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string The translation, or "&" followed by the (alias-resolved)
 *   name when the entity is unknown
 */
private static function decodeEntity( $name ) {
	// These are MediaWiki-specific entities, not in the HTML standard
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1452
/**
 * Look up the allowed attributes of one element in the table built by
 * setupAttributesAllowedInternal().
 *
 * @param string $element Element name
 * @return array Allowed attribute names as keys; empty when the element
 *   has no entry in the table
 */
private static function attributesAllowedInternal( $element ) {
	$allowedByElement = self::setupAttributesAllowedInternal();

	if ( isset( $allowedByElement[$element] ) ) {
		return $allowedByElement[$element];
	}
	return [];
}
1464
/**
 * Build the table that maps each element name to the attributes allowed
 * on it. The table is built on the first call and kept in a static local
 * for later calls.
 *
 * @return array<string,array<string,true>> Element name => set of allowed
 *   attribute names (stored as keys so that lookups are cheap)
 */
private static function setupAttributesAllowedInternal() {
	static $cache;

	if ( $cache !== null ) {
		return $cache;
	}

	// Extends an attribute set (names as keys) with one or two plain lists
	// of attribute names; the lists are flipped into keys on the way.
	$extend = static fn ( $base, $extra, $more = [] ) => array_merge(
		$base,
		array_fill_keys( $extra, true ),
		array_fill_keys( $more, true )
	);

	// Attributes accepted on almost every element
	$universal = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		# RDFa, as specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata, as specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	$blockLevel = $extend( $universal, [ 'align' ] );

	$cellAlign = [ 'align', 'valign' ];
	$cellAttribs = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	# The numbers below are the sections of the HTML 4.01 standard that
	# describe the element. See: https://www.w3.org/TR/html4/
	$cache = [
		# 7.5.4
		'div' => $blockLevel,
		'center' => $universal, # deprecated
		'span' => $universal,

		# 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $universal,

		# 9.2.1
		'em' => $universal,
		'strong' => $universal,
		'cite' => $universal,
		'dfn' => $universal,
		'code' => $universal,
		'samp' => $universal,
		'kbd' => $universal,
		'var' => $universal,
		'abbr' => $universal,
		# acronym

		# 9.2.2
		'blockquote' => $extend( $universal, [ 'cite' ] ),
		'q' => $extend( $universal, [ 'cite' ] ),

		# 9.2.3
		'sub' => $universal,
		'sup' => $universal,

		# 9.3.1
		'p' => $blockLevel,

		# 9.3.2
		'br' => $extend( $universal, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $universal,

		# 9.3.4
		'pre' => $extend( $universal, [ 'width' ] ),

		# 9.4
		'ins' => $extend( $universal, [ 'cite', 'datetime' ] ),
		'del' => $extend( $universal, [ 'cite', 'datetime' ] ),

		# 10.2
		'ul' => $extend( $universal, [ 'type' ] ),
		'ol' => $extend( $universal, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $universal, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $universal,
		'dd' => $universal,
		'dt' => $universal,

		# 11.2.1
		'table' => $extend( $universal, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $blockLevel,

		# 11.2.3
		'thead' => $universal,
		'tfoot' => $universal,
		'tbody' => $universal,

		# 11.2.4
		'colgroup' => $extend( $universal, [ 'span' ] ),
		'col' => $extend( $universal, [ 'span' ] ),

		# 11.2.5
		'tr' => $extend( $universal, [ 'bgcolor' ], $cellAlign ),

		# 11.2.6
		'td' => $extend( $universal, $cellAttribs, $cellAlign ),
		'th' => $extend( $universal, $cellAttribs, $cellAlign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $universal, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $universal, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $universal, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $universal, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $universal, [ 'type', 'src' ] ),
		'track' => $extend( $universal, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $universal,
		'b' => $universal,
		'i' => $universal,
		'big' => $universal,
		'small' => $universal,
		'strike' => $universal,
		's' => $universal,
		'u' => $universal,

		# 15.2.2
		'font' => $extend( $universal, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $universal, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $universal,
		# rbc
		'rb' => $universal,
		'rp' => $universal,
		'rt' => $universal, # 'rbspan' is not allowed here
		'rtc' => $universal,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		# HTML 5 section 4.5
		'figure' => $universal,
		'figcaption' => $universal,

		# HTML 5 section 4.6
		'bdi' => $universal,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $universal, [ 'value' ] ),
		'time' => $extend( $universal, [ 'datetime' ] ),
		'mark' => $universal,

		// meta and link are only permitted by internalRemoveHtmlTags when
		// Microdata is enabled, so no conditional is added to hide them.
		// They are also only valid in wikitext as Microdata elements
		// (validateTag rejects tags missing the attributes needed for
		// Microdata), so the common attributes, which would serve no
		// purpose on them, are left out.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $universal,
	];

	return $cache;
}
1709
/**
 * Return the text content of an HTML fragment with its tags removed.
 *
 * The fragment is tokenized with RemexHtml; a RemexStripTagHandler
 * collects the text, which is then passed through normalizeWhitespace().
 *
 * @param string $html HTML fragment
 * @return string Text with whitespace normalized
 */
public static function stripAllTags( $html ) {
	$textCollector = new RemexStripTagHandler;
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// character references are NOT ignored: we want them decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Tokenize $html; the handler accumulates the text it sees
	$tokenizer = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1736
/**
 * Build a DOCTYPE declaration whose internal subset declares the named
 * entities listed in HTMLData::$namedEntityTranslations.
 *
 * Each declared entity expands to whatever normalizeEntity() returns for
 * it. Entities are left out when their name lacks the trailing ";" or
 * when normalizeEntity() returns them unchanged.
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = '';

	foreach ( array_keys( HTMLData::$namedEntityTranslations ) as $entity ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( substr( $entity, -1 ) !== ';' ) {
			continue;
		}

		$expansion = self::normalizeEntity( $entity );
		// An unchanged entity (&lt; &gt; etc) gets no declaration
		if ( $expansion !== $entity ) {
			$name = substr( $entity, 0, -1 );
			$declarations .= "<!ENTITY $name \"$expansion\">";
		}
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1767
/**
 * Clean up a URL before it is turned into an external link.
 *
 * Steps performed:
 *  - character references are decoded;
 *  - brackets, angle brackets, double quotes, pipes, DEL and the bytes
 *    0x00-0x20 are percent-encoded with urlencode();
 *  - if the URL has the form "scheme:[//host]rest", whitespace and the
 *    listed invisible/ignorable characters are removed from the host part,
 *    and a percent-encoded bracketed IPv6 host is turned back into its
 *    literal "[...]" form.
 *
 * A URL that does not have the "scheme:" form is returned after the first
 * two steps only.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ self::class, 'cleanUrlCallback' ], $url );

	# Validate hostname portion
	$matches = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		return $url;
	}
	[ /* $whole */, $protocol, $host, $rest ] = $matches;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	//
	// The last alternative is deliberately not followed by "|": a trailing
	// bar would add an empty alternative that matches at every position.
	$strip = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
		# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
		# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
		# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
		# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
		# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
		# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
		# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
		# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
		# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
		# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
		# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
		# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
		# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
		# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
		# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}] # <reserved-E01F0>..
		# <reserved-E0FFF>
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these: the brackets
	// were percent-encoded by the escaping step above.
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
	) {
		$host = '//[' . $matches[1] . ']' . $matches[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1852
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encodes the
 * matched text with urlencode().
 *
 * @param string[] $matches Match array; only the whole match (index 0) is used
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	$wholeMatch = $matches[0];

	return urlencode( $wholeMatch );
}
1860
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook is run first. If the hook run returns false,
 * the value it stored in $result is returned as is. Otherwise the address
 * is checked against a pattern made of a liberal local part, an "@" and
 * one or more dot-separated domain labels of letters, digits and hyphens.
 *
 * The whole string must match: nothing is tolerated after the domain,
 * not even a single trailing newline.
 *
 * @param string $addr E-mail address
 * @return bool Whether $addr matches the pattern; when the hook aborts,
 *   whatever the hook put in $result
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note the strings below end up enclosed in brackets [], which
	// makes the hyphen "-" a range indicator. Hence it is double
	// backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a final newline, so an
	// address such as "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1914}
1915
// Make the namespaced Sanitizer class also reachable under the global name 'Sanitizer'.
1920class_alias( Sanitizer::class, 'Sanitizer' );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition Sanitizer.php:90
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static validateEmail( $addr)
Does a string look like an e-mail address?
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition Sanitizer.php:82
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
A collection of static methods to play with strings.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.