MediaWiki master
Sanitizer.php
Go to the documentation of this file.
1<?php
27namespace MediaWiki\Parser;
28
29use InvalidArgumentException;
30use LogicException;
34use StringUtils;
35use UnexpectedValueException;
36use Wikimedia\RemexHtml\HTMLData;
37use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
41
46class Sanitizer {
/**
 * Matches the character references handled by the normalize/decode
 * callbacks of this class. Capture groups:
 *  1. a named entity, including its trailing semicolon
 *  2. the digits of a decimal numeric reference
 *  3. the digits of a hexadecimal numeric reference
 *  4. a bare ampersand that is not part of any of the above
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|(&)/x';

/**
 * Splits the text that follows a "<" into: (1) an optional closing
 * slash, (2) the tag name, (3) the raw attribute text, (4) the closing
 * ">" or "/>", and (5) the text after the tag.
 */
private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive pattern matching "javascript" or "vbscript" when it
 * appears at the start of the value, after whitespace, or after a CSS
 * comment terminator, and is followed by a non-word character or the
 * end of the value.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/** Matches attribute names of the form "xmlns:<name>". */
private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/** Index into $wgFragmentMode selecting the primary ID escaping mode. */
public const ID_PRIMARY = 0;

/** Index into $wgFragmentMode selecting the fallback ID escaping mode. */
public const ID_FALLBACK = 1;

/**
 * Non-standard, MediaWiki-specific entity names (keys include the
 * trailing semicolon) mapped to the standard entity they stand for.
 */
private const MW_ENTITY_ALIASES = [
	'רלמ;' => 'rlm;',
	'رلم;' => 'rlm;',
];
100
/**
 * Cache for the pattern built by getAttribsRegex(); null until first use.
 * @var string|null
 */
private static $attribsRegex;

/**
 * Return the regular expression that matches one attribute inside a tag.
 *
 * The pattern is assembled on first call and cached in self::$attribsRegex.
 * Group 1 is the attribute name, group 2 the optional "= value" part, and
 * groups 3, 4 and 5 hold a double-quoted, single-quoted or unquoted value
 * respectively.
 *
 * @return string
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
		({$ws}*={$ws}*
		(?:
			# The attribute value: quoted or alone
			\"([^\"]*)(?:\"|\$)
			| '([^']*)(?:'|\$)
			| (((?!{$ws}|>).)*)
		)
		)?/sxu";

	return self::$attribsRegex;
}
131
/**
 * Cache for the pattern built by getAttribNameRegex(); null until first use.
 * @var string|null
 */
private static $attribNameRegex;

/**
 * Return the regular expression that an acceptable attribute name must
 * match in full: a first character that is a colon, underscore, letter or
 * number, followed by any of those plus dots and hyphens.
 *
 * @return string
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$nameStart = "[:_\p{L}\p{N}]";
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$nameStart}{$nameChar}*)$/sxu";

	return self::$attribNameRegex;
}
149
/**
 * Return the tag lookup tables used when deciding which HTML elements to
 * let through.
 *
 * The base tables are built once per value of $wgAllowImageTag and kept in
 * function-static variables; the result for the common case (no extra and
 * no removed tags) is additionally cached as a whole.
 *
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to drop from the allowed elements
 * @return array Map with the keys 'htmlpairs', 'htmlsingle',
 *   'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags',
 *   'htmlsingleallowed' and 'htmlelements'; each value is a
 *   tag name => true lookup table.
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	static $commonCase, $staticInitialised;
	$isCommonCase = ( $extratags === [] && $removetags === [] );
	// Fast path: reuse the fully assembled result when nothing is customised
	// and the configuration has not changed since it was built.
	if ( $staticInitialised === $wgAllowImageTag && $isCommonCase && $commonCase ) {
		return $commonCase;
	}

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (like in the screwed up test system) we will re-initialise the settings.
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# These tags can be self-closed. For tags not also on
		# $htmlsingleonly, a self-closed tag will be emitted as
		# an empty element (open-tag/close-tag pair).
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
		foreach ( $vars as $var ) {
			// Variable-variable: rewrites each named static list in place
			$$var = array_fill_keys( $$var, true );
		}
		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_fill_keys( $extratags, true );
	$removetags = array_fill_keys( $removetags, true );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	$result = [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
	// Only the uncustomised result is safe to reuse on later calls
	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
254
/**
 * Deprecated public entry point for tag stripping; emits a deprecation
 * warning and forwards all arguments unchanged to internalRemoveHtmlTags().
 *
 * @deprecated since 1.38
 * @param string $text Text to clean
 * @param callable|null $processCallback Callback applied to the attribute
 *   text of each recognised tag
 * @param array|bool $args Extra argument handed to $processCallback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string The cleaned text
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );
	// Delegate to the internal implementation and hand its result back
	return self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
}
293
/**
 * Strip HTML comments from $text and escape every tag that is not a
 * recognised, valid element, while normalising the attributes of the tags
 * that are kept.
 *
 * The text is split on "<"; each piece is matched against
 * ELEMENT_BITS_REGEX. Pieces that do not form an allowed tag are emitted
 * with "<" and ">" escaped.
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback If callable, invoked for each
 *   allowed tag as $processCallback( &$params, $args ) so it can rewrite
 *   the raw attribute text
 * @param array|bool $args Passed through to $processCallback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string The cleaned text
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	// Everything before the first "<" is plain text; only ">" needs escaping
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );

	# this might be possible using remex tidy itself
	foreach ( $bits as $x ) {
		if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
			[ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, [ &$params, $args ] );
				}

				if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
					// Remove the self-closing slash, to be consistent
					// with HTML5 semantics. T134423
					$brace = '>';
				}
				if ( !self::validateTag( $params, $t ) ) {
					$badtag = true;
				}

				$newparams = self::fixTagAttributes( $params, $t );
				if ( !$badtag ) {
					if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
						# Interpret self-closing tags as empty tags even when
						# HTML 5 would interpret them as start tags. Such input
						# is commonly seen on Wikimedia wikis with this intention.
						$brace = "></$t>";
					}

					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
		}
		// Not an allowed tag (or not a tag at all): emit it as escaped text
		$text .= '&lt;' . str_replace( '>', '&gt;', $x );
	}
	return $text;
}
375
/**
 * Remove disallowed tags from $text by tokenizing it with RemexHtml and
 * re-serializing the result.
 *
 * @param string $text Text to clean
 * @param array $options Recognised keys:
 *   - extraTags: additional tag names to allow
 *   - removeTags: tag names to disallow
 *   - attrCallback (internal): callback handed to the tag remover
 *   - attrCallbackArgs (internal): arguments for attrCallback
 * @return string The serialized, cleaned text
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	// Normalize character references up front so that HTML5-style
	// references lacking the trailing semicolon are not honoured:
	// in wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData(
		$options['extraTags'] ?? [],
		$options['removeTags'] ?? []
	);

	// Output side of the pipeline: tree builder feeding a serializer
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );

	// Input side: the tokenizer hands tokens to the tag remover, which
	// forwards what survives to the dispatcher.
	$remover = new RemexRemoveTagHandler(
		new RemexDispatcher( $treeBuilder ), $text, $tagData,
		$options['attrCallback'] ?? null,
		$options['attrCallbackArgs'] ?? []
	);
	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
436
/**
 * Strip every terminated HTML comment from $text.
 *
 * When a comment, together with any spaces around it, occupies a whole
 * line (newline before and after), the entire line is collapsed to a
 * single newline. An unterminated comment, and everything after it, is
 * left untouched.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Point just past the closing marker
		$close += 3;

		# Widen the span to cover the character before the comment and
		# any run of spaces on either side of it
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$ownsLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";

		$text = $ownsLine
			# Comment plus surrounding spaces and both newlines become one newline
			? substr_replace( $text, "\n", $from, $length + 1 )
			# Otherwise drop only the comment itself
			: substr_replace( $text, '', $open, $close - $open );
	}
	return $text;
}
479
/**
 * Check the extra requirements placed on <meta> and <link>: both need an
 * itemprop attribute, <meta> additionally needs content and <link> needs
 * href. Every other element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool True if the tag may be kept
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// Without itemprop="" these elements are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> carries the value of the itemprop in content=""
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> carries it in href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
514
/**
 * Filter a decoded attribute array against the attributes allowed for the
 * given element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array The attributes that passed validation
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
534
/**
 * Filter a decoded attribute array, keeping only attributes that are
 * allowed and whose values pass the per-attribute checks below.
 *
 * Besides the names listed in $allowed, xmlns:* declarations and
 * non-reserved data-* attributes are accepted. Values of style, id and the
 * ARIA id-reference attributes are rewritten; attributes with suspicious
 * or disallowed values are dropped altogether.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Lookup table keyed by allowed attribute name.
 *   A sequential list of names is still accepted but deprecated.
 * @return array The attributes that passed validation
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	// A value must start with an allowed protocol and contain no whitespace
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		if ( (
			!preg_match( '/^data-[^:]*$/i', $attribute ) &&
			!array_key_exists( $attribute, $allowed )
		) || self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( $attribute === 'aria-describedby'
			|| $attribute === 'aria-flowto'
			|| $attribute === 'aria-labelledby'
			|| $attribute === 'aria-owns'
		) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			// Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; // drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		if ( $attribute === 'tabindex' && $value !== '0' ) {
			// Only allow tabindex of 0, which is useful for accessibility.
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'] );
		unset( $out['itemid'] );
		unset( $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
651
/**
 * Tell whether an attribute name falls in one of the data-* prefixes that
 * are reserved and therefore must not come from user input.
 *
 * Reserved prefixes (matched case-insensitively):
 *  - data-ooui: reserved for ooui
 *  - data-mw and data-parsoid: reserved for parsoid
 *  - data-mw-<name>: reserved for extensions or core that need to pass
 *    trusted data to the client
 *
 * Namespaced names are not considered, since user-generated HTML can no
 * longer use them.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	$hits = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );

	return $hits === 1;
}
669
/**
 * Merge two attribute arrays. Values from $b win over values from $a,
 * except for "class": when both sides give a different string value, the
 * class names of both are combined, de-duplicated and joined with spaces.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'], $b['class'] ) ) {
		return $merged;
	}
	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
692
/**
 * Normalize a CSS value so that it can be checked for dangerous content:
 * character references are decoded, CSS backslash escapes and line
 * continuations are resolved, and comments are replaced by spaces.
 *
 * A value consisting of nothing but a single comment is passed through
 * with the comment intact.
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string The normalized text
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	// Built once per request and reused on later calls
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	return $value;
}
752
/**
 * Normalize a CSS value and reject it if it contains control characters
 * or constructs that can load external resources or run script.
 *
 * Rejected input is replaced by a CSS comment naming the reason, so the
 * return value is always safe to emit.
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string The normalized value, or a placeholder comment
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		// Each opening parenthesis below must be escaped with a plain
		// backslash; a stray invisible character after the backslash
		// would leave the parenthesis unescaped and break the pattern.
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
794
/**
 * preg_replace_callback handler used by normalizeCss() to resolve one CSS
 * backslash sequence.
 *
 * @param string[] $matches Groups of the decode regex: 1 = line
 *   continuation, 2 = hex character number, 3 = escaped character
 * @return string Replacement text
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: the backslash-newline pair simply vanishes
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		// Lone backslash at the end of the string
		$decoded = '\\';
	}

	// These characters need to stay escaped inside strings; emit a clean
	// hex escape so that clients do not trip over the sequence.
	$mustStayEscaped = in_array( $decoded, [ "\n", '"', "'", '\\' ], true );
	if ( $mustStayEscaped ) {
		return '\\' . dechex( ord( $decoded ) ) . ' ';
	}

	// The escape was unnecessary: return the plain character
	return $decoded;
}
821
/**
 * Turn the raw attribute text of a tag into a cleaned attribute string:
 * decode it, validate it for the element, optionally sort by attribute
 * name, and re-encode it.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Encoded attributes with a leading space, or '' if there
 *   is nothing to output
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
857
/**
 * Encode a value for use inside a quoted HTML attribute.
 *
 * In addition to the usual HTML escaping (both quote styles), newlines,
 * carriage returns and tabs are turned into numeric references: whitespace
 * is normalized when attributes are decoded, so these characters would
 * otherwise not survive a round trip.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
879
/**
 * Replace the ordinary spaces used by French punctuation rules with
 * $space: a space before one of ? : ; ! % » › (unless that character is
 * followed by a word character), and a space after « or ‹.
 *
 * @param string $text Text to process
 * @param string $space Replacement for the space; defaults to a
 *   non-breaking space reference
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when used as a preg_replace() replacement string.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
900
/**
 * Encode an attribute value so that it is safe both as HTML and against
 * later wikitext parsing passes.
 *
 * After the regular attribute encoding, characters and keywords that the
 * parser would act on (templates, links, magic links, tables, double
 * underscores, URL protocols) are replaced by numeric references.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		// '<', '>', and '"' should never happen, as they indicate that we've received invalid input which should
		// have been escaped.
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$armored = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Stupid hack: break up anything that looks like a URL protocol by
	# encoding the colons inside it
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColons = static function ( $hit ) {
		return str_replace( ':', '&#58;', $hit[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColons, $armored );
}
941
/**
 * Escape a string for use as the value of an HTML id attribute, using the
 * escaping mode configured in $wgFragmentMode at index $mode.
 *
 * @param string $id String to escape
 * @param int $mode One of the ID_* constants
 * @return string|false The escaped id, or false if a non-primary mode was
 *   requested and is not configured
 * @throws UnexpectedValueException If the primary mode is not configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
971
/**
 * Escape a string for use as the fragment part of a link, using the
 * primary mode configured in $wgFragmentMode.
 *
 * @param string $id String to escape
 * @return string
 * @throws UnexpectedValueException If no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
997
/**
 * Escape a string for use as the fragment part of a link to an external
 * interwiki, using the mode configured in
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// The mode comes from site configuration, so the global has to be
	// declared here; without it the variable below would be undefined.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1014
/**
 * Escape an id for use in a URL fragment. On top of the regular id
 * escaping, in html5 mode every percent sign that starts something that
 * looks like a percent-encoded byte is itself encoded as %25.
 *
 * @param string $id String to escape
 * @param string $mode Escaping mode, 'html5' or 'legacy'
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );

	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1031
/**
 * Escape an id according to the given mode.
 *
 *  - html5: tab, LF, FF, CR and space become underscores
 *  - legacy: spaces become underscores, the result is URL-encoded, then
 *    "%3A" is turned back into ":" and every remaining "%" into "."
 *
 * The id is first cut to at most 1024 characters.
 *
 * @param string $id String to escape
 * @param string $mode Escaping mode, 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException On any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	// Loose comparisons on purpose: they match the semantics of a switch.
	if ( $mode == 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode == 'legacy' ) {
		// This corresponds to 'noninitial' mode of the former escapeId()
		$encoded = urlencode( str_replace( ' ', '_', $id ) );
		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1068
/**
 * Escape a whitespace-separated list of id references, as found in
 * attributes such as aria-labelledby.
 *
 * @param string $referenceString Whitespace-separated ids
 * @return string The escaped ids joined by single spaces; '' if the list
 *   contains no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Break the list into its non-empty tokens
	$tokens = preg_split( '/\s+/', (string)$referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = array_map(
		static function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Glue the tokens back together with single spaces
	return implode( ' ', $escaped );
}
1091
/**
 * Turn an arbitrary string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters and spaces, ASCII
 * punctuation and the non-breaking space are replaced by underscores;
 * runs of underscores are then collapsed and trailing ones removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$cleaned = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ...and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $cleaned );

	return rtrim( $collapsed, '_' );
}
1110
/**
 * Escape text for HTML output while keeping character references that
 * were already in the input working: references are decoded first and the
 * result is then escaped as a whole.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	# Escape ' as well as ", as a matter of course. ENT_SUBSTITUTE makes
	# sure that incorrectly truncated multibyte characters do not cause
	# the entire string to disappear.
	return htmlspecialchars(
		self::decodeCharReferences( $html ),
		ENT_QUOTES | ENT_SUBSTITUTE
	);
}
1128
/**
 * Parse the raw attribute text of a tag into an attribute array.
 *
 * Names are lowercased; attributes whose name contains unacceptable
 * characters are skipped. Values have their whitespace collapsed and
 * trimmed and their character references decoded. If a name occurs more
 * than once, the last occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$matches = [];
	$found = preg_match_all(
		self::getAttribsRegex(),
		$text,
		$matches,
		PREG_SET_ORDER
	);
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );

		// Only keep attributes whose name is made of acceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$raw = self::getTagAttributeCallback( $match );

			// Normalize whitespace
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			// Decode character references
			$decoded[$name] = self::decodeCharReferences( $collapsed );
		}
	}
	return $decoded;
}
1171
/**
 * Build an attribute string from an attribute array, encoding names and
 * values.
 *
 * @param array $assoc_array Attribute name => value
 * @return string ' name="value" ...' with a leading space, or '' when the
 *   array is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $name => $rawValue ) {
		$pairs[] = htmlspecialchars( $name, ENT_COMPAT )
			. '="' . self::safeEncodeAttribute( $rawValue ) . '"';
	}

	if ( $pairs === [] ) {
		return '';
	}
	return ' ' . implode( ' ', $pairs );
}
1189
/**
 * Pick the attribute value out of one match of the attribute regex.
 *
 * @param array $set Match groups: 5 = unquoted value, 4 = single-quoted
 *   value, 3 = double-quoted value, 2 = the whole "= value" part
 * @return string The value, or "" for an attribute given without a value
 * @throws LogicException If a "= value" part matched but no value group did
 */
private static function getTagAttributeCallback( $set ) {
	// Unquoted, single-quoted, double-quoted: the highest group present wins
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1216
/**
 * Collapse every run of spaces, tabs, carriage returns and line feeds
 * into a single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1227
/**
 * Normalize a section name: every run of spaces and underscores becomes a
 * single space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $spaced );
}
1239
/**
 * Normalize every character reference in $text by running each match of
 * CHAR_REFS_REGEX through normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$normalizeOne = static function ( $matches ) {
		return self::normalizeCharReferencesCallback( $matches );
	};

	return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizeOne, $text );
}
1261
/**
 * preg_replace_callback handler for normalizeCharReferences().
 *
 * @param string[] $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal digits, 3 = hex digits
 * @return string The normalized reference; if the match is a bare
 *   ampersand or an invalid numeric reference, the HTML-escaped original
 */
private static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;

	if ( $matches[1] !== '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] !== '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] !== '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	// Nothing usable came out of it: show the input literally
	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1281
/**
 * Normalize a named entity.
 *
 *  - MediaWiki-specific aliases are replaced by the entity they stand for
 *  - lt, gt, amp and quot are kept in named form
 *  - any other known entity is expanded into decimal numeric references,
 *    one per code point
 *  - unknown names are shown literally, with the ampersand escaped
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string
 */
private static function normalizeEntity( $name ) {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	// Keep these in word form
	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toNumericReference = static function ( $m ) {
		return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
	};
	return preg_replace_callback(
		'/./Ssu',
		$toNumericReference,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1308
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint The digits of the reference
 * @return string|null The reference in canonical "&#N;" form, or null if
 *   the code point is not allowed
 */
private static function decCharReference( $codepoint ) {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$value = intval( $codepoint );

	return self::validateCodepoint( $value )
		? sprintf( '&#%d;', $value )
		: null;
}
1323
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint The hex digits of the reference
 * @return string|null The reference in canonical lowercase "&#xN;" form,
 *   or null if the code point is not allowed
 */
private static function hexCharReference( $codepoint ) {
	# hexdec() will return a float (not an int) if $codepoint is too
	# long, so protect against that. The largest valid codepoint is
	# 0x10FFFF.
	$significantDigits = strlen( ltrim( $codepoint, '0' ) );
	if ( $significantDigits > 6 ) {
		return null;
	}

	$value = hexdec( $codepoint );

	return self::validateCodepoint( $value )
		? sprintf( '&#x%x;', $value )
		: null;
}
1342
/**
 * Tell whether a code point may appear in output.
 *
 * Accepted are tab, line feed, U+0020 to U+007E, U+00A0 to U+D7FF,
 * U+E000 to U+FFFD and U+10000 to U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$allowedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $allowedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1360
/**
 * Decode every character reference in $text by running each match of
 * CHAR_REFS_REGEX through decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decodeOne = static function ( $matches ) {
		return self::decodeCharReferencesCallback( $matches );
	};

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decodeOne, $text );
}
1374
/**
 * Decode every character reference in $text and, if the pattern matched
 * at least once, pass the result through the content language's
 * normalize() method.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$count = 0;
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$count
	);

	// Nothing matched, so the text is unchanged and needs no normalization
	if ( !$count ) {
		return $text;
	}

	// Fully qualified on purpose: this file's visible use block does not
	// import MediaWikiServices, and an unqualified name would be looked up
	// inside the MediaWiki\Parser namespace.
	return \MediaWiki\MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
}
1400
/**
 * preg_replace_callback handler for the decodeCharReferences*() methods.
 *
 * @param string[] $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal digits, 3 = hex digits
 * @return string The decoded text for this match
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}

	if ( $matches[3] == '' ) {
		# Remaining case should be an ampersand by itself
		return $matches[0];
	}

	# hexdec will return a float if the string is too long (!) so
	# check the length of the string first.
	$hexDigits = $matches[3];
	if ( strlen( ltrim( $hexDigits, '0' ) ) > 6 ) {
		// Invalid character reference.
		return \UtfNormal\Constants::UTF8_REPLACEMENT;
	}
	return self::decodeChar( hexdec( $hexDigits ) );
}
1422
/**
 * Convert a code point to its UTF-8 encoding.
 *
 * @param int $codepoint
 * @return string The encoded character, or the UTF-8 replacement
 *   character if the code point is not allowed
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1437
/**
 * Decode a named entity.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string The text the entity stands for; for an unknown name, the
 *   name with its ampersand put back in front
 */
private static function decodeEntity( $name ) {
	// MediaWiki-specific entities, not in the HTML standard, are first
	// mapped to the standard entity they stand for
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
1454
/**
 * Look up the set of attributes allowed on a given element.
 *
 * @param string $element Element name
 * @return array Map of attribute name => true; empty for elements that
 *  have no entry in the table
 */
private static function attributesAllowedInternal( $element ) {
	$allowedByElement = self::setupAttributesAllowedInternal();
	if ( isset( $allowedByElement[$element] ) ) {
		return $allowedByElement[$element];
	}

	return [];
}
1466
/**
 * Build the table of allowed attributes per element.
 *
 * The table is computed on the first call and kept in a function-level
 * static for later calls. Each entry maps attribute name => true so that
 * membership can be tested by key.
 *
 * @return array Map of element name => ( attribute name => true )
 */
private static function setupAttributesAllowedInternal() {
	static $allowed;

	if ( $allowed === null ) {
		// Returns $base extended with every name from the given lists as a
		// key (value true). Existing keys keep their position.
		$extend = static function ( $base, $names, $moreNames = [] ) {
			foreach ( [ $names, $moreNames ] as $list ) {
				foreach ( $list as $attribute ) {
					$base[$attribute] = true;
				}
			}
			return $base;
		};

		$common = $extend( [], [
			# HTML
			'id',
			'class',
			'style',
			'lang',
			'dir',
			'title',
			'tabindex',

			# WAI-ARIA
			'aria-describedby',
			'aria-flowto',
			'aria-hidden',
			'aria-label',
			'aria-labelledby',
			'aria-level',
			'aria-owns',
			'role',

			# RDFa, as specified in section 9 of
			# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			'about',
			'property',
			'resource',
			'datatype',
			'typeof',

			# Microdata, as specified by
			# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
			'itemid',
			'itemprop',
			'itemref',
			'itemscope',
			'itemtype',
		] );

		// Attribute sets shared by several elements below.
		$block = $extend( $common, [ 'align' ] );
		$quotation = $extend( $common, [ 'cite' ] );
		$edit = $extend( $common, [ 'cite', 'datetime' ] );
		$columns = $extend( $common, [ 'span' ] );

		$tablealign = [ 'align', 'valign' ];
		$tablecell = [
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor', # deprecated
		];
		$cell = $extend( $common, $tablecell, $tablealign );

		# Section numbers refer to the HTML 4.01 standard:
		# https://www.w3.org/TR/html4/
		$allowed = [
			# 7.5.4
			'div' => $block,
			'center' => $common, # deprecated
			'span' => $common,

			# 7.5.5
			'h1' => $block,
			'h2' => $block,
			'h3' => $block,
			'h4' => $block,
			'h5' => $block,
			'h6' => $block,

			# 7.5.6 (address) has no entry

			# 8.2.4
			'bdo' => $common,

			# 9.2.1 (acronym has no entry)
			'em' => $common,
			'strong' => $common,
			'cite' => $common,
			'dfn' => $common,
			'code' => $common,
			'samp' => $common,
			'kbd' => $common,
			'var' => $common,
			'abbr' => $common,

			# 9.2.2
			'blockquote' => $quotation,
			'q' => $quotation,

			# 9.2.3
			'sub' => $common,
			'sup' => $common,

			# 9.3.1
			'p' => $block,

			# 9.3.2
			'br' => $extend( $common, [ 'clear' ] ),

			# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
			'wbr' => $common,

			# 9.3.4
			'pre' => $extend( $common, [ 'width' ] ),

			# 9.4
			'ins' => $edit,
			'del' => $edit,

			# 10.2
			'ul' => $extend( $common, [ 'type' ] ),
			'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
			'li' => $extend( $common, [ 'type', 'value' ] ),

			# 10.3
			'dl' => $common,
			'dd' => $common,
			'dt' => $common,

			# 11.2.1
			'table' => $extend( $common, [
				'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			] ),

			# 11.2.2
			'caption' => $block,

			# 11.2.3
			'thead' => $common,
			'tfoot' => $common,
			'tbody' => $common,

			# 11.2.4
			'colgroup' => $columns,
			'col' => $columns,

			# 11.2.5
			'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),

			# 11.2.6
			'td' => $cell,
			'th' => $cell,

			# 12.2
			# NOTE: <a> is not allowed directly, but this list of allowed
			# attributes is used from the Parser object.
			# rel/rev are there especially for RDFa.
			'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ),

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true.
			'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
			# Attributes for A/V tags added in T163583 / T133673
			'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
			'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
			'source' => $extend( $common, [ 'type', 'src' ] ),
			'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

			# 15.2.1
			'tt' => $common,
			'b' => $common,
			'i' => $common,
			'big' => $common,
			'small' => $common,
			'strike' => $common,
			's' => $common,
			'u' => $common,

			# 15.2.2 (basefont has no entry)
			'font' => $extend( $common, [ 'size', 'color', 'face' ] ),

			# 15.3
			'hr' => $extend( $common, [ 'width' ] ),

			# HTML Ruby annotation text module, simple ruby only
			# (rbc has no entry; rbspan is not enabled on rt).
			# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
			'ruby' => $common,
			'rb' => $common,
			'rp' => $common,
			'rt' => $common,
			'rtc' => $common,

			# MathML root element, where used for extensions.
			# 'title' may not be 100% valid here; it's XHTML.
			# https://www.w3.org/TR/REC-MathML/
			'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

			# HTML 5 section 4.5
			'figure' => $common,
			'figcaption' => $common,

			# HTML 5 section 4.6
			'bdi' => $common,

			# HTML5 elements, defined by:
			# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
			'data' => $extend( $common, [ 'value' ] ),
			'time' => $extend( $common, [ 'datetime' ] ),
			'mark' => $common,

			// meta and link are only permitted by internalRemoveHtmlTags when
			// Microdata is enabled, so no conditional is added to hide them.
			// They are also only valid in wikitext as Microdata elements
			// (i.e. validateTag rejects tags missing the attributes needed for
			// Microdata), so $common attributes with no purpose are left out.
			'meta' => $extend( [], [ 'itemprop', 'content' ] ),
			'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

			# HTML 5 section 4.3.5
			'aside' => $common,
		];
	}

	return $allowed;
}
1711
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed.
 *
 * The fragment is tokenized with RemexHtml; the text collected by the
 * RemexStripTagHandler is then passed through normalizeWhitespace().
 *
 * @param string $html HTML fragment
 * @return string Text content of the fragment
 */
public static function stripAllTags( $html ) {
	$textCollector = new RemexStripTagHandler;

	// Character references are deliberately not ignored: they should be
	// decoded by the tokenizer.
	$tokenizerOptions = [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Use RemexHtml to tokenize $html and extract the text.
	$tokenizer = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1738
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for every named entity in
 * HTMLData::$namedEntityTranslations that ends in a semicolon, using
 * the output of normalizeEntity() as its replacement text.
 *
 * @return string DOCTYPE declaration with an internal entity subset
 */
public static function hackDocType() {
	$out = "<!DOCTYPE html [\n";
	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		if ( substr( $entity, -1 ) !== ';' ) {
			// Some HTML entities omit the trailing semicolon;
			// wikitext does not permit these.
			continue;
		}
		$name = substr( $entity, 0, -1 );
		$expansion = self::normalizeEntity( $entity );
		// $entity is the bare name plus semicolon ("lt;"), while an entity
		// left in word form comes back with its leading ampersand ("&lt;"),
		// so the comparison has to include the "&". Comparing against the
		// bare $entity never matched and would declare entities in terms of
		// themselves.
		// NOTE(review): normalizeEntity() is defined outside this block;
		// confirm it returns the "&name;" form for entities it keeps.
		if ( "&$entity" === $expansion ) {
			// Skip &lt; &gt; etc
			continue;
		}
		$out .= "<!ENTITY $name \"$expansion\">";
	}
	$out .= "]>\n";
	return $out;
}
1769
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and tidy the host portion.
 *
 * The URL is split into protocol ("scheme:"), optional "//host" and the
 * rest. Only the host part has invisible/ignorable characters stripped
 * and percent-encoded IPv6 brackets restored. A string that does not
 * split that way is returned after the first two steps only.
 *
 * @param string $url
 * @return string Cleaned URL
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]+/',
		static fn ( $run ) => urlencode( $run[0] ),
		$url
	);

	# Split into protocol, host and remainder; bail out if that fails
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	$ignorable = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
		# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
		# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
		# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
		# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
		# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
		# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
		# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
		# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
		# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
		# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
		# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
		# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
		# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
		# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
		# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
		# <reserved-E0FFF>
		/xuD";
	$host = preg_replace( $ignorable, '', $host );

	// IPv6 host names are bracketed with []. The escaping step above
	// percent-encoded the brackets, so url-decode them again.
	$ipv6 = [];
	$looksBracketed = str_starts_with( $host, "//%5B" );
	if ( $looksBracketed &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1854
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook is consulted first; when a handler aborts
 * the hook, the $result it set is returned as-is. Otherwise the address
 * is matched against a liberal pattern: one or more RFC 5322 "atext"
 * characters or dots, an "@", then one or more dot-separated labels made
 * of letters, digits and hyphens.
 *
 * The whole string must match: a trailing newline is rejected rather
 * than being silently tolerated by "$".
 *
 * @param string $addr E-mail address
 * @return bool|null True/false from the pattern check, or whatever the
 *  hook handler stored in $result (null if it stored nothing)
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject;
	// without it "$" also matches just before a final newline, so
	// "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # 'at' sign between user and domain
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1908}
1909
// Register the un-namespaced name 'Sanitizer' as an alias of this class.
1914 class_alias( Sanitizer::class, 'Sanitizer' );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition Sanitizer.php:90
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static validateEmail( $addr)
Does a string look like an e-mail address?
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition Sanitizer.php:82
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
A collection of static methods to play with strings.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.