MediaWiki REL1_39
Sanitizer.php
Go to the documentation of this file.
1<?php
31use Wikimedia\RemexHtml\HTMLData;
32use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
33use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
34use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
35use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
36
41class Sanitizer {
48 private const CHAR_REFS_REGEX =
49 '/&([A-Za-z0-9\x80-\xff]+;)
50 |&\#([0-9]+);
51 |&\#[xX]([0-9A-Fa-f]+);
52 |(&)/x';
53
58 private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
59
69 private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
70 private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
71
77 public const ID_PRIMARY = 0;
78
85 public const ID_FALLBACK = 1;
86
91 private const MW_ENTITY_ALIASES = [
92 'רלמ;' => 'rlm;',
93 'رلم;' => 'rlm;',
94 ];
95
99 private static $attribsRegex;
100
/**
 * Lazily build and cache the regex used to split a tag's attribute text
 * into name / value pairs.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern (flags s, x, u)
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		// Already built during this request
		return self::$attribsRegex;
	}

	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	// A leading "=" is accepted as the first character of a name
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			($ws*=$ws*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$ws|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
126
130 private static $attribNameRegex;
131
/**
 * Lazily build and cache the regex that a (lower-cased) attribute name
 * must match in full to be accepted by decodeTagAttributes().
 *
 * The first character may be ":", "_", a Unicode letter or a Unicode
 * number; later characters may additionally be "." or "-".
 *
 * @return string PCRE pattern (flags s, x, u)
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$firstChar = "[:_\p{L}\p{N}]";
	$laterChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

	return self::$attribNameRegex;
}
144
/**
 * Return the tables describing which HTML tags are recognised and how
 * they behave (paired, self-closing, nestable, table/list related).
 *
 * Every table is a hash keyed by tag name with the value true. The base
 * tables are kept in static variables and rebuilt when $wgAllowImageTag
 * differs from the value seen at the last build; the result for the
 * argument-less call is cached separately.
 *
 * @param string[] $extratags Extra tag names to add to 'htmlpairs' and 'htmlelements'
 * @param string[] $removetags Tag names to remove from 'htmlelements'
 * @return array[] Keys: htmlpairs, htmlsingle, htmlsingleonly, htmlnest,
 *  tabletags, htmllist, listtags, htmlsingleallowed, htmlelements
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	static $commonCase, $staticInitialised;

	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $isCommonCase && $commonCase && $staticInitialised === $wgAllowImageTag ) {
		return $commonCase;
	}

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	// The initialised marker mirrors the global config value, so a change of
	// the global (as happens in tests) forces the tables to be rebuilt.
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
		# Tags that must be closed
		$pairTags = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# Tags that can be self-closed. For tags not also in the
		# "single only" list, a self-closed tag is emitted as an
		# empty element (open-tag/close-tag pair).
		$singleTags = [ 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link' ];
		# Elements that cannot have close tags. This is also the list of
		# tags for which the HTML 5 parsing algorithm requires you to
		# "acknowledge the token's self-closing flag", i.e. <br/> is not
		# an HTML 5 parse error only for this list.
		$singleOnlyTags = [ 'br', 'wbr', 'hr', 'meta', 'link' ];
		# Tags that can be nested
		$nestTags = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		# Can only appear inside a table
		$tableOnlyTags = [ 'td', 'th', 'tr' ];
		# Tags used by lists
		$listContainerTags = [ 'ul', 'ol' ];
		# Tags that can appear in a list
		$listItemTags = [ 'li' ];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$singleTags[] = 'img';
			$singleOnlyTags[] = 'img';
		}

		# Store everything as hashtables for faster lookup
		$asSet = static function ( array $tags ) {
			return array_fill_keys( $tags, true );
		};
		$htmlsingleallowed = $asSet(
			array_unique( array_merge( $singleTags, $tableOnlyTags ) )
		);
		$htmlelementsStatic = $asSet(
			array_unique( array_merge( $singleTags, $pairTags, $nestTags ) )
		);
		$htmlpairsStatic = $asSet( $pairTags );
		$htmlsingle = $asSet( $singleTags );
		$htmlsingleonly = $asSet( $singleOnlyTags );
		$htmlnest = $asSet( $nestTags );
		$tabletags = $asSet( $tableOnlyTags );
		$htmllist = $asSet( $listContainerTags );
		$listtags = $asSet( $listItemTags );

		$staticInitialised = $globalContext;
	}

	# Fold the caller-supplied additions and removals into the per-call tables
	$extraSet = array_fill_keys( $extratags, true );
	$removeSet = array_fill_keys( $removetags, true );

	$result = [
		// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
		'htmlpairs' => array_merge( $extraSet, $htmlpairsStatic ),
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
		'htmlelements' => array_diff_key( array_merge( $extraSet, $htmlelementsStatic ), $removeSet ),
	];

	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
249
/**
 * Deprecated entry point: emits a deprecation notice (since 1.38) and
 * forwards all arguments unchanged to internalRemoveHtmlTags().
 *
 * @deprecated since 1.38
 * @param string $text
 * @param callable|null $processCallback
 * @param array|bool $args
 * @param array $extratags
 * @param array $removetags
 * @return string Result of internalRemoveHtmlTags()
 */
public static function removeHTMLtags(
	$text,
	$processCallback = null,
	$args = [],
	$extratags = [],
	$removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );

	$cleaned = self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
	return $cleaned;
}
288
/**
 * Escape every tag that is not a recognised, valid HTML element and
 * normalise the attributes of the ones that are kept.
 *
 * The text is split on "<". Each piece that parses as a tag whose name is
 * in the recognised element list is re-emitted with attributes cleaned by
 * fixTagAttributes(); anything else has its "<" and ">" escaped.
 *
 * @param string $text
 * @param callable|null $processCallback If callable, invoked with the raw
 *  attribute string (by reference) and $args before validation
 * @param array|bool $args Passed as second argument to $processCallback
 * @param array $extratags Extra tag names to recognise
 * @param array $removetags Tag names to stop recognising
 * @return string
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	[
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlelements' => $htmlelements,
	] = self::getRecognizedTagData( $extratags, $removetags );

	// Renders a piece as plain text: the "<" consumed by explode() is put
	// back as an entity and every ">" is escaped too.
	$asText = static function ( $piece ) {
		return '&lt;' . str_replace( '>', '&gt;', $piece );
	};

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$pieces = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $pieces ) );

	# this might be possible using remex tidy itself
	foreach ( $pieces as $piece ) {
		if ( !preg_match( self::ELEMENT_BITS_REGEX, $piece, $regs ) ) {
			$text .= $asText( $piece );
			continue;
		}

		[ /* full match */, $slash, $tagName, $params, $brace, $rest ] = $regs;
		$tagName = strtolower( $tagName );
		if ( !isset( $htmlelements[$tagName] ) ) {
			$text .= $asText( $piece );
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		$mayBeSelfClosed = isset( $htmlsingle[$tagName] ) || isset( $htmlsingleonly[$tagName] );
		if ( $brace == '/>' && !$mayBeSelfClosed ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $tagName );
		$newparams = self::fixTagAttributes( $params, $tagName );
		if ( !$tagIsValid ) {
			$text .= $asText( $piece );
			continue;
		}

		if ( $brace === '/>' && !isset( $htmlsingleonly[$tagName] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$tagName>";
		}

		$text .= "<$slash$tagName$newparams$brace" . str_replace( '>', '&gt;', $rest );
	}

	return $text;
}
370
/**
 * Remove disallowed tags from $text by running it through a RemexHtml
 * tokenizer / tree builder / serializer pipeline with a
 * RemexRemoveTagHandler in front of the dispatcher.
 *
 * @param string $text
 * @param array $options Recognised keys:
 *  - extraTags: extra tag names to recognise
 *  - removeTags: tag names to stop recognising
 *  - attrCallback, attrCallbackArgs: internal, handed to RemexRemoveTagHandler
 * @return string Serialized result
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$tagData = self::getRecognizedTagData(
		$options['extraTags'] ?? [],
		$options['removeTags'] ?? []
	);
	// These options are @internal:
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	// Use RemexHtml to tokenize $text and remove the barred tags.
	// The pipeline is assembled back to front: serializer first, tokenizer last.
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$dispatcher = new RemexDispatcher( $treeBuilder );
	$remover = new RemexRemoveTagHandler(
		$dispatcher, $text, $tagData,
		$attrCallback, $attrCallbackArgs
	);
	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );

	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
431
/**
 * Strip HTML comments ("<!--" … "-->") from the text.
 *
 * When a comment, together with the spaces around it, is both preceded
 * and followed by a newline, the whole line is collapsed into a single
 * newline. An unterminated comment stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Point just past the closing "-->"
		$close += 3;

		# Widen the span over the spaces on both sides of the comment,
		# beginning with the character directly before it
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$onItsOwnLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";

		$text = $onItsOwnLine
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			? substr_replace( $text, "\n", $from, $length + 1 )
			# Remove just the comment.
			: substr_replace( $text, '', $open, $close - $open );
	}

	return $text;
}
474
/**
 * Check that a tag carries the attributes it needs to be meaningful.
 *
 * Only <meta> and <link> are restricted: both need an "itemprop"
 * attribute, <meta> additionally needs "content" and <link> needs "href".
 * Every other element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case tag name
 * @return bool
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	if ( !isset( $attribs['itemprop'] ) ) {
		// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
		return false;
	}

	// <meta> must have a content="" for the itemprop,
	// <link> must have an associated href=""
	$companion = $isMeta ? 'content' : 'href';
	return isset( $attribs[$companion] );
}
509
/**
 * Filter decoded attributes against the attribute list allowed for
 * the given element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name used to look up the allowed attributes
 * @return array Result of validateAttributes()
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );
	return self::validateAttributes( $attribs, $allowedForElement );
}
529
/**
 * Filter and normalise an attribute array.
 *
 * Keeps xmlns:* declarations with harmless values, data-* attributes that
 * are not reserved, and attributes listed in $allowed. Values of style,
 * id and the ARIA id-list attributes are rewritten; RDFa/microdata values
 * matching the evil-URI pattern, href/src/poster values without an
 * allowed protocol, and tabindex values other than "0" are dropped.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys; a
 *  sequential list is still accepted but deprecated since 1.35
 * @return array Attribute name => sanitized value
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value is a space-separated list of ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	# RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$linkedDataAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isPlainDataAttrib = preg_match( '/^data-[^:]*$/i', $attribute );
		$isKnown = $isPlainDataAttrib || array_key_exists( $attribute, $allowed );
		if ( !$isKnown || self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $linkedDataAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $attribute === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
646
/**
 * Tell whether an attribute name falls into a reserved data-* family.
 *
 * data-ooui is reserved for ooui; data-mw and data-parsoid are reserved
 * for parsoid; data-mw-<name> is reserved for extensions (or core) that
 * need to pass data to the client and be sure it does not come from an
 * untrusted user. Namespaces are ignored since user-generated HTML can't
 * use them anymore. The match is a case-insensitive prefix test.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	$hits = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
	return $hits === 1;
}
664
/**
 * Merge two attribute arrays; entries of $b win over entries of $a.
 *
 * The one exception is "class": when both arrays hold different string
 * values for it, the class names of both are combined (split on
 * whitespace, duplicates removed, joined with single spaces).
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothHaveClassString = isset( $a['class'], $b['class'] )
		&& is_string( $a['class'] )
		&& is_string( $b['class'] );

	if ( $bothHaveClassString && $a['class'] !== $b['class'] ) {
		$names = preg_split(
			'/\s+/',
			"{$a['class']} {$b['class']}",
			-1,
			PREG_SPLIT_NO_EMPTY
		);
		$merged['class'] = implode( ' ', array_unique( $names ) );
	}

	return $merged;
}
687
/**
 * Normalise a CSS value so that later checks see what a browser would:
 * character references and CSS escape sequences are decoded and
 * comments are neutralised.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( $decodeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ self::class, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	return $danglingComment === false
		? $value
		: substr( $value, 0, $danglingComment );
}
747
/**
 * Normalise a CSS value and reject it if it contains control characters
 * or constructs considered insecure.
 *
 * A rejected value is replaced by a CSS comment naming the reason
 * instead of being returned.
 *
 * @param string $value Raw inline style text
 * @return string The normalized CSS, or one of
 *  '/* invalid control char *' . '/' or '/* insecure input *' . '/'
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Every "\(" below must be a backslash directly followed by the
	// parenthesis. An invisible zero-width joiner between the two leaves
	// the "(" unescaped, the pattern then fails to compile, preg_match()
	// returns false and insecure input is let through.
	$insecurePattern =
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
		!ix';
	if ( preg_match( $insecurePattern, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
789
/**
 * preg_replace_callback() handler for the escape-sequence regex built
 * in normalizeCss().
 *
 * @param array $matches 1 = line continuation, 2 = hex character number,
 *  3 = any single escaped character; all empty for a trailing backslash
 * @return string Replacement for the escape sequence
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: the backslash-newline pair vanishes
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		// Lone backslash at the end of the string
		$char = '\\';
	}

	// These characters need to be escaped in strings
	$needsEscape = $char == "\n" || $char == '"' || $char == "'" || $char == '\\';
	if ( !$needsEscape ) {
		// Decode unnecessary escape
		return $char;
	}

	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $char ) ) . ' ';
}
814
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string:
 * decode, validate against the element's allowed attributes, optionally
 * sort by attribute name, and re-encode.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string if $text is blank, otherwise the result of
 *  safeEncodeTagAttributes()
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
850
/**
 * Encode a value for use inside a quoted HTML attribute.
 *
 * On top of htmlspecialchars() with ENT_QUOTES, newline, carriage return
 * and tab are turned into numeric character references: whitespace is
 * normalized during attribute decoding, so these characters would not be
 * preserved otherwise.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
870
/**
 * Replace the plain space used in French-style punctuation with $space.
 *
 * Affected are a space in front of one of ? : ; ! % » › (unless that
 * character is followed by a word character) and a space after « or ‹.
 *
 * @param string $text
 * @param string $space Text to put in place of the space; inserted
 *  literally, "$" and "\" in it have no special meaning
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that preg_replace() below treats
	// $space as literal text rather than as a backreference. The pattern
	// must consist of ASCII only: an invisible character after the
	// backslashes stops them from being escaped.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
891
/**
 * Encode an attribute value like encodeAttribute(), and additionally
 * armor character sequences that later parsing stages could expand.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $armor );

	# Stupid hack: encode the colon of anything matching a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$hideColon = static function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolPattern, $hideColon, $encoded );
}
928
/**
 * Escape an id for use in an HTML id attribute, using the escaping mode
 * configured in $wgFragmentMode for the requested slot.
 *
 * @param string $id
 * @param int $mode Index into $wgFragmentMode: self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool Escaped id, or false when a non-primary slot is not configured
 * @throws UnexpectedValueException When the primary slot is not configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}
	return false;
}
958
/**
 * Escape an id for use as a URL fragment in a link, using the primary
 * mode configured in $wgFragmentMode.
 *
 * @param string $id
 * @return string Result of escapeIdInternalUrl()
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
984
/**
 * Escape an id for use as a URL fragment in a link to an external
 * interwiki target, using the mode in $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string Result of escapeIdInternalUrl()
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// The setting lives in global scope; without this declaration the
	// variable below would be an undefined local and the mode would be null.
	global $wgExternalInterwikiFragmentMode;

	return self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
}
1001
/**
 * Escape an id for use in a URL fragment.
 *
 * Applies escapeIdInternal(); in 'html5' mode every "%" that is followed
 * by two hex digits then has "25" inserted after it, turning "%XX"
 * into "%25XX".
 *
 * @param string $id
 * @param string $mode Escaping mode, see escapeIdInternal()
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}
	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1018
/**
 * Escape an id according to the given mode.
 *
 * - 'html5': tab, LF, FF, CR and space become "_".
 * - 'legacy': spaces become "_", the result is URL-encoded, then "%3A"
 *   is turned back into ":" and every remaining "%" into ".".
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	if ( $mode == 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode == 'legacy' ) {
		// This corresponds to 'noninitial' mode of the former escapeId()
		$encoded = urlencode( str_replace( ' ', '_', $id ) );
		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1055
/**
 * Deprecated public wrapper: emits a deprecation notice (since 1.36) and
 * delegates to escapeIdReferenceListInternal().
 *
 * @deprecated since 1.36
 * @param string $referenceString Space-delimited list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
	wfDeprecated( __METHOD__, '1.36' );

	$escapedList = self::escapeIdReferenceListInternal( $referenceString );
	return $escapedList;
}
1070
/**
 * Escape every id in a whitespace-delimited list of ids.
 *
 * @param string $referenceString Whitespace-delimited list of ids
 * @return string The escaped ids joined by single spaces; an empty
 *  string when the list holds no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escapedTokens = array_map(
		static function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Merge the array back to a space delimited list string
	return implode( ' ', $escapedTokens );
}
1093
/**
 * Turn arbitrary text into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters and space, most ASCII
 * punctuation and the no-break space (U+00A0) are replaced by "_"; runs
 * of underscores are then collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores ...
	$cleaned = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$cleaned = preg_replace( '/_+/', '_', $cleaned );

	return rtrim( $cleaned, '_' );
}
1112
/**
 * Decode the character references in $html, then HTML-escape the result.
 *
 * Both quote styles are escaped. ENT_SUBSTITUTE is used so that
 * incorrectly truncated multibyte characters don't cause the entire
 * string to disappear.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1128
/**
 * Parse the raw attribute text of a tag into an associative array.
 *
 * Names are lower-cased; names with unacceptable characters are skipped.
 * Values have whitespace runs collapsed to one space, are trimmed and
 * have their character references decoded. A later attribute of the
 * same name overwrites an earlier one.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$matches = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( self::getAttribNameRegex(), $name ) ) {
			continue;
		}

		// Normalize whitespace
		$rawValue = self::getTagAttributeCallback( $match );
		$rawValue = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );

		// Decode character references
		$decoded[$name] = self::decodeCharReferences( $rawValue );
	}

	return $decoded;
}
1171
/**
 * Build an HTML attribute string from an associative array.
 *
 * @param array $assoc_array Attribute name => value
 * @return string Each pair rendered as ' name="value"' (note the leading
 *  space), or an empty string for an empty array
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$html = '';
	foreach ( $assoc_array as $name => $value ) {
		$encName = htmlspecialchars( $name, ENT_COMPAT );
		$encValue = self::safeEncodeAttribute( $value );
		$html .= " $encName=\"$encValue\"";
	}
	return $html;
}
1189
/**
 * Pick the attribute value out of one match set produced by the
 * attribute regex.
 *
 * @param array $set Match set: 5 = unquoted value, 4 = single-quoted
 *  value, 3 = double-quoted value, 2 = the whole "= value" part
 * @return string The value, or an empty string for an attribute given
 *  without a value
 * @throws MWException When a "= value" part matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	// Highest group first: no quotes, single-quoted, double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1217
/**
 * Collapse every run of spaces, tabs, carriage returns and line feeds
 * into a single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );
	return trim( $collapsed );
}
1228
/**
 * Normalise a section name: each run of spaces and/or underscores
 * becomes one space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $spaced );
}
/**
 * Rewrite every character reference (and every bare "&") in the text
 * through normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$handler = [ self::class, 'normalizeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1262
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches 1 = named entity (with ";"), 2 = decimal
 *  number, 3 = hexadecimal number; none set for a bare ampersand
 * @return string The normalized reference, or the HTML-escaped original
 *  match when it is not a valid reference
 */
private static function normalizeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	// null means "not a valid reference": escape the text as it stands
	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1282
/**
 * Normalise a named entity.
 *
 * MediaWiki-specific aliases are replaced by their target entity; lt, gt,
 * amp and quot stay in word form; other known entities become one
 * decimal numeric reference per code point; unknown names have their
 * ampersand escaped.
 *
 * @param string $name Entity name including the trailing ";"
 * @return string
 */
private static function normalizeEntity( $name ) {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toNumericRef = static function ( $m ) {
		return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
	};
	return preg_replace_callback(
		'/./Ssu',
		$toNumericRef,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1309
/**
 * Normalise a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits
 * @return string|null "&#N;" for a code point accepted by
 *  validateCodepoint(), otherwise null
 */
private static function decCharReference( $codepoint ) {
	$number = intval( $codepoint );
	return self::validateCodepoint( $number )
		? sprintf( '&#%d;', $number )
		: null;
}
1322
/**
 * Normalise a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hexadecimal digits
 * @return string|null "&#xN;" with lower-case hex digits for a code
 *  point accepted by validateCodepoint(), otherwise null
 */
private static function hexCharReference( $codepoint ) {
	$number = hexdec( $codepoint );
	return self::validateCodepoint( $number )
		? sprintf( '&#x%x;', $number )
		: null;
}
1335
/**
 * Tell whether a code point may appear as a character reference.
 *
 * Accepted: tab, line feed, U+0020–U+007E, U+00A0–U+D7FF,
 * U+E000–U+FFFD and U+10000–U+10FFFF.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}
	return false;
}
1353
/**
 * Replace every character reference in the text by the result of
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = [ self::class, 'decodeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1367
/**
 * Decode character references like decodeCharReferences() and, when at
 * least one replacement was made, pass the result through the content
 * language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$replacements
	);

	if ( !$replacements ) {
		// Nothing was replaced, so there is nothing new to normalize
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1393
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * @param array $matches 1 = named entity (with ";"), 2 = decimal
 *  number, 3 = hexadecimal number
 * @return string Decoded text, or the unchanged match for a bare ampersand
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1409
/**
 * Convert a code point to its UTF-8 encoding.
 *
 * @param int|float $codepoint
 * @return string The UTF-8 encoded character, or the UTF-8 replacement
 *  constant when validateCodepoint() rejects the code point
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1424
/**
 * Decode a named entity to the text it stands for.
 *
 * MediaWiki-specific aliases, which are not in the HTML standard, are
 * first replaced by their target name.
 *
 * @param string $name Entity name including the trailing ";"
 * @return string The translation, or "&" followed by the (possibly
 *  aliased) name when the entity is unknown
 */
private static function decodeEntity( $name ) {
	// These are MediaWiki-specific entities, not in the HTML standard
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1441
/**
 * Look up the attributes allowed on an element.
 *
 * @param string $element Tag name
 * @return array The entry for $element from
 *  setupAttributesAllowedInternal(), or an empty array if there is none
 */
private static function attributesAllowedInternal( $element ) {
	$allowedByElement = self::setupAttributesAllowedInternal();
	return $allowedByElement[$element] ?? [];
}
1453
/**
 * Build the table mapping each element name to its allowed attributes.
 *
 * Every attribute list is stored flipped (attribute name => true) so that
 * membership checks are key lookups. The table is built on the first call
 * and kept in a function-static variable for later calls.
 *
 * @return array[] Element name => [ attribute name => true ]
 */
private static function setupAttributesAllowedInternal() {
	static $table;

	if ( $table === null ) {
		// Extend an already-flipped set with any number of plain attribute
		// name lists; the new names become keys as well.
		$extend = static function ( $set, ...$nameLists ) {
			return array_merge(
				$set,
				array_fill_keys( array_merge( ...$nameLists ), true )
			);
		};

		// Attributes accepted on (nearly) every element
		$base = $extend( [], [
			# HTML
			'id',
			'class',
			'style',
			'lang',
			'dir',
			'title',
			'tabindex',

			# WAI-ARIA
			'aria-describedby',
			'aria-flowto',
			'aria-hidden',
			'aria-label',
			'aria-labelledby',
			'aria-owns',
			'role',

			# RDFa
			# These attributes are specified in section 9 of
			# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			'about',
			'property',
			'resource',
			'datatype',
			'typeof',

			# Microdata. These are specified by
			# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
			'itemid',
			'itemprop',
			'itemref',
			'itemscope',
			'itemtype',
		] );

		// Base set plus 'align'
		$alignable = $extend( $base, [ 'align' ] );

		$cellAlignment = [ 'align', 'valign' ];
		$cellAttributes = [
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor', # deprecated
		];

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: https://www.w3.org/TR/html4/
		$table = [
			# 7.5.4
			'div' => $alignable,
			'center' => $base, # deprecated
			'span' => $base,

			# 7.5.5
			'h1' => $alignable,
			'h2' => $alignable,
			'h3' => $alignable,
			'h4' => $alignable,
			'h5' => $alignable,
			'h6' => $alignable,

			# 7.5.6
			# address

			# 8.2.4
			'bdo' => $base,

			# 9.2.1
			'em' => $base,
			'strong' => $base,
			'cite' => $base,
			'dfn' => $base,
			'code' => $base,
			'samp' => $base,
			'kbd' => $base,
			'var' => $base,
			'abbr' => $base,
			# acronym

			# 9.2.2
			'blockquote' => $extend( $base, [ 'cite' ] ),
			'q' => $extend( $base, [ 'cite' ] ),

			# 9.2.3
			'sub' => $base,
			'sup' => $base,

			# 9.3.1
			'p' => $alignable,

			# 9.3.2
			'br' => $extend( $base, [ 'clear' ] ),

			# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
			'wbr' => $base,

			# 9.3.4
			'pre' => $extend( $base, [ 'width' ] ),

			# 9.4
			'ins' => $extend( $base, [ 'cite', 'datetime' ] ),
			'del' => $extend( $base, [ 'cite', 'datetime' ] ),

			# 10.2
			'ul' => $extend( $base, [ 'type' ] ),
			'ol' => $extend( $base, [ 'type', 'start', 'reversed' ] ),
			'li' => $extend( $base, [ 'type', 'value' ] ),

			# 10.3
			'dl' => $base,
			'dd' => $base,
			'dt' => $base,

			# 11.2.1
			'table' => $extend( $base,
				[ 'summary', 'width', 'border', 'frame',
					'rules', 'cellspacing', 'cellpadding',
					'align', 'bgcolor',
				] ),

			# 11.2.2
			'caption' => $alignable,

			# 11.2.3
			'thead' => $base,
			'tfoot' => $base,
			'tbody' => $base,

			# 11.2.4
			'colgroup' => $extend( $base, [ 'span' ] ),
			'col' => $extend( $base, [ 'span' ] ),

			# 11.2.5
			'tr' => $extend( $base, [ 'bgcolor' ], $cellAlignment ),

			# 11.2.6
			'td' => $extend( $base, $cellAttributes, $cellAlignment ),
			'th' => $extend( $base, $cellAttributes, $cellAlignment ),

			# 12.2
			# NOTE: <a> is not allowed directly, but this list of allowed
			# attributes is used from the Parser object
			'a' => $extend( $base, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true
			'img' => $extend( $base, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
			# Attributes for A/V tags added in T163583 / T133673
			'audio' => $extend( $base, [ 'controls', 'preload', 'width', 'height' ] ),
			'video' => $extend( $base, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
			'source' => $extend( $base, [ 'type', 'src' ] ),
			'track' => $extend( $base, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

			# 15.2.1
			'tt' => $base,
			'b' => $base,
			'i' => $base,
			'big' => $base,
			'small' => $base,
			'strike' => $base,
			's' => $base,
			'u' => $base,

			# 15.2.2
			'font' => $extend( $base, [ 'size', 'color', 'face' ] ),
			# basefont

			# 15.3
			'hr' => $extend( $base, [ 'width' ] ),

			# HTML Ruby annotation text module, simple ruby only.
			# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
			'ruby' => $base,
			# rbc
			'rb' => $base,
			'rp' => $base,
			'rt' => $base, # $extend( $base, [ 'rbspan' ] ),
			'rtc' => $base,

			# MathML root element, where used for extensions
			# 'title' may not be 100% valid here; it's XHTML
			# https://www.w3.org/TR/REC-MathML/
			'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

			// HTML 5 section 4.5
			'figure' => $base,
			'figcaption' => $base,

			# HTML 5 section 4.6
			'bdi' => $base,

			# HTML5 elements, defined by:
			# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
			'data' => $extend( $base, [ 'value' ] ),
			'time' => $extend( $base, [ 'datetime' ] ),
			'mark' => $base,

			// meta and link are only permitted by internalRemoveHtmlTags when Microdata
			// is enabled so we don't bother adding a conditional to hide these
			// Also meta and link are only valid in WikiText as Microdata elements
			// (ie: validateTag rejects tags missing the attributes needed for Microdata)
			// So we don't bother including the base attributes that have no purpose.
			'meta' => $extend( [], [ 'itemprop', 'content' ] ),
			'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

			# HTML 5 section 4.3.5
			'aside' => $base,
		];
	}

	return $table;
}
1697
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed, character references decoded and whitespace
 * normalized.
 *
 * @param string $html HTML fragment
 * @return string
 */
public static function stripAllTags( $html ) {
	// RemexHtml tokenizes $html; the handler collects the text it sees.
	$textCollector = new RemexStripTagHandler;
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	$tokenizer = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1724
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for every named entity that ends
 * in a semicolon and whose normalizeEntity() result differs from the
 * entity itself.
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = '';

	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );
			// Entities that normalize to themselves (&lt; &gt; etc) are skipped
			if ( $expansion !== $entity ) {
				$name = substr( $entity, 0, -1 );
				$declarations .= "<!ENTITY $name \"$expansion\">";
			}
		}
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1755
/**
 * Clean up a URL: decode character references, percent-encode control
 * and delimiter characters, strip characters that are ignored in IDNs
 * from the host part, and restore the brackets of IPv6 hosts.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input (they will be re-escaped by
	# makeExternalLink()), then escape any control characters that the
	# decoding step introduced.
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ self::class, 'cleanUrlCallback' ],
		self::decodeCharReferences( $url )
	);

	# Split into protocol, optional "//host" part and the remainder
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	[ , $scheme, $authority, $remainder ] = $parts;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	$ignorable = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
			# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
			# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
			# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
			# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
			# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
			# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
			# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
			# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
			# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
			# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
			# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
			# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
			# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
			# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
			# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
			# <reserved-E0FFF>
		/xuD";

	$authority = preg_replace( $ignorable, '', $authority );

	// IPv6 host names are bracketed with []. The escaping above turned
	// the brackets into %5B / %5D, so url-decode them again here.
	$ipv6 = [];
	if ( str_starts_with( $authority, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $authority, $ipv6 )
	) {
		$authority = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $scheme . $authority . $remainder;
}
1840
/**
 * preg_replace_callback() handler used by cleanUrl(): url-encodes the
 * matched text.
 *
 * @param string[] $matches
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	[ $matched ] = $matches;
	return urlencode( $matched );
}
1848
/**
 * Does a string look like an e-mail address?
 *
 * IsValidEmailAddr hook handlers are consulted first: when the hook run
 * returns false, whatever the handlers stored in $result is returned
 * as-is. Otherwise the address is matched against a pattern made of a
 * liberal user part, an "@", and one or more dot-separated domain labels
 * of letters, digits and hyphens.
 *
 * The pattern uses the D (dollar-end-only) modifier so that "$" matches
 * only at the very end of the subject. Without it, an address followed
 * by a single trailing newline would be accepted.
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the built-in check, or the value set by a
 *   hook handler
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note the strings below end up enclosed in brackets [], which
	// makes the hyphen "-" a range indicator. Hence it is double
	// backslashed below. See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	$html5_email_regexp = "/
	^                         # start of string
	[$rfc5322_atext\\.]+      # user part which is liberal :p
	@                         # at sign
	[$rfc1034_ldh_str]+       # First domain part
	(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
	$                         # End of string
	/ixD"; // case Insensitive, eXtended, Dollar matches only at the very end

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1901}
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
MediaWiki exception.
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:41
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static cleanUrl( $url)
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition Sanitizer.php:85
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static validateEmail( $addr)
Does a string look like an e-mail address?
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition Sanitizer.php:77
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.
if( $line===false) $args
Definition mcc.php:124