MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
32 use Wikimedia\RemexHtml\HTMLData;
33 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
34 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
35 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
36 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
37 
42 class Sanitizer {
/**
 * Matches, in extended (/x) mode, one of:
 *  1. a named entity (capture includes the trailing semicolon),
 *  2. a decimal numeric character reference,
 *  3. a hexadecimal numeric character reference,
 *  4. a lone ampersand.
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|(&)/x';

/**
 * Splits the text following a "<" into: optional leading slash, tag name,
 * attribute text, closing brace ("/>" or ">") and the trailing text up to
 * the next "<".
 */
private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive pattern spotting "javascript" / "vbscript" at the start
 * of a value, after whitespace, or after the end of a comment.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/** Pattern for attribute names of the form xmlns:<something>. */
private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/** Index into $wgFragmentMode selecting the primary ID escaping mode. */
public const ID_PRIMARY = 0;

/** Index into $wgFragmentMode selecting the fallback ID escaping mode. */
public const ID_FALLBACK = 1;

/**
 * Non-standard entity names (with trailing semicolon) that are treated as
 * aliases of the standard entity given as the value.
 */
private const MW_ENTITY_ALIASES = [
	'רלמ;' => 'rlm;',
	'رلم;' => 'rlm;',
];
96 
/**
 * Cache for the pattern built by getAttribsRegex(); null until first use.
 * @var string|null
 */
private static $attribsRegex;

/**
 * Build (once) and return the regex that picks attribute name/value pairs
 * out of the attribute text of a tag.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern using the s, x and u modifiers
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$spaceChars = '\x09\x0a\x0c\x0d\x20';
	$space = "[{$spaceChars}]";
	$attrib = "[^{$spaceChars}\/>=]";
	// The first character of a name may additionally be "="
	$attribFirst = "(?:{$attrib}|=)";

	self::$attribsRegex =
		"/({$attribFirst}{$attrib}*)
			($space*=$space*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$space|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
127 
/**
 * Cache for the pattern built by getAttribNameRegex(); null until first use.
 * @var string|null
 */
private static $attribNameRegex;

/**
 * Build (once) and return the regex an attribute name must match in full:
 * a first character that is ":", "_", a Unicode letter or a Unicode number,
 * followed by any of those plus "." and "-".
 *
 * @return string PCRE pattern using the s, x and u modifiers
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$firstChar = "[:_\p{L}\p{N}]";
	$laterChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

	return self::$attribNameRegex;
}
145 
/**
 * Return the tables of recognised HTML tags, each as a map of
 * tag name => true.
 *
 * The base tables are built once per value of $wgAllowImageTag and kept in
 * static variables. The result for the common case (no extra and no removed
 * tags) is cached as a whole.
 *
 * @param string[] $extratags Additional tags to treat as allowed paired tags
 * @param string[] $removetags Tags to drop from the allowed element set
 * @return array Map with the keys htmlpairs, htmlsingle, htmlsingleonly,
 *   htmlnest, tabletags, htmllist, listtags, htmlsingleallowed, htmlelements
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	static $commonCase, $staticInitialised;
	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $staticInitialised === $wgAllowImageTag && $isCommonCase && $commonCase ) {
		return $commonCase;
	}

	// The initialisation marker follows the global configuration, so the
	// tables are rebuilt whenever that configuration changes (e.g. in tests).
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
		# Tags that must be closed
		$pairList = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# Tags that can be self-closed. For tags not also on the
		# "single only" list, a self-closed tag is emitted as an empty
		# element (open-tag/close-tag pair).
		$singleList = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];
		# Elements that cannot have close tags. This is also the list of
		# tags for which the HTML 5 parsing algorithm requires you to
		# "acknowledge the token's self-closing flag", i.e. a self-closing
		# tag like <br/> is not an HTML 5 parse error only for this list.
		$singleOnlyList = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];
		# Tags that can be nested
		$nestList = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		# Can only appear inside table
		$tableList = [
			'td', 'th', 'tr',
		];
		# Tags used by list
		$listContainerList = [
			'ul', 'ol',
		];
		# Tags that can appear in a list
		$listItemList = [
			'li',
		];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$singleList[] = 'img';
			$singleOnlyList[] = 'img';
		}

		$singleAllowedList = array_unique( array_merge( $singleList, $tableList ) );
		$elementList = array_unique( array_merge( $singleList, $pairList, $nestList ) );

		# Store everything as name => true maps for faster lookup
		$htmlpairsStatic = array_fill_keys( $pairList, true );
		$htmlsingle = array_fill_keys( $singleList, true );
		$htmlsingleonly = array_fill_keys( $singleOnlyList, true );
		$htmlnest = array_fill_keys( $nestList, true );
		$tabletags = array_fill_keys( $tableList, true );
		$htmllist = array_fill_keys( $listContainerList, true );
		$listtags = array_fill_keys( $listItemList, true );
		$htmlsingleallowed = array_fill_keys( $singleAllowedList, true );
		$htmlelementsStatic = array_fill_keys( $elementList, true );

		$staticInitialised = $globalContext;
	}

	# Fold the caller-supplied extra and removed tags into the paired and
	# element tables
	$extratags = array_fill_keys( $extratags, true );
	$removetags = array_fill_keys( $removetags, true );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	$result = [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
250 
/**
 * Deprecated entry point that forwards to internalRemoveHtmlTags().
 *
 * @deprecated since 1.38
 * @param string $text Text to clean
 * @param callable|null $processCallback Callback applied to the attribute
 *   text of each allowed tag
 * @param array|bool $args Extra argument handed to $processCallback
 * @param string[] $extratags Additional tags to allow
 * @param string[] $removetags Tags to disallow
 * @return string Result of internalRemoveHtmlTags()
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );
	// The call head was missing, leaving a dangling argument list.
	return self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
}
289 
/**
 * Clean up HTML-like text: strip comments, keep tags from the allowed
 * element set (with sanitised attributes) and escape everything else.
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback If callable, invoked with the raw
 *   attribute text (by reference) and $args for every allowed tag
 * @param array|bool $args Extra argument handed to $processCallback
 * @param string[] $extratags Additional tags to allow
 * @param string[] $removetags Tags to disallow
 * @return string
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$selfClosable = $tagData['htmlsingle'];
	$voidOnly = $tagData['htmlsingleonly'];
	$allowedTags = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );

	// Everything before the first "<" is plain text; each following chunk
	// starts right after a "<".
	$chunks = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $chunks ) );

	# this might be possible using remex tidy itself
	foreach ( $chunks as $chunk ) {
		$isTag = preg_match( self::ELEMENT_BITS_REGEX, $chunk, $parts );
		$tag = $isTag ? strtolower( $parts[2] ) : null;

		if ( !$isTag || !isset( $allowedTags[$tag] ) ) {
			// Not a recognised tag: neutralise it
			$text .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		[ /* whole match */, $slash, /* raw tag name */, $params, $brace, $rest ] = $parts;

		if ( is_callable( $processCallback ) ) {
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		if ( $brace === '/>' && !isset( $selfClosable[$tag] ) && !isset( $voidOnly[$tag] ) ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $tag );
		$newparams = self::fixTagAttributes( $params, $tag );

		if ( !$tagIsValid ) {
			$text .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		if ( $brace === '/>' && !isset( $voidOnly[$tag] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$tag>";
		}

		$rest = str_replace( '>', '&gt;', $rest );
		$text .= "<$slash$tag$newparams$brace$rest";
	}

	return $text;
}
371 
/**
 * Clean up HTML-like text using the RemexHtml tokenizer: tags outside the
 * recognised set are handled by RemexRemoveTagHandler and the result is
 * serialised back to a string.
 *
 * @param string $text Text to clean
 * @param array $options Supported keys: 'extraTags' and 'removeTags'
 *   (string lists); 'attrCallback' and 'attrCallbackArgs' are internal
 * @return string The serialised result
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$tagData = null;
	$extraTags = $options['extraTags'] ?? [];
	$removeTags = $options['removeTags'] ?? [];
	// These options are @internal:
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData( $extraTags, $removeTags );

	// Pipeline, built back to front:
	// tokenizer -> tag remover -> dispatcher -> tree builder -> serializer
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilderOptions = [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	];
	$treeBuilder = new RemexTreeBuilder( $serializer, $treeBuilderOptions );
	$dispatcher = new RemexDispatcher( $treeBuilder );
	$remover = new RemexRemoveTagHandler(
		$dispatcher, $text, $tagData,
		$attrCallback, $attrCallbackArgs
	);
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$tokenizer = new RemexTokenizer( $remover, $text, $tokenizerOptions );

	// Parse as a fragment inside <body>
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
432 
/**
 * Strip all terminated HTML comments from the text.
 *
 * When a comment (plus surrounding spaces) sits between two newlines, the
 * spaces and one of the newlines are removed along with it. Processing
 * stops at the first unterminated comment, which is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	$open = strpos( $text, '<!--' );
	while ( $open !== false ) {
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$afterComment = $close + 3;

		# Widen the span over spaces on both sides, starting one
		# character before the comment
		$from = max( $open - 1, 0 );
		$length = $afterComment - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $length, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $afterComment - $open );
		}

		$open = strpos( $text, '<!--' );
	}
	return $text;
}
475 
/**
 * Check the attribute requirements of <meta> and <link>; every other
 * element passes unconditionally.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool False if a required attribute is missing
 */
private static function validateTag( $params, $element ) {
	$params = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $params['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $params['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $params['href'] ) ) {
		return false;
	}

	return true;
}
510 
/**
 * Filter decoded attributes against the set allowed for the given element.
 *
 * @param array $attribs Map of attribute name => value
 * @param string $element Tag name used to look up the allowed attributes
 * @return array Result of validateAttributes()
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );
	return self::validateAttributes( $attribs, $allowedForElement );
}
530 
/**
 * Filter and normalise a map of attributes.
 *
 * Keeps xmlns:* declarations with a harmless value, data-* attributes that
 * are not reserved, and attributes named in $allowed. Style values go
 * through checkCss(), ids and id reference lists are escaped, and several
 * attributes are dropped when their value fails a check.
 *
 * @param array $attribs Map of attribute name => value
 * @param array $allowed Map of allowed attribute name => true; a plain
 *   list of names is still accepted but deprecated
 * @return array Map of attribute name => sanitised value
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes holding lists of id references
	$idListAttributes = [
		'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns',
	];
	# RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttributes = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttributes = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isDataAttribute = preg_match( '/^data-[^:]*$/i', $attribute );
		if ( !$isDataAttribute && !array_key_exists( $attribute, $allowed ) ) {
			continue;
		}
		if ( self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttributes, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttributes, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttributes, true )
			&& !preg_match( $hrefExp, $value )
		) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $attribute === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
647 
/**
 * Tell whether an attribute name starts (case-insensitively) with
 * "data-ooui", "data-mw" or "data-parsoid".
 *
 * data-ooui is reserved for ooui; data-mw and data-parsoid are reserved for
 * parsoid; data-mw-<name here> is reserved for extensions (or core) that
 * need to pass data to the client and want to be sure it does not come from
 * an untrusted user. Namespaces are not considered, since user-generated
 * HTML can't use them anymore.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	return preg_match( '/^data-(ooui|mw|parsoid)/i', $attr ) === 1;
}
665 
/**
 * Merge two attribute maps, with values from $b winning, except that two
 * differing string 'class' values are combined into one de-duplicated,
 * space-separated list.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothHaveClass = isset( $a['class'] ) && isset( $b['class'] );
	if ( !$bothHaveClass
		|| !is_string( $a['class'] )
		|| !is_string( $b['class'] )
		|| $a['class'] === $b['class']
	) {
		return $merged;
	}

	$combined = "{$a['class']} {$b['class']}";
	$classNames = preg_split( '/\s+/', $combined, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $classNames ) );

	return $merged;
}
688 
/**
 * Normalise a CSS value: decode character references, decode CSS escape
 * sequences and line continuations, then strip comments.
 *
 * A value consisting of nothing but a single comment is returned with the
 * comment intact, so that callers can pass an error message through.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( $decodeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) |  # 3. backslash cancelling special meaning
				() |  # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	if ( $danglingComment !== false ) {
		$value = substr( $value, 0, $danglingComment );
	}

	return $value;
}
748 
/**
 * Normalise a CSS value and replace it with a comment if it looks unsafe.
 *
 * @param string $value Raw CSS value
 * @return string The normalised value, or '/* invalid control char *' . '/'
 *   when it contains control characters or the UTF-8 replacement character,
 *   or '/* insecure input *' . '/' when it matches the blacklist below
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// The "(" after url/image/image-set/attr must be escaped with a plain
	// backslash; stray invisible characters between "\" and "(" would turn
	// it into an unbalanced group and break the whole pattern.
	$insecurePattern =
		'! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		!ix';
	if ( preg_match( $insecurePattern, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
790 
/**
 * preg_replace_callback handler for the escape-sequence regex built in
 * normalizeCss().
 *
 * @param array $matches Groups: 1 = line continuation, 2 = hex character
 *   number, 3 = single escaped character; all empty means a backslash at
 *   the end of the string
 * @return string Replacement for the escape sequence
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	// These characters need to be escaped in strings. Loose comparison is
	// kept on purpose to mirror the original "==" checks.
	$needsEscape = [ "\n", '"', "'", '\\' ];
	if ( in_array( $char, $needsEscape ) ) {
		// Clean up the escape sequence to avoid parsing errors by clients
		return '\\' . dechex( ord( $char ) ) . ' ';
	}

	// Decode unnecessary escape
	return $char;
}
817 
/**
 * Turn the raw attribute text of a tag into a sanitised attribute string:
 * decode it, filter it for the element and re-encode it.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string if $text is blank, otherwise the result of
 *   safeEncodeTagAttributes()
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
853 
/**
 * Encode text for use as an HTML attribute value.
 *
 * @param string $text
 * @return string HTML-escaped text with newline, carriage return and tab
 *   turned into numeric character references
 */
public static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
873 
/**
 * Replace the plain space used around French punctuation with $space.
 *
 * A space is replaced when it precedes one of ? : ; ! % » › that is not
 * followed by a word character, or when it follows « or ‹.
 *
 * @param string $text Text to process
 * @param string $space Replacement for the space; "$" and "\" in it are
 *   treated literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when used as a preg_replace() replacement. The pattern must contain
	// only plain backslashes, "$" and parentheses; an invisible character
	// inside it stops a lone backslash from being escaped.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
894 
/**
 * Encode text for use as an HTML attribute value and additionally armour
 * characters and keywords that are significant in wikitext.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Stupid hack: encode the colon of anything matching a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = static function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColon, $encoded );
}
931 
/**
 * Escape a string for use as an id attribute, using the escaping mode
 * configured in $wgFragmentMode at index $mode.
 *
 * @param string $id
 * @param int $mode One of the ID_* constants
 * @return string|bool The escaped id, or false if a mode other than the
 *   primary one is not configured
 * @throws UnexpectedValueException If the primary mode is not configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}
	return false;
}
961 
/**
 * Escape a string for use as a link fragment, using the primary mode from
 * $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException If no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
987 
/**
 * Escape a string for use as a link fragment, using the mode configured in
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the mode would be an undefined local.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1004 
/**
 * Escape an id for use in a URL fragment. In 'html5' mode, every "%"
 * followed by two hex digits additionally has its "%" encoded as "%25".
 *
 * @param string $id
 * @param string $mode Escaping mode understood by escapeIdInternal()
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}
	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1021 
/**
 * Escape an id according to the given mode, after truncating it to 1024
 * characters.
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	if ( $mode === 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode === 'legacy' ) {
		// This corresponds to 'noninitial' mode of the former escapeId():
		// URL-encode, then turn "%" into "." while keeping ":" readable.
		$encoded = urlencode( str_replace( ' ', '_', $id ) );
		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1058 
/**
 * Deprecated public wrapper around escapeIdReferenceListInternal().
 *
 * @deprecated since 1.36
 * @param string $referenceString Space-delimited list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
	wfDeprecated( __METHOD__, '1.36' );
	$escapedList = self::escapeIdReferenceListInternal( $referenceString );
	return $escapedList;
}
1073 
/**
 * Escape every id in a whitespace-delimited list of ids.
 *
 * @param string $referenceString Whitespace-delimited list of ids
 * @return string The escaped ids joined by single spaces; an empty string
 *   if the list contains no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Break the list into its non-empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id, in place
	foreach ( $tokens as $index => $token ) {
		$tokens[$index] = self::escapeIdForAttribute( $token );
	}

	# Join the tokens back into a space delimited list
	return implode( ' ', $tokens );
}
1096 
/**
 * Turn a string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, space, ASCII punctuation
 * and the non-breaking space become underscores; runs of underscores are
 * collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores ...
	$escaped = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$escaped = preg_replace( '/_+/', '_', $escaped );
	return rtrim( $escaped, '_' );
}
1115 
/**
 * Escape text for HTML output after decoding any character references it
 * contains, so that references in the input are not escaped twice.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1131 
/**
 * Parse the raw attribute text of a tag into a map of
 * lower-cased attribute name => decoded value.
 *
 * Names with unacceptable characters are skipped. Whitespace in values is
 * collapsed and trimmed, and character references are decoded.
 *
 * @param string $text Raw attribute text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$sets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$attribs = [];
	foreach ( $sets as $set ) {
		$name = strtolower( $set[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( self::getAttribNameRegex(), $name ) ) {
			continue;
		}

		// Normalize whitespace
		$raw = self::getTagAttributeCallback( $set );
		$normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$attribs[$name] = self::decodeCharReferences( $normalized );
	}
	return $attribs;
}
1174 
/**
 * Build an attribute string from a map of attribute name => value.
 *
 * @param array $assoc_array
 * @return string Empty string for an empty map; otherwise a leading space
 *   followed by space-separated name="value" pairs
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $name => $value ) {
		$pairs[] = htmlspecialchars( $name, ENT_COMPAT )
			. '="' . self::safeEncodeAttribute( $value ) . '"';
	}

	if ( $pairs === [] ) {
		return '';
	}
	return ' ' . implode( ' ', $pairs );
}
1192 
/**
 * Pick the attribute value out of one match set produced by the
 * getAttribsRegex() pattern.
 *
 * @param array $set One match set
 * @return string The unquoted (group 5), single-quoted (group 4) or
 *   double-quoted (group 3) value, or "" if the attribute has no value part
 * @throws MWException If a value part matched without any value group
 */
private static function getTagAttributeCallback( $set ) {
	// Checked from the highest group down: 5 = no quotes,
	// 4 = single-quoted, 3 = double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1220 
/**
 * Collapse every run of spaces, tabs, carriage returns and newlines into a
 * single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );
	return trim( $collapsed );
}
1231 
/**
 * Normalise a section name: runs of spaces and underscores become a single
 * space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $spaced );
}
1243 
/**
 * Run every character reference and bare ampersand in the text through
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$callback = [ self::class, 'normalizeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1265 
/**
 * preg_replace_callback handler for normalizeCharReferences().
 *
 * @param array $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal reference, 3 = hexadecimal reference
 * @return string The normalised reference, or the HTML-escaped match if no
 *   valid reference was recognised
 */
private static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	// Bare ampersands and invalid references are escaped as text
	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1285 
/**
 * Normalise a named entity.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string The aliased entity for MediaWiki-specific names; the
 *   entity unchanged for lt, gt, amp and quot; decimal references for other
 *   known entities; otherwise the name with its ampersand escaped
 */
private static function normalizeEntity( $name ) {
	// Non-standard MediaWiki-specific entities
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	// Keep these in word form
	$keptAsNames = [ 'lt;', 'gt;', 'amp;', 'quot;' ];
	if ( in_array( $name, $keptAsNames, true ) ) {
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toDecimalReference = static function ( $m ) {
		return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
	};
	return preg_replace_callback(
		'/./Ssu',
		$toDecimalReference,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1312 
/**
 * Normalise a decimal character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null The reference in "&#N;" form, or null if the
 *   codepoint is not valid
 */
private static function decCharReference( $codepoint ) {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$point = intval( $codepoint );
	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1327 
/**
 * Normalise a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null The reference in "&#xN;" form (lower-case hex), or
 *   null if it is too long or the codepoint is not valid
 */
private static function hexCharReference( $codepoint ) {
	# hexdec() will return a float (not an int) if $codepoint is too
	# long, so protect against that. The largest valid codepoint is
	# 0x10FFFF.
	$significantDigits = ltrim( $codepoint, '0' );
	if ( strlen( $significantDigits ) > 6 ) {
		return null;
	}

	$point = hexdec( $codepoint );
	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1346 
/**
 * Tell whether a codepoint is accepted: tab, line feed, or anything inside
 * one of the ranges listed below.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}
	return false;
}
1364 
/**
 * Decode every character reference in the text through
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ self::class, 'decodeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1378 
/**
 * Decode every character reference in the text and, if the pattern matched
 * at least once, pass the result through the content language's
 * normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$replacements
	);

	if ( !$replacements ) {
		// Nothing matched, so there is nothing to normalize
		return $decoded;
	}
	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1404 
/**
 * preg_replace_callback handler for the decodeCharReferences*() methods.
 *
 * @param array $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal reference, 3 = hexadecimal reference
 * @return string The decoded text, or the match unchanged for a bare
 *   ampersand
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] != '' ) {
		# hexdec will return a float if the string is too long (!) so
		# check the length of the string first.
		$significantDigits = ltrim( $matches[3], '0' );
		if ( strlen( $significantDigits ) > 6 ) {
			// Invalid character reference.
			return UtfNormal\Constants::UTF8_REPLACEMENT;
		}
		return self::decodeChar( hexdec( $matches[3] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
1426 
/**
 * Turn a numeric codepoint into its UTF-8 encoding.
 *
 * @param int $codepoint
 * @return string UTF-8 bytes for the codepoint, or the UTF-8 replacement
 *   constant when validateCodepoint() rejects it
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1441 
/**
 * Translate a named entity (name plus trailing ';', without the leading
 * '&') into its expansion.
 *
 * @param string $name
 * @return string The expansion from HTMLData::$namedEntityTranslations, or
 *   "&$name" when the name has no translation there
 */
private static function decodeEntity( $name ) {
	// MediaWiki-specific entities, not in the HTML standard, are mapped to
	// their standard equivalent before the lookup.
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
1458 
/**
 * Fetch the allowed-attribute set for a single element.
 *
 * @param string $element Element name, used as a key into the table built
 *   by setupAttributesAllowedInternal()
 * @return array Attribute names as keys (values are true); an empty array
 *   when the element has no entry
 */
private static function attributesAllowedInternal( $element ) {
	return self::setupAttributesAllowedInternal()[$element] ?? [];
}
1470 
/**
 * Build, once per request, the table of attributes allowed on each element.
 *
 * The table is kept in a function-static variable, so only the first call
 * does any work. Each entry maps an element name to an array whose keys are
 * the permitted attribute names (every value is true), which lets callers
 * test membership with isset().
 *
 * @return array[] Element name => [ attribute name => true, ... ]
 */
private static function setupAttributesAllowedInternal() {
	static $table = null;

	if ( $table !== null ) {
		return $table;
	}

	// Extends the keyed set $base with the attribute names listed in
	// $names and $more, flipping them into keys for lookup efficiency.
	$extend = static fn ( array $base, array $names, array $more = [] ): array => array_merge(
		$base,
		array_fill_keys( $names, true ),
		array_fill_keys( $more, true )
	);

	$common = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		# RDFa, as specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata, as specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	$block = $extend( $common, [ 'align' ] );

	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	# Section numbers below refer to the HTML 4.01 standard:
	# https://www.w3.org/TR/html4/
	$table = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6 (address) has no entry

		# 8.2.4
		'bdo' => $common,

		# 9.2.1 (acronym has no entry)
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,

		# 9.2.2
		'blockquote' => $extend( $common, [ 'cite' ] ),
		'q' => $extend( $common, [ 'cite' ] ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $extend( $common, [ 'width' ] ),

		# 9.4
		'ins' => $extend( $common, [ 'cite', 'datetime' ] ),
		'del' => $extend( $common, [ 'cite', 'datetime' ] ),

		# 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $extend( $common,
			[ 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $extend( $common, [ 'span' ] ),
		'col' => $extend( $common, [ 'span' ] ),

		# 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),

		# 11.2.6
		'td' => $extend( $common, $tablecell, $tablealign ),
		'th' => $extend( $common, $tablecell, $tablealign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2 (basefont has no entry)
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),

		# 15.3
		'hr' => $extend( $common, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only (no rbc).
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # 'rbspan' is not enabled
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		# HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $table;
}
1715 
/**
 * Take a fragment of (potentially invalid) HTML and return a version with
 * any tags removed.
 *
 * The markup is tokenized with RemexHtml, the text is collected by a
 * RemexStripTagHandler, and the result is passed through
 * normalizeWhitespace().
 *
 * @param string $html
 * @return string
 */
public static function stripAllTags( $html ) {
	$textCollector = new RemexStripTagHandler;

	// 'ignoreCharRefs' is intentionally not set: character references
	// should be decoded by the tokenizer.
	$tokenizerOptions = [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	( new RemexTokenizer( $textCollector, $html, $tokenizerOptions ) )->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1742 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted per name in
 * HTMLData::$namedEntityTranslations that ends in ';', with the expansion
 * produced by normalizeEntity().
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = '';

	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( !str_ends_with( $entity, ';' ) ) {
			continue;
		}

		$expansion = self::normalizeEntity( $entity );
		// Intended to skip &lt; &gt; etc.
		// NOTE(review): this compares the bare name (e.g. 'lt;') with
		// normalizeEntity()'s output; confirm that output has no leading
		// '&' for those entities, otherwise the skip can never trigger.
		if ( $expansion === $entity ) {
			continue;
		}

		$bareName = substr( $entity, 0, -1 );
		$declarations .= "<!ENTITY $bareName \"$expansion\">";
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1773 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and normalize the host portion.
 *
 * Steps, in order:
 *  1. Character references are decoded (the original note says they get
 *     re-escaped later by makeExternalLink()).
 *  2. Brackets, angle brackets, double quotes, pipes, and the bytes
 *     0x00-0x20 and 0x7F are passed through urlencode().
 *  3. If the URL has the shape "scheme:[//host]rest", characters that are
 *     ignored in IDNs are stripped from the host, and a percent-encoded
 *     bracketed IPv6 literal ("//%5B...%5D[:port]") gets its brackets back.
 *
 * @param string $url
 * @return string The cleaned URL; returned after step 2 only when it does
 *   not match the "scheme:..." shape
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ self::class, 'cleanUrlCallback' ], $url );

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	[ /* $whole */, $protocol, $host, $rest ] = $parts;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	$strip = "/
		\\s|                     # general whitespace
		\u{00AD}|                # SOFT HYPHEN
		\u{034F}|                # COMBINING GRAPHEME JOINER
		\u{061C}|                # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]|     # HANGUL CHOSEONG FILLER..
		                         # HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]|     # KHMER VOWEL INHERENT AQ..
		                         # KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]|     # MONGOLIAN FREE VARIATION SELECTOR ONE..
		                         # MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}|                # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]|     # ZERO WIDTH SPACE..
		                         # RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]|     # LEFT-TO-RIGHT EMBEDDING..
		                         # RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]|     # WORD JOINER..
		                         # INVISIBLE PLUS
		\u{2065}|                # <reserved-2065>
		[\u{2066}-\u{206F}]|     # LEFT-TO-RIGHT ISOLATE..
		                         # NOMINAL DIGIT SHAPES
		\u{3164}|                # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]|     # VARIATION SELECTOR-1..
		                         # VARIATION SELECTOR-16
		\u{FEFF}|                # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}|                # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]|     # <reserved-FFF0>..
		                         # <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]|   # SHORTHAND FORMAT LETTER OVERLAP..
		                         # SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]|   # MUSICAL SYMBOL BEGIN BEAM..
		                         # MUSICAL SYMBOL END PHRASE
		\u{E0000}|               # <reserved-E0000>
		\u{E0001}|               # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]|   # <reserved-E0002>..
		                         # <reserved-E001F>
		[\u{E0020}-\u{E007F}]|   # TAG SPACE..
		                         # CANCEL TAG
		[\u{E0080}-\u{E00FF}]|   # <reserved-E0080>..
		                         # <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]|   # VARIATION SELECTOR-17..
		                         # VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]    # <reserved-E01F0>..
		                         # <reserved-E0FFF>
		/xuD";

	// With the /u flag preg_replace() returns null when the subject is not
	// valid UTF-8 (or on any other PCRE error). Keep the unstripped host in
	// that case instead of letting null silently erase it from the result.
	$host = preg_replace( $strip, '', $host ) ?? $host;

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1858 
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encodes the
 * matched text with urlencode().
 *
 * @param string[] $matches Match array; only the full match (index 0) is used
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	[ $unsafe ] = $matches;

	return urlencode( $unsafe );
}
1866 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook runs first; when a handler aborts it, whatever
 * the handler stored in $result is returned as-is. Otherwise the address
 * is matched against a liberal "local@domain" pattern:
 *  - local part: one or more RFC 5322 atext characters or dots
 *  - domain: one or more dot-separated labels of letters, digits, hyphens
 *
 * The whole string must match; in particular a trailing newline is not
 * accepted.
 *
 * @param string $addr E-mail address
 * @return bool|mixed The regex verdict, or the hook-provided $result (null
 *   if the aborting handler did not set it)
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Both strings below end up inside a bracketed character class [],
	// where a bare hyphen "-" acts as a range indicator. Hence it is
	// double backslashed below. See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D (dollar-end-only) modifier matters: without it "$" would also
	// match just before a final newline, letting "user@example.org\n" pass.
	$html5_email_regexp = "/
		^                          # start of string
		[$rfc5322_atext\\.]+       # user part which is liberal :p
		@                          # at sign
		[$rfc1034_ldh_str]+        # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                          # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches only at the very end

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1920 }
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
$matches
MediaWiki exception.
Definition: MWException.php:32
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:565
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:42
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:839
static cleanUrl( $url)
Definition: Sanitizer.php:1778
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:655
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1123
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:767
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:549
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:882
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:153
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:974
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:442
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:859
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:526
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1108
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
Definition: Sanitizer.php:393
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:318
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1240
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1754
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1259
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:281
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1069
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:86
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1372
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:947
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1727
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1389
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:676
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:997
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1895
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:78
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:901
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:697
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.