MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 use Wikimedia\RemexHtml\HTMLData;
32 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
33 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
34 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
35 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
36 
41 class Sanitizer {
/**
 * Pattern matching the character-reference forms handled by
 * normalizeCharReferences() and decodeCharReferences().
 * Capture groups: 1 = named entity (including the trailing ';'),
 * 2 = decimal digits, 3 = hexadecimal digits, 4 = a bare ampersand.
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|(&)/x';

/**
 * Splits the text following a '<' into: optional leading slash, tag name,
 * attribute text, closing brace ('>' or '/>') and the trailing text up to
 * the next '<'.
 */
private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive pattern flagging "javascript" / "vbscript" at the start
 * of a value, after whitespace, or after a closing comment marker, when
 * followed by a non-word character or the end of the value.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/** Pattern recognising "xmlns:<name>" attribute names. */
private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/** Index of the primary mode in $wgFragmentMode. */
public const ID_PRIMARY = 0;

/** Index of the fallback mode in $wgFragmentMode. */
public const ID_FALLBACK = 1;

/**
 * Non-standard entity names (keys include the trailing ';') mapped to the
 * standard entity name they stand for.
 */
private const MW_ENTITY_ALIASES = [
	'רלמ;' => 'rlm;',
	'رلم;' => 'rlm;',
];
95 
/** Lazily built pattern returned by getAttribsRegex(); null until first use. */
private static $attribsRegex;

/**
 * Build (once) and return the regular expression used to pull
 * name / value pairs out of a tag's attribute text.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	// The first character of a name may additionally be '='
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
		({$ws}*={$ws}*
		(?:
		# The attribute value: quoted or alone
		\"([^\"]*)(?:\"|\$)
		| '([^']*)(?:'|\$)
		| (((?!{$ws}|>).)*)
		)
		)?/sxu";

	return self::$attribsRegex;
}
126 
/** Lazily built pattern returned by getAttribNameRegex(); null until first use. */
private static $attribNameRegex;

/**
 * Build (once) and return the regular expression an attribute name has
 * to match in full: a first character that is ':', '_', a letter or a
 * number, followed by any of those plus '.' and '-'.
 *
 * @return string PCRE pattern
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$first = '[:_\p{L}\p{N}]';
	$rest = '[:_\.\-\p{L}\p{N}]';
	self::$attribNameRegex = '/^(' . $first . $rest . '*)$/sxu';

	return self::$attribNameRegex;
}
144 
/**
 * Return the tables describing which HTML tags are recognised and how
 * they may be used.
 *
 * The base tables are cached in static variables and rebuilt when
 * $wgAllowImageTag differs from the value seen when they were built.
 * The result for the default arguments is cached as well.
 *
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to take out of the allowed set
 * @return array Map with the keys 'htmlpairs', 'htmlsingle',
 *  'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags',
 *  'htmlsingleallowed' and 'htmlelements'; each value is a
 *  tag-name => true lookup table.
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	static $commonCase, $staticInitialised;
	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $isCommonCase && $commonCase && $staticInitialised === $wgAllowImageTag ) {
		return $commonCase;
	}

	// The cache is keyed on the global config state, so that changing the
	// global (as the test system does) rebuilds the tables.
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
		// Tags that must be closed
		$pairTags = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		// Tags that may be self-closed. Those not also listed in
		// $voidTags are emitted as an open-tag/close-tag pair.
		$singleTags = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];
		// Elements that cannot have close tags; a self-closing form such
		// as <br/> is not an HTML 5 parse error only for these.
		$voidTags = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];
		// Tags that can be nested
		$nestTags = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		// Can only appear inside a table
		$tableTags = [ 'td', 'th', 'tr' ];
		// Tags used by lists
		$listContainerTags = [ 'ul', 'ol' ];
		// Tags that can appear in a list
		$listItemTags = [ 'li' ];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$singleTags[] = 'img';
			$voidTags[] = 'img';
		}

		// Store everything as name => true tables for faster lookup
		$toLookup = static function ( array $names ) {
			return array_fill_keys( $names, true );
		};
		$htmlsingleallowed = $toLookup(
			array_unique( array_merge( $singleTags, $tableTags ) )
		);
		$htmlelementsStatic = $toLookup(
			array_unique( array_merge( $singleTags, $pairTags, $nestTags ) )
		);
		$htmlpairsStatic = $toLookup( $pairTags );
		$htmlsingle = $toLookup( $singleTags );
		$htmlsingleonly = $toLookup( $voidTags );
		$htmlnest = $toLookup( $nestTags );
		$tabletags = $toLookup( $tableTags );
		$htmllist = $toLookup( $listContainerTags );
		$listtags = $toLookup( $listItemTags );

		$staticInitialised = $globalContext;
	}

	// Apply the caller-supplied additions and removals
	$extraLookup = array_fill_keys( $extratags, true );
	$removeLookup = array_fill_keys( $removetags, true );

	$result = [
		// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
		'htmlpairs' => array_merge( $extraLookup, $htmlpairsStatic ),
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => array_diff_key(
			// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
			array_merge( $extraLookup, $htmlelementsStatic ),
			$removeLookup
		),
	];
	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
249 
/**
 * Deprecated entry point that forwards all of its arguments to
 * internalRemoveHtmlTags().
 *
 * @deprecated since 1.38
 * @param string $text Text to clean
 * @param callable|null $processCallback Callback given the attribute text
 *  of each allowed tag (by reference) together with $args
 * @param array|bool $args Second argument passed to $processCallback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string Result of internalRemoveHtmlTags()
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );
	// Forward to the non-deprecated implementation; without this call the
	// argument list below is a dangling fragment.
	return self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
}
288 
/**
 * Clean up HTML in $text: comments are removed, tags that are not in the
 * recognised set (or fail validateTag()) are escaped, and the attributes
 * of the tags that are kept go through fixTagAttributes().
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback Callback given the attribute text
 *  of each allowed tag (by reference) together with $args
 * @param array|bool $args Second argument passed to $processCallback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$selfClosable = $tagData['htmlsingle'];
	$voidOnly = $tagData['htmlsingleonly'];
	$allowedTags = $tagData['htmlelements'];

	// Drop comments, then cut the text at every '<'
	$segments = explode( '<', self::removeHTMLcomments( $text ) );
	$out = str_replace( '>', '&gt;', array_shift( $segments ) );

	# this might be possible using remex tidy itself
	foreach ( $segments as $segment ) {
		$parts = [];
		if ( preg_match( self::ELEMENT_BITS_REGEX, $segment, $parts ) ) {
			[ , $slash, $tag, $params, $brace, $rest ] = $parts;
			$tag = strtolower( $tag );

			if ( isset( $allowedTags[$tag] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, [ &$params, $args ] );
				}

				if ( $brace === '/>'
					&& !isset( $selfClosable[$tag] )
					&& !isset( $voidOnly[$tag] )
				) {
					// Remove the self-closing slash, to be consistent
					// with HTML5 semantics. T134423
					$brace = '>';
				}

				$tagIsValid = self::validateTag( $params, $tag );
				$cleanParams = self::fixTagAttributes( $params, $tag );

				if ( $tagIsValid ) {
					if ( $brace === '/>' && !isset( $voidOnly[$tag] ) ) {
						# Interpret self-closing tags as empty tags even when
						# HTML 5 would interpret them as start tags. Such input
						# is commonly seen on Wikimedia wikis with this intention.
						$brace = "></$tag>";
					}

					$out .= "<$slash$tag$cleanParams$brace" . str_replace( '>', '&gt;', $rest );
					continue;
				}
			}
		}
		// Not a tag we keep: escape the whole segment
		$out .= '&lt;' . str_replace( '>', '&gt;', $segment );
	}
	return $out;
}
370 
/**
 * Clean up HTML in $text by running it through the RemexHtml tokenizer
 * with a RemexRemoveTagHandler in front of the tree builder, then
 * serializing the result.
 *
 * @param string $text Text to clean
 * @param array $options Recognised keys:
 *   - extraTags: additional tag names to allow
 *   - removeTags: tag names to disallow
 *   - attrCallback (internal): passed to RemexRemoveTagHandler
 *   - attrCallbackArgs (internal): passed to RemexRemoveTagHandler
 * @return string The serialized result
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$extraTags = $options['extraTags'] ?? [];
	$removeTags = $options['removeTags'] ?? [];
	// These options are @internal:
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData( $extraTags, $removeTags );

	// Pipeline: tokenizer -> tag remover -> dispatcher -> tree builder -> serializer
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$remover = new RemexRemoveTagHandler(
		new RemexDispatcher( $treeBuilder ),
		$text,
		$tagData,
		$attrCallback,
		$attrCallbackArgs
	);
	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );

	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
431 
/**
 * Remove '<!--' ... '-->' comments from $text.
 *
 * When a comment, together with the spaces around it, sits between two
 * newlines, the comment, those spaces and one of the newlines are
 * removed. Otherwise only the comment itself is removed. Processing
 * stops at the first unterminated comment.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$after = $close + 3;

		// Walk left over spaces, starting at the character before the comment
		$left = max( $open - 1, 0 );
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
		}
		// Walk right over spaces, starting right after the comment
		$right = $after;
		while ( substr( $text, $right, 1 ) === ' ' ) {
			$right++;
		}

		$betweenNewlines = substr( $text, $left, 1 ) === "\n"
			&& substr( $text, $right, 1 ) === "\n";

		if ( $betweenNewlines ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $left, $right - $left + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $after - $open );
		}
	}
	return $text;
}
474 
/**
 * Check whether a tag with the given attribute text is acceptable.
 *
 * Only <meta> and <link> are restricted: both need an itemprop
 * attribute, <meta> additionally needs content and <link> needs href.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
509 
/**
 * Filter an attribute map against the attributes allowed for $element,
 * by way of validateAttributes().
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array The attributes that were kept
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowed = self::attributesAllowedInternal( $element );
	return self::validateAttributes( $attribs, $allowed );
}
529 
/**
 * Filter an attribute map down to the attributes that are allowed and
 * whose values pass the per-attribute checks.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys. A
 *  sequential list of names is still accepted but is deprecated.
 * @return array The attributes that were kept, with 'style', 'id' and the
 *  id-list attributes rewritten by their respective helpers
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes whose value is a space-separated list of ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and HTML5 microdata attributes, which allow URLs, URIs and/or CURIs
	$linkedDataAttribs = [
		'rel', 'rev',
		'about', 'property', 'resource', 'datatype', 'typeof',
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$kept = [];
	foreach ( $attribs as $name => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$kept[$name] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-", except those reserved
		# for MediaWiki code and those containing a colon (namespaced).
		$isDataAttrib = (bool)preg_match( '/^data-[^:]*$/i', $name );
		$isPermitted = $isDataAttrib || array_key_exists( $name, $allowed );
		if ( !$isPermitted || self::isReservedDataAttribute( $name ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $name == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $name, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $linkedDataAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( in_array( $name, $urlAttribs, true ) && !preg_match( $urlPattern, $value ) ) {
			// Drop any URL attribute not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $name === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$kept[$name] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $kept ) ) {
		unset( $kept['itemtype'], $kept['itemid'], $kept['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $kept;
}
646 
/**
 * Tell whether an attribute name starts with one of the reserved
 * prefixes "data-ooui", "data-mw" or "data-parsoid" (case-insensitive).
 *
 * data-ooui is reserved for ooui; data-mw and data-parsoid are reserved
 * for parsoid; data-mw-<name> is reserved for extensions (or core) that
 * need to pass data to the client and be sure it does not come from an
 * untrusted user. Namespaced names are not considered.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	$matched = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
	return (bool)$matched;
}
664 
/**
 * Merge two attribute maps; values from $b win over values from $a.
 *
 * The 'class' attribute is special-cased: when both maps carry a
 * different string value for it, the class names of both are combined
 * into one space-separated list without duplicates.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	if ( is_string( $classA ) && is_string( $classB ) && $classA !== $classB ) {
		$names = preg_split( '/\s+/', $classA . ' ' . $classB, -1, PREG_SPLIT_NO_EMPTY );
		$merged['class'] = implode( ' ', array_unique( $names ) );
	}

	return $merged;
}
687 
/**
 * Normalize a CSS value: decode character references, decode CSS escape
 * sequences and line continuations, then neutralise comments.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references, so this
	// function cannot return unsanitized escape sequences. Input can be
	// crafted so that the result still contains character references, but
	// that is OK because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$ws = '[\\x20\\t\\r\\n\\f]';
		$newline = '(?:\\n|\\r\\n|\\r|\\f)';
		$bs = '\\\\';
		$decodeRegex = "/ {$bs}
			(?:
			({$newline}) | # 1. Line continuation
			([0-9A-Fa-f]{1,6}){$ws}? | # 2. character number
			(.) | # 3. backslash cancelling special meaning
			() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong.
	// This must be done AFTER decoding character references and escape
	// sequences, because those steps can introduce comments. Comments are
	// replaced with a space rather than removed, so this step cannot
	// itself introduce character references or escape sequences.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	if ( $danglingComment === false ) {
		return $value;
	}
	return substr( $value, 0, $danglingComment );
}
747 
/**
 * Normalize a CSS value with normalizeCss() and reject it if it contains
 * control characters, the UTF-8 replacement character, or one of the
 * constructs listed in the pattern below.
 *
 * @param string $value
 * @return string The normalized value, or a CSS comment naming the
 *  reason for the rejection
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Every "\(" below must be a backslash directly followed by the
	// parenthesis. An invisible character between the two leaves the
	// parenthesis unescaped, the pattern no longer compiles and nothing
	// is ever rejected.
	$insecurePattern =
		'! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		!ix';
	if ( preg_match( $insecurePattern, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
789 
/**
 * preg_replace_callback() handler used by normalizeCss() to decode one
 * backslash sequence.
 *
 * @param array $matches 1 = line continuation, 2 = hexadecimal character
 *  number, 3 = single escaped character; otherwise the backslash was at
 *  the end of the string
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
	// Line continuation
	if ( $matches[1] !== '' ) {
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	// These characters need to be escaped in strings
	if ( in_array( $char, [ "\n", '"', "'", '\\' ] ) ) {
		// Clean up the escape sequence to avoid parsing errors by clients
		return '\\' . dechex( ord( $char ) ) . ' ';
	}

	// Decode unnecessary escape
	return $char;
}
816 
/**
 * Turn the raw attribute text of a tag into sanitized attribute text:
 * decode it, keep only what validateTagAttributes() allows for $element,
 * optionally sort by attribute name, and encode it again.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string when $text is blank
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
852 
/**
 * Encode a value for use inside an HTML attribute: HTML special
 * characters (including both quote styles) are escaped, and newline,
 * carriage return and tab become numeric character references.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
872 
/**
 * Replace the plain space next to French punctuation with $space:
 * a space before one of ? : ; ! % » › (when that character is not
 * followed by a word character), and a space after « or ‹.
 *
 * @param string $text
 * @param string $space Replacement for the space; defaults to the
 *  non-breaking space character reference
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Make $space safe as a preg_replace() replacement string:
	// replace $ with \$ and \ with \\. The pattern must consist of these
	// ASCII characters only; an invisible character after a backslash
	// silently changes what it matches.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
893 
/**
 * Encode a value for use inside an HTML attribute, additionally escaping
 * sequences that later wikitext parsing could act on.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextEscapes = [
		// These three should never occur at this point: they mean invalid
		// input that should already have been escaped.
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $wikitextEscapes );

	# Stupid hack: encode the colon inside anything matching a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		static function ( $found ) {
			return str_replace( ':', '&#58;', $found[1] );
		},
		$encoded
	);
}
930 
/**
 * Escape $id for use as an HTML id attribute, using the fragment mode
 * configured in $wgFragmentMode at index $mode.
 *
 * @param string $id
 * @param int $mode One of the ID_* constants
 * @return string|bool The escaped id, or false when a non-primary mode
 *  is requested but not configured
 * @throws UnexpectedValueException When the primary mode is not configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
960 
/**
 * Escape $id for use as a URL fragment, using the primary fragment mode
 * from $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When the primary mode is not configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
986 
/**
 * Escape $id for use as a URL fragment, using the fragment mode
 * configured in $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the variable below is an undefined local
	// and escapeIdInternal() rejects the resulting null mode.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1003 
/**
 * Escape $id with escapeIdInternal() and, in 'html5' mode, additionally
 * encode the '%' of anything that looks like a percent-encoded byte.
 *
 * @param string $id
 * @param string $mode Fragment mode
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );

	return $mode === 'html5'
		? preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped )
		: $escaped;
}
1020 
/**
 * Escape $id according to the given fragment mode.
 *
 * @param string $id
 * @param string $mode Either 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			$id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
			break;
		case 'legacy':
			// This corresponds to 'noninitial' mode of the former escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			$id = urlencode( str_replace( ' ', '_', $id ) );
			$id = strtr( $id, $replace );
			break;
		default:
			// Close the quote around the method name so the message is balanced
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}

	return $id;
}
1057 
/**
 * Deprecated public wrapper around escapeIdReferenceListInternal().
 *
 * @deprecated since 1.36
 * @param string $referenceString Space-delimited list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
	wfDeprecated( __METHOD__, '1.36' );
	$escapedList = self::escapeIdReferenceListInternal( $referenceString );
	return $escapedList;
}
1072 
/**
 * Escape every id in a whitespace-delimited list of ids with
 * escapeIdForAttribute() and join the results with single spaces.
 *
 * @param string $referenceString Space-delimited list of ids
 * @return string Empty string when the list holds no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$ids = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escapedIds = array_map(
		static function ( $id ) {
			return self::escapeIdForAttribute( $id );
		},
		$ids
	);

	# Merge the array back to a space delimited list string
	return implode( ' ', $escapedIds );
}
1095 
/**
 * Turn a string into something usable as a CSS class name: a leading
 * digit or hyphen and any character from the disallowed set become an
 * underscore, runs of underscores collapse into one, and trailing
 * underscores are removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores and kill underscores in ugly places
	$uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$underscoreRuns = '/_+/';

	$cleaned = preg_replace( [ $uglyChars, $underscoreRuns ], '_', $class );

	return rtrim( $cleaned, '_' );
}
1114 
/**
 * Decode the character references in $html, then escape the result for
 * output as HTML.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1130 
/**
 * Parse the raw attribute text of a tag into a map of lower-cased
 * attribute name => decoded value.
 *
 * Names that do not match getAttribNameRegex() are skipped. Values have
 * runs of whitespace collapsed to one space, are trimmed, and have their
 * character references decoded. A later attribute with the same name
 * replaces an earlier one.
 *
 * @param string $text Raw attribute text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$matches = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$nameRegex = self::getAttribNameRegex();
	$decoded = [];
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( $nameRegex, $name ) ) {
			continue;
		}

		// Normalize whitespace
		$raw = self::getTagAttributeCallback( $match );
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = self::decodeCharReferences( $collapsed );
	}

	return $decoded;
}
1173 
/**
 * Build attribute text from a name => value map. Each pair is rendered
 * as name="value" with the value passed through safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value
 * @return string The pairs joined by spaces with one leading space, or
 *  an empty string for an empty map
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$rendered = [];
	foreach ( $assoc_array as $name => $value ) {
		$rendered[] = htmlspecialchars( $name, ENT_COMPAT )
			. '="' . self::safeEncodeAttribute( $value ) . '"';
	}

	if ( !$rendered ) {
		return '';
	}
	return ' ' . implode( ' ', $rendered );
}
1191 
/**
 * Pick the attribute value out of one match produced by the pattern from
 * getAttribsRegex().
 *
 * @param array $set One match set: 5 = unquoted value, 4 = single-quoted
 *  value, 3 = double-quoted value, 2 = the whole "= value" part
 * @return string
 * @throws MWException When a value part matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	// Unquoted, single-quoted, double-quoted; checked in that order
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1219 
/**
 * Collapse every run of space, tab, CR and LF characters into a single
 * space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );
	return trim( $collapsed );
}
1230 
/**
 * Collapse every run of spaces and underscores in a section name into a
 * single space and trim the result.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $collapsed );
}
1242 
/**
 * Run every character reference and bare ampersand in $text through
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$callback = [ self::class, 'normalizeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1264 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal digits, 3 = hexadecimal digits
 * @return string The normalized reference, or the HTML-escaped match when
 *  it is a bare ampersand or an invalid numeric reference
 */
private static function normalizeCharReferencesCallback( $matches ) {
	// The groups are probed in order; a later group is only looked at
	// when the earlier ones are empty.
	if ( $matches[1] !== '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] !== '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] !== '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1284 
/**
 * Normalize a named entity.
 *
 * MediaWiki-specific aliases are replaced by the entity they stand for;
 * lt, gt, amp and quot are kept as named entities; other names known to
 * HTMLData::$namedEntityTranslations become decimal character
 * references; unknown names get their ampersand escaped.
 *
 * @param string $name Entity name including the trailing ';'
 * @return string
 */
private static function normalizeEntity( $name ) {
	// Non-standard MediaWiki-specific entities
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	// Keep these in word form
	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toDecimalReference = static function ( $char ) {
		return '&#' . UtfNormal\Utils::utf8ToCodepoint( $char[0] ) . ';';
	};
	return preg_replace_callback(
		'/./Ssu',
		$toDecimalReference,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1311 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits
 * @return string|null The reference in "&#N;" form, or null when the
 *  code point fails validateCodepoint()
 */
private static function decCharReference( $codepoint ) {
	# The integer conversion will (safely) saturate at the maximum signed
	# integer value if $codepoint is too many digits
	$point = (int)$codepoint;

	if ( !self::validateCodepoint( $point ) ) {
		return null;
	}
	return "&#{$point};";
}
1326 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits
 * @return string|null The reference in lower-case "&#xN;" form, or null
 *  when the value is too long or fails validateCodepoint()
 */
private static function hexCharReference( $codepoint ) {
	# hexdec() will return a float (not an int) if $codepoint is too
	# long, so protect against that. The largest valid codepoint is
	# 0x10FFFF.
	$significantDigits = ltrim( $codepoint, '0' );
	if ( strlen( $significantDigits ) > 6 ) {
		return null;
	}

	$point = hexdec( $codepoint );
	return self::validateCodepoint( $point )
		? '&#x' . dechex( $point ) . ';'
		: null;
}
1345 
/**
 * Tell whether a code point is accepted in a character reference.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1363 
/**
 * Decode the character references in $text by passing every match of
 * CHAR_REFS_REGEX to decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ self::class, 'decodeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1377 
/**
 * Decode the character references in $text and, when at least one
 * replacement was made, pass the result through the content language's
 * normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$replacements
	);

	if ( !$replacements ) {
		return $decoded;
	}
	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1403 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * @param array $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal digits, 3 = hexadecimal digits
 * @return string The decoded text; a bare ampersand is returned as is
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( $matches[2] !== '' ) {
		return self::decodeChar( (int)$matches[2] );
	}

	if ( $matches[3] !== '' ) {
		# hexdec will return a float if the string is too long (!) so
		# check the length of the string first.
		$tooLong = strlen( ltrim( $matches[3], '0' ) ) > 6;
		return $tooLong
			// Invalid character reference.
			? UtfNormal\Constants::UTF8_REPLACEMENT
			: self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1425 
/**
 * Turn a numeric code point into its UTF-8 encoded character.
 *
 * @param int $codepoint
 * @return string The encoded character, or UtfNormal's replacement
 *   constant when validateCodepoint() rejects the code point
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1440 
/**
 * Look up the translation of a named entity.
 *
 * @param string $name Entity name as captured by CHAR_REFS_REGEX, i.e.
 *   without the leading "&" but with the trailing ";"
 * @return string The entry from HTMLData::$namedEntityTranslations, or the
 *   entity text itself ("&" followed by the name) when there is no entry
 */
private static function decodeEntity( $name ) {
	// MediaWiki-specific aliases, which are not in the HTML standard, are
	// mapped to their standard name first.
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1457 
/**
 * Fetch the set of attributes permitted on the given element.
 *
 * @param string $element
 * @return array Attribute names as keys; empty when the element has no
 *   entry in the table built by setupAttributesAllowedInternal()
 */
private static function attributesAllowedInternal( $element ) {
	$byElement = self::setupAttributesAllowedInternal();

	if ( isset( $byElement[$element] ) ) {
		return $byElement[$element];
	}

	return [];
}
1469 
/**
 * Build the table of attributes permitted on each element.
 *
 * The table is memoized in a function-static variable, so it is only
 * constructed on the first call.
 *
 * @return array Map of element name to a set of attribute names. For
 *   lookup efficiency the attribute names are the array keys (each
 *   mapped to true).
 */
private static function setupAttributesAllowedInternal() {
	static $table;

	if ( $table !== null ) {
		// Already built by an earlier call.
		return $table;
	}

	// Returns $base extended with every name from the given lists, each
	// name stored as a key mapped to true.
	$extend = static function ( array $base, array ...$nameLists ) {
		foreach ( $nameLists as $names ) {
			$base = array_merge( $base, array_fill_keys( $names, true ) );
		}
		return $base;
	};

	# HTML
	$htmlAttribs = [ 'id', 'class', 'style', 'lang', 'dir', 'title', 'tabindex' ];

	# WAI-ARIA
	$ariaAttribs = [
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',
	];

	# RDFa
	# These attributes are specified in section 9 of
	# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
	$rdfaAttribs = [ 'about', 'property', 'resource', 'datatype', 'typeof' ];

	# Microdata. These are specified by
	# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
	$microdataAttribs = [ 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' ];

	// Attributes accepted on (nearly) every element below.
	$general = $extend( [], $htmlAttribs, $ariaAttribs, $rdfaAttribs, $microdataAttribs );

	// Sets shared by more than one element.
	$aligned = $extend( $general, [ 'align' ] );
	$quoted = $extend( $general, [ 'cite' ] );
	$edited = $extend( $general, [ 'cite', 'datetime' ] );
	$spanning = $extend( $general, [ 'span' ] );

	$cellAlignment = [ 'align', 'valign' ];
	$cell = $extend(
		$general,
		[
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor', # deprecated
		],
		$cellAlignment
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$table = [
		# 7.5.4
		'div' => $aligned,
		'center' => $general, # deprecated
		'span' => $general,

		# 7.5.5
		'h1' => $aligned,
		'h2' => $aligned,
		'h3' => $aligned,
		'h4' => $aligned,
		'h5' => $aligned,
		'h6' => $aligned,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $general,

		# 9.2.1
		'em' => $general,
		'strong' => $general,
		'cite' => $general,
		'dfn' => $general,
		'code' => $general,
		'samp' => $general,
		'kbd' => $general,
		'var' => $general,
		'abbr' => $general,
		# acronym

		# 9.2.2
		'blockquote' => $quoted,
		'q' => $quoted,

		# 9.2.3
		'sub' => $general,
		'sup' => $general,

		# 9.3.1
		'p' => $aligned,

		# 9.3.2
		'br' => $extend( $general, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $general,

		# 9.3.4
		'pre' => $extend( $general, [ 'width' ] ),

		# 9.4
		'ins' => $edited,
		'del' => $edited,

		# 10.2
		'ul' => $extend( $general, [ 'type' ] ),
		'ol' => $extend( $general, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $general, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $general,
		'dd' => $general,
		'dt' => $general,

		# 11.2.1
		'table' => $extend( $general, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $aligned,

		# 11.2.3
		'thead' => $general,
		'tfoot' => $general,
		'tbody' => $general,

		# 11.2.4
		'colgroup' => $spanning,
		'col' => $spanning,

		# 11.2.5
		'tr' => $extend( $general, [ 'bgcolor' ], $cellAlignment ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $general, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $general, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $general, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $general, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $general, [ 'type', 'src' ] ),
		'track' => $extend( $general, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $general,
		'b' => $general,
		'i' => $general,
		'big' => $general,
		'small' => $general,
		'strike' => $general,
		's' => $general,
		'u' => $general,

		# 15.2.2
		'font' => $extend( $general, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $general, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $general,
		# rbc
		'rb' => $general,
		'rp' => $general,
		'rt' => $general, # rbspan is not included
		'rtc' => $general,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		# HTML 5 section 4.5
		'figure' => $general,
		'figcaption' => $general,

		# HTML 5 section 4.6
		'bdi' => $general,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $general, [ 'value' ] ),
		'time' => $extend( $general, [ 'datetime' ] ),
		'mark' => $general,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including the general attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $general,
	];

	return $table;
}
1713 
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed and whitespace normalized.
 *
 * @param string $html
 * @return string
 */
public static function stripAllTags( $html ) {
	// RemexHtml tokenizes $html; the handler collects the text.
	$textCollector = new RemexStripTagHandler();
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// 'ignoreCharRefs' is deliberately not set: char refs should be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	$remex = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$remex->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1740 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * @return string A "<!DOCTYPE html [ ... ]>" block holding one ENTITY
 *   declaration per named entity that normalizeEntity() rewrites
 */
public static function hackDocType() {
	$declarations = [];

	foreach ( array_keys( HTMLData::$namedEntityTranslations ) as $entity ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );

			// Entities that come back unchanged (&lt; &gt; etc) are skipped.
			if ( $expansion !== $entity ) {
				$name = substr( $entity, 0, -1 );
				$declarations[] = "<!ENTITY $name \"$expansion\">";
			}
		}
	}

	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1771 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and tidy the host portion.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ self::class, 'cleanUrlCallback' ], $url );

	# Validate hostname portion
	$matches = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		// Not of the form "protocol:..."; nothing more to clean.
		return $url;
	}

	[ /* $whole */, $protocol, $host, $rest ] = $matches;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	// The last alternative must not be followed by "|": that would add an
	// empty alternative matching at every offset of the host.
	$strip = "/
		\\s|                    # general whitespace
		\u{00AD}|               # SOFT HYPHEN
		\u{034F}|               # COMBINING GRAPHEME JOINER
		\u{061C}|               # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]|    # HANGUL CHOSEONG FILLER..
		                        # HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]|    # KHMER VOWEL INHERENT AQ..
		                        # KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]|    # MONGOLIAN FREE VARIATION SELECTOR ONE..
		                        # MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}|               # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]|    # ZERO WIDTH SPACE..
		                        # RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]|    # LEFT-TO-RIGHT EMBEDDING..
		                        # RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]|    # WORD JOINER..
		                        # INVISIBLE PLUS
		\u{2065}|               # <reserved-2065>
		[\u{2066}-\u{206F}]|    # LEFT-TO-RIGHT ISOLATE..
		                        # NOMINAL DIGIT SHAPES
		\u{3164}|               # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]|    # VARIATION SELECTOR-1..
		                        # VARIATION SELECTOR-16
		\u{FEFF}|               # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}|               # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]|    # <reserved-FFF0>..
		                        # <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]|  # SHORTHAND FORMAT LETTER OVERLAP..
		                        # SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]|  # MUSICAL SYMBOL BEGIN BEAM..
		                        # MUSICAL SYMBOL END PHRASE
		\u{E0000}|              # <reserved-E0000>
		\u{E0001}|              # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]|  # <reserved-E0002>..
		                        # <reserved-E001F>
		[\u{E0020}-\u{E007F}]|  # TAG SPACE..
		                        # CANCEL TAG
		[\u{E0080}-\u{E00FF}]|  # <reserved-E0080>..
		                        # <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]|  # VARIATION SELECTOR-17..
		                        # VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]   # <reserved-E01F0>..
		                        # <reserved-E0FFF>
		/xuD";

	// preg_replace() returns null on failure, which under the "u" modifier
	// includes a subject that is not valid UTF-8. Keep the host as it was
	// in that case instead of silently dropping it from the URL.
	$strippedHost = preg_replace( $strip, '', $host );
	if ( $strippedHost !== null ) {
		$host = $strippedHost;
	}

	// IPv6 host names are bracketed with []. Url-decode these.
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
	) {
		$host = '//[' . $matches[1] . ']' . $matches[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1856 
/**
 * preg_replace_callback() handler used by cleanUrl().
 *
 * @param string[] $matches Match data; only the full match at [0] is used
 * @return string The matched text passed through urlencode()
 */
private static function cleanUrlCallback( $matches ) {
	$unsafeText = $matches[0];

	return urlencode( $unsafeText );
}
1864 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook is consulted first; when it returns false,
 * the $result it set is returned as is. Otherwise the address is checked
 * against a liberal pattern: a user part, an at sign, and one or more
 * dot-separated domain parts made of letters, digits and hyphens.
 *
 * @param string $addr E-mail address
 * @return bool|null Whether the address matches, or whatever the hook put
 *   in $result when it aborted the check
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a final newline, so an
	// address followed by "\n" would be accepted.
	$html5_email_regexp = "/
		^                      # start of string
		[$rfc5322_atext\\.]+    # user part which is liberal :p
		@                      # at sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                      # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1917 }
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
$matches
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:173
MediaWiki exception.
Definition: MWException.php:30
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:41
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:838
static cleanUrl( $url)
Definition: Sanitizer.php:1776
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:654
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1122
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:766
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:548
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:881
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:152
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:973
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:441
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:858
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:525
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1107
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
Definition: Sanitizer.php:392
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:317
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1239
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1752
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1258
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:280
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1068
const ID_FALLBACK
Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:85
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1371
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:946
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1725
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1388
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:675
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:996
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1893
const ID_PRIMARY
Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:77
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:900
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:696
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.