MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 use Wikimedia\RemexHtml\HTMLData;
32 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
33 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
34 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
35 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
36 
41 class Sanitizer {
/**
 * Regular expression matching the character references handled by this class.
 * Capture groups: 1 = named entity including its trailing semicolon,
 * 2 = decimal number, 3 = hexadecimal number, 4 = a bare ampersand.
 * Whitespace inside the pattern is ignored because of the /x modifier.
 */
48  private const CHAR_REFS_REGEX =
49  '/&([A-Za-z0-9\x80-\xff]+;)
50  |&\#([0-9]+);
51  |&\#[xX]([0-9A-Fa-f]+);
52  |(&)/x';
53 
/**
 * Pattern applied to the text following a '<'. Capture groups: 1 = optional
 * leading slash, 2 = tag name, 3 = raw attribute text, 4 = closing '>' or '/>',
 * 5 = everything after the tag up to the end of the piece.
 */
58  private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
59 
/**
 * Case-insensitive pattern detecting "javascript" or "vbscript" at the start of
 * a value, after whitespace, or after a closing CSS comment.
 */
69  private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/** Pattern recognising an "xmlns:<prefix>" attribute name. */
70  private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
71 
/** Index into $wgFragmentMode selecting the primary ID escaping mode. */
77  public const ID_PRIMARY = 0;
78 
/** Index into $wgFragmentMode selecting the fallback ID escaping mode. */
85  public const ID_FALLBACK = 1;
86 
/**
 * Non-standard, MediaWiki-specific entity names mapped to the standard entity
 * name they stand for (keys and values include the trailing semicolon).
 */
91  private const MW_ENTITY_ALIASES = [
92  'רלמ;' => 'rlm;',
93  'رلم;' => 'rlm;',
94  ];
95 
/**
 * Cache for the pattern built by getAttribsRegex(); null until first use.
 */
private static $attribsRegex;

/**
 * Build (once) and return the regular expression that splits the raw
 * attribute text of a tag into name/value pairs.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part when
 * present, 3 = double-quoted value, 4 = single-quoted value,
 * 5 = unquoted value.
 *
 * @return string PCRE pattern including delimiters and modifiers
 */
private static function getAttribsRegex() {
    if ( self::$attribsRegex !== null ) {
        return self::$attribsRegex;
    }

    $wsChars = '\x09\x0a\x0c\x0d\x20';
    $ws = "[{$wsChars}]";
    $nameChar = "[^{$wsChars}\/>=]";
    // The first character of a name may additionally be "="
    $nameStart = "(?:{$nameChar}|=)";

    self::$attribsRegex =
        "/({$nameStart}{$nameChar}*)
        ($ws*=$ws*
        (?:
        # The attribute value: quoted or alone
        \"([^\"]*)(?:\"|\$)
        | '([^']*)(?:'|\$)
        | (((?!$ws|>).)*)
        )
        )?/sxu";

    return self::$attribsRegex;
}
126 
/**
 * Cache for the pattern built by getAttribNameRegex(); null until first use.
 */
private static $attribNameRegex;

/**
 * Build (once) and return the pattern that a complete attribute name has to
 * match: a letter, digit, ":" or "_" followed by any number of letters,
 * digits, ":", "_", "." or "-".
 *
 * @return string PCRE pattern including delimiters and modifiers
 */
private static function getAttribNameRegex() {
    if ( self::$attribNameRegex !== null ) {
        return self::$attribNameRegex;
    }

    $firstChar = "[:_\p{L}\p{N}]";
    $laterChar = "[:_\.\-\p{L}\p{N}]";
    self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

    return self::$attribNameRegex;
}
144 
/**
 * Return the lookup tables describing which HTML tags are recognised.
 *
 * The base tables live in static variables. They are (re)built whenever the
 * stored marker is falsy or differs from the current $wgAllowImageTag value.
 * The result of a call without extra or removed tags is additionally cached
 * in $commonCase and returned directly while $wgAllowImageTag is unchanged.
 *
 * NOTE(review): the marker stored in $staticInitialised is the config value
 * itself, so while $wgAllowImageTag is falsy the `!$staticInitialised` test
 * is always true and the tables are rebuilt on every call that misses the
 * $commonCase cache.
 *
 * @param array $extratags Additional tag names to accept; added to
 *   'htmlpairs' and 'htmlelements'
 * @param array $removetags Tag names to remove from 'htmlelements'
 * @return array Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed' and
 *   'htmlelements'; each value is an array keyed by tag name
 */
152  public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
153  global $wgAllowImageTag;
154  static $commonCase, $staticInitialised;
155  $isCommonCase = ( $extratags === [] && $removetags === [] );
// Fast path: reuse the cached result for the argument-less case.
156  if ( $staticInitialised === $wgAllowImageTag && $isCommonCase && $commonCase ) {
157  return $commonCase;
158  }
159 
160  static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
161  $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
162 
163  // Base our staticInitialised variable off of the global config state so that if the globals
164  // are changed (like in the screwed up test system) we will re-initialise the settings.
165  $globalContext = $wgAllowImageTag;
166  if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
167  $htmlpairsStatic = [ # Tags that must be closed
168  'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
169  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
170  'strike', 'strong', 'tt', 'var', 'div', 'center',
171  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
172  'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
173  'kbd', 'samp', 'data', 'time', 'mark'
174  ];
175  # These tags can be self-closed. For tags not also on
176  # $htmlsingleonly, a self-closed tag will be emitted as
177  # an empty element (open-tag/close-tag pair).
178  $htmlsingle = [
179  'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
180  ];
181 
182  # Elements that cannot have close tags. This is (not coincidentally)
183  # also the list of tags for which the HTML 5 parsing algorithm
184  # requires you to "acknowledge the token's self-closing flag", i.e.
185  # a self-closing tag like <br/> is not an HTML 5 parse error only
186  # for this list.
187  $htmlsingleonly = [
188  'br', 'wbr', 'hr', 'meta', 'link'
189  ];
190 
191  $htmlnest = [ # Tags that can be nested--??
192  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
193  'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
194  'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
195  ];
196  $tabletags = [ # Can only appear inside table, we will close them
197  'td', 'th', 'tr',
198  ];
199  $htmllist = [ # Tags used by list
200  'ul', 'ol',
201  ];
202  $listtags = [ # Tags that can appear in a list
203  'li',
204  ];
205 
// <img> is only recognised when the deprecated $wgAllowImageTag setting is on.
206  if ( $wgAllowImageTag ) {
207  wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
208  'is deprecated since MediaWiki 1.35', '1.35', false, false );
209  $htmlsingle[] = 'img';
210  $htmlsingleonly[] = 'img';
211  }
212 
213  $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
214  $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
215 
216  # Convert them all to hashtables for faster lookup
217  $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
218  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
// Variable-variables: each named list is replaced in place by a tag => true map.
219  foreach ( $vars as $var ) {
220  $$var = array_fill_keys( $$var, true );
221  }
222  $staticInitialised = $globalContext;
223  }
224 
225  # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
226  $extratags = array_fill_keys( $extratags, true );
227  $removetags = array_fill_keys( $removetags, true );
228  // @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
229  $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
230  // @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
231  $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
232 
233  $result = [
234  'htmlpairs' => $htmlpairs,
235  'htmlsingle' => $htmlsingle,
236  'htmlsingleonly' => $htmlsingleonly,
237  'htmlnest' => $htmlnest,
238  'tabletags' => $tabletags,
239  'htmllist' => $htmllist,
240  'listtags' => $listtags,
241  'htmlsingleallowed' => $htmlsingleallowed,
242  'htmlelements' => $htmlelements,
243  ];
// Only the argument-less result is safe to reuse for later calls.
244  if ( $isCommonCase ) {
245  $commonCase = $result;
246  }
247  return $result;
248  }
249 
/**
 * Deprecated public entry point; emits a deprecation notice and forwards
 * every argument unchanged to internalRemoveHtmlTags().
 *
 * @deprecated since 1.38
 * @param string $text Text to clean
 * @param callable|null $processCallback Callback invoked on the attribute
 *   text of each recognised tag
 * @param array|bool $args Extra argument handed to $processCallback
 * @param array $extratags Additional tag names to allow
 * @param array $removetags Tag names to disallow
 * @return string Result of internalRemoveHtmlTags()
 */
public static function removeHTMLtags( $text, $processCallback = null,
    $args = [], $extratags = [], $removetags = []
) {
    wfDeprecated( __METHOD__, '1.38' );
    return self::internalRemoveHtmlTags(
        $text, $processCallback, $args, $extratags, $removetags
    );
}
288 
/**
 * Strip HTML comments from $text and escape every tag that is not
 * recognised or fails validation; recognised tags are re-emitted with
 * sanitised attributes.
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback Called with the tag's attribute
 *   text (by reference) and $args before the tag is validated
 * @param array|bool $args Extra argument handed to $processCallback
 * @param array $extratags Additional tag names to allow
 * @param array $removetags Tag names to disallow
 * @return string
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
    $args = [], $extratags = [], $removetags = []
) {
    $tagData = self::getRecognizedTagData( $extratags, $removetags );
    $selfClosable = $tagData['htmlsingle'];
    $voidOnly = $tagData['htmlsingleonly'];
    $allowedElements = $tagData['htmlelements'];

    # Remove HTML comments
    $text = self::removeHTMLcomments( $text );
    $pieces = explode( '<', $text );
    // Everything before the first "<" cannot contain a tag.
    $text = str_replace( '>', '&gt;', array_shift( $pieces ) );

    # this might be possible using remex tidy itself
    foreach ( $pieces as $piece ) {
        $regs = [];
        $looksLikeTag = preg_match( self::ELEMENT_BITS_REGEX, $piece, $regs );
        $tag = $looksLikeTag ? strtolower( $regs[2] ) : null;

        if ( $tag === null || !isset( $allowedElements[$tag] ) ) {
            // Not a recognised tag: emit it as escaped text.
            $text .= '&lt;' . str_replace( '>', '&gt;', $piece );
            continue;
        }

        $slash = $regs[1];
        $params = $regs[3];
        $brace = $regs[4];
        $rest = $regs[5];

        if ( is_callable( $processCallback ) ) {
            call_user_func_array( $processCallback, [ &$params, $args ] );
        }

        if ( $brace === '/>' && !isset( $selfClosable[$tag] ) && !isset( $voidOnly[$tag] ) ) {
            // Remove the self-closing slash, to be consistent
            // with HTML5 semantics. T134423
            $brace = '>';
        }

        $tagIsValid = self::validateTag( $params, $tag );
        $newparams = self::fixTagAttributes( $params, $tag );

        if ( !$tagIsValid ) {
            $text .= '&lt;' . str_replace( '>', '&gt;', $piece );
            continue;
        }

        if ( $brace === '/>' && !isset( $voidOnly[$tag] ) ) {
            # Interpret self-closing tags as empty tags even when
            # HTML 5 would interpret them as start tags. Such input
            # is commonly seen on Wikimedia wikis with this intention.
            $brace = "></$tag>";
        }

        $text .= "<$slash$tag$newparams$brace" . str_replace( '>', '&gt;', $rest );
    }

    return $text;
}
370 
/**
 * Remove disallowed tags from $text by running it through the RemexHtml
 * tokenizer with a tag-removing handler, then serialising the result.
 *
 * @param string $text Text to clean
 * @param array $options Recognised keys: 'extraTags', 'removeTags', and the
 *   internal keys 'attrCallback', 'attrCallbackArgs' and 'tidy'
 * @return string Serialised output
 */
public static function removeSomeTags(
    string $text, array $options = []
): string {
    $extraTags = $options['extraTags'] ?? [];
    $removeTags = $options['removeTags'] ?? [];
    // These options are @internal:
    $attrCallback = $options['attrCallback'] ?? null;
    $attrCallbackArgs = $options['attrCallbackArgs'] ?? [];
    // Read for parity with the option list; not consulted in this method.
    $tidy = $options['tidy'] ?? true;

    // This disallows HTML5-style "missing trailing semicolon" attributes
    // In wikitext "clean&copy" does *not* contain an entity.
    $text = self::normalizeCharReferences( $text );
    $tagData = self::getRecognizedTagData( $extraTags, $removeTags );

    // Output side of the pipeline: serializer <- tree builder <- dispatcher
    $serializer = new RemexSerializer( new RemexCompatFormatter );
    $treeBuilder = new RemexTreeBuilder( $serializer, [
        'ignoreErrors' => true,
        'ignoreNulls' => true,
    ] );
    $dispatcher = new RemexDispatcher( $treeBuilder );

    // Input side: the tokenizer feeds the handler that removes barred tags
    $remover = new RemexRemoveTagHandler(
        $dispatcher, $text, $tagData,
        $attrCallback, $attrCallbackArgs
    );
    $tokenizer = new RemexTokenizer( $remover, $text, [
        'ignoreErrors' => true,
        // don't ignore char refs, we want them to be decoded
        'ignoreNulls' => true,
        'skipPreprocess' => true,
    ] );

    $tokenizer->execute( [
        'fragmentNamespace' => HTMLData::NS_HTML,
        'fragmentName' => 'body',
    ] );

    return $serializer->getResult();
}
432 
/**
 * Remove every terminated HTML comment from $text. A comment that sits on a
 * line of its own (only spaces around it, newline before and after) is
 * removed together with that line; an unterminated comment is left alone.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
    for ( ; ; ) {
        $open = strpos( $text, '<!--' );
        if ( $open === false ) {
            break;
        }
        $close = strpos( $text, '-->', $open + 4 );
        if ( $close === false ) {
            # Unterminated comment; bail out
            break;
        }
        $after = $close + 3;

        # Widen the span over the spaces on both sides of the comment
        $from = max( $open - 1, 0 );
        $span = $after - $from;
        while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
            $from--;
            $span++;
        }
        while ( substr( $text, $from + $span, 1 ) === ' ' ) {
            $span++;
        }

        $onOwnLine = substr( $text, $from, 1 ) === "\n"
            && substr( $text, $from + $span, 1 ) === "\n";

        if ( $onOwnLine ) {
            # Remove the comment, leading and trailing
            # spaces, and leave only one newline.
            $text = substr_replace( $text, "\n", $from, $span + 1 );
        } else {
            # Remove just the comment.
            $text = substr_replace( $text, '', $open, $after - $open );
        }
    }

    return $text;
}
475 
/**
 * Check tag-specific requirements: <meta> and <link> are only accepted when
 * they carry itemprop plus content (meta) or href (link). Every other
 * element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-cased tag name
 * @return bool Whether the tag may be kept
 */
private static function validateTag( $params, $element ) {
    $attribs = self::decodeTagAttributes( $params );

    $isMeta = ( $element == 'meta' );
    $isLink = ( $element == 'link' );
    if ( !$isMeta && !$isLink ) {
        return true;
    }

    // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
    if ( !isset( $attribs['itemprop'] ) ) {
        return false;
    }
    // <meta> must have a content="" for the itemprop
    if ( $isMeta && !isset( $attribs['content'] ) ) {
        return false;
    }
    // <link> must have an associated href=""
    if ( $isLink && !isset( $attribs['href'] ) ) {
        return false;
    }

    return true;
}
510 
/**
 * Filter an attribute array against the attributes allowed for $element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name used to look up the allowed attributes
 * @return array Result of validateAttributes()
 */
public static function validateTagAttributes( $attribs, $element ) {
    $allowed = self::attributesAllowedInternal( $element );

    return self::validateAttributes( $attribs, $allowed );
}
530 
/**
 * Filter an attribute array down to the entries that are allowed and safe.
 *
 * Attributes are dropped when they are neither allowed nor "data-*", when
 * they are reserved data attributes, or when their value fails the check
 * that applies to that attribute. Values of style, id and the ARIA
 * id-reference attributes are rewritten rather than dropped.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys; a sequential
 *   list of names is still accepted but deprecated
 * @return array Filtered attribute name => value
 */
549  public static function validateAttributes( $attribs, $allowed ) {
550  if ( isset( $allowed[0] ) ) {
551  // Calling this function with a sequential array is
552  // deprecated. For now just convert it.
553  wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
554  $allowed = array_fill_keys( $allowed, true );
555  }
// A URL is accepted only if it starts with one of the configured protocols
// and contains no whitespace.
556  $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
557 
558  $out = [];
559  foreach ( $attribs as $attribute => $value ) {
560  # Allow XML namespace declaration to allow RDFa
561  if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
562  if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
563  $out[$attribute] = $value;
564  }
565 
566  continue;
567  }
568 
569  # Allow any attribute beginning with "data-"
570  # However:
571  # * Disallow data attributes used by MediaWiki code
572  # * Ensure that the attribute is not namespaced by banning
573  # colons.
574  if ( (
575  !preg_match( '/^data-[^:]*$/i', $attribute ) &&
576  !array_key_exists( $attribute, $allowed )
577  ) || self::isReservedDataAttribute( $attribute ) ) {
578  continue;
579  }
580 
581  # Strip javascript "expression" from stylesheets.
582  # https://msdn.microsoft.com/en-us/library/ms537634.aspx
583  if ( $attribute == 'style' ) {
584  $value = self::checkCss( $value );
585  }
586 
587  # Escape HTML id attributes
588  if ( $attribute === 'id' ) {
589  $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
590  }
591 
592  # Escape HTML id reference lists
593  if ( $attribute === 'aria-describedby'
594  || $attribute === 'aria-flowto'
595  || $attribute === 'aria-labelledby'
596  || $attribute === 'aria-owns'
597  ) {
598  $value = self::escapeIdReferenceListInternal( $value );
599  }
600 
601  // RDFa and microdata properties allow URLs, URIs and/or CURIs.
602  if ( $attribute === 'rel' || $attribute === 'rev'
603  # RDFa
604  || $attribute === 'about' || $attribute === 'property'
605  || $attribute === 'resource' || $attribute === 'datatype'
606  || $attribute === 'typeof'
607  # HTML5 microdata
608  || $attribute === 'itemid' || $attribute === 'itemprop'
609  || $attribute === 'itemref' || $attribute === 'itemscope'
610  || $attribute === 'itemtype'
611  ) {
612  // Paranoia. Allow "simple" values but suppress javascript
613  if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
614  continue;
615  }
616  }
617 
618  # NOTE: even though elements using href/src are not allowed directly, supply
619  # validation code that can be used by tag hook handlers, etc
620  if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) {
621  if ( !preg_match( $hrefExp, $value ) ) {
622  continue; // drop any href or src attributes not using an allowed protocol.
623  // NOTE: this also drops all relative URLs
624  }
625  }
626 
627  if ( $attribute === 'tabindex' && $value !== '0' ) {
628  // Only allow tabindex of 0, which is useful for accessibility.
629  continue;
630  }
631 
632  // If this attribute was previously set, override it.
633  // Output should only have one attribute of each name.
634  $out[$attribute] = $value;
635  }
636 
637  # itemtype, itemid, itemref don't make sense without itemscope
638  if ( !array_key_exists( 'itemscope', $out ) ) {
639  unset( $out['itemtype'] );
640  unset( $out['itemid'] );
641  unset( $out['itemref'] );
642  }
643  # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
644 
645  return $out;
646  }
647 
/**
 * Tell whether an attribute name starts with one of the reserved data
 * prefixes "data-ooui", "data-mw" or "data-parsoid" (case-insensitive).
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
    // data-ooui is reserved for ooui.
    // data-mw and data-parsoid are reserved for parsoid.
    // data-mw-<name here> is reserved for extensions (or core) if
    // they need to communicate some data to the client and want to be
    // sure that it isn't coming from an untrusted user.
    // We ignore the possibility of namespaces since user-generated HTML
    // can't use them anymore.
    $reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

    return preg_match( $reservedPrefix, $attr ) === 1;
}
665 
/**
 * Merge two attribute arrays; entries of $b win over entries of $a. When
 * both carry a different string 'class', the class lists are combined with
 * duplicates removed instead of one replacing the other.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
    $merged = array_merge( $a, $b );

    $classA = $a['class'] ?? null;
    $classB = $b['class'] ?? null;
    if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
        return $merged;
    }

    $names = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
    $merged['class'] = implode( ' ', array_unique( $names ) );

    return $merged;
}
688 
/**
 * Normalise a CSS value: decode character references, decode CSS escape
 * sequences and line continuations, then replace comments with spaces and
 * cut the value at any remaining comment start.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
    // Decode character references like &#123;
    $value = self::decodeCharReferences( $value );

    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    static $decodeRegex;
    if ( !$decodeRegex ) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ $backslash
            (?:
            ($nl) | # 1. Line continuation
            ([0-9A-Fa-f]{1,6})$space? | # 2. character number
            (.) | # 3. backslash cancelling special meaning
            () | # 4. backslash at end of string
            )/xu";
    }
    $value = preg_replace_callback(
        $decodeRegex,
        [ __CLASS__, 'cssDecodeCallback' ],
        $value
    );

    // Let the value through if it's nothing but a single comment, to
    // allow other functions which may reject it to pass some error
    // message through.
    if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
        return $value;
    }

    // Remove any comments; IE gets token splitting wrong
    // This must be done AFTER decoding character references and
    // escape sequences, because those steps can introduce comments
    // This step cannot introduce character references or escape
    // sequences, because it replaces comments with spaces rather
    // than removing them completely.
    $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

    // Remove anything after a comment-start token, to guard against
    // incorrect client implementations.
    $commentPos = strpos( $value, '/*' );

    return $commentPos === false ? $value : substr( $value, 0, $commentPos );
}
748 
/**
 * Normalise a CSS value and replace it with a CSS comment when it contains
 * control characters, the UTF-8 replacement character, or one of the
 * constructs treated as insecure (expression, filter:, accelerator:,
 * the -o-link family, url(), image(), image-set(), attr() combined with
 * url, var()).
 *
 * @param string $value Raw value of a style attribute
 * @return string The normalised value, or a CSS comment naming the reason
 *   it was rejected
 */
public static function checkCss( $value ) {
    $value = self::normalizeCss( $value );

    // Reject problematic keywords and control characters
    if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
        strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
        return '/* invalid control char */';
    }

    // Every "\(" below must be a plain backslash immediately followed by an
    // opening parenthesis so that it matches a literal "(".
    $insecure = preg_match(
        '! expression
        | filter\s*:
        | accelerator\s*:
        | -o-link\s*:
        | -o-link-source\s*:
        | -o-replace\s*:
        | url\s*\(
        | image\s*\(
        | image-set\s*\(
        | attr\s*\([^)]+[\s,]+url
        | var\s*\(
        !ix', $value );
    if ( $insecure ) {
        return '/* insecure input */';
    }

    return $value;
}
791 
/**
 * preg_replace_callback() handler for the escape pattern built in
 * normalizeCss(): turns one CSS escape sequence into its replacement text.
 *
 * @param array $matches Match array; 1 = line continuation, 2 = hex
 *   character number, 3 = single escaped character
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
    if ( $matches[1] !== '' ) {
        // Line continuation
        return '';
    }

    if ( $matches[2] !== '' ) {
        $char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
    } elseif ( $matches[3] !== '' ) {
        $char = $matches[3];
    } else {
        // Backslash at the end of the string
        $char = '\\';
    }

    $needsEscape = ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' );
    if ( !$needsEscape ) {
        // Decode unnecessary escape
        return $char;
    }

    // These characters need to be escaped in strings
    // Clean up the escape sequence to avoid parsing errors by clients
    return '\\' . dechex( ord( $char ) ) . ' ';
}
816 
/**
 * Turn the raw attribute text of a tag into a sanitised, re-encoded
 * attribute string: decode, validate against $element, optionally sort by
 * name, then encode.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string when $text is blank
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $attribs = self::validateTagAttributes(
        self::decodeTagAttributes( $text ),
        $element
    );
    if ( $sorted ) {
        ksort( $attribs );
    }

    return self::safeEncodeTagAttributes( $attribs );
}
852 
/**
 * Encode a value for use inside a quoted HTML attribute: HTML special
 * characters (including both quote kinds) plus newline, carriage return
 * and tab.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
    // Whitespace is normalized during attribute decoding,
    // so if we've been passed non-spaces we must encode them
    // ahead of time or they won't be preserved.
    return str_replace(
        [ "\n", "\r", "\t" ],
        [ '&#10;', '&#13;', '&#9;' ],
        htmlspecialchars( $text, ENT_QUOTES )
    );
}
872 
/**
 * Replace the plain space used in French punctuation spacing with $space:
 * a space before one of ?:;!%»› (when that character is not followed by a
 * word character) and a space after « or ‹.
 *
 * @param string $text
 * @param string $space Replacement for the space; "$" and "\" in it are
 *   escaped so they are inserted literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
    // Replace $ with \$ and \ with \\
    $space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
    $fixtags = [
        # French spaces, last one Guillemet-left
        # only if it isn't followed by a word character.
        '/ (?=[?:;!%»›](?!\w))/u' => "$space",
        # French spaces, Guillemet-right
        '/([«‹]) /u' => "\\1$space",
    ];
    return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
893 
/**
 * Encode an attribute value like encodeAttribute() and additionally
 * replace characters and keywords that later parsing stages could expand
 * (brackets, braces, pipes, magic-link keywords, URL protocols).
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $armor = [
        '<' => '&lt;', // This should never happen,
        '>' => '&gt;', // we've received invalid input
        '"' => '&quot;', // which should have been escaped.
        '{' => '&#123;',
        '}' => '&#125;', // prevent unpaired language conversion syntax
        '[' => '&#91;',
        ']' => '&#93;',
        "''" => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC' => '&#82;FC',
        'PMID' => '&#80;MID',
        '|' => '&#124;',
        '__' => '&#95;_',
    ];
    $armored = strtr( self::encodeAttribute( $text ), $armor );

    # Stupid hack: encode the colon of anything that looks like a URL protocol
    $protocolRegex = '/((?i)' . wfUrlProtocols() . ')/';
    $hideColon = static function ( $matches ) {
        return str_replace( ':', '&#58;', $matches[1] );
    };

    return preg_replace_callback( $protocolRegex, $hideColon, $armored );
}
930 
/**
 * Escape $id for use as an HTML id attribute, using the escaping mode
 * configured in $wgFragmentMode at index $mode.
 *
 * @param string $id
 * @param int $mode One of the ID_* constants
 * @return string|bool The escaped id, or false when no mode is configured
 *   at a non-primary index
 * @throws UnexpectedValueException When the primary mode is not configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
    global $wgFragmentMode;

    if ( isset( $wgFragmentMode[$mode] ) ) {
        return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
    }

    if ( $mode === self::ID_PRIMARY ) {
        throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
    }

    return false;
}
960 
/**
 * Escape $id for use as a URL fragment, using the primary mode configured
 * in $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When the primary mode is not configured
 */
public static function escapeIdForLink( $id ) {
    global $wgFragmentMode;

    $primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
    if ( $primaryMode === null ) {
        throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
    }

    return self::escapeIdInternalUrl( $id, $primaryMode );
}
986 
/**
 * Escape $id for use as a URL fragment in a link to an external interwiki,
 * using the mode configured in $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
    // The mode comes from site configuration, so it has to be pulled in
    // from the global scope before use.
    global $wgExternalInterwikiFragmentMode;

    $id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

    return $id;
}
1003 
/**
 * Escape $id with escapeIdInternal() and, in 'html5' mode, additionally
 * encode the "%" of anything that looks like a percent-encoded byte.
 *
 * @param string $id
 * @param string $mode Escaping mode
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
    $escaped = self::escapeIdInternal( $id, $mode );

    return $mode === 'html5'
        ? preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped )
        : $escaped;
}
1020 
/**
 * Escape $id according to $mode after truncating it to 1024 characters.
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
    // Truncate overly-long IDs. This isn't an HTML limit, it's just
    // griefer protection. [T251506]
    $id = mb_substr( $id, 0, 1024 );

    if ( $mode == 'html5' ) {
        // html5 spec says ids must not have any of the following:
        // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
        // In practice, in wikitext, only tab, LF, CR (and SPACE) are
        // possible using either Lua or html entities.
        return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
    }

    if ( $mode == 'legacy' ) {
        // This corresponds to 'noninitial' mode of the former escapeId()
        $encoded = urlencode( str_replace( ' ', '_', $id ) );
        return strtr( $encoded, [
            '%3A' => ':',
            '%' => '.'
        ] );
    }

    throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1057 
/**
 * Deprecated public wrapper around escapeIdReferenceListInternal().
 *
 * @deprecated since 1.36
 * @param string $referenceString Space-delimited list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
    wfDeprecated( __METHOD__, '1.36' );

    $escapedList = self::escapeIdReferenceListInternal( $referenceString );
    return $escapedList;
}
1072 
/**
 * Escape every id in a whitespace-delimited list of id references and join
 * the results with single spaces.
 *
 * @param string $referenceString Whitespace-delimited list of ids
 * @return string Empty string when the list contains no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
    # Explode the space delimited list string into an array of tokens
    $tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

    # Escape each token as an id
    $escaped = array_map(
        static function ( $token ) {
            return self::escapeIdForAttribute( $token );
        },
        $tokens
    );

    # Merge the array back to a space delimited list string
    # If the array is empty, the result will be an empty string ('')
    return implode( ' ', $escaped );
}
1095 
/**
 * Turn a string into something usable as a CSS class name: a leading digit
 * or hyphen and every listed punctuation, control or space character
 * becomes an underscore, runs of underscores collapse into one, and
 * trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
    // Convert ugly stuff to underscores and kill underscores in ugly places
    $uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';

    $underscored = preg_replace( $uglyChars, '_', $class );
    $collapsed = preg_replace( '/_+/', '_', $underscored );

    return rtrim( $collapsed, '_' );
}
1114 
/**
 * Escape a string for HTML output while letting existing character
 * references through: they are decoded first, then the whole string is
 * escaped.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
    $decoded = self::decodeCharReferences( $html );

    # It seems wise to escape ' as well as ", as a matter of course. Can't
    # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
    # don't cause the entire string to disappear.
    return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1130 
/**
 * Parse the raw attribute text of a tag into an array of lower-cased
 * attribute name => decoded value. Names with unacceptable characters are
 * skipped; a later attribute of the same name overrides an earlier one.
 *
 * @param string $text Raw attribute text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
    $attribs = [];
    if ( trim( $text ) == '' ) {
        return $attribs;
    }

    $sets = [];
    $found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
    if ( !$found ) {
        return $attribs;
    }

    $nameRegex = self::getAttribNameRegex();
    foreach ( $sets as $set ) {
        $name = strtolower( $set[1] );

        // Filter attribute names with unacceptable characters
        if ( !preg_match( $nameRegex, $name ) ) {
            continue;
        }

        // Normalize whitespace, then decode character references
        $raw = self::getTagAttributeCallback( $set );
        $normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );
        $attribs[$name] = self::decodeCharReferences( $normalized );
    }

    return $attribs;
}
1173 
/**
 * Build an attribute string from an array of name => value. Every pair is
 * rendered as name="value" and preceded by a space; an empty array gives
 * an empty string.
 *
 * @param array $assoc_array Attribute name => value
 * @return string
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
    $rendered = '';
    foreach ( $assoc_array as $name => $value ) {
        $encName = htmlspecialchars( $name, ENT_COMPAT );
        $encValue = self::safeEncodeAttribute( $value );

        $rendered .= " $encName=\"$encValue\"";
    }

    return $rendered;
}
1191 
/**
 * Pick the attribute value out of one match set produced by the pattern
 * from getAttribsRegex().
 *
 * @param array $set Match set; 5 = unquoted value, 4 = single-quoted
 *   value, 3 = double-quoted value, 2 = the "= value" part
 * @return string The value, or an empty string for an attribute given
 *   without a value
 * @throws MWException When the set has a value part but no value group
 */
private static function getTagAttributeCallback( $set ) {
    // Unquoted, single-quoted, double-quoted - in that order of precedence
    $value = $set[5] ?? $set[4] ?? $set[3] ?? null;
    if ( $value !== null ) {
        return $value;
    }

    if ( !isset( $set[2] ) ) {
        # In XHTML, attributes must have a value so return an empty string.
        # See "Empty attribute syntax",
        # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
        return "";
    }

    throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1219 
/**
 * Collapse every run of spaces, tabs, carriage returns and line feeds into
 * a single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

    return trim( $collapsed );
}
1230 
/**
 * Collapse every run of spaces and underscores in a section name into a
 * single space and trim the result.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
    $collapsed = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $collapsed );
}
1242 
/**
 * Run every character reference (and bare ampersand) in $text through
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
    $callback = [ self::class, 'normalizeCharReferencesCallback' ];

    return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1264 
/**
 * preg_replace_callback() handler for CHAR_REFS_REGEX: normalise one
 * matched reference, or HTML-escape the match when it is not a valid
 * reference.
 *
 * @param array $matches 1 = named entity, 2 = decimal number, 3 = hex number
 * @return string
 */
private static function normalizeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        $normalized = self::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = self::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = self::hexCharReference( $matches[3] );
    } else {
        $normalized = null;
    }

    // null means "not a usable reference": escape the raw match instead
    return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1284 
/**
 * Normalise a named entity. MediaWiki aliases map to their standard name,
 * lt/gt/amp/quot stay in word form, other known entities become numeric
 * references, and unknown names get their ampersand escaped.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string
 */
private static function normalizeEntity( $name ) {
    // Non-standard MediaWiki-specific entities
    $alias = self::MW_ENTITY_ALIASES[$name] ?? null;
    if ( $alias !== null ) {
        return '&' . $alias;
    }

    // Keep these in word form
    if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
        return "&$name";
    }

    $expansion = HTMLData::$namedEntityTranslations[$name] ?? null;
    if ( $expansion === null ) {
        return "&amp;$name";
    }

    // Beware: some entities expand to more than 1 codepoint
    $toNumericReference = static function ( $m ) {
        return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
    };
    return preg_replace_callback( '/./Ssu', $toNumericReference, $expansion );
}
1311 
/**
 * Normalise a decimal character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null "&#N;" for an allowed code point, null otherwise
 */
private static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );

    return self::validateCodepoint( $point ) ? sprintf( '&#%d;', $point ) : null;
}
1324 
/**
 * Normalise a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null "&#xN;" (lower-case hex) for an allowed code point,
 *   null otherwise
 */
private static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );

    return self::validateCodepoint( $point ) ? sprintf( '&#x%x;', $point ) : null;
}
1337 
/**
 * Tell whether a code point is accepted in a character reference: tab,
 * line feed, or a value inside one of the allowed ranges.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    # U+000C is valid in HTML5 but not allowed in XML.
    # U+000D is valid in XML but not allowed in HTML5.
    # U+007F - U+009F are disallowed in HTML5 (control characters).
    if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
        return true;
    }

    $allowedRanges = [
        [ 0x20, 0x7e ],
        [ 0xa0, 0xd7ff ],
        [ 0xe000, 0xfffd ],
        [ 0x10000, 0x10ffff ],
    ];
    foreach ( $allowedRanges as $range ) {
        if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
            return true;
        }
    }

    return false;
}
1355 
/**
 * Run every character reference in $text through
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $callback = [ self::class, 'decodeCharReferencesCallback' ];

    return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1369 
/**
 * Decode character references like decodeCharReferences() and, when at
 * least one match was processed, pass the result through the content
 * language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    $replacements = 0;
    $decoded = preg_replace_callback(
        self::CHAR_REFS_REGEX,
        [ self::class, 'decodeCharReferencesCallback' ],
        $text,
        -1, // limit
        $replacements
    );

    if ( !$replacements ) {
        return $decoded;
    }

    return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1395 
/**
 * preg_replace_callback() handler for CHAR_REFS_REGEX: decode one matched
 * reference; a bare ampersand is returned unchanged.
 *
 * @param array $matches 1 = named entity, 2 = decimal number, 3 = hex number
 * @return string
 */
private static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return self::decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return self::decodeChar( intval( $matches[2] ) );
    }
    if ( $matches[3] != '' ) {
        return self::decodeChar( hexdec( $matches[3] ) );
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
1411 
/**
 * Convert a code point to its UTF-8 encoding, or to the UTF-8 replacement
 * character when validateCodepoint() rejects it.
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
    return self::validateCodepoint( $codepoint )
        ? UtfNormal\Utils::codepointToUtf8( $codepoint )
        : UtfNormal\Constants::UTF8_REPLACEMENT;
}
1426 
/**
 * If the named entity is defined in HTML5, return the UTF-8 encoding of
 * that character. Otherwise, return the pseudo-entity source (eg "&foo;").
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( $name ) {
	// MediaWiki-specific aliases (not in the HTML standard) are mapped to
	// their standard names before the lookup.
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;
	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
1443 
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Attribute names as keys; empty when the element is not
 *   listed by setupAttributesAllowedInternal()
 */
private static function attributesAllowedInternal( $element ) {
	$attribsByElement = self::setupAttributesAllowedInternal();
	if ( isset( $attribsByElement[$element] ) ) {
		return $attribsByElement[$element];
	}
	return [];
}
1455 
/**
 * Foreach array key (an allowed HTML element), return an array
 * of allowed attributes.
 *
 * The table is built once and cached in a static variable. Each element
 * maps to an array whose keys are the allowed attribute names (value true).
 *
 * @return array
 */
private static function setupAttributesAllowedInternal() {
	static $attribsByElement;

	if ( $attribsByElement !== null ) {
		return $attribsByElement;
	}

	// Attribute names are stored as array keys so that membership checks
	// are plain key lookups. This helper appends one or more lists of
	// names to an already-keyed base set.
	$extend = static function ( array $base, array ...$nameLists ) {
		foreach ( $nameLists as $names ) {
			$base = array_merge( $base, array_fill_keys( $names, true ) );
		}
		return $base;
	};

	$universal = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa, see section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata, see
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	$blockLevel = $extend( $universal, [ 'align' ] );

	$cellAlignment = [ 'align', 'valign' ];
	$cellAttribs = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	# Section numbers below refer to the HTML 4.01 standard.
	# See: https://www.w3.org/TR/html4/
	$attribsByElement = [
		# 7.5.4
		'div' => $blockLevel,
		'center' => $universal, # deprecated
		'span' => $universal,

		# 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $universal,

		# 9.2.1
		'em' => $universal,
		'strong' => $universal,
		'cite' => $universal,
		'dfn' => $universal,
		'code' => $universal,
		'samp' => $universal,
		'kbd' => $universal,
		'var' => $universal,
		'abbr' => $universal,
		# acronym

		# 9.2.2
		'blockquote' => $extend( $universal, [ 'cite' ] ),
		'q' => $extend( $universal, [ 'cite' ] ),

		# 9.2.3
		'sub' => $universal,
		'sup' => $universal,

		# 9.3.1
		'p' => $blockLevel,

		# 9.3.2
		'br' => $extend( $universal, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $universal,

		# 9.3.4
		'pre' => $extend( $universal, [ 'width' ] ),

		# 9.4
		'ins' => $extend( $universal, [ 'cite', 'datetime' ] ),
		'del' => $extend( $universal, [ 'cite', 'datetime' ] ),

		# 10.2
		'ul' => $extend( $universal, [ 'type' ] ),
		'ol' => $extend( $universal, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $universal, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $universal,
		'dd' => $universal,
		'dt' => $universal,

		# 11.2.1
		'table' => $extend( $universal,
			[ 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			] ),

		# 11.2.2
		'caption' => $blockLevel,

		# 11.2.3
		'thead' => $universal,
		'tfoot' => $universal,
		'tbody' => $universal,

		# 11.2.4
		'colgroup' => $extend( $universal, [ 'span' ] ),
		'col' => $extend( $universal, [ 'span' ] ),

		# 11.2.5
		'tr' => $extend( $universal, [ 'bgcolor' ], $cellAlignment ),

		# 11.2.6
		'td' => $extend( $universal, $cellAttribs, $cellAlignment ),
		'th' => $extend( $universal, $cellAttribs, $cellAlignment ),

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $universal, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $universal, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $universal, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $universal, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $universal, [ 'type', 'src' ] ),
		'track' => $extend( $universal, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $universal,
		'b' => $universal,
		'i' => $universal,
		'big' => $universal,
		'small' => $universal,
		'strike' => $universal,
		's' => $universal,
		'u' => $universal,

		# 15.2.2
		'font' => $extend( $universal, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $universal, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $universal,
		# rbc
		'rb' => $universal,
		'rp' => $universal,
		'rt' => $universal, # 'rbspan' is not included
		'rtc' => $universal,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $universal,
		'figcaption' => $universal,

		# HTML 5 section 4.6
		'bdi' => $universal,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $universal, [ 'value' ] ),
		'time' => $extend( $universal, [ 'datetime' ] ),
		'mark' => $universal,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including the shared attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $universal,
	];

	return $attribsByElement;
}
1699 
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed and whitespace normalized.
 *
 * @param string $html HTML fragment
 * @return string Text collected by the tag-stripping handler, passed
 *   through normalizeWhitespace()
 */
public static function stripAllTags( $html ) {
	// RemexHtml tokenizes the input; the handler accumulates the text.
	// Character references are deliberately not ignored, so that they
	// are decoded.
	$textCollector = new RemexStripTagHandler();
	$tokenizerOptions = [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$tokenizer = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1726 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for each semicolon-terminated named
 * entity whose normalizeEntity() result differs from the entity name itself.
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = '';
	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );
			// Entities that normalize to themselves are skipped (&lt; &gt; etc)
			if ( $expansion !== $entity ) {
				$name = substr( $entity, 0, -1 );
				$declarations .= "<!ENTITY $name \"$expansion\">";
			}
		}
	}
	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1757 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and strip characters that are ignored in IDNs from the host.
 *
 * @param string $url
 * @return string The cleaned URL. When the URL does not have the shape
 *   "protocol:[//host][rest]", only the first two steps are applied.
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ], $url );

	# Validate hostname portion
	$matches = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		return $url;
	}
	[ /* $whole */, $protocol, $host, $rest ] = $matches;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	// The last alternative must not be followed by "|": an empty trailing
	// alternative would make the pattern match the empty string everywhere.
	$strip = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
			# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
			# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
			# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
			# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
			# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
			# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
			# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
			# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
			# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
			# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
			# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
			# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
			# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
			# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
			# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}] # <reserved-E01F0>..
			# <reserved-E0FFF>
		/xuD";

	// With the /u modifier preg_replace() returns null when the subject is
	// not valid UTF-8. Keep the unstripped host in that case instead of
	// silently dropping it from the URL.
	$strippedHost = preg_replace( $strip, '', $host );
	if ( $strippedHost !== null ) {
		$host = $strippedHost;
	}

	// IPv6 host names are bracketed with []. Url-decode these.
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
	) {
		$host = '//[' . $matches[1] . ']' . $matches[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1842 
/**
 * Regex callback used by cleanUrl(): percent-encode the matched text.
 *
 * @param string[] $matches Match array; only the full match ([0]) is used
 * @return string Result of urlencode() on the full match
 */
private static function cleanUrlCallback( $matches ) {
	$unsafe = $matches[0];
	return urlencode( $unsafe );
}
1850 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook runs first; when it returns false, the value it
 * stored in $result is returned as-is. Otherwise the address is checked
 * against a liberal "user@domain" pattern.
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the pattern check, or whatever the hook put
 *   in $result (null if it set nothing) when the hook aborted
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// The strings below end up inside character classes ([...]), where a
	// hyphen "-" acts as a range indicator. Hence it is double backslashed
	// below so that a literal backslash-hyphen reaches the regex.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a trailing newline, so an
	// address such as "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^                      # start of string
		[$rfc5322_atext\\.]+    # user part which is liberal :p
		@                      # literal at sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                      # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1903 }
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
$matches
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:173
MediaWiki exception.
Definition: MWException.php:29
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:41
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:838
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:48
static cleanUrl( $url)
Definition: Sanitizer.php:1762
static decCharReference( $codepoint)
Definition: Sanitizer.php:1316
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:655
static escapeIdInternalUrl( $id, $mode)
Do percent encoding of percent signs for href (but not id) attributes.
Definition: Sanitizer.php:1013
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1419
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1122
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
Definition: Sanitizer.php:136
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:767
static decodeEntity( $name)
If the named entity is defined in HTML5 return the UTF-8 encoding of that character.
Definition: Sanitizer.php:1435
static normalizeEntity( $name)
If the named entity is defined in HTML5 return the equivalent numeric entity reference (except for th...
Definition: Sanitizer.php:1295
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:549
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:881
static setupAttributesAllowedInternal()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1463
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:152
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:99
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:973
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:442
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:858
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1329
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:526
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1107
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
Definition: Sanitizer.php:392
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:317
const EVIL_URI_PATTERN
Pattern matching evil uris like javascript: WARNING: DO NOT use this in any place that actually requi...
Definition: Sanitizer.php:69
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1239
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1269
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1738
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1847
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1258
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:280
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:796
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1068
const MW_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki in wikitext.
Definition: Sanitizer.php:91
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1224
static escapeIdReferenceListInternal( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1080
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:107
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1344
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:85
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1363
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:946
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
Definition: Sanitizer.php:130
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1200
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
Definition: Sanitizer.php:1028
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:490
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1711
static attributesAllowedInternal( $element)
Fetch the list of acceptable attributes for a given element name.
Definition: Sanitizer.php:1451
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1380
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1400
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:676
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:996
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1879
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:77
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition: Sanitizer.php:58
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:900
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:70
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:697
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.
if( $line===false) $args
Definition: mcc.php:124