MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 use Wikimedia\RemexHtml\HTMLData;
32 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
33 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
34 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
35 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
36 
41 class Sanitizer {
/**
 * Pattern matching every character reference, plus any ampersand that
 * does not start one. Capture groups:
 *   1. named entity, including its trailing semicolon
 *   2. decimal code point
 *   3. hexadecimal code point
 *   4. a bare ampersand
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|(&)/x';

/**
 * Splits the text following a "<" into: optional leading slash, tag name,
 * attribute text, closing brace ("/>" or ">"), and the text after the tag.
 */
private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive pattern spotting "javascript" or "vbscript" at the
 * start of a value, after whitespace, or after the end of a comment.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/** Pattern for attribute names of the form "xmlns:<prefix>". */
private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/** Index of the primary mode in $wgFragmentMode. */
public const ID_PRIMARY = 0;

/** Index of the fallback mode in $wgFragmentMode. */
public const ID_FALLBACK = 1;

/**
 * Non-standard, MediaWiki-specific entity names mapped to the standard
 * entity they stand for. Keys and values include the trailing semicolon.
 */
private const MW_ENTITY_ALIASES = [
	'רלמ;' => 'rlm;',
	'رلم;' => 'rlm;',
];
95 
/**
 * Lazily built cache for getAttribsRegex().
 */
private static $attribsRegex;

/**
 * Build (once) and return the pattern used to pull attributes out of the
 * raw attribute text of a tag.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$ws = '\x09\x0a\x0c\x0d\x20';
	$wsClass = "[{$ws}]";
	$nameChar = "[^{$ws}\/>=]";
	// Only the first character of a name may be an equals sign.
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
		($wsClass*=$wsClass*
		(?:
		# The attribute value: quoted or alone
		\"([^\"]*)(?:\"|\$)
		| '([^']*)(?:'|\$)
		| (((?!$wsClass|>).)*)
		)
		)?/sxu";

	return self::$attribsRegex;
}
126 
/**
 * Lazily built cache for getAttribNameRegex().
 */
private static $attribNameRegex;

/**
 * Build (once) and return the pattern an attribute name must match in
 * full to be accepted by decodeTagAttributes().
 *
 * @return string PCRE pattern
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, letter or number.
	$first = "[:_\p{L}\p{N}]";
	// Later characters may also be a dot or a hyphen.
	$later = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$first}{$later}*)$/sxu";

	return self::$attribNameRegex;
}
144 
/**
 * Return the lookup tables describing which HTML tags are recognised.
 *
 * Every table is an array keyed by lower-case tag name with `true` values.
 * The static tables are built once per value of $wgAllowImageTag and
 * rebuilt only if that global changes; the result for the default
 * arguments is additionally cached as a whole.
 *
 * @param string[] $extratags Extra tag names to allow (paired tags)
 * @param string[] $removetags Tag names to remove from the allowed set
 * @return array Keys: htmlpairs, htmlsingle, htmlsingleonly, htmlnest,
 *  tabletags, htmllist, listtags, htmlsingleallowed, htmlelements
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	// $tablesBuilt is tracked separately from $staticInitialised because the
	// latter stores the value of $wgAllowImageTag, which is normally false and
	// therefore cannot double as an "already initialised" flag.
	static $commonCase, $staticInitialised, $tablesBuilt = false;
	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $staticInitialised === $wgAllowImageTag && $isCommonCase && $commonCase ) {
		return $commonCase;
	}

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (as happens in the test system) we will re-initialise the settings.
	$globalContext = $wgAllowImageTag;
	if ( !$tablesBuilt || $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# These tags can be self-closed. For tags not also on
		# $htmlsingleonly, a self-closed tag will be emitted as
		# an empty element (open-tag/close-tag pair).
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
		foreach ( $vars as $var ) {
			$$var = array_fill_keys( $$var, true );
		}
		$staticInitialised = $globalContext;
		$tablesBuilt = true;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_fill_keys( $extratags, true );
	$removetags = array_fill_keys( $removetags, true );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	$result = [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
249 
/**
 * Deprecated entry point; emits a deprecation notice and forwards every
 * argument unchanged to internalRemoveHtmlTags().
 *
 * @deprecated since 1.38
 * @param string $text Text to clean
 * @param callable|null $processCallback Callback run on each allowed tag's attribute text
 * @param array|bool $args Extra argument handed to $processCallback
 * @param string[] $extratags Extra tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string Result of internalRemoveHtmlTags()
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );
	return self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
}
288 
/**
 * Escape every tag that is not recognised and clean the attributes of
 * those that are.
 *
 * HTML comments are removed first. The text is then split on "<" and each
 * piece is either re-emitted as a cleaned tag or has its "<" and ">"
 * escaped as entities.
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback Called with the attribute text
 *  (by reference) and $args for every allowed tag
 * @param array|bool $args Extra argument handed to $processCallback
 * @param string[] $extratags Extra tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string
 */
public static function internalRemoveHtmlTags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$selfClosable = $tagData['htmlsingle'];
	$voidOnly = $tagData['htmlsingleonly'];
	$allowedTags = $tagData['htmlelements'];

	# Remove HTML comments, then split on tag openers
	$chunks = explode( '<', self::removeHTMLcomments( $text ) );
	$out = str_replace( '>', '&gt;', array_shift( $chunks ) );

	# this might be possible using remex tidy itself
	foreach ( $chunks as $chunk ) {
		$regs = [];
		if ( !preg_match( self::ELEMENT_BITS_REGEX, $chunk, $regs ) ) {
			// Not shaped like a tag at all
			$out .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		[ , $slash, $name, $params, $brace, $rest ] = $regs;
		$name = strtolower( $name );
		if ( !isset( $allowedTags[$name] ) ) {
			$out .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		if ( $brace == '/>' && !isset( $selfClosable[$name] ) && !isset( $voidOnly[$name] ) ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $name );
		$newparams = self::fixTagAttributes( $params, $name );
		if ( !$tagIsValid ) {
			$out .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		if ( $brace === '/>' && !isset( $voidOnly[$name] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$name>";
		}

		$out .= "<$slash$name$newparams$brace" . str_replace( '>', '&gt;', $rest );
	}

	return $out;
}
370 
/**
 * Tokenize $text with RemexHtml and serialize it again, passing the token
 * stream through a RemexRemoveTagHandler configured with the recognised
 * tag tables.
 *
 * @param string $text Text to clean
 * @param array $options Recognised keys:
 *   - extraTags: extra tag names to allow
 *   - removeTags: tag names to disallow
 *   - attrCallback, attrCallbackArgs: internal use only
 * @return string Serialized result
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$tagData = self::getRecognizedTagData(
		$options['extraTags'] ?? [],
		$options['removeTags'] ?? []
	);
	// These options are @internal:
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	// Use RemexHtml to tokenize $text and remove the barred tags.
	// Pipeline: tokenizer -> tag remover -> dispatcher -> tree builder -> serializer
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$dispatcher = new RemexDispatcher( $treeBuilder );
	$remover = new RemexRemoveTagHandler(
		$dispatcher, $text, $tagData,
		$attrCallback, $attrCallbackArgs
	);
	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
431 
/**
 * Strip every terminated HTML comment from $text.
 *
 * A comment that sits alone on its line (only spaces between it and the
 * newlines on both sides) is removed together with those spaces and one of
 * the newlines. Processing stops at the first unterminated comment, which
 * is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	while ( true ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$afterComment = $close + 3;

		# Widen the span over the spaces on both sides of the comment
		$from = max( $open - 1, 0 );
		$span = $afterComment - $from;
		while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$aloneOnLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $span, 1 ) === "\n";

		$text = $aloneOnLine
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			? substr_replace( $text, "\n", $from, $span + 1 )
			# Remove just the comment.
			: substr_replace( $text, '', $open, $afterComment - $open );
	}

	return $text;
}
474 
/**
 * Check that a tag carries the attributes it needs to be kept.
 *
 * Only <meta> and <link> have requirements: both need itemprop, <meta>
 * also needs content, and <link> also needs href.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	if ( !isset( $attribs['itemprop'] ) ) {
		// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
		return false;
	}
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		// <meta> must have a content="" for the itemprop
		return false;
	}
	// <link> must have an associated href=""
	return !( $isLink && !isset( $attribs['href'] ) );
}
509 
/**
 * Filter an attribute array against the attributes allowed on $element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array Result of validateAttributes()
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );
	return self::validateAttributes( $attribs, $allowedForElement );
}
529 
/**
 * Filter an attribute array down to allowed names and safe values.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys. A
 *  sequential list of names is still accepted but deprecated since 1.35.
 * @return array Attribute name => cleaned value
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes whose value is a space-separated list of ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isDataAttribute = preg_match( '/^data-[^:]*$/i', $attribute );
		if ( !$isDataAttribute && !array_key_exists( $attribute, $allowed ) ) {
			continue;
		}
		if ( self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		if ( $attribute === 'tabindex' && $value !== '0' ) {
			// Only allow tabindex of 0, which is useful for accessibility.
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
646 
/**
 * Tell whether an attribute name falls in a reserved "data-" namespace.
 *
 * @param string $attr Attribute name
 * @return bool True if the name starts with data-ooui, data-mw or
 *  data-parsoid (case-insensitive)
 */
public static function isReservedDataAttribute( $attr ) {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';
	return preg_match( $reservedPrefix, $attr ) === 1;
}
664 
/**
 * Merge two attribute arrays; values in $b win over $a.
 *
 * The one exception is "class": when both arrays hold a different string
 * value, the class names are combined, with duplicates removed.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothHaveStringClass = isset( $a['class'], $b['class'] )
		&& is_string( $a['class'] )
		&& is_string( $b['class'] );

	if ( $bothHaveStringClass && $a['class'] !== $b['class'] ) {
		$names = preg_split(
			'/\s+/',
			$a['class'] . ' ' . $b['class'],
			-1,
			PREG_SPLIT_NO_EMPTY
		);
		$merged['class'] = implode( ' ', array_unique( $names ) );
	}

	return $merged;
}
687 
/**
 * Bring a CSS value into a form that can be checked: decode character
 * references and CSS escape sequences, then blank out comments.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	$isLoneComment = preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value );
	if ( $isLoneComment ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$strayOpener = strpos( $value, '/*' );
	return $strayOpener === false ? $value : substr( $value, 0, $strayOpener );
}
747 
/**
 * Normalize a CSS value and replace it with a comment if it looks unsafe.
 *
 * @param string $value Raw style value
 * @return string The normalized value, or the literal comment
 *  "/* invalid control char *\/" or "/* insecure input *\/" if rejected
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		// Each "\(" must be a plain escaped parenthesis; any character between
		// the backslash and the parenthesis would leave the group unbalanced.
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
789 
/**
 * preg_replace_callback() handler for the escape pattern built in
 * normalizeCss().
 *
 * @param array $matches 1 = line continuation, 2 = hex code point,
 *  3 = escaped literal character
 * @return string Replacement text
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		// Backslash at end of string
		$char = '\\';
	}

	// These characters need to be escaped in strings
	$needsEscape = ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' );
	if ( !$needsEscape ) {
		// Decode unnecessary escape
		return $char;
	}

	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $char ) ) . ' ';
}
814 
/**
 * Decode the attribute text of a tag, drop what is not allowed on
 * $element and re-encode the rest.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Sort the attributes by name before encoding
 * @return string Empty, or the encoded attributes with a leading space
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$kept = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $kept );
	}

	return self::safeEncodeTagAttributes( $kept );
}
850 
/**
 * Encode a value for use inside a quoted HTML attribute.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
	$encoded = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$encoded
	);
}
870 
/**
 * Replace the plain spaces used by French punctuation with $space.
 *
 * This covers the space before ? : ; ! % and a closing guillemet (unless
 * a word character follows), and the space after an opening guillemet.
 *
 * @param string $text
 * @param string $space Replacement for the space, inserted literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when used as a preg_replace() replacement below.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
891 
/**
 * Encode an attribute value, additionally escaping text that later
 * parsing steps could otherwise expand.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = [
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'}'    => '&#125;', // prevent unpaired language conversion syntax
		'['    => '&#91;',
		']'    => '&#93;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $armor );

	# Escape the colon of anything that looks like a URL protocol
	$escapeColon = static function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};
	return preg_replace_callback(
		'/((?i)' . wfUrlProtocols() . ')/',
		$escapeColon,
		$encoded
	);
}
928 
/**
 * Escape an id for use in an attribute, using the fragment mode that
 * $wgFragmentMode holds at index $mode.
 *
 * @param string $id
 * @param int $mode self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool The escaped id, or false if no mode is configured
 *  at a non-primary index
 * @throws UnexpectedValueException If no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
958 
/**
 * Escape an id for use as a URL fragment, using the primary mode from
 * $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException If no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
984 
/**
 * Escape an id for use as a URL fragment, using the mode configured in
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the variable below would be an undefined local.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1001 
/**
 * Escape an id with escapeIdInternal() and, in "html5" mode, additionally
 * turn the "%" of anything that looks like a percent-escape into "%25".
 *
 * @param string $id
 * @param string $mode Fragment mode
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}
	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1018 
/**
 * Escape an id according to a fragment mode.
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException If $mode is neither of the above
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			$id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
			break;
		case 'legacy':
			// This corresponds to 'noninitial' mode of the former escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			$id = urlencode( str_replace( ' ', '_', $id ) );
			$id = strtr( $id, $replace );
			break;
		default:
			// The method name is quoted on both sides in the message.
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}

	return $id;
}
1055 
/**
 * Deprecated public wrapper around escapeIdReferenceListInternal().
 *
 * @deprecated since 1.36
 * @param string $referenceString Space-separated list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
	wfDeprecated( __METHOD__, '1.36' );
	$escapedList = self::escapeIdReferenceListInternal( $referenceString );
	return $escapedList;
}
1070 
/**
 * Escape every id in a whitespace-separated list of ids.
 *
 * @param string $referenceString Whitespace-separated list of ids
 * @return string The escaped ids joined by single spaces; '' if there
 *  were none
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# Merge the array back to a space delimited list string
	# If the array is empty, the result will be an empty string ('')
	return implode( ' ', $escaped );
}
1093 
/**
 * Turn a string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, ASCII
 * punctuation and the byte pair C2 A0 become underscores; runs of
 * underscores are collapsed and trailing ones removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores ...
	$underscored = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $underscored );

	return rtrim( $collapsed, '_' );
}
1112 
/**
 * Escape text for HTML output while letting character references that
 * were already in the input survive: they are decoded first, then the
 * whole string is escaped.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1128 
/**
 * Parse the raw attribute text of a tag into an array.
 *
 * Names are lower-cased; names with unacceptable characters are skipped.
 * Values have whitespace collapsed and character references decoded. If
 * a name occurs more than once, the last occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$matches = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( self::getAttribNameRegex(), $name ) ) {
			continue;
		}

		$raw = self::getTagAttributeCallback( $match );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = self::decodeCharReferences( $collapsed );
	}

	return $decoded;
}
1171 
/**
 * Turn an attribute array back into attribute text for a tag.
 *
 * @param array $assoc_array Attribute name => value
 * @return string '' for an empty array, otherwise name="value" pairs
 *  separated by spaces and preceded by one space
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $attribute => $value ) {
		$name = htmlspecialchars( $attribute, ENT_COMPAT );
		$encoded = self::safeEncodeAttribute( $value );
		$pairs[] = $name . '="' . $encoded . '"';
	}

	if ( !count( $pairs ) ) {
		return '';
	}
	return ' ' . implode( ' ', $pairs );
}
1189 
/**
 * Pick the attribute value out of one match of the attributes regex.
 *
 * @param array $set One match set: 3 = double-quoted, 4 = single-quoted,
 *  5 = unquoted value; 2 = the whole "= value" part
 * @return string The value, or '' for an attribute given without one
 * @throws MWException If a value part matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	// Checked in this order: unquoted, single-quoted, double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1217 
/**
 * Collapse every run of spaces, tabs, CR and LF into one space and trim
 * the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );
	return trim( $collapsed );
}
1228 
/**
 * Collapse every run of spaces and underscores in a section name into
 * one space and trim the result.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $collapsed );
}
1240 
/**
 * Run normalizeCharReferencesCallback() over every character reference
 * and stray ampersand in $text.
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$handler = [ self::class, 'normalizeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1262 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal code point, 3 = hexadecimal code point
 * @return string The normalized reference; the matched text run through
 *  htmlspecialchars() if no normalized form was produced
 */
private static function normalizeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1282 
/**
 * Normalize a named entity.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string MediaWiki aliases become the entity they stand for;
 *  lt, gt, amp and quot are kept by name; other known entities become
 *  decimal references; unknown names get their ampersand escaped
 */
private static function normalizeEntity( $name ) {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toDecimalReference = static function ( $m ) {
		return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
	};
	return preg_replace_callback(
		'/./Ssu',
		$toDecimalReference,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1309 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits
 * @return string|null The reference in "&#N;" form, or null if the
 *  code point is not allowed
 */
private static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1322 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits
 * @return string|null The reference in lower-case "&#xN;" form, or null
 *  if the code point is not allowed
 */
private static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1335 
/**
 * Tell whether a code point may appear as a character reference.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$allowedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $allowedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1353 
/**
 * Run decodeCharReferencesCallback() over every character reference and
 * stray ampersand in $text.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = [ self::class, 'decodeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1367 
/**
 * Decode character references and, if the pattern matched at least once,
 * pass the result through the content language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$replacements
	);

	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1393 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * @param array $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal code point, 3 = hexadecimal code point
 * @return string The decoded text, or the matched text unchanged when
 *  none of those groups matched
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1409 
/**
 * Turn a code point into its UTF-8 encoding.
 *
 * @param int|float $codepoint
 * @return string The UTF-8 character, or the UTF-8 replacement character
 *  constant if the code point is not allowed
 */
private static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}
	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1424 
/**
 * Translate a named entity into the text it stands for.
 *
 * @param string $name Entity name including the trailing semicolon and
 *   without the leading ampersand, as captured by CHAR_REFS_REGEX
 * @return string The translation from HTMLData::$namedEntityTranslations,
 *   or "&" followed by the (alias-resolved) name when there is none
 */
private static function decodeEntity( $name ) {
	// These are MediaWiki-specific entities, not in the HTML standard;
	// resolve them to the standard name first.
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1441 
/**
 * Look up the allowed attributes of one element.
 *
 * @param string $element Element name, as used for the keys of the table
 *   built by setupAttributesAllowedInternal()
 * @return array Map of attribute name => true, or an empty array when the
 *   element has no entry in the table
 */
private static function attributesAllowedInternal( $element ) {
	$allowedByElement = self::setupAttributesAllowedInternal();

	if ( isset( $allowedByElement[$element] ) ) {
		return $allowedByElement[$element];
	}

	return [];
}
1453 
/**
 * Build the table of attributes allowed on each element.
 *
 * The table is built on the first call and kept in a static variable, so
 * later calls return the same data without rebuilding it.
 *
 * @return array Map of element name => ( map of attribute name => true )
 */
private static function setupAttributesAllowedInternal() {
	static $table;

	if ( $table !== null ) {
		return $table;
	}

	// For lookup efficiency the attribute lists are flipped so that the
	// valid attribute names are the keys. $extend() returns $base with the
	// names from each extra list added as keys.
	$extend = static function ( array $base, array ...$nameLists ) {
		foreach ( $nameLists as $names ) {
			$base = array_merge( $base, array_fill_keys( $names, true ) );
		}
		return $base;
	};

	$common = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	// Common attributes plus 'align', shared by div, headings, p, caption.
	$blockLevel = $extend( $common, [ 'align' ] );

	// Extra attribute lists shared by the table row and cell elements.
	$cellAlign = [ 'align', 'valign' ];
	$cellAttribs = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$table = [
		# 7.5.4
		'div' => $blockLevel,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $extend( $common, [ 'cite' ] ),
		'q' => $extend( $common, [ 'cite' ] ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $blockLevel,

		# 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $extend( $common, [ 'width' ] ),

		# 9.4
		'ins' => $extend( $common, [ 'cite', 'datetime' ] ),
		'del' => $extend( $common, [ 'cite', 'datetime' ] ),

		# 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $extend( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $blockLevel,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $extend( $common, [ 'span' ] ),
		'col' => $extend( $common, [ 'span' ] ),

		# 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $cellAlign ),

		# 11.2.6
		'td' => $extend( $common, $cellAttribs, $cellAlign ),
		'th' => $extend( $common, $cellAttribs, $cellAlign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $common, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # $extend( $common, [ 'rbspan' ] ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $table;
}
1697 
/**
 * Extract the text of an HTML fragment, discarding its tags.
 *
 * The fragment is tokenized with RemexHtml, the text is collected by a
 * RemexStripTagHandler, and the result is passed through
 * normalizeWhitespace().
 *
 * @param string $html HTML fragment
 * @return string Text collected from the fragment
 */
public static function stripAllTags( $html ) {
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Use RemexHtml to tokenize $html and extract the text
	$textCollector = new RemexStripTagHandler;
	$tokenizer = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1724 
/**
 * Build a DOCTYPE declaration with an internal subset that declares the
 * named entities listed in HTMLData::$namedEntityTranslations.
 *
 * An entity is declared only if its name ends with a semicolon and
 * normalizeEntity() returns something other than the name itself.
 *
 * @return string The DOCTYPE declaration, ending with a newline
 */
public static function hackDocType() {
	$declarations = '';

	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );
			// An unchanged entity (&lt; &gt; etc) needs no declaration.
			if ( $entity !== $expansion ) {
				$name = substr( $entity, 0, -1 );
				$declarations .= "<!ENTITY $name \"$expansion\">";
			}
		}
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1755 
/**
 * Clean up a URL before it is turned into a link.
 *
 * Character references are decoded, characters that are unsafe in a URL
 * are percent-encoded, characters ignored in internationalized domain
 * names are stripped from the host part, and a percent-encoded bracketed
 * IPv6 host is restored to its literal "[...]" form.
 *
 * @param string $url URL to clean
 * @return string Cleaned URL. Input that does not look like
 *   "protocol:..." is returned after the decode/encode steps only.
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ], $url );

	# Validate hostname portion
	$matches = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		return $url;
	}
	list( /* $whole */, $protocol, $host, $rest ) = $matches;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	// The last alternative must not be followed by "|": a trailing bar adds
	// an empty alternative that matches at every offset.
	$strip = "/
		\\s|                    # general whitespace
		\u{00AD}|               # SOFT HYPHEN
		\u{034F}|               # COMBINING GRAPHEME JOINER
		\u{061C}|               # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]|    # HANGUL CHOSEONG FILLER..
		                        # HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]|    # KHMER VOWEL INHERENT AQ..
		                        # KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]|    # MONGOLIAN FREE VARIATION SELECTOR ONE..
		                        # MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}|               # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]|    # ZERO WIDTH SPACE..
		                        # RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]|    # LEFT-TO-RIGHT EMBEDDING..
		                        # RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]|    # WORD JOINER..
		                        # INVISIBLE PLUS
		\u{2065}|               # <reserved-2065>
		[\u{2066}-\u{206F}]|    # LEFT-TO-RIGHT ISOLATE..
		                        # NOMINAL DIGIT SHAPES
		\u{3164}|               # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]|    # VARIATION SELECTOR-1..
		                        # VARIATION SELECTOR-16
		\u{FEFF}|               # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}|               # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]|    # <reserved-FFF0>..
		                        # <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]|  # SHORTHAND FORMAT LETTER OVERLAP..
		                        # SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]|  # MUSICAL SYMBOL BEGIN BEAM..
		                        # MUSICAL SYMBOL END PHRASE
		\u{E0000}|              # <reserved-E0000>
		\u{E0001}|              # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]|  # <reserved-E0002>..
		                        # <reserved-E001F>
		[\u{E0020}-\u{E007F}]|  # TAG SPACE..
		                        # CANCEL TAG
		[\u{E0080}-\u{E00FF}]|  # <reserved-E0080>..
		                        # <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]|  # VARIATION SELECTOR-17..
		                        # VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]   # <reserved-E01F0>..
		                        # <reserved-E0FFF>
		/xuD";

	// With the /u flag preg_replace() returns null when the subject is not
	// valid UTF-8. Keep the unstripped host in that case instead of letting
	// null silently drop the host from the returned URL.
	$stripped = preg_replace( $strip, '', $host );
	if ( $stripped !== null ) {
		$host = $stripped;
	}

	// IPv6 host names are bracketed with []. Url-decode these.
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
	) {
		$host = '//[' . $matches[1] . ']' . $matches[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1840 
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encode the
 * matched text.
 *
 * @param string[] $matches Match array; only the full match [0] is used
 * @return string urlencode()d form of the full match
 */
private static function cleanUrlCallback( $matches ) {
	$unsafe = $matches[0];

	return urlencode( $unsafe );
}
1848 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook is consulted first; when a handler aborts the
 * hook, the result it set is returned as is. Otherwise the address must
 * consist of a local part, an at sign, and one or more dot-separated
 * domain labels, with nothing before or after.
 *
 * @param string $addr E-mail address to check
 * @return bool|null Whether the address matches; when the hook aborts,
 *   whatever the hook handler stored in its result (null if nothing)
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier is required: without it "$" also matches just before
	// a trailing newline, so "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^                          # start of string
		[$rfc5322_atext\\.]+       # user part which is liberal :p
		@                          # at sign
		[$rfc1034_ldh_str]+        # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                          # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at the very end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1901 }
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
$matches
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:173
MediaWiki exception.
Definition: MWException.php:29
Service locator for MediaWiki core services.
Helper class for Sanitizer::removeSomeTags().
Helper class for Sanitizer::stripAllTags().
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:41
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:836
static cleanUrl( $url)
Definition: Sanitizer.php:1760
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:654
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1120
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:766
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:548
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:879
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:152
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:971
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:441
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:856
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:525
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1105
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
Definition: Sanitizer.php:392
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:317
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1237
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1736
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1256
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
Definition: Sanitizer.php:280
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1066
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:85
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1361
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:944
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1709
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1378
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:675
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:994
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1877
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:77
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:898
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:696
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.
if( $line===false) $args
Definition: mcc.php:124