1 <?php
31 class Sanitizer {
/**
 * Regular expression matching the character references handled by
 * normalizeCharReferences() and decodeCharReferences().
 *
 * Capture groups (extended syntax, so whitespace in the pattern is ignored):
 *   1. name of a named entity,       e.g. "amp" in "&amp;"
 *   2. digits of a decimal reference, e.g. "38" in "&#38;"
 *   3. digits of a hex reference,     e.g. "26" in "&#x26;"
 *   4. a bare ampersand that is not part of any reference
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';
50  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
51  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
58  private static $htmlEntities = array(
59  'Aacute' => 193,
60  'aacute' => 225,
61  'Acirc' => 194,
62  'acirc' => 226,
63  'acute' => 180,
64  'AElig' => 198,
65  'aelig' => 230,
66  'Agrave' => 192,
67  'agrave' => 224,
68  'alefsym' => 8501,
69  'Alpha' => 913,
70  'alpha' => 945,
71  'amp' => 38,
72  'and' => 8743,
73  'ang' => 8736,
74  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
75  'Aring' => 197,
76  'aring' => 229,
77  'asymp' => 8776,
78  'Atilde' => 195,
79  'atilde' => 227,
80  'Auml' => 196,
81  'auml' => 228,
82  'bdquo' => 8222,
83  'Beta' => 914,
84  'beta' => 946,
85  'brvbar' => 166,
86  'bull' => 8226,
87  'cap' => 8745,
88  'Ccedil' => 199,
89  'ccedil' => 231,
90  'cedil' => 184,
91  'cent' => 162,
92  'Chi' => 935,
93  'chi' => 967,
94  'circ' => 710,
95  'clubs' => 9827,
96  'cong' => 8773,
97  'copy' => 169,
98  'crarr' => 8629,
99  'cup' => 8746,
100  'curren' => 164,
101  'dagger' => 8224,
102  'Dagger' => 8225,
103  'darr' => 8595,
104  'dArr' => 8659,
105  'deg' => 176,
106  'Delta' => 916,
107  'delta' => 948,
108  'diams' => 9830,
109  'divide' => 247,
110  'Eacute' => 201,
111  'eacute' => 233,
112  'Ecirc' => 202,
113  'ecirc' => 234,
114  'Egrave' => 200,
115  'egrave' => 232,
116  'empty' => 8709,
117  'emsp' => 8195,
118  'ensp' => 8194,
119  'Epsilon' => 917,
120  'epsilon' => 949,
121  'equiv' => 8801,
122  'Eta' => 919,
123  'eta' => 951,
124  'ETH' => 208,
125  'eth' => 240,
126  'Euml' => 203,
127  'euml' => 235,
128  'euro' => 8364,
129  'exist' => 8707,
130  'fnof' => 402,
131  'forall' => 8704,
132  'frac12' => 189,
133  'frac14' => 188,
134  'frac34' => 190,
135  'frasl' => 8260,
136  'Gamma' => 915,
137  'gamma' => 947,
138  'ge' => 8805,
139  'gt' => 62,
140  'harr' => 8596,
141  'hArr' => 8660,
142  'hearts' => 9829,
143  'hellip' => 8230,
144  'Iacute' => 205,
145  'iacute' => 237,
146  'Icirc' => 206,
147  'icirc' => 238,
148  'iexcl' => 161,
149  'Igrave' => 204,
150  'igrave' => 236,
151  'image' => 8465,
152  'infin' => 8734,
153  'int' => 8747,
154  'Iota' => 921,
155  'iota' => 953,
156  'iquest' => 191,
157  'isin' => 8712,
158  'Iuml' => 207,
159  'iuml' => 239,
160  'Kappa' => 922,
161  'kappa' => 954,
162  'Lambda' => 923,
163  'lambda' => 955,
164  'lang' => 9001,
165  'laquo' => 171,
166  'larr' => 8592,
167  'lArr' => 8656,
168  'lceil' => 8968,
169  'ldquo' => 8220,
170  'le' => 8804,
171  'lfloor' => 8970,
172  'lowast' => 8727,
173  'loz' => 9674,
174  'lrm' => 8206,
175  'lsaquo' => 8249,
176  'lsquo' => 8216,
177  'lt' => 60,
178  'macr' => 175,
179  'mdash' => 8212,
180  'micro' => 181,
181  'middot' => 183,
182  'minus' => 8722,
183  'Mu' => 924,
184  'mu' => 956,
185  'nabla' => 8711,
186  'nbsp' => 160,
187  'ndash' => 8211,
188  'ne' => 8800,
189  'ni' => 8715,
190  'not' => 172,
191  'notin' => 8713,
192  'nsub' => 8836,
193  'Ntilde' => 209,
194  'ntilde' => 241,
195  'Nu' => 925,
196  'nu' => 957,
197  'Oacute' => 211,
198  'oacute' => 243,
199  'Ocirc' => 212,
200  'ocirc' => 244,
201  'OElig' => 338,
202  'oelig' => 339,
203  'Ograve' => 210,
204  'ograve' => 242,
205  'oline' => 8254,
206  'Omega' => 937,
207  'omega' => 969,
208  'Omicron' => 927,
209  'omicron' => 959,
210  'oplus' => 8853,
211  'or' => 8744,
212  'ordf' => 170,
213  'ordm' => 186,
214  'Oslash' => 216,
215  'oslash' => 248,
216  'Otilde' => 213,
217  'otilde' => 245,
218  'otimes' => 8855,
219  'Ouml' => 214,
220  'ouml' => 246,
221  'para' => 182,
222  'part' => 8706,
223  'permil' => 8240,
224  'perp' => 8869,
225  'Phi' => 934,
226  'phi' => 966,
227  'Pi' => 928,
228  'pi' => 960,
229  'piv' => 982,
230  'plusmn' => 177,
231  'pound' => 163,
232  'prime' => 8242,
233  'Prime' => 8243,
234  'prod' => 8719,
235  'prop' => 8733,
236  'Psi' => 936,
237  'psi' => 968,
238  'quot' => 34,
239  'radic' => 8730,
240  'rang' => 9002,
241  'raquo' => 187,
242  'rarr' => 8594,
243  'rArr' => 8658,
244  'rceil' => 8969,
245  'rdquo' => 8221,
246  'real' => 8476,
247  'reg' => 174,
248  'rfloor' => 8971,
249  'Rho' => 929,
250  'rho' => 961,
251  'rlm' => 8207,
252  'rsaquo' => 8250,
253  'rsquo' => 8217,
254  'sbquo' => 8218,
255  'Scaron' => 352,
256  'scaron' => 353,
257  'sdot' => 8901,
258  'sect' => 167,
259  'shy' => 173,
260  'Sigma' => 931,
261  'sigma' => 963,
262  'sigmaf' => 962,
263  'sim' => 8764,
264  'spades' => 9824,
265  'sub' => 8834,
266  'sube' => 8838,
267  'sum' => 8721,
268  'sup' => 8835,
269  'sup1' => 185,
270  'sup2' => 178,
271  'sup3' => 179,
272  'supe' => 8839,
273  'szlig' => 223,
274  'Tau' => 932,
275  'tau' => 964,
276  'there4' => 8756,
277  'Theta' => 920,
278  'theta' => 952,
279  'thetasym' => 977,
280  'thinsp' => 8201,
281  'THORN' => 222,
282  'thorn' => 254,
283  'tilde' => 732,
284  'times' => 215,
285  'trade' => 8482,
286  'Uacute' => 218,
287  'uacute' => 250,
288  'uarr' => 8593,
289  'uArr' => 8657,
290  'Ucirc' => 219,
291  'ucirc' => 251,
292  'Ugrave' => 217,
293  'ugrave' => 249,
294  'uml' => 168,
295  'upsih' => 978,
296  'Upsilon' => 933,
297  'upsilon' => 965,
298  'Uuml' => 220,
299  'uuml' => 252,
300  'weierp' => 8472,
301  'Xi' => 926,
302  'xi' => 958,
303  'Yacute' => 221,
304  'yacute' => 253,
305  'yen' => 165,
306  'Yuml' => 376,
307  'yuml' => 255,
308  'Zeta' => 918,
309  'zeta' => 950,
310  'zwj' => 8205,
311  'zwnj' => 8204
312  );
317  private static $htmlEntityAliases = array(
318  'רלמ' => 'rlm',
319  'رلم' => 'rlm',
320  );
325  private static $attribsRegex;
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * The pattern is built once and cached in self::$attribsRegex.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
 * 6 = unquoted "#rrggbb"-style value.
 *
 * @return string PCRE pattern
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		// Already built during this request
		return self::$attribsRegex;
	}

	$attribFirst = '[:A-Z_a-z0-9]';
	$attrib = '[:A-Z_a-z-.0-9]';
	$space = '[\x09\x0a\x0d\x20]';

	self::$attribsRegex =
		"/(?:^|$space)({$attribFirst}{$attrib}*)
		  ($space*=$space*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
			 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
								 # colors are specified like this.
								 # We'll be normalizing it.
			)
		)?(?=$space|\$)/sx";

	return self::$attribsRegex;
}
/**
 * Cleans up HTML: removes HTML comments, escapes tags that are not on the
 * allowed list (or that are badly nested), and strips non-approved
 * attributes from the tags that are kept.
 *
 * When $wgUseTidy is false the tag nesting is tracked on a stack so that
 * unbalanced markup is escaped and still-open tags are closed at the end.
 * When $wgUseTidy is true only the tag/attribute filtering is done here.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values; receives the attribute text by
 *   reference plus $args
 * @param array $args Arguments for the processing callback
 * @param array $extratags Extra tag names to allow
 * @param array $removetags Tag names (default or extra) to disallow
 * @return string
 */
static function removeHTMLtags( $text, $processCallback = null,
	$args = array(), $extratags = array(), $removetags = array()
) {
	global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (as the test system does) we will re-initialise the settings.
	$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
	if ( !$staticInitialised || $staticInitialised != $globalContext ) {

		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		);
		$htmlsingle = array(
			'br', 'wbr', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'wbr', 'hr'
		);
		if ( $wgAllowMicrodataAttributes ) {
			$htmlsingle[] = $htmlsingleonly[] = 'meta';
			$htmlsingle[] = $htmlsingleonly[] = 'link';
		}
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul', 'ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = $globalContext;
	}
	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	# Everything before the first '<' is plain text; each later bit starts
	# right after a '<'.
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		# $tagstack: currently open elements. $tablestack: tag stacks saved
		# when a <table> is opened, since each table starts a fresh stack.
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					$ot = @array_pop( $tagstack );
					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push( $optstack, $ot );
							$ot = array_pop( $tagstack );
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								$ot = array_pop( $tagstack );
							}
							if ( $t != $ot ) {
								# No match. Restore the stack exactly as it was:
								# first the element that stopped the scan (null
								# means the stack ran out), then the optional
								# elements, deepest first.
								$badtag = true;
								if ( $ot !== null ) {
									array_push( $tagstack, $ot );
								}
								$ot = array_pop( $optstack );
								while ( $ot ) {
									array_push( $tagstack, $ot );
									$ot = array_pop( $optstack );
								}
							} elseif ( $t == 'table' ) {
								# The table was closed while optional elements
								# (rows/cells) were still open. It is still a
								# table close, so go back to the stack that was
								# saved when this table was opened.
								$tagstack = array_pop( $tablestack );
							}
						} else {
							@array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) &&
						!isset( $htmlnest[$t] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' &&
						isset( $htmlpairs[$t] ) ) {
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for unclosable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						array_push( $tagstack, $t );
					} elseif ( isset( $tabletags[$t] )
						&& in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an allowed/valid tag: output it as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			# Same handling as the non-tidy branch: an unparseable bit
			# leaves every part null and is escaped below.
			if ( preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}
			$badtag = false;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}

				if ( !Sanitizer::validateTag( $params, $t ) ) {
					$badtag = true;
				}

				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
/**
 * Remove '<!--', '-->', and everything between.
 * If the comment, together with the spaces around it, sits between two
 * newlines, the spaces and one of the newlines are removed as well.
 * An unterminated comment is left in place.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Position just past the closing '-->'
		$close += 3;

		# Grow the span to cover the spaces on both sides of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $length, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment with its surrounding spaces and collapse
			# the two newlines into one.
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Drop only the comment itself.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
/**
 * Takes the attributes of a tag and the element name and decides whether
 * the tag as a whole is acceptable. Only <meta> and <link> have extra
 * requirements; every other element passes.
 *
 * removeHTMLtags() passes the raw attribute text of the tag, so a
 * non-array $params is decoded with decodeTagAttributes() before the
 * attribute checks. An already-decoded array is used as is.
 *
 * @param string|array $params Raw attribute text, or decoded name => value pairs
 * @param string $element Lower-case element name
 * @return bool True if the tag may be kept
 */
static function validateTag( $params, $element ) {
	if ( $element == 'meta' || $element == 'link' ) {
		if ( !is_array( $params ) ) {
			// isset() with a string key on a string is always false, which
			// would reject every <meta>/<link>; decode to an array first.
			$params = Sanitizer::decodeTagAttributes( $params );
		}
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $params['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $params['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type, using the attribute
 * whitelist registered for that element.
 *
 * @param array $attribs Decoded attribute name => value pairs
 * @param string $element Element name
 * @return array The attributes that survived validation
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given whitelist.
 *
 * - Attributes not on the whitelist are dropped, except "data-*" and,
 *   when RDFa is enabled, "xmlns:*" declarations.
 * - "style" values are passed through checkCss().
 * - "id" values are passed through escapeId().
 * - "role" is only kept when it is "presentation".
 * - RDFa/microdata values matching EVIL_URI_PATTERN are dropped.
 * - "href"/"src" are only kept when they start with an allowed protocol.
 *
 * @param array $attribs Decoded attribute name => value pairs
 * @param array $whitelist List of allowed attribute names
 * @return array The attributes that survived validation
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		#allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		if ( $attribute === 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# WAI-ARIA
		# For now we only support role="presentation" until we work out what roles should be
		# usable by content and we ensure that our code explicitly rejects patterns that
		# violate HTML5's ARIA restrictions.
		if ( $attribute === 'role' && $value !== 'presentation' ) {
			continue;
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			//Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; //drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
/**
 * Merge two sets of HTML attributes. Conflicting items in the second set
 * override those in the first, except for 'class' attributes: when both
 * sets carry a different string 'class', the class names of both are
 * combined, with duplicates removed.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$out = array_merge( $a, $b );
	if ( isset( $a['class'] ) && isset( $b['class'] )
		&& is_string( $a['class'] ) && is_string( $b['class'] )
		&& $a['class'] !== $b['class']
	) {
		// PREG_SPLIT_NO_EMPTY keeps leading/trailing whitespace from
		// producing empty class names.
		$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
			-1, PREG_SPLIT_NO_EMPTY );
		$out['class'] = implode( ' ', array_unique( $classes ) );
	}
	return $out;
}
/**
 * Normalize CSS into a format the keyword checks in checkCss() can be run
 * against: decodes character references and CSS escape sequences, folds
 * look-alike characters to ASCII, and strips comments.
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string Normalized CSS
 */
public static function normalizeCss( $value ) {

	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// U+FF01 to U+FF5A, excluding U+FF3C (bug 58088). Written as code point
	// escapes so the range cannot be folded to ASCII by text normalization.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
		function ( $matches ) {
			$cp = utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns the normalized CSS, or a CSS comment describing the problem
 * when the input is rejected.
 *
 * The value is normalized first so that escape sequences, character
 * references and comments cannot be used to hide a forbidden keyword
 * from the pattern checks below.
 *
 * @param string $value CSS text
 * @return string Normalized CSS, or a "/* ... *\/"-style error comment
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
/**
 * preg_replace_callback handler that decodes one CSS escape sequence.
 * The groups are those of the pattern built in normalizeCss():
 * 1 = line continuation, 2 = hex code point, 3 = escaped character;
 * none of them set means a backslash at the end of the string.
 *
 * @param array $matches
 * @return string
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: the escaped newline simply disappears
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// These characters need to be escaped in strings, so they are put back
	// as a clean hex escape to avoid parsing errors by clients. Anything
	// else was an unnecessary escape and is returned decoded.
	$keepEscaped = in_array( $decoded, array( "\n", '"', "'", '\\' ) );
	return $keepEscaped
		? '\\' . dechex( ord( $decoded ) ) . ' '
		: $decoded;
}
/**
 * Take a tag soup fragment listing an HTML element's attributes and
 * return it as a clean attribute string: the text is decoded into
 * name/value pairs, validated against the element's whitelist, and
 * re-encoded. Blank input yields an empty string.
 *
 * @param string $text Raw attribute text of the tag
 * @param string $element Element name
 * @return string Empty, or the encoded attributes with a leading space
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	return Sanitizer::safeEncodeTagAttributes(
		Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ),
			$element
		)
	);
}
/**
 * Encode an attribute value for HTML output.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so newlines, carriage returns and tabs are written as numeric
	// references here or they would not be preserved.
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Also encode the colon of anything that looks like a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored
	);
}
/**
 * Given a value, escape it so that it can be used in an id attribute.
 *
 * With $wgExperimentalHtmlIds enabled (and no 'legacy' option) runs of
 * whitespace, underscores, quotes, '&', '#' and '%' collapse to a single
 * underscore. Otherwise the value is URL-encoded with '%' written as '.'
 * and gets an "x" prefix when it does not start with an ASCII letter.
 *
 * @param string $id Id to escape
 * @param string|array $options One or more of:
 *   'noninitial': do not add the "x" prefix in HTML4-style mode
 *   'legacy': use HTML4-style escaping regardless of $wgExperimentalHtmlIds
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	global $wgExperimentalHtmlIds;
	$options = (array)$options;

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
		$decoded = Sanitizer::decodeCharReferences( $id );
		$trimmed = trim( preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $decoded ), '_' );
		# Nothing left after trimming: fall back to a lone underscore.
		return $trimmed === '' ? '_' : $trimmed;
	}

	# HTML4-style escaping
	$encoded = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
	$encoded = str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );

	// Initial character must be a letter!
	$needsPrefix = !preg_match( '/^[a-zA-Z]/', $encoded )
		&& !in_array( 'noninitial', $options );

	return $needsPrefix ? "x$encoded" : $encoded;
}
/**
 * Given a value, escape it so that it can be used as a CSS class.
 * A leading digit or hyphen, control characters, spaces, ASCII
 * punctuation and non-breaking spaces become underscores; runs of
 * underscores are collapsed and trailing ones removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$underscored = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ...and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $underscored );

	return rtrim( $collapsed, '_' );
}
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &#160; to survive.
 *
 * Character references are decoded first; without that step an existing
 * reference such as "&amp;" would come out double-escaped as "&amp;amp;".
 *
 * @param string $html HTML to escape
 * @return string Escaped input
 */
static function escapeHtmlAllowEntities( $html ) {
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
/**
 * Regex replace callback for armoring links against further processing:
 * every colon in the first capture group is written as "&#58;".
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$pieces = explode( ':', $matches[1] );
	return implode( '&#58;', $pieces );
}
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text, and runs of
 * whitespace in values are collapsed to a single space.
 *
 * @param string $text Raw attribute text of a tag
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Pick the value out of whichever quoting form matched
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * @param array $assoc_array Attribute name => value
 * @return string Empty, or ' name="value"' pairs each preceded by a space
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$rendered = '';
	foreach ( $assoc_array as $attribute => $value ) {
		$rendered .= ' ' . htmlspecialchars( $attribute )
			. '="' . Sanitizer::safeEncodeAttribute( $value ) . '"';
	}
	return $rendered;
}
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches (see getAttribsRegex()).
 *
 * @param array $set One PREG_SET_ORDER match
 * @throws MWException When the match has a "= value" part but no value group
 * @return string
 */
private static function getTagAttributeCallback( $set ) {
	# Value groups, most specific first:
	#   6 = illegal #XXXXXX color with no quotes, 5 = no quotes,
	#   4 = single-quoted, 3 = double-quoted
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
/**
 * Normalize whitespace and character references in an XML source-encoded
 * text attribute value, and encode any double quotes left in it.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
/**
 * Replace each CRLF pair, space, carriage return, line feed or tab with
 * a single space. Runs are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
1298  return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1299  }
/**
 * Run every character reference and bare ampersand in the text through
 * normalizeCharReferencesCallback(), so that well-formed references are
 * normalized and everything else is escaped.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
/**
 * preg_replace_callback handler for normalizeCharReferences().
 * Dispatches on the CHAR_REFS_REGEX group that matched: 1 = named
 * entity, 2 = decimal reference, 3 = hex reference. When no group
 * matched (bare ampersand) or the handler rejected the reference,
 * the matched text is HTML-escaped instead.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	}
	if ( is_null( $ret ) ) {
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
/**
 * Normalize a named entity:
 * - a known alias becomes the entity it stands for,
 * - lt, gt, amp and quot are kept as named entities,
 * - any other known HTML entity becomes a decimal numeric reference,
 * - an unknown name is returned with its ampersand escaped.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}
	if ( in_array( $name, array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
		return "&$name;";
	}
	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}
	return "&amp;$name;";
}
/**
 * Normalize a decimal character reference.
 *
 * @param string|int $codepoint Decimal digits of the reference
 * @return string|null "&#N;" for a code point accepted by
 *   validateCodepoint(), null otherwise
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null "&#xH;" (lower-case hex) for a code point accepted
 *   by validateCodepoint(), null otherwise
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
/**
 * Returns true if a given Unicode codepoint is accepted: tab, line feed,
 * carriage return, or a value inside one of the three allowed ranges
 * (which leave out the other control characters, the surrogates,
 * U+FFFE and U+FFFF, and anything above U+10FFFF).
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	// The only control characters that are let through
	if ( in_array( $codepoint, array( 0x09, 0x0a, 0x0d ) ) ) {
		return true;
	}

	$allowedRanges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $allowedRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
/**
 * Decode any character references, numeric or named entities,
 * in the text, using decodeCharReferencesCallback() for each match.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
/**
 * Decode any character references, numeric or named entities, in the
 * text. When at least one replacement was made, the result is passed
 * through the content language's normalize() method.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	global $wgContLang;
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
/**
 * preg_replace_callback handler for decodeCharReferences().
 * Dispatches on the CHAR_REFS_REGEX group that matched: 1 = named
 * entity, 2 = decimal reference, 3 = hex reference.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
/**
 * Return the UTF-8 encoding for the given code point, or
 * UTF8_REPLACEMENT when validateCodepoint() rejects it.
 *
 * @param int|float $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
/**
 * Decode a named entity. The name is first mapped through the alias table;
 * a known name is converted via codepointToUtf8(), while an unknown one is
 * handed back in "&name;" form (using the alias-resolved name).
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function decodeEntity( $name ) {
	// Resolve an alias to its canonical entity name, if there is one.
	$name = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$name] ) ) {
		return "&$name;";
	}
	return codepointToUtf8( self::$htmlEntities[$name] );
}
/**
 * Fetch the list of attributes allowed on the given element.
 *
 * @param string $element Element name, as used for the keys of the array
 *   returned by setupAttributeWhitelist()
 * @return array Allowed attribute names, or an empty array when the
 *   element has no whitelist entry
 */
static function attributeWhitelist( $element ) {
	// The lookup table must be fetched first; without this assignment
	// $list is undefined and every element would get an empty list.
	$list = self::setupAttributeWhitelist();
	return isset( $list[$element] )
		? $list[$element]
		: array();
}
/**
 * Build (and cache) the map of element name => allowed attribute names.
 *
 * The result is kept in a function-static variable and rebuilt only when
 * the values of $wgAllowRdfaAttributes / $wgAllowMicrodataAttributes
 * differ from those the cached copy was built with.
 *
 * @return array Map of element name to list of attribute names
 */
static function setupAttributeWhitelist() {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $whitelist, $staticInitialised;

	// Fingerprint of the settings that influence the table.
	$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
	$cacheIsFresh = isset( $whitelist ) && $staticInitialised == $globalContext;
	if ( $cacheIsFresh ) {
		return $whitelist;
	}

	# Attributes accepted on (almost) every element:
	# plain HTML ones first, then the WAI-ARIA 'role'.
	$common = array( 'id', 'class', 'style', 'lang', 'dir', 'title', 'role' );

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes
		array_push( $common, 'about', 'property', 'resource', 'datatype', 'typeof' );
	}

	if ( $wgAllowMicrodataAttributes ) {
		# HTML5 microdata attributes
		array_push( $common, 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' );
	}

	# Reusable attribute sets
	$block = $common;
	$block[] = 'align';

	$tablealign = array( 'align', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);

	$quoted = array_merge( $common, array( 'cite' ) );
	$edited = array_merge( $common, array( 'cite', 'datetime' ) );
	$columns = array_merge( $common, array( 'span' ) );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Section numbers below refer to the HTML 4.01 standard.
	$whitelist = array(
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6: address has no entry

		# 8.2.4
		'bdo' => $common,

		# 9.2.1 (acronym has no entry)
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,

		# 9.2.2
		'blockquote' => $quoted,
		'q' => $quoted,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array_merge( $common, array( 'clear' ) ),

		# Not part of HTML 4.01
		'wbr' => $common,

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => $edited,
		'del' => $edited,

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => array_merge( $common, array(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		) ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $columns,
		'col' => $columns,

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2 (basefont has no entry)
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),

		# 15.3
		'hr' => array_merge( $common, array( 'width' ) ),

		# HTML Ruby annotation text module, simple ruby only
		# (rbc and rtc have no entry; 'rbspan' is not whitelisted on rt).
		'ruby' => $common,
		'rb' => $common,
		'rt' => $common,
		'rp' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $common,

		# Further HTML5 elements
		'data' => array_merge( $common, array( 'value' ) ),
		'time' => array_merge( $common, array( 'datetime' ) ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	// Remember which settings this table corresponds to.
	$staticInitialised = $globalContext;

	return $whitelist;
}
/**
 * Reduce a text to plain form: remove everything between '<' and '>'
 * delimiters, decode character references, then normalize whitespace.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Drop the actual <tags>
	$untagged = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Then normalize &entities and whitespace, in that order
	return self::normalizeWhitespace( self::decodeCharReferences( $untagged ) );
}
/**
 * Produce a DOCTYPE declaration whose internal subset declares every
 * entity in the $htmlEntities table as its numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		// One <!ENTITY> declaration per known entity, no separators.
		$declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
	}
	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
/**
 * Clean up a URL: decode character references, percent-encode a fixed set
 * of unsafe characters, and strip from the host part the characters that
 * are ignored in internationalized domain names.
 *
 * @param string $url
 * @return string The cleaned URL; if it does not look like
 *   "protocol:[//host]rest" only the first two steps are applied
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs (the "commonly mapped to
		// nothing" table of RFC 3454, appendix B.1, plus whitespace).
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
			/xuD";

		// NOTE(review): with the 'u' modifier preg_replace() returns null
		// for a host that is not valid UTF-8, which makes the host vanish
		// from the result; confirm that this is the intended outcome.
		$host = preg_replace( $strip, '', $host );

		// @todo FIXME: Validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}
/**
 * preg_replace_callback() handler used by cleanUrl(): URL-encodes the
 * matched text.
 *
 * @param array $matches Match array; only the full match is used
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];
	return urlencode( $matched );
}
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * Handlers of the 'isValidEmailAddr' hook get the first say: when the hook
 * run returns false, whatever the handlers stored in $result is returned
 * as-is. Otherwise the address is matched against a liberal pattern:
 * one or more atext characters or dots, an at sign, and one or more
 * dot-separated labels of letters, digits and hyphens.
 *
 * @param string $addr E-mail address
 * @return bool|mixed True/false from the built-in check, or the value
 *   set by a hook handler
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject;
	// without it an address followed by a newline would be accepted.
	$html5_email_regexp = "/
		^                      # start of string
		[$rfc5322_atext\\.]+    # user part which is liberal :p
		@                      # at sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                      # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1870 }
