MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
37  '/&([A-Za-z0-9\x80-\xff]+);
38  |&\#([0-9]+);
39  |&\#[xX]([0-9A-Fa-f]+);
40  |(&)/x';
41 
/**
 * Splits one chunk of markup that followed a '<' into its parts.
 * Capture groups: (1) optional leading slash, (2) element name,
 * (3) everything between the name and the closing bracket,
 * (4) the closing '>' or '/>', (5) trailing text containing no '<'.
 */
46  const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
47 
/**
 * Matches "javascript" or "vbscript" (case-insensitive) at the start of
 * a value, after whitespace, or after a closing comment marker, when it
 * is followed by a non-word character or the end of the string.
 */
56  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/** Matches attribute names of the form "xmlns:<suffix>". */
57  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
58 
64  private static $htmlEntities = array(
65  'Aacute' => 193,
66  'aacute' => 225,
67  'Acirc' => 194,
68  'acirc' => 226,
69  'acute' => 180,
70  'AElig' => 198,
71  'aelig' => 230,
72  'Agrave' => 192,
73  'agrave' => 224,
74  'alefsym' => 8501,
75  'Alpha' => 913,
76  'alpha' => 945,
77  'amp' => 38,
78  'and' => 8743,
79  'ang' => 8736,
80  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
81  'Aring' => 197,
82  'aring' => 229,
83  'asymp' => 8776,
84  'Atilde' => 195,
85  'atilde' => 227,
86  'Auml' => 196,
87  'auml' => 228,
88  'bdquo' => 8222,
89  'Beta' => 914,
90  'beta' => 946,
91  'brvbar' => 166,
92  'bull' => 8226,
93  'cap' => 8745,
94  'Ccedil' => 199,
95  'ccedil' => 231,
96  'cedil' => 184,
97  'cent' => 162,
98  'Chi' => 935,
99  'chi' => 967,
100  'circ' => 710,
101  'clubs' => 9827,
102  'cong' => 8773,
103  'copy' => 169,
104  'crarr' => 8629,
105  'cup' => 8746,
106  'curren' => 164,
107  'dagger' => 8224,
108  'Dagger' => 8225,
109  'darr' => 8595,
110  'dArr' => 8659,
111  'deg' => 176,
112  'Delta' => 916,
113  'delta' => 948,
114  'diams' => 9830,
115  'divide' => 247,
116  'Eacute' => 201,
117  'eacute' => 233,
118  'Ecirc' => 202,
119  'ecirc' => 234,
120  'Egrave' => 200,
121  'egrave' => 232,
122  'empty' => 8709,
123  'emsp' => 8195,
124  'ensp' => 8194,
125  'Epsilon' => 917,
126  'epsilon' => 949,
127  'equiv' => 8801,
128  'Eta' => 919,
129  'eta' => 951,
130  'ETH' => 208,
131  'eth' => 240,
132  'Euml' => 203,
133  'euml' => 235,
134  'euro' => 8364,
135  'exist' => 8707,
136  'fnof' => 402,
137  'forall' => 8704,
138  'frac12' => 189,
139  'frac14' => 188,
140  'frac34' => 190,
141  'frasl' => 8260,
142  'Gamma' => 915,
143  'gamma' => 947,
144  'ge' => 8805,
145  'gt' => 62,
146  'harr' => 8596,
147  'hArr' => 8660,
148  'hearts' => 9829,
149  'hellip' => 8230,
150  'Iacute' => 205,
151  'iacute' => 237,
152  'Icirc' => 206,
153  'icirc' => 238,
154  'iexcl' => 161,
155  'Igrave' => 204,
156  'igrave' => 236,
157  'image' => 8465,
158  'infin' => 8734,
159  'int' => 8747,
160  'Iota' => 921,
161  'iota' => 953,
162  'iquest' => 191,
163  'isin' => 8712,
164  'Iuml' => 207,
165  'iuml' => 239,
166  'Kappa' => 922,
167  'kappa' => 954,
168  'Lambda' => 923,
169  'lambda' => 955,
170  'lang' => 9001,
171  'laquo' => 171,
172  'larr' => 8592,
173  'lArr' => 8656,
174  'lceil' => 8968,
175  'ldquo' => 8220,
176  'le' => 8804,
177  'lfloor' => 8970,
178  'lowast' => 8727,
179  'loz' => 9674,
180  'lrm' => 8206,
181  'lsaquo' => 8249,
182  'lsquo' => 8216,
183  'lt' => 60,
184  'macr' => 175,
185  'mdash' => 8212,
186  'micro' => 181,
187  'middot' => 183,
188  'minus' => 8722,
189  'Mu' => 924,
190  'mu' => 956,
191  'nabla' => 8711,
192  'nbsp' => 160,
193  'ndash' => 8211,
194  'ne' => 8800,
195  'ni' => 8715,
196  'not' => 172,
197  'notin' => 8713,
198  'nsub' => 8836,
199  'Ntilde' => 209,
200  'ntilde' => 241,
201  'Nu' => 925,
202  'nu' => 957,
203  'Oacute' => 211,
204  'oacute' => 243,
205  'Ocirc' => 212,
206  'ocirc' => 244,
207  'OElig' => 338,
208  'oelig' => 339,
209  'Ograve' => 210,
210  'ograve' => 242,
211  'oline' => 8254,
212  'Omega' => 937,
213  'omega' => 969,
214  'Omicron' => 927,
215  'omicron' => 959,
216  'oplus' => 8853,
217  'or' => 8744,
218  'ordf' => 170,
219  'ordm' => 186,
220  'Oslash' => 216,
221  'oslash' => 248,
222  'Otilde' => 213,
223  'otilde' => 245,
224  'otimes' => 8855,
225  'Ouml' => 214,
226  'ouml' => 246,
227  'para' => 182,
228  'part' => 8706,
229  'permil' => 8240,
230  'perp' => 8869,
231  'Phi' => 934,
232  'phi' => 966,
233  'Pi' => 928,
234  'pi' => 960,
235  'piv' => 982,
236  'plusmn' => 177,
237  'pound' => 163,
238  'prime' => 8242,
239  'Prime' => 8243,
240  'prod' => 8719,
241  'prop' => 8733,
242  'Psi' => 936,
243  'psi' => 968,
244  'quot' => 34,
245  'radic' => 8730,
246  'rang' => 9002,
247  'raquo' => 187,
248  'rarr' => 8594,
249  'rArr' => 8658,
250  'rceil' => 8969,
251  'rdquo' => 8221,
252  'real' => 8476,
253  'reg' => 174,
254  'rfloor' => 8971,
255  'Rho' => 929,
256  'rho' => 961,
257  'rlm' => 8207,
258  'rsaquo' => 8250,
259  'rsquo' => 8217,
260  'sbquo' => 8218,
261  'Scaron' => 352,
262  'scaron' => 353,
263  'sdot' => 8901,
264  'sect' => 167,
265  'shy' => 173,
266  'Sigma' => 931,
267  'sigma' => 963,
268  'sigmaf' => 962,
269  'sim' => 8764,
270  'spades' => 9824,
271  'sub' => 8834,
272  'sube' => 8838,
273  'sum' => 8721,
274  'sup' => 8835,
275  'sup1' => 185,
276  'sup2' => 178,
277  'sup3' => 179,
278  'supe' => 8839,
279  'szlig' => 223,
280  'Tau' => 932,
281  'tau' => 964,
282  'there4' => 8756,
283  'Theta' => 920,
284  'theta' => 952,
285  'thetasym' => 977,
286  'thinsp' => 8201,
287  'THORN' => 222,
288  'thorn' => 254,
289  'tilde' => 732,
290  'times' => 215,
291  'trade' => 8482,
292  'Uacute' => 218,
293  'uacute' => 250,
294  'uarr' => 8593,
295  'uArr' => 8657,
296  'Ucirc' => 219,
297  'ucirc' => 251,
298  'Ugrave' => 217,
299  'ugrave' => 249,
300  'uml' => 168,
301  'upsih' => 978,
302  'Upsilon' => 933,
303  'upsilon' => 965,
304  'Uuml' => 220,
305  'uuml' => 252,
306  'weierp' => 8472,
307  'Xi' => 926,
308  'xi' => 958,
309  'Yacute' => 221,
310  'yacute' => 253,
311  'yen' => 165,
312  'Yuml' => 376,
313  'yuml' => 255,
314  'Zeta' => 918,
315  'zeta' => 950,
316  'zwj' => 8205,
317  'zwnj' => 8204
318  );
319 
/**
 * Alternative entity names, keyed by alias and mapped to the canonical
 * name used as a key in $htmlEntities. normalizeEntity() and
 * decodeEntity() look a name up here before consulting the main table.
 */
323  private static $htmlEntityAliases = array(
324  'רלמ' => 'rlm',
325  'رلم' => 'rlm',
326  );
327 
/** Cache for the pattern assembled on first use by getAttribsRegex(). */
331  private static $attribsRegex;
332 
/**
 * Return the regular expression used to pick attributes out of the text
 * between an element name and its closing bracket. The pattern is
 * assembled on the first call and cached in self::$attribsRegex.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
 * 6 = unquoted "#hex" colour.
 *
 * @return string
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$nameStart = '[:A-Z_a-z0-9]';
	$nameChar = '[:A-Z_a-z-.0-9]';
	$ws = '[\x09\x0a\x0d\x20]';
	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
		  ($ws*=$ws*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
			 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
								 # colors are specified like this.
								 # We'll be normalizing it.
			)
		)?(?=$ws|\$)/sx";

	return self::$attribsRegex;
}
360 
/**
 * Keep only allowed HTML tags in $text and escape everything else.
 *
 * The text is split on '<'. Each chunk is matched against
 * ELEMENT_BITS_REGEX; a chunk whose lowercased element name is in the
 * allowed set (the built-in lists plus $extratags, minus $removetags)
 * and which passes the checks below is re-emitted with its attributes
 * run through fixTagAttributes(). Any other chunk is emitted with its
 * '<' and '>' characters turned into &lt; / &gt;.
 *
 * When $wgUseTidy is false a stack of open tags is maintained so that
 * mismatched closing tags are rejected and tags still open at the end
 * are closed. When it is true only the name/attribute checks are made.
 *
 * @param string $text
 * @param callable $processCallback Optional; invoked with ( &$params, $args )
 *   before the attributes of an accepted tag are validated
 * @param array|bool $args Passed through to $processCallback
 * @param array $extratags Additional element names to allow
 * @param array $removetags Element names to disallow
 * @return string
 */
372  public static function removeHTMLtags( $text, $processCallback = null,
373  $args = array(), $extratags = array(), $removetags = array()
374  ) {
// NOTE(review): $wgUseTidy, $wgAllowMicrodataAttributes and $wgAllowImageTag are
// read below, but no `global` declaration is visible in this listing (the embedded
// numbering skips a line here) - confirm against the upstream file.
376 
377  static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
378  $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
379 
380  // Base our staticInitialised variable off of the global config state so that if the globals
381  // are changed (like in the screwed up test system) we will re-initialise the settings.
382  $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
383  if ( !$staticInitialised || $staticInitialised != $globalContext ) {
384 
385  $htmlpairsStatic = array( # Tags that must be closed
386  'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
387  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
388  'strike', 'strong', 'tt', 'var', 'div', 'center',
389  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
390  'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
391  'kbd', 'samp', 'data', 'time', 'mark'
392  );
393  $htmlsingle = array(
394  'br', 'wbr', 'hr', 'li', 'dt', 'dd'
395  );
396  $htmlsingleonly = array( # Elements that cannot have close tags
397  'br', 'wbr', 'hr'
398  );
399  if ( $wgAllowMicrodataAttributes ) {
400  $htmlsingle[] = $htmlsingleonly[] = 'meta';
401  $htmlsingle[] = $htmlsingleonly[] = 'link';
402  }
403  $htmlnest = array( # Tags that can be nested--??
404  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
405  'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
406  'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
407  );
408  $tabletags = array( # Can only appear inside table, we will close them
409  'td', 'th', 'tr',
410  );
411  $htmllist = array( # Tags used by list
412  'ul', 'ol',
413  );
414  $listtags = array( # Tags that can appear in a list
415  'li',
416  );
417 
418  if ( $wgAllowImageTag ) {
419  $htmlsingle[] = 'img';
420  $htmlsingleonly[] = 'img';
421  }
422 
423  $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
424  $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
425 
426  # Convert them all to hashtables for faster lookup
427  $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
428  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
429  foreach ( $vars as $var ) {
430  $$var = array_flip( $$var );
431  }
432  $staticInitialised = $globalContext;
433  }
434  # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
435  $extratags = array_flip( $extratags );
436  $removetags = array_flip( $removetags );
437  $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
438  $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
439 
440  # Remove HTML comments
441  $text = Sanitizer::removeHTMLcomments( $text );
# Everything before the first '<' is plain text; each later element of
# $bits begins right after a '<'.
442  $bits = explode( '<', $text );
443  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
444  if ( !$wgUseTidy ) {
# $tagstack holds the currently open tags; $tablestack saves the outer
# $tagstack while a <table> is being processed.
445  $tagstack = $tablestack = array();
446  foreach ( $bits as $x ) {
447  $regs = array();
448  # $slash: Does the current element start with a '/'?
449  # $t: Current element name
450  # $params: String between element name and >
451  # $brace: Ending '>' or '/>'
452  # $rest: Everything until the next element of $bits
453  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
454  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
455  } else {
456  $slash = $t = $params = $brace = $rest = null;
457  }
458 
459  $badtag = false;
460  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
461  # Check our stack
462  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
463  $badtag = true;
464  } elseif ( $slash ) {
465  # Closing a tag... is it the one we just opened?
# NOTE(review): the embedded numbering skips a line before and after each
# array_pop()/array_push() in this branch, so statements may be missing
# from this listing - confirm against the upstream file.
467  $ot = array_pop( $tagstack );
469 
470  if ( $ot != $t ) {
471  if ( isset( $htmlsingleallowed[$ot] ) ) {
472  # Pop all elements with an optional close tag
473  # and see if we find a match below them
474  $optstack = array();
475  array_push( $optstack, $ot );
477  $ot = array_pop( $tagstack );
479  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
480  array_push( $optstack, $ot );
482  $ot = array_pop( $tagstack );
484  }
485  if ( $t != $ot ) {
486  # No match. Push the optional elements back again
487  $badtag = true;
489  $ot = array_pop( $optstack );
491  while ( $ot ) {
492  array_push( $tagstack, $ot );
494  $ot = array_pop( $optstack );
496  }
497  }
498  } else {
500  array_push( $tagstack, $ot );
502 
503  # <li> can be nested in <ul> or <ol>, skip those cases:
504  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
505  $badtag = true;
506  }
507  }
508  } else {
# Closing a table restores the tag stack that was saved when it opened
509  if ( $t == 'table' ) {
510  $tagstack = array_pop( $tablestack );
511  }
512  }
513  $newparams = '';
514  } else {
515  # Keep track for later
516  if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
517  $badtag = true;
518  } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
519  $badtag = true;
520  # Is it a self closed htmlpair ? (bug 5487)
521  } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
522  $badtag = true;
523  } elseif ( isset( $htmlsingleonly[$t] ) ) {
524  # Hack to force empty tag for unclosable elements
525  $brace = '/>';
526  } elseif ( isset( $htmlsingle[$t] ) ) {
527  # Hack to not close $htmlsingle tags
528  $brace = null;
529  # Still need to push this optionally-closed tag to
530  # the tag stack so that we can match end tags
531  # instead of marking them as bad.
532  array_push( $tagstack, $t );
533  } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
534  // New table tag but forgot to close the previous one
535  $text .= "</$t>";
536  } else {
# Opening a table starts a fresh tag stack; the outer one is saved
537  if ( $t == 'table' ) {
538  array_push( $tablestack, $tagstack );
539  $tagstack = array();
540  }
541  array_push( $tagstack, $t );
542  }
543 
544  # Replace any variables or template parameters with
545  # plaintext results.
546  if ( is_callable( $processCallback ) ) {
547  call_user_func_array( $processCallback, array( &$params, $args ) );
548  }
549 
550  if ( !Sanitizer::validateTag( $params, $t ) ) {
551  $badtag = true;
552  }
553 
554  # Strip non-approved attributes from the tag
555  $newparams = Sanitizer::fixTagAttributes( $params, $t );
556  }
557  if ( !$badtag ) {
558  $rest = str_replace( '>', '&gt;', $rest );
559  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
560  $text .= "<$slash$t$newparams$close>$rest";
561  continue;
562  }
563  }
# Unknown or rejected tag: emit the whole chunk as escaped text
564  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
565  }
566  # Close off any remaining tags
567  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
568  $text .= "</$t>\n";
569  if ( $t == 'table' ) {
570  $tagstack = array_pop( $tablestack );
571  }
572  }
573  } else {
574  # this might be possible using tidy itself
575  foreach ( $bits as $x ) {
576  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
577  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
578 
579  $badtag = false;
580  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
581  if ( is_callable( $processCallback ) ) {
582  call_user_func_array( $processCallback, array( &$params, $args ) );
583  }
584 
585  if ( !Sanitizer::validateTag( $params, $t ) ) {
586  $badtag = true;
587  }
588 
589  $newparams = Sanitizer::fixTagAttributes( $params, $t );
590  if ( !$badtag ) {
591  $rest = str_replace( '>', '&gt;', $rest );
592  $text .= "<$slash$t$newparams$brace$rest";
593  continue;
594  }
595  }
596  }
597  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
598  }
599  }
600  return $text;
601  }
602 
/**
 * Strip every complete HTML comment from the text.
 *
 * If a comment, together with the spaces around it, is bounded by a
 * newline on both sides, the comment, those spaces and one of the two
 * newlines are removed so that a single newline remains. Otherwise only
 * the comment itself is cut out. An unterminated "<!--" stops
 * processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}

		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# No terminator: leave the remainder untouched
			break;
		}
		$close += 3;

		# Grow a window outwards over the spaces on either side
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$boundedByNewlines = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";

		if ( $boundedByNewlines ) {
			# Drop comment, surrounding spaces and one of the newlines
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Drop the comment only
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}

	return $text;
}
645 
/**
 * Check the extra requirements placed on <meta> and <link>: both need an
 * itemprop attribute, <meta> additionally needs content and <link>
 * additionally needs href. Every other element is accepted.
 *
 * @param string|array $params Raw attribute text as captured from the
 *   tag, or an already decoded name => value map
 * @param string $element Lowercased element name
 * @return bool True when the tag may be kept
 */
static function validateTag( $params, $element ) {
	// removeHTMLtags() hands over the raw attribute text; it has to be
	// parsed into a name => value map before attributes can be looked up.
	if ( !is_array( $params ) ) {
		$params = Sanitizer::decodeTagAttributes( $params );
	}

	if ( $element == 'meta' || $element == 'link' ) {
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $params['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $params['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
678 
/**
 * Filter a decoded attribute map against the list of attributes
 * allowed for the given element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Element name used to look up the allowed list
 * @return array The result of validateAttributes()
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
698 
/**
 * Filter a decoded attribute map, keeping only acceptable attributes.
 *
 * - "xmlns:*" attributes are kept when $wgAllowRdfaAttributes is set
 *   and their value does not match EVIL_URI_PATTERN.
 * - Otherwise an attribute must start with "data-" or be listed in
 *   $whitelist.
 * - "style" values are passed through checkCss(), "id" values through
 *   escapeId().
 * - "role" is kept only with the value "presentation".
 * - RDFa / microdata attributes are dropped when their value matches
 *   EVIL_URI_PATTERN.
 * - "href" / "src" are dropped unless they start with an allowed
 *   protocol.
 * - With $wgAllowMicrodataAttributes set, itemtype / itemid / itemref
 *   are removed when no itemscope attribute survived.
 *
 * @param array $attribs Attribute name => value
 * @param array $whitelist List of allowed attribute names
 * @return array Filtered attribute name => value map
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		#allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# WAI-ARIA
		# http://www.w3.org/TR/wai-aria/
		# http://www.whatwg.org/html/elements.html#wai-aria
		# For now we only support role="presentation" until we work out what roles should be
		# usable by content and we ensure that our code explicitly rejects patterns that
		# violate HTML5's ARIA restrictions.
		if ( $attribute === 'role' && $value !== 'presentation' ) {
			continue;
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			//Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; //drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
799 
/**
 * Merge two attribute maps. Values from $b win over values from $a,
 * except for "class": when both maps carry a string class and the two
 * strings differ, the result holds the de-duplicated, space-separated
 * union of the class names in order of first appearance.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
		return $merged;
	}
	if ( $a['class'] === $b['class'] ) {
		return $merged;
	}

	$combined = "{$a['class']} {$b['class']}";
	$names = preg_split( '/\s+/', $combined, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
822 
/**
 * Normalize a CSS value so that later keyword checks see the text the
 * way a browser could interpret it.
 *
 * Steps, in order: decode character references; decode CSS backslash
 * escapes and line continuations; map fullwidth forms U+FF01..U+FF5A
 * (except U+FF3C) to their ASCII counterparts; map a few further
 * look-alike characters to ASCII; replace comments with spaces and cut
 * everything after an unclosed comment start (unless the whole value
 * is a single comment); and turn "s" followed by certain iteration /
 * prolonged-sound marks into "ss".
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {

	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) |  # 3. backslash cancelling special meaning
				() |  # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
	$value = preg_replace_callback(
		// U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
		function ( $matches ) {
			$bytes = $matches[0];
			// Every character in the matched range is a three-byte
			// UTF-8 sequence; anything else is dropped.
			if ( strlen( $bytes ) !== 3 ) {
				return '';
			}
			$cp = ( ( ord( $bytes[0] ) & 0x0f ) << 12 )
				| ( ( ord( $bytes[1] ) & 0x3f ) << 6 )
				| ( ord( $bytes[2] ) & 0x3f );
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
921 
922 
/**
 * Normalize a CSS value with normalizeCss() and reject it when the
 * result contains control characters or one of the listed risky
 * constructs. A rejected value is replaced by a CSS comment naming the
 * reason; an accepted value is returned in its normalized form.
 *
 * @param string $value
 * @return string
 */
static function checkCss( $value ) {
	$css = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $css ) ) {
		return '/* invalid control char */';
	}

	$risky = '! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
		!ix';
	if ( preg_match( $risky, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
962 
/**
 * preg_replace_callback() handler for the CSS escape pattern built in
 * normalizeCss(). Group 1 is a line continuation, group 2 a hex
 * character number, group 3 a single escaped character; when all are
 * empty the backslash stood at the end of the string.
 *
 * @param array $matches
 * @return string Replacement text
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} else {
		$char = $matches[3] !== '' ? $matches[3] : '\\';
	}

	switch ( $char ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		default:
			// Decode unnecessary escape
			return $char;
	}
}
987 
/**
 * Turn the raw attribute text of a tag into safe attribute markup:
 * decode it, filter it for the given element, and re-encode it.
 * Blank input yields an empty string.
 *
 * @param string $text Text between the element name and the closing bracket
 * @param string $element Element name
 * @return string
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) != '' ) {
		$attribs = Sanitizer::decodeTagAttributes( $text );
		$allowed = Sanitizer::validateTagAttributes( $attribs, $element );
		return Sanitizer::safeEncodeTagAttributes( $allowed );
	}

	return '';
}
1017 
/**
 * Encode a string for use as an HTML attribute value. Besides the
 * usual HTML special characters (both quote styles included), newline,
 * carriage return and tab become numeric character references.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		$escaped
	);
}
1037 
/**
 * Encode an attribute value like encodeAttribute() and additionally
 * replace a fixed set of characters and keywords with character
 * references, then replace the colon inside every match of the
 * wfUrlProtocols() pattern with "&#58;".
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$encoded = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Stupid hack
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$encoded
	);
}
1070 
/**
 * Turn arbitrary text into a string usable as an id attribute value.
 *
 * Character references in $id are decoded first. Then:
 * - If $wgExperimentalHtmlIds is set and 'legacy' is not among the
 *   options, every run of space, tab, newline, carriage return, form
 *   feed, underscore, quote, ampersand, hash or percent characters
 *   becomes one underscore, leading and trailing underscores are
 *   trimmed, and an empty result is replaced by "_".
 * - Otherwise spaces become underscores, the text is URL-encoded,
 *   "%3A" is turned back into ":" and every remaining "%" into ".".
 *   Unless 'noninitial' is among the options, an "x" is prepended when
 *   the result does not start with an ASCII letter.
 *
 * @param string $id
 * @param string|array $options One option name or a list of them:
 *   'legacy', 'noninitial'
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	global $wgExperimentalHtmlIds;
	$options = (array)$options;

	$id = Sanitizer::decodeCharReferences( $id );

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options, true ) ) {
		$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
		$id = trim( $id, '_' );
		if ( $id === '' ) {
			// Must have been all whitespace to start with.
			return '_';
		} else {
			return $id;
		}
	}

	// HTML4-style escaping
	static $replace = array(
		'%3A' => ':',
		'%' => '.'
	);

	$id = urlencode( strtr( $id, ' ', '_' ) );
	$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

	if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options, true ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
1134 
/**
 * Turn arbitrary text into something usable as a class name: a leading
 * digit or hyphen, control characters, spaces, ASCII punctuation (other
 * than "-" and "_") and the byte pair C2 A0 each become an underscore,
 * runs of underscores collapse to one, and trailing underscores are
 * removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$unsafe = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$flattened = preg_replace( $unsafe, '_', $class );

	// ...and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $flattened );

	return rtrim( $collapsed, '_' );
}
1153 
/**
 * Escape text for HTML output while letting character references that
 * are already present in the input keep their meaning: references are
 * decoded to characters first, then the whole string is escaped once,
 * so an existing "&amp;" does not end up as "&amp;amp;".
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
1168 
/**
 * preg_replace_callback() handler used by safeEncodeAttribute():
 * replaces each colon in the first capture group with "&#58;".
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	$armored = str_replace( ':', '&#58;', $protocol );
	return $armored;
}
1177 
/**
 * Parse the raw attribute text of a tag into a name => value map.
 *
 * Attribute names are lowercased. In each value, runs of tab, carriage
 * return, newline and space collapse to a single space, the ends are
 * trimmed, and character references are decoded. A later attribute
 * with the same name overwrites an earlier one. Blank input, or input
 * in which the attribute pattern finds nothing, yields an empty array.
 *
 * @param string $text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Pick whichever value form (quoted, bare, or none) matched
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
1214 
/**
 * Build attribute markup from a name => value map. Names are
 * HTML-escaped, values go through safeEncodeAttribute(), and each pair
 * is rendered as name="value". A non-empty result starts with a space;
 * an empty map yields an empty string.
 *
 * @param array $assoc_array
 * @return string
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$rendered = array();
	foreach ( $assoc_array as $name => $raw ) {
		$rendered[] = htmlspecialchars( $name )
			. '="' . Sanitizer::safeEncodeAttribute( $raw ) . '"';
	}

	if ( !count( $rendered ) ) {
		return '';
	}
	return ' ' . implode( ' ', $rendered );
}
1232 
/**
 * Pick the attribute value out of one match set produced by the
 * getAttribsRegex() pattern. The highest-numbered value group that is
 * present wins (6 = bare #hex colour, 5 = unquoted, 4 = single-quoted,
 * 3 = double-quoted). When the set has no "= value" part at all, the
 * attribute name itself is returned as the value.
 *
 * @param array $set
 * @throws MWException When a "= value" part exists but no value group does
 * @return string
 */
private static function getTagAttributeCallback( $set ) {
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1262 
/**
 * Replace each CR LF pair, and each remaining space, carriage return,
 * newline or tab, with one space character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
1273 
1283  return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1284  }
1285 
/**
 * Run every match of CHAR_REFS_REGEX in the text through
 * normalizeCharReferencesCallback() and return the result.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1307 
1313  $ret = null;
1314  if ( $matches[1] != '' ) {
1316  } elseif ( $matches[2] != '' ) {
1318  } elseif ( $matches[3] != '' ) {
1320  }
1321  if ( is_null( $ret ) ) {
1322  return htmlspecialchars( $matches[0] );
1323  } else {
1324  return $ret;
1325  }
1326  }
1327 
/**
 * Rewrite a named entity into its normalized form:
 * - an alias becomes the canonical named entity,
 * - lt, gt, amp and quot stay named entities,
 * - any other name found in $htmlEntities becomes a decimal reference,
 * - an unknown name is returned with its ampersand escaped.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	if ( in_array( $name, array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
		return "&$name;";
	}

	if ( !isset( self::$htmlEntities[$name] ) ) {
		return "&amp;$name;";
	}

	return '&#' . self::$htmlEntities[$name] . ';';
}
1349 
/**
 * Build a decimal character reference for the given code point.
 *
 * @param int|string $codepoint Decimal digits
 * @return string|null The reference, or null when validateCodepoint()
 *   rejects the code point
 */
static function decCharReference( $codepoint ) {
	$number = intval( $codepoint );
	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#%d;', $number )
		: null;
}
1362 
/**
 * Build a lowercase hexadecimal character reference for the given
 * code point.
 *
 * @param string $codepoint Hexadecimal digits
 * @return string|null The reference, or null when validateCodepoint()
 *   rejects the code point
 */
static function hexCharReference( $codepoint ) {
	$number = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#x%x;', $number )
		: null;
}
1375 
/**
 * Tell whether a number is an accepted code point: tab, newline,
 * carriage return, or anything in 0x20..0xD7FF, 0xE000..0xFFFD or
 * 0x10000..0x10FFFF.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1389 
/**
 * Run every match of CHAR_REFS_REGEX in the text through
 * decodeCharReferencesCallback() and return the result.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1403 
/**
 * Decode character references like decodeCharReferences() and, when at
 * least one replacement was made, pass the result through the content
 * language's normalize() method. Text in which the pattern matched
 * nothing is returned unchanged.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	global $wgContLang;
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
1427 
/**
 * Callback for preg_replace_callback() over self::CHAR_REFS_REGEX.
 *
 * The regex captures, in order: an entity name, a decimal code point,
 * a hexadecimal code point, or a lone ampersand.
 *
 * @param array $matches Match array from preg_replace_callback()
 * @return string Replacement text for the match
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		# Named entity
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		# Decimal numeric reference
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		# Hexadecimal numeric reference
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
1443 
/**
 * Return the UTF-8 string for a code point if validateCodepoint()
 * accepts it, otherwise the U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	if ( Sanitizer::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Utils::codepointToUtf8( $codepoint );
	} else {
		# Never fall off the end and return null: callers splice the
		# result into text, so an invalid reference must become U+FFFD.
		# NOTE(review): constant assumed to live in UtfNormal\Constants,
		# alongside the UtfNormal\Utils class used above; confirm.
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}
}
1458 
/**
 * Decode a named entity to its UTF-8 character.
 *
 * The name is first mapped through self::$htmlEntityAliases. If the
 * resulting name is not in self::$htmlEntities, the reference is
 * returned in '&name;' form (using the alias-resolved name).
 *
 * @param string $name Entity name, without '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$resolved = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$resolved] ) ) {
		# Unknown entity: hand it back as a reference
		return '&' . $resolved . ';';
	}

	$codepoint = self::$htmlEntities[$resolved];
	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1477 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element Element name
 * @return array List of attribute names; empty when the element has no
 *   entry in the whitelist
 */
static function attributeWhitelist( $element ) {
	# $list must be populated before the lookup, otherwise every
	# element would report an empty whitelist.
	$list = Sanitizer::setupAttributeWhitelist();
	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1490 
/**
 * Build the map of allowed HTML elements to their allowed attributes.
 *
 * The result is cached in a static variable. The cache is keyed on the
 * values of $wgAllowRdfaAttributes and $wgAllowMicrodataAttributes and
 * is rebuilt whenever either of them changes between calls.
 *
 * @return array Element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	# Both settings are read below; without this declaration they would
	# be undefined locals, the RDFa/Microdata attributes would never be
	# added and the cache key would never change.
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
	static $whitelist, $staticInitialised;

	$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	# Reuse the cached list only if it was built for the same settings
	if ( $whitelist !== null && $staticInitialised === $globalContext ) {
		return $whitelist;
	}

	$common = array(
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'role',
	);

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		$common = array_merge( $common, array(
			'about', 'property', 'resource', 'datatype', 'typeof',
		) );
	}

	if ( $wgAllowMicrodataAttributes ) {
		# add HTML5 microdata tags as specified by
		# http://www.whatwg.org/html/microdata.html#the-microdata-model
		$common = array_merge( $common, array(
			'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
		) );
	}

	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array(
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		'q' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array_merge( $common, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			) ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span' ) ),
		'col' => array_merge( $common, array( 'span' ) ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'width' ) ),

		# HTML Ruby annotation text module, simple ruby only.
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $common, array( 'value' ) ),
		'time' => array_merge( $common, array( 'datetime' ) ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	# Remember which settings this list was built for
	$staticInitialised = $globalContext;

	return $whitelist;
}
1708 
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed, encoded as plain text.
 *
 * @param string $text HTML fragment
 * @return string
 */
static function stripAllTags( $text ) {
	# Remove actual <tags>: everything delimited by '<' and '>' is
	# replaced with the empty string
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Decode &entities first, then normalize whitespace
	$decoded = self::decodeCharReferences( $withoutTags );

	return self::normalizeWhitespace( $decoded );
}
1729 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for every entry of
 * self::$htmlEntities, mapping the entity name to its decimal numeric
 * character reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1747 
/**
 * Clean up a URL: decode character references, percent-encode
 * characters that must not appear raw, and strip characters that are
 * ignored in internationalized host names from the host part.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$decoded = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$escaped = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ), $decoded );

	# Split into protocol, optional //host, and the remainder.
	# If the URL does not have that shape, return it as it stands.
	$parts = array();
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $escaped, $parts ) ) {
		return $escaped;
	}

	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// http://tools.ietf.org/html/3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$ignorable = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$cleanHost = preg_replace( $ignorable, '', $host );

	// @todo FIXME: Validate hostnames here

	return $protocol . $cleanHost . $rest;
}
1794 
/**
 * Callback used by cleanUrl(): percent-encode the matched character.
 *
 * @param array $matches Match array from preg_replace_callback()
 * @return string urlencode()d form of the whole match
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1802 
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; when it returns false, the
 * value it stored in $result is returned and the built-in check is
 * skipped. Otherwise the address is matched against a pattern of the
 * form local-part@label(.label)*.
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the built-in check, or whatever the hook
 *   stored in $result (null if it stored nothing)
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// Please note the strings below are enclosed in brackets [], which
	// makes the hyphen "-" a range indicator. Hence it is double
	// backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the
	// subject. Without it "$" also matches just before a trailing
	// newline, so "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # 'apostrophe'
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1854 }
utf8ToCodepoint($char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
static decCharReference($codepoint)
Definition: Sanitizer.php:1354
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1712
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1186
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1338
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
or
false for read/write
static safeEncodeTagAttributes($assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1222
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:1312
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:325
static removeHTMLtags($text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
Definition: Sanitizer.php:372
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1496
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1712
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content.The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content.These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text.All manipulation and analysis of page content must be done via the appropriate methods of the Content object.For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers.The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id).Also Title, WikiPage and Revision now have getContentHandler() methods for convenience.ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page.ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type.However, it is recommended to instead use WikiPage::getContent() resp.Revision::getContent() to get a page's content as a Content object.These two methods should be the ONLY way in which page content is accessed.Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides().This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based.Objects implementing the Content interface are used to represent and handle the content internally.For storage and data exchange, each content model supports at least one serialization format via 
ContentHandler::serializeContent($content).The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats().Content serialization formats are identified using MIME type like strings.The following formats are built in:*text/x-wiki-wikitext *text/javascript-for js pages *text/css-for css pages *text/plain-for future use, e.g.with plain text messages.*text/html-for future use, e.g.with plain html messages.*application/vnd.php.serialized-for future use with the api and for extensions *application/json-for future use with the api, and for use by extensions *application/xml-for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant.Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly.Without that information, interpretation of the provided content is not reliable.The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export.Also note that the API will provide encapsulated, serialized content-so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure.Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content.However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page's content model, and will now generate warnings when used.Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() is 
deprecated in favor Revisions::getContent()*WikiPage::getText() is deprecated in favor WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superceded by Article::getContentObject().However, both methods should be avoided since they do not provide clean access to the page's actual content.For instance, they may return a system message for non-existing pages.Use WikiPage::getContent() instead.Code that relies on a textual representation of the page content should eventually be rewritten.However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page.Its behavior is controlled by $wgContentHandlerTextFallback it
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1432
static cssDecodeCallback($matches)
Definition: Sanitizer.php:967
$value
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by similarly to how extensions are installed You can then make that skin the default by adding
Definition: skin.txt:57
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1146
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1739
static cleanUrl($url)
Definition: Sanitizer.php:1752
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:1719
when a variable name is used in a it is silently declared as a new local masking the global
Definition: design.txt:93
const UTF8_REPLACEMENT
static hexCharReference($codepoint)
Definition: Sanitizer.php:1367
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and local administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:694
static normalizeWhitespace($text)
Definition: Sanitizer.php:1267
Apache License January http
it sets a lot of them automatically from query and such
Definition: design.txt:93
if($line===false) $args
Definition: cdb.php:64
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:1397
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
pull multiple revisions may often pull multiple times from the same blob.
Definition: deferred.txt:11
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1381
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:56
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec http://www.w3.org/TR/html5/syntax.html#tag-open-state.
Definition: Sanitizer.php:46
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1451
static normalizeSectionNameWhitespace($section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(), for use in the id's that are used for section links.
Definition: Sanitizer.php:1282
Some quick notes on the file repository architecture Functionality is
Definition: README:3
static $htmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html As well as &apos; which is only defined starting in XHTML1.
Definition: Sanitizer.php:64
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1484
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message.Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item.$reader:XMLReader object $logInfo:Array of information Return false to stop further processing of the tag 'ImportHandlePageXMLTag':When parsing a XML tag in a page.$reader:XMLReader object $pageInfo:Array of information Return false to stop further processing of the tag 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision.$reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information Return false to stop further processing of the tag 'ImportHandleToplevelXMLTag':When parsing a top level XML tag.$reader:XMLReader object Return false to stop further processing of the tag 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload.$reader:XMLReader object $revisionInfo:Array of information Return false to stop further processing of the tag 'InfoAction':When building information to display on the action=info page.$context:IContextSource object &$pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect.$title:Title object for the current page $request:WebRequest $ignoreRedirect:boolean to skip redirect check $target:Title/string of redirect target $article:Article object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not.Return true without providing an interwiki to continue interwiki search.$prefix:interwiki prefix we are looking for.&$iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 
'InternalParseBeforeSanitize':during Parser's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings.Ideal for syntax-extensions after template/parser function execution which respect nowiki and HTML-comments.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InternalParseBeforeLinks':during Parser's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InvalidateEmailComplete':Called after a user's email has been invalidated successfully.$user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification.Callee may modify $url and $query, URL will be constructed as $url.$query &$url:URL to index.php &$query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) $article:article(object) being checked 'IsTrustedProxy':Override the result of wfIsTrustedProxy() $ip:IP being check $result:Change this value to override the result of wfIsTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from &$allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of Sanitizer::validateEmail(), for instance to return false if the domain name doesn't match your organization.$addr:The e-mail address entered by the user &$result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user &$result:Set this and return false to override the internal checks $user:User the password is being validated for 
'Language::getMessagesFileName':$code:The language code or the language we're looking for a messages file for &$file:The messages file path, you can override this to change the location. 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces.Do not use this hook to add namespaces.Use CanonicalNamespaces for that.&$namespaces:Array of namespaces indexed by their numbers 'LanguageGetMagic':DEPRECATED, use $magicWords in a file listed in $wgExtensionMessagesFiles instead.Use this to define synonyms of magic words depending of the language $magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetSpecialPageAliases':DEPRECATED, use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead.Use to define aliases of special pages names depending of the language $specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names.&$names:array of language code=> language name $code language of the preferred translations 'LanguageLinks':Manipulate a page's language links.This is called in various places to allow extensions to define the effective language links for a page.$title:The page's Title.&$links:Associative array mapping language codes to prefixed links of the form"language:title".&$linkFlags:Associative array mapping prefixed links to arrays of flags.Currently unused, but planned to provide support for marking individual language links in the UI, e.g.for featured articles. 'LanguageSelector':Hook to change the language selector available on a page.$out:The output page.$cssClassName:CSS class name of the language selector. 
'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts.Return false to skip default processing and return $ret.See documentation for Linker::link() for details on the expected meanings of parameters.$skin:the Skin object $target:the Title that the link is pointing to &$html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1710
namespace and then decline to actually register it file or subcat img or subcat RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions instead of letting the login form give the generic error message that the account does not exist For when the account has been renamed or deleted or an array to pass a message key and parameters but no entry for that model exists in $wgContentHandlers if desired whether it is OK to use $contentModel on $title Handler functions that modify $ok should generally return false to prevent further hooks from further modifying $ok called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content as context as context $options
Definition: hooks.txt:952
static escapeHtmlAllowEntities($html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1161
static validateTag($params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:658
static mergeAttributes($a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:810
MediaWiki exception.
Definition: MWException.php:26
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable from
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:36
static run($event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:137
$params
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
be sent.
static validateAttributes($attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
Definition: Sanitizer.php:714
static decodeCharReferencesAndNormalize($text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1414
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable or merely the Work and Derivative Works thereof Contribution shall mean any work of including the original version of the Work and any modifications or additions to that Work or Derivative Works that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner For the purposes of this submitted means any form of or written communication sent to the Licensor or its including but not limited to communication on electronic mailing source code control and issue tracking systems that are managed by
static escapeId($id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it. ...
Definition: Sanitizer.php:1102
static cleanUrlCallback($matches)
Definition: Sanitizer.php:1799
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
wfSuppressWarnings($end=false)
Reference-counted warning suppression.
usually copyright or history_copyright This message must be in HTML not wikitext if the section is included from a template $section
Definition: hooks.txt:2512
#define the
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1174
wfUrlProtocols($includeProtocolRelative=true)
Returns a regular expression of url protocols.
static normalizeCss($value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:832
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:57
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types as usual *javascript user provided javascript code *json simple implementation for use by extensions
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:323
$wgUseTidy
$wgUseTidy: use tidy to make sure HTML output is sane.
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1301
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
Bar style
wfRestoreWarnings()
Restore error level to previous value.
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition: design.txt:56
$wgExperimentalHtmlIds
Should we allow a broader set of characters in id attributes, per HTML5? If not, use only HTML 4-comp...
$count
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
Definition: Sanitizer.php:1007
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:612
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:339
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
static validateEmail($addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1831
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:941
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1241
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
maintenance dev scripts can help quickly setup a local MediaWiki for development purposes Wikis setup in this way are NOT meant to be publicly available They use a development database not acceptable for use in production Place a sqlite database in an unsafe location a real wiki should never place it in And use predictable default logins for the initial administrator user Running maintenance dev install sh will download and install a local copy of php
Definition: README:5
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1467
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
Definition: Sanitizer.php:1044
$wgAllowRdfaAttributes
Enable RDFa attributes for use in wikitext.
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1888
$wgAllowMicrodataAttributes
Enable HTML5 microdata attributes for use in wikitext.
PHP Parser - Processes wiki markup (which uses a more user-friendly syntax, such as "[[link]]" for ma...
Definition: Parser.php:67
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1712
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:331
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1023
$matches