MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
/**
 * Regular expression matching the character references handled by
 * normalizeCharReferences() and decodeCharReferences().
 *
 * Capture groups:
 *  1. name of a named entity (e.g. "amp")
 *  2. digits of a decimal numeric reference
 *  3. digits of a hexadecimal numeric reference
 *  4. a bare ampersand that does not start any of the above
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';
41 
/**
 * Splits one "<"-delimited chunk of markup into its parts.
 *
 * Capture groups: 1 = optional leading "/", 2 = element name,
 * 3 = text between the name and the closing bracket, 4 = ">" or "/>",
 * 5 = everything after the tag (which must not contain "<").
 */
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Matches "javascript" or "vbscript" as a word at the start of a value,
 * after whitespace, or after the end of a CSS comment (case-insensitive).
 */
const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/**
 * Matches an attribute name of the form "xmlns:<prefix>".
 */
const XMLNS_ATTRIBUTE_PATTERN = '/^xmlns:[:A-Z_a-z-.0-9]+$/';
58 
/**
 * Map of recognised named character entities to their Unicode code points.
 *
 * Used by normalizeEntity() and decodeEntity() to translate "&name;"
 * references into numeric references or literal characters.
 */
private static $htmlEntities = array(
	'Aacute' => 193,
	'aacute' => 225,
	'Acirc' => 194,
	'acirc' => 226,
	'acute' => 180,
	'AElig' => 198,
	'aelig' => 230,
	'Agrave' => 192,
	'agrave' => 224,
	'alefsym' => 8501,
	'Alpha' => 913,
	'alpha' => 945,
	'amp' => 38,
	'and' => 8743,
	'ang' => 8736,
	'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
	'Aring' => 197,
	'aring' => 229,
	'asymp' => 8776,
	'Atilde' => 195,
	'atilde' => 227,
	'Auml' => 196,
	'auml' => 228,
	'bdquo' => 8222,
	'Beta' => 914,
	'beta' => 946,
	'brvbar' => 166,
	'bull' => 8226,
	'cap' => 8745,
	'Ccedil' => 199,
	'ccedil' => 231,
	'cedil' => 184,
	'cent' => 162,
	'Chi' => 935,
	'chi' => 967,
	'circ' => 710,
	'clubs' => 9827,
	'cong' => 8773,
	'copy' => 169,
	'crarr' => 8629,
	'cup' => 8746,
	'curren' => 164,
	'dagger' => 8224,
	'Dagger' => 8225,
	'darr' => 8595,
	'dArr' => 8659,
	'deg' => 176,
	'Delta' => 916,
	'delta' => 948,
	'diams' => 9830,
	'divide' => 247,
	'Eacute' => 201,
	'eacute' => 233,
	'Ecirc' => 202,
	'ecirc' => 234,
	'Egrave' => 200,
	'egrave' => 232,
	'empty' => 8709,
	'emsp' => 8195,
	'ensp' => 8194,
	'Epsilon' => 917,
	'epsilon' => 949,
	'equiv' => 8801,
	'Eta' => 919,
	'eta' => 951,
	'ETH' => 208,
	'eth' => 240,
	'Euml' => 203,
	'euml' => 235,
	'euro' => 8364,
	'exist' => 8707,
	'fnof' => 402,
	'forall' => 8704,
	'frac12' => 189,
	'frac14' => 188,
	'frac34' => 190,
	'frasl' => 8260,
	'Gamma' => 915,
	'gamma' => 947,
	'ge' => 8805,
	'gt' => 62,
	'harr' => 8596,
	'hArr' => 8660,
	'hearts' => 9829,
	'hellip' => 8230,
	'Iacute' => 205,
	'iacute' => 237,
	'Icirc' => 206,
	'icirc' => 238,
	'iexcl' => 161,
	'Igrave' => 204,
	'igrave' => 236,
	'image' => 8465,
	'infin' => 8734,
	'int' => 8747,
	'Iota' => 921,
	'iota' => 953,
	'iquest' => 191,
	'isin' => 8712,
	'Iuml' => 207,
	'iuml' => 239,
	'Kappa' => 922,
	'kappa' => 954,
	'Lambda' => 923,
	'lambda' => 955,
	'lang' => 9001,
	'laquo' => 171,
	'larr' => 8592,
	'lArr' => 8656,
	'lceil' => 8968,
	'ldquo' => 8220,
	'le' => 8804,
	'lfloor' => 8970,
	'lowast' => 8727,
	'loz' => 9674,
	'lrm' => 8206,
	'lsaquo' => 8249,
	'lsquo' => 8216,
	'lt' => 60,
	'macr' => 175,
	'mdash' => 8212,
	'micro' => 181,
	'middot' => 183,
	'minus' => 8722,
	'Mu' => 924,
	'mu' => 956,
	'nabla' => 8711,
	'nbsp' => 160,
	'ndash' => 8211,
	'ne' => 8800,
	'ni' => 8715,
	'not' => 172,
	'notin' => 8713,
	'nsub' => 8836,
	'Ntilde' => 209,
	'ntilde' => 241,
	'Nu' => 925,
	'nu' => 957,
	'Oacute' => 211,
	'oacute' => 243,
	'Ocirc' => 212,
	'ocirc' => 244,
	'OElig' => 338,
	'oelig' => 339,
	'Ograve' => 210,
	'ograve' => 242,
	'oline' => 8254,
	'Omega' => 937,
	'omega' => 969,
	'Omicron' => 927,
	'omicron' => 959,
	'oplus' => 8853,
	'or' => 8744,
	'ordf' => 170,
	'ordm' => 186,
	'Oslash' => 216,
	'oslash' => 248,
	'Otilde' => 213,
	'otilde' => 245,
	'otimes' => 8855,
	'Ouml' => 214,
	'ouml' => 246,
	'para' => 182,
	'part' => 8706,
	'permil' => 8240,
	'perp' => 8869,
	'Phi' => 934,
	'phi' => 966,
	'Pi' => 928,
	'pi' => 960,
	'piv' => 982,
	'plusmn' => 177,
	'pound' => 163,
	'prime' => 8242,
	'Prime' => 8243,
	'prod' => 8719,
	'prop' => 8733,
	'Psi' => 936,
	'psi' => 968,
	'quot' => 34,
	'radic' => 8730,
	'rang' => 9002,
	'raquo' => 187,
	'rarr' => 8594,
	'rArr' => 8658,
	'rceil' => 8969,
	'rdquo' => 8221,
	'real' => 8476,
	'reg' => 174,
	'rfloor' => 8971,
	'Rho' => 929,
	'rho' => 961,
	'rlm' => 8207,
	'rsaquo' => 8250,
	'rsquo' => 8217,
	'sbquo' => 8218,
	'Scaron' => 352,
	'scaron' => 353,
	'sdot' => 8901,
	'sect' => 167,
	'shy' => 173,
	'Sigma' => 931,
	'sigma' => 963,
	'sigmaf' => 962,
	'sim' => 8764,
	'spades' => 9824,
	'sub' => 8834,
	'sube' => 8838,
	'sum' => 8721,
	'sup' => 8835,
	'sup1' => 185,
	'sup2' => 178,
	'sup3' => 179,
	'supe' => 8839,
	'szlig' => 223,
	'Tau' => 932,
	'tau' => 964,
	'there4' => 8756,
	'Theta' => 920,
	'theta' => 952,
	'thetasym' => 977,
	'thinsp' => 8201,
	'THORN' => 222,
	'thorn' => 254,
	'tilde' => 732,
	'times' => 215,
	'trade' => 8482,
	'Uacute' => 218,
	'uacute' => 250,
	'uarr' => 8593,
	'uArr' => 8657,
	'Ucirc' => 219,
	'ucirc' => 251,
	'Ugrave' => 217,
	'ugrave' => 249,
	'uml' => 168,
	'upsih' => 978,
	'Upsilon' => 933,
	'upsilon' => 965,
	'Uuml' => 220,
	'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926,
	'xi' => 958,
	'Yacute' => 221,
	'yacute' => 253,
	'yen' => 165,
	'Yuml' => 376,
	'yuml' => 255,
	'Zeta' => 918,
	'zeta' => 950,
	'zwj' => 8205,
	'zwnj' => 8204,
);
319 
/**
 * Alternative spellings of entity names, mapped to the canonical name
 * found in self::$htmlEntities. Both entries here resolve to "rlm".
 */
private static $htmlEntityAliases = array(
	'רלמ' => 'rlm',
	'رلم' => 'rlm',
);
327 
/**
 * Cache for the pattern built by getAttribsRegex(); null until first use.
 */
private static $attribsRegex;

/**
 * Return the regular expression used to pick attribute name/value pairs
 * out of the text of a start tag. The pattern is assembled on the first
 * call and reused afterwards.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$nameStart = '[:A-Z_a-z0-9]';
	$nameChar = '[:A-Z_a-z-.0-9]';
	$ws = '[\x09\x0a\x0d\x20]';

	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
		($ws*=$ws*
		(?:
		# The attribute value: quoted or alone
		\"([^<\"]*)\"
		| '([^<']*)'
		| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		)
		)?(?=$ws|\$)/sx";

	return self::$attribsRegex;
}
357 
/**
 * Return the tag lookup tables used by removeHTMLtags().
 *
 * The base tables are built once and cached in static variables; they are
 * rebuilt whenever the relevant global configuration changes between calls.
 *
 * @param array $extratags Extra tag names to accept as paired elements
 * @param array $removetags Tag names to remove from the accepted elements
 * @return array Map with the keys htmlpairs, htmlsingle, htmlsingleonly,
 *   htmlnest, tabletags, htmllist, listtags, htmlsingleallowed and
 *   htmlelements; every value is an array keyed by tag name
 */
public static function getRecognizedTagData( $extratags = array(), $removetags = array() ) {
	// Without this declaration the two settings are undefined local
	// variables: compact() finds nothing and <meta>/<link>/<img> can never
	// be enabled.
	global $wgAllowMicrodataAttributes, $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// Key the cache on the current configuration so that a change of the
	// globals (as happens in the test suite) rebuilds the tables.
	$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
	if ( $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		);
		$htmlsingle = array( # Tags whose close tag is optional
			'br', 'wbr', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'wbr', 'hr'
		);
		if ( $wgAllowMicrodataAttributes ) {
			$htmlsingle[] = $htmlsingleonly[] = 'meta';
			$htmlsingle[] = $htmlsingleonly[] = 'link';
		}
		$htmlnest = array( # Tags that may be nested inside themselves
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul', 'ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return array(
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	);
}
442 
/**
 * Strip HTML comments, escape tags that are not on the allowed list and
 * clean the attributes of the tags that are.
 *
 * When $wgUseTidy is off, this method also balances tags itself: it tracks
 * open elements on a stack, rejects mismatched close tags and closes any
 * elements left open at the end. When $wgUseTidy is on, only the tag and
 * attribute filtering is done here.
 *
 * @param string $text Markup to clean
 * @param callable|null $processCallback Optional callback invoked as
 *   callback( &$params, $args ) to rewrite a tag's attribute text
 *   before it is validated
 * @param array|bool $args Second argument handed to $processCallback
 * @param array $extratags Extra tag names to accept
 * @param array $removetags Tag names to reject
 * @return string Cleaned markup
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = array(), $extratags = array(), $removetags = array()
) {
	// Needed so the configuration setting is actually read; as a plain
	// local it is undefined and the tidy branch below is never taken.
	global $wgUseTidy;

	// Defines $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest,
	// $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelements.
	extract( self::getRecognizedTagData( $extratags, $removetags ) );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					# Close tag for an element that cannot have one
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					MediaWiki\suppressWarnings();
					$ot = array_pop( $tagstack );
					MediaWiki\restoreWarnings();

					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push( $optstack, $ot );
							MediaWiki\suppressWarnings();
							$ot = array_pop( $tagstack );
							MediaWiki\restoreWarnings();
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								MediaWiki\suppressWarnings();
								$ot = array_pop( $tagstack );
								MediaWiki\restoreWarnings();
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = true;
								MediaWiki\suppressWarnings();
								$ot = array_pop( $optstack );
								MediaWiki\restoreWarnings();
								while ( $ot ) {
									array_push( $tagstack, $ot );
									MediaWiki\suppressWarnings();
									$ot = array_pop( $optstack );
									MediaWiki\restoreWarnings();
								}
							}
						} else {
							MediaWiki\suppressWarnings();
							array_push( $tagstack, $ot );
							MediaWiki\restoreWarnings();

							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							# Leaving a table: restore the stack saved when it was opened
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for unclosable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						array_push( $tagstack, $t );
					} elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							# Each table gets a fresh stack; the outer one is saved
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an accepted tag: emit the chunk as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			# Not an accepted tag: emit the chunk as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	return $text;
}
623 
/**
 * Remove every terminated "<!-- ... -->" comment from the text.
 *
 * A comment that sits alone on a line (only spaces between it and the
 * newlines before and after it) is removed together with those spaces and
 * one of the two newlines. Any other comment is removed on its own. An
 * unterminated comment stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	while ( true ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}

		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$close += 3;

		# Widen the span over the spaces on both sides of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$aloneOnLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";

		if ( $aloneOnLine ) {
			# Drop the comment, the surrounding spaces and one newline
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Drop the comment only
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}

	return $text;
}
666 
/**
 * Check that a tag carries the attributes it needs to be acceptable.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute, <meta> also needs content and <link> also needs href.
 * Every other element passes.
 *
 * @param string|array $params Raw attribute text of the tag, or an
 *   already decoded attribute map
 * @param string $element Lower-case element name
 * @return bool True if the tag may be kept
 */
static function validateTag( $params, $element ) {
	// Callers pass the raw attribute text. It must be parsed into a
	// name => value map first; indexing the string by attribute name never
	// finds anything, which rejected every <meta> and <link>.
	if ( !is_array( $params ) ) {
		$params = Sanitizer::decodeTagAttributes( $params );
	}

	if ( $element == 'meta' || $element == 'link' ) {
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $params['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $params['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
699 
/**
 * Filter an attribute map against the whitelist for one element.
 *
 * This is validateAttributes() called with the whitelist returned by
 * attributeWhitelist() for $element.
 *
 * @param array $attribs Attribute name => value map
 * @param string $element Element name
 * @return array Filtered attribute map
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
719 
/**
 * Filter an attribute map against a whitelist of attribute names and
 * sanitize the values that are kept.
 *
 * - attributes not on the whitelist are dropped, except "data-*"
 *   (but not "data-ooui*") and, with RDFa enabled, "xmlns:*"
 * - style values are passed through checkCss()
 * - id values are passed through escapeId()
 * - role is kept only when it is "presentation"
 * - RDFa/microdata values matching EVIL_URI_PATTERN are dropped
 * - href/src are kept only when they start with an allowed protocol
 *
 * @param array $attribs Attribute name => value map
 * @param array $whitelist List of allowed attribute names
 * @return array Filtered attribute map
 */
static function validateAttributes( $attribs, $whitelist ) {
	// Both settings are read below and must come from the global scope.
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		#allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-", except "data-ooui"
		if ( !preg_match( '/^data-(?!ooui)/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# WAI-ARIA
		# http://www.w3.org/TR/wai-aria/
		# http://www.whatwg.org/html/elements.html#wai-aria
		# For now we only support role="presentation" until we work out what roles should be
		# usable by content and we ensure that our code explicitly rejects patterns that
		# violate HTML5's ARIA restrictions.
		if ( $attribute === 'role' && $value !== 'presentation' ) {
			continue;
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			//Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; //drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
820 
/**
 * Merge two attribute maps. Values from $b override those from $a, except
 * for "class": when both maps hold different string class values, the two
 * class lists are combined and duplicates removed.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
		return $merged;
	}
	if ( $a['class'] === $b['class'] ) {
		return $merged;
	}

	$combined = $a['class'] . ' ' . $b['class'];
	$names = preg_split( '/\s+/', $combined, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
843 
/**
 * Normalize a CSS value so that obfuscated constructs become visible to
 * later checks: character references and CSS escapes are decoded,
 * look-alike characters are mapped to ASCII and comments are removed.
 *
 * @param string $value CSS text, e.g. the value of a style attribute
 * @return string Normalized CSS text
 */
public static function normalizeCss( $value ) {

	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// The range is spelled with code point escapes so that it cannot be
	// altered by re-encoding of this file.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
		function ( $matches ) {
			$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0 # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
942 
943 
/**
 * Normalize a CSS value and reject it if it contains control characters
 * or one of a fixed list of constructs treated as unsafe. A rejected
 * value is replaced by a CSS comment naming the reason.
 *
 * @param string $value CSS text
 * @return string The normalized CSS, or a replacement comment
 */
static function checkCss( $value ) {
	$css = self::normalizeCss( $value );

	// Reject control characters and the UTF-8 replacement character
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $css );
	if ( $hasControlChar
		|| strpos( $css, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords
	$unsafePattern =
		'! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		!ix';
	if ( preg_match( $unsafePattern, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
984 
/**
 * preg_replace_callback() handler that decodes one CSS backslash escape.
 *
 * @param array $matches Match whose groups are: 1 = line continuation,
 *   2 = hexadecimal character number, 3 = a single escaped character
 * @return string Replacement text
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: drop it entirely
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		// Lone backslash at the end of the string
		$decoded = '\\';
	}

	$needsEscape = $decoded == "\n" || $decoded == '"'
		|| $decoded == "'" || $decoded == '\\';
	if ( !$needsEscape ) {
		// The escape was unnecessary; emit the plain character
		return $decoded;
	}

	// These characters must stay escaped inside strings; re-emit them in
	// hexadecimal form to avoid parsing errors by clients
	return '\\' . dechex( ord( $decoded ) ) . ' ';
}
1009 
/**
 * Turn the raw attribute text of a tag into a cleaned, re-encoded
 * attribute string: the text is decoded into a map, filtered for the
 * element, and encoded again.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name
 * @return string Encoded attributes with a leading space, or ''
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	return Sanitizer::safeEncodeTagAttributes(
		Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ),
			$element
		)
	);
}
1039 
/**
 * Encode a value for use inside a quoted HTML attribute.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized when attributes are decoded, so newlines,
	// carriage returns and tabs are written as numeric references here
	// or they would not survive.
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		$escaped
	);
}
1059 
/**
 * Encode an attribute value like encodeAttribute() and additionally
 * disarm sequences that later wikitext parsing could expand (templates,
 * links, magic links, quotes, double underscores, URL protocols).
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function safeEncodeAttribute( $text ) {
	$encoded = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$encoded = strtr( $encoded, $wikitextArmor );

	# Escape the colon of every URL protocol so it is not auto-linked
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$armorCallback = array( 'Sanitizer', 'armorLinksCallback' );

	return preg_replace_callback( $protocolPattern, $armorCallback, $encoded );
}
1092 
/**
 * Turn arbitrary text into a string usable as an HTML id attribute.
 *
 * With $wgExperimentalHtmlIds on (and no 'legacy' option) only runs of
 * whitespace, underscores, quotes, "&", "#" and "%" are collapsed to "_".
 * Otherwise the id is URL-encoded with "%" replaced by "." and a
 * leading "x" is added when it does not start with a letter.
 *
 * @param string $id Text to escape; character references are decoded first
 * @param string|array $options Option name or list of names:
 *   'noninitial' skips the leading-letter fix-up, 'legacy' forces the
 *   URL-encoding style
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	// The setting must be read from the global scope; as an undefined
	// local it is always falsy and the experimental branch is dead.
	global $wgExperimentalHtmlIds;
	$options = (array)$options;

	$id = Sanitizer::decodeCharReferences( $id );

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
		$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
		$id = trim( $id, '_' );
		if ( $id === '' ) {
			// Must have been all whitespace to start with.
			return '_';
		} else {
			return $id;
		}
	}

	// HTML4-style escaping
	static $replace = array(
		'%3A' => ':',
		'%' => '.'
	);

	$id = urlencode( strtr( $id, ' ', '_' ) );
	$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

	if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
1156 
/**
 * Turn arbitrary text into a string usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, most ASCII
 * punctuation and the non-breaking space are replaced by underscores;
 * runs of underscores are collapsed and trailing ones removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	$patterns = array(
		// Characters that are not wanted in a class name
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		// Runs of underscores, including the ones just introduced
		'/_+/',
	);
	$underscored = preg_replace( $patterns, '_', $class );

	return rtrim( $underscored, '_' );
}
1175 
/**
 * Escape text for HTML output while letting character references that
 * are already in the input stand for their characters.
 *
 * @param string $html Text that may contain character references
 * @return string Escaped HTML
 */
static function escapeHtmlAllowEntities( $html ) {
	# Decode existing references first; escaping them directly would turn
	# "&amp;" into "&amp;amp;" instead of allowing the entity.
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
1190 
/**
 * preg_replace_callback() handler that rewrites every colon in the first
 * capture group as the numeric reference "&#58;".
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];

	return str_replace( ':', '&#58;', $protocol );
}
1199 
/**
 * Parse the raw attribute text of a tag into a map of lower-case
 * attribute names to decoded values. Whitespace runs inside a value are
 * collapsed to one space and the value is trimmed. If a name occurs more
 * than once, the last occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => value map
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Pick the value out of whichever quoting form matched
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
1236 
/**
 * Serialize an attribute map as name="value" pairs, each preceded by a
 * space, with names and values encoded for HTML output.
 *
 * @param array $assoc_array Attribute name => value map
 * @return string Attribute string with a leading space, or '' if empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$html = '';
	foreach ( $assoc_array as $attribute => $value ) {
		$name = htmlspecialchars( $attribute );
		$encoded = Sanitizer::safeEncodeAttribute( $value );

		$html .= " $name=\"$encoded\"";
	}
	return $html;
}
1254 
/**
 * Pick the attribute value out of one match set produced by the pattern
 * from getAttribsRegex().
 *
 * @param array $set Match set (PREG_SET_ORDER)
 * @return string The attribute value, or "" for a name without a value
 * @throws MWException If a value part matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	// Group 5 is the unquoted form, 4 the single-quoted form and
	// 3 the double-quoted form; check them in that order.
	foreach ( array( 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1282 
/**
 * Replace each CR LF pair, and each single space, CR, LF or tab, by one
 * space character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

	return preg_replace( $whitespace, ' ', $text );
}
1293 
/**
 * Collapse every run of spaces and underscores to a single space and trim
 * the result.
 *
 * NOTE(review): the declaration line of this method was missing from this
 * copy of the file; the method name was restored from the upstream class
 * and should be confirmed.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	return trim( preg_replace( '/[ _]+/', ' ', $section ) );
}
1305 
/**
 * Rewrite every character reference and bare ampersand in the text by
 * passing each CHAR_REFS_REGEX match to
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1327 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on which CHAR_REFS_REGEX group matched: named entities go to
 * normalizeEntity(), decimal references to decCharReference() and
 * hexadecimal references to hexCharReference(). Anything that yields no
 * replacement (a bare ampersand or an invalid reference) is HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	}
	if ( is_null( $ret ) ) {
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
1347 
/**
 * Normalize one named entity.
 *
 * Aliases are replaced by their canonical entity; lt, gt, amp and quot
 * are kept as named entities; other known names become decimal numeric
 * references; unknown names get their ampersand escaped.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	$keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
	if ( in_array( $name, $keptAsNamed ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	return "&amp;$name;";
}
1369 
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null "&#N;" for an allowed code point, otherwise null
 */
static function decCharReference( $codepoint ) {
	$number = intval( $codepoint );
	if ( !Sanitizer::validateCodepoint( $number ) ) {
		return null;
	}
	return sprintf( '&#%d;', $number );
}
1382 
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null "&#xN;" with lower-case digits for an allowed
 *   code point, otherwise null
 */
static function hexCharReference( $codepoint ) {
	$number = hexdec( $codepoint );
	if ( !Sanitizer::validateCodepoint( $number ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $number );
}
1395 
/**
 * Tell whether a code point may be emitted as a character reference.
 *
 * Allowed are tab, line feed and the ranges U+0020-U+007E,
 * U+00A0-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$allowedRanges = array(
		array( 0x20, 0x7e ),
		array( 0xa0, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $allowedRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1413 
/**
 * Decode any character references, numeric or named entities, in the
 * text and return a UTF-8 string.
 *
 * Every match of CHAR_REFS_REGEX is replaced by the result of
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decoder = array( __CLASS__, 'decodeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decoder, $text );
}
1427 
/**
 * Decode any character references, numeric or named entities, in the
 * text and normalize the resulting string.
 *
 * The text is passed through $wgContLang->normalize() only when at least
 * one replacement was made; otherwise it is returned as decoded.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	// Without this declaration $wgContLang would be an undefined local
	// and the normalize() call below would fail.
	global $wgContLang;

	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
1451 
/**
 * Callback for decodeCharReferences(): decode one CHAR_REFS_REGEX match.
 *
 * Group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
1467 
/**
 * Return the UTF-8 string for a codepoint if it is a valid character
 * reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	if ( self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Utils::codepointToUtf8( $codepoint );
	} else {
		// UTF-8 encoding of U+FFFD; without this branch the method
		// would return null for an invalid codepoint.
		return "\xef\xbf\xbd";
	}
}
1482 
/**
 * Decode a named entity to its UTF-8 character.
 *
 * An alias from $htmlEntityAliases is resolved first. If the resulting
 * name is not in $htmlEntities, the (possibly aliased) entity is
 * returned in "&name;" form.
 *
 * @param string $name Entity name without the leading "&" and trailing ";"
 * @return string
 */
static function decodeEntity( $name ) {
	$resolved = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$resolved] ) ) {
		return '&' . $resolved . ';';
	}

	return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$resolved] );
}
1501 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Allowed attribute names; empty when the element has no
 *   entry in the whitelist
 */
static function attributeWhitelist( $element ) {
	// The lookup table comes from setupAttributeWhitelist(); without
	// this assignment $list is undefined and the result is always empty.
	$list = self::setupAttributeWhitelist();

	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1514 
/**
 * For each allowed HTML element (array key), return an array of allowed
 * attributes.
 *
 * The table is cached in a function-static variable and rebuilt when the
 * values of $wgAllowRdfaAttributes / $wgAllowMicrodataAttributes differ
 * from those it was built with.
 *
 * @return array Map of element name => list of attribute names
 */
static function setupAttributeWhitelist() {
	// Both settings must be declared global: otherwise compact() and the
	// conditionals below see undefined locals and the configuration is
	// silently ignored.
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $whitelist, $staticInitialised;

	// Cache key derived from the two settings
	$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	if ( $whitelist !== null && $staticInitialised == $globalContext ) {
		return $whitelist;
	}

	$common = array(
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'role',
	);

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		$common = array_merge( $common, array(
			'about', 'property', 'resource', 'datatype', 'typeof',
		) );
	}

	if ( $wgAllowMicrodataAttributes ) {
		# add HTML5 microdata tags as specified by
		# http://www.whatwg.org/html/microdata.html#the-microdata-model
		$common = array_merge( $common, array(
			'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
		) );
	}

	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array(
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		'q' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array_merge( $common, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			) ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span' ) ),
		'col' => array_merge( $common, array( 'span' ) ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'width' ) ),

		# HTML Ruby annotation text module, simple ruby only.
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $common, array( 'value' ) ),
		'time' => array_merge( $common, array( 'datetime' ) ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	// Remember which configuration the cached table was built for
	$staticInitialised = $globalContext;

	return $whitelist;
}
1732 
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed, encoded as plain text.
 *
 * Everything between "<" and ">" is removed, then character references
 * are decoded and whitespace is normalized.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities and whitespace
	return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
1753 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * Emits one <!ENTITY> declaration per entry of $htmlEntities, mapping
 * the name to its decimal character reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1771 
/**
 * Clean up a URL: decode entities, percent-encode unsafe characters and
 * tidy the host part.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ), $url );

	# Split into protocol, optional //host and the remainder; anything
	# that does not look like "scheme:..." is returned as it stands.
	$parts = array();
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// http://tools.ietf.org/html/3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$strip = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	// The cheap prefix comparison runs first so the regex is only tried
	// on hosts that start with an encoded "[".
	$ipv6 = array();
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0
		&& preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1823 
/**
 * Callback for cleanUrl(): percent-encode the matched character.
 *
 * @param array $matches
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$unsafe = $matches[0];

	return urlencode( $unsafe );
}
1831 
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; when it returns a false value
 * the $result it set is returned as-is. Otherwise the address is matched
 * against a liberal "user@domain" pattern.
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the pattern match, or whatever the hook
 *   left in $result when it aborted
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the string.
	// Without it an address followed by a newline would be accepted.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign separating user and domain
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1883 }
utf8ToCodepoint($char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
static decCharReference($codepoint)
Definition: Sanitizer.php:1374
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1740
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1208
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1358
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
or
false for read/write
static safeEncodeTagAttributes($assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1244
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:1332
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:324
static removeHTMLtags($text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
Definition: Sanitizer.php:454
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1520
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1740
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content.The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content.These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text.All manipulation and analysis of page content must be done via the appropriate methods of the Content object.For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers.The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id).Also Title, WikiPage and Revision now have getContentHandler() methods for convenience.ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page.ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type.However, it is recommended to instead use WikiPage::getContent() resp.Revision::getContent() to get a page's content as a Content object.These two methods should be the ONLY way in which page content is accessed.Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides().This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based.Objects implementing the Content interface are used to represent and handle the content internally.For storage and data exchange, each content model supports at least one serialization format via 
ContentHandler::serializeContent($content).The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats().Content serialization formats are identified using MIME type like strings.The following formats are built in:*text/x-wiki-wikitext *text/javascript-for js pages *text/css-for css pages *text/plain-for future use, e.g.with plain text messages.*text/html-for future use, e.g.with plain html messages.*application/vnd.php.serialized-for future use with the api and for extensions *application/json-for future use with the api, and for use by extensions *application/xml-for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant.Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly.Without that information, interpretation of the provided content is not reliable.The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export.Also note that the API will provide encapsulated, serialized content-so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure.Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content.However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page's content model, and will now generate warnings when used.Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() is 
deprecated in favor Revisions::getContent()*WikiPage::getText() is deprecated in favor WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superceded by Article::getContentObject().However, both methods should be avoided since they do not provide clean access to the page's actual content.For instance, they may return a system message for non-existing pages.Use WikiPage::getContent() instead.Code that relies on a textual representation of the page content should eventually be rewritten.However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page.Its behavior is controlled by $wgContentHandlerTextFallback it
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1456
static cssDecodeCallback($matches)
Definition: Sanitizer.php:989
static getRecognizedTagData($extratags=array(), $removetags=array())
Return the various lists of recognized tags.
Definition: Sanitizer.php:364
$value
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by similarly to how extensions are installed You can then make that skin the default by adding
Definition: skin.txt:57
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1168
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1763
static cleanUrl($url)
Definition: Sanitizer.php:1776
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:1743
when a variable name is used in a it is silently declared as a new local masking the global
Definition: design.txt:93
const UTF8_REPLACEMENT
static hexCharReference($codepoint)
Definition: Sanitizer.php:1387
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and local administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:715
static normalizeWhitespace($text)
Definition: Sanitizer.php:1287
Apache License January http
it sets a lot of them automatically from query and such
Definition: design.txt:93
if($line===false) $args
Definition: cdb.php:64
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:1421
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
pull multiple revisions may often pull multiple times from the same blob.
Definition: deferred.txt:11
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1402
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:56
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec http://www.w3.org/TR/html5/syntax.html#tag-open-state.
Definition: Sanitizer.php:46
Unicode normalization routines for working with UTF-8 strings.
Definition: UtfNormal.php:48
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1475
static normalizeSectionNameWhitespace($section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(), for use in the id's that are used for section links.
Definition: Sanitizer.php:1302
Some quick notes on the file repository architecture Functionality is
Definition: README:3
static $htmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html As well as ' which is only defined starting in XHTML1.
Definition: Sanitizer.php:64
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1508
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
namespace and then decline to actually register it file or subcat img or subcat RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions as context called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content as context as context $options
Definition: hooks.txt:968
static escapeHtmlAllowEntities($html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1183
static validateTag($params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:679
static mergeAttributes($a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:831
MediaWiki exception.
Definition: MWException.php:26
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable from
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:36
static run($event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:137
$params
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
be sent.
static validateAttributes($attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
Definition: Sanitizer.php:735
static decodeCharReferencesAndNormalize($text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1438
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable or merely the Work and Derivative Works thereof Contribution shall mean any work of including the original version of the Work and any modifications or additions to that Work or Derivative Works that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner For the purposes of this submitted means any form of or written communication sent to the Licensor or its including but not limited to communication on electronic mailing source code control and issue tracking systems that are managed by
static escapeId($id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it. ...
Definition: Sanitizer.php:1124
static cleanUrlCallback($matches)
Definition: Sanitizer.php:1828
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
usually copyright or history_copyright This message must be in HTML not wikitext if the section is included from a template $section
Definition: hooks.txt:2624
#define the
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1196
wfUrlProtocols($includeProtocolRelative=true)
Returns a regular expression of url protocols.
static normalizeCss($value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:853
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:57
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types as usual *javascript user provided javascript code *json simple implementation for use by extensions
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:323
$wgUseTidy
$wgUseTidy: use tidy to make sure HTML output is sane.
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1321
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message.Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item.Return false to stop further processing of the tag $reader:XMLReader object $logInfo:Array of information 'ImportHandlePageXMLTag':When parsing a XML tag in a page.Return false to stop further processing of the tag $reader:XMLReader object $pageInfo:Array of information 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision.Return false to stop further processing of the tag $reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information 'ImportHandleToplevelXMLTag':When parsing a top level XML tag.Return false to stop further processing of the tag $reader:XMLReader object 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload.Return false to stop further processing of the tag $reader:XMLReader object $revisionInfo:Array of information 'InfoAction':When building information to display on the action=info page.$context:IContextSource object &$pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect.$title:Title object for the current page $request:WebRequest $ignoreRedirect:boolean to skip redirect check $target:Title/string of redirect target $article:Article object 'InternalParseBeforeLinks':during Parser's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InternalParseBeforeSanitize':during Parser's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings.Ideal for syntax-extensions after template/parser function 
execution which respect nowiki and HTML-comments.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not.Return true without providing an interwiki to continue interwiki search.$prefix:interwiki prefix we are looking for.&$iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 'InvalidateEmailComplete':Called after a user's email has been invalidated successfully.$user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification.Callee may modify $url and $query, URL will be constructed as $url.$query &$url:URL to index.php &$query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) $article:article(object) being checked 'IsTrustedProxy':Override the result of wfIsTrustedProxy() $ip:IP being check $result:Change this value to override the result of wfIsTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from &$allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of Sanitizer::validateEmail(), for instance to return false if the domain name doesn't match your organization.$addr:The e-mail address entered by the user &$result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user &$result:Set this and return false to override the internal checks $user:User the password is being validated for 'Language::getMessagesFileName':$code:The language code or the language we're looking for a messages file for &$file:The messages file path, you can override this to change the location. 
'LanguageGetMagic':DEPRECATED!Use $magicWords in a file listed in $wgExtensionMessagesFiles instead.Use this to define synonyms of magic words depending of the language $magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces.Do not use this hook to add namespaces.Use CanonicalNamespaces for that.&$namespaces:Array of namespaces indexed by their numbers 'LanguageGetSpecialPageAliases':DEPRECATED!Use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead.Use to define aliases of special pages names depending of the language $specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names.&$names:array of language code=> language name $code:language of the preferred translations 'LanguageLinks':Manipulate a page's language links.This is called in various places to allow extensions to define the effective language links for a page.$title:The page's Title.&$links:Associative array mapping language codes to prefixed links of the form"language:title".&$linkFlags:Associative array mapping prefixed links to arrays of flags.Currently unused, but planned to provide support for marking individual language links in the UI, e.g.for featured articles. 'LanguageSelector':Hook to change the language selector available on a page.$out:The output page.$cssClassName:CSS class name of the language selector. 'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts.Return false to skip default processing and return $ret.See documentation for Linker::link() for details on the expected meanings of parameters.$skin:the Skin object $target:the Title that the link is pointing to &$html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1738
Bar style
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition: design.txt:56
$wgExperimentalHtmlIds
Should we allow a broader set of characters in id attributes, per HTML5? If not, use only HTML 4-comp...
$count
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
Definition: Sanitizer.php:1029
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:633
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:339
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
static validateEmail($addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1860
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:962
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1263
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
maintenance dev scripts can help quickly setup a local MediaWiki for development purposes Wikis setup in this way are NOT meant to be publicly available They use a development database not acceptable for use in production Place a sqlite database in an unsafe location a real wiki should never place it in And use predictable default logins for the initial administrator user Running maintenance dev install sh will download and install a local copy of php
Definition: README:5
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1491
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
Definition: Sanitizer.php:1066
$wgAllowRdfaAttributes
Enable RDFa attributes for use in wikitext.
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1939
$wgAllowMicrodataAttributes
Enable HTML5 microdata attributes for use in wikitext.
PHP Parser - Processes wiki markup (which uses a more user-friendly syntax, such as "[[link]]" for ma...
Definition: Parser.php:67
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1740
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:331
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1045
$matches