MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
// Pattern matching the character-reference forms handled by this class.
// Capture groups: 1 = named entity, 2 = decimal number, 3 = hexadecimal number,
// 4 = an ampersand that does not start any of the forms above.
// NOTE(review): the declaration line that names this constant is not present in
// this listing; presumably it is CHAR_REFS_REGEX, which the character-reference
// methods further down pass to preg_replace_callback() -- confirm.
37  '/&([A-Za-z0-9\x80-\xff]+);
38  |&\#([0-9]+);
39  |&\#[xX]([0-9A-Fa-f]+);
40  |(&)/x';
41 
// Splits one chunk of input that followed a "<" into five captures:
// 1 = optional leading "/", 2 = element name, 3 = attribute text,
// 4 = the closing ">" or "/>", 5 = everything after it up to the end of the chunk.
46  const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
47 
// Matches "javascript" or "vbscript" (any letter case) at the start of a value,
// after whitespace, or after a CSS comment terminator, when followed by a
// non-word character or the end of the value. validateAttributes() uses it to
// reject suspicious attribute values.
56  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
// Attribute names of the form "xmlns:prefix". validateAttributes() tests
// attribute names against this when $wgAllowRdfaAttributes is truthy.
57  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
58 
/**
 * Lookup table from named character entity (case-sensitive, without the
 * leading "&" and trailing ";") to its decimal code point.
 *
 * normalizeEntity() uses it to rewrite a known name as a numeric reference and
 * decodeEntity() uses it to turn a known name into a UTF-8 character.
 */
64  private static $htmlEntities = array(
65  'Aacute' => 193,
66  'aacute' => 225,
67  'Acirc' => 194,
68  'acirc' => 226,
69  'acute' => 180,
70  'AElig' => 198,
71  'aelig' => 230,
72  'Agrave' => 192,
73  'agrave' => 224,
74  'alefsym' => 8501,
75  'Alpha' => 913,
76  'alpha' => 945,
77  'amp' => 38,
78  'and' => 8743,
79  'ang' => 8736,
80  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
81  'Aring' => 197,
82  'aring' => 229,
83  'asymp' => 8776,
84  'Atilde' => 195,
85  'atilde' => 227,
86  'Auml' => 196,
87  'auml' => 228,
88  'bdquo' => 8222,
89  'Beta' => 914,
90  'beta' => 946,
91  'brvbar' => 166,
92  'bull' => 8226,
93  'cap' => 8745,
94  'Ccedil' => 199,
95  'ccedil' => 231,
96  'cedil' => 184,
97  'cent' => 162,
98  'Chi' => 935,
99  'chi' => 967,
100  'circ' => 710,
101  'clubs' => 9827,
102  'cong' => 8773,
103  'copy' => 169,
104  'crarr' => 8629,
105  'cup' => 8746,
106  'curren' => 164,
107  'dagger' => 8224,
108  'Dagger' => 8225,
109  'darr' => 8595,
110  'dArr' => 8659,
111  'deg' => 176,
112  'Delta' => 916,
113  'delta' => 948,
114  'diams' => 9830,
115  'divide' => 247,
116  'Eacute' => 201,
117  'eacute' => 233,
118  'Ecirc' => 202,
119  'ecirc' => 234,
120  'Egrave' => 200,
121  'egrave' => 232,
122  'empty' => 8709,
123  'emsp' => 8195,
124  'ensp' => 8194,
125  'Epsilon' => 917,
126  'epsilon' => 949,
127  'equiv' => 8801,
128  'Eta' => 919,
129  'eta' => 951,
130  'ETH' => 208,
131  'eth' => 240,
132  'Euml' => 203,
133  'euml' => 235,
134  'euro' => 8364,
135  'exist' => 8707,
136  'fnof' => 402,
137  'forall' => 8704,
138  'frac12' => 189,
139  'frac14' => 188,
140  'frac34' => 190,
141  'frasl' => 8260,
142  'Gamma' => 915,
143  'gamma' => 947,
144  'ge' => 8805,
145  'gt' => 62,
146  'harr' => 8596,
147  'hArr' => 8660,
148  'hearts' => 9829,
149  'hellip' => 8230,
150  'Iacute' => 205,
151  'iacute' => 237,
152  'Icirc' => 206,
153  'icirc' => 238,
154  'iexcl' => 161,
155  'Igrave' => 204,
156  'igrave' => 236,
157  'image' => 8465,
158  'infin' => 8734,
159  'int' => 8747,
160  'Iota' => 921,
161  'iota' => 953,
162  'iquest' => 191,
163  'isin' => 8712,
164  'Iuml' => 207,
165  'iuml' => 239,
166  'Kappa' => 922,
167  'kappa' => 954,
168  'Lambda' => 923,
169  'lambda' => 955,
170  'lang' => 9001,
171  'laquo' => 171,
172  'larr' => 8592,
173  'lArr' => 8656,
174  'lceil' => 8968,
175  'ldquo' => 8220,
176  'le' => 8804,
177  'lfloor' => 8970,
178  'lowast' => 8727,
179  'loz' => 9674,
180  'lrm' => 8206,
181  'lsaquo' => 8249,
182  'lsquo' => 8216,
183  'lt' => 60,
184  'macr' => 175,
185  'mdash' => 8212,
186  'micro' => 181,
187  'middot' => 183,
188  'minus' => 8722,
189  'Mu' => 924,
190  'mu' => 956,
191  'nabla' => 8711,
192  'nbsp' => 160,
193  'ndash' => 8211,
194  'ne' => 8800,
195  'ni' => 8715,
196  'not' => 172,
197  'notin' => 8713,
198  'nsub' => 8836,
199  'Ntilde' => 209,
200  'ntilde' => 241,
201  'Nu' => 925,
202  'nu' => 957,
203  'Oacute' => 211,
204  'oacute' => 243,
205  'Ocirc' => 212,
206  'ocirc' => 244,
207  'OElig' => 338,
208  'oelig' => 339,
209  'Ograve' => 210,
210  'ograve' => 242,
211  'oline' => 8254,
212  'Omega' => 937,
213  'omega' => 969,
214  'Omicron' => 927,
215  'omicron' => 959,
216  'oplus' => 8853,
217  'or' => 8744,
218  'ordf' => 170,
219  'ordm' => 186,
220  'Oslash' => 216,
221  'oslash' => 248,
222  'Otilde' => 213,
223  'otilde' => 245,
224  'otimes' => 8855,
225  'Ouml' => 214,
226  'ouml' => 246,
227  'para' => 182,
228  'part' => 8706,
229  'permil' => 8240,
230  'perp' => 8869,
231  'Phi' => 934,
232  'phi' => 966,
233  'Pi' => 928,
234  'pi' => 960,
235  'piv' => 982,
236  'plusmn' => 177,
237  'pound' => 163,
238  'prime' => 8242,
239  'Prime' => 8243,
240  'prod' => 8719,
241  'prop' => 8733,
242  'Psi' => 936,
243  'psi' => 968,
244  'quot' => 34,
245  'radic' => 8730,
246  'rang' => 9002,
247  'raquo' => 187,
248  'rarr' => 8594,
249  'rArr' => 8658,
250  'rceil' => 8969,
251  'rdquo' => 8221,
252  'real' => 8476,
253  'reg' => 174,
254  'rfloor' => 8971,
255  'Rho' => 929,
256  'rho' => 961,
257  'rlm' => 8207,
258  'rsaquo' => 8250,
259  'rsquo' => 8217,
260  'sbquo' => 8218,
261  'Scaron' => 352,
262  'scaron' => 353,
263  'sdot' => 8901,
264  'sect' => 167,
265  'shy' => 173,
266  'Sigma' => 931,
267  'sigma' => 963,
268  'sigmaf' => 962,
269  'sim' => 8764,
270  'spades' => 9824,
271  'sub' => 8834,
272  'sube' => 8838,
273  'sum' => 8721,
274  'sup' => 8835,
275  'sup1' => 185,
276  'sup2' => 178,
277  'sup3' => 179,
278  'supe' => 8839,
279  'szlig' => 223,
280  'Tau' => 932,
281  'tau' => 964,
282  'there4' => 8756,
283  'Theta' => 920,
284  'theta' => 952,
285  'thetasym' => 977,
286  'thinsp' => 8201,
287  'THORN' => 222,
288  'thorn' => 254,
289  'tilde' => 732,
290  'times' => 215,
291  'trade' => 8482,
292  'Uacute' => 218,
293  'uacute' => 250,
294  'uarr' => 8593,
295  'uArr' => 8657,
296  'Ucirc' => 219,
297  'ucirc' => 251,
298  'Ugrave' => 217,
299  'ugrave' => 249,
300  'uml' => 168,
301  'upsih' => 978,
302  'Upsilon' => 933,
303  'upsilon' => 965,
304  'Uuml' => 220,
305  'uuml' => 252,
306  'weierp' => 8472,
307  'Xi' => 926,
308  'xi' => 958,
309  'Yacute' => 221,
310  'yacute' => 253,
311  'yen' => 165,
312  'Yuml' => 376,
313  'yuml' => 255,
314  'Zeta' => 918,
315  'zeta' => 950,
316  'zwj' => 8205,
317  'zwnj' => 8204
318  );
319 
/**
 * Alternative spellings of entity names, mapped to the canonical name used as
 * a key in $htmlEntities. normalizeEntity() and decodeEntity() consult this
 * table before looking a name up. Both entries here resolve to "rlm".
 */
323  private static $htmlEntityAliases = array(
324  'רלמ' => 'rlm',
325  'رلم' => 'rlm',
326  );
327 
// Cache for the pattern built by getAttribsRegex(); stays null until the first call.
331  private static $attribsRegex;
332 
/**
 * Return the regular expression used to pull name/value pairs out of the
 * attribute text of a tag. The pattern is assembled on the first call and
 * cached in self::$attribsRegex afterwards.
 *
 * Capture groups of the resulting pattern:
 *   1 = attribute name
 *   2 = the whole "= value" part (absent for a bare attribute)
 *   3 = value inside double quotes
 *   4 = value inside single quotes
 *   5 = unquoted value
 *   6 = unquoted value starting with "#" followed by hex digits
 *
 * @return string
 */
339  static function getAttribsRegex() {
340  if ( self::$attribsRegex === null ) {
     // Character classes for the first and following characters of a name,
     // and for the whitespace that may separate attributes.
341  $attribFirst = '[:A-Z_a-z0-9]';
342  $attrib = '[:A-Z_a-z-.0-9]';
343  $space = '[\x09\x0a\x0d\x20]';
344  self::$attribsRegex =
345  "/(?:^|$space)({$attribFirst}{$attrib}*)
346  ($space*=$space*
347  (?:
348  # The attribute value: quoted or alone
349  \"([^<\"]*)\"
350  | '([^<']*)'
351  | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
352  | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
353  # colors are specified like this.
354  # We'll be normalizing it.
355  )
356  )?(?=$space|\$)/sx";
357  }
358  return self::$attribsRegex;
359  }
360 
/**
 * Build the lookup tables that describe which HTML elements are recognised
 * and how they may be nested. The base tables are computed once and kept in
 * function-static variables; they are rebuilt when the configuration
 * fingerprint ($globalContext) differs from the one they were built for.
 *
 * Every returned table is keyed by tag name (the lists are array_flip()ed),
 * so callers test membership with isset().
 *
 * @param array $extratags Extra tag names; merged into 'htmlpairs' and 'htmlelements'
 * @param array $removetags Tag names to take out of 'htmlelements'
 * @return array Tables keyed 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed',
 *   'htmlelements'
 */
367  public static function getRecognizedTagData( $extratags = array(), $removetags = array() ) {
     // NOTE(review): listing line 368 is missing here. $wgAllowMicrodataAttributes and
     // $wgAllowImageTag are read below with no visible "global" declaration;
     // presumably the missing line declares them -- confirm against the real file.
369 
370  static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
371  $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
372 
373  // Base our staticInitialised variable off of the global config state so that if the globals
374  // are changed (like in the screwed up test system) we will re-initialise the settings.
375  $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
376  if ( !$staticInitialised || $staticInitialised != $globalContext ) {
377  $htmlpairsStatic = array( # Tags that must be closed
378  'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
379  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
380  'strike', 'strong', 'tt', 'var', 'div', 'center',
381  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
382  'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
383  'kbd', 'samp', 'data', 'time', 'mark'
384  );
     // Tags whose closing tag is optional
385  $htmlsingle = array(
386  'br', 'wbr', 'hr', 'li', 'dt', 'dd'
387  );
388  $htmlsingleonly = array( # Elements that cannot have close tags
389  'br', 'wbr', 'hr'
390  );
391  if ( $wgAllowMicrodataAttributes ) {
392  $htmlsingle[] = $htmlsingleonly[] = 'meta';
393  $htmlsingle[] = $htmlsingleonly[] = 'link';
394  }
395  $htmlnest = array( # Tags that can be nested--??
396  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
397  'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
398  'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
399  );
400  $tabletags = array( # Can only appear inside table, we will close them
401  'td', 'th', 'tr',
402  );
403  $htmllist = array( # Tags used by list
404  'ul', 'ol',
405  );
406  $listtags = array( # Tags that can appear in a list
407  'li',
408  );
409 
410  if ( $wgAllowImageTag ) {
411  $htmlsingle[] = 'img';
412  $htmlsingleonly[] = 'img';
413  }
414 
415  $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
416  $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
417 
418  # Convert them all to hashtables for faster lookup
419  $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
420  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
421  foreach ( $vars as $var ) {
     // Variable-variable: replaces each named list in place with its flipped form
422  $$var = array_flip( $$var );
423  }
424  $staticInitialised = $globalContext;
425  }
426 
427  # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
428  $extratags = array_flip( $extratags );
429  $removetags = array_flip( $removetags );
430  $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
431  $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
432 
433  return array(
434  'htmlpairs' => $htmlpairs,
435  'htmlsingle' => $htmlsingle,
436  'htmlsingleonly' => $htmlsingleonly,
437  'htmlnest' => $htmlnest,
438  'tabletags' => $tabletags,
439  'htmllist' => $htmllist,
440  'listtags' => $listtags,
441  'htmlsingleallowed' => $htmlsingleallowed,
442  'htmlelements' => $htmlelements,
443  );
444  }
445 
/**
 * Neutralise HTML in $text: comments are removed, recognised tags are kept
 * with their attributes filtered through fixTagAttributes(), and every other
 * "<" chunk is emitted with "<" and ">" turned into entities.
 *
 * When $wgUseTidy is falsy the function also tracks open tags on a stack,
 * rejects badly nested or unexpected closing tags, and appends closing tags
 * for whatever is still open at the end. When it is truthy only the tag and
 * attribute filtering is done here.
 *
 * @param string $text
 * @param callable|null $processCallback If callable, invoked as
 *   callback( &$params, $args ) on the raw attribute text of each recognised
 *   tag before that text is validated
 * @param array $args Passed through to $processCallback
 * @param array $extratags Forwarded to getRecognizedTagData()
 * @param array $removetags Forwarded to getRecognizedTagData()
 * @return string
 */
457  public static function removeHTMLtags( $text, $processCallback = null,
458  $args = array(), $extratags = array(), $removetags = array()
459  ) {
     // NOTE(review): listing line 460 is missing here. $wgUseTidy is read below with no
     // visible "global" declaration; presumably the missing line declares it -- confirm.
461 
     // Brings $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
     // $htmllist, $listtags, $htmlsingleallowed and $htmlelements into local scope.
462  extract( self::getRecognizedTagData( $extratags, $removetags ) );
463 
464  # Remove HTML comments
465  $text = Sanitizer::removeHTMLcomments( $text );
     // After the split every element of $bits is the text that followed one "<";
     // the first piece (before any "<") is plain text and only needs ">" escaped.
466  $bits = explode( '<', $text );
467  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
468  if ( !$wgUseTidy ) {
469  $tagstack = $tablestack = array();
470  foreach ( $bits as $x ) {
471  $regs = array();
472  # $slash: Does the current element start with a '/'?
473  # $t: Current element name
474  # $params: String between element name and >
475  # $brace: Ending '>' or '/>'
476  # $rest: Everything until the next element of $bits
477  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
478  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
479  } else {
480  $slash = $t = $params = $brace = $rest = null;
481  }
482 
483  $badtag = false;
484  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
485  # Check our stack
486  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
487  $badtag = true;
488  } elseif ( $slash ) {
489  # Closing a tag... is it the one we just opened?
490  MediaWiki\suppressWarnings();
491  $ot = array_pop( $tagstack );
492  MediaWiki\restoreWarnings();
493 
494  if ( $ot != $t ) {
495  if ( isset( $htmlsingleallowed[$ot] ) ) {
496  # Pop all elements with an optional close tag
497  # and see if we find a match below them
498  $optstack = array();
499  array_push( $optstack, $ot );
500  MediaWiki\suppressWarnings();
501  $ot = array_pop( $tagstack );
502  MediaWiki\restoreWarnings();
503  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
504  array_push( $optstack, $ot );
505  MediaWiki\suppressWarnings();
506  $ot = array_pop( $tagstack );
507  MediaWiki\restoreWarnings();
508  }
509  if ( $t != $ot ) {
510  # No match. Push the optional elements back again
511  $badtag = true;
512  MediaWiki\suppressWarnings();
513  $ot = array_pop( $optstack );
514  MediaWiki\restoreWarnings();
515  while ( $ot ) {
516  array_push( $tagstack, $ot );
517  MediaWiki\suppressWarnings();
518  $ot = array_pop( $optstack );
519  MediaWiki\restoreWarnings();
520  }
521  }
522  } else {
     // The popped tag is not the one being closed: put it back untouched
523  MediaWiki\suppressWarnings();
524  array_push( $tagstack, $ot );
525  MediaWiki\restoreWarnings();
526 
527  # <li> can be nested in <ul> or <ol>, skip those cases:
528  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
529  $badtag = true;
530  }
531  }
532  } else {
     // A table was closed cleanly: resume the tag stack saved when it was opened
533  if ( $t == 'table' ) {
534  $tagstack = array_pop( $tablestack );
535  }
536  }
537  $newparams = '';
538  } else {
539  # Keep track for later
540  if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
541  $badtag = true;
542  } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
543  $badtag = true;
544  # Is it a self closed htmlpair ? (bug 5487)
545  } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
546  $badtag = true;
547  } elseif ( isset( $htmlsingleonly[$t] ) ) {
548  # Hack to force empty tag for unclosable elements
549  $brace = '/>';
550  } elseif ( isset( $htmlsingle[$t] ) ) {
551  # Hack to not close $htmlsingle tags
552  $brace = null;
553  # Still need to push this optionally-closed tag to
554  # the tag stack so that we can match end tags
555  # instead of marking them as bad.
556  array_push( $tagstack, $t );
557  } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
558  // New table tag but forgot to close the previous one
559  $text .= "</$t>";
560  } else {
     // Opening a table starts a fresh tag stack; the outer one is saved on $tablestack
561  if ( $t == 'table' ) {
562  array_push( $tablestack, $tagstack );
563  $tagstack = array();
564  }
565  array_push( $tagstack, $t );
566  }
567 
568  # Replace any variables or template parameters with
569  # plaintext results.
570  if ( is_callable( $processCallback ) ) {
571  call_user_func_array( $processCallback, array( &$params, $args ) );
572  }
573 
574  if ( !Sanitizer::validateTag( $params, $t ) ) {
575  $badtag = true;
576  }
577 
578  # Strip non-approved attributes from the tag
579  $newparams = Sanitizer::fixTagAttributes( $params, $t );
580  }
581  if ( !$badtag ) {
582  $rest = str_replace( '>', '&gt;', $rest );
583  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
584  $text .= "<$slash$t$newparams$close>$rest";
585  continue;
586  }
587  }
     // Unrecognised or rejected tag: output the whole chunk as escaped text
588  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
589  }
590  # Close off any remaining tags
     // The is_array() guard ends the loop once array_pop( $tablestack ) below
     // has nothing left to return and $tagstack stops being an array.
591  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
592  $text .= "</$t>\n";
593  if ( $t == 'table' ) {
594  $tagstack = array_pop( $tablestack );
595  }
596  }
597  } else {
598  # this might be possible using tidy itself
599  foreach ( $bits as $x ) {
600  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
601  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
602 
603  $badtag = false;
604  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
605  if ( is_callable( $processCallback ) ) {
606  call_user_func_array( $processCallback, array( &$params, $args ) );
607  }
608 
609  if ( !Sanitizer::validateTag( $params, $t ) ) {
610  $badtag = true;
611  }
612 
613  $newparams = Sanitizer::fixTagAttributes( $params, $t );
614  if ( !$badtag ) {
615  $rest = str_replace( '>', '&gt;', $rest );
616  $text .= "<$slash$t$newparams$brace$rest";
617  continue;
618  }
619  }
620  }
621  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
622  }
623  }
624  return $text;
625  }
626 
/**
 * Strip every terminated HTML comment from $text.
 *
 * A comment that is alone on its line (only spaces between it and the
 * newline on either side) is removed together with those spaces and one of
 * the two newlines. Any other comment is cut out and nothing around it is
 * touched. Scanning stops at the first comment opener without a terminator;
 * that opener and everything after it are left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; leave the remainder alone
			break;
		}
		# Step past the terminator itself
		$close += 3;

		# Grow a window [$left, $left + $span) outwards over plain spaces
		# on both sides of the comment
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $left, 1 ) === "\n";
		$newlineAfter = substr( $text, $left + $span, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment, the padding and both newlines,
			# then put a single newline back.
			$text = substr_replace( $text, "\n", $left, $span + 1 );
		} else {
			# Drop only the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	return $text;
}
669 
/**
 * Check element-specific requirements on a tag's attributes. Only "meta" and
 * "link" are constrained: both need "itemprop"; "meta" also needs "content"
 * and "link" also needs "href". Every other element passes.
 *
 * @param string $params Attribute text of the tag (this is what removeHTMLtags() passes)
 * @param string $element Lower-cased element name
 * @return bool False when a required attribute is absent
 */
682  static function validateTag( $params, $element ) {
     // NOTE(review): listing line 683 is missing here. The checks below index $params by
     // attribute name, so presumably the missing line converts the attribute
     // text into a name => value array -- confirm against the real file.
684 
685  if ( $element == 'meta' || $element == 'link' ) {
686  if ( !isset( $params['itemprop'] ) ) {
687  // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
688  return false;
689  }
690  if ( $element == 'meta' && !isset( $params['content'] ) ) {
691  // <meta> must have a content="" for the itemprop
692  return false;
693  }
694  if ( $element == 'link' && !isset( $params['href'] ) ) {
695  // <link> must have an associated href=""
696  return false;
697  }
698  }
699 
700  return true;
701  }
702 
/**
 * Filter $attribs using the attribute whitelist of $element, as returned by
 * attributeWhitelist().
 *
 * @param array $attribs
 * @param string $element
 */
718  static function validateTagAttributes( $attribs, $element ) {
     // NOTE(review): listing line 719 is missing here, so the call this argument belongs
     // to is not visible; presumably it is a return of validateAttributes( $attribs, ... )
     // -- confirm against the real file.
720  Sanitizer::attributeWhitelist( $element ) );
721  }
722 
/**
 * Reduce an attribute array to the entries that are allowed and safe.
 *
 * Rules applied per attribute, in order:
 *  - "xmlns:*" names are kept when $wgAllowRdfaAttributes is set and the value
 *    does not match EVIL_URI_PATTERN; otherwise they are dropped.
 *  - names that are neither whitelisted nor "data-" prefixed are dropped
 *    ("data-ooui..." names do not count as "data-" prefixed).
 *  - "id" values go through escapeId() with the 'noninitial' option.
 *  - "role" is kept only with the exact value "presentation".
 *  - RDFa and microdata attributes are dropped when the value matches
 *    EVIL_URI_PATTERN.
 *  - "href" and "src" are kept only when the value starts with a protocol from
 *    wfUrlProtocols() and contains no whitespace.
 * When $wgAllowMicrodataAttributes is set and "itemscope" did not survive,
 * "itemtype", "itemid" and "itemref" are removed as well.
 *
 * @param array $attribs Attribute name => value
 * @param array $whitelist List of allowed attribute names
 * @return array Attribute name => value, filtered
 */
738  static function validateAttributes( $attribs, $whitelist ) {
     // NOTE(review): listing line 739 is missing here. $wgAllowRdfaAttributes and
     // $wgAllowMicrodataAttributes are read below with no visible "global"
     // declaration; presumably the missing line declares them -- confirm.
740 
741  $whitelist = array_flip( $whitelist );
742  $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
743 
744  $out = array();
745  foreach ( $attribs as $attribute => $value ) {
746  #allow XML namespace declaration if RDFa is enabled
747  if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
748  if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
749  $out[$attribute] = $value;
750  }
751 
752  continue;
753  }
754 
755  # Allow any attribute beginning with "data-"
     # (names starting with "data-ooui" are excluded by the lookahead)
756  if ( !preg_match( '/^data-(?!ooui)/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
757  continue;
758  }
759 
760  # Strip javascript "expression" from stylesheets.
761  # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
762  if ( $attribute == 'style' ) {
     // NOTE(review): listing line 763 is missing, leaving this branch empty as shown;
     // presumably it passes $value through checkCss() -- confirm.
764  }
765 
766  if ( $attribute === 'id' ) {
767  $value = Sanitizer::escapeId( $value, 'noninitial' );
768  }
769 
770  # WAI-ARIA
771  # http://www.w3.org/TR/wai-aria/
772  # http://www.whatwg.org/html/elements.html#wai-aria
773  # For now we only support role="presentation" until we work out what roles should be
774  # usable by content and we ensure that our code explicitly rejects patterns that
775  # violate HTML5's ARIA restrictions.
776  if ( $attribute === 'role' && $value !== 'presentation' ) {
777  continue;
778  }
779 
780  // RDFa and microdata properties allow URLs, URIs and/or CURIs.
781  // Check them for sanity.
782  if ( $attribute === 'rel' || $attribute === 'rev'
783  # RDFa
784  || $attribute === 'about' || $attribute === 'property'
785  || $attribute === 'resource' || $attribute === 'datatype'
786  || $attribute === 'typeof'
787  # HTML5 microdata
788  || $attribute === 'itemid' || $attribute === 'itemprop'
789  || $attribute === 'itemref' || $attribute === 'itemscope'
790  || $attribute === 'itemtype'
791  ) {
792  //Paranoia. Allow "simple" values but suppress javascript
793  if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
794  continue;
795  }
796  }
797 
798  # NOTE: even though elements using href/src are not allowed directly, supply
799  # validation code that can be used by tag hook handlers, etc
800  if ( $attribute === 'href' || $attribute === 'src' ) {
801  if ( !preg_match( $hrefExp, $value ) ) {
802  continue; //drop any href or src attributes not using an allowed protocol.
803  // NOTE: this also drops all relative URLs
804  }
805  }
806 
807  // If this attribute was previously set, override it.
808  // Output should only have one attribute of each name.
809  $out[$attribute] = $value;
810  }
811 
812  if ( $wgAllowMicrodataAttributes ) {
813  # itemtype, itemid, itemref don't make sense without itemscope
814  if ( !array_key_exists( 'itemscope', $out ) ) {
815  unset( $out['itemtype'] );
816  unset( $out['itemid'] );
817  unset( $out['itemref'] );
818  }
819  # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
820  }
821  return $out;
822  }
823 
/**
 * Combine two attribute arrays. Entries of $b win over entries of $a, with
 * one exception: when both carry a string "class" and the two strings
 * differ, the result lists the class names of both, each name once, in
 * first-seen order and separated by single spaces.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'], $b['class'] ) ) {
		return $merged;
	}
	$left = $a['class'];
	$right = $b['class'];
	if ( !is_string( $left ) || !is_string( $right ) || $left === $right ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', $left . ' ' . $right, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
846 
/**
 * Bring a CSS value into a canonical form so that later checks cannot be
 * dodged with escapes, comments or look-alike characters.
 *
 * Steps visible in this listing: decode backslash escapes and line
 * continuations (cssDecodeCallback), fold look-alike characters to ASCII,
 * replace comments with a space and cut everything after an unterminated
 * comment opener (skipped when the value is nothing but one comment), and
 * rewrite "s" followed by certain iteration or prolonged-sound marks as "ss".
 *
 * @param string $value
 * @return string
 */
856  public static function normalizeCss( $value ) {
857 
858  // Decode character references like &#123;
     // NOTE(review): listing line 859 is missing, so no statement follows the comment
     // above as shown; presumably it decodes character references in $value -- confirm.
860 
861  // Decode escape sequences and line continuation
862  // See the grammar in the CSS 2 spec, appendix D.
863  // This has to be done AFTER decoding character references.
864  // This means it isn't possible for this function to return
865  // unsanitized escape sequences. It is possible to manufacture
866  // input that contains character references that decode to
867  // escape sequences that decode to character references, but
868  // it's OK for the return value to contain character references
869  // because the caller is supposed to escape those anyway.
870  static $decodeRegex;
871  if ( !$decodeRegex ) {
872  $space = '[\\x20\\t\\r\\n\\f]';
873  $nl = '(?:\\n|\\r\\n|\\r|\\f)';
874  $backslash = '\\\\';
875  $decodeRegex = "/ $backslash
876  (?:
877  ($nl) | # 1. Line continuation
878  ([0-9A-Fa-f]{1,6})$space? | # 2. character number
879  (.) | # 3. backslash cancelling special meaning
880  () | # 4. backslash at end of string
881  )/xu";
882  }
883  $value = preg_replace_callback( $decodeRegex,
884  array( __CLASS__, 'cssDecodeCallback' ), $value );
885 
886  // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
     // NOTE(review): the character class below shows plain ASCII characters, which does
     // not match the U+FF01..U+FF5A range named in its trailing comment; the
     // fullwidth characters were presumably flattened when this listing was
     // produced -- confirm against the real file.
887  $value = preg_replace_callback(
888  '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
889  function ( $matches ) {
     // NOTE(review): listing line 890 is missing, so $cp is never assigned as shown;
     // presumably it holds the code point of $matches[0] -- confirm.
891  if ( $cp === false ) {
892  return '';
893  }
894  return chr( $cp - 65248 ); // ASCII range \x21-\x7A
895  },
896  $value
897  );
898 
899  // Convert more characters IE6 might treat as ascii
900  // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
901  $value = str_replace(
902  array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
903  array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
904  $value
905  );
906 
907  // Let the value through if it's nothing but a single comment, to
908  // allow other functions which may reject it to pass some error
909  // message through.
910  if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
911  // Remove any comments; IE gets token splitting wrong
912  // This must be done AFTER decoding character references and
913  // escape sequences, because those steps can introduce comments
914  // This step cannot introduce character references or escape
915  // sequences, because it replaces comments with spaces rather
916  // than removing them completely.
917  $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
918 
919  // Remove anything after a comment-start token, to guard against
920  // incorrect client implementations.
921  $commentPos = strpos( $value, '/*' );
922  if ( $commentPos !== false ) {
923  $value = substr( $value, 0, $commentPos );
924  }
925  }
926 
927  // S followed by repeat, iteration, or prolonged sound marks,
928  // which IE will treat as "ss"
929  $value = preg_replace(
930  '/s(?:
931  \xE3\x80\xB1 | # U+3031
932  \xE3\x82\x9D | # U+309D
933  \xE3\x83\xBC | # U+30FC
934  \xE3\x83\xBD | # U+30FD
935  \xEF\xB9\xBC | # U+FE7C
936  \xEF\xB9\xBD | # U+FE7D
937  \xEF\xBD\xB0 # U+FF70
938  )/ix',
939  'ss',
940  $value
941  );
942 
943  return $value;
944  }
945 
946 
/**
 * Normalise a CSS value with normalizeCss() and refuse it when it still
 * contains control characters or one of the constructs on the blacklist
 * below (expression, filter, accelerator, the -o-link family, url(),
 * image(), image-set()).
 *
 * @param string $value
 * @return string The normalised value, or a short CSS comment naming the
 *   reason for rejection
 */
static function checkCss( $value ) {
	$css = self::normalizeCss( $value );

	// Control characters never belong in a style value
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $css ) ) {
		return '/* invalid control char */';
	}

	// Keywords and functions that can run script or fetch remote content
	$insecure = '! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		!ix';
	if ( preg_match( $insecure, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
986 
/**
 * preg_replace_callback() handler for the escape pattern built in
 * normalizeCss(). Turns one backslash escape into the character it stands
 * for, except that a newline, either quote character and the backslash are
 * emitted as a hexadecimal escape followed by a space.
 *
 * @param array $matches 1 = line continuation, 2 = hex digits,
 *   3 = single escaped character; none of them set means a trailing backslash
 * @return string
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// A line continuation simply disappears
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// These would change the meaning of a CSS string if written literally,
	// so they are re-emitted in a clean, unambiguous escaped form.
	$mustStayEscaped = in_array( $decoded, array( "\n", '"', "'", '\\' ) );

	return $mustStayEscaped
		? '\\' . dechex( ord( $decoded ) ) . ' '
		: $decoded;
}
1011 
/**
 * Turn the raw attribute text of a tag into sanitised attribute text:
 * parse it, keep only what is allowed for $element, and re-encode the
 * survivors. Blank input yields an empty string.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name the attributes belong to
 * @return string
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) != '' ) {
		$parsed = Sanitizer::decodeTagAttributes( $text );
		$allowed = Sanitizer::validateTagAttributes( $parsed, $element );

		return Sanitizer::safeEncodeTagAttributes( $allowed );
	}

	return '';
}
1041 
/**
 * Encode a string for use as an HTML attribute value. Besides the usual
 * special characters (both quote styles included), newline, carriage return
 * and tab become numeric references so that they are not lost when the
 * attribute is parsed again.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Attribute decoding normalises whitespace, so anything other than a
	// plain space has to be written as a reference to come through intact.
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		$escaped
	);
}
1061 
/**
 * Encode a string as an HTML attribute value and additionally disarm
 * sequences that later parsing stages could expand (braces, brackets, pipes,
 * doubled apostrophes or underscores, magic-link keywords, URL protocols).
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Break up anything that looks like a URL protocol so it is not auto-linked
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';

	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored
	);
}
1094 
/**
 * Convert arbitrary text into a string usable as an HTML id attribute.
 *
 * Character references in $id are decoded first. In the default (HTML4-style)
 * mode spaces become underscores, the result is URL-encoded, "%3A" is turned
 * back into ":" and every remaining "%" into ".", and an "x" is prefixed when
 * the result does not start with an ASCII letter. With $wgExperimentalHtmlIds
 * set (and no 'legacy' option) runs of whitespace, underscores, quotes, "&",
 * "#" and "%" collapse to one underscore instead, underscores are trimmed from
 * both ends, and "_" is returned if nothing is left.
 *
 * @param string $id
 * @param string|array $options One option or a list of options:
 *   'noninitial' skips the leading-letter fix in HTML4-style mode,
 *   'legacy' forces HTML4-style mode
 * @return string
 */
1126  static function escapeId( $id, $options = array() ) {
      // NOTE(review): listing line 1127 is missing here. $wgExperimentalHtmlIds is read
      // below with no visible "global" declaration; presumably the missing line
      // declares it -- confirm.
1128  $options = (array)$options;
1129 
1130  $id = Sanitizer::decodeCharReferences( $id );
1131 
1132  if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
1133  $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
1134  $id = trim( $id, '_' );
1135  if ( $id === '' ) {
1136  // Must have been all whitespace to start with.
1137  return '_';
1138  } else {
1139  return $id;
1140  }
1141  }
1142 
1143  // HTML4-style escaping
1144  static $replace = array(
1145  '%3A' => ':',
1146  '%' => '.'
1147  );
1148 
1149  $id = urlencode( strtr( $id, ' ', '_' ) );
      // Order matters: "%3A" is restored to ":" before the bare "%" rule runs
1150  $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
1151 
1152  if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
1153  // Initial character must be a letter!
1154  $id = "x$id";
1155  }
1156  return $id;
1157  }
1158 
/**
 * Make a string usable as a CSS class name: a leading digit or hyphen,
 * control characters, spaces, ASCII punctuation and non-breaking spaces
 * become underscores, runs of underscores collapse to one, and trailing
 * underscores are removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Characters that may not appear (or may not lead) in a class name
	$unwanted = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';

	$underscored = preg_replace( $unwanted, '_', $class );
	$collapsed = preg_replace( '/_+/', '_', $underscored );

	return rtrim( $collapsed, '_' );
}
1177 
/**
 * Escape $html for output with htmlspecialchars(), encoding both quote styles.
 *
 * @param string $html
 * @return string
 */
1185  static function escapeHtmlAllowEntities( $html ) {
      // NOTE(review): listing line 1186 is missing here. As shown, the body only escapes;
      // the method name suggests entities in the input are handled first, presumably
      // by decoding character references on the missing line -- confirm.
1187  # It seems wise to escape ' as well as ", as a matter of course. Can't
1188  # hurt.
1189  $html = htmlspecialchars( $html, ENT_QUOTES );
1190  return $html;
1191  }
1192 
/**
 * preg_replace_callback() handler used by safeEncodeAttribute(): rewrites
 * every colon of the matched URL protocol as a numeric character reference.
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];

	return strtr( $protocol, array( ':' => '&#58;' ) );
}
1201 
/**
 * Parse the attribute text of a tag into an array, using the pattern from
 * getAttribsRegex(). Blank text, or text in which the pattern finds nothing,
 * yields an empty array.
 *
 * @param string $text Raw attribute text
 * @return array
 */
1210  public static function decodeTagAttributes( $text ) {
1211  if ( trim( $text ) == '' ) {
1212  return array();
1213  }
1214 
1215  $attribs = array();
1216  $pairs = array();
1217  if ( !preg_match_all(
1218  self::getAttribsRegex(),
1219  $text,
1220  $pairs,
1221  PREG_SET_ORDER ) ) {
1222  return $attribs;
1223  }
1224 
1225  foreach ( $pairs as $set ) {
      // Attribute names are stored lower-cased
1226  $attribute = strtolower( $set[1] );
      // NOTE(review): listing line 1227 is missing, so $value is never assigned as shown;
      // presumably it comes from getTagAttributeCallback( $set ) -- confirm.
1228 
1229  // Normalize whitespace
1230  $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1231  $value = trim( $value );
1232 
1233  // Decode character references
      // NOTE(review): listing line 1234 is missing, so nothing is written to $attribs as
      // shown; presumably it stores the decoded $value under $attribute -- confirm.
1235  }
1236  return $attribs;
1237  }
1238 
/**
 * Serialise an attribute array as attribute text for a tag. Every pair is
 * written as name="value" with a single leading space; names go through
 * htmlspecialchars() and values through safeEncodeAttribute(). An empty
 * array yields an empty string.
 *
 * @param array $assoc_array Attribute name => value
 * @return string
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$html = '';
	foreach ( $assoc_array as $name => $rawValue ) {
		$encName = htmlspecialchars( $name );
		$encValue = Sanitizer::safeEncodeAttribute( $rawValue );

		$html .= ' ' . $encName . '="' . $encValue . '"';
	}

	return $html;
}
1256 
/**
 * Pick the attribute value out of one match of the getAttribsRegex()
 * pattern.
 *
 * @param array $set Match groups: 1 = name, 2 = "= value" part,
 *   3 = double-quoted, 4 = single-quoted, 5 = unquoted, 6 = unquoted "#hex"
 * @return string The value; for an attribute written without any value,
 *   its own name
 * @throws MWException When a "= value" part matched but no value group did
 */
private static function getTagAttributeCallback( $set ) {
	# The highest group that is set is the form that actually matched:
	# 6 = unquoted "#hex" colour, 5 = unquoted, 4 = single-quoted,
	# 3 = double-quoted.
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value.
	# For the 'reduced' form, the attribute name doubles as the value.
	return $set[1];
}
1286 
/**
 * Replace each whitespace character (space, tab, CR, LF) with a plain
 * space. A CR immediately followed by LF counts as one unit and becomes a
 * single space.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	// Handle the two-character line ending first so it yields one space only
	$withoutCrLf = str_replace( "\r\n", ' ', $text );

	return strtr( $withoutCrLf, "\r\n\t", '   ' );
}
1297 
// NOTE(review): the signature line of this method (and any doc comment) is missing
// from this listing, so its name and visibility cannot be confirmed from here.
// The visible body collapses every run of spaces and/or underscores in $section
// into a single space and trims whitespace from both ends of the result.
1307  return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1308  }
1309 
/**
 * Run every character reference and every stray ampersand in $text through
 * normalizeCharReferencesCallback(), which decides what each one is
 * replaced with.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1331 
// NOTE(review): the signature line of this method is missing from this listing;
// presumably it is normalizeCharReferencesCallback( $matches ), the handler that
// normalizeCharReferences() registers -- confirm.
// $matches[1], [2] and [3] hold a named, decimal or hexadecimal reference
// respectively. When no replacement is produced ($ret stays null), the
// matched text is returned escaped with htmlspecialchars().
1337  $ret = null;
1338  if ( $matches[1] != '' ) {
      // NOTE(review): the bodies of these three branches (listing lines 1339, 1341, 1343)
      // are missing; presumably they set $ret via normalizeEntity(),
      // decCharReference() and hexCharReference() -- confirm.
1340  } elseif ( $matches[2] != '' ) {
1342  } elseif ( $matches[3] != '' ) {
1344  }
1345  if ( is_null( $ret ) ) {
1346  return htmlspecialchars( $matches[0] );
1347  } else {
1348  return $ret;
1349  }
1350  }
1351 
/**
 * Rewrite a named entity in canonical form.
 *
 *  - an alias becomes a reference to the name it stands for
 *  - lt, gt, amp and quot are kept as named references
 *  - any other known name becomes a decimal numeric reference
 *  - an unknown name is emitted with its ampersand escaped
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	$keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
	if ( in_array( $name, $keptAsNamed ) ) {
		return '&' . $name . ';';
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	return '&amp;' . $name . ';';
}
1373 
/**
 * Build a decimal character reference for a code point given as decimal
 * digits.
 *
 * @param string|int $codepoint
 * @return string|null The reference, or null when validateCodepoint()
 *   rejects the code point
 */
static function decCharReference( $codepoint ) {
	$number = intval( $codepoint );

	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#%d;', $number )
		: null;
}
1386 
/**
 * Build a hexadecimal character reference (lower-case digits) for a code
 * point given as hexadecimal digits.
 *
 * @param string $codepoint
 * @return string|null The reference, or null when validateCodepoint()
 *   rejects the code point
 */
static function hexCharReference( $codepoint ) {
	$number = hexdec( $codepoint );

	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#x%x;', $number )
		: null;
}
1399 
/**
 * Tell whether a code point is accepted by this class: tab, line feed,
 * carriage return, or anything in U+0020..U+D7FF, U+E000..U+FFFD or
 * U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	// The three permitted control characters
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	// Up to, but not including, the surrogate block
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	// Above the surrogates, stopping before U+FFFE and U+FFFF
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	// Everything beyond U+FFFF up to the last code point
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1413 
/**
 * Replace every match of self::CHAR_REFS_REGEX in the text with the result
 * of decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1427 
/**
 * Decode character references like decodeCharReferences(), then pass the
 * result through $wgContLang->normalize() if at least one replacement was
 * made.
 *
 * Text in which nothing matched is returned without being normalized.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	}

	return $text;
}
1451 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * Capture groups follow CHAR_REFS_REGEX: [1] is a named entity, [2] a
 * decimal code point and [3] a hexadecimal code point.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1467 
/**
 * Return the UTF-8 string for a code point if validateCodepoint() accepts
 * it, otherwise the U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int|float $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	if ( Sanitizer::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Utils::codepointToUtf8( $codepoint );
	}

	// Invalid references must still yield a string, never null.
	return UtfNormal\Constants::UTF8_REPLACEMENT;
}
1482 
/**
 * Decode a named entity to its UTF-8 character.
 *
 * An alias from self::$htmlEntityAliases is resolved first. If the
 * resulting name is not in self::$htmlEntities, the (possibly aliased)
 * name is returned as a pseudo-entity, '&name;'.
 *
 * @param string $name Entity name without the surrounding '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$canonical = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$canonical] );
}
1501 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Attribute names from setupAttributeWhitelist(), or an
 *   empty array when the element has no entry there
 */
static function attributeWhitelist( $element ) {
	// The lookup table has to be fetched first; otherwise $list is
	// undefined and every element appears to allow no attributes.
	$list = Sanitizer::setupAttributeWhitelist();

	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1514 
/**
 * Build the map from each allowed HTML element name to the list of
 * attribute names allowed on it.
 *
 * The result is cached in a function-level static and rebuilt only when
 * the values of $wgAllowRdfaAttributes / $wgAllowMicrodataAttributes
 * differ from those seen when the cache was filled.
 *
 * @return array Element name => array of attribute names
 */
static function setupAttributeWhitelist() {
	// Both flags are read below and by compact(); without this declaration
	// they would be undefined locals and the optional attributes would
	// never be enabled.
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $whitelist, $staticInitialised;

	// Fingerprint of the configuration the cached table was built for.
	$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	if ( $whitelist !== null && $staticInitialised == $globalContext ) {
		return $whitelist;
	}

	$common = array(
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'role',
	);

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		$common = array_merge( $common, array(
			'about', 'property', 'resource', 'datatype', 'typeof',
		) );
	}

	if ( $wgAllowMicrodataAttributes ) {
		# add HTML5 microdata tags as specified by
		# http://www.whatwg.org/html/microdata.html#the-microdata-model
		$common = array_merge( $common, array(
			'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
		) );
	}

	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array(
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		'q' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array_merge( $common, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			) ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span' ) ),
		'col' => array_merge( $common, array( 'span' ) ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'width' ) ),

		# HTML Ruby annotation text module, simple ruby only.
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $common, array( 'value' ) ),
		'time' => array_merge( $common, array( 'datetime' ) ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	// Remember which configuration this table corresponds to.
	$staticInitialised = $globalContext;

	return $whitelist;
}
1732 
/**
 * Remove everything delimited by '<' and '>' from the text, then decode
 * character references and normalize whitespace in what is left.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Drop the actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Decode &entities first, then flatten whitespace
	return self::normalizeWhitespace(
		self::decodeCharReferences( $withoutTags )
	);
}
1753 
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry
 * of self::$htmlEntities as an entity expanding to its decimal character
 * reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1771 
/**
 * Clean up a URL: decode character references, percent-encode a fixed set
 * of unsafe characters, and strip characters from the host part that are
 * ignored in IDNs.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Entities in the input are decoded here; they will be
	# re-escaped by makeExternalLink().
	$decoded = Sanitizer::decodeCharReferences( $url );

	# Percent-encode brackets, angle brackets, quotes, pipes and
	# control characters, including any produced by the decoding above.
	$escaped = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ),
		$decoded
	);

	# Split into protocol, optional //host and the remainder.
	$parts = array();
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $escaped, $parts ) ) {
		return $escaped;
	}

	// Characters that will be ignored in IDNs.
	// http://tools.ietf.org/html/3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$ignorable = "/
 \\s| # general whitespace
 \xc2\xad| # 00ad SOFT HYPHEN
 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
 \xe2\x81\xa0| # 2060 WORD JOINER
 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
 /xuD";

	$cleanHost = preg_replace( $ignorable, '', $parts[2] );

	// @todo FIXME: Validate hostnames here

	return $parts[1] . $cleanHost . $parts[3];
}
1818 
/**
 * preg_replace_callback() handler used by cleanUrl(): URL-encode the
 * matched text.
 *
 * @param array $matches
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1826 
/**
 * Check whether a string looks like an e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; if it returns false, the value
 * it left in $result is returned as-is. Otherwise the address must be one
 * or more atext characters or dots, an '@', and one or more dot-separated
 * labels of letters, digits and hyphens.
 *
 * @param string $addr
 * @return bool|mixed Result of the pattern match as a bool, or whatever
 *   the hook stored in $result when it aborted
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it, an address followed by a newline would be accepted.
	$html5_email_regexp = "/
 ^ # start of string
 [$rfc5322_atext\\.]+ # user part which is liberal :p
 @ # 'at sign'
 [$rfc1034_ldh_str]+ # First domain part
 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
 $ # End of string
 /ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1878 }
utf8ToCodepoint($char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
static decCharReference($codepoint)
Definition: Sanitizer.php:1378
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1740
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1210
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1362
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
or
false for read/write
static safeEncodeTagAttributes($assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1246
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:1336
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:324
static removeHTMLtags($text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
Definition: Sanitizer.php:457
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1520
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1740
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content.The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content.These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text.All manipulation and analysis of page content must be done via the appropriate methods of the Content object.For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers.The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id).Also Title, WikiPage and Revision now have getContentHandler() methods for convenience.ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page.ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type.However, it is recommended to instead use WikiPage::getContent() resp.Revision::getContent() to get a page's content as a Content object.These two methods should be the ONLY way in which page content is accessed.Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides().This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based.Objects implementing the Content interface are used to represent and handle the content internally.For storage and data exchange, each content model supports at least one serialization format via 
ContentHandler::serializeContent($content).The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats().Content serialization formats are identified using MIME type like strings.The following formats are built in:*text/x-wiki-wikitext *text/javascript-for js pages *text/css-for css pages *text/plain-for future use, e.g.with plain text messages.*text/html-for future use, e.g.with plain html messages.*application/vnd.php.serialized-for future use with the api and for extensions *application/json-for future use with the api, and for use by extensions *application/xml-for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant.Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly.Without that information, interpretation of the provided content is not reliable.The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export.Also note that the API will provide encapsulated, serialized content-so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure.Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content.However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page's content model, and will now generate warnings when used.Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() is 
deprecated in favor Revisions::getContent()*WikiPage::getText() is deprecated in favor WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superceded by Article::getContentObject().However, both methods should be avoided since they do not provide clean access to the page's actual content.For instance, they may return a system message for non-existing pages.Use WikiPage::getContent() instead.Code that relies on a textual representation of the page content should eventually be rewritten.However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page.Its behavior is controlled by $wgContentHandlerTextFallback it
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1456
static cssDecodeCallback($matches)
Definition: Sanitizer.php:991
static getRecognizedTagData($extratags=array(), $removetags=array())
Return the various lists of recognized tags.
Definition: Sanitizer.php:367
$value
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by similarly to how extensions are installed You can then make that skin the default by adding
Definition: skin.txt:57
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1170
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1763
static cleanUrl($url)
Definition: Sanitizer.php:1776
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:1743
when a variable name is used in a it is silently declared as a new local masking the global
Definition: design.txt:93
const UTF8_REPLACEMENT
static hexCharReference($codepoint)
Definition: Sanitizer.php:1391
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and local administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:718
static normalizeWhitespace($text)
Definition: Sanitizer.php:1291
Apache License January http
it sets a lot of them automatically from query and such
Definition: design.txt:93
if($line===false) $args
Definition: cdb.php:64
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:1421
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
pull multiple revisions may often pull multiple times from the same blob.
Definition: deferred.txt:11
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1405
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:56
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec http://www.w3.org/TR/html5/syntax.html#tag-open-state.
Definition: Sanitizer.php:46
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1475
static normalizeSectionNameWhitespace($section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(), for use in the id's that are used for section links.
Definition: Sanitizer.php:1306
Some quick notes on the file repository architecture Functionality is
Definition: README:3
static $htmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html As well as ' which is only defined starting in XHTML1.
Definition: Sanitizer.php:64
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1508
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
namespace and then decline to actually register it file or subcat img or subcat RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions as context called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content as context as context $options
Definition: hooks.txt:968
static escapeHtmlAllowEntities($html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1185
static validateTag($params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:682
static mergeAttributes($a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:834
MediaWiki exception.
Definition: MWException.php:26
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable from
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:36
static run($event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:137
$params
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
be sent.
static validateAttributes($attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
Definition: Sanitizer.php:738
static decodeCharReferencesAndNormalize($text)
Decode any character references, numeric or named entities, in the next and normalize the resulting s...
Definition: Sanitizer.php:1438
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable or merely the Work and Derivative Works thereof Contribution shall mean any work of including the original version of the Work and any modifications or additions to that Work or Derivative Works that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner For the purposes of this submitted means any form of or written communication sent to the Licensor or its including but not limited to communication on electronic mailing source code control and issue tracking systems that are managed by
static escapeId($id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it. ...
Definition: Sanitizer.php:1126
static cleanUrlCallback($matches)
Definition: Sanitizer.php:1823
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
usually copyright or history_copyright This message must be in HTML not wikitext if the section is included from a template $section
Definition: hooks.txt:2611
#define the
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1198
wfUrlProtocols($includeProtocolRelative=true)
Returns a regular expression of url protocols.
static normalizeCss($value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:856
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:57
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types as usual *javascript user provided javascript code *json simple implementation for use by extensions
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:323
$wgUseTidy
$wgUseTidy: use tidy to make sure HTML output is sane.
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1325
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message.Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item.Return false to stop further processing of the tag $reader:XMLReader object $logInfo:Array of information 'ImportHandlePageXMLTag':When parsing a XML tag in a page.Return false to stop further processing of the tag $reader:XMLReader object $pageInfo:Array of information 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision.Return false to stop further processing of the tag $reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information 'ImportHandleToplevelXMLTag':When parsing a top level XML tag.Return false to stop further processing of the tag $reader:XMLReader object 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload.Return false to stop further processing of the tag $reader:XMLReader object $revisionInfo:Array of information 'InfoAction':When building information to display on the action=info page.$context:IContextSource object &$pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect.$title:Title object for the current page $request:WebRequest $ignoreRedirect:boolean to skip redirect check $target:Title/string of redirect target $article:Article object 'InternalParseBeforeLinks':during Parser's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InternalParseBeforeSanitize':during Parser's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings.Ideal for syntax-extensions after template/parser function 
execution which respect nowiki and HTML-comments.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not.Return true without providing an interwiki to continue interwiki search.$prefix:interwiki prefix we are looking for.&$iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 'InvalidateEmailComplete':Called after a user's email has been invalidated successfully.$user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification.Callee may modify $url and $query, URL will be constructed as $url.$query &$url:URL to index.php &$query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) $article:article(object) being checked 'IsTrustedProxy':Override the result of wfIsTrustedProxy() $ip:IP being check $result:Change this value to override the result of wfIsTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from &$allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of Sanitizer::validateEmail(), for instance to return false if the domain name doesn't match your organization.$addr:The e-mail address entered by the user &$result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user &$result:Set this and return false to override the internal checks $user:User the password is being validated for 'Language::getMessagesFileName':$code:The language code or the language we're looking for a messages file for &$file:The messages file path, you can override this to change the location. 
'LanguageGetMagic':DEPRECATED!Use $magicWords in a file listed in $wgExtensionMessagesFiles instead.Use this to define synonyms of magic words depending of the language $magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces.Do not use this hook to add namespaces.Use CanonicalNamespaces for that.&$namespaces:Array of namespaces indexed by their numbers 'LanguageGetSpecialPageAliases':DEPRECATED!Use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead.Use to define aliases of special pages names depending of the language $specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names.&$names:array of language code=> language name $code:language of the preferred translations 'LanguageLinks':Manipulate a page's language links.This is called in various places to allow extensions to define the effective language links for a page.$title:The page's Title.&$links:Associative array mapping language codes to prefixed links of the form"language:title".&$linkFlags:Associative array mapping prefixed links to arrays of flags.Currently unused, but planned to provide support for marking individual language links in the UI, e.g.for featured articles. 'LanguageSelector':Hook to change the language selector available on a page.$out:The output page.$cssClassName:CSS class name of the language selector. 'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts.Return false to skip default processing and return $ret.See documentation for Linker::link() for details on the expected meanings of parameters.$skin:the Skin object $target:the Title that the link is pointing to &$html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1738
Bar style
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition: design.txt:56
$wgExperimentalHtmlIds
Should we allow a broader set of characters in id attributes, per HTML5? If not, use only HTML 4-comp...
$count
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
Definition: Sanitizer.php:1031
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:636
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:339
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
static validateEmail($addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1855
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:965
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1265
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
maintenance dev scripts can help quickly setup a local MediaWiki for development purposes Wikis setup in this way are NOT meant to be publicly available They use a development database not acceptable for use in production Place a sqlite database in an unsafe location a real wiki should never place it in And use predictable default logins for the initial administrator user Running maintenance dev install sh will download and install a local copy of php
Definition: README:5
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1491
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
Definition: Sanitizer.php:1068
$wgAllowRdfaAttributes
Enable RDFa attributes for use in wikitext.
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1939
$wgAllowMicrodataAttributes
Enable HTML5 microdata attributes for use in wikitext.
PHP Parser - Processes wiki markup (which uses a more user-friendly syntax, such as "[[link]]" for ma...
Definition: Parser.php:67
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1740
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:331
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1047
$matches