MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
/**
 * Regular expression matching the character references handled by
 * normalizeCharReferences() and decodeCharReferences().
 *
 * Capture groups: 1 = named entity, 2 = decimal reference,
 * 3 = hexadecimal reference, 4 = a bare ampersand.
 * Written in extended (/x) mode, so the layout whitespace is ignored.
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';
41 
/**
 * Splits one "<"-delimited chunk of markup into five groups:
 * 1 = optional leading slash, 2 = element name, 3 = attribute text,
 * 4 = the closing ">" or "/>", 5 = trailing text up to the next "<".
 */
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive match for a "javascript" or "vbscript" token that sits
 * at the start of a value, after whitespace, or after a closing CSS comment,
 * and is followed by a non-word character or the end of the value.
 */
const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/**
 * Matches an attribute name of the form "xmlns:<suffix>".
 */
const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
58 
/**
 * Map of named HTML character entities to their Unicode code points.
 * Consulted by normalizeEntity() to turn a recognized entity name into a
 * numeric character reference. Entries are grouped by initial letter.
 */
private static $htmlEntities = array(
	'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
	'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
	'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
	'amp' => 38, 'and' => 8743, 'ang' => 8736,
	'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
	'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195,
	'atilde' => 227, 'Auml' => 196, 'auml' => 228,
	'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226,
	'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
	'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
	'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
	'cup' => 8746, 'curren' => 164,
	'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
	'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830, 'divide' => 247,
	'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
	'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
	'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
	'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
	'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
	'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
	'frac34' => 190, 'frasl' => 8260,
	'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
	'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
	'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
	'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
	'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
	'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
	'Kappa' => 922, 'kappa' => 954,
	'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
	'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
	'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
	'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
	'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
	'minus' => 8722, 'Mu' => 924, 'mu' => 956,
	'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
	'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
	'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
	'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
	'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
	'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
	'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
	'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
	'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
	'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
	'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
	'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
	'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936, 'psi' => 968,
	'quot' => 34,
	'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
	'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
	'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
	'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
	'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
	'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
	'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
	'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
	'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
	'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
	'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
	'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
	'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
	'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
	'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
	'Uuml' => 220, 'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926, 'xi' => 958,
	'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255,
	'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204
);
319 
/**
 * Alternative spellings of entity names, mapped to the canonical name.
 * normalizeEntity() checks this table before the main entity table.
 */
private static $htmlEntityAliases = array(
	'רלמ' => 'rlm',
	'رلم' => 'rlm',
);
327 
/**
 * Cache for the attribute-matching regular expression.
 * Stays null until getAttribsRegex() builds it on first use.
 */
private static $attribsRegex;

/**
 * Returns the regular expression used to pick attribute name/value pairs
 * out of the attribute text of a tag. The pattern is assembled once and
 * cached in self::$attribsRegex.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "=value" part (optional),
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern (extended mode, dot-all)
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$attribFirst = '[:A-Z_a-z0-9]';
	$attrib = '[:A-Z_a-z-.0-9]';
	$space = '[\x09\x0a\x0d\x20]';
	self::$attribsRegex =
		"/(?:^|$space)({$attribFirst}{$attrib}*)
		  ($space*=$space*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)(?:\"|\$)
			 | '([^<']*)(?:'|\$)
			 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
			)
		)?(?=$space|\$)/sx";

	return self::$attribsRegex;
}
357 
/**
 * Builds the lookup tables describing which HTML elements are recognized
 * and how they may be used. Every returned table is keyed by tag name
 * (the lists are flipped so callers can use isset()).
 *
 * The base tables are computed once and kept in function-static variables;
 * they are rebuilt when the relevant global settings change.
 *
 * @param array $extratags Extra tag names to treat as allowed, paired elements
 * @param array $removetags Tag names to drop from the set of allowed elements
 * @return array Tables keyed 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
 *   and 'htmlelements'
 */
public static function getRecognizedTagData( $extratags = array(), $removetags = array() ) {
	// Without this declaration the two settings below are undefined locals.
	global $wgAllowMicrodataAttributes, $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (like in the screwed up test system) we will re-initialise the settings.
	$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
	if ( !$staticInitialised || $staticInitialised != $globalContext ) {
		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		);
		$htmlsingle = array(
			'br', 'wbr', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'wbr', 'hr'
		);
		if ( $wgAllowMicrodataAttributes ) {
			$htmlsingle[] = $htmlsingleonly[] = 'meta';
			$htmlsingle[] = $htmlsingleonly[] = 'link';
		}
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul', 'ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return array(
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	);
}
442 
/**
 * Cleans up HTML in wikitext: removes comments, keeps recognized elements
 * (with their attributes filtered), and escapes everything else so it is
 * shown as literal text.
 *
 * The text is split on "<". When MWTidy is not enabled, the function also
 * tracks open elements on a stack so that mismatched close tags are escaped
 * and elements left open are closed at the end. When MWTidy is enabled only
 * the element/attribute filtering is done here.
 *
 * @param string $text Input markup
 * @param callable $processCallback Optional callback invoked for each kept
 *   open tag as ( &$params, $args ), allowing the attribute text to be rewritten
 * @param array|bool $args Second argument handed to $processCallback
 * @param array $extratags Additional tag names to allow
 * @param array $removetags Tag names to disallow
 * @return string The sanitized markup
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = array(), $extratags = array(), $removetags = array()
) {
	// Imports $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
	// $htmllist, $listtags, $htmlsingleallowed and $htmlelements as locals.
	extract( self::getRecognizedTagData( $extratags, $removetags ) );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	// Everything before the first "<" is plain text.
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !MWTidy::isEnabled() ) {
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					MediaWiki\suppressWarnings();
					$ot = array_pop( $tagstack );
					MediaWiki\restoreWarnings();

					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push( $optstack, $ot );
							MediaWiki\suppressWarnings();
							$ot = array_pop( $tagstack );
							MediaWiki\restoreWarnings();
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								MediaWiki\suppressWarnings();
								$ot = array_pop( $tagstack );
								MediaWiki\restoreWarnings();
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = true;
								MediaWiki\suppressWarnings();
								$ot = array_pop( $optstack );
								MediaWiki\restoreWarnings();
								while ( $ot ) {
									array_push( $tagstack, $ot );
									MediaWiki\suppressWarnings();
									$ot = array_pop( $optstack );
									MediaWiki\restoreWarnings();
								}
							}
						} else {
							MediaWiki\suppressWarnings();
							array_push( $tagstack, $ot );
							MediaWiki\restoreWarnings();

							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							// Leaving a table: restore the stack saved when it was opened.
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
						$badtag = true;
					#  Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for unclosable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						array_push( $tagstack, $t );
					} elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							// Entering a table: park the outer stack and start a fresh one.
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			// Unrecognized or rejected tag: emit the chunk as escaped text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;

				$badtag = false;
				$t = strtolower( $t );
				if ( isset( $htmlelements[$t] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	return $text;
}
623 
/**
 * Removes every terminated "<!-- ... -->" comment from the text.
 *
 * If a comment, together with any spaces around it, sits between two
 * newlines, the comment, those spaces and one of the newlines are removed
 * so that no blank line is left behind. An unterminated comment stops
 * processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		// Point just past the closing "-->".
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		// Widen the span to the left over any run of spaces...
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		// ...and to the right as well.
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}

		$newlineBefore = substr( $text, $spaceStart, 1 ) === "\n";
		$newlineAfter = substr( $text, $spaceStart + $spaceLen, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	return $text;
}
666 
/**
 * Checks element-specific attribute requirements. Only <meta> and <link>
 * have any: both need an itemprop attribute, <meta> also needs content,
 * and <link> also needs href.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case element name
 * @return bool False when a required attribute is missing, true otherwise
 */
static function validateTag( $params, $element ) {
	// The caller passes the raw attribute text; decode it into a
	// name => value map so the isset() checks below are meaningful.
	$params = Sanitizer::decodeTagAttributes( $params );

	if ( $element == 'meta' || $element == 'link' ) {
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $params['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $params['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
699 
/**
 * Filters a decoded attribute map against the whitelist for one element.
 * Thin wrapper: looks up the element's whitelist and delegates to
 * validateAttributes().
 *
 * @param array $attribs Attribute name => value map
 * @param string $element Element name
 * @return array The attributes that passed validation
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
719 
/**
 * Filters a decoded attribute map against a whitelist and sanitizes the
 * values of the attributes that are kept.
 *
 * Kept are: xmlns:* declarations (when RDFa is enabled and the value is not
 * a script URI), data-* attributes outside the reserved prefixes, and any
 * whitelisted name. Values of style, id, the aria id-list attributes,
 * RDFa/microdata properties and href/src get further checks below.
 *
 * @param array $attribs Attribute name => value map
 * @param array $whitelist List of allowed attribute names
 * @return array The attributes that passed validation
 */
static function validateAttributes( $attribs, $whitelist ) {
	// Both settings are read below; without this they are undefined locals.
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		# allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * data-ooui is reserved for ooui
		# * data-mw and data-parsoid are reserved for parsoid
		# * data-mw-<name here> is reserved for extensions (or core) if
		#   they need to communicate some data to the client and want to be
		#   sure that it isn't coming from an untrusted user.
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		if ( !preg_match( '/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute )
			&& !isset( $whitelist[$attribute] )
		) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# Escape HTML id reference lists
		if ( $attribute === 'aria-describedby'
			|| $attribute === 'aria-flowto'
			|| $attribute === 'aria-labelledby'
			|| $attribute === 'aria-owns'
		) {
			// NOTE(review): 'noninitial' mirrors the id branch above so that
			// references keep matching the ids they point at; confirm.
			$value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' );
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			// Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; // drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
830 
/**
 * Merges two attribute maps. Values from $b win on conflict, except for
 * "class": when both maps carry differing string class values, the class
 * names of both are combined, split on whitespace, and de-duplicated.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$out = array_merge( $a, $b );

	$bothHaveClass = isset( $a['class'] ) && isset( $b['class'] )
		&& is_string( $a['class'] ) && is_string( $b['class'] );
	if ( $bothHaveClass && $a['class'] !== $b['class'] ) {
		$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
			-1, PREG_SPLIT_NO_EMPTY );
		$out['class'] = implode( ' ', array_unique( $classes ) );
	}

	return $out;
}
853 
/**
 * Normalizes a CSS value so that later keyword checks cannot be evaded by
 * alternative encodings. Steps, in order: decode character references,
 * decode CSS escape sequences, fold full-width and look-alike characters
 * to ASCII, strip comments, and rewrite "s" + repeat mark to "ss".
 *
 * @param string $value CSS text, e.g. the value of a style attribute
 * @return string The normalized CSS text
 */
public static function normalizeCss( $value ) {

	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
	$value = preg_replace_callback(
		// U+FF01 to U+FF5A, excluding U+FF3C (bug 58088). Written with
		// explicit code points so the range survives re-encoding of the file.
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
		function ( $matches ) {
			// NOTE(review): helper assumed from the UtfNormal\Utils class
			// already used by cssDecodeCallback(); confirm the method name.
			$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
952 
/**
 * Normalizes a CSS value and rejects it if it still looks dangerous.
 *
 * The value is replaced by a CSS comment naming the problem when it
 * contains control characters or the UTF-8 replacement character, or when
 * it contains one of the blacklisted constructs (expression, filter:,
 * accelerator:, the -o-link family, url(, image(, image-set().
 *
 * @param string $value CSS text
 * @return string The normalized CSS, or a comment describing the rejection
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $value )
		|| strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false;
	if ( $hasControlChar ) {
		return '/* invalid control char */';
	}

	$insecure = preg_match(
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
		!ix', $value );
	if ( $insecure ) {
		return '/* insecure input */';
	}

	return $value;
}
993 
/**
 * preg_replace_callback handler that decodes one CSS escape sequence.
 *
 * $matches[1] is a line continuation, $matches[2] a hexadecimal character
 * number, $matches[3] a single escaped character; if all are empty the
 * backslash stood at the end of the string.
 *
 * @param array $matches
 * @return string The decoded character, or a normalized "\<hex> " escape
 *   for characters that must stay escaped inside CSS strings
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
		// These characters need to be escaped in strings
		// Clean up the escape sequence to avoid parsing errors by clients
		return '\\' . dechex( ord( $char ) ) . ' ';
	}

	// Decode unnecessary escape
	return $char;
}
1018 
/**
 * Turns the raw attribute text of a tag into a sanitized attribute string:
 * decode the attributes, drop the ones not allowed on this element, and
 * re-encode what is left.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name
 * @return string Empty when $text is blank; otherwise the result of
 *   safeEncodeTagAttributes() on the validated attributes
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	return Sanitizer::safeEncodeTagAttributes(
		Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ),
			$element
		)
	);
}
1048 
/**
 * Encodes a string for use as an HTML attribute value. HTML special
 * characters (including both quote styles) are escaped, and newline,
 * carriage return and tab are written as numeric character references.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$whitespaceRefs = array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	);

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
1068 
/**
 * Encodes an attribute value and additionally neutralizes sequences that
 * later wikitext parsing could expand: braces, brackets, pipes, doubled
 * apostrophes and underscores, the ISBN/RFC/PMID magic words, and the
 * colon of any recognized URL protocol.
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	$encValue = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	) );

	# Stupid hack
	// Replaces the ":" inside each URL protocol match (see armorLinksCallback).
	return preg_replace_callback(
		'/((?i)' . wfUrlProtocols() . ')/',
		array( 'Sanitizer', 'armorLinksCallback' ),
		$encValue );
}
1101 
/**
 * Converts arbitrary text into a string usable as an HTML id.
 *
 * With $wgExperimentalHtmlIds enabled (and 'legacy' not requested), runs of
 * whitespace, underscores, quotes, "&", "#" and "%" collapse to a single
 * underscore and leading/trailing underscores are trimmed. Otherwise the
 * text is URL-encoded with "%" rewritten to "." and "%3A" back to ":", and
 * an "x" is prepended when the result does not start with a letter.
 *
 * @param string $id Text to escape
 * @param string|array $options 'noninitial' skips the leading-letter fix;
 *   'legacy' forces the URL-encoding form
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	// Read below; without this declaration it is an undefined local.
	global $wgExperimentalHtmlIds;
	$options = (array)$options;

	$id = Sanitizer::decodeCharReferences( $id );

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
		$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
		$id = trim( $id, '_' );
		// Must have been all whitespace to start with if nothing is left.
		return $id === '' ? '_' : $id;
	}

	// HTML4-style escaping
	static $replace = array(
		'%3A' => ':',
		'%' => '.'
	);

	$id = urlencode( strtr( $id, ' ', '_' ) );
	$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

	if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
1165 
/**
 * Escapes a whitespace-separated list of id references, such as the value
 * of aria-labelledby, by running every token through escapeId().
 *
 * @param string $referenceString Space-delimited list of ids
 * @param string|array $options Passed through to escapeId()
 * @return string The escaped ids joined by single spaces; '' for an empty list
 */
static function escapeIdReferenceList( $referenceString, $options = array() ) {
	# Explode the space delimited list string into an array of tokens
	$references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	// Indexed assignment rather than a by-reference loop, so no reference
	// to the last element is left dangling after the loop.
	foreach ( $references as $index => $ref ) {
		$references[$index] = Sanitizer::escapeId( $ref, $options );
	}

	# Merge the array back to a space delimited list string
	# If the array is empty, the result will be an empty string ('')
	return implode( ' ', $references );
}
1198 
/**
 * Converts arbitrary text into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters and spaces, ASCII
 * punctuation and the non-breaking space are replaced by underscores;
 * runs of underscores are collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	$patterns = array(
		// Convert ugly stuff to underscores...
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		// ...and kill underscores in ugly places
		'/_+/',
	);

	return rtrim( preg_replace( $patterns, '_', $class ), '_' );
}
1217 
/**
 * Escapes text for HTML output while letting character references written
 * in the input come out as the characters they stand for.
 *
 * @param string $html
 * @return string Text escaped with htmlspecialchars( ..., ENT_QUOTES )
 */
static function escapeHtmlAllowEntities( $html ) {
	// NOTE(review): one statement was missing here. Decoding references
	// first is what keeps "&amp;" from turning into "&amp;amp;" below;
	// confirm this matches the intended behaviour.
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
1232 
/**
 * preg_replace_callback handler used by safeEncodeAttribute(): rewrites
 * every ":" in the first capture group as the character reference "&#58;".
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return str_replace( ':', '&#58;', $protocol );
}
1241 
/**
 * Parses the raw attribute text of a tag into a name => value map.
 *
 * Names are lower-cased. In each value, runs of whitespace collapse to one
 * space, surrounding whitespace is trimmed, and character references are
 * decoded. When a name occurs more than once, the last occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value; empty when nothing matches
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Picks whichever of the quoted/unquoted capture groups matched.
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
1278 
/**
 * Serializes an attribute map back into tag text, encoding names with
 * htmlspecialchars() and values with safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value map
 * @return string ' name="value" ...' with a leading space, or '' when the
 *   map is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$attribs = array();
	foreach ( $assoc_array as $attribute => $value ) {
		$attribs[] = htmlspecialchars( $attribute )
			. '="' . Sanitizer::safeEncodeAttribute( $value ) . '"';
	}

	if ( !count( $attribs ) ) {
		return '';
	}
	return ' ' . implode( ' ', $attribs );
}
1296 
/**
 * Picks the attribute value out of one match set produced by the
 * getAttribsRegex() pattern.
 *
 * Groups are checked from the last to the first because trailing groups
 * that did not take part in the match are absent from the set.
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string The value, or '' for an attribute given without a value
 * @throws MWException When a value part matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	// 5 = unquoted, 4 = single-quoted, 3 = double-quoted
	foreach ( array( 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1324 
/**
 * Replaces each CR+LF pair, and each single space, CR, LF or tab, with one
 * space character. Runs are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
1335 
1345  return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1346  }
1347 
/**
 * Rewrites every character reference and bare ampersand in the text by
 * passing each CHAR_REFS_REGEX match to normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1369 
/**
 * preg_replace_callback handler for normalizeCharReferences().
 *
 * Dispatches on which CHAR_REFS_REGEX group matched: 1 = named entity,
 * 2 = decimal reference, 3 = hexadecimal reference. When no group matched
 * (a bare ampersand) or the helper rejected the reference, the original
 * text is HTML-escaped instead.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	}
	if ( is_null( $ret ) ) {
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
1389 
/**
 * Normalizes a named entity.
 *
 * Known aliases become their canonical named entity; lt, gt, amp and quot
 * are kept as named entities; other names found in the entity table become
 * decimal character references; anything else is returned with its
 * ampersand escaped.
 *
 * @param string $name Entity name without the "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}
	// Strict comparison: only these exact strings stay as named entities.
	if ( in_array( $name, array( 'lt', 'gt', 'amp', 'quot' ), true ) ) {
		return "&$name;";
	}
	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}
	return "&amp;$name;";
}
1411 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null Canonical "&#N;" form, or null when
 *  validateCodepoint() rejects the codepoint
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1424 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null Canonical lower-case "&#xN;" form, or null when
 *  validateCodepoint() rejects the codepoint
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1437 
/**
 * Tell whether a codepoint is accepted as a character reference target.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		# Tab and line feed are the only accepted codepoints below U+0020
		return true;
	}

	# Inclusive [first, last] ranges of accepted codepoints
	$accepted = array(
		array( 0x20, 0x7e ),
		array( 0xa0, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);

	foreach ( $accepted as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1455 
/**
 * Run every character reference (and every bare ampersand) found in the
 * text through decodeCharReferencesCallback() and return the result.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1469 
/**
 * Decode character references like decodeCharReferences() does, then pass
 * the result through the content language's normalize() method.
 *
 * Normalization is skipped when the regex replaced nothing, because the
 * text is then returned exactly as it was given.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	# Without this declaration $wgContLang is an undefined local variable
	# and the normalize() call below fails.
	global $wgContLang;

	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
1493 
/**
 * preg_replace_callback() handler for decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * CHAR_REFS_REGEX captures a named entity in group 1, a decimal reference
 * in group 2 and a hexadecimal reference in group 3.
 *
 * @param array $matches
 * @return string The decoded text, or the unchanged match when none of the
 *  three groups captured anything
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
1509 
/**
 * Return the UTF-8 string for a codepoint if validateCodepoint() accepts
 * it, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	if ( Sanitizer::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Utils::codepointToUtf8( $codepoint );
	} else {
		# UTF-8 encoding of U+FFFD. Returning nothing here would make the
		# replace callback substitute an empty string and silently drop
		# the invalid reference.
		# NOTE(review): presumably identical to the UtfNormal
		# UTF8_REPLACEMENT constant - confirm and reuse it if so.
		return "\xef\xbf\xbd";
	}
}
1524 
/**
 * Decode a named entity to its UTF-8 character.
 *
 * An alias is first resolved to the entity it stands for. A name that is
 * not in self::$htmlEntities is returned re-wrapped as "&name;" (with the
 * alias already resolved).
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$canonical = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$canonical] );
}
1543 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Allowed attribute names; empty when the element has no
 *  entry in the whitelist
 */
static function attributeWhitelist( $element ) {
	# $list was never assigned, so the lookup below could not find anything
	$list = Sanitizer::setupAttributeWhitelist();

	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1556 
/**
 * For each allowed HTML element (array key), return the array of allowed
 * attribute names.
 *
 * The result is cached in a static variable and rebuilt only when the
 * combination of $wgAllowRdfaAttributes and $wgAllowMicrodataAttributes
 * differs from the one the cache was built with.
 *
 * @return array Map of element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	# Without this declaration both settings are undefined locals: compact()
	# finds nothing and the RDFa/Microdata branches below never run.
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $whitelist, $staticInitialised;

	$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	if ( $whitelist !== null && $staticInitialised == $globalContext ) {
		return $whitelist;
	}

	$common = array(
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',
	);

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		$common = array_merge( $common, array(
			'about', 'property', 'resource', 'datatype', 'typeof',
		) );
	}

	if ( $wgAllowMicrodataAttributes ) {
		# add HTML5 microdata tags as specified by
		# http://www.whatwg.org/html/microdata.html#the-microdata-model
		$common = array_merge( $common, array(
			'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
		) );
	}

	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array(
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		'q' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array_merge( $common, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start', 'reversed' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			) ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span' ) ),
		'col' => array_merge( $common, array( 'span' ) ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'width' ) ),

		# HTML Ruby annotation text module, simple ruby only.
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # array_merge( $common, array( 'rbspan' ) ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $common, array( 'value' ) ),
		'time' => array_merge( $common, array( 'datetime' ) ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	# Remember which settings combination this cache was built for
	$staticInitialised = $globalContext;

	return $whitelist;
}
1779 
/**
 * Remove every <...> delimited span from the text, then decode character
 * references and turn whitespace characters into plain spaces.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities and whitespace
	return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
1800 
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry
 * of self::$htmlEntities as an entity mapped to its numeric reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = array();
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	# The declarations are concatenated without any separator
	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1818 
/**
 * Clean up a URL: decode character references, percent-encode characters
 * that must not appear raw, and tidy the host portion.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ), $url );

	# Split into protocol, optional "//host" and the rest; a string that
	# does not look like "protocol:..." is returned as it stands.
	$parts = array();
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// http://tools.ietf.org/html/3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$strip = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	// The cheap prefix comparison runs first so the regex is only tried on
	// hosts that start with an encoded '['.
	$ipv6 = array();
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1872 
/**
 * preg_replace_callback() handler for cleanUrl(): URL-encode the matched
 * text.
 *
 * @param array $matches
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1880 
/**
 * Check whether a string looks like an e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; when it returns false, the
 * $result value it set is returned and the built-in check is skipped.
 *
 * @param string $addr E-mail address to check
 * @return bool|null Result of the built-in regex check, or whatever the
 *  hook left in $result
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// Both strings below end up inside a [] character class, where a bare
	// hyphen "-" would act as a range indicator. That is why the hyphen
	// is double backslashed. See bug 26948
	$atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$ldhStr = "a-z0-9\\-";

	$pattern = "/
		^ # start of string
		[{$atext}\\.]+ # user part which is liberal :p
		@ # 'apostrophe'
		[{$ldhStr}]+ # First domain part
		(\\.[{$ldhStr}]+)* # Following part prefixed with a dot
		$ # End of string
		/ix"; // case Insensitive, eXtended

	return (bool)preg_match( $pattern, $addr );
}
1932 }
utf8ToCodepoint($char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
#define the
table suitable for use with IDatabase::select()
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
Definition: deferred.txt:11
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
static decCharReference($codepoint)
Definition: Sanitizer.php:1416
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1769
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1250
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1400
the array() calling protocol came about after MediaWiki 1.4rc1.
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
static safeEncodeTagAttributes($assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1286
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:1374
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:311
static removeHTMLtags($text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
Definition: Sanitizer.php:454
static setupAttributeWhitelist()
For each array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1562
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1769
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content.The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content.These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text.All manipulation and analysis of page content must be done via the appropriate methods of the Content object.For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers.The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id).Also Title, WikiPage and Revision now have getContentHandler() methods for convenience.ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page.ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type.However, it is recommended to instead use WikiPage::getContent() resp.Revision::getContent() to get a page's content as a Content object.These two methods should be the ONLY way in which page content is accessed.Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides().This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based.Objects implementing the Content interface are used to represent and handle the content internally.For storage and data exchange, each content model supports at least one serialization format via 
ContentHandler::serializeContent($content).The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats().Content serialization formats are identified using MIME type like strings.The following formats are built in:*text/x-wiki-wikitext *text/javascript-for js pages *text/css-for css pages *text/plain-for future use, e.g.with plain text messages.*text/html-for future use, e.g.with plain html messages.*application/vnd.php.serialized-for future use with the api and for extensions *application/json-for future use with the api, and for use by extensions *application/xml-for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant.Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly.Without that information, interpretation of the provided content is not reliable.The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export.Also note that the API will provide encapsulated, serialized content-so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure.Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content.However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page's content model, and will now generate warnings when used.Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() is 
deprecated in favor Revisions::getContent()*WikiPage::getText() is deprecated in favor WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superceded by Article::getContentObject().However, both methods should be avoided since they do not provide clean access to the page's actual content.For instance, they may return a system message for non-existing pages.Use WikiPage::getContent() instead.Code that relies on a textual representation of the page content should eventually be rewritten.However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page.Its behavior is controlled by $wgContentHandlerTextFallback it
static isEnabled()
Definition: MWTidy.php:92
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1498
static cssDecodeCallback($matches)
Definition: Sanitizer.php:998
static getRecognizedTagData($extratags=array(), $removetags=array())
Return the various lists of recognized tags.
Definition: Sanitizer.php:364
null for the local wiki Added in
Definition: hooks.txt:1389
$value
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by similarly to how extensions are installed You can then make that skin the default by adding
Definition: skin.txt:57
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1210
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1810
static cleanUrl($url)
Definition: Sanitizer.php:1823
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:1790
when a variable name is used in a it is silently declared as a new local masking the global
Definition: design.txt:93
const UTF8_REPLACEMENT
static hexCharReference($codepoint)
Definition: Sanitizer.php:1429
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and local administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:715
static normalizeWhitespace($text)
Definition: Sanitizer.php:1329
Apache License January http
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message.Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item.Return false to stop further processing of the tag $reader:XMLReader object $logInfo:Array of information 'ImportHandlePageXMLTag':When parsing a XML tag in a page.Return false to stop further processing of the tag $reader:XMLReader object &$pageInfo:Array of information 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision.Return false to stop further processing of the tag $reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information 'ImportHandleToplevelXMLTag':When parsing a top level XML tag.Return false to stop further processing of the tag $reader:XMLReader object 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload.Return false to stop further processing of the tag $reader:XMLReader object $revisionInfo:Array of information 'ImportLogInterwikiLink':Hook to change the interwiki link used in log entries and edit summaries for transwiki imports.&$fullInterwikiPrefix:Interwiki prefix, may contain colons.&$pageTitle:String that contains page title. 'ImportSources':Called when reading from the $wgImportSources configuration variable.Can be used to lazy-load the import sources list.&$importSources:The value of $wgImportSources.Modify as necessary.See the comment in DefaultSettings.php for the detail of how to structure this array. 
'InfoAction':When building information to display on the action=info page.$context:IContextSource object &$pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect.&$title:Title object for the current page &$request:WebRequest &$ignoreRedirect:boolean to skip redirect check &$target:Title/string of redirect target &$article:Article object 'InternalParseBeforeLinks':during Parser's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InternalParseBeforeSanitize':during Parser's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings.Ideal for syntax-extensions after template/parser function execution which respect nowiki and HTML-comments.&$parser:Parser object &$text:string containing partially parsed text &$stripState:Parser's internal StripState object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not.Return true without providing an interwiki to continue interwiki search.$prefix:interwiki prefix we are looking for.&$iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 
'InvalidateEmailComplete':Called after a user's email has been invalidated successfully.$user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification.Callee may modify $url and $query, URL will be constructed as $url.$query &$url:URL to index.php &$query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) &$article:article(object) being checked 'IsTrustedProxy':Override the result of IP::isTrustedProxy() &$ip:IP being check &$result:Change this value to override the result of IP::isTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from &$allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of Sanitizer::validateEmail(), for instance to return false if the domain name doesn't match your organization.$addr:The e-mail address entered by the user &$result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user &$result:Set this and return false to override the internal checks $user:User the password is being validated for 'Language::getMessagesFileName':$code:The language code or the language we're looking for a messages file for &$file:The messages file path, you can override this to change the location. 
'LanguageGetMagic':DEPRECATED!Use $magicWords in a file listed in $wgExtensionMessagesFiles instead.Use this to define synonyms of magic words depending of the language &$magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces.Do not use this hook to add namespaces.Use CanonicalNamespaces for that.&$namespaces:Array of namespaces indexed by their numbers 'LanguageGetSpecialPageAliases':DEPRECATED!Use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead.Use to define aliases of special pages names depending of the language &$specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names.&$names:array of language code=> language name $code:language of the preferred translations 'LanguageLinks':Manipulate a page's language links.This is called in various places to allow extensions to define the effective language links for a page.$title:The page's Title.&$links:Associative array mapping language codes to prefixed links of the form"language:title".&$linkFlags:Associative array mapping prefixed links to arrays of flags.Currently unused, but planned to provide support for marking individual language links in the UI, e.g.for featured articles. 'LanguageSelector':Hook to change the language selector available on a page.$out:The output page.$cssClassName:CSS class name of the language selector. 'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts.Return false to skip default processing and return $ret.See documentation for Linker::link() for details on the expected meanings of parameters.$skin:the Skin object $target:the Title that the link is pointing to &$html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1767
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable from
it sets a lot of them automatically from query and such
Definition: design.txt:93
if($line===false) $args
Definition: cdb.php:64
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:1463
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1444
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:56
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec http://www.w3.org/TR/html5/syntax.html#tag-open-state.
Definition: Sanitizer.php:46
Unicode normalization routines for working with UTF-8 strings.
Definition: UtfNormal.php:48
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1517
static normalizeSectionNameWhitespace($section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(), for use in the id's that are used for section links.
Definition: Sanitizer.php:1344
Some quick notes on the file repository architecture Functionality is
Definition: README:3
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types as usual *javascript user provided javascript code *json simple implementation for use by extensions
static $htmlEntities
List of all named character entities defined in HTML 4.01 (http://www.w3.org/TR/html4/sgml/entities.html), as well as &apos;, which is only defined starting in XHTML1.
Definition: Sanitizer.php:64
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1550
namespace and then decline to actually register it file or subcat img or subcat RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions as context called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content as context as context $options
Definition: hooks.txt:975
static escapeHtmlAllowEntities($html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1225
static validateTag($params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:679
static mergeAttributes($a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:841
MediaWiki exception.
Definition: MWException.php:26
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:36
static run($event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:131
$params
static validateAttributes($attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
Definition: Sanitizer.php:735
static decodeCharReferencesAndNormalize($text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1480
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
static escapeIdReferenceList($referenceString, $options=array())
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1183
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable or merely the Work and Derivative Works thereof Contribution shall mean any work of including the original version of the Work and any modifications or additions to that Work or Derivative Works that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner For the purposes of this submitted means any form of or written communication sent to the Licensor or its including but not limited to communication on electronic mailing source code control and issue tracking systems that are managed by
static escapeId($id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it. ...
Definition: Sanitizer.php:1133
static cleanUrlCallback($matches)
Definition: Sanitizer.php:1877
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
usually copyright or history_copyright This message must be in HTML not wikitext if the section is included from a template $section
Definition: hooks.txt:2684
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1238
wfUrlProtocols($includeProtocolRelative=true)
Returns a regular expression of url protocols.
static normalizeCss($value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:863
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:57
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:323
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1363
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
Bar style
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition: design.txt:56
$wgExperimentalHtmlIds
Should we allow a broader set of characters in id attributes, per HTML5? If not, use only HTML 4-comp...
$count
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
Definition: Sanitizer.php:1038
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:633
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:339
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
static validateEmail($addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1909
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:971
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1305
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
maintenance dev scripts can help quickly setup a local MediaWiki for development purposes Wikis setup in this way are NOT meant to be publicly available They use a development database not acceptable for use in production Place a sqlite database in an unsafe location a real wiki should never place it in And use predictable default logins for the initial administrator user Running maintenance dev install sh will download and install a local copy of php
Definition: README:5
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1533
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
Definition: Sanitizer.php:1075
$wgAllowRdfaAttributes
Enabled RDFa attributes for use in wikitext.
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1972
$wgAllowMicrodataAttributes
Enabled HTML5 microdata attributes for use in wikitext.
PHP Parser - Processes wiki markup (which uses a more user-friendly syntax, such as "[[link]]" for ma...
Definition: Parser.php:67
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1769
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:331
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1054
$matches