MediaWiki  1.23.5
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
/**
 * Regular expression matching the character references handled by
 * normalizeCharReferences() and decodeCharReferences().
 *
 * Capture groups (the callbacks rely on this order):
 *   1. named entity, e.g. "amp" in "&amp;"
 *   2. decimal reference, e.g. "160" in "&#160;"
 *   3. hexadecimal reference, e.g. "a0" in "&#xa0;"
 *   4. a bare ampersand that starts no valid reference
 *
 * The /x modifier makes the line breaks and indentation inside the
 * pattern insignificant.
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';
41 
/**
 * Case-insensitive pattern that flags "javascript" or "vbscript" when the
 * word appears at the start of the value, after whitespace, or after a
 * closing comment marker, and is followed by a non-word character or the
 * end of the value. validateAttributes() uses it to drop suspicious values.
 */
50  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/**
 * Pattern matching attribute names of the form "xmlns:<prefix>"; used by
 * validateAttributes() to recognise XML namespace declarations.
 */
51  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
52 
/**
 * Map of named character entity => decimal Unicode code point.
 * Keys are case-sensitive (e.g. 'Aacute' and 'aacute' are distinct).
 * Read by normalizeEntity() and decodeEntity().
 */
58  private static $htmlEntities = array(
59  'Aacute' => 193,
60  'aacute' => 225,
61  'Acirc' => 194,
62  'acirc' => 226,
63  'acute' => 180,
64  'AElig' => 198,
65  'aelig' => 230,
66  'Agrave' => 192,
67  'agrave' => 224,
68  'alefsym' => 8501,
69  'Alpha' => 913,
70  'alpha' => 945,
71  'amp' => 38,
72  'and' => 8743,
73  'ang' => 8736,
74  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
75  'Aring' => 197,
76  'aring' => 229,
77  'asymp' => 8776,
78  'Atilde' => 195,
79  'atilde' => 227,
80  'Auml' => 196,
81  'auml' => 228,
82  'bdquo' => 8222,
83  'Beta' => 914,
84  'beta' => 946,
85  'brvbar' => 166,
86  'bull' => 8226,
87  'cap' => 8745,
88  'Ccedil' => 199,
89  'ccedil' => 231,
90  'cedil' => 184,
91  'cent' => 162,
92  'Chi' => 935,
93  'chi' => 967,
94  'circ' => 710,
95  'clubs' => 9827,
96  'cong' => 8773,
97  'copy' => 169,
98  'crarr' => 8629,
99  'cup' => 8746,
100  'curren' => 164,
101  'dagger' => 8224,
102  'Dagger' => 8225,
103  'darr' => 8595,
104  'dArr' => 8659,
105  'deg' => 176,
106  'Delta' => 916,
107  'delta' => 948,
108  'diams' => 9830,
109  'divide' => 247,
110  'Eacute' => 201,
111  'eacute' => 233,
112  'Ecirc' => 202,
113  'ecirc' => 234,
114  'Egrave' => 200,
115  'egrave' => 232,
116  'empty' => 8709,
117  'emsp' => 8195,
118  'ensp' => 8194,
119  'Epsilon' => 917,
120  'epsilon' => 949,
121  'equiv' => 8801,
122  'Eta' => 919,
123  'eta' => 951,
124  'ETH' => 208,
125  'eth' => 240,
126  'Euml' => 203,
127  'euml' => 235,
128  'euro' => 8364,
129  'exist' => 8707,
130  'fnof' => 402,
131  'forall' => 8704,
132  'frac12' => 189,
133  'frac14' => 188,
134  'frac34' => 190,
135  'frasl' => 8260,
136  'Gamma' => 915,
137  'gamma' => 947,
138  'ge' => 8805,
139  'gt' => 62,
140  'harr' => 8596,
141  'hArr' => 8660,
142  'hearts' => 9829,
143  'hellip' => 8230,
144  'Iacute' => 205,
145  'iacute' => 237,
146  'Icirc' => 206,
147  'icirc' => 238,
148  'iexcl' => 161,
149  'Igrave' => 204,
150  'igrave' => 236,
151  'image' => 8465,
152  'infin' => 8734,
153  'int' => 8747,
154  'Iota' => 921,
155  'iota' => 953,
156  'iquest' => 191,
157  'isin' => 8712,
158  'Iuml' => 207,
159  'iuml' => 239,
160  'Kappa' => 922,
161  'kappa' => 954,
162  'Lambda' => 923,
163  'lambda' => 955,
164  'lang' => 9001,
165  'laquo' => 171,
166  'larr' => 8592,
167  'lArr' => 8656,
168  'lceil' => 8968,
169  'ldquo' => 8220,
170  'le' => 8804,
171  'lfloor' => 8970,
172  'lowast' => 8727,
173  'loz' => 9674,
174  'lrm' => 8206,
175  'lsaquo' => 8249,
176  'lsquo' => 8216,
177  'lt' => 60,
178  'macr' => 175,
179  'mdash' => 8212,
180  'micro' => 181,
181  'middot' => 183,
182  'minus' => 8722,
183  'Mu' => 924,
184  'mu' => 956,
185  'nabla' => 8711,
186  'nbsp' => 160,
187  'ndash' => 8211,
188  'ne' => 8800,
189  'ni' => 8715,
190  'not' => 172,
191  'notin' => 8713,
192  'nsub' => 8836,
193  'Ntilde' => 209,
194  'ntilde' => 241,
195  'Nu' => 925,
196  'nu' => 957,
197  'Oacute' => 211,
198  'oacute' => 243,
199  'Ocirc' => 212,
200  'ocirc' => 244,
201  'OElig' => 338,
202  'oelig' => 339,
203  'Ograve' => 210,
204  'ograve' => 242,
205  'oline' => 8254,
206  'Omega' => 937,
207  'omega' => 969,
208  'Omicron' => 927,
209  'omicron' => 959,
210  'oplus' => 8853,
211  'or' => 8744,
212  'ordf' => 170,
213  'ordm' => 186,
214  'Oslash' => 216,
215  'oslash' => 248,
216  'Otilde' => 213,
217  'otilde' => 245,
218  'otimes' => 8855,
219  'Ouml' => 214,
220  'ouml' => 246,
221  'para' => 182,
222  'part' => 8706,
223  'permil' => 8240,
224  'perp' => 8869,
225  'Phi' => 934,
226  'phi' => 966,
227  'Pi' => 928,
228  'pi' => 960,
229  'piv' => 982,
230  'plusmn' => 177,
231  'pound' => 163,
232  'prime' => 8242,
233  'Prime' => 8243,
234  'prod' => 8719,
235  'prop' => 8733,
236  'Psi' => 936,
237  'psi' => 968,
238  'quot' => 34,
239  'radic' => 8730,
240  'rang' => 9002,
241  'raquo' => 187,
242  'rarr' => 8594,
243  'rArr' => 8658,
244  'rceil' => 8969,
245  'rdquo' => 8221,
246  'real' => 8476,
247  'reg' => 174,
248  'rfloor' => 8971,
249  'Rho' => 929,
250  'rho' => 961,
251  'rlm' => 8207,
252  'rsaquo' => 8250,
253  'rsquo' => 8217,
254  'sbquo' => 8218,
255  'Scaron' => 352,
256  'scaron' => 353,
257  'sdot' => 8901,
258  'sect' => 167,
259  'shy' => 173,
260  'Sigma' => 931,
261  'sigma' => 963,
262  'sigmaf' => 962,
263  'sim' => 8764,
264  'spades' => 9824,
265  'sub' => 8834,
266  'sube' => 8838,
267  'sum' => 8721,
268  'sup' => 8835,
269  'sup1' => 185,
270  'sup2' => 178,
271  'sup3' => 179,
272  'supe' => 8839,
273  'szlig' => 223,
274  'Tau' => 932,
275  'tau' => 964,
276  'there4' => 8756,
277  'Theta' => 920,
278  'theta' => 952,
279  'thetasym' => 977,
280  'thinsp' => 8201,
281  'THORN' => 222,
282  'thorn' => 254,
283  'tilde' => 732,
284  'times' => 215,
285  'trade' => 8482,
286  'Uacute' => 218,
287  'uacute' => 250,
288  'uarr' => 8593,
289  'uArr' => 8657,
290  'Ucirc' => 219,
291  'ucirc' => 251,
292  'Ugrave' => 217,
293  'ugrave' => 249,
294  'uml' => 168,
295  'upsih' => 978,
296  'Upsilon' => 933,
297  'upsilon' => 965,
298  'Uuml' => 220,
299  'uuml' => 252,
300  'weierp' => 8472,
301  'Xi' => 926,
302  'xi' => 958,
303  'Yacute' => 221,
304  'yacute' => 253,
305  'yen' => 165,
306  'Yuml' => 376,
307  'yuml' => 255,
308  'Zeta' => 918,
309  'zeta' => 950,
310  'zwj' => 8205,
311  'zwnj' => 8204
312  );
313 
/**
 * Alternative entity names mapped to the canonical name in $htmlEntities.
 * Both entries (one in Hebrew script, one in Arabic script) resolve to
 * 'rlm'. Consulted first by normalizeEntity() and decodeEntity().
 */
317  private static $htmlEntityAliases = array(
318  'רלמ' => 'rlm',
319  'رلم' => 'rlm',
320  );
321 
/**
 * Cache for the attribute-matching regex; stays null until
 * getAttribsRegex() builds it on first use.
 */
325  private static $attribsRegex;
326 
/**
 * Return the regular expression used to pull attribute name/value pairs
 * out of the text between a tag name and its closing bracket.
 *
 * The pattern is built once and memoised in self::$attribsRegex.
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
 * 6 = unquoted "#hex" colour value.
 *
 * @return string PCRE pattern (uses the /s and /x modifiers)
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$nameStart = '[:A-Z_a-z0-9]';
	$nameChar = '[:A-Z_a-z-.0-9]';
	$ws = '[\x09\x0a\x0d\x20]';

	// Whitespace and "#" comments inside the pattern are ignored thanks to /x.
	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
		($ws*=$ws*
		(?:
		# The attribute value: quoted or alone
		\"([^<\"]*)\"
		| '([^<']*)'
		| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
		# colors are specified like this.
		# We'll be normalizing it.
		)
		)?(?=$ws|\$)/sx";

	return self::$attribsRegex;
}
353 
/**
 * Escape every tag that is not on the allowed list, strip HTML comments,
 * and filter the attributes of the tags that are kept.
 *
 * When $wgUseTidy is off the method also balances tags itself with a stack
 * of open elements; when it is on, only the allow-list and attribute
 * filtering are applied.
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback If callable, invoked as
 *   ( &$params, $args ) on the raw attribute text of each accepted tag
 * @param mixed $args Passed through to $processCallback
 * @param array $extratags Additional tag names to allow (treated as paired tags)
 * @param array $removetags Tag names to remove from the allowed set
 * @return string
 */
366  static function removeHTMLtags( $text, $processCallback = null,
367  $args = array(), $extratags = array(), $removetags = array()
368  ) {
369  global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
370 
# The tag tables are built once and kept in function-static variables.
371  static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
372  $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
373 
374  wfProfileIn( __METHOD__ );
375 
376  // Base our staticInitialised variable off of the global config state so that if the globals
377  // are changed (like in the screwed up test system) we will re-initialise the settings.
378  $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
379  if ( !$staticInitialised || $staticInitialised != $globalContext ) {
380 
381  $htmlpairsStatic = array( # Tags that must be closed
382  'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
383  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
384  'strike', 'strong', 'tt', 'var', 'div', 'center',
385  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
386  'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
387  'kbd', 'samp', 'data', 'time', 'mark'
388  );
389  $htmlsingle = array(
390  'br', 'wbr', 'hr', 'li', 'dt', 'dd'
391  );
392  $htmlsingleonly = array( # Elements that cannot have close tags
393  'br', 'wbr', 'hr'
394  );
395  if ( $wgAllowMicrodataAttributes ) {
396  $htmlsingle[] = $htmlsingleonly[] = 'meta';
397  $htmlsingle[] = $htmlsingleonly[] = 'link';
398  }
399  $htmlnest = array( # Tags that can be nested--??
400  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
401  'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
402  'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
403  );
404  $tabletags = array( # Can only appear inside table, we will close them
405  'td', 'th', 'tr',
406  );
407  $htmllist = array( # Tags used by list
408  'ul', 'ol',
409  );
410  $listtags = array( # Tags that can appear in a list
411  'li',
412  );
413 
414  if ( $wgAllowImageTag ) {
415  $htmlsingle[] = 'img';
416  $htmlsingleonly[] = 'img';
417  }
418 
419  $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
420  $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
421 
422  # Convert them all to hashtables for faster lookup
423  $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
424  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
425  foreach ( $vars as $var ) {
426  $$var = array_flip( $$var );
427  }
428  $staticInitialised = $globalContext;
429  }
430  # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
431  $extratags = array_flip( $extratags );
432  $removetags = array_flip( $removetags );
433  $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
434  $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
435 
436  # Remove HTML comments
437  $text = Sanitizer::removeHTMLcomments( $text );
# Split on '<': every element after the first starts where a tag could
# begin. The leading chunk is plain text, so only its '>' needs escaping.
438  $bits = explode( '<', $text );
439  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
440  if ( !$wgUseTidy ) {
# $tagstack holds the currently open elements; $tablestack saves the
# enclosing $tagstack each time a <table> is opened.
441  $tagstack = $tablestack = array();
442  foreach ( $bits as $x ) {
443  $regs = array();
444  # $slash: Does the current element start with a '/'?
445  # $t: Current element name
446  # $params: String between element name and >
447  # $brace: Ending '>' or '/>'
448  # $rest: Everything until the next element of $bits
449  if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
450  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
451  } else {
452  $slash = $t = $params = $brace = $rest = null;
453  }
454 
455  $badtag = false;
456  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
457  # Check our stack
458  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
459  $badtag = true;
460  } elseif ( $slash ) {
461  # Closing a tag... is it the one we just opened?
462  $ot = @array_pop( $tagstack );
463  if ( $ot != $t ) {
464  if ( isset( $htmlsingleallowed[$ot] ) ) {
465  # Pop all elements with an optional close tag
466  # and see if we find a match below them
467  $optstack = array();
468  array_push( $optstack, $ot );
470  $ot = array_pop( $tagstack );
472  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
473  array_push( $optstack, $ot );
475  $ot = array_pop( $tagstack );
477  }
478  if ( $t != $ot ) {
479  # No match. Push the optional elements back again
480  $badtag = true;
482  $ot = array_pop( $optstack );
484  while ( $ot ) {
485  array_push( $tagstack, $ot );
487  $ot = array_pop( $optstack );
489  }
490  }
491  } else {
492  @array_push( $tagstack, $ot );
493  # <li> can be nested in <ul> or <ol>, skip those cases:
494  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
495  $badtag = true;
496  }
497  }
498  } else {
# Closing tag matched the top of the stack. When leaving a table,
# restore the tag stack that was saved when the table was opened.
499  if ( $t == 'table' ) {
500  $tagstack = array_pop( $tablestack );
501  }
502  }
503  $newparams = '';
504  } else {
505  # Keep track for later
506  if ( isset( $tabletags[$t] ) &&
507  !in_array( 'table', $tagstack ) ) {
508  $badtag = true;
509  } elseif ( in_array( $t, $tagstack ) &&
510  !isset( $htmlnest[$t] ) ) {
511  $badtag = true;
512  # Is it a self closed htmlpair ? (bug 5487)
513  } elseif ( $brace == '/>' &&
514  isset( $htmlpairs[$t] ) ) {
515  $badtag = true;
516  } elseif ( isset( $htmlsingleonly[$t] ) ) {
517  # Hack to force empty tag for unclosable elements
518  $brace = '/>';
519  } elseif ( isset( $htmlsingle[$t] ) ) {
520  # Hack to not close $htmlsingle tags
521  $brace = null;
522  # Still need to push this optionally-closed tag to
523  # the tag stack so that we can match end tags
524  # instead of marking them as bad.
525  array_push( $tagstack, $t );
526  } elseif ( isset( $tabletags[$t] )
527  && in_array( $t, $tagstack ) ) {
528  // New table tag but forgot to close the previous one
529  $text .= "</$t>";
530  } else {
531  if ( $t == 'table' ) {
532  array_push( $tablestack, $tagstack );
533  $tagstack = array();
534  }
535  array_push( $tagstack, $t );
536  }
537 
538  # Replace any variables or template parameters with
539  # plaintext results.
540  if ( is_callable( $processCallback ) ) {
541  call_user_func_array( $processCallback, array( &$params, $args ) );
542  }
543 
544  if ( !Sanitizer::validateTag( $params, $t ) ) {
545  $badtag = true;
546  }
547 
548  # Strip non-approved attributes from the tag
549  $newparams = Sanitizer::fixTagAttributes( $params, $t );
550  }
551  if ( !$badtag ) {
552  $rest = str_replace( '>', '&gt;', $rest );
553  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
554  $text .= "<$slash$t$newparams$close>$rest";
555  continue;
556  }
557  }
# Not an allowed tag, or rejected above: emit the chunk as escaped text.
558  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
559  }
560  # Close off any remaining tags
561  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
562  $text .= "</$t>\n";
563  if ( $t == 'table' ) {
564  $tagstack = array_pop( $tablestack );
565  }
566  }
567  } else {
568  # this might be possible using tidy itself
569  foreach ( $bits as $x ) {
570  preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
571  $x, $regs );
# '@' silences the notices raised when the match failed and $regs is empty.
572  @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
573  $badtag = false;
574  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
575  if ( is_callable( $processCallback ) ) {
576  call_user_func_array( $processCallback, array( &$params, $args ) );
577  }
578 
579  if ( !Sanitizer::validateTag( $params, $t ) ) {
580  $badtag = true;
581  }
582 
583  $newparams = Sanitizer::fixTagAttributes( $params, $t );
584  if ( !$badtag ) {
585  $rest = str_replace( '>', '&gt;', $rest );
586  $text .= "<$slash$t$newparams$brace$rest";
587  continue;
588  }
589  }
590  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
591  }
592  }
593  wfProfileOut( __METHOD__ );
594  return $text;
595  }
596 
/**
 * Strip every terminated HTML comment from the text.
 *
 * When a comment (plus any surrounding spaces) sits between two newlines,
 * the comment, the spaces and one of the newlines are removed so that no
 * blank line is left behind. Otherwise only the comment itself is removed.
 * An unterminated comment stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );

	while ( true ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; nothing more to do
			break;
		}
		# Move past the closing marker
		$close += 3;

		# Grow a window around the comment over adjacent spaces
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( substr( $text, $left, 1 ) === ' ' && $left > 0 ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $left, 1 ) === "\n";
		if ( $newlineBefore && substr( $text, $left + $span, 1 ) === "\n" ) {
			# Comment is alone on its line(s): collapse the window and
			# both newlines into a single newline.
			$text = substr_replace( $text, "\n", $left, $span + 1 );
			continue;
		}

		# Inline comment: cut out only the comment.
		$text = substr_replace( $text, '', $open, $close - $open );
	}

	wfProfileOut( __METHOD__ );
	return $text;
}
642 
/**
 * Check whether a tag is acceptable given its attributes.
 *
 * Only <meta> and <link> carry extra requirements: both need an itemprop
 * attribute, <meta> additionally needs content, and <link> needs href.
 * Every other element passes.
 *
 * @param string $params Raw attribute text of the tag (the string between
 *   the element name and the closing bracket)
 * @param string $element Lower-case element name
 * @return bool True if the tag may be kept
 */
static function validateTag( $params, $element ) {
	// Callers hand over the raw attribute string; it has to be parsed into
	// a name => value map before individual attributes can be looked up.
	$params = Sanitizer::decodeTagAttributes( $params );

	if ( $element == 'meta' || $element == 'link' ) {
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $params['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $params['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
675 
/**
 * Filter an attribute map against the whitelist of the given element.
 *
 * Thin wrapper that looks up the element's allowed attributes with
 * attributeWhitelist() and delegates to validateAttributes().
 *
 * @param array $attribs Attribute name => value map
 * @param string $element Element name
 * @return array Filtered attribute map
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
695 
/**
 * Filter an attribute map against a whitelist, normalising or discarding
 * values that are not acceptable.
 *
 * - Attributes not on the whitelist are dropped, except names starting
 *   with "data-" and, when RDFa is enabled, xmlns:* declarations.
 * - 'style' values are passed through checkCss().
 * - 'id' values are passed through escapeId().
 * - 'role' is kept only when its value is "presentation".
 * - RDFa / microdata attributes matching EVIL_URI_PATTERN are dropped.
 * - 'href' / 'src' must start with a protocol from wfUrlProtocols().
 *
 * @param array $attribs Attribute name => value map
 * @param array $whitelist List of allowed attribute names
 * @return array Filtered attribute map
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		#allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# WAI-ARIA
		# http://www.w3.org/TR/wai-aria/
		# http://www.whatwg.org/html/elements.html#wai-aria
		# For now we only support role="presentation" until we work out what roles should be
		# usable by content and we ensure that our code explicitly rejects patterns that
		# violate HTML5's ARIA restrictions.
		if ( $attribute === 'role' && $value !== 'presentation' ) {
			continue;
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			//Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; //drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
796 
/**
 * Merge two attribute maps; values from $b win on key collisions.
 *
 * The 'class' attribute is the exception: when both maps carry a string
 * 'class' and the two differ, the class names of both are combined
 * (whitespace-separated, duplicates removed, first occurrence kept).
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}

	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "$first $second", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
819 
/**
 * Normalize a CSS value so that checkCss() sees what a browser would see.
 *
 * Steps, in order: decode character references, decode CSS backslash
 * escapes and line continuations, fold fullwidth and look-alike characters
 * to ASCII, strip comments (unless the value is a single comment), and
 * rewrite "s" + repeat mark to "ss".
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {

	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// The range is written with \x{...} escapes so the pattern stays correct
	// even if the source file is re-encoded or Unicode-normalized.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
		function ( $matches ) {
			$cp = utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0 # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
918 
919 
/**
 * Check a CSS value (e.g. the content of a style attribute) for
 * constructs that are considered unsafe.
 *
 * The value is normalized first so that escapes, character references and
 * look-alike characters cannot hide a keyword from the checks below.
 *
 * @param string $value
 * @return string The normalized value, or a CSS comment describing why
 *   the input was rejected
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		'! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
959 
/**
 * preg_replace_callback handler for the CSS escape regex in normalizeCss().
 *
 * @param array $matches 1 = line continuation, 2 = hex code point,
 *   3 = single escaped character; all empty means a trailing backslash
 * @return string Replacement text for the escape sequence
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// A backslash-newline pair simply disappears
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// Newline, quotes and backslash are unsafe inside CSS strings, so they
	// are emitted again as a canonical hex escape instead of being decoded.
	$keepEscaped = $decoded == "\n" || $decoded == '"' || $decoded == "'" || $decoded == '\\';

	return $keepEscaped
		? '\\' . dechex( ord( $decoded ) ) . ' '
		: $decoded;
}
984 
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is decoded into a map, filtered against the element's
 * whitelist, and re-encoded. Blank input yields an empty string.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name
 * @return string Encoded attributes (with a leading space) or ''
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	return Sanitizer::safeEncodeTagAttributes(
		Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ),
			$element
		)
	);
}
1014 
/**
 * Encode a value for use inside a quoted HTML attribute.
 *
 * HTML special characters (including both quote styles) are escaped, and
 * newline, carriage return and tab are written as numeric references so
 * that they are not collapsed when attributes are decoded again.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
	// The replacement strings contain none of the searched characters, so
	// replacing them one after another is safe.
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
1034 
/**
 * Encode an attribute value and additionally neutralise character
 * sequences that later parsing passes could expand (templates, links,
 * magic words, URL protocols).
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Stupid hack: break up URL protocols by encoding their colon
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored
	);
}
1067 
/**
 * Turn an arbitrary string into something usable as an id attribute.
 *
 * With $wgExperimentalHtmlIds on (and no 'legacy' option) whitespace and
 * a few punctuation characters are collapsed to single underscores.
 * Otherwise the id is URL-encoded with "%" written as "." and "%3A" as
 * ":", and an "x" is prefixed when the result does not start with an
 * ASCII letter, unless the 'noninitial' option is given.
 *
 * @param string $id
 * @param string|array $options 'noninitial' and/or 'legacy'
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	global $wgExperimentalHtmlIds;
	$flags = (array)$options;

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $flags ) ) {
		$decoded = Sanitizer::decodeCharReferences( $id );
		$collapsed = trim( preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $decoded ), '_' );
		# An empty result means the input consisted only of stripped characters
		return $collapsed === '' ? '_' : $collapsed;
	}

	# HTML4-style escaping
	$encoded = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
	# '%3A' is listed first so colons are restored before '%' becomes '.'
	$encoded = str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );

	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $encoded );
	if ( !$startsWithLetter && !in_array( 'noninitial', $flags ) ) {
		// Initial character must be a letter!
		$encoded = "x$encoded";
	}

	return $encoded;
}
1131 
/**
 * Turn an arbitrary string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, most ASCII
 * punctuation and non-breaking spaces become underscores; runs of
 * underscores are collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	$patterns = array(
		// Characters that are not wanted in a class name
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		// Collapse the underscores produced above
		'/_+/',
	);
	$cleaned = preg_replace( $patterns, '_', $class );

	return rtrim( $cleaned, '_' );
}
1150 
/**
 * Escape text for HTML output while letting character references that are
 * already present in the input pass through as the characters they denote.
 *
 * References are decoded first and the result is then escaped, so
 * "&amp;" ends up as "&amp;" rather than "&amp;amp;".
 *
 * @param string $html Text that may contain character references
 * @return string Escaped text
 */
static function escapeHtmlAllowEntities( $html ) {
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
1165 
/**
 * preg_replace_callback handler used by safeEncodeAttribute(): encodes
 * every colon in the matched protocol as a numeric character reference.
 *
 * @param array $matches $matches[1] is the matched protocol text
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return strtr( $protocol, array( ':' => '&#58;' ) );
}
1174 
/**
 * Parse the raw attribute text of a tag into a name => value map.
 *
 * Names are lower-cased. In values, runs of whitespace are collapsed to a
 * single space, leading/trailing whitespace is trimmed, and character
 * references are decoded. A later duplicate of an attribute overrides an
 * earlier one.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Pick whichever capture group holds the value (quoted, bare, ...)
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
1211 
/**
 * Build an attribute string from a name => value map.
 *
 * Names are HTML-escaped and values go through safeEncodeAttribute().
 * The result starts with a space so it can be appended directly after a
 * tag name; an empty map gives an empty string.
 *
 * @param array $assoc_array Attribute name => value
 * @return string
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pieces = array();
	foreach ( $assoc_array as $name => $rawValue ) {
		$pieces[] = htmlspecialchars( $name )
			. '="' . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
	}

	if ( !count( $pieces ) ) {
		return '';
	}
	return ' ' . implode( ' ', $pieces );
}
1229 
/**
 * Pick the attribute value out of one match set produced by the regex
 * from getAttribsRegex().
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string The attribute value, or the attribute name for an
 *   attribute written without a value
 * @throws MWException When none of the expected groups is present
 */
private static function getTagAttributeCallback( $set ) {
	# Highest group first: 6 = bare #colour, 5 = unquoted,
	# 4 = single-quoted, 3 = double-quoted.
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1259 
/**
 * Normalize an attribute value: character references are normalized,
 * each whitespace character becomes a plain space, and double quotes are
 * written as &quot;.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
1277 
/**
 * Replace each whitespace character (space, tab, CR, LF) with one space;
 * a CR LF pair counts as a single unit and yields one space.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
1288 
/**
 * Normalize whitespace in a section name: runs of spaces and underscores
 * become a single space, and leading/trailing whitespace is trimmed.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	return trim( preg_replace( '/[ _]+/', ' ', $section ) );
}
1300 
/**
 * Normalize every character reference in the text by running each match
 * of CHAR_REFS_REGEX through normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1322 
/**
 * preg_replace_callback handler for normalizeCharReferences().
 *
 * @param array $matches Match of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal reference, 3 = hexadecimal reference
 * @return string The normalized reference, or the HTML-escaped original
 *   text when it is a bare ampersand or an invalid numeric reference
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	}
	if ( is_null( $ret ) ) {
		// Bare "&" or a reference to an invalid code point: escape as text
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
1342 
/**
 * Normalize a named entity.
 *
 * Aliases are rewritten to their canonical named form; lt, gt, amp and
 * quot stay named; other known entities become decimal numeric
 * references; unknown names get their ampersand escaped.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		$canonical = self::$htmlEntityAliases[$name];
		return '&' . $canonical . ';';
	}

	if ( in_array( $name, array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		$codepoint = self::$htmlEntities[$name];
		return '&#' . $codepoint . ';';
	}

	return "&amp;$name;";
}
1365 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits
 * @return string|null "&#N;" for a valid code point, otherwise null
 */
static function decCharReference( $codepoint ) {
	$number = intval( $codepoint );
	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#%d;', $number )
		: null;
}
1378 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits
 * @return string|null "&#xN;" (lower-case hex) for a valid code point,
 *   otherwise null
 */
static function hexCharReference( $codepoint ) {
	$number = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#x%x;', $number )
		: null;
}
1391 
/**
 * Tell whether a code point may be emitted as a character reference.
 *
 * Accepted: tab, line feed, carriage return, 0x20-0xD7FF, 0xE000-0xFFFD
 * and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1405 
/**
 * Decode every character reference in the text by running each match of
 * CHAR_REFS_REGEX through decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1419 
/**
 * Decode character references like decodeCharReferences() and, if at
 * least one match of CHAR_REFS_REGEX was processed, pass the result
 * through the content language's normalize() method.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	global $wgContLang;
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		// Decoding may have produced text that needs normalizing
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
1443 
1449  if ( $matches[1] != '' ) {
1450  return Sanitizer::decodeEntity( $matches[1] );
1451  } elseif ( $matches[2] != '' ) {
1452  return Sanitizer::decodeChar( intval( $matches[2] ) );
1453  } elseif ( $matches[3] != '' ) {
1454  return Sanitizer::decodeChar( hexdec( $matches[3] ) );
1455  }
1456  # Last case should be an ampersand by itself
1457  return $matches[0];
1458  }
1459 
/**
 * Return the UTF-8 encoding of a code point, as produced by
 * codepointToUtf8(), or UTF8_REPLACEMENT when validateCodepoint()
 * rejects the value.
 *
 * @param int|float $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
1474 
/**
 * Decode a named entity to its UTF-8 character.
 *
 * The name is first resolved through self::$htmlEntityAliases. If the
 * resolved name is listed in self::$htmlEntities its code point is
 * encoded with codepointToUtf8(); otherwise the (resolved) entity is
 * returned in its "&name;" source form.
 *
 * @param string $name Entity name, without the leading "&" and trailing ";"
 * @return string
 */
static function decodeEntity( $name ) {
	$resolved = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$resolved] ) ) {
		return '&' . $resolved . ';';
	}

	return codepointToUtf8( self::$htmlEntities[$resolved] );
}
1493 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element Element name as used for the keys of
 *   setupAttributeWhitelist()
 * @return array List of attribute names; empty for an unlisted element
 */
static function attributeWhitelist( $element ) {
	// $list must be populated here; without it every lookup misses and
	// the method always returns an empty array.
	$list = self::setupAttributeWhitelist();

	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1506 
/**
 * Build the per-element whitelist of permitted attributes.
 *
 * The result is memoised in a function-static variable, keyed on the
 * current values of $wgAllowRdfaAttributes and
 * $wgAllowMicrodataAttributes, and rebuilt when either changes.
 *
 * @return array Map of element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $cachedWhitelist, $cachedContext;
	$currentContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	if ( isset( $cachedWhitelist ) && $cachedContext == $currentContext ) {
		return $cachedWhitelist;
	}

	# Attributes accepted on (nearly) every element
	$common = array(
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'role',
	);

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		foreach ( array( 'about', 'property', 'resource', 'datatype', 'typeof' ) as $rdfaAttrib ) {
			$common[] = $rdfaAttrib;
		}
	}

	if ( $wgAllowMicrodataAttributes ) {
		# HTML5 microdata attributes as specified by
		# http://www.whatwg.org/html/microdata.html#the-microdata-model
		foreach ( array( 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' ) as $microdataAttrib ) {
			$common[] = $microdataAttrib;
		}
	}

	# Building blocks shared by several elements below
	$alignable = array_merge( $common, array( 'align' ) );
	$cellAlignment = array( 'align', 'valign' );
	$cellAttribs = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);
	$citable = array_merge( $common, array( 'cite' ) );
	$edited = array_merge( $common, array( 'cite', 'datetime' ) );
	$spanning = array_merge( $common, array( 'span' ) );
	$tableCell = array_merge( $common, $cellAttribs, $cellAlignment );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$built = array(
		# 7.5.4
		'div' => $alignable,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $alignable,
		'h2' => $alignable,
		'h3' => $alignable,
		'h4' => $alignable,
		'h5' => $alignable,
		'h6' => $alignable,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $citable,
		'q' => $citable,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $alignable,

		# 9.3.2
		'br' => array_merge( $common, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => $edited,
		'del' => $edited,

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => array_merge( $common, array(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		) ),

		# 11.2.2
		'caption' => $alignable,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $spanning,
		'col' => $spanning,

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $cellAlignment ),

		# 11.2.6
		'td' => $tableCell,
		'th' => $tableCell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'width' ) ),

		# HTML Ruby annotation text module, simple ruby only.
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		# rtc
		'rb' => $common,
		'rt' => $common, # 'rbspan' is deliberately not allowed
		'rp' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $common, array( 'value' ) ),
		'time' => array_merge( $common, array( 'datetime' ) ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled, so no conditional is needed to hide them here.
		// They are also only valid in wikitext as Microdata elements
		// (validateTag rejects tags missing the attributes Microdata needs),
		// so the $common attributes would serve no purpose on them.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	# Remember the result together with the settings it was built for
	$cachedWhitelist = $built;
	$cachedContext = $currentContext;

	return $cachedWhitelist;
}
1724 
/**
 * Remove tags from a text fragment, then decode its character
 * references and normalize its whitespace.
 *
 * Tags are removed by deleting every "<" ... ">" delimited span via
 * StringUtils::delimiterReplace().
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities first, whitespace second
	return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
1745 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for each name in
 * self::$htmlEntities, defined as the matching decimal character
 * reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1763 
/**
 * Clean up a URL: decode character references, percent-encode a fixed
 * set of unsafe characters, and strip characters that are ignored in
 * IDNs from the host part.
 *
 * @param string $url
 * @return string The cleaned URL. If the URL does not have the shape
 *   "scheme:[//host]rest", only the first two steps are applied.
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s| # general whitespace
			\xc2\xad| # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
			/xuD";

		// With the /u modifier preg_replace() returns null when the
		// subject is not valid UTF-8. Keep the host as matched in that
		// case rather than silently dropping it from the URL.
		$strippedHost = preg_replace( $strip, '', $host );
		if ( $strippedHost !== null ) {
			$host = $strippedHost;
		}

		// @todo FIXME: Validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}
1810 
/**
 * Regex replace callback for cleanUrl(): percent-encode the matched text.
 *
 * @param array $matches Match array; only the whole match ([0]) is used
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$unsafe = $matches[0];

	return urlencode( $unsafe );
}
1818 
/**
 * Does a string look like an e-mail address?
 *
 * The 'isValidEmailAddr' hook runs first; if it aborts, the value the
 * hook stored in $result is returned as-is. Otherwise the address is
 * matched against a liberal pattern: one or more atext characters or
 * dots, an "@", then one or more dot-separated labels made of letters,
 * digits and hyphens.
 *
 * @param string $addr E-mail address
 * @return bool|mixed True or false from the built-in check, or whatever
 *   the hook supplied when it overrides the check
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// The strings below are placed inside brackets [], where a hyphen
	// "-" would act as a range indicator; hence it is double
	// backslashed. See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the
	// subject; without it an address followed by a newline is accepted.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1870 }
Sanitizer\normalizeAttributeValue
static normalizeAttributeValue( $text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
Definition: Sanitizer.php:1272
$result
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message. Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item. $reader:XMLReader object $logInfo:Array of information Return false to stop further processing of the tag 'ImportHandlePageXMLTag':When parsing a XML tag in a page. $reader:XMLReader object $pageInfo:Array of information Return false to stop further processing of the tag 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision. $reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information Return false to stop further processing of the tag 'ImportHandleToplevelXMLTag':When parsing a top level XML tag. $reader:XMLReader object Return false to stop further processing of the tag 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload. $reader:XMLReader object $revisionInfo:Array of information Return false to stop further processing of the tag 'InfoAction':When building information to display on the action=info page. $context:IContextSource object & $pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect. $title:Title object for the current page $request:WebRequest $ignoreRedirect:boolean to skip redirect check $target:Title/string of redirect target $article:Article object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not. Return true without providing an interwiki to continue interwiki search. $prefix:interwiki prefix we are looking for. & $iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 'InternalParseBeforeSanitize':during Parser 's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings. 
Ideal for syntax-extensions after template/parser function execution which respect nowiki and HTML-comments. & $parser:Parser object & $text:string containing partially parsed text & $stripState:Parser 's internal StripState object 'InternalParseBeforeLinks':during Parser 's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings. & $parser:Parser object & $text:string containing partially parsed text & $stripState:Parser 's internal StripState object 'InvalidateEmailComplete':Called after a user 's email has been invalidated successfully. $user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification. Callee may modify $url and $query, URL will be constructed as $url . $query & $url:URL to index.php & $query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) $article:article(object) being checked 'IsTrustedProxy':Override the result of wfIsTrustedProxy() $ip:IP being check $result:Change this value to override the result of wfIsTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from & $allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of User::isValidEmailAddr(), for instance to return false if the domain name doesn 't match your organization. 
$addr:The e-mail address entered by the user & $result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user & $result:Set this and return false to override the internal checks $user:User the password is being validated for 'Language::getMessagesFileName':$code:The language code or the language we 're looking for a messages file for & $file:The messages file path, you can override this to change the location. 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces. Do not use this hook to add namespaces. Use CanonicalNamespaces for that. & $namespaces:Array of namespaces indexed by their numbers 'LanguageGetMagic':DEPRECATED, use $magicWords in a file listed in $wgExtensionMessagesFiles instead. Use this to define synonyms of magic words depending of the language $magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetSpecialPageAliases':DEPRECATED, use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead. Use to define aliases of special pages names depending of the language $specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names. & $names:array of language code=> language name $code language of the preferred translations 'LanguageLinks':Manipulate a page 's language links. This is called in various places to allow extensions to define the effective language links for a page. $title:The page 's Title. & $links:Associative array mapping language codes to prefixed links of the form "language:title". & $linkFlags:Associative array mapping prefixed links to arrays of flags. Currently unused, but planned to provide support for marking individual language links in the UI, e.g. for featured articles. 
'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts. Return false to skip default processing and return $ret. See documentation for Linker::link() for details on the expected meanings of parameters. $skin:the Skin object $target:the Title that the link is pointing to & $html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1528
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1353
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1238
data
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
Sanitizer\attributeWhitelist
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1500
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:607
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
$html
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1530
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:50
is
We use the convention $dbr for read and $dbw for write to help you keep track of whether the database object is a the world will explode Or to be a subsequent write query which succeeded on the master may fail when replicated to the slave due to a unique key collision Replication on the slave will stop and it may take hours to repair the database and get it back online Setting read_only in my cnf on the slave will avoid this but given the dire we prefer to have as many checks as possible We provide a but the wrapper functions like please read the documentation for except in special pages derived from QueryPage It s a common pitfall for new developers to submit code containing SQL queries which examine huge numbers of rows Remember that COUNT * is(N), counting rows in atable is like counting beans in a bucket.------------------------------------------------------------------------ Replication------------------------------------------------------------------------The largest installation of MediaWiki, Wikimedia, uses a large set ofslave MySQL servers replicating writes made to a master MySQL server. Itis important to understand the issues associated with this setup if youwant to write code destined for Wikipedia.It 's often the case that the best algorithm to use for a given taskdepends on whether or not replication is in use. Due to our unabashedWikipedia-centrism, we often just use the replication-friendly version, but if you like, you can use wfGetLB() ->getServerCount() > 1 tocheck to see if replication is in use.===Lag===Lag primarily occurs when large write queries are sent to the master.Writes on the master are executed in parallel, but they are executed inserial when they are replicated to the slaves. The master writes thequery to the binlog when the transaction is committed. The slaves pollthe binlog and start executing the query as soon as it appears. 
They canservice reads while they are performing a write query, but will not readanything more from the binlog and thus will perform no more writes. Thismeans that if the write query runs for a long time, the slaves will lagbehind the master for the time it takes for the write query to complete.Lag can be exacerbated by high read load. MediaWiki 's load balancer willstop sending reads to a slave when it is lagged by more than 30 seconds.If the load ratios are set incorrectly, or if there is too much loadgenerally, this may lead to a slave permanently hovering around 30seconds lag.If all slaves are lagged by more than 30 seconds, MediaWiki will stopwriting to the database. All edits and other write operations will berefused, with an error returned to the user. This gives the slaves achance to catch up. Before we had this mechanism, the slaves wouldregularly lag by several minutes, making review of recent editsdifficult.In addition to this, MediaWiki attempts to ensure that the user seesevents occurring on the wiki in chronological order. A few seconds of lagcan be tolerated, as long as the user sees a consistent picture fromsubsequent requests. This is done by saving the master binlog positionin the session, and then at the start of each request, waiting for theslave to catch up to that position before doing any reads from it. Ifthis wait times out, reads are allowed anyway, but the request isconsidered to be in "lagged slave mode". Lagged slave mode can bechecked by calling wfGetLB() ->getLaggedSlaveMode(). The onlypractical consequence at present is a warning displayed in the pagefooter.===Lag avoidance===To avoid excessive lag, queries which write large numbers of rows shouldbe split up, generally to write one row at a time. Multi-row INSERT ...SELECT queries are the worst offenders should be avoided altogether.Instead do the select first and then the insert.===Working with lag===Despite our best efforts, it 's not practical to guarantee a low-lagenvironment. 
Lag will usually be less than one second, but mayoccasionally be up to 30 seconds. For scalability, it 's very importantto keep load on the master low, so simply sending all your queries tothe master is not the answer. So when you have a genuine need forup-to-date data, the following approach is advised:1) Do a quick query to the master for a sequence number or timestamp 2) Run the full query on the slave and check if it matches the data you gotfrom the master 3) If it doesn 't, run the full query on the masterTo avoid swamping the master every time the slaves lag, use of thisapproach should be kept to a minimum. In most cases you should just readfrom the slave and let the user deal with the delay.------------------------------------------------------------------------ Lock contention------------------------------------------------------------------------Due to the high write rate on Wikipedia(and some other wikis), MediaWiki developers need to be very careful to structure their writesto avoid long-lasting locks. By default, MediaWiki opens a transactionat the first query, and commits it before the output is sent. Locks willbe held from the time when the query is done until the commit. So youcan reduce lock time by doing as much processing as possible before youdo your write queries.Often this approach is not good enough, and it becomes necessary toenclose small groups of queries in their own transaction. Use thefollowing syntax:$dbw=wfGetDB(DB_MASTER
Sanitizer\$htmlEntities
static $htmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
Definition: Sanitizer.php:58
text
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1483
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:807
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
$ret
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1530
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:2387
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1297
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1847
$params
$params
Definition: styleTest.css.php:40
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1370
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1219
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1430
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:325
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1143
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1327
$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the content language as $wgContLang
Definition: design.txt:56
Sanitizer\stripAllTags
static stripAllTags( $text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1735
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:655
title
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
Definition: All_system_messages.txt:2703
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1755
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:51
codepointToUtf8
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
Definition: UtfNormalUtil.php:36
MWException
MediaWiki exception.
Definition: MWException.php:26
Sanitizer\$htmlEntityAliases
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:317
$out
$out
Definition: UtfNormalGenerate.php:167
wfRestoreWarnings
wfRestoreWarnings()
Restore error level to previous value.
Definition: GlobalFunctions.php:2417
hooks
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
table
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:1041
directly
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1020
Sanitizer\armorLinksCallback
static armorLinksCallback( $matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1171
wfRunHooks
wfRunHooks( $event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in $wgHooks.
Definition: GlobalFunctions.php:4010
Sanitizer\validateAttributes
static validateAttributes( $attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whitelist.
Definition: Sanitizer.php:711
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
simple
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
list
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1383
Sanitizer\escapeId
static escapeId( $id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it.
Definition: Sanitizer.php:1099
will
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
Definition: All_system_messages.txt:914
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1397
$options
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition: hooks.txt:1530
$section
$section
Definition: Utf8Test.php:88
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:695
root
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
Definition: distributors.txt:39
$name
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:336
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
$value
$value
Definition: styleTest.css.php:45
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given element type.
Definition: Sanitizer.php:691
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1768
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:964
tags
pre inside other HTML tags(bug 54946) !! wikitext a< div >< pre > foo</pre ></div >< pre ></pre > !! html< p >a</p >< div >< pre > foo</pre ></div >< pre ></pre > !! end !! test HTML pre followed by indent-pre !! wikitext< pre >foo</pre > bar !! html< pre >foo</pre >< pre >bar</pre > !! end !!test Block tag pre !!options parsoid !! wikitext< p >< pre >foo</pre ></p > !! html< p data-parsoid
UTF8_REPLACEMENT
const UTF8_REPLACEMENT
Definition: UtfNormalDefines.php:64
only
published in in Madrid In the first edition of the Vocabolario for was published In in Rotterdam was the Dictionnaire Universel ! html< p > The first monolingual dictionary written in a Romance language was< i > Sebastián Covarrubias</i >< i > Tesoro de la lengua castellana o published in in Madrid In the first edition of the< i > Vocabolario dell< a href="/index.php?title=Accademia_della_Crusca&amp;action=edit&amp;redlink=1" class="new" title="Accademia della Crusca (page does not exist)"> Accademia della Crusca</a ></i > for was published In in Rotterdam was the< i > Dictionnaire Universel</i ></p > ! end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html php< p >< i > foo</i ></p > ! html parsoid< p >< i > foo</i >< b ></b ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html php< p >< b > foo</b ></p > ! html parsoid< p >< b > foo</b >< i ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html php< p >< b > foo</b ></p > ! html parsoid< p >< b > foo</b >< i ></i ></p > !end ! test Italics and ! options ! wikitext foo ! html< p >< b >< i > foo</i ></b ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! 
html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html php< p >< b > foo</b > bar</p > ! html parsoid< p >< b > foo</b > bar< i ></i ></p > !end ! test Italics and ! wikitext foo bar ! html php< p >< b > foo</b > bar</p > ! html parsoid< p >< b > foo</b > bar< b ></b ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< i > this is about< b > foo s family</b ></i ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< i > this is about< b > foo s</b > family</i ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< b > this is about< i > foo</i ></b >< i > s family</i ></p > !end ! test Italics and ! options ! wikitext this is about foo s family ! html< p >< i > this is about</i > foo< b > s family</b ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< b > this is about< i > foo s</i > family</b ></p > !end ! test Italicized possessive ! wikitext The s talk page ! html< p > The< i >< a href="/wiki/Main_Page" title="Main Page"> Main Page</a ></i > s talk page</p > ! end ! test Parsoid only
Definition: parserTests.txt:396
$count
$count
Definition: UtfNormalTest2.php:96
it
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content. The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content. These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text. All manipulation and analysis of page content must be done via the appropriate methods of the Content object. For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers. The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id). Also Title, WikiPage and Revision now have getContentHandler() methods for convenience. ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page. ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type. However, it is recommended to instead use WikiPage::getContent() resp. Revision::getContent() to get a page 's content as a Content object. These two methods should be the ONLY way in which page content is accessed. Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides(). This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based. Objects implementing the Content interface are used to represent and handle the content internally. 
For storage and data exchange, each content model supports at least one serialization format via ContentHandler::serializeContent($content). The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats(). Content serialization formats are identified using MIME type like strings. The following formats are built in:*text/x-wiki - wikitext *text/javascript - for js pages *text/css - for css pages *text/plain - for future use, e.g. with plain text messages. *text/html - for future use, e.g. with plain html messages. *application/vnd.php.serialized - for future use with the api and for extensions *application/json - for future use with the api, and for use by extensions *application/xml - for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant. Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly. Without that information, interpretation of the provided content is not reliable. The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export. Also note that the API will provide encapsulated, serialized content - so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure. Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content. 
However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page 's content model, and will now generate warnings when used. Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() is deprecated in favor Revisions::getContent() *WikiPage::getText() is deprecated in favor WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superceded by Article::getContentObject(). However, both methods should be avoided since they do not provide clean access to the page 's actual content. For instance, they may return a system message for non-existing pages. Use WikiPage::getContent() instead. Code that relies on a textual representation of the page content should eventually be rewritten. However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page. Its behavior is controlled by $wgContentHandlerTextFallback it
Definition: contenthandler.txt:107
$args
if( $line===false) $args
Definition: cdb.php:62
Sanitizer\normalizeCss
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:829
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML, discarding unwanted attributes.
Definition: Sanitizer.php:1004
in
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
used
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
Definition: Sanitizer.php:36
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1282
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
such
it sets a lot of them automatically from query and such
Definition: design.txt:93
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
Definition: Sanitizer.php:1467
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
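Perform an operation equivalent to preg_replace( "!$startDelim(.*?)$endDelim!$flags", $replace, $subject ), except that its worst case is O(N) instead of O(N^2).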
Definition: StringUtils.php:256
utf8ToCodepoint
utf8ToCodepoint( $char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
Definition: UtfNormalUtil.php:94
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1316
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1183
from
Please log in again after you receive it</td >< td > s a saved copy from
Definition: All_system_messages.txt:3297
that
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
Definition: deferred.txt:11
Sanitizer\setupAttributeWhitelist
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1512
$t
$t
Definition: testCompression.php:65
$vars
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1679
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1413
$attribs
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1530
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:938
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:332
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1158
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1448
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1815
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:366