MediaWiki  1.23.16
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
/**
 * Regular expression matching the character references handled by
 * normalizeCharReferences() and decodeCharReferences().
 *
 * Capture groups, as consumed by the callbacks in this class:
 *   1. named entity (without '&' and ';')
 *   2. decimal numeric reference
 *   3. hexadecimal numeric reference
 *   4. a bare ampersand
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';
41 
/**
 * Pattern for script-like URI schemes. Matches "javascript" or "vbscript"
 * (case-insensitive) at the start of the value, after whitespace, or after
 * the end of a comment, when followed by a non-word character or the end of
 * the string. Used by validateAttributes() to drop suspicious values.
 */
50  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/**
 * Pattern for attribute names that are XML namespace declarations
 * ("xmlns:" followed by name characters). Checked by validateAttributes()
 * when $wgAllowRdfaAttributes is enabled.
 */
51  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
52 
/**
 * Map of named character entities to their decimal Unicode code points.
 * normalizeEntity() uses it to rewrite known names as numeric references and
 * decodeEntity() uses it to turn known names into UTF-8 characters.
 */
58  private static $htmlEntities = array(
59  'Aacute' => 193,
60  'aacute' => 225,
61  'Acirc' => 194,
62  'acirc' => 226,
63  'acute' => 180,
64  'AElig' => 198,
65  'aelig' => 230,
66  'Agrave' => 192,
67  'agrave' => 224,
68  'alefsym' => 8501,
69  'Alpha' => 913,
70  'alpha' => 945,
71  'amp' => 38,
72  'and' => 8743,
73  'ang' => 8736,
74  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
75  'Aring' => 197,
76  'aring' => 229,
77  'asymp' => 8776,
78  'Atilde' => 195,
79  'atilde' => 227,
80  'Auml' => 196,
81  'auml' => 228,
82  'bdquo' => 8222,
83  'Beta' => 914,
84  'beta' => 946,
85  'brvbar' => 166,
86  'bull' => 8226,
87  'cap' => 8745,
88  'Ccedil' => 199,
89  'ccedil' => 231,
90  'cedil' => 184,
91  'cent' => 162,
92  'Chi' => 935,
93  'chi' => 967,
94  'circ' => 710,
95  'clubs' => 9827,
96  'cong' => 8773,
97  'copy' => 169,
98  'crarr' => 8629,
99  'cup' => 8746,
100  'curren' => 164,
101  'dagger' => 8224,
102  'Dagger' => 8225,
103  'darr' => 8595,
104  'dArr' => 8659,
105  'deg' => 176,
106  'Delta' => 916,
107  'delta' => 948,
108  'diams' => 9830,
109  'divide' => 247,
110  'Eacute' => 201,
111  'eacute' => 233,
112  'Ecirc' => 202,
113  'ecirc' => 234,
114  'Egrave' => 200,
115  'egrave' => 232,
116  'empty' => 8709,
117  'emsp' => 8195,
118  'ensp' => 8194,
119  'Epsilon' => 917,
120  'epsilon' => 949,
121  'equiv' => 8801,
122  'Eta' => 919,
123  'eta' => 951,
124  'ETH' => 208,
125  'eth' => 240,
126  'Euml' => 203,
127  'euml' => 235,
128  'euro' => 8364,
129  'exist' => 8707,
130  'fnof' => 402,
131  'forall' => 8704,
132  'frac12' => 189,
133  'frac14' => 188,
134  'frac34' => 190,
135  'frasl' => 8260,
136  'Gamma' => 915,
137  'gamma' => 947,
138  'ge' => 8805,
139  'gt' => 62,
140  'harr' => 8596,
141  'hArr' => 8660,
142  'hearts' => 9829,
143  'hellip' => 8230,
144  'Iacute' => 205,
145  'iacute' => 237,
146  'Icirc' => 206,
147  'icirc' => 238,
148  'iexcl' => 161,
149  'Igrave' => 204,
150  'igrave' => 236,
151  'image' => 8465,
152  'infin' => 8734,
153  'int' => 8747,
154  'Iota' => 921,
155  'iota' => 953,
156  'iquest' => 191,
157  'isin' => 8712,
158  'Iuml' => 207,
159  'iuml' => 239,
160  'Kappa' => 922,
161  'kappa' => 954,
162  'Lambda' => 923,
163  'lambda' => 955,
164  'lang' => 9001,
165  'laquo' => 171,
166  'larr' => 8592,
167  'lArr' => 8656,
168  'lceil' => 8968,
169  'ldquo' => 8220,
170  'le' => 8804,
171  'lfloor' => 8970,
172  'lowast' => 8727,
173  'loz' => 9674,
174  'lrm' => 8206,
175  'lsaquo' => 8249,
176  'lsquo' => 8216,
177  'lt' => 60,
178  'macr' => 175,
179  'mdash' => 8212,
180  'micro' => 181,
181  'middot' => 183,
182  'minus' => 8722,
183  'Mu' => 924,
184  'mu' => 956,
185  'nabla' => 8711,
186  'nbsp' => 160,
187  'ndash' => 8211,
188  'ne' => 8800,
189  'ni' => 8715,
190  'not' => 172,
191  'notin' => 8713,
192  'nsub' => 8836,
193  'Ntilde' => 209,
194  'ntilde' => 241,
195  'Nu' => 925,
196  'nu' => 957,
197  'Oacute' => 211,
198  'oacute' => 243,
199  'Ocirc' => 212,
200  'ocirc' => 244,
201  'OElig' => 338,
202  'oelig' => 339,
203  'Ograve' => 210,
204  'ograve' => 242,
205  'oline' => 8254,
206  'Omega' => 937,
207  'omega' => 969,
208  'Omicron' => 927,
209  'omicron' => 959,
210  'oplus' => 8853,
211  'or' => 8744,
212  'ordf' => 170,
213  'ordm' => 186,
214  'Oslash' => 216,
215  'oslash' => 248,
216  'Otilde' => 213,
217  'otilde' => 245,
218  'otimes' => 8855,
219  'Ouml' => 214,
220  'ouml' => 246,
221  'para' => 182,
222  'part' => 8706,
223  'permil' => 8240,
224  'perp' => 8869,
225  'Phi' => 934,
226  'phi' => 966,
227  'Pi' => 928,
228  'pi' => 960,
229  'piv' => 982,
230  'plusmn' => 177,
231  'pound' => 163,
232  'prime' => 8242,
233  'Prime' => 8243,
234  'prod' => 8719,
235  'prop' => 8733,
236  'Psi' => 936,
237  'psi' => 968,
238  'quot' => 34,
239  'radic' => 8730,
240  'rang' => 9002,
241  'raquo' => 187,
242  'rarr' => 8594,
243  'rArr' => 8658,
244  'rceil' => 8969,
245  'rdquo' => 8221,
246  'real' => 8476,
247  'reg' => 174,
248  'rfloor' => 8971,
249  'Rho' => 929,
250  'rho' => 961,
251  'rlm' => 8207,
252  'rsaquo' => 8250,
253  'rsquo' => 8217,
254  'sbquo' => 8218,
255  'Scaron' => 352,
256  'scaron' => 353,
257  'sdot' => 8901,
258  'sect' => 167,
259  'shy' => 173,
260  'Sigma' => 931,
261  'sigma' => 963,
262  'sigmaf' => 962,
263  'sim' => 8764,
264  'spades' => 9824,
265  'sub' => 8834,
266  'sube' => 8838,
267  'sum' => 8721,
268  'sup' => 8835,
269  'sup1' => 185,
270  'sup2' => 178,
271  'sup3' => 179,
272  'supe' => 8839,
273  'szlig' => 223,
274  'Tau' => 932,
275  'tau' => 964,
276  'there4' => 8756,
277  'Theta' => 920,
278  'theta' => 952,
279  'thetasym' => 977,
280  'thinsp' => 8201,
281  'THORN' => 222,
282  'thorn' => 254,
283  'tilde' => 732,
284  'times' => 215,
285  'trade' => 8482,
286  'Uacute' => 218,
287  'uacute' => 250,
288  'uarr' => 8593,
289  'uArr' => 8657,
290  'Ucirc' => 219,
291  'ucirc' => 251,
292  'Ugrave' => 217,
293  'ugrave' => 249,
294  'uml' => 168,
295  'upsih' => 978,
296  'Upsilon' => 933,
297  'upsilon' => 965,
298  'Uuml' => 220,
299  'uuml' => 252,
300  'weierp' => 8472,
301  'Xi' => 926,
302  'xi' => 958,
303  'Yacute' => 221,
304  'yacute' => 253,
305  'yen' => 165,
306  'Yuml' => 376,
307  'yuml' => 255,
308  'Zeta' => 918,
309  'zeta' => 950,
310  'zwj' => 8205,
311  'zwnj' => 8204
312  );
313 
/**
 * Alternative entity names mapped to the canonical entity name they stand
 * for. normalizeEntity() and decodeEntity() consult this table before
 * looking a name up in $htmlEntities. Both aliases here resolve to 'rlm'.
 */
317  private static $htmlEntityAliases = array(
318  'רלמ' => 'rlm',
319  'رلم' => 'rlm',
320  );
321 
/**
 * Cached attribute-matching regular expression; null until
 * getAttribsRegex() builds it on first use.
 */
325  private static $attribsRegex;
326 
/**
 * Return the regular expression used to pull attribute name/value pairs out
 * of the text of an HTML tag. The pattern is built on first call and cached
 * in self::$attribsRegex.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
 * 6 = unquoted "#hex" colour value.
 *
 * @return string PCRE pattern (extended syntax, dot-all)
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$nameStart = '[:A-Z_a-z0-9]';
	$nameChar = '[:A-Z_a-z-.0-9]';
	$ws = '[\x09\x0a\x0d\x20]';

	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
		  ($ws*=$ws*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
			 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
								 # colors are specified like this.
								 # We'll be normalizing it.
			)
		)?(?=$ws|\$)/sx";

	return self::$attribsRegex;
}
353 
/**
 * Clean up HTML: strip HTML comments, keep only whitelisted elements with
 * sanitized attributes, and escape everything else as text.
 *
 * When $wgUseTidy is off, a tag stack is also maintained so that mismatched
 * or unclosed tags are escaped or closed; with tidy on, only the element
 * and attribute filtering is done here.
 *
 * @param string $text Input markup
 * @param callable $processCallback Optional callback invoked as
 *   ( &$params, $args ) to rewrite a tag's attribute text before validation
 * @param array $args Second argument passed to $processCallback
 * @param array $extratags Additional element names to allow
 * @param array $removetags Element names (default or extra) to disallow
 * @return string Sanitized markup
 */
366  static function removeHTMLtags( $text, $processCallback = null,
367  $args = array(), $extratags = array(), $removetags = array()
368  ) {
369  global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
370 
# Per-request caches of the element tables, kept as name => index hashtables
371  static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
372  $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
373 
374  wfProfileIn( __METHOD__ );
375 
376  // Base our staticInitialised variable off of the global config state so that if the globals
377  // are changed (like in the screwed up test system) we will re-initialise the settings.
378  $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
379  if ( !$staticInitialised || $staticInitialised != $globalContext ) {
380 
381  $htmlpairsStatic = array( # Tags that must be closed
382  'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
383  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
384  'strike', 'strong', 'tt', 'var', 'div', 'center',
385  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
386  'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
387  'kbd', 'samp', 'data', 'time', 'mark'
388  );
389  $htmlsingle = array(
390  'br', 'wbr', 'hr', 'li', 'dt', 'dd'
391  );
392  $htmlsingleonly = array( # Elements that cannot have close tags
393  'br', 'wbr', 'hr'
394  );
395  if ( $wgAllowMicrodataAttributes ) {
396  $htmlsingle[] = $htmlsingleonly[] = 'meta';
397  $htmlsingle[] = $htmlsingleonly[] = 'link';
398  }
399  $htmlnest = array( # Tags that can be nested--??
400  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
401  'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
402  'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
403  );
404  $tabletags = array( # Can only appear inside table, we will close them
405  'td', 'th', 'tr',
406  );
407  $htmllist = array( # Tags used by list
408  'ul', 'ol',
409  );
410  $listtags = array( # Tags that can appear in a list
411  'li',
412  );
413 
414  if ( $wgAllowImageTag ) {
415  $htmlsingle[] = 'img';
416  $htmlsingleonly[] = 'img';
417  }
418 
419  $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
420  $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
421 
422  # Convert them all to hashtables for faster lookup
423  $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
424  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
425  foreach ( $vars as $var ) {
426  $$var = array_flip( $$var );
427  }
428  $staticInitialised = $globalContext;
429  }
430  # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
431  $extratags = array_flip( $extratags );
432  $removetags = array_flip( $removetags );
433  $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
434  $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
435 
436  # Remove HTML comments
437  $text = Sanitizer::removeHTMLcomments( $text );
# Split on '<': the first piece is plain text, every later piece starts
# with what may be a tag.
438  $bits = explode( '<', $text );
439  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
440  if ( !$wgUseTidy ) {
441  $tagstack = $tablestack = array();
442  foreach ( $bits as $x ) {
443  $regs = array();
444  # $slash: Does the current element start with a '/'?
445  # $t: Current element name
446  # $params: String between element name and >
447  # $brace: Ending '>' or '/>'
448  # $rest: Everything until the next element of $bits
449  if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
450  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
451  } else {
452  $slash = $t = $params = $brace = $rest = null;
453  }
454 
455  $badtag = false;
456  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
457  # Check our stack
458  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
459  $badtag = true;
460  } elseif ( $slash ) {
461  # Closing a tag... is it the one we just opened?
462  $ot = @array_pop( $tagstack );
463  if ( $ot != $t ) {
464  if ( isset( $htmlsingleallowed[$ot] ) ) {
465  # Pop all elements with an optional close tag
466  # and see if we find a match below them
467  $optstack = array();
468  array_push( $optstack, $ot );
470  $ot = array_pop( $tagstack );
472  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
473  array_push( $optstack, $ot );
475  $ot = array_pop( $tagstack );
477  }
478  if ( $t != $ot ) {
479  # No match. Push the optional elements back again
480  $badtag = true;
482  $ot = array_pop( $optstack );
484  while ( $ot ) {
485  array_push( $tagstack, $ot );
487  $ot = array_pop( $optstack );
489  }
490  }
491  } else {
492  @array_push( $tagstack, $ot );
493  # <li> can be nested in <ul> or <ol>, skip those cases:
494  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
495  $badtag = true;
496  }
497  }
498  } else {
# Closing </table>: restore the tag stack saved when the table opened
499  if ( $t == 'table' ) {
500  $tagstack = array_pop( $tablestack );
501  }
502  }
503  $newparams = '';
504  } else {
505  # Keep track for later
506  if ( isset( $tabletags[$t] ) &&
507  !in_array( 'table', $tagstack ) ) {
508  $badtag = true;
509  } elseif ( in_array( $t, $tagstack ) &&
510  !isset( $htmlnest[$t] ) ) {
511  $badtag = true;
512  # Is it a self closed htmlpair ? (bug 5487)
513  } elseif ( $brace == '/>' &&
514  isset( $htmlpairs[$t] ) ) {
515  $badtag = true;
516  } elseif ( isset( $htmlsingleonly[$t] ) ) {
517  # Hack to force empty tag for unclosable elements
518  $brace = '/>';
519  } elseif ( isset( $htmlsingle[$t] ) ) {
520  # Hack to not close $htmlsingle tags
521  $brace = null;
522  # Still need to push this optionally-closed tag to
523  # the tag stack so that we can match end tags
524  # instead of marking them as bad.
525  array_push( $tagstack, $t );
526  } elseif ( isset( $tabletags[$t] )
527  && in_array( $t, $tagstack ) ) {
528  // New table tag but forgot to close the previous one
529  $text .= "</$t>";
530  } else {
# Opening <table>: park the current stack and start a fresh one
531  if ( $t == 'table' ) {
532  array_push( $tablestack, $tagstack );
533  $tagstack = array();
534  }
535  array_push( $tagstack, $t );
536  }
537 
538  # Replace any variables or template parameters with
539  # plaintext results.
540  if ( is_callable( $processCallback ) ) {
541  call_user_func_array( $processCallback, array( &$params, $args ) );
542  }
543 
544  if ( !Sanitizer::validateTag( $params, $t ) ) {
545  $badtag = true;
546  }
547 
548  # Strip non-approved attributes from the tag
549  $newparams = Sanitizer::fixTagAttributes( $params, $t );
550  }
551  if ( !$badtag ) {
552  $rest = str_replace( '>', '&gt;', $rest );
553  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
554  $text .= "<$slash$t$newparams$close>$rest";
555  continue;
556  }
557  }
# Not an allowed tag (or a bad one): emit the piece as escaped text
558  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
559  }
560  # Close off any remaining tags
561  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
562  $text .= "</$t>\n";
563  if ( $t == 'table' ) {
564  $tagstack = array_pop( $tablestack );
565  }
566  }
567  } else {
568  # this might be possible using tidy itself
569  foreach ( $bits as $x ) {
570  preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
571  $x, $regs );
572  @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
573  $badtag = false;
574  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
575  if ( is_callable( $processCallback ) ) {
576  call_user_func_array( $processCallback, array( &$params, $args ) );
577  }
578 
579  if ( !Sanitizer::validateTag( $params, $t ) ) {
580  $badtag = true;
581  }
582 
583  $newparams = Sanitizer::fixTagAttributes( $params, $t );
584  if ( !$badtag ) {
585  $rest = str_replace( '>', '&gt;', $rest );
586  $text .= "<$slash$t$newparams$brace$rest";
587  continue;
588  }
589  }
590  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
591  }
592  }
593  wfProfileOut( __METHOD__ );
594  return $text;
595  }
596 
/**
 * Strip every terminated HTML comment from the text. An unterminated
 * comment stops processing and is left in place.
 *
 * When a comment (plus any spaces around it) sits alone between two
 * newlines, the whole line is collapsed to a single newline; otherwise only
 * the comment itself is removed.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; give up
			break;
		}
		$close += 3;

		# Widen the span over spaces on both sides of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		if ( $newlineBefore && substr( $text, $from + $length, 1 ) === "\n" ) {
			# Comment on a line of its own: drop the line, keep one newline
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Inline comment: drop only the comment
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
642 
/**
 * Decide whether a tag may be kept, given its attributes.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute, <meta> additionally needs content and <link> needs href.
 * Every other element is accepted.
 *
 * removeHTMLtags() passes the raw attribute text of the tag, so it is
 * decoded into a name => value array before the checks; an already decoded
 * array is used as-is.
 *
 * @param string|array $params Attribute text of the tag, or decoded attributes
 * @param string $element Lower-case element name
 * @return bool True if the tag is acceptable
 */
static function validateTag( $params, $element ) {
	if ( !is_array( $params ) ) {
		// isset() on a string offset such as 'itemprop' is never true,
		// so without decoding every <meta>/<link> would be rejected.
		$params = Sanitizer::decodeTagAttributes( $params );
	}

	if ( $element == 'meta' || $element == 'link' ) {
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $params['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $params['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
675 
/**
 * Filter decoded attributes against the whitelist for the given element.
 *
 * Thin wrapper: looks up the element's whitelist with attributeWhitelist()
 * and hands both to validateAttributes().
 *
 * @param array $attribs Decoded attributes, name => value
 * @param string $element Element name
 * @return array Attributes that passed validation
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
695 
/**
 * Filter decoded attributes against a whitelist of attribute names.
 *
 * - xmlns:* declarations are kept when RDFa is enabled and the value does
 *   not match EVIL_URI_PATTERN
 * - attributes that are neither data-* nor whitelisted are dropped
 * - style values are passed through checkCss()
 * - id values are escaped with escapeId()
 * - role is kept only when it is exactly "presentation"
 * - RDFa / microdata attributes matching EVIL_URI_PATTERN are dropped
 * - href / src must start with a protocol from wfUrlProtocols()
 * - with microdata enabled, itemtype/itemid/itemref are removed unless
 *   itemscope is also present
 *
 * @param array $attribs Decoded attributes, name => value
 * @param array $whitelist List of allowed attribute names
 * @return array Sanitized attributes, name => value
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		#allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute === 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# WAI-ARIA
		# http://www.w3.org/TR/wai-aria/
		# http://www.whatwg.org/html/elements.html#wai-aria
		# For now we only support role="presentation" until we work out what roles should be
		# usable by content and we ensure that our code explicitly rejects patterns that
		# violate HTML5's ARIA restrictions.
		if ( $attribute === 'role' && $value !== 'presentation' ) {
			continue;
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( $attribute === 'rel' || $attribute === 'rev'
			# RDFa
			|| $attribute === 'about' || $attribute === 'property'
			|| $attribute === 'resource' || $attribute === 'datatype'
			|| $attribute === 'typeof'
			# HTML5 microdata
			|| $attribute === 'itemid' || $attribute === 'itemprop'
			|| $attribute === 'itemref' || $attribute === 'itemscope'
			|| $attribute === 'itemtype'
		) {
			//Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; //drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
796 
/**
 * Merge two attribute arrays. Values from $b win over values from $a,
 * except for "class": when both sides carry a different string value the
 * class lists are combined, with duplicate class names removed.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}

	$left = $a['class'];
	$right = $b['class'];
	if ( !is_string( $left ) || !is_string( $right ) || $left === $right ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "$left $right", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
819 
/**
 * Normalize a CSS value so that obfuscated content can be inspected:
 * decodes character references and CSS escape sequences, folds fullwidth
 * and look-alike characters to ASCII, and removes comments.
 *
 * @param string $value CSS text, e.g. the contents of a style attribute
 * @return string Normalized CSS text
 */
public static function normalizeCss( $value ) {

	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// The range is spelled with code point escapes so that the pattern
	// cannot be altered by character normalization of this source file.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
		function ( $matches ) {
			$cp = utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
918 
919 
/**
 * Check a CSS value (such as the contents of a style attribute) for
 * constructs that can execute script or load external resources.
 *
 * The value is first normalized with normalizeCss() so that character
 * references, CSS escapes and look-alike characters cannot hide a
 * blacklisted keyword from the checks below.
 *
 * @param string $value CSS text
 * @return string The normalized value, or a CSS comment describing why the
 *   input was rejected
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		'! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
960 
/**
 * preg_replace_callback() handler for the CSS escape regex built in
 * normalizeCss(). Turns one backslash escape into the character it stands
 * for; characters that are unsafe inside CSS strings are re-emitted as a
 * canonical hex escape instead.
 *
 * @param array $matches Groups: 1 = line continuation, 2 = hex number,
 *   3 = escaped character
 * @return string Replacement text
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: simply dropped
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		// Lone backslash at the end of the string
		$decoded = '\\';
	}

	// Newline, quotes and backslash must stay escaped inside strings;
	// emit them in a clean form so clients don't mis-parse them.
	$mustStayEscaped = in_array( $decoded, array( "\n", '"', "'", '\\' ) );

	return $mustStayEscaped
		? '\\' . dechex( ord( $decoded ) ) . ' '
		: $decoded;
}
985 
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string:
 * decode it, validate it against the element's whitelist and re-encode it.
 *
 * @param string $text Attribute text as found between the tag name and '>'
 * @param string $element Element name
 * @return string Empty string, or the encoded attributes with a leading space
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) != '' ) {
		$parsed = Sanitizer::decodeTagAttributes( $text );
		$allowed = Sanitizer::validateTagAttributes( $parsed, $element );

		return Sanitizer::safeEncodeTagAttributes( $allowed );
	}

	return '';
}
1015 
/**
 * Encode a value for use inside an HTML attribute: HTML special characters
 * (including both quote styles) are escaped, and newline, carriage return
 * and tab become numeric references so that attribute whitespace
 * normalization does not alter them.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding, so these must
	// be encoded ahead of time or they won't be preserved.
	static $whitespaceRefs = array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	);

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
1035 
/**
 * Encode an attribute value for HTML tags and additionally neutralize
 * character sequences that later wikitext parsing could expand (templates,
 * links, magic words, URL protocols).
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armored = strtr( Sanitizer::encodeAttribute( $text ), array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	) );

	# Stupid hack: break up anything that looks like a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';

	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored
	);
}
1068 
/**
 * Turn arbitrary text into a string usable as an HTML id.
 *
 * With $wgExperimentalHtmlIds enabled (and no 'legacy' option) runs of
 * whitespace and a few special characters are collapsed to '_'. Otherwise
 * the id is URL-encoded with '%' replaced by '.', and prefixed with 'x'
 * when it does not start with a letter, unless 'noninitial' is given.
 *
 * @param string $id Text to escape
 * @param string|array $options 'noninitial' and/or 'legacy'
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	global $wgExperimentalHtmlIds;
	$flags = (array)$options;

	if ( !$wgExperimentalHtmlIds || in_array( 'legacy', $flags ) ) {
		# HTML4-style escaping
		$decoded = Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) );
		$escaped = str_replace(
			array( '%3A', '%' ),
			array( ':', '.' ),
			urlencode( $decoded )
		);

		// Initial character must be a letter!
		$needsPrefix = !preg_match( '/^[a-zA-Z]/', $escaped )
			&& !in_array( 'noninitial', $flags );

		return $needsPrefix ? "x$escaped" : $escaped;
	}

	$collapsed = preg_replace(
		'/[ \t\n\r\f_\'"&#%]+/',
		'_',
		Sanitizer::decodeCharReferences( $id )
	);
	$collapsed = trim( $collapsed, '_' );

	# An empty result means the input was all whitespace to start with.
	return $collapsed === '' ? '_' : $collapsed;
}
1132 
/**
 * Turn arbitrary text into something usable as a CSS class name: a leading
 * digit or hyphen, control characters, spaces, ASCII punctuation and the
 * byte sequence C2 A0 become underscores, runs of underscores are
 * collapsed, and trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores ...
	$withUnderscores = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $withUnderscores );

	return rtrim( $collapsed, '_' );
}
1151 
/**
 * Escape text for HTML output while letting character references written
 * in the input come out as the characters they name.
 *
 * References are decoded first and the result is then escaped, so "&amp;"
 * in the input is emitted as "&amp;" rather than "&amp;amp;".
 *
 * @param string $html Text that may contain character references
 * @return string Escaped text
 */
static function escapeHtmlAllowEntities( $html ) {
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
1166 
/**
 * preg_replace_callback() handler used by safeEncodeAttribute(): replaces
 * every colon in the matched protocol with a numeric character reference.
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];

	return strtr( $protocol, array( ':' => '&#58;' ) );
}
1175 
/**
 * Parse the attribute text of a tag into an associative array.
 *
 * Attribute names are lower-cased; values have whitespace runs collapsed
 * to a single space, are trimmed, and have character references decoded.
 * If an attribute occurs more than once the last occurrence wins.
 *
 * @param string $text Attribute text as found between the tag name and '>'
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Pick the value out of whichever capture group matched
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
1212 
/**
 * Build an attribute string from an associative array of attributes.
 * Names are HTML-escaped and values are encoded with
 * safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value
 * @return string Empty string when there are no attributes, otherwise the
 *   name="value" pairs joined by spaces with a leading space
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = array();
	foreach ( $assoc_array as $name => $rawValue ) {
		$pairs[] = htmlspecialchars( $name )
			. '="' . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
	}

	if ( !$pairs ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
1230 
/**
 * Pick the attribute value out of one match set produced by the pattern
 * from getAttribsRegex().
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string The value, or the attribute name for a valueless attribute
 * @throws MWException When the "= value" group is set but no value group is
 */
private static function getTagAttributeCallback( $set ) {
	# Value groups, most specific first: 6 = illegal unquoted #XXXXXX
	# colour, 5 = unquoted, 4 = single-quoted, 3 = double-quoted.
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1260 
/**
 * Normalize an attribute value: character references are normalized,
 * whitespace characters are turned into plain spaces, and double quotes are
 * replaced by &quot;.
 *
 * @param string $text Raw attribute value
 * @return string Normalized attribute value
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
1278 
/**
 * Replace every whitespace character (space, CR, LF, tab) with a single
 * space; a CR LF pair counts as one character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

	return preg_replace( $whitespace, ' ', $text );
}
1289 
/**
 * Normalize whitespace in a section name: runs of spaces and underscores
 * are collapsed into a single space and the result is trimmed.
 *
 * NOTE(review): the declaration line was absent from this listing; the
 * method name is restored from upstream MediaWiki - confirm.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	return trim( preg_replace( '/[ _]+/', ' ', $section ) );
}
1301 
/**
 * Rewrite every character reference and bare ampersand in the text into a
 * safe, normalized form by running normalizeCharReferencesCallback() over
 * each CHAR_REFS_REGEX match.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1323 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on which CHAR_REFS_REGEX group matched: named entities go to
 * normalizeEntity(), decimal references to decCharReference(), hex
 * references to hexCharReference(). When the helper rejects the reference
 * (returns null), or only a bare ampersand matched, the matched text is
 * HTML-escaped instead.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	}
	if ( is_null( $ret ) ) {
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
1343 
/**
 * Normalize a named entity.
 *
 * Aliases are replaced by their canonical named entity; lt, gt, amp and
 * quot are kept as named entities; other known names become decimal
 * numeric references; unknown names are returned with the ampersand
 * escaped.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	$keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
	if ( in_array( $name, $keptAsNamed ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	return "&amp;$name;";
}
1366 
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string|int $codepoint Decimal digits of the reference
 * @return string|null "&#N;" for a valid code point, otherwise null
 */
static function decCharReference( $codepoint ) {
	$number = intval( $codepoint );

	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#%d;', $number )
		: null;
}
1379 
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null "&#xN;" (lower-case hex) for a valid code point,
 *   otherwise null
 */
static function hexCharReference( $codepoint ) {
	$number = hexdec( $codepoint );

	return Sanitizer::validateCodepoint( $number )
		? sprintf( '&#x%x;', $number )
		: null;
}
1392 
/**
 * Tell whether a code point is accepted by this class: tab, LF, CR, or a
 * value within 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	// Permitted control characters
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	// Permitted inclusive ranges
	$ranges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1406 
/**
 * Decode the character references in the text by running
 * decodeCharReferencesCallback() over each CHAR_REFS_REGEX match.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
1420 
/**
 * Decode character references like decodeCharReferences() does and, if at
 * least one replacement was made, pass the result through
 * $wgContLang->normalize(). Text without any match is returned as-is.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	// Without this declaration $wgContLang would be an undefined local
	// and the normalize() call below would fail.
	global $wgContLang;
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
1444 
1450  if ( $matches[1] != '' ) {
1451  return Sanitizer::decodeEntity( $matches[1] );
1452  } elseif ( $matches[2] != '' ) {
1453  return Sanitizer::decodeChar( intval( $matches[2] ) );
1454  } elseif ( $matches[3] != '' ) {
1455  return Sanitizer::decodeChar( hexdec( $matches[3] ) );
1456  }
1457  # Last case should be an ampersand by itself
1458  return $matches[0];
1459  }
1460 
/**
 * Return the UTF-8 encoding of a codepoint, or UTF8_REPLACEMENT when
 * validateCodepoint() rejects it.
 *
 * @param int|float $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
1475 
/**
 * Decode a named entity to the UTF-8 encoding of its character.
 *
 * The name is first mapped through self::$htmlEntityAliases. If the
 * (possibly aliased) name is not in self::$htmlEntities, it is returned
 * wrapped as "&name;".
 *
 * @param string $name Entity name, without the leading "&" and trailing ";"
 * @return string
 */
static function decodeEntity( $name ) {
	// Swap an alias for the entity it maps to before the table lookup.
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		$name = self::$htmlEntityAliases[$name];
	}

	return isset( self::$htmlEntities[$name] )
		? codepointToUtf8( self::$htmlEntities[$name] )
		: "&$name;";
}
1494 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element Element name, as used for the keys of the array
 *   returned by setupAttributeWhitelist()
 * @return array Attribute names; empty when the element has no entry
 */
static function attributeWhitelist( $element ) {
	// Without this assignment $list is undefined and the lookup below can
	// never succeed.
	$list = self::setupAttributeWhitelist();

	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1507 
/**
 * Build, and cache in a function-static variable, the map of element
 * name => list of permitted attribute names.
 *
 * The cached map is rebuilt whenever the combination of
 * $wgAllowRdfaAttributes and $wgAllowMicrodataAttributes differs from the
 * one it was built with.
 *
 * @return array Map of element name to array of attribute names
 */
static function setupAttributeWhitelist() {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $cachedList, $cachedContext;
	$context = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	// Reuse the previous result if the two settings have not changed.
	if ( isset( $cachedList ) && $cachedContext == $context ) {
		return $cachedList;
	}

	// Attributes allowed on (nearly) every element.
	$generic = array(
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'role',
	);

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		$rdfa = array( 'about', 'property', 'resource', 'datatype', 'typeof' );
		$generic = array_merge( $generic, $rdfa );
	}

	if ( $wgAllowMicrodataAttributes ) {
		# HTML5 microdata attributes as specified by
		# http://www.whatwg.org/html/microdata.html#the-microdata-model
		$microdata = array( 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' );
		$generic = array_merge( $generic, $microdata );
	}

	// Building blocks shared by several elements.
	$blockLevel = array_merge( $generic, array( 'align' ) );
	$alignment = array( 'align', 'valign' );
	$cellOnly = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);

	// Merged lists that are used by more than one element below.
	$withCite = array_merge( $generic, array( 'cite' ) );
	$withCiteAndDatetime = array_merge( $generic, array( 'cite', 'datetime' ) );
	$withSpan = array_merge( $generic, array( 'span' ) );
	$withWidth = array_merge( $generic, array( 'width' ) );
	$tableCell = array_merge( $generic, $cellOnly, $alignment );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$cachedList = array(
		# 7.5.4
		'div' => $blockLevel,
		'center' => $generic, # deprecated
		'span' => $generic,

		# 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $generic,

		# 9.2.1
		'em' => $generic,
		'strong' => $generic,
		'cite' => $generic,
		'dfn' => $generic,
		'code' => $generic,
		'samp' => $generic,
		'kbd' => $generic,
		'var' => $generic,
		'abbr' => $generic,
		# acronym

		# 9.2.2
		'blockquote' => $withCite,
		'q' => $withCite,

		# 9.2.3
		'sub' => $generic,
		'sup' => $generic,

		# 9.3.1
		'p' => $blockLevel,

		# 9.3.2
		'br' => array_merge( $generic, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $generic,

		# 9.3.4
		'pre' => $withWidth,

		# 9.4
		'ins' => $withCiteAndDatetime,
		'del' => $withCiteAndDatetime,

		# 10.2
		'ul' => array_merge( $generic, array( 'type' ) ),
		'ol' => array_merge( $generic, array( 'type', 'start' ) ),
		'li' => array_merge( $generic, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $generic,
		'dd' => $generic,
		'dt' => $generic,

		# 11.2.1
		'table' => array_merge(
			$generic,
			array(
				'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			)
		),

		# 11.2.2
		'caption' => $blockLevel,

		# 11.2.3
		'thead' => $generic,
		'tfoot' => $generic,
		'tbody' => $generic,

		# 11.2.4
		'colgroup' => $withSpan,
		'col' => $withSpan,

		# 11.2.5
		'tr' => array_merge( $generic, array( 'bgcolor' ), $alignment ),

		# 11.2.6
		'td' => $tableCell,
		'th' => $tableCell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => array_merge( $generic, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $generic, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $generic,
		'b' => $generic,
		'i' => $generic,
		'big' => $generic,
		'small' => $generic,
		'strike' => $generic,
		's' => $generic,
		'u' => $generic,

		# 15.2.2
		'font' => array_merge( $generic, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => $withWidth,

		# HTML Ruby annotation text module, simple ruby only.
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $generic,
		# rbc
		# rtc
		'rb' => $generic,
		'rt' => $generic, # 'rbspan' is deliberately not added here
		'rp' => $generic,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $generic,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $generic, array( 'value' ) ),
		'time' => array_merge( $generic, array( 'datetime' ) ),
		'mark' => $generic,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled, so no conditional is used to hide them here.
		// They are also only valid in wikitext as Microdata elements
		// (validateTag rejects tags missing the attributes Microdata needs),
		// so the generic attributes, which would serve no purpose, are omitted.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	// Remember which settings this list was built for.
	$cachedContext = $context;

	return $cachedList;
}
1725 
/**
 * Remove everything between "<" and ">" delimiters from the text, then
 * decode character references and normalize whitespace in what is left.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Drop the actual <tags>
	$tagless = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Then normalize &entities and whitespace, in that order
	return self::normalizeWhitespace( self::decodeCharReferences( $tagless ) );
}
1746 
/**
 * Build a DOCTYPE declaration with an internal subset that declares
 * every entry of self::$htmlEntities as a numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = array();
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	// The declarations are concatenated with no separator between them.
	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1764 
/**
 * Clean up a URL: decode character references, percent-encode a fixed
 * set of unsafe characters, and strip characters that are ignored in
 * IDNs from the host part.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$decoded = self::decodeCharReferences( $url );

	# Percent-encode brackets, angle brackets, double quotes, pipes,
	# control characters and spaces, including any produced by the decoding
	$escaped = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		array( __CLASS__, 'cleanUrlCallback' ),
		$decoded
	);

	# Split into "scheme:", optional "//host" and the remainder
	$parts = array();
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $escaped, $parts ) ) {
		// Nothing that looks like a scheme: return the escaped text as is.
		return $escaped;
	}

	// Characters that will be ignored in IDNs.
	// http://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$ignorable = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	// @todo FIXME: Validate hostnames here

	// $parts[1] = protocol, $parts[2] = host, $parts[3] = rest
	return $parts[1] . preg_replace( $ignorable, '', $parts[2] ) . $parts[3];
}
1811 
/**
 * preg_replace_callback() helper for cleanUrl(): urlencode() the
 * matched text.
 *
 * @param array $matches Match array; only element 0 is used
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1819 
/**
 * Does a string look like an e-mail address?
 *
 * The 'isValidEmailAddr' hook runs first; if it returns false, whatever
 * the hook stored in $result is returned and the built-in check is
 * skipped. Otherwise the address must consist of a liberal local part,
 * an "@", and one or more dot-separated labels made of letters, digits
 * and hyphens.
 *
 * @param string $addr E-mail address to check
 * @return bool|mixed Result of the built-in check, or the hook's $result
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// The strings below end up inside brackets [], where a hyphen "-" is a
	// range indicator. Hence it is double backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier (PCRE_DOLLAR_ENDONLY) is required: without it "$" also
	// matches just before a trailing newline, so "user@example.org\n"
	// would be accepted as valid.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at very end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1871 }
Sanitizer\normalizeAttributeValue
static normalizeAttributeValue( $text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
Definition: Sanitizer.php:1273
$result
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message. Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item. $reader:XMLReader object $logInfo:Array of information Return false to stop further processing of the tag 'ImportHandlePageXMLTag':When parsing a XML tag in a page. $reader:XMLReader object $pageInfo:Array of information Return false to stop further processing of the tag 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision. $reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information Return false to stop further processing of the tag 'ImportHandleToplevelXMLTag':When parsing a top level XML tag. $reader:XMLReader object Return false to stop further processing of the tag 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload. $reader:XMLReader object $revisionInfo:Array of information Return false to stop further processing of the tag 'InfoAction':When building information to display on the action=info page. $context:IContextSource object & $pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect. $title:Title object for the current page $request:WebRequest $ignoreRedirect:boolean to skip redirect check $target:Title/string of redirect target $article:Article object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not. Return true without providing an interwiki to continue interwiki search. $prefix:interwiki prefix we are looking for. & $iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 'InternalParseBeforeSanitize':during Parser 's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings. 
Ideal for syntax-extensions after template/parser function execution which respect nowiki and HTML-comments. & $parser:Parser object & $text:string containing partially parsed text & $stripState:Parser 's internal StripState object 'InternalParseBeforeLinks':during Parser 's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings. & $parser:Parser object & $text:string containing partially parsed text & $stripState:Parser 's internal StripState object 'InvalidateEmailComplete':Called after a user 's email has been invalidated successfully. $user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification. Callee may modify $url and $query, URL will be constructed as $url . $query & $url:URL to index.php & $query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) $article:article(object) being checked 'IsTrustedProxy':Override the result of wfIsTrustedProxy() $ip:IP being check $result:Change this value to override the result of wfIsTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from & $allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of User::isValidEmailAddr(), for instance to return false if the domain name doesn 't match your organization. 
$addr:The e-mail address entered by the user & $result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user & $result:Set this and return false to override the internal checks $user:User the password is being validated for 'Language::getMessagesFileName':$code:The language code or the language we 're looking for a messages file for & $file:The messages file path, you can override this to change the location. 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces. Do not use this hook to add namespaces. Use CanonicalNamespaces for that. & $namespaces:Array of namespaces indexed by their numbers 'LanguageGetMagic':DEPRECATED, use $magicWords in a file listed in $wgExtensionMessagesFiles instead. Use this to define synonyms of magic words depending of the language $magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetSpecialPageAliases':DEPRECATED, use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead. Use to define aliases of special pages names depending of the language $specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names. & $names:array of language code=> language name $code language of the preferred translations 'LanguageLinks':Manipulate a page 's language links. This is called in various places to allow extensions to define the effective language links for a page. $title:The page 's Title. & $links:Associative array mapping language codes to prefixed links of the form "language:title". & $linkFlags:Associative array mapping prefixed links to arrays of flags. Currently unused, but planned to provide support for marking individual language links in the UI, e.g. for featured articles. 
'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts. Return false to skip default processing and return $ret. See documentation for Linker::link() for details on the expected meanings of parameters. $skin:the Skin object $target:the Title that the link is pointing to & $html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1528
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1354
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1239
data
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
Sanitizer\attributeWhitelist
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1501
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:607
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
$html
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1530
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:50
is
We use the convention $dbr for read and $dbw for write to help you keep track of whether the database object is a the world will explode Or to be a subsequent write query which succeeded on the master may fail when replicated to the slave due to a unique key collision Replication on the slave will stop and it may take hours to repair the database and get it back online Setting read_only in my cnf on the slave will avoid this but given the dire we prefer to have as many checks as possible We provide a but the wrapper functions like please read the documentation for except in special pages derived from QueryPage It s a common pitfall for new developers to submit code containing SQL queries which examine huge numbers of rows Remember that COUNT * is(N), counting rows in atable is like counting beans in a bucket.------------------------------------------------------------------------ Replication------------------------------------------------------------------------The largest installation of MediaWiki, Wikimedia, uses a large set ofslave MySQL servers replicating writes made to a master MySQL server. Itis important to understand the issues associated with this setup if youwant to write code destined for Wikipedia.It 's often the case that the best algorithm to use for a given taskdepends on whether or not replication is in use. Due to our unabashedWikipedia-centrism, we often just use the replication-friendly version, but if you like, you can use wfGetLB() ->getServerCount() > 1 tocheck to see if replication is in use.===Lag===Lag primarily occurs when large write queries are sent to the master.Writes on the master are executed in parallel, but they are executed inserial when they are replicated to the slaves. The master writes thequery to the binlog when the transaction is committed. The slaves pollthe binlog and start executing the query as soon as it appears. 
They canservice reads while they are performing a write query, but will not readanything more from the binlog and thus will perform no more writes. Thismeans that if the write query runs for a long time, the slaves will lagbehind the master for the time it takes for the write query to complete.Lag can be exacerbated by high read load. MediaWiki 's load balancer willstop sending reads to a slave when it is lagged by more than 30 seconds.If the load ratios are set incorrectly, or if there is too much loadgenerally, this may lead to a slave permanently hovering around 30seconds lag.If all slaves are lagged by more than 30 seconds, MediaWiki will stopwriting to the database. All edits and other write operations will berefused, with an error returned to the user. This gives the slaves achance to catch up. Before we had this mechanism, the slaves wouldregularly lag by several minutes, making review of recent editsdifficult.In addition to this, MediaWiki attempts to ensure that the user seesevents occurring on the wiki in chronological order. A few seconds of lagcan be tolerated, as long as the user sees a consistent picture fromsubsequent requests. This is done by saving the master binlog positionin the session, and then at the start of each request, waiting for theslave to catch up to that position before doing any reads from it. Ifthis wait times out, reads are allowed anyway, but the request isconsidered to be in "lagged slave mode". Lagged slave mode can bechecked by calling wfGetLB() ->getLaggedSlaveMode(). The onlypractical consequence at present is a warning displayed in the pagefooter.===Lag avoidance===To avoid excessive lag, queries which write large numbers of rows shouldbe split up, generally to write one row at a time. Multi-row INSERT ...SELECT queries are the worst offenders should be avoided altogether.Instead do the select first and then the insert.===Working with lag===Despite our best efforts, it 's not practical to guarantee a low-lagenvironment. 
Lag will usually be less than one second, but mayoccasionally be up to 30 seconds. For scalability, it 's very importantto keep load on the master low, so simply sending all your queries tothe master is not the answer. So when you have a genuine need forup-to-date data, the following approach is advised:1) Do a quick query to the master for a sequence number or timestamp 2) Run the full query on the slave and check if it matches the data you gotfrom the master 3) If it doesn 't, run the full query on the masterTo avoid swamping the master every time the slaves lag, use of thisapproach should be kept to a minimum. In most cases you should just readfrom the slave and let the user deal with the delay.------------------------------------------------------------------------ Lock contention------------------------------------------------------------------------Due to the high write rate on Wikipedia(and some other wikis), MediaWiki developers need to be very careful to structure their writesto avoid long-lasting locks. By default, MediaWiki opens a transactionat the first query, and commits it before the output is sent. Locks willbe held from the time when the query is done until the commit. So youcan reduce lock time by doing as much processing as possible before youdo your write queries.Often this approach is not good enough, and it becomes necessary toenclose small groups of queries in their own transaction. Use thefollowing syntax:$dbw=wfGetDB(DB_MASTER
Sanitizer\$htmlEntities
static $htmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
Definition: Sanitizer.php:58
text
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1484
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:807
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
$ret
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1530
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:2434
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1298
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1848
$params
$params
Definition: styleTest.css.php:40
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1371
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1220
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1431
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:325
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1144
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1328
$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the content language as $wgContLang
Definition: design.txt:56
Sanitizer\stripAllTags
static stripAllTags( $text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1736
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:655
title
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
Definition: All_system_messages.txt:2703
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1756
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:51
codepointToUtf8
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
Definition: UtfNormalUtil.php:36
MWException
MediaWiki exception.
Definition: MWException.php:26
Sanitizer\$htmlEntityAliases
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:317
$out
$out
Definition: UtfNormalGenerate.php:167
wfRestoreWarnings
wfRestoreWarnings()
Restore error level to previous value.
Definition: GlobalFunctions.php:2464
hooks
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
table
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:1042
directly
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1021
Sanitizer\armorLinksCallback
static armorLinksCallback( $matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1172
wfRunHooks
wfRunHooks( $event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in $wgHooks.
Definition: GlobalFunctions.php:4066
Sanitizer\validateAttributes
static validateAttributes( $attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
Definition: Sanitizer.php:711
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
simple
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
list
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1384
Sanitizer\escapeId
static escapeId( $id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it.
Definition: Sanitizer.php:1100
will
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
Definition: All_system_messages.txt:914
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1398
$options
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition: hooks.txt:1530
$section
$section
Definition: Utf8Test.php:88
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:742
root
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
Definition: distributors.txt:39
$name
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:336
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
$value
$value
Definition: styleTest.css.php:45
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:691
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1769
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:965
tags
pre inside other HTML tags(bug 54946) !! wikitext a< div >< pre > foo</pre ></div >< pre ></pre > !! html< p >a</p >< div >< pre > foo</pre ></div >< pre ></pre > !! end !! test HTML pre followed by indent-pre !! wikitext< pre >foo</pre > bar !! html< pre >foo</pre >< pre >bar</pre > !! end !!test Block tag pre !!options parsoid !! wikitext< p >< pre >foo</pre ></p > !! html< p data-parsoid
UTF8_REPLACEMENT
const UTF8_REPLACEMENT
Definition: UtfNormalDefines.php:64
only
published in in Madrid In the first edition of the Vocabolario for was published In in Rotterdam was the Dictionnaire Universel ! html< p > The first monolingual dictionary written in a Romance language was< i > Sebastián Covarrubias</i >< i > Tesoro de la lengua castellana o published in in Madrid In the first edition of the< i > Vocabolario dell< a href="/index.php?title=Accademia_della_Crusca&amp;action=edit&amp;redlink=1" class="new" title="Accademia della Crusca (page does not exist)"> Accademia della Crusca</a ></i > for was published In in Rotterdam was the< i > Dictionnaire Universel</i ></p > ! end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html php< p >< i > foo</i ></p > ! html parsoid< p >< i > foo</i >< b ></b ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html php< p >< b > foo</b ></p > ! html parsoid< p >< b > foo</b >< i ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html php< p >< b > foo</b ></p > ! html parsoid< p >< b > foo</b >< i ></i ></p > !end ! test Italics and ! options ! wikitext foo ! html< p >< b >< i > foo</i ></b ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! 
html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html php< p >< b > foo</b > bar</p > ! html parsoid< p >< b > foo</b > bar< i ></i ></p > !end ! test Italics and ! wikitext foo bar ! html php< p >< b > foo</b > bar</p > ! html parsoid< p >< b > foo</b > bar< b ></b ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< i > this is about< b > foo s family</b ></i ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< i > this is about< b > foo s</b > family</i ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< b > this is about< i > foo</i ></b >< i > s family</i ></p > !end ! test Italics and ! options ! wikitext this is about foo s family ! html< p >< i > this is about</i > foo< b > s family</b ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< b > this is about< i > foo s</i > family</b ></p > !end ! test Italicized possessive ! wikitext The s talk page ! html< p > The< i >< a href="/wiki/Main_Page" title="Main Page"> Main Page</a ></i > s talk page</p > ! end ! test Parsoid only
Definition: parserTests.txt:396
$count
$count
Definition: UtfNormalTest2.php:96
it
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content. The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content. These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text. All manipulation and analysis of page content must be done via the appropriate methods of the Content object. For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers. The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id). Also Title, WikiPage and Revision now have getContentHandler() methods for convenience. ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page. ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type. However, it is recommended to instead use WikiPage::getContent() resp. Revision::getContent() to get a page 's content as a Content object. These two methods should be the ONLY way in which page content is accessed. Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides(). This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based. Objects implementing the Content interface are used to represent and handle the content internally. 
For storage and data exchange, each content model supports at least one serialization format via ContentHandler::serializeContent($content). The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats(). Content serialization formats are identified using MIME type like strings. The following formats are built in:*text/x-wiki - wikitext *text/javascript - for js pages *text/css - for css pages *text/plain - for future use, e.g. with plain text messages. *text/html - for future use, e.g. with plain html messages. *application/vnd.php.serialized - for future use with the api and for extensions *application/json - for future use with the api, and for use by extensions *application/xml - for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant. Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly. Without that information, interpretation of the provided content is not reliable. The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export. Also note that the API will provide encapsulated, serialized content - so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure. Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content. 
However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page 's content model, and will now generate warnings when used. Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() is deprecated in favor Revisions::getContent() *WikiPage::getText() is deprecated in favor WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superceded by Article::getContentObject(). However, both methods should be avoided since they do not provide clean access to the page 's actual content. For instance, they may return a system message for non-existing pages. Use WikiPage::getContent() instead. Code that relies on a textual representation of the page content should eventually be rewritten. However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page. Its behavior is controlled by $wgContentHandlerTextFallback it
Definition: contenthandler.txt:107
$args
if( $line===false) $args
Definition: cdb.php:62
Sanitizer\normalizeCss
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:829
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:1005
in
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
used
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:36
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1283
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
such
it sets a lot of them automatically from query and such
Definition: design.txt:93
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1468
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to.
Definition: StringUtils.php:256
utf8ToCodepoint
utf8ToCodepoint( $char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
Definition: UtfNormalUtil.php:94
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1317
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1184
from
Please log in again after you receive it</td >< td > s a saved copy from
Definition: All_system_messages.txt:3297
that
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
Definition: deferred.txt:11
Sanitizer\setupAttributeWhitelist
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1513
$t
$t
Definition: testCompression.php:65
$vars
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1684
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1414
$attribs
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1530
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:938
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:332
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1159
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1449
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1816
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:366