MediaWiki  1.23.2
Sanitizer.php
Go to the documentation of this file.
1 <?php
31 class Sanitizer {
/**
 * Regular expression matching the character references handled by
 * normalizeCharReferences() and decodeCharReferences().
 *
 * Capture groups (extended mode, so layout whitespace is ignored):
 *   1. entity name of a named reference (&name;)
 *   2. digits of a decimal reference (&#123;)
 *   3. digits of a hexadecimal reference (&#x7b;)
 *   4. a bare ampersand that starts no reference
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';
41 
// Case-insensitive match for the keyword "javascript" or "vbscript" that sits at
// the start of the value, after whitespace, or after a closing "*/" comment token,
// and is followed by a non-word character or the end of the value.
// validateAttributes() uses it to reject attribute values.
50  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
// Matches attribute names of the form "xmlns:<suffix>"; validateAttributes()
// lets such attributes through when $wgAllowRdfaAttributes is set.
51  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
52 
// Lookup table of named character entities: entity name => integer code point.
// decodeEntity() turns the number into UTF-8 via codepointToUtf8(), and
// normalizeEntity() emits it as a decimal character reference ("&#NNN;").
// Names are case-sensitive (e.g. 'Aacute' and 'aacute' are distinct keys).
58  private static $htmlEntities = array(
59  'Aacute' => 193,
60  'aacute' => 225,
61  'Acirc' => 194,
62  'acirc' => 226,
63  'acute' => 180,
64  'AElig' => 198,
65  'aelig' => 230,
66  'Agrave' => 192,
67  'agrave' => 224,
68  'alefsym' => 8501,
69  'Alpha' => 913,
70  'alpha' => 945,
71  'amp' => 38,
72  'and' => 8743,
73  'ang' => 8736,
74  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
75  'Aring' => 197,
76  'aring' => 229,
77  'asymp' => 8776,
78  'Atilde' => 195,
79  'atilde' => 227,
80  'Auml' => 196,
81  'auml' => 228,
82  'bdquo' => 8222,
83  'Beta' => 914,
84  'beta' => 946,
85  'brvbar' => 166,
86  'bull' => 8226,
87  'cap' => 8745,
88  'Ccedil' => 199,
89  'ccedil' => 231,
90  'cedil' => 184,
91  'cent' => 162,
92  'Chi' => 935,
93  'chi' => 967,
94  'circ' => 710,
95  'clubs' => 9827,
96  'cong' => 8773,
97  'copy' => 169,
98  'crarr' => 8629,
99  'cup' => 8746,
100  'curren' => 164,
101  'dagger' => 8224,
102  'Dagger' => 8225,
103  'darr' => 8595,
104  'dArr' => 8659,
105  'deg' => 176,
106  'Delta' => 916,
107  'delta' => 948,
108  'diams' => 9830,
109  'divide' => 247,
110  'Eacute' => 201,
111  'eacute' => 233,
112  'Ecirc' => 202,
113  'ecirc' => 234,
114  'Egrave' => 200,
115  'egrave' => 232,
116  'empty' => 8709,
117  'emsp' => 8195,
118  'ensp' => 8194,
119  'Epsilon' => 917,
120  'epsilon' => 949,
121  'equiv' => 8801,
122  'Eta' => 919,
123  'eta' => 951,
124  'ETH' => 208,
125  'eth' => 240,
126  'Euml' => 203,
127  'euml' => 235,
128  'euro' => 8364,
129  'exist' => 8707,
130  'fnof' => 402,
131  'forall' => 8704,
132  'frac12' => 189,
133  'frac14' => 188,
134  'frac34' => 190,
135  'frasl' => 8260,
136  'Gamma' => 915,
137  'gamma' => 947,
138  'ge' => 8805,
139  'gt' => 62,
140  'harr' => 8596,
141  'hArr' => 8660,
142  'hearts' => 9829,
143  'hellip' => 8230,
144  'Iacute' => 205,
145  'iacute' => 237,
146  'Icirc' => 206,
147  'icirc' => 238,
148  'iexcl' => 161,
149  'Igrave' => 204,
150  'igrave' => 236,
151  'image' => 8465,
152  'infin' => 8734,
153  'int' => 8747,
154  'Iota' => 921,
155  'iota' => 953,
156  'iquest' => 191,
157  'isin' => 8712,
158  'Iuml' => 207,
159  'iuml' => 239,
160  'Kappa' => 922,
161  'kappa' => 954,
162  'Lambda' => 923,
163  'lambda' => 955,
164  'lang' => 9001,
165  'laquo' => 171,
166  'larr' => 8592,
167  'lArr' => 8656,
168  'lceil' => 8968,
169  'ldquo' => 8220,
170  'le' => 8804,
171  'lfloor' => 8970,
172  'lowast' => 8727,
173  'loz' => 9674,
174  'lrm' => 8206,
175  'lsaquo' => 8249,
176  'lsquo' => 8216,
177  'lt' => 60,
178  'macr' => 175,
179  'mdash' => 8212,
180  'micro' => 181,
181  'middot' => 183,
182  'minus' => 8722,
183  'Mu' => 924,
184  'mu' => 956,
185  'nabla' => 8711,
186  'nbsp' => 160,
187  'ndash' => 8211,
188  'ne' => 8800,
189  'ni' => 8715,
190  'not' => 172,
191  'notin' => 8713,
192  'nsub' => 8836,
193  'Ntilde' => 209,
194  'ntilde' => 241,
195  'Nu' => 925,
196  'nu' => 957,
197  'Oacute' => 211,
198  'oacute' => 243,
199  'Ocirc' => 212,
200  'ocirc' => 244,
201  'OElig' => 338,
202  'oelig' => 339,
203  'Ograve' => 210,
204  'ograve' => 242,
205  'oline' => 8254,
206  'Omega' => 937,
207  'omega' => 969,
208  'Omicron' => 927,
209  'omicron' => 959,
210  'oplus' => 8853,
211  'or' => 8744,
212  'ordf' => 170,
213  'ordm' => 186,
214  'Oslash' => 216,
215  'oslash' => 248,
216  'Otilde' => 213,
217  'otilde' => 245,
218  'otimes' => 8855,
219  'Ouml' => 214,
220  'ouml' => 246,
221  'para' => 182,
222  'part' => 8706,
223  'permil' => 8240,
224  'perp' => 8869,
225  'Phi' => 934,
226  'phi' => 966,
227  'Pi' => 928,
228  'pi' => 960,
229  'piv' => 982,
230  'plusmn' => 177,
231  'pound' => 163,
232  'prime' => 8242,
233  'Prime' => 8243,
234  'prod' => 8719,
235  'prop' => 8733,
236  'Psi' => 936,
237  'psi' => 968,
238  'quot' => 34,
239  'radic' => 8730,
240  'rang' => 9002,
241  'raquo' => 187,
242  'rarr' => 8594,
243  'rArr' => 8658,
244  'rceil' => 8969,
245  'rdquo' => 8221,
246  'real' => 8476,
247  'reg' => 174,
248  'rfloor' => 8971,
249  'Rho' => 929,
250  'rho' => 961,
251  'rlm' => 8207,
252  'rsaquo' => 8250,
253  'rsquo' => 8217,
254  'sbquo' => 8218,
255  'Scaron' => 352,
256  'scaron' => 353,
257  'sdot' => 8901,
258  'sect' => 167,
259  'shy' => 173,
260  'Sigma' => 931,
261  'sigma' => 963,
262  'sigmaf' => 962,
263  'sim' => 8764,
264  'spades' => 9824,
265  'sub' => 8834,
266  'sube' => 8838,
267  'sum' => 8721,
268  'sup' => 8835,
269  'sup1' => 185,
270  'sup2' => 178,
271  'sup3' => 179,
272  'supe' => 8839,
273  'szlig' => 223,
274  'Tau' => 932,
275  'tau' => 964,
276  'there4' => 8756,
277  'Theta' => 920,
278  'theta' => 952,
279  'thetasym' => 977,
280  'thinsp' => 8201,
281  'THORN' => 222,
282  'thorn' => 254,
283  'tilde' => 732,
284  'times' => 215,
285  'trade' => 8482,
286  'Uacute' => 218,
287  'uacute' => 250,
288  'uarr' => 8593,
289  'uArr' => 8657,
290  'Ucirc' => 219,
291  'ucirc' => 251,
292  'Ugrave' => 217,
293  'ugrave' => 249,
294  'uml' => 168,
295  'upsih' => 978,
296  'Upsilon' => 933,
297  'upsilon' => 965,
298  'Uuml' => 220,
299  'uuml' => 252,
300  'weierp' => 8472,
301  'Xi' => 926,
302  'xi' => 958,
303  'Yacute' => 221,
304  'yacute' => 253,
305  'yen' => 165,
306  'Yuml' => 376,
307  'yuml' => 255,
308  'Zeta' => 918,
309  'zeta' => 950,
310  'zwj' => 8205,
311  'zwnj' => 8204
312  );
313 
// Alternative entity names => the canonical key in $htmlEntities.
// normalizeEntity() and decodeEntity() consult this table first, so an alias
// behaves exactly like the name it maps to; both entries here resolve to 'rlm'.
317  private static $htmlEntityAliases = array(
318  'רלמ' => 'rlm',
319  'رلم' => 'rlm',
320  );
321 
// Cache for the pattern built by getAttribsRegex(); stays null until the first call.
325  private static $attribsRegex;
326 
/**
 * Return the regular expression used to pick attribute name/value pairs out
 * of the text inside a tag. The pattern is built on first use and cached in
 * self::$attribsRegex.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part (optional),
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
 * 6 = unquoted "#hex" value. getTagAttributeCallback() reads these groups.
 *
 * @return string PCRE pattern (modifiers "s" and "x")
 */
332  static function getAttribsRegex() {
333  if ( self::$attribsRegex === null ) {
// First character of a name may not be "-" or "."; later characters may.
334  $attribFirst = '[:A-Z_a-z0-9]';
335  $attrib = '[:A-Z_a-z-.0-9]';
// Tab, LF, CR and space only.
336  $space = '[\x09\x0a\x0d\x20]';
337  self::$attribsRegex =
338  "/(?:^|$space)({$attribFirst}{$attrib}*)
339  ($space*=$space*
340  (?:
341  # The attribute value: quoted or alone
342  \"([^<\"]*)\"
343  | '([^<']*)'
344  | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
345  | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
346  # colors are specified like this.
347  # We'll be normalizing it.
348  )
349  )?(?=$space|\$)/sx";
350  }
351  return self::$attribsRegex;
352  }
353 
/**
 * Clean up HTML in a text fragment: strip HTML comments, keep only tags whose
 * element name is in the allowed set, filter their attributes through
 * fixTagAttributes(), and escape everything else.
 *
 * The text is split at every "<". Each piece is either re-emitted as a tag
 * or output as "&lt;" followed by the piece with ">" turned into "&gt;".
 * When $wgUseTidy is false, nesting is tracked on $tagstack (with a separate
 * stack per nested table) and tags still open at the end are closed. When it
 * is true, only the element and attribute checks are applied.
 *
 * NOTE(review): the embedded listing numbers skip in the closing-tag branch
 * (468→470, 470→472, ...); statements may be missing there - confirm against
 * the original file.
 *
 * @param string $text Text to clean
 * @param callable|null $processCallback Called as ( &$params, $args ) on the raw
 *   attribute text of each allowed tag before validation
 * @param array $args Passed through to $processCallback
 * @param array $extratags Additional element names to allow; they are treated
 *   as tags that must be closed
 * @param array $removetags Element names to remove from the allowed set
 * @return string
 */
366  static function removeHTMLtags( $text, $processCallback = null,
367  $args = array(), $extratags = array(), $removetags = array()
368  ) {
369  global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
370 
371  static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
372  $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
373 
374  wfProfileIn( __METHOD__ );
375 
376  // Base our staticInitialised variable off of the global config state so that if the globals
377  // are changed (like in the screwed up test system) we will re-initialise the settings.
378  $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
379  if ( !$staticInitialised || $staticInitialised != $globalContext ) {
380 
381  $htmlpairsStatic = array( # Tags that must be closed
382  'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
383  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
384  'strike', 'strong', 'tt', 'var', 'div', 'center',
385  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
386  'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
387  'kbd', 'samp', 'data', 'time', 'mark'
388  );
389  $htmlsingle = array(
390  'br', 'wbr', 'hr', 'li', 'dt', 'dd'
391  );
392  $htmlsingleonly = array( # Elements that cannot have close tags
393  'br', 'wbr', 'hr'
394  );
395  if ( $wgAllowMicrodataAttributes ) {
396  $htmlsingle[] = $htmlsingleonly[] = 'meta';
397  $htmlsingle[] = $htmlsingleonly[] = 'link';
398  }
399  $htmlnest = array( # Tags that can be nested--??
400  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
401  'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
402  'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
403  );
404  $tabletags = array( # Can only appear inside table, we will close them
405  'td', 'th', 'tr',
406  );
407  $htmllist = array( # Tags used by list
408  'ul', 'ol',
409  );
410  $listtags = array( # Tags that can appear in a list
411  'li',
412  );
413 
414  if ( $wgAllowImageTag ) {
415  $htmlsingle[] = 'img';
416  $htmlsingleonly[] = 'img';
417  }
418 
419  $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
420  $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
421 
422  # Convert them all to hashtables for faster lookup
423  $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
424  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
425  foreach ( $vars as $var ) {
426  $$var = array_flip( $$var );
427  }
428  $staticInitialised = $globalContext;
429  }
430  # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
431  $extratags = array_flip( $extratags );
432  $removetags = array_flip( $removetags );
433  $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
434  $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
435 
436  # Remove HTML comments
437  $text = Sanitizer::removeHTMLcomments( $text );
// Everything before the first "<" is plain text; each later piece starts
// right after a "<" and is examined as a potential tag.
438  $bits = explode( '<', $text );
439  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
440  if ( !$wgUseTidy ) {
441  $tagstack = $tablestack = array();
442  foreach ( $bits as $x ) {
443  $regs = array();
444  # $slash: Does the current element start with a '/'?
445  # $t: Current element name
446  # $params: String between element name and >
447  # $brace: Ending '>' or '/>'
448  # $rest: Everything until the next element of $bits
449  if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
450  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
451  } else {
452  $slash = $t = $params = $brace = $rest = null;
453  }
454 
455  $badtag = false;
456  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
457  # Check our stack
458  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
459  $badtag = true;
460  } elseif ( $slash ) {
461  # Closing a tag... is it the one we just opened?
462  $ot = @array_pop( $tagstack );
463  if ( $ot != $t ) {
464  if ( isset( $htmlsingleallowed[$ot] ) ) {
465  # Pop all elements with an optional close tag
466  # and see if we find a match below them
467  $optstack = array();
468  array_push( $optstack, $ot );
470  $ot = array_pop( $tagstack );
472  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
473  array_push( $optstack, $ot );
475  $ot = array_pop( $tagstack );
477  }
478  if ( $t != $ot ) {
479  # No match. Push the optional elements back again
480  $badtag = true;
482  $ot = array_pop( $optstack );
484  while ( $ot ) {
485  array_push( $tagstack, $ot );
487  $ot = array_pop( $optstack );
489  }
490  }
491  } else {
492  @array_push( $tagstack, $ot );
493  # <li> can be nested in <ul> or <ol>, skip those cases:
494  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
495  $badtag = true;
496  }
497  }
498  } else {
// Closing a table restores the tag stack that was saved when it opened.
499  if ( $t == 'table' ) {
500  $tagstack = array_pop( $tablestack );
501  }
502  }
503  $newparams = '';
504  } else {
505  # Keep track for later
506  if ( isset( $tabletags[$t] ) &&
507  !in_array( 'table', $tagstack ) ) {
508  $badtag = true;
509  } elseif ( in_array( $t, $tagstack ) &&
510  !isset( $htmlnest[$t] ) ) {
511  $badtag = true;
512  # Is it a self closed htmlpair ? (bug 5487)
513  } elseif ( $brace == '/>' &&
514  isset( $htmlpairs[$t] ) ) {
515  $badtag = true;
516  } elseif ( isset( $htmlsingleonly[$t] ) ) {
517  # Hack to force empty tag for unclosable elements
518  $brace = '/>';
519  } elseif ( isset( $htmlsingle[$t] ) ) {
520  # Hack to not close $htmlsingle tags
521  $brace = null;
522  # Still need to push this optionally-closed tag to
523  # the tag stack so that we can match end tags
524  # instead of marking them as bad.
525  array_push( $tagstack, $t );
526  } elseif ( isset( $tabletags[$t] )
527  && in_array( $t, $tagstack ) ) {
528  // New table tag but forgot to close the previous one
529  $text .= "</$t>";
530  } else {
// Opening a table starts a fresh tag stack; the outer one is saved.
531  if ( $t == 'table' ) {
532  array_push( $tablestack, $tagstack );
533  $tagstack = array();
534  }
535  array_push( $tagstack, $t );
536  }
537 
538  # Replace any variables or template parameters with
539  # plaintext results.
540  if ( is_callable( $processCallback ) ) {
541  call_user_func_array( $processCallback, array( &$params, $args ) );
542  }
543 
544  if ( !Sanitizer::validateTag( $params, $t ) ) {
545  $badtag = true;
546  }
547 
548  # Strip non-approved attributes from the tag
549  $newparams = Sanitizer::fixTagAttributes( $params, $t );
550  }
551  if ( !$badtag ) {
552  $rest = str_replace( '>', '&gt;', $rest );
553  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
554  $text .= "<$slash$t$newparams$close>$rest";
555  continue;
556  }
557  }
// Not an allowed tag, or a bad one: output it escaped.
558  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
559  }
560  # Close off any remaining tags
561  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
562  $text .= "</$t>\n";
563  if ( $t == 'table' ) {
564  $tagstack = array_pop( $tablestack );
565  }
566  }
567  } else {
568  # this might be possible using tidy itself
569  foreach ( $bits as $x ) {
570  preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
571  $x, $regs );
572  @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
573  $badtag = false;
574  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
575  if ( is_callable( $processCallback ) ) {
576  call_user_func_array( $processCallback, array( &$params, $args ) );
577  }
578 
579  if ( !Sanitizer::validateTag( $params, $t ) ) {
580  $badtag = true;
581  }
582 
583  $newparams = Sanitizer::fixTagAttributes( $params, $t );
584  if ( !$badtag ) {
585  $rest = str_replace( '>', '&gt;', $rest );
586  $text .= "<$slash$t$newparams$brace$rest";
587  continue;
588  }
589  }
590  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
591  }
592  }
593  wfProfileOut( __METHOD__ );
594  return $text;
595  }
596 
/**
 * Remove every "<!-- ... -->" comment from $text.
 *
 * Processing stops at the first unterminated comment, which is left in place
 * together with everything after it. When a comment, including the spaces
 * around it, is directly preceded and followed by a newline, the spaces and
 * one of the newlines are removed with it so that no blank line remains.
 *
 * @param string $text
 * @return string
 */
607  static function removeHTMLcomments( $text ) {
608  wfProfileIn( __METHOD__ );
609  while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
610  $end = strpos( $text, '-->', $start + 4 );
611  if ( $end === false ) {
612  # Unterminated comment; bail out
613  break;
614  }
615 
// Move $end past the closing "-->".
616  $end += 3;
617 
618  # Trim space and newline if the comment is both
619  # preceded and followed by a newline
// $spaceStart begins on the character before the comment (clamped to 0) and
// walks left over spaces; $spaceLen always measures from $spaceStart to the
// first character after the comment and its trailing spaces.
620  $spaceStart = max( $start - 1, 0 );
621  $spaceLen = $end - $spaceStart;
622  while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
623  $spaceStart--;
624  $spaceLen++;
625  }
626  while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
627  $spaceLen++;
628  }
629  if ( substr( $text, $spaceStart, 1 ) === "\n"
630  && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
631  # Remove the comment, leading and trailing
632  # spaces, and leave only one newline.
633  $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
634  } else {
635  # Remove just the comment.
636  $text = substr_replace( $text, '', $start, $end - $start );
637  }
638  }
639  wfProfileOut( __METHOD__ );
640  return $text;
641  }
642 
/**
 * Decide whether a tag may be output at all, judging by its attributes.
 *
 * Only <meta> and <link> are restricted: both need an "itemprop" attribute,
 * <meta> additionally needs "content" and <link> needs "href". Every other
 * element is accepted.
 *
 * removeHTMLtags() passes the raw attribute text of the tag, so it has to be
 * decoded into a name => value map before attributes can be looked up by
 * name; indexing the raw string by name would never find anything.
 *
 * @param string|array $params Raw attribute text, or an already decoded
 *   attribute name => value map
 * @param string $element Lower-case element name
 * @return bool False if the tag must be rejected
 */
static function validateTag( $params, $element ) {
	if ( $element == 'meta' || $element == 'link' ) {
		$attribs = is_array( $params ) ? $params : Sanitizer::decodeTagAttributes( $params );

		if ( !isset( $attribs['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}
		if ( $element == 'meta' && !isset( $attribs['content'] ) ) {
			// <meta> must have a content="" for the itemprop
			return false;
		}
		if ( $element == 'link' && !isset( $attribs['href'] ) ) {
			// <link> must have an associated href=""
			return false;
		}
	}

	return true;
}
675 
/**
 * Filter a decoded attribute map for one element: look up the attribute
 * whitelist of $element and hand both to validateAttributes().
 *
 * @param array $attribs Attribute name => value map
 * @param string $element Element name
 * @return array The attributes that survived validation
 */
static function validateTagAttributes( $attribs, $element ) {
	return Sanitizer::validateAttributes( $attribs,
		Sanitizer::attributeWhitelist( $element ) );
}
695 
/**
 * Filter an attribute map against a whitelist and sanitize selected values.
 *
 * Rules applied per attribute, in order:
 * - "xmlns:*" names are kept when $wgAllowRdfaAttributes is set and the value
 *   does not match EVIL_URI_PATTERN; no further checks apply to them.
 * - Names that neither start with "data-" nor appear in $whitelist are dropped.
 * - "style" values are run through checkCss().
 * - "id" values are run through escapeId() with the 'noninitial' option.
 * - "role" is kept only with the value "presentation".
 * - RDFa / microdata attributes are dropped if they match EVIL_URI_PATTERN.
 * - "href" / "src" are dropped unless they start with an allowed protocol
 *   (this also drops relative URLs).
 * Afterwards, with $wgAllowMicrodataAttributes set, itemtype/itemid/itemref
 * are removed when no itemscope attribute survived.
 *
 * @param array $attribs Attribute name => value map
 * @param array $whitelist List of allowed attribute names
 * @return array Filtered attribute name => value map
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	// RDFa and microdata attributes whose values may hold URLs, URIs or CURIEs
	static $uriLikeAttributes = array(
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	);

	$whitelist = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		#allow XML namespace declaration if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute === 'style' ) {
			// Without this call style values would reach the output unchecked.
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		# WAI-ARIA
		# http://www.w3.org/TR/wai-aria/
		# http://www.whatwg.org/html/elements.html#wai-aria
		# For now we only support role="presentation" until we work out what roles should be
		# usable by content and we ensure that our code explicitly rejects patterns that
		# violate HTML5's ARIA restrictions.
		if ( $attribute === 'role' && $value !== 'presentation' ) {
			continue;
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		if ( in_array( $attribute, $uriLikeAttributes, true ) ) {
			//Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $attribute === 'href' || $attribute === 'src' ) {
			if ( !preg_match( $hrefExp, $value ) ) {
				continue; //drop any href or src attributes not using an allowed protocol.
				// NOTE: this also drops all relative URLs
			}
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	if ( $wgAllowMicrodataAttributes ) {
		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'] );
			unset( $out['itemid'] );
			unset( $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
	}
	return $out;
}
796 
/**
 * Merge two attribute maps. Entries of $b override entries of $a, except for
 * "class": when both maps carry a different string value for it, the two
 * values are joined into one space-separated list without duplicates.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
		return $merged;
	}
	if ( $a['class'] === $b['class'] ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
		-1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
819 
/**
 * Sanitize the value of a style attribute.
 *
 * The value is first brought into a canonical form (character references
 * decoded, CSS escapes decoded, fullwidth and look-alike characters mapped to
 * ASCII, comments removed) so that obfuscated keywords become visible, and is
 * then checked for control characters and for constructs that can execute
 * script or load external resources.
 *
 * @param string $value Raw style attribute value
 * @return string The normalized value, or one of the placeholder comments
 *   "/* invalid control char *​/" / "/* insecure input *​/" if it is rejected
 */
static function checkCss( $value ) {
	// Decode character references like &#123;
	// Must come first: the escape decoding below relies on it (see next comment).
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ), $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
	// The range is written with \x{...} escapes so the pattern cannot be
	// damaged by tools that fold fullwidth characters to ASCII.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
		function ( $matches ) {
			$cp = utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
		array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
942 
/**
 * preg_replace_callback() handler for the CSS escape pattern built in
 * checkCss(). Group 1 is a line continuation, group 2 a hexadecimal
 * character number, group 3 an escaped single character; if none is filled
 * the backslash stood at the end of the string.
 *
 * @param array $matches
 * @return string Replacement for the escape sequence
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	$needsEscape = $char == "\n" || $char == '"' || $char == "'" || $char == '\\';
	if ( !$needsEscape ) {
		// Decode unnecessary escape
		return $char;
	}
	// These characters need to be escaped in strings
	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $char ) ) . ' ';
}
967 
/**
 * Turn the raw attribute text of a tag into sanitized attribute markup:
 * decode it, validate it for $element, and encode what is left.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name
 * @return string Empty string if $text is blank, otherwise the result of
 *   safeEncodeTagAttributes()
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) != '' ) {
		$allowed = Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ),
			$element
		);
		return Sanitizer::safeEncodeTagAttributes( $allowed );
	}

	return '';
}
997 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * @param string $text
 * @return string HTML-encoded text, with line feed, carriage return and tab
 *   additionally written as numeric character references
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
1017 
/**
 * Encode an attribute value like encodeAttribute() and additionally armor
 * character sequences that later parsing stages could expand.
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Stupid hack
	# Hide the colon of every known URL protocol (see armorLinksCallback).
	$protocols = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocols,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored );
}
1050 
/**
 * Convert a string into a value usable as an id attribute.
 *
 * With $wgExperimentalHtmlIds enabled and no 'legacy' option, runs of
 * whitespace, underscores, quotes, "&", "#" and "%" collapse to a single
 * underscore and leading/trailing underscores are trimmed. Otherwise the id
 * is URL-encoded with "%" written as "." (and "%3A" as ":"), and prefixed
 * with "x" when it does not start with a letter, unless 'noninitial' is given.
 *
 * @param string $id
 * @param string|array $options 'legacy' and/or 'noninitial'
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	global $wgExperimentalHtmlIds;
	$flags = (array)$options;

	if ( !$wgExperimentalHtmlIds || in_array( 'legacy', $flags ) ) {
		# HTML4-style escaping
		$escaped = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
		$escaped = str_replace( array( '%3A', '%' ), array( ':', '.' ), $escaped );

		$startsWithLetter = preg_match( '/^[a-zA-Z]/', $escaped );
		if ( !$startsWithLetter && !in_array( 'noninitial', $flags ) ) {
			// Initial character must be a letter!
			$escaped = "x$escaped";
		}
		return $escaped;
	}

	$escaped = Sanitizer::decodeCharReferences( $id );
	$escaped = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $escaped );
	$escaped = trim( $escaped, '_' );

	# An empty result means it must have been all whitespace to start with.
	return $escaped === '' ? '_' : $escaped;
}
1114 
/**
 * Convert a string into a value usable as a CSS class name: a leading digit
 * or hyphen, control characters, spaces, ASCII punctuation and the byte pair
 * C2 A0 become underscores, runs of underscores collapse into one, and
 * trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores and kill underscores in ugly places
	$patterns = array(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'/_+/'
	);
	$underscored = preg_replace( $patterns, '_', $class );

	return rtrim( $underscored, '_' );
}
1133 
/**
 * Escape text for HTML output while letting character references in the
 * input survive: references are decoded to the characters they stand for
 * before escaping, so "&amp;" does not end up as "&amp;amp;".
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
	$html = Sanitizer::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	return $html;
}
1148 
/**
 * preg_replace_callback() handler used by safeEncodeAttribute(): writes every
 * colon of the matched text as the character reference "&#58;".
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$matched = $matches[1];
	return strtr( $matched, array( ':' => '&#58;' ) );
}
1157 
/**
 * Parse the raw attribute text of a tag into an attribute map.
 *
 * Attribute names are lower-cased. Values have runs of whitespace collapsed
 * to one space, are trimmed, and have character references decoded. When a
 * name occurs more than once the last occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$attribs = array();
	$pairs = array();
	if ( !preg_match_all(
		self::getAttribsRegex(),
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		// Pick the value out of whichever capture group matched.
		$value = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
		$value = trim( $value );

		// Decode character references
		$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
	}
	return $attribs;
}
1194 
/**
 * Build attribute markup from an attribute map. Names go through
 * htmlspecialchars(), values through safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value
 * @return string Each attribute as ' name="value"' (note the leading space),
 *   or an empty string when the map is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$markup = '';
	foreach ( $assoc_array as $name => $value ) {
		$encName = htmlspecialchars( $name );
		$encValue = Sanitizer::safeEncodeAttribute( $value );

		$markup .= " $encName=\"$encValue\"";
	}
	return $markup;
}
1212 
/**
 * Extract the attribute value from one match set of the getAttribsRegex()
 * pattern. Value groups are checked from the highest index down:
 * 6 = unquoted "#hex" colour, 5 = unquoted, 4 = single-quoted,
 * 3 = double-quoted. A match without any "= value" part yields the attribute
 * name itself.
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string
 * @throws MWException If a "= value" part was matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	for ( $group = 6; $group >= 3; $group-- ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value.
	# For 'reduced' form, return explicitly the attribute name here.
	return $set[1];
}
1242 
/**
 * Normalize an attribute value for output: normalize its character
 * references, turn each whitespace character into a space, and write double
 * quotes as "&quot;".
 *
 * NOTE(review): the innermost call was missing from this listing and has been
 * restored as normalizeCharReferences(); confirm against the original file.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
1260 
/**
 * Replace every CR LF pair, and every single space, CR, LF or tab, with one
 * space. Consecutive whitespace characters are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

	return preg_replace( $whitespace, ' ', $text );
}
1271 
// NOTE(review): the function header for this body is missing from the listing
// (the embedded numbering jumps from 1270 to 1281); restore it from the
// original file. The body collapses each run of spaces and underscores in
// $section into a single space and trims the result.
1281  return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1282  }
1283 
/**
 * Normalize every character reference in $text by passing each match of
 * CHAR_REFS_REGEX to normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$normalizer = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizer, $text );
}
1305 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on the capture group of CHAR_REFS_REGEX that matched: named
 * entities go to normalizeEntity(), decimal references to decCharReference(),
 * hexadecimal references to hexCharReference(). If the helper rejects the
 * reference (returns null), or the match was a bare ampersand, the matched
 * text is HTML-escaped instead.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	}
	if ( is_null( $ret ) ) {
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
1325 
/**
 * Normalize one named entity.
 *
 * An alias is written as its canonical entity; lt, gt, amp and quot stay
 * named; any other known entity becomes a decimal character reference; an
 * unknown name gets its ampersand escaped.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	$keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
	if ( in_array( $name, $keptAsNamed ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	return "&amp;$name;";
}
1348 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits
 * @return string|null "&#N;" or null if validateCodepoint() rejects the number
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1361 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits
 * @return string|null "&#xN;" with lower-case digits, or null if
 *   validateCodepoint() rejects the number
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1374 
/**
 * Tell whether a number is an accepted code point: tab, line feed, carriage
 * return, or a value in 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1388 
/**
 * Decode every character reference in $text by passing each match of
 * CHAR_REFS_REGEX to decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decoder = array( 'Sanitizer', 'decodeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decoder, $text );
}
1402 
/**
 * Decode character references like decodeCharReferences() and, if at least
 * one replacement was made, pass the result through the content language's
 * normalize() method.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	// Needed for the normalize() call below.
	global $wgContLang;

	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text, /* limit */ -1, $count );

	if ( $count ) {
		return $wgContLang->normalize( $text );
	} else {
		return $text;
	}
}
1426 
/**
 * preg_replace_callback() handler registered by decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * The capture groups are examined in order: group 1 is passed to
 * decodeEntity() as an entity name, group 2 is decoded as a decimal code
 * point, group 3 as a hexadecimal code point.
 *
 * @param array $matches Match array for one hit of CHAR_REFS_REGEX
 * @return string Replacement text for the match
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
1442 
/**
 * Convert a code point to its UTF-8 encoding.
 *
 * @param int $codepoint
 * @return string The output of codepointToUtf8() when validateCodepoint()
 *   accepts the value, otherwise UTF8_REPLACEMENT
 */
static function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
1457 
/**
 * Convert a named entity to the UTF-8 encoding of its character.
 *
 * Names listed in $htmlEntityAliases are first replaced by the name they
 * map to. A name that is not in $htmlEntities is handed back as
 * pseudo-entity source, e.g. "&foo;", using the alias-resolved name.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$canonical = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$canonical] ) ) {
		return '&' . $canonical . ';';
	}

	return codepointToUtf8( self::$htmlEntities[$canonical] );
}
1476 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element Element name, as used for the keys of the array
 *   returned by setupAttributeWhitelist()
 * @return array Attribute names allowed on the element; empty array when
 *   the element has no entry
 */
static function attributeWhitelist( $element ) {
	# $list must be loaded here; leaving it unset makes the isset() below
	# fail for every element, so nothing would ever be whitelisted.
	$list = Sanitizer::setupAttributeWhitelist();

	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1489 
/**
 * Build the per-element attribute whitelist.
 *
 * The result is kept in a function-static variable and rebuilt only when
 * the combination of $wgAllowRdfaAttributes and $wgAllowMicrodataAttributes
 * differs from the one it was built for.
 *
 * @return array Map of element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	static $cached, $cachedFor;
	$settingsKey = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

	if ( isset( $cached ) && $cachedFor == $settingsKey ) {
		return $cached;
	}

	# RDFa attributes as specified in section 9 of
	# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
	$rdfa = $wgAllowRdfaAttributes
		? array( 'about', 'property', 'resource', 'datatype', 'typeof' )
		: array();

	# HTML5 microdata attributes as specified by
	# http://www.whatwg.org/html/microdata.html#the-microdata-model
	$microdata = $wgAllowMicrodataAttributes
		? array( 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' )
		: array();

	# Attributes shared by almost every element: the HTML basics, the
	# WAI-ARIA 'role', then whichever optional sets are enabled.
	$shared = array_merge(
		array( 'id', 'class', 'style', 'lang', 'dir', 'title', 'role' ),
		$rdfa,
		$microdata
	);

	# Reusable combinations
	$blockLevel = array_merge( $shared, array( 'align' ) );
	$withCite = array_merge( $shared, array( 'cite' ) );
	$withCiteAndDate = array_merge( $shared, array( 'cite', 'datetime' ) );
	$withSpan = array_merge( $shared, array( 'span' ) );
	$withWidth = array_merge( $shared, array( 'width' ) );

	$cellAlign = array( 'align', 'valign' );
	$cellOnly = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	);
	$cell = array_merge( $shared, $cellOnly, $cellAlign );

	# Section numbers below refer to the HTML 4.01 standard,
	# see http://www.w3.org/TR/html4/
	$cached = array(
		# 7.5.4
		'div' => $blockLevel,
		'center' => $shared, # deprecated
		'span' => $shared,

		# 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		# 7.5.6 (address) is not listed

		# 8.2.4
		'bdo' => $shared,

		# 9.2.1 (acronym is not listed)
		'em' => $shared,
		'strong' => $shared,
		'cite' => $shared,
		'dfn' => $shared,
		'code' => $shared,
		'samp' => $shared,
		'kbd' => $shared,
		'var' => $shared,
		'abbr' => $shared,

		# 9.2.2
		'blockquote' => $withCite,
		'q' => $withCite,

		# 9.2.3
		'sub' => $shared,
		'sup' => $shared,

		# 9.3.1
		'p' => $blockLevel,

		# 9.3.2
		'br' => array_merge( $shared, array( 'clear' ) ),

		# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
		'wbr' => $shared,

		# 9.3.4
		'pre' => $withWidth,

		# 9.4
		'ins' => $withCiteAndDate,
		'del' => $withCiteAndDate,

		# 10.2
		'ul' => array_merge( $shared, array( 'type' ) ),
		'ol' => array_merge( $shared, array( 'type', 'start' ) ),
		'li' => array_merge( $shared, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $shared,
		'dd' => $shared,
		'dt' => $shared,

		# 11.2.1
		'table' => array_merge( $shared, array(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		) ),

		# 11.2.2
		'caption' => $blockLevel,

		# 11.2.3
		'thead' => $shared,
		'tfoot' => $shared,
		'tbody' => $shared,

		# 11.2.4
		'colgroup' => $withSpan,
		'col' => $withSpan,

		# 11.2.5
		'tr' => array_merge( $shared, array( 'bgcolor' ), $cellAlign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attribute
		# whitelist is used from the Parser object.
		# rel/rev are listed especially for RDFa.
		'a' => array_merge( $shared, array( 'href', 'rel', 'rev' ) ),

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => array_merge( $shared, array( 'alt', 'src', 'width', 'height' ) ),

		# 15.2.1
		'tt' => $shared,
		'b' => $shared,
		'i' => $shared,
		'big' => $shared,
		'small' => $shared,
		'strike' => $shared,
		's' => $shared,
		'u' => $shared,

		# 15.2.2 (basefont is not listed)
		'font' => array_merge( $shared, array( 'size', 'color', 'face' ) ),

		# 15.3
		'hr' => $withWidth,

		# HTML Ruby annotation text module, simple ruby only
		# (rbc and rtc are not listed; rt does not get 'rbspan').
		# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
		'ruby' => $shared,
		'rb' => $shared,
		'rt' => $shared,
		'rp' => $shared,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math' => array( 'class', 'style', 'id', 'title' ),

		# HTML 5 section 4.6
		'bdi' => $shared,

		# HTML5 elements, defined by:
		# http://www.whatwg.org/html/
		'data' => array_merge( $shared, array( 'value' ) ),
		'time' => array_merge( $shared, array( 'datetime' ) ),
		'mark' => $shared,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled, so no conditional is added here to hide them.
		// They are also only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for
		// Microdata), so the shared attributes, which would serve no
		// purpose, are left out.
		'meta' => array( 'itemprop', 'content' ),
		'link' => array( 'itemprop', 'href' ),
	);

	# Remember which settings this copy was built for
	$cachedFor = $settingsKey;

	return $cached;
}
1707 
/**
 * Take a fragment of (potentially invalid) HTML and return a version with
 * any tags removed, character references decoded and whitespace normalized.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Drop everything between '<' and '>', delimiters included
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Then decode &entities and tidy the whitespace
	return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
1728 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for each entry of $htmlEntities,
 * mapping the name to its decimal numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1746 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and strip characters from the host part that are ignored in
 * internationalized domain names.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Decode any HTML entities in the input first; makeExternalLink()
	# escapes the result again.
	$decoded = Sanitizer::decodeCharReferences( $url );

	# Percent-encode control characters and the other characters in this
	# class, some of which the decoding step may have introduced.
	$escaper = array( __CLASS__, 'cleanUrlCallback' );
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', $escaper, $decoded );

	# Split into protocol, optional //host and the remainder. When the
	# string does not have that shape it is returned without host clean-up.
	$parts = array();
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs (RFC 3454 section 3.1).
	// They are removed before further processing so blacklists and such
	// work.
	$strip = "/
		\\s|          # general whitespace
		\xc2\xad|     # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	// @todo FIXME: Validate hostnames here

	return $protocol . preg_replace( $strip, '', $host ) . $rest;
}
1793 
/**
 * preg_replace_callback() handler for cleanUrl(): URL-encode the matched
 * text.
 *
 * @param array $matches Match array; only element 0 is used
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1801 
/**
 * Does a string look like an e-mail address?
 *
 * The 'isValidEmailAddr' hook runs first: when it returns false, the value
 * it stored in $result is returned and the built-in check is skipped.
 * Otherwise the address must consist of a user part, an '@' and one or
 * more dot-separated domain parts, matched case-insensitively against the
 * character sets defined below.
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the built-in check, or whatever the hook
 *   left in $result (null unless the hook changed it)
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a trailing newline, so an
	// address such as "user@example.com\n" would be accepted.
	$html5_email_regexp = "/
		^                         # start of string
		[$rfc5322_atext\\.]+       # user part which is liberal :p
		@                         # 'apostrophe'
		[$rfc1034_ldh_str]+        # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                         # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1853 }
Sanitizer\normalizeAttributeValue
static normalizeAttributeValue( $text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
Definition: Sanitizer.php:1255
$result
The index of the header message $result[1]=The index of the body text message $result[2 through n]=Parameters passed to body text message. Please note the header message cannot receive/use parameters. 'ImportHandleLogItemXMLTag':When parsing a XML tag in a log item. $reader:XMLReader object $logInfo:Array of information Return false to stop further processing of the tag 'ImportHandlePageXMLTag':When parsing a XML tag in a page. $reader:XMLReader object $pageInfo:Array of information Return false to stop further processing of the tag 'ImportHandleRevisionXMLTag':When parsing a XML tag in a page revision. $reader:XMLReader object $pageInfo:Array of page information $revisionInfo:Array of revision information Return false to stop further processing of the tag 'ImportHandleToplevelXMLTag':When parsing a top level XML tag. $reader:XMLReader object Return false to stop further processing of the tag 'ImportHandleUploadXMLTag':When parsing a XML tag in a file upload. $reader:XMLReader object $revisionInfo:Array of information Return false to stop further processing of the tag 'InfoAction':When building information to display on the action=info page. $context:IContextSource object & $pageInfo:Array of information 'InitializeArticleMaybeRedirect':MediaWiki check to see if title is a redirect. $title:Title object for the current page $request:WebRequest $ignoreRedirect:boolean to skip redirect check $target:Title/string of redirect target $article:Article object 'InterwikiLoadPrefix':When resolving if a given prefix is an interwiki or not. Return true without providing an interwiki to continue interwiki search. $prefix:interwiki prefix we are looking for. & $iwData:output array describing the interwiki with keys iw_url, iw_local, iw_trans and optionally iw_api and iw_wikiid. 'InternalParseBeforeSanitize':during Parser 's internalParse method just before the parser removes unwanted/dangerous HTML tags and after nowiki/noinclude/includeonly/onlyinclude and other processings. 
Ideal for syntax-extensions after template/parser function execution which respect nowiki and HTML-comments. & $parser:Parser object & $text:string containing partially parsed text & $stripState:Parser 's internal StripState object 'InternalParseBeforeLinks':during Parser 's internalParse method before links but after nowiki/noinclude/includeonly/onlyinclude and other processings. & $parser:Parser object & $text:string containing partially parsed text & $stripState:Parser 's internal StripState object 'InvalidateEmailComplete':Called after a user 's email has been invalidated successfully. $user:user(object) whose email is being invalidated 'IRCLineURL':When constructing the URL to use in an IRC notification. Callee may modify $url and $query, URL will be constructed as $url . $query & $url:URL to index.php & $query:Query string $rc:RecentChange object that triggered url generation 'IsFileCacheable':Override the result of Article::isFileCacheable()(if true) $article:article(object) being checked 'IsTrustedProxy':Override the result of wfIsTrustedProxy() $ip:IP being check $result:Change this value to override the result of wfIsTrustedProxy() 'IsUploadAllowedFromUrl':Override the result of UploadFromUrl::isAllowedUrl() $url:URL used to upload from & $allowed:Boolean indicating if uploading is allowed for given URL 'isValidEmailAddr':Override the result of User::isValidEmailAddr(), for instance to return false if the domain name doesn 't match your organization. 
$addr:The e-mail address entered by the user & $result:Set this and return false to override the internal checks 'isValidPassword':Override the result of User::isValidPassword() $password:The password entered by the user & $result:Set this and return false to override the internal checks $user:User the password is being validated for 'Language::getMessagesFileName':$code:The language code or the language we 're looking for a messages file for & $file:The messages file path, you can override this to change the location. 'LanguageGetNamespaces':Provide custom ordering for namespaces or remove namespaces. Do not use this hook to add namespaces. Use CanonicalNamespaces for that. & $namespaces:Array of namespaces indexed by their numbers 'LanguageGetMagic':DEPRECATED, use $magicWords in a file listed in $wgExtensionMessagesFiles instead. Use this to define synonyms of magic words depending of the language $magicExtensions:associative array of magic words synonyms $lang:language code(string) 'LanguageGetSpecialPageAliases':DEPRECATED, use $specialPageAliases in a file listed in $wgExtensionMessagesFiles instead. Use to define aliases of special pages names depending of the language $specialPageAliases:associative array of magic words synonyms $lang:language code(string) 'LanguageGetTranslatedLanguageNames':Provide translated language names. & $names:array of language code=> language name $code language of the preferred translations 'LanguageLinks':Manipulate a page 's language links. This is called in various places to allow extensions to define the effective language links for a page. $title:The page 's Title. & $links:Associative array mapping language codes to prefixed links of the form "language:title". & $linkFlags:Associative array mapping prefixed links to arrays of flags. Currently unused, but planned to provide support for marking individual language links in the UI, e.g. for featured articles. 
'LinkBegin':Used when generating internal and interwiki links in Linker::link(), before processing starts. Return false to skip default processing and return $ret. See documentation for Linker::link() for details on the expected meanings of parameters. $skin:the Skin object $target:the Title that the link is pointing to & $html:the contents that the< a > tag should have(raw HTML) $result
Definition: hooks.txt:1528
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1336
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1221
data
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of data
Definition: hooks.txt:6
Sanitizer\attributeWhitelist
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1483
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:607
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
$html
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition: hooks.txt:1530
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:50
is
We use the convention $dbr for read and $dbw for write to help you keep track of whether the database object is a the world will explode Or to be a subsequent write query which succeeded on the master may fail when replicated to the slave due to a unique key collision Replication on the slave will stop and it may take hours to repair the database and get it back online Setting read_only in my cnf on the slave will avoid this but given the dire we prefer to have as many checks as possible We provide a but the wrapper functions like please read the documentation for except in special pages derived from QueryPage It s a common pitfall for new developers to submit code containing SQL queries which examine huge numbers of rows Remember that COUNT * is(N), counting rows in atable is like counting beans in a bucket.------------------------------------------------------------------------ Replication------------------------------------------------------------------------The largest installation of MediaWiki, Wikimedia, uses a large set ofslave MySQL servers replicating writes made to a master MySQL server. Itis important to understand the issues associated with this setup if youwant to write code destined for Wikipedia.It 's often the case that the best algorithm to use for a given taskdepends on whether or not replication is in use. Due to our unabashedWikipedia-centrism, we often just use the replication-friendly version, but if you like, you can use wfGetLB() ->getServerCount() > 1 tocheck to see if replication is in use.===Lag===Lag primarily occurs when large write queries are sent to the master.Writes on the master are executed in parallel, but they are executed inserial when they are replicated to the slaves. The master writes thequery to the binlog when the transaction is committed. The slaves pollthe binlog and start executing the query as soon as it appears. 
They canservice reads while they are performing a write query, but will not readanything more from the binlog and thus will perform no more writes. Thismeans that if the write query runs for a long time, the slaves will lagbehind the master for the time it takes for the write query to complete.Lag can be exacerbated by high read load. MediaWiki 's load balancer willstop sending reads to a slave when it is lagged by more than 30 seconds.If the load ratios are set incorrectly, or if there is too much loadgenerally, this may lead to a slave permanently hovering around 30seconds lag.If all slaves are lagged by more than 30 seconds, MediaWiki will stopwriting to the database. All edits and other write operations will berefused, with an error returned to the user. This gives the slaves achance to catch up. Before we had this mechanism, the slaves wouldregularly lag by several minutes, making review of recent editsdifficult.In addition to this, MediaWiki attempts to ensure that the user seesevents occurring on the wiki in chronological order. A few seconds of lagcan be tolerated, as long as the user sees a consistent picture fromsubsequent requests. This is done by saving the master binlog positionin the session, and then at the start of each request, waiting for theslave to catch up to that position before doing any reads from it. Ifthis wait times out, reads are allowed anyway, but the request isconsidered to be in "lagged slave mode". Lagged slave mode can bechecked by calling wfGetLB() ->getLaggedSlaveMode(). The onlypractical consequence at present is a warning displayed in the pagefooter.===Lag avoidance===To avoid excessive lag, queries which write large numbers of rows shouldbe split up, generally to write one row at a time. Multi-row INSERT ...SELECT queries are the worst offenders should be avoided altogether.Instead do the select first and then the insert.===Working with lag===Despite our best efforts, it 's not practical to guarantee a low-lagenvironment. 
Lag will usually be less than one second, but mayoccasionally be up to 30 seconds. For scalability, it 's very importantto keep load on the master low, so simply sending all your queries tothe master is not the answer. So when you have a genuine need forup-to-date data, the following approach is advised:1) Do a quick query to the master for a sequence number or timestamp 2) Run the full query on the slave and check if it matches the data you gotfrom the master 3) If it doesn 't, run the full query on the masterTo avoid swamping the master every time the slaves lag, use of thisapproach should be kept to a minimum. In most cases you should just readfrom the slave and let the user deal with the delay.------------------------------------------------------------------------ Lock contention------------------------------------------------------------------------Due to the high write rate on Wikipedia(and some other wikis), MediaWiki developers need to be very careful to structure their writesto avoid long-lasting locks. By default, MediaWiki opens a transactionat the first query, and commits it before the output is sent. Locks willbe held from the time when the query is done until the commit. So youcan reduce lock time by doing as much processing as possible before youdo your write queries.Often this approach is not good enough, and it becomes necessary toenclose small groups of queries in their own transaction. Use thefollowing syntax:$dbw=wfGetDB(DB_MASTER
Sanitizer\$htmlEntities
static $htmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
Definition: Sanitizer.php:58
text
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add text
Definition: design.txt:12
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1466
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:807
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
$ret
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1530
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:2387
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1280
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1830
$params
$params
Definition: styleTest.css.php:40
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1353
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1202
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1413
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:325
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1126
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1310
$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the content language as $wgContLang
Definition: design.txt:56
Sanitizer\stripAllTags
static stripAllTags( $text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1718
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:655
title
to move a page</td >< td > &*You are moving the page across *A non empty talk page already exists under the new or *You uncheck the box below In those you will have to move or merge the page manually if desired</td >< td > be sure to &You are responsible for making sure that links continue to point where they are supposed to go Note that the page will &a page at the new title
Definition: All_system_messages.txt:2703
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1738
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:51
codepointToUtf8
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
Definition: UtfNormalUtil.php:36
MWException
MediaWiki exception.
Definition: MWException.php:26
Sanitizer\$htmlEntityAliases
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:317
$out
$out
Definition: UtfNormalGenerate.php:167
wfRestoreWarnings
wfRestoreWarnings()
Restore error level to previous value.
Definition: GlobalFunctions.php:2417
hooks
Using a hook running we can avoid having all this option specific stuff in our mainline code Using hooks
Definition: hooks.txt:73
table
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition: deferred.txt:11
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:1024
directly
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database etc For and for historical it also represents a few features of articles that don t involve their such as access rights See also title txt Article Encapsulates access to the page table of the database The object represents a an and maintains state such as etc Revision Encapsulates individual page revision data and access to the revision text blobs storage system Higher level code should never touch text storage directly
Definition: design.txt:34
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1003
Sanitizer\armorLinksCallback
static armorLinksCallback( $matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:1154
wfRunHooks
wfRunHooks( $event, array $args=array(), $deprecatedVersion=null)
Call hook functions defined in $wgHooks.
Definition: GlobalFunctions.php:4001
Sanitizer\validateAttributes
static validateAttributes( $attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whitelist.
Definition: Sanitizer.php:711
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
simple
and how to run hooks for an and one after Each event has a preferably in CamelCase For ArticleDelete hook A clump of code and data that should be run when an event happens This can be either a function and a chunk of or an object and a method hook function The function part of a third party developers and administrators to define code that will be run at certain points in the mainline and to modify the data run by that mainline code Hooks can keep mainline code simple
Definition: hooks.txt:23
list
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1366
Sanitizer\escapeId
static escapeId( $id, $options=array())
Given a value, escape it so that it can be used in an id attribute and return it.
Definition: Sanitizer.php:1082
will
</td >< td > &</td >< td > t want your writing to be edited mercilessly and redistributed at will
Definition: All_system_messages.txt:914
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1380
$options
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition: hooks.txt:1530
$section
$section
Definition: Utf8Test.php:88
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:695
root
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as root
Definition: distributors.txt:39
$name
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:336
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
$value
$value
Definition: styleTest.css.php:45
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given element type.
Definition: Sanitizer.php:691
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1751
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:947
tags
pre inside other HTML tags(bug 54946) !! wikitext a< div >< pre > foo</pre ></div >< pre ></pre > !! html< p >a</p >< div >< pre > foo</pre ></div >< pre ></pre > !! end !! test HTML pre followed by indent-pre !! wikitext< pre >foo</pre > bar !! html< pre >foo</pre >< pre >bar</pre > !! end !!test Block tag pre !!options parsoid !! wikitext< p >< pre >foo</pre ></p > !! html< p data-parsoid
UTF8_REPLACEMENT
const UTF8_REPLACEMENT
Definition: UtfNormalDefines.php:64
only
published in in Madrid In the first edition of the Vocabolario for was published In in Rotterdam was the Dictionnaire Universel ! html< p > The first monolingual dictionary written in a Romance language was< i > Sebastián Covarrubias</i >< i > Tesoro de la lengua castellana o published in in Madrid In the first edition of the< i > Vocabolario dell< a href="/index.php?title=Accademia_della_Crusca&amp;action=edit&amp;redlink=1" class="new" title="Accademia della Crusca (page does not exist)"> Accademia della Crusca</a ></i > for was published In in Rotterdam was the< i > Dictionnaire Universel</i ></p > ! end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html php< p >< i > foo</i ></p > ! html parsoid< p >< i > foo</i >< b ></b ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html php< p >< b > foo</b ></p > ! html parsoid< p >< b > foo</b >< i ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i > foo</i ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html< p >< b > foo</b ></p > !end ! test Italics and ! wikitext foo ! html php< p >< b > foo</b ></p > ! html parsoid< p >< b > foo</b >< i ></i ></p > !end ! test Italics and ! options ! wikitext foo ! html< p >< b >< i > foo</i ></b ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo ! html< p >< i >< b > foo</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! 
html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html< p >< i > foo< b > bar</b ></i ></p > !end ! test Italics and ! wikitext foo bar ! html php< p >< b > foo</b > bar</p > ! html parsoid< p >< b > foo</b > bar< i ></i ></p > !end ! test Italics and ! wikitext foo bar ! html php< p >< b > foo</b > bar</p > ! html parsoid< p >< b > foo</b > bar< b ></b ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< i > this is about< b > foo s family</b ></i ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< i > this is about< b > foo s</b > family</i ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< b > this is about< i > foo</i ></b >< i > s family</i ></p > !end ! test Italics and ! options ! wikitext this is about foo s family ! html< p >< i > this is about</i > foo< b > s family</b ></p > !end ! test Italics and ! wikitext this is about foo s family ! html< p >< b > this is about< i > foo s</i > family</b ></p > !end ! test Italicized possessive ! wikitext The s talk page ! html< p > The< i >< a href="/wiki/Main_Page" title="Main Page"> Main Page</a ></i > s talk page</p > ! end ! test Parsoid only
Definition: parserTests.txt:396
$count
$count
Definition: UtfNormalTest2.php:96
it
=Architecture==Two class hierarchies are used to provide the functionality associated with the different content models:*Content interface(and AbstractContent base class) define functionality that acts on the concrete content of a page, and *ContentHandler base class provides functionality specific to a content model, but not acting on concrete content. The most important function of ContentHandler is to act as a factory for the appropriate implementation of Content. These Content objects are to be used by MediaWiki everywhere, instead of passing page content around as text. All manipulation and analysis of page content must be done via the appropriate methods of the Content object. For each content model, a subclass of ContentHandler has to be registered with $wgContentHandlers. The ContentHandler object for a given content model can be obtained using ContentHandler::getForModelID($id). Also Title, WikiPage and Revision now have getContentHandler() methods for convenience. ContentHandler objects are singletons that provide functionality specific to the content type, but not directly acting on the content of some page. ContentHandler::makeEmptyContent() and ContentHandler::unserializeContent() can be used to create a Content object of the appropriate type. However, it is recommended to instead use WikiPage::getContent() resp. Revision::getContent() to get a page 's content as a Content object. These two methods should be the ONLY way in which page content is accessed. Another important function of ContentHandler objects is to define custom action handlers for a content model, see ContentHandler::getActionOverrides(). This is similar to what WikiPage::getActionOverrides() was already doing.==Serialization==With the ContentHandler facility, page content no longer has to be text based. Objects implementing the Content interface are used to represent and handle the content internally. 
For storage and data exchange, each content model supports at least one serialization format via ContentHandler::serializeContent($content). The list of supported formats for a given content model can be accessed using ContentHandler::getSupportedFormats(). Content serialization formats are identified using MIME type like strings. The following formats are built in:*text/x-wiki - wikitext *text/javascript - for js pages *text/css - for css pages *text/plain - for future use, e.g. with plain text messages. *text/html - for future use, e.g. with plain html messages. *application/vnd.php.serialized - for future use with the api and for extensions *application/json - for future use with the api, and for use by extensions *application/xml - for future use with the api, and for use by extensions In PHP, use the corresponding CONTENT_FORMAT_XXX constant. Note that when using the API to access page content, especially action=edit, action=parse and action=query &prop=revisions, the model and format of the content should always be handled explicitly. Without that information, interpretation of the provided content is not reliable. The same applies to XML dumps generated via maintenance/dumpBackup.php or Special:Export. Also note that the API will provide encapsulated, serialized content - so if the API was called with format=json, and contentformat is also json(or rather, application/json), the page content is represented as a string containing an escaped json structure. Extensions that use JSON to serialize some types of page content may provide specialized API modules that allow access to that content in a more natural form.==Compatibility==The ContentHandler facility is introduced in a way that should allow all existing code to keep functioning at least for pages that contain wikitext or other text based content. 
However, a number of functions and hooks have been deprecated in favor of new versions that are aware of the page's content model, and will now generate warnings when used. Most importantly, the following functions have been deprecated:*Revisions::getText() and Revisions::getRawText() are deprecated in favor of Revisions::getContent() *WikiPage::getText() is deprecated in favor of WikiPage::getContent() Also, the old Article::getContent()(which returns text) is superseded by Article::getContentObject(). However, both methods should be avoided since they do not provide clean access to the page's actual content. For instance, they may return a system message for non-existing pages. Use WikiPage::getContent() instead. Code that relies on a textual representation of the page content should eventually be rewritten. However, ContentHandler::getContentText() provides a stop-gap that can be used to get text for a page. Its behavior is controlled by $wgContentHandlerTextFallback it
Definition: contenthandler.txt:107
$args
if( $line===false) $args
Definition: cdb.php:62
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML, discarding unwanted attributes.
Definition: Sanitizer.php:987
in
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
used
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferences() and Sanitizer::decodeCharReferences().
Definition: Sanitizer.php:36
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1265
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
such
it sets a lot of them automatically from query and such
Definition: design.txt:93
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
Definition: Sanitizer.php:1450
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject ).
Definition: StringUtils.php:256
utf8ToCodepoint
utf8ToCodepoint( $char)
Determine the Unicode codepoint of a single-character UTF-8 sequence.
Definition: UtfNormalUtil.php:94
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1299
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1166
from
Please log in again after you receive it</td >< td > s a saved copy from
Definition: All_system_messages.txt:3297
that
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
Definition: deferred.txt:11
Sanitizer\setupAttributeWhitelist
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1495
$t
$t
Definition: testCompression.php:65
$vars
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition: hooks.txt:1679
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1396
$attribs
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1530
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:31
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:838
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:332
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1141
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1431
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1798
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=array(), $extratags=array(), $removetags=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:366