MediaWiki REL1_31
SanitizerTest.php
Go to the documentation of this file.
1<?php
2
/**
 * Unit tests for the static helpers of the Sanitizer class: character
 * reference decoding, HTML tag and attribute filtering, CSS checking and
 * ID / fragment escaping.
 *
 * Annotations are used for data providers and expected exceptions because
 * they work across the PHPUnit versions this branch may run on.
 */
class SanitizerTest extends MediaWikiTestCase {

	/**
	 * Reset shared state after every test.
	 *
	 * Several tests replace the MWTidy driver via MWTidy::setInstance(); the
	 * singleton is destroyed here so that the override cannot leak into
	 * tests that run afterwards.
	 */
	protected function tearDown() {
		MWTidy::destroySingleton();
		parent::tearDown();
	}

	/**
	 * A named entity is decoded to its UTF-8 byte sequence.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeNamedEntities() {
		$this->assertEquals(
			"\xc3\xa9cole",
			Sanitizer::decodeCharReferences( '&eacute;cole' ),
			'decode named entities'
		);
	}

	/**
	 * Hexadecimal and decimal numeric references are both decoded.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeNumericEntities() {
		$this->assertEquals(
			"\xc4\x88io bonas dans l'\xc3\xa9cole!",
			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&#233;cole!" ),
			'decode numeric entities'
		);
	}

	/**
	 * Numeric and named references may be mixed in one string.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeMixedEntities() {
		$this->assertEquals(
			"\xc4\x88io bonas dans l'\xc3\xa9cole!",
			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&eacute;cole!" ),
			'decode mixed numeric/named entities'
		);
	}

	/**
	 * Decoding is a single pass: an escaped ampersand followed by what looks
	 * like a reference yields the literal reference text, not its character.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeMixedComplexEntities() {
		$this->assertEquals(
			"\xc4\x88io bonas dans l'\xc3\xa9cole! (mais pas &#x108;io dans l'&eacute;cole)",
			Sanitizer::decodeCharReferences(
				"&#x108;io bonas dans l'&eacute;cole! (mais pas &amp;#x108;io dans l'&#38;eacute;cole)"
			),
			'decode mixed complex entities'
		);
	}

	/**
	 * A bare ampersand that starts no reference is left untouched.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testInvalidAmpersand() {
		$this->assertEquals(
			'a & b',
			Sanitizer::decodeCharReferences( 'a & b' ),
			'Invalid ampersand'
		);
	}

	/**
	 * An unknown named entity is passed through unchanged.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testInvalidEntities() {
		$this->assertEquals(
			'&foo;',
			Sanitizer::decodeCharReferences( '&foo;' ),
			'Invalid named entity'
		);
	}

	/**
	 * An out-of-range numeric reference becomes the UTF-8 replacement character.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testInvalidNumberedEntities() {
		$this->assertEquals(
			UtfNormal\Constants::UTF8_REPLACEMENT,
			Sanitizer::decodeCharReferences( "&#88888888888888;" ),
			'Invalid numbered entity'
		);
	}

	/**
	 * HTML5 tags are either kept (and auto-closed) or escaped.
	 *
	 * @dataProvider provideHtml5Tags
	 * @covers Sanitizer::removeHTMLtags
	 *
	 * @param string $tag Tag name without angle brackets
	 * @param bool $escaped Whether the tag is expected to be escaped
	 */
	public function testRemovehtmltagsOnHtml5Tags( $tag, $escaped ) {
		MWTidy::setInstance( false );

		$expected = $escaped
			? "&lt;$tag&gt;"
			: "<$tag></$tag>\n";

		$this->assertEquals( $expected, Sanitizer::removeHTMLtags( "<$tag>" ) );
	}

	/**
	 * Provider for testRemovehtmltagsOnHtml5Tags().
	 *
	 * @return array[] Rows of [ tag name, whether it must be escaped ]
	 */
	public static function provideHtml5Tags() {
		$ESCAPED = true; # We want tag to be escaped
		$VERBATIM = false; # We want to keep the tag
		return [
			[ 'data', $VERBATIM ],
			[ 'mark', $VERBATIM ],
			[ 'time', $VERBATIM ],
			[ 'video', $ESCAPED ],
		];
	}

	/**
	 * Provider for testRemoveHTMLtags().
	 *
	 * @return array[] Rows of [ input, expected output, assertion message ]
	 */
	public static function dataRemoveHTMLtags() {
		return [
			// former testSelfClosingTag
			[
				'<div>Hello world</div />',
				'<div>Hello world</div>',
				'Self-closing closing div'
			],
			// Make sure special nested HTML5 semantics are not broken
			// https://html.spec.whatwg.org/multipage/semantics.html#the-kbd-element
			[
				'<kbd><kbd>Shift</kbd>+<kbd>F3</kbd></kbd>',
				'<kbd><kbd>Shift</kbd>+<kbd>F3</kbd></kbd>',
				'Nested <kbd>.'
			],
			// https://html.spec.whatwg.org/multipage/semantics.html#the-sub-and-sup-elements
			[
				'<var>x<sub><var>i</var></sub></var>, <var>y<sub><var>i</var></sub></var>',
				'<var>x<sub><var>i</var></sub></var>, <var>y<sub><var>i</var></sub></var>',
				'Nested <var>.'
			],
			// https://html.spec.whatwg.org/multipage/semantics.html#the-dfn-element
			[
				'<dfn><abbr title="Garage Door Opener">GDO</abbr></dfn>',
				'<dfn><abbr title="Garage Door Opener">GDO</abbr></dfn>',
				'<abbr> inside <dfn>',
			],
		];
	}

	/**
	 * @dataProvider dataRemoveHTMLtags
	 * @covers Sanitizer::removeHTMLtags
	 *
	 * @param string $input HTML handed to the sanitizer
	 * @param string $output Expected sanitized HTML
	 * @param string|null $msg Assertion message
	 */
	public function testRemoveHTMLtags( $input, $output, $msg = null ) {
		MWTidy::setInstance( false );
		$this->assertEquals( $output, Sanitizer::removeHTMLtags( $input ), $msg );
	}

	/**
	 * @dataProvider provideTagAttributesToDecode
	 * @covers Sanitizer::decodeTagAttributes
	 *
	 * @param array $expected Expected attribute name => value map
	 * @param string $attributes Raw attribute string
	 * @param string $message Assertion message
	 */
	public function testDecodeTagAttributes( $expected, $attributes, $message = '' ) {
		$this->assertEquals( $expected,
			Sanitizer::decodeTagAttributes( $attributes ),
			$message
		);
	}

	/**
	 * Provider for testDecodeTagAttributes().
	 *
	 * @return array[] Rows of [ expected map, raw attribute string, message ]
	 */
	public static function provideTagAttributesToDecode() {
		return [
			[ [ 'foo' => 'bar' ], 'foo=bar', 'Unquoted attribute' ],
			[ [ 'עברית' => 'bar' ], 'עברית=bar', 'Non-Latin attribute' ],
			[ [ '६' => 'bar' ], '६=bar', 'Devanagari number' ],
			[ [ '搭𨋢' => 'bar' ], '搭𨋢=bar', 'Non-BMP character' ],
			// "n" followed by U+0301 COMBINING ACUTE ACCENT, spelled out as bytes so
			// that no editor or tool can silently normalize it to a precomposed letter.
			[ [], "n\xCC\x81gh=bar", 'Combining accent is not allowed' ],
			[ [ 'foo' => 'bar' ], ' foo = bar ', 'Spaced attribute' ],
			[ [ 'foo' => 'bar' ], 'foo="bar"', 'Double-quoted attribute' ],
			[ [ 'foo' => 'bar' ], 'foo=\'bar\'', 'Single-quoted attribute' ],
			// NOTE(review): the next three datasets are identical as written;
			// presumably they were meant to differ in the whitespace that
			// separates the two attributes - confirm against the canonical file.
			[
				[ 'foo' => 'bar', 'baz' => 'foo' ],
				'foo=\'bar\' baz="foo"',
				'Several attributes'
			],
			[
				[ 'foo' => 'bar', 'baz' => 'foo' ],
				'foo=\'bar\' baz="foo"',
				'Several attributes'
			],
			[
				[ 'foo' => 'bar', 'baz' => 'foo' ],
				'foo=\'bar\' baz="foo"',
				'Several attributes'
			],
			[ [ ':foo' => 'bar' ], ':foo=\'bar\'', 'Leading :' ],
			[ [ '_foo' => 'bar' ], '_foo=\'bar\'', 'Leading _' ],
			[ [ 'foo' => 'bar' ], 'Foo=\'bar\'', 'Leading capital' ],
			[ [ 'foo' => 'BAR' ], 'FOO=BAR', 'Attribute keys are normalized to lowercase' ],

			# Invalid beginning
			[ [], '-foo=bar', 'Leading - is forbidden' ],
			[ [], '.foo=bar', 'Leading . is forbidden' ],
			[ [ 'foo-bar' => 'bar' ], 'foo-bar=bar', 'A - is allowed inside the attribute' ],
			[ [ 'foo-' => 'bar' ], 'foo-=bar', 'A - is allowed inside the attribute' ],
			[ [ 'foo.bar' => 'baz' ], 'foo.bar=baz', 'A . is allowed inside the attribute' ],
			[ [ 'foo.' => 'baz' ], 'foo.=baz', 'A . is allowed as last character' ],
			[ [ 'foo6' => 'baz' ], 'foo6=baz', 'Numbers are allowed' ],

			# This bit is more relaxed than XML rules, but some extensions use
			# it, like ProofreadPage (see T29539)
			[ [ '1foo' => 'baz' ], '1foo=baz', 'Leading numbers are allowed' ],
			[ [], 'foo$=baz', 'Symbols are not allowed' ],
			[ [], 'foo@=baz', 'Symbols are not allowed' ],
			[ [], 'foo~=baz', 'Symbols are not allowed' ],
			[
				[ 'foo' => '1[#^`*%w/(' ],
				'foo=1[#^`*%w/(',
				'All kind of characters are allowed as values'
			],
			[
				[ 'foo' => '1[#^`*%\'w/(' ],
				'foo="1[#^`*%\'w/("',
				'Double quotes are allowed if quoted by single quotes'
			],
			[
				[ 'foo' => '1[#^`*%"w/(' ],
				'foo=\'1[#^`*%"w/(\'',
				'Single quotes are allowed if quoted by double quotes'
			],
			[ [ 'foo' => '&"' ], 'foo=&amp;&quot;', 'Special chars can be provided as entities' ],
			[ [ 'foo' => '&foobar;' ], 'foo=&foobar;', 'Entity-like items are accepted' ],
		];
	}

	/**
	 * Deprecated presentational attributes must come back unchanged
	 * (prefixed by a single space).
	 *
	 * @dataProvider provideDeprecatedAttributes
	 * @covers Sanitizer::fixTagAttributes
	 *
	 * @param string $inputAttr Raw attribute text
	 * @param string $inputEl Element name the attribute belongs to
	 * @param string $message Assertion message
	 */
	public function testDeprecatedAttributesUnaltered( $inputAttr, $inputEl, $message = '' ) {
		$this->assertEquals( " $inputAttr",
			Sanitizer::fixTagAttributes( $inputAttr, $inputEl ),
			$message
		);
	}

	/**
	 * Provider for testDeprecatedAttributesUnaltered().
	 *
	 * @return array[] Rows of [ attribute text, element name ]
	 */
	public static function provideDeprecatedAttributes() {
		return [
			[ 'clear="left"', 'br' ],
			[ 'clear="all"', 'br' ],
			[ 'width="100"', 'td' ],
			[ 'nowrap="true"', 'td' ],
			[ 'nowrap=""', 'td' ],
			[ 'align="right"', 'td' ],
			[ 'align="center"', 'table' ],
			[ 'align="left"', 'tr' ],
			[ 'align="center"', 'div' ],
			[ 'align="left"', 'h1' ],
			[ 'align="left"', 'p' ],
		];
	}

	/**
	 * @dataProvider provideCssCommentsFixtures
	 * @covers Sanitizer::checkCss
	 *
	 * @param string $expected Expected result of the CSS check
	 * @param string $css CSS handed to the sanitizer
	 * @param string $message Assertion message
	 */
	public function testCssCommentsChecking( $expected, $css, $message = '' ) {
		$this->assertEquals( $expected,
			Sanitizer::checkCss( $css ),
			$message
		);
	}

	/**
	 * Provider for testCssCommentsChecking().
	 *
	 * @return array[] Rows of [ expected output, input CSS, optional message ]
	 */
	public static function provideCssCommentsFixtures() {
		return [
			// Valid comments spanning entire input
			[ '/**/', '/**/' ],
			[ '/* comment */', '/* comment */' ],
			// Weird stuff
			[ ' ', '/****/' ],
			[ ' ', '/* /* */' ],
			[ 'display: block;', "display:/* foo */block;" ],
			[ 'display: block;', "display:\\2f\\2a foo \\2a\\2f block;",
				'Backslash-escaped comments must be stripped (T30450)' ],
			[ '', '/* unfinished comment structure',
				'Remove anything after a comment-start token' ],
			[ '', "\\2f\\2a unifinished comment'",
				'Remove anything after a backslash-escaped comment-start token' ],
			[
				'/* insecure input */',
				'filter: progid:DXImageTransform.Microsoft.AlphaImageLoader'
					. '(src=\'asdf.png\',sizingMethod=\'scale\');'
			],
			[
				'/* insecure input */',
				'-ms-filter: "progid:DXImageTransform.Microsoft.AlphaImageLoader'
					. '(src=\'asdf.png\',sizingMethod=\'scale\')";'
			],
			[ '/* insecure input */', 'width: expression(1+1);' ],
			[ '/* insecure input */', 'background-image: image(asdf.png);' ],
			[ '/* insecure input */', 'background-image: -webkit-image(asdf.png);' ],
			[ '/* insecure input */', 'background-image: -moz-image(asdf.png);' ],
			[ '/* insecure input */', 'background-image: image-set("asdf.png" 1x, "asdf.png" 2x);' ],
			[
				'/* insecure input */',
				'background-image: -webkit-image-set("asdf.png" 1x, "asdf.png" 2x);'
			],
			[
				'/* insecure input */',
				'background-image: -moz-image-set("asdf.png" 1x, "asdf.png" 2x);'
			],
			[ '/* insecure input */', 'foo: attr( title, url );' ],
			[ '/* insecure input */', 'foo: attr( title url );' ],
			[ '/* insecure input */', 'foo: var(--evil-attribute)' ],
		];
	}

	/**
	 * @dataProvider provideEscapeHtmlAllowEntities
	 * @covers Sanitizer::escapeHtmlAllowEntities
	 *
	 * @param string $expected Expected escaped text
	 * @param string $html Input text
	 */
	public function testEscapeHtmlAllowEntities( $expected, $html ) {
		$this->assertEquals(
			$expected,
			Sanitizer::escapeHtmlAllowEntities( $html )
		);
	}

	/**
	 * Provider for testEscapeHtmlAllowEntities().
	 *
	 * @return array[] Rows of [ expected output, input ]
	 */
	public static function provideEscapeHtmlAllowEntities() {
		return [
			[ 'foo', 'foo' ],
			[ 'a¡b', 'a&#161;b' ],
			[ 'foo&#039;bar', "foo'bar" ],
			[ '&lt;script&gt;foo&lt;/script&gt;', '<script>foo</script>' ],
		];
	}

	/**
	 * Exercises Sanitizer::escapeId() with the 'noninitial' and 'legacy' options.
	 *
	 * @dataProvider provideEscapeId
	 * @covers Sanitizer::escapeId
	 *
	 * @param string $input Raw ID
	 * @param string $output Expected escaped ID
	 */
	public function testEscapeId( $input, $output ) {
		$this->assertEquals(
			$output,
			Sanitizer::escapeId( $input, [ 'noninitial', 'legacy' ] )
		);
	}

	/**
	 * Provider for testEscapeId().
	 *
	 * @return array[] Rows of [ raw ID, expected escaped ID ]
	 */
	public static function provideEscapeId() {
		return [
			[ '+', '.2B' ],
			[ '&', '.26' ],
			[ '=', '.3D' ],
			[ ':', ':' ],
			[ ';', '.3B' ],
			[ '@', '.40' ],
			[ '$', '.24' ],
			[ '-_.', '-_.' ],
			[ '!', '.21' ],
			[ '*', '.2A' ],
			[ '/', '.2F' ],
			[ '[]', '.5B.5D' ],
			[ '<>', '.3C.3E' ],
			[ '\'', '.27' ],
			[ '§', '.C2.A7' ],
			[ 'Test:A & B/Here', 'Test:A_.26_B.2FHere' ],
			[ 'A&B&amp;C&amp;amp;D&amp;amp;amp;E', 'A.26B.26amp.3BC.26amp.3Bamp.3BD.26amp.3Bamp.3Bamp.3BE' ],
		];
	}

	/**
	 * Escaping a space-separated reference list must give the same result as
	 * escaping each ID individually and joining the results with a space.
	 *
	 * @dataProvider provideEscapeIdReferenceList
	 * @covers Sanitizer::escapeIdReferenceList
	 *
	 * @param string $referenceList Space-separated list of two IDs
	 * @param string $id1 First ID of the list
	 * @param string $id2 Second ID of the list
	 */
	public function testEscapeIdReferenceList( $referenceList, $id1, $id2 ) {
		// Expected value goes first so that failure output labels both sides correctly.
		$expected = Sanitizer::escapeIdForAttribute( $id1 )
			. ' '
			. Sanitizer::escapeIdForAttribute( $id2 );

		$this->assertEquals(
			$expected,
			Sanitizer::escapeIdReferenceList( $referenceList )
		);
	}

	/**
	 * Provider for testEscapeIdReferenceList().
	 *
	 * @return array[] Rows of [ reference list, first ID, second ID ]
	 */
	public static function provideEscapeIdReferenceList() {
		return [
			[ 'foo bar', 'foo', 'bar' ],
			[ '#1 #2', '#1', '#2' ],
			[ '+1 +2', '+1', '+2' ],
		];
	}

	/**
	 * @dataProvider provideIsReservedDataAttribute
	 * @covers Sanitizer::isReservedDataAttribute
	 *
	 * @param string $attr Attribute name
	 * @param bool $expected Whether the attribute is expected to be reserved
	 */
	public function testIsReservedDataAttribute( $attr, $expected ) {
		$this->assertSame( $expected, Sanitizer::isReservedDataAttribute( $attr ) );
	}

	/**
	 * Provider for testIsReservedDataAttribute().
	 *
	 * @return array[] Rows of [ attribute name, expected result ]
	 */
	public static function provideIsReservedDataAttribute() {
		return [
			[ 'foo', false ],
			[ 'data', false ],
			[ 'data-foo', false ],
			[ 'data-mw', true ],
			[ 'data-ooui', true ],
			[ 'data-parsoid', true ],
			[ 'data-mw-foo', true ],
			[ 'data-ooui-foo', true ],
			[ 'data-mwfoo', true ], // could be false but this is how it's implemented currently
		];
	}

	/**
	 * Runs one of the Sanitizer::escapeIdFor*() methods under a given
	 * fragment-mode configuration.
	 *
	 * @dataProvider provideEscapeIdForStuff
	 * @covers Sanitizer::escapeIdForAttribute
	 * @covers Sanitizer::escapeIdForLink
	 * @covers Sanitizer::escapeIdForExternalInterwiki
	 *
	 * @param string $stuff Method name suffix: Attribute, Link or ExternalInterwiki
	 * @param string[] $config Values for $wgFragmentMode, followed by one final
	 *   element holding the value for $wgExternalInterwikiFragmentMode
	 * @param string $id ID to escape
	 * @param string|bool $expected Expected return value
	 * @param int|null $mode Sanitizer::ID_* constant, only meaningful for Attribute
	 */
	public function testEscapeIdForStuff( $stuff, array $config, $id, $expected, $mode = null ) {
		// The last entry is the interwiki flavour; whatever remains is $wgFragmentMode.
		$iwFlavor = array_pop( $config );
		$this->setMwGlobals( [
			'wgFragmentMode' => $config,
			'wgExternalInterwikiFragmentMode' => $iwFlavor,
		] );

		$escaped = call_user_func( "Sanitizer::escapeIdFor{$stuff}", $id, $mode );

		// Strict comparison: some datasets expect boolean false, which a loose
		// comparison would not distinguish from an empty string or null.
		$this->assertSame( $expected, $escaped );
	}

	/**
	 * Provider for testEscapeIdForStuff().
	 *
	 * @return array[] Rows of [ method suffix, config, input, expected, optional mode ]
	 */
	public static function provideEscapeIdForStuff() {
		// Test inputs and outputs
		$text = 'foo тест_#%!\'()[]:<>&&amp;&amp;amp;';
		$legacyEncoded = 'foo_.D1.82.D0.B5.D1.81.D1.82_.23.25.21.27.28.29.5B.5D:.3C.3E' .
			'.26.26amp.3B.26amp.3Bamp.3B';
		$html5Encoded = 'foo_тест_#%!\'()[]:<>&&amp;&amp;amp;';
		$html5Experimental = 'foo_тест_!_()[]:<>_amp;_amp;amp;';

		// Settings: last element is $wgExternalInterwikiFragmentMode, the rest is $wgFragmentMode
		$legacy = [ 'legacy', 'legacy' ];
		$legacyNew = [ 'legacy', 'html5', 'legacy' ];
		$newLegacy = [ 'html5', 'legacy', 'legacy' ];
		$new = [ 'html5', 'legacy' ];
		$allNew = [ 'html5', 'html5' ];
		$experimentalLegacy = [ 'html5-legacy', 'legacy', 'legacy' ];
		$newExperimental = [ 'html5', 'html5-legacy', 'legacy' ];

		return [
			// Pure legacy: how MW worked before 2017
			[ 'Attribute', $legacy, $text, $legacyEncoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $legacy, $text, false, Sanitizer::ID_FALLBACK ],
			[ 'Link', $legacy, $text, $legacyEncoded ],
			[ 'ExternalInterwiki', $legacy, $text, $legacyEncoded ],

			// Transition to a new world: legacy links with HTML5 fallback
			[ 'Attribute', $legacyNew, $text, $legacyEncoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $legacyNew, $text, $html5Encoded, Sanitizer::ID_FALLBACK ],
			[ 'Link', $legacyNew, $text, $legacyEncoded ],
			[ 'ExternalInterwiki', $legacyNew, $text, $legacyEncoded ],

			// New world: HTML5 links, legacy fallbacks
			[ 'Attribute', $newLegacy, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $newLegacy, $text, $legacyEncoded, Sanitizer::ID_FALLBACK ],
			[ 'Link', $newLegacy, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $newLegacy, $text, $legacyEncoded ],

			// Distant future: no legacy fallbacks, but still linking to legacy wikis
			[ 'Attribute', $new, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $new, $text, false, Sanitizer::ID_FALLBACK ],
			[ 'Link', $new, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $new, $text, $legacyEncoded ],

			// Just before the heat death of universe: external interwikis are also HTML5 \m/
			[ 'Attribute', $allNew, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $allNew, $text, false, Sanitizer::ID_FALLBACK ],
			[ 'Link', $allNew, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $allNew, $text, $html5Encoded ],

			// Someone flipped $wgExperimentalHtmlIds on
			[ 'Attribute', $experimentalLegacy, $text, $html5Experimental, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $experimentalLegacy, $text, $legacyEncoded, Sanitizer::ID_FALLBACK ],
			[ 'Link', $experimentalLegacy, $text, $html5Experimental ],
			[ 'ExternalInterwiki', $experimentalLegacy, $text, $legacyEncoded ],

			// Migration from $wgExperimentalHtmlIds to modern HTML5
			[ 'Attribute', $newExperimental, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $newExperimental, $text, $html5Experimental, Sanitizer::ID_FALLBACK ],
			[ 'Link', $newExperimental, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $newExperimental, $text, $legacyEncoded ],
		];
	}

	/**
	 * @dataProvider provideStripAllTags
	 * @covers Sanitizer::stripAllTags
	 *
	 * @param string $input HTML input
	 * @param string $expected Expected plain text
	 */
	public function testStripAllTags( $input, $expected ) {
		$this->assertEquals( $expected, Sanitizer::stripAllTags( $input ) );
	}

	/**
	 * Provider for testStripAllTags().
	 *
	 * @return array[] Rows of [ HTML input, expected plain text ]
	 */
	public static function provideStripAllTags() {
		return [
			[ '<p>Foo</p>', 'Foo' ],
			[ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
			[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
			[ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world café' ],
			[
				'<p><small data-foo=\'bar"&lt;baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
				'Bar Whee!'
			],
			[ '1<span class="<?php">2</span>3', '123' ],
			[ '1<span class="<?">2</span>3', '123' ],
		];
	}

	/**
	 * An unknown fragment mode name must be rejected with an exception.
	 *
	 * NOTE(review): the exception class is taken from the upstream Sanitizer
	 * implementation, which is not visible from this file - confirm.
	 *
	 * @expectedException InvalidArgumentException
	 * @covers Sanitizer::escapeIdForAttribute
	 */
	public function testInvalidFragmentThrows() {
		$this->setMwGlobals( 'wgFragmentMode', [ 'boom!' ] );
		Sanitizer::escapeIdForAttribute( 'This should throw' );
	}

	/**
	 * A $wgFragmentMode without a primary entry (index 0) must be rejected
	 * when escaping an attribute ID.
	 *
	 * NOTE(review): the exception class is taken from the upstream Sanitizer
	 * implementation, which is not visible from this file - confirm.
	 *
	 * @expectedException UnexpectedValueException
	 * @covers Sanitizer::escapeIdForAttribute
	 */
	public function testNoPrimaryFragmentModeThrows() {
		$this->setMwGlobals( 'wgFragmentMode', [ 666 => 'html5' ] );
		Sanitizer::escapeIdForAttribute( 'This should throw' );
	}

	/**
	 * A $wgFragmentMode without a primary entry (index 0) must be rejected
	 * when escaping a link fragment.
	 *
	 * NOTE(review): the exception class is taken from the upstream Sanitizer
	 * implementation, which is not visible from this file - confirm.
	 *
	 * @expectedException UnexpectedValueException
	 * @covers Sanitizer::escapeIdForLink
	 */
	public function testNoPrimaryFragmentModeThrows2() {
		$this->setMwGlobals( 'wgFragmentMode', [ 666 => 'html5' ] );
		Sanitizer::escapeIdForLink( 'This should throw' );
	}
}
static setInstance( $instance)
Set the driver to be used.
Definition MWTidy.php:135
static destroySingleton()
Destroy the current singleton instance.
Definition MWTidy.php:142
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition Sanitizer.php:72
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition Sanitizer.php:64
Unicode normalization routines for working with UTF-8 strings.
Definition UtfNormal.php:48
static configuration should be added through ResourceLoaderGetConfigVars instead can be used to get the real title after the basic globals have been set but before ordinary actions take place $output
Definition hooks.txt:2255
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return true
Definition hooks.txt:2006
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition hooks.txt:2013
processing should stop and the error should be shown to the user * false
Definition hooks.txt:187
if(is_array($mode)) switch( $mode) $input