MediaWiki  1.23.8
CleanUpTest.php
Go to the documentation of this file.
1 <?php
/**
 * Pure ASCII text must pass through UtfNormal::cleanUp() unchanged.
 */
public function testAscii() {
	$plain = 'This is plain ASCII text.';
	$cleaned = UtfNormal::cleanUp( $plain );

	$this->assertEquals( $plain, $cleaned );
}
47 
/**
 * A NUL byte inside the text is expected to come back as the bytes
 * EF BF BD (U+FFFD). The comparison is made on hex dumps of both strings.
 */
public function testNull() {
	$cleaned = UtfNormal::cleanUp( "a \x00 null" );

	$this->assertEquals(
		bin2hex( "a \xef\xbf\xbd null" ),
		bin2hex( $cleaned )
	);
}
56 
/**
 * Text containing the precomposed character U+00E9 (bytes C3 A9) must be
 * returned unchanged by UtfNormal::cleanUp().
 */
public function testLatin() {
	$precomposed = "L'\xc3\xa9cole";
	$cleaned = UtfNormal::cleanUp( $precomposed );

	$this->assertEquals( $precomposed, $cleaned );
}
62 
/**
 * "e" followed by the combining acute accent U+0301 (bytes CC 81) is
 * expected to come back as the single precomposed character U+00E9.
 */
public function testLatinNormal() {
	$cleaned = UtfNormal::cleanUp( "L'e\xcc\x81cole" );

	$this->assertEquals( "L'\xc3\xa9cole", $cleaned );
}
69 
/**
 * Feed every code point from U+0000 through UNICODE_MAX (inclusive) to
 * UtfNormal::cleanUp(), one character at a time, and check the result.
 *
 * Expectations:
 *  - Tab, LF, CR and every code point above U+001F that is neither a
 *    surrogate nor U+FFFE/U+FFFF is accepted. If the character appears in
 *    the canonical composition or decomposition table, the result must equal
 *    UtfNormal::NFC() of the character; otherwise it must be unchanged.
 *  - Everything else must be turned into UTF8_REPLACEMENT.
 *
 * A progress line is echoed every 0x1000 code points.
 *
 * NOTE(review): the method name does not start with "test", presumably so
 * that PHPUnit does not run this very long loop automatically -- confirm.
 */
function XtestAllChars() {
	$rep = UTF8_REPLACEMENT;
	# The upper bound is inclusive: the acceptance check below explicitly
	# allows UNICODE_MAX itself, so the last code point has to be visited.
	for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
		$char = codepointToUtf8( $i );
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%04X", $i );

		if ( $i % 0x1000 == 0 ) {
			echo "U+$x\n";
		}

		$isAccepted = $i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
			( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
			( $i > 0xffff && $i <= UNICODE_MAX );

		if ( !$isAccepted ) {
			$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
			continue;
		}

		if ( isset( UtfNormal::$utfCanonicalComp[$char] )
			|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
		) {
			# The character takes part in canonical (de)composition, so the
			# reference value is whatever NFC produces for it.
			$comp = UtfNormal::NFC( $char );
			$this->assertEquals(
				bin2hex( $comp ),
				bin2hex( $clean ),
				"U+$x should be normalized to NFC" );
		} else {
			$this->assertEquals(
				bin2hex( $char ),
				bin2hex( $clean ),
				"U+$x should be intact" );
		}
	}
}
109 
/**
 * Run the single-byte checks of doTestBytes() with every combination of an
 * empty or "x" prefix and an empty or "x" suffix.
 */
public function testAllBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);

	foreach ( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestBytes( $head, $tail );
	}
}
117 
/**
 * Check how UtfNormal::cleanUp() treats each of the 256 possible byte values
 * when the byte is placed between $head and $tail.
 *
 * Tab (0x09), LF (0x0a), CR (0x0d) and the bytes 0x20-0x7f must be returned
 * unchanged; every other byte must be replaced by UTF8_REPLACEMENT.
 *
 * @param string $head Text placed in front of the byte under test
 * @param string $tail Text placed after the byte under test
 */
function doTestBytes( $head, $tail ) {
	for ( $i = 0x0; $i < 256; $i++ ) {
		$char = $head . chr( $i ) . $tail;
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%02X", $i );

		$isPermitted = $i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			( $i > 0x001f && $i < 0x80 );

		if ( $isPermitted ) {
			$expected = $char;
			$message = "ASCII byte $x should be intact";
		} else {
			$expected = $head . UTF8_REPLACEMENT . $tail;
			$message = "Forbidden byte $x should be rejected";
		}

		$this->assertEquals( bin2hex( $expected ), bin2hex( $clean ), $message );

		# Stop at the first mismatch. The comparison is strict because with an
		# empty head and tail the input can be a numeric string such as "5",
		# and a loose comparison of two numeric strings is done numerically
		# rather than byte by byte.
		if ( $expected !== $clean ) {
			return;
		}
	}
}
149 
/**
 * Run the two-byte checks of doTestDoubleBytes() with every combination of
 * an empty or "x" prefix and an empty or "x" suffix.
 */
public function testDoubleBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);

	foreach ( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestDoubleBytes( $head, $tail );
	}
}
157 
/**
 * Check UtfNormal::cleanUp() on two-byte combinations placed between $head
 * and $tail. The first byte runs over 0xc0-0xfe and the second over
 * 0x80-0xfe, both in steps of two, so only even byte values are visited.
 *
 * The loop stops at the first combination whose result differs from the
 * expectation.
 *
 * @param string $head Text placed in front of the byte pair
 * @param string $tail Text placed after the byte pair
 */
function doTestDoubleBytes( $head, $tail ) {
	for ( $lead = 0xc0; $lead < 0x100; $lead += 2 ) {
		for ( $trail = 0x80; $trail < 0x100; $trail += 2 ) {
			$input = $head . chr( $lead ) . chr( $trail ) . $tail;
			$cleaned = UtfNormal::cleanUp( $input );
			$label = sprintf( "%02X,%02X", $lead, $trail );

			if ( $lead > 0xc1 && $lead < 0xe0 && $trail < 0xc0 ) {
				# Lead byte 0xc2-0xdf followed by a byte below 0xc0: the
				# result must match plain NFC normalization of the input.
				$wanted = UtfNormal::NFC( $input );
				$message = "Pair $label should be intact";
			} elseif ( $lead > 0xfd || $trail > 0xbf ) {
				# Lead byte above 0xfd, or a second byte of 0xc0 and up:
				# one replacement character is expected for each byte.
				$wanted = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $label should be rejected";
			} else {
				# Remaining lead bytes followed by a byte below 0xc0: the
				# pair as a whole is expected to become one replacement.
				$wanted = $head . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $label should be rejected";
			}

			$this->assertEquals( bin2hex( $wanted ), bin2hex( $cleaned ), $message );
			if ( $wanted != $cleaned ) {
				return;
			}
		}
	}
}
202 
/**
 * Run the three-byte checks of doTestTripleBytes() with every combination
 * of an empty or "x" prefix and an empty or "x" suffix.
 */
public function testTripleBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);

	foreach ( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestTripleBytes( $head, $tail );
	}
}
210 
/**
 * Check UtfNormal::cleanUp() on three-byte combinations placed between
 * $head and $tail.
 *
 * The first byte runs over 0xc0-0xfe and the second over 0x80-0xfe, both in
 * steps of two; the third byte is fixed at 0x80 (the loop over it has a
 * single iteration).
 *
 * NOTE(review): because the first byte only takes even values, 0xed never
 * occurs, so the surrogate branch below is not reached with this stepping.
 *
 * @param string $head Text placed in front of the byte triplet
 * @param string $tail Text placed after the byte triplet
 */
function doTestTripleBytes( $head, $tail ) {
	for ( $b1 = 0xc0; $b1 < 0x100; $b1 += 2 ) {
		for ( $b2 = 0x80; $b2 < 0x100; $b2 += 2 ) {
			# Only 0x80 is tried as the third byte; widen the bound to
			# 0x100 to cover the full range.
			for ( $b3 = 0x80; $b3 < 0x81; $b3++ ) {
				$input = $head . chr( $b1 ) . chr( $b2 ) . chr( $b3 ) . $tail;
				$cleaned = UtfNormal::cleanUp( $input );
				$label = sprintf( "%02X,%02X,%02X", $b1, $b2, $b3 );

				if ( $b1 >= 0xe0 && $b1 < 0xf0 && $b2 < 0xc0 && $b3 < 0xc0 ) {
					# Three-byte lead (0xe0-0xef) followed by two bytes
					# below 0xc0.
					if ( $b1 == 0xe0 && $b2 < 0xa0 ) {
						$wanted = $head . UTF8_REPLACEMENT . $tail;
						$message = "Overlong triplet $label should be rejected";
					} elseif ( $b1 == 0xed &&
						( chr( $b1 ) . chr( $b2 ) . chr( $b3 ) ) >= UTF8_SURROGATE_FIRST
					) {
						$wanted = $head . UTF8_REPLACEMENT . $tail;
						$message = "Surrogate triplet $label should be rejected";
					} else {
						$wanted = UtfNormal::NFC( $input );
						$message = "Triplet $label should be intact";
					}
				} elseif ( $b1 > 0xc1 && $b1 < 0xe0 && $b2 < 0xc0 ) {
					# The first two bytes form a two-byte sequence; the
					# third byte is left over and must be replaced.
					$wanted = UtfNormal::NFC( $head . chr( $b1 ) . chr( $b2 ) )
						. UTF8_REPLACEMENT . $tail;
					$message = "Valid 2-byte $label + broken tail";
				} elseif ( $b2 > 0xc1 && $b2 < 0xe0 && $b3 < 0xc0 ) {
					# The first byte is replaced; the last two bytes form a
					# two-byte sequence.
					$wanted = $head . UTF8_REPLACEMENT
						. UtfNormal::NFC( chr( $b2 ) . chr( $b3 ) . $tail );
					$message = "Broken head + valid 2-byte $label";
				} elseif ( ( $b1 > 0xfd || $b2 > 0xfd ) &&
					( ( $b2 > 0xbf && $b3 > 0xbf ) ||
						( $b2 < 0xc0 && $b3 < 0xc0 ) ||
						( $b2 > 0xfd ) ||
						( $b3 > 0xfd ) )
				) {
					# fe and ff are not legal head bytes -- expect three
					# replacement chars
					$wanted = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT
						. UTF8_REPLACEMENT . $tail;
					$message = "Forbidden triplet $label should be rejected";
				} elseif ( $b1 > 0xc2 && $b2 < 0xc0 && $b3 < 0xc0 ) {
					# The whole triplet collapses into one replacement.
					$wanted = $head . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden triplet $label should be rejected";
				} else {
					$wanted = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden triplet $label should be rejected";
				}

				$this->assertEquals( bin2hex( $wanted ), bin2hex( $cleaned ), $message );
			}
		}
	}
}
280 
/**
 * Check for regression against a chunking bug.
 *
 * The input is assembled from segments; each row pairs the raw bytes with
 * the bytes expected back from UtfNormal::cleanUp().
 */
public function testChunkRegression() {
	$fffd = "\xef\xbf\xbd";
	$segments = array(
		array( "\x46\x55\xb8", "\x46\x55" . $fffd ),
		array( "\xdc\x96", "\xdc\x96" ),
		array( "\xee", $fffd ),
		array( "\xe7", $fffd ),
		array( "\x44", "\x44" ),
		array( "\xaa", $fffd ),
		array( "\x2f\x25", "\x2f\x25" ),
	);

	$text = '';
	$expect = '';
	foreach ( $segments as $segment ) {
		$text .= $segment[0];
		$expect .= $segment[1];
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) )
	);
}
303 
/**
 * Regression check for stray continuation bytes and an unfinished lead byte
 * interposed between ASCII characters.
 *
 * Each row pairs the raw input bytes with the bytes expected back from
 * UtfNormal::cleanUp().
 */
public function testInterposeRegression() {
	$fffd = "\xef\xbf\xbd";
	$segments = array(
		array( "\x4e\x30", "\x4e\x30" ),
		array( "\xb1", $fffd ), # bad tail
		array( "\x3a", "\x3a" ),
		array( "\x92", $fffd ), # bad tail
		array( "\x62\x3a", "\x62\x3a" ),
		array( "\x84", $fffd ), # bad tail
		array( "\x43", "\x43" ),
		array( "\xc6", $fffd ), # bad head
		array( "\x3f", "\x3f" ),
		array( "\x92", $fffd ), # bad tail
		array( "\xad", $fffd ), # bad tail
		array( "\x7d", "\x7d" ),
		array( "\xd9\x95", "\xd9\x95" ),
	);

	$text = '';
	$expect = '';
	foreach ( $segments as $segment ) {
		$text .= $segment[0];
		$expect .= $segment[1];
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) )
	);
}
338 
/**
 * Regression check mixing forbidden ASCII control bytes, an unfinished lead
 * byte, an overlong two-byte sequence and stray continuation bytes.
 *
 * Each row pairs the raw input bytes with the bytes expected back from
 * UtfNormal::cleanUp().
 */
public function testOverlongRegression() {
	$fffd = "\xef\xbf\xbd";
	$segments = array(
		array( "\x67", "\x67" ),
		array( "\x1a", $fffd ), # forbidden ascii
		array( "\xea", $fffd ), # bad head
		array( "\xc1\xa6", $fffd ), # overlong sequence
		array( "\xad", $fffd ), # bad tail
		array( "\x1c", $fffd ), # forbidden ascii
		array( "\xb0", $fffd ), # bad tail
		array( "\x3c", "\x3c" ),
		array( "\x9e", $fffd ), # bad tail
	);

	$text = '';
	$expect = '';
	foreach ( $segments as $segment ) {
		$text .= $segment[0];
		$expect .= $segment[1];
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) )
	);
}
363 
/**
 * Regression check: an encoded surrogate followed by three stray
 * continuation bytes must yield four replacement characters.
 */
public function testSurrogateRegression() {
	$fffd = "\xef\xbf\xbd";
	$segments = array(
		array( "\xed\xb4\x96", $fffd ), # surrogate 0xDD16
		array( "\x83", $fffd ), # bad tail
		array( "\xb4", $fffd ), # bad tail
		array( "\xac", $fffd ), # also a continuation byte with no lead
	);

	$text = '';
	$expect = '';
	foreach ( $segments as $segment ) {
		$text .= $segment[0];
		$expect .= $segment[1];
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) )
	);
}
378 
/**
 * Regression check: U+FFFE, a stray continuation byte and an unfinished lead
 * byte must each become one replacement character, while the trailing ASCII
 * "Y" is kept.
 */
public function testBomRegression() {
	$fffd = "\xef\xbf\xbd";
	$segments = array(
		array( "\xef\xbf\xbe", $fffd ), # U+FFFE, illegal char
		array( "\xb2", $fffd ), # bad tail
		array( "\xef", $fffd ), # bad head
		array( "\x59", "\x59" ),
	);

	$text = '';
	$expect = '';
	foreach ( $segments as $segment ) {
		$text .= $segment[0];
		$expect .= $segment[1];
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) )
	);
}
393 
/**
 * Regression check: the illegal character U+FFFF (bytes EF BF BF) must be
 * replaced by U+FFFD (bytes EF BF BD).
 */
public function testForbiddenRegression() {
	$cleaned = UtfNormal::cleanUp( "\xef\xbf\xbf" );

	$this->assertEquals(
		bin2hex( "\xef\xbf\xbd" ),
		bin2hex( $cleaned )
	);
}
402 
/**
 * Regression check: a Hangul character followed by another final jamo must
 * come back from UtfNormal::cleanUp() unchanged.
 */
public function testHangulRegression() {
	$hangul = "\xed\x9c\xaf"; # Hangul char
	$finalJamo = "\xe1\x87\x81"; # followed by another final jamo
	$text = $hangul . $finalJamo;

	# The expected output is the input itself.
	$this->assertEquals(
		bin2hex( $text ),
		bin2hex( UtfNormal::cleanUp( $text ) )
	);
}
412 }
CleanUpTest\testDoubleBytes
testDoubleBytes()
Definition: CleanUpTest.php:151
CleanUpTest\testChunkRegression
testChunkRegression()
Definition: CleanUpTest.php:282
CleanUpTest\testBomRegression
testBomRegression()
Definition: CleanUpTest.php:380
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
CleanUpTest\testHangulRegression
testHangulRegression()
Definition: CleanUpTest.php:404
UNICODE_SURROGATE_FIRST
const UNICODE_SURROGATE_FIRST
Definition: UtfNormalDefines.php:44
UNICODE_SURROGATE_LAST
const UNICODE_SURROGATE_LAST
Definition: UtfNormalDefines.php:45
CleanUpTest\XtestAllChars
XtestAllChars()
This test is very expensive!
Definition: CleanUpTest.php:74
UtfNormal\cleanUp
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
Definition: UtfNormal.php:79
CleanUpTest\testOverlongRegression
testOverlongRegression()
Definition: CleanUpTest.php:340
CleanUpTest\testLatin
testLatin()
Definition: CleanUpTest.php:58
UtfNormal\$utfCanonicalDecomp
static $utfCanonicalDecomp
Definition: UtfNormal.php:62
UtfNormal\$utfCanonicalComp
static $utfCanonicalComp
Definition: UtfNormal.php:61
codepointToUtf8
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
Definition: UtfNormalUtil.php:36
CleanUpTest\testSurrogateRegression
testSurrogateRegression()
Definition: CleanUpTest.php:365
MediaWikiTestCase
Definition: MediaWikiTestCase.php:6
UTF8_SURROGATE_FIRST
const UTF8_SURROGATE_FIRST
Definition: UtfNormalDefines.php:61
CleanUpTest\testInterposeRegression
testInterposeRegression()
Definition: CleanUpTest.php:305
CleanUpTest\testNull
testNull()
Definition: CleanUpTest.php:49
CleanUpTest\doTestBytes
doTestBytes( $head, $tail)
Definition: CleanUpTest.php:119
CleanUpTest\doTestTripleBytes
doTestTripleBytes( $head, $tail)
Definition: CleanUpTest.php:212
CleanUpTest\testForbiddenRegression
testForbiddenRegression()
Definition: CleanUpTest.php:395
CleanUpTest\testTripleBytes
testTripleBytes()
Definition: CleanUpTest.php:204
UTF8_REPLACEMENT
const UTF8_REPLACEMENT
Definition: UtfNormalDefines.php:64
UtfNormal\NFC
static NFC( $string)
Definition: UtfNormal.php:462
CleanUpTest\doTestDoubleBytes
doTestDoubleBytes( $head, $tail)
Definition: CleanUpTest.php:161
CleanUpTest\testAscii
testAscii()
Definition: CleanUpTest.php:43
change
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users. It's targeted particularly at maintainers for Linux distributions, since it's been observed that distribution packages of MediaWiki often break. We've consistently had to recommend that users seeking support use official tarballs instead of their distribution's packages, and this often solves whatever problem the user is having. It would be nice if this could change.
Definition: distributors.txt:9
CleanUpTest
Additional tests for UtfNormal::cleanUp() function, including regression checks for known problems.
Definition: CleanUpTest.php:41
CleanUpTest\testAllBytes
testAllBytes()
Definition: CleanUpTest.php:111
CleanUpTest\testLatinNormal
testLatinNormal()
Definition: CleanUpTest.php:64
UNICODE_MAX
const UNICODE_MAX
Definition: UtfNormalDefines.php:46