MediaWiki  1.23.14
UtfNormal.php
Go to the documentation of this file.
1 <?php
/**
 * Feature flags recording which native normalizer implementations are
 * available in this PHP runtime. UtfNormal checks these to decide whether
 * to use a native function or its own pure-PHP routines.
 *
 * NORMALIZE_ICU:  true when utf8_normalize() exists.
 * NORMALIZE_INTL: true when normalizer_normalize() exists.
 */
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
33 
/**
 * Unicode normalization routines for working with UTF-8 strings.
 *
 * The public entry points (cleanUp, toNFC, toNFD, toNFKC, toNFKD) prefer a
 * native normalizer when NORMALIZE_ICU / NORMALIZE_INTL say one is available
 * and otherwise fall back to the pure-PHP routines in this class.
 *
 * The pure-PHP routines depend on lookup tables stored in the static
 * properties below; those are populated by including UtfNormalData.inc
 * (see loadData()) and, for compatibility forms, UtfNormalDataK.inc.
 * The UTF8_* and UNICODE_* constants used here are defined outside this file.
 */
class UtfNormal {
	/**
	 * For using the ICU wrapper.
	 * These values are passed as the mode argument to utf8_normalize().
	 */
	const UNORM_NONE = 1;
	const UNORM_NFD = 2;
	const UNORM_NFKD = 3;
	const UNORM_NFC = 4;
	const UNORM_NFKC = 5;
	const UNORM_FCD = 6;
	const UNORM_DEFAULT = self::UNORM_NFC;

	# Lookup tables, keyed by UTF-8 byte sequence; filled in by the data
	# file included from loadData().
	public static $utfCombiningClass = null;
	public static $utfCanonicalComp = null;
	public static $utfCanonicalDecomp = null;

	# Load compatibility decompositions on demand if they are needed.
	public static $utfCompatibilityDecomp = null;

	public static $utfCheckNFC;

	/**
	 * The ultimate convenience function! Clean up invalid UTF-8 sequences,
	 * and convert to normal form C, canonical composition.
	 *
	 * Fast return for pure ASCII strings; some lesser optimizations for
	 * strings containing only known-good characters. Not as fast as toNFC().
	 *
	 * @param string $string a UTF-8 string
	 * @return string a clean, shiny, normalized UTF-8 string
	 */
	public static function cleanUp( $string ) {
		if ( NORMALIZE_ICU ) {
			$string = self::replaceForNativeNormalize( $string );

			# UnicodeString constructor fails if the string ends with a
			# head byte. Add a junk char at the end, we'll strip it off.
			return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
		} elseif ( NORMALIZE_INTL ) {
			$string = self::replaceForNativeNormalize( $string );
			$norm = normalizer_normalize( $string, Normalizer::FORM_C );
			if ( $norm === null || $norm === false ) {
				# normalizer_normalize will either return false or null
				# (depending on which doc you read) if invalid utf8 string.
				# quickIsNFCVerify cleans up invalid sequences.

				if ( self::quickIsNFCVerify( $string ) ) {
					# if that's true, the string is actually already normal.
					return $string;
				} else {
					# Now we are valid but non-normal
					return normalizer_normalize( $string, Normalizer::FORM_C );
				}
			} else {
				return $norm;
			}
		} elseif ( self::quickIsNFCVerify( $string ) ) {
			# Side effect -- $string has had UTF-8 errors cleaned up.
			return $string;
		} else {
			return self::NFC( $string );
		}
	}

	/**
	 * Convert a UTF-8 string to normal form C, canonical composition.
	 * Fast return for pure ASCII strings; some lesser optimizations for
	 * strings containing only known-good characters.
	 *
	 * @param string $string a valid UTF-8 string. Input is not validated.
	 * @return string a UTF-8 string in normal form C
	 */
	public static function toNFC( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_C );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFC );
		} elseif ( self::quickIsNFC( $string ) ) {
			return $string;
		} else {
			return self::NFC( $string );
		}
	}

	/**
	 * Convert a UTF-8 string to normal form D, canonical decomposition.
	 * Fast return for pure ASCII strings.
	 *
	 * @param string $string a valid UTF-8 string. Input is not validated.
	 * @return string a UTF-8 string in normal form D
	 */
	public static function toNFD( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_D );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFD );
		} elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
			return self::NFD( $string );
		} else {
			return $string;
		}
	}

	/**
	 * Convert a UTF-8 string to normal form KC, compatibility composition.
	 * This may cause irreversible information loss, use judiciously.
	 * Fast return for pure ASCII strings.
	 *
	 * @param string $string a valid UTF-8 string. Input is not validated.
	 * @return string a UTF-8 string in normal form KC
	 */
	public static function toNFKC( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_KC );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFKC );
		} elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
			return self::NFKC( $string );
		} else {
			return $string;
		}
	}

	/**
	 * Convert a UTF-8 string to normal form KD, compatibility decomposition.
	 * This may cause irreversible information loss, use judiciously.
	 * Fast return for pure ASCII strings.
	 *
	 * @param string $string a valid UTF-8 string. Input is not validated.
	 * @return string a UTF-8 string in normal form KD
	 */
	public static function toNFKD( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_KD );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFKD );
		} elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
			return self::NFKD( $string );
		} else {
			return $string;
		}
	}

	/**
	 * Load the basic composition data if necessary.
	 * Includes UtfNormalData.inc from this file's directory the first time
	 * $utfCombiningClass is found to be unset.
	 */
	public static function loadData() {
		if ( !isset( self::$utfCombiningClass ) ) {
			require_once __DIR__ . '/UtfNormalData.inc';
		}
	}

	/**
	 * Returns true if the string is _definitely_ in NFC.
	 * Returns false if not or uncertain.
	 *
	 * @param string $string a valid UTF-8 string. Input is not validated.
	 * @return bool
	 */
	public static function quickIsNFC( $string ) {
		# ASCII is always valid NFC!
		# If it's pure ASCII, let it through.
		if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
			return true;
		}

		self::loadData();
		$len = strlen( $string );
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			# Pick up the whole multi-byte sequence based on the head byte.
			if ( $n < 0x80 ) {
				continue;
			} elseif ( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif ( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif ( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if ( isset( self::$utfCheckNFC[$c] ) ) {
				# If it's NO or MAYBE, bail and do the slow check.
				return false;
			}
			if ( isset( self::$utfCombiningClass[$c] ) ) {
				# Combining character? We might have to do sorting, at least.
				return false;
			}
		}
		return true;
	}

	/**
	 * Returns true if the string is _definitely_ in NFC.
	 * Returns false if not or uncertain.
	 *
	 * As a side effect the string is modified in place: control characters
	 * matched by the first regex and invalid UTF-8 sequences are replaced
	 * with UTF8_REPLACEMENT (or dropped, for stray tail bytes following an
	 * already-replaced broken sequence).
	 *
	 * @param string &$string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
	 * @return bool
	 */
	public static function quickIsNFCVerify( &$string ) {
		# Screen out some characters that eg won't be allowed in XML
		$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

		# ASCII is always valid NFC!
		# If we're only ever given plain ASCII, we can avoid the overhead
		# of initializing the decomposition tables by skipping out early.
		if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
			return true;
		}

		static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
		if ( !isset( $checkit ) ) {
			# Load/build some scary lookup tables...
			self::loadData();

			$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

			# Head bytes for sequences which we should do further validity checks
			$checkit = array_flip( array_map( 'chr',
				array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
					0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
					0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

			# Each UTF-8 head byte is followed by a certain
			# number of tail bytes.
			$tailBytes = array();
			for ( $n = 0; $n < 256; $n++ ) {
				if ( $n < 0xc0 ) {
					$remaining = 0;
				} elseif ( $n < 0xe0 ) {
					$remaining = 1;
				} elseif ( $n < 0xf0 ) {
					$remaining = 2;
				} elseif ( $n < 0xf8 ) {
					$remaining = 3;
				} elseif ( $n < 0xfc ) {
					$remaining = 4;
				} elseif ( $n < 0xfe ) {
					$remaining = 5;
				} else {
					$remaining = 0;
				}
				$tailBytes[chr( $n )] = $remaining;
			}
		}

		# Chop the text into pure-ASCII and non-ASCII areas;
		# large ASCII parts can be handled much more quickly.
		# Don't chop up Unicode areas for punctuation, though,
		# that wastes energy.
		$matches = array();
		preg_match_all(
			'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
			$string, $matches );

		$looksNormal = true;
		$base = 0;          # byte offset of the current chunk within $string
		$replace = array(); # list of array( replacement, start, length )
		foreach ( $matches[1] as $str ) {
			$chunk = strlen( $str );

			if ( $str[0] < "\x80" ) {
				# ASCII chunk: guaranteed to be valid UTF-8
				# and in normal form C, so skip over it.
				$base += $chunk;
				continue;
			}

			# We'll have to examine the chunk byte by byte to ensure
			# that it consists of valid UTF-8 sequences, and to see
			# if any of them might not be normalized.
			#
			# Since PHP is not the fastest language on earth, some of
			# this code is a little ugly with inner loop optimizations.

			$head = '';
			$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

			for ( $i = -1; --$len; ) {
				$remaining = $tailBytes[$c = $str[++$i]];
				if ( $remaining ) {
					# UTF-8 head byte!
					$sequence = $head = $c;
					do {
						# Look for the defined number of tail bytes...
						if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
							# Legal tail bytes are nice.
							$sequence .= $c;
						} else {
							if ( 0 == $len ) {
								# Premature end of string!
								# Drop a replacement character into output to
								# represent the invalid UTF-8 sequence.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								break 2;
							} else {
								# Illegal tail byte; abandon the sequence.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i - strlen( $sequence ),
									strlen( $sequence ) );
								# Back up and reprocess this byte; it may itself
								# be a legal ASCII or UTF-8 sequence head.
								--$i;
								++$len;
								continue 2;
							}
						}
					} while ( --$remaining );

					if ( isset( $checkit[$head] ) ) {
						# Do some more detailed validity checks, for
						# invalid characters and illegal sequences.
						if ( $head == "\xed" ) {
							# 0xed is relatively frequent in Korean, which
							# abuts the surrogate area, so we're doing
							# this check separately to speed things up.

							if ( $sequence >= UTF8_SURROGATE_FIRST ) {
								# Surrogates are legal only in UTF-16 code.
								# They are totally forbidden here in UTF-8
								# utopia.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								$head = '';
								continue;
							}
						} else {
							# Slower, but rarer checks...
							$n = ord( $head );
							if (
								# "Overlong sequences" are those that are syntactically
								# correct but use more UTF-8 bytes than are necessary to
								# encode a character. Naïve string comparisons can be
								# tricked into failing to see a match for an ASCII
								# character, for instance, which can be a security hole
								# if blacklist checks are being used.
								( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
								|| ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
								|| ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )

								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
								# (Grouping made explicit; both sequences start with 0xef.)
								|| ( $n == 0xef &&
									( $sequence == UTF8_FFFE || $sequence == UTF8_FFFF ) )

								# Unicode has been limited to 21 bits; longer
								# sequences are not allowed.
								|| ( $n >= 0xf0 && $sequence > UTF8_MAX )
							) {
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								$head = '';
								continue;
							}
						}
					}

					if ( isset( $utfCheckOrCombining[$sequence] ) ) {
						# If it's NO or MAYBE, we'll have to rip
						# the string apart and put it back together.
						# That's going to be mighty slow.
						$looksNormal = false;
					}

					# The sequence is legal!
					$head = '';
				} elseif ( $c < "\x80" ) {
					# ASCII byte.
					$head = '';
				} elseif ( $c < "\xc0" ) {
					# Illegal tail bytes
					if ( $head == '' ) {
						# Out of the blue!
						$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					} else {
						# Don't add if we're continuing a broken sequence;
						# we already put a replacement character when we looked
						# at the broken sequence.
						$replace[] = array( '', $base + $i, 1 );
					}
				} else {
					# Miscellaneous freaks.
					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					$head = '';
				}
			}
			$base += $chunk;
		}
		if ( count( $replace ) ) {
			# There were illegal UTF-8 sequences we need to fix up.
			$out = '';
			$last = 0;
			foreach ( $replace as $rep ) {
				list( $replacement, $start, $length ) = $rep;
				if ( $last < $start ) {
					$out .= substr( $string, $last, $start - $last );
				}
				$out .= $replacement;
				$last = $start + $length;
			}
			if ( $last < strlen( $string ) ) {
				$out .= substr( $string, $last );
			}
			$string = $out;
		}
		return $looksNormal;
	}

	# These take a string and run the normalization on them, without
	# checking for validity or any optimization etc. Input must be
	# VALID UTF-8!

	/**
	 * Canonical composition: decompose with NFD(), then recompose.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	public static function NFC( $string ) {
		return self::fastCompose( self::NFD( $string ) );
	}

	/**
	 * Canonical decomposition: decompose using the canonical map, then
	 * sort combining characters into canonical order.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	public static function NFD( $string ) {
		self::loadData();

		return self::fastCombiningSort(
			self::fastDecompose( $string, self::$utfCanonicalDecomp ) );
	}

	/**
	 * Compatibility composition: decompose with NFKD(), then recompose.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	public static function NFKC( $string ) {
		return self::fastCompose( self::NFKD( $string ) );
	}

	/**
	 * Compatibility decomposition: decompose using the compatibility map
	 * (loaded on first use), then sort combining characters.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	public static function NFKD( $string ) {
		if ( !isset( self::$utfCompatibilityDecomp ) ) {
			# Resolve relative to this file, like loadData(), rather than
			# depending on the include path.
			require_once __DIR__ . '/UtfNormalDataK.inc';
		}
		return self::fastCombiningSort(
			self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
	}

	/**
	 * Perform decomposition of a UTF-8 string into either D or KD form
	 * (depending on which decomposition map is passed to us).
	 * Input is assumed to be *valid* UTF-8. Invalid code will break.
	 *
	 * @param string $string valid UTF-8 string
	 * @param array $map hash of expanded decomposition map
	 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
	 */
	public static function fastDecompose( $string, $map ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n < 0x80 ) {
				# ASCII chars never decompose
				# THEY ARE IMMORTAL
				$out .= $c;
				continue;
			} elseif ( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif ( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif ( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if ( isset( $map[$c] ) ) {
				$out .= $map[$c];
				continue;
			} else {
				if ( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
					# Decompose a hangul syllable into jamo;
					# hardcoded for three-byte UTF-8 sequence.
					# A lookup table would be slightly faster,
					# but adds a lot of memory & disk needs.
					#
					# Decode the three-byte sequence to a code point and
					# take its offset from the first hangul syllable.
					$index = ( ( ord( $c[0] ) & 0x0f ) << 12
						| ( ord( $c[1] ) & 0x3f ) << 6
						| ( ord( $c[2] ) & 0x3f ) )
						- UNICODE_HANGUL_FIRST;
					$l = intval( $index / UNICODE_HANGUL_NCOUNT );
					$v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
					$t = $index % UNICODE_HANGUL_TCOUNT;
					$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
					if ( $t >= 25 ) {
						$out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
					} elseif ( $t ) {
						$out .= "\xe1\x86" . chr( 0xa7 + $t );
					}
					continue;
				}
			}
			$out .= $c;
		}
		return $out;
	}

	/**
	 * Sorts combining characters into canonical order. This is the
	 * final step in creating decomposed normal forms D and KD.
	 *
	 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
	 * @return string a UTF-8 string with combining characters sorted in canonical order
	 */
	public static function fastCombiningSort( $string ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		$combiners = array(); # pending combining chars, keyed by combining class
		$lastClass = -1;
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n >= 0x80 ) {
				if ( $n >= 0xf0 ) {
					$c = substr( $string, $i, 4 );
					$i += 3;
				} elseif ( $n >= 0xe0 ) {
					$c = substr( $string, $i, 3 );
					$i += 2;
				} elseif ( $n >= 0xc0 ) {
					$c = substr( $string, $i, 2 );
					$i++;
				}
				if ( isset( self::$utfCombiningClass[$c] ) ) {
					$lastClass = self::$utfCombiningClass[$c];
					if ( isset( $combiners[$lastClass] ) ) {
						$combiners[$lastClass] .= $c;
					} else {
						$combiners[$lastClass] = $c;
					}
					continue;
				}
			}
			# Non-combining character: flush any pending combiners in
			# class order before emitting it.
			if ( $lastClass ) {
				ksort( $combiners );
				$out .= implode( '', $combiners );
				$combiners = array();
			}
			$out .= $c;
			$lastClass = 0;
		}
		if ( $lastClass ) {
			ksort( $combiners );
			$out .= implode( '', $combiners );
		}
		return $out;
	}

	/**
	 * Produces canonically composed sequences, i.e. normal form C or KC.
	 *
	 * @param string $string a valid UTF-8 string in sorted normal form D or KD.
	 *   Input is not validated.
	 * @return string a UTF-8 string with canonical precomposition applied
	 */
	public static function fastCompose( $string ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		$lastClass = -1;
		$lastHangul = 0;
		$startChar = '';
		$combining = '';
		# First bytes bounding the jamo range, for a cheap pre-check below.
		$x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
		$x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n < 0x80 ) {
				# No combining characters here...
				$out .= $startChar;
				$out .= $combining;
				$startChar = $c;
				$combining = '';
				$lastClass = 0;
				continue;
			} elseif ( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif ( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif ( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			$pair = $startChar . $c;
			if ( $n > 0x80 ) {
				if ( isset( self::$utfCombiningClass[$c] ) ) {
					# A combining char; see what we can do with it
					$class = self::$utfCombiningClass[$c];
					if ( !empty( $startChar ) &&
						$lastClass < $class &&
						$class > 0 &&
						isset( self::$utfCanonicalComp[$pair] )
					) {
						$startChar = self::$utfCanonicalComp[$pair];
						$class = 0;
					} else {
						$combining .= $c;
					}
					$lastClass = $class;
					$lastHangul = 0;
					continue;
				}
			}
			# New start char
			if ( $lastClass == 0 ) {
				if ( isset( self::$utfCanonicalComp[$pair] ) ) {
					$startChar = self::$utfCanonicalComp[$pair];
					$lastHangul = 0;
					continue;
				}
				if ( $n >= $x1 && $n <= $x2 ) {
					# WARNING: Hangul code is painfully slow.
					# I apologize for this ugly, ugly code; however
					# performance is even more teh suck if we call
					# out to nice clean functions. Lookup tables are
					# marginally faster, but require a lot of space.
					#
					if ( $c >= UTF8_HANGUL_VBASE &&
						$c <= UTF8_HANGUL_VEND &&
						$startChar >= UTF8_HANGUL_LBASE &&
						$startChar <= UTF8_HANGUL_LEND
					) {
						#
						#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
						#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
						$lIndex = ord( $startChar[2] ) - 0x80;
						$vIndex = ord( $c[2] ) - 0xa1;

						$hangulPoint = UNICODE_HANGUL_FIRST +
							UNICODE_HANGUL_TCOUNT *
							( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

						# Hardcode the limited-range UTF-8 conversion:
						$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
							chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
							chr( $hangulPoint & 0x3f | 0x80 );
						$lastHangul = 0;
						continue;
					} elseif ( $c >= UTF8_HANGUL_TBASE &&
						$c <= UTF8_HANGUL_TEND &&
						$startChar >= UTF8_HANGUL_FIRST &&
						$startChar <= UTF8_HANGUL_LAST &&
						!$lastHangul
					) {
						# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
						$tIndex = ord( $c[2] ) - 0xa7;
						if ( $tIndex < 0 ) {
							$tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
						}

						# Increment the code point by $tIndex, without
						# the function overhead of decoding and recoding UTF-8
						#
						$tail = ord( $startChar[2] ) + $tIndex;
						if ( $tail > 0xbf ) {
							$tail -= 0x40;
							$mid = ord( $startChar[1] ) + 1;
							if ( $mid > 0xbf ) {
								$startChar[0] = chr( ord( $startChar[0] ) + 1 );
								$mid -= 0x40;
							}
							$startChar[1] = chr( $mid );
						}
						$startChar[2] = chr( $tail );

						# If there's another jamo char after this, *don't* try to merge it.
						$lastHangul = 1;
						continue;
					}
				}
			}
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			$lastHangul = 0;
		}
		$out .= $startChar . $combining;
		return $out;
	}

	/**
	 * This is just used for the benchmark, comparing how long it takes to
	 * iterate through a string without really doing anything of substance.
	 *
	 * @param string $string
	 * @return string a byte-for-byte copy of the input
	 */
	public static function placebo( $string ) {
		$len = strlen( $string );
		$out = '';
		for ( $i = 0; $i < $len; $i++ ) {
			$out .= $string[$i];
		}
		return $out;
	}

	/**
	 * Function to replace some characters that we don't want
	 * but most of the native normalize functions keep.
	 *
	 * Replaces the control characters matched by the regex below, and the
	 * UTF8_FFFE / UTF8_FFFF sequences, with UTF8_REPLACEMENT.
	 *
	 * @param string $string The string
	 * @return string String with the character codes replaced.
	 */
	private static function replaceForNativeNormalize( $string ) {
		$string = preg_replace(
			'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
			UTF8_REPLACEMENT,
			$string );
		$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
		$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
		return $string;
	}
}
UtfNormal\$utfCombiningClass
static $utfCombiningClass
Definition: UtfNormal.php:60
UtfNormal\fastCompose
static fastCompose( $string)
Produces canonically composed sequences, i.e.
Definition: UtfNormal.php:622
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
UtfNormal\NFKC
static NFKC( $string)
Definition: UtfNormal.php:483
is
We use the convention $dbr for read and $dbw for write to help you keep track of whether the database object is a the world will explode Or to be a subsequent write query which succeeded on the master may fail when replicated to the slave due to a unique key collision Replication on the slave will stop and it may take hours to repair the database and get it back online Setting read_only in my cnf on the slave will avoid this but given the dire we prefer to have as many checks as possible We provide a but the wrapper functions like please read the documentation for except in special pages derived from QueryPage It s a common pitfall for new developers to submit code containing SQL queries which examine huge numbers of rows Remember that COUNT * is(N), counting rows in atable is like counting beans in a bucket.------------------------------------------------------------------------ Replication------------------------------------------------------------------------The largest installation of MediaWiki, Wikimedia, uses a large set ofslave MySQL servers replicating writes made to a master MySQL server. Itis important to understand the issues associated with this setup if youwant to write code destined for Wikipedia.It 's often the case that the best algorithm to use for a given taskdepends on whether or not replication is in use. Due to our unabashedWikipedia-centrism, we often just use the replication-friendly version, but if you like, you can use wfGetLB() ->getServerCount() > 1 tocheck to see if replication is in use.===Lag===Lag primarily occurs when large write queries are sent to the master.Writes on the master are executed in parallel, but they are executed inserial when they are replicated to the slaves. The master writes thequery to the binlog when the transaction is committed. The slaves pollthe binlog and start executing the query as soon as it appears. 
They canservice reads while they are performing a write query, but will not readanything more from the binlog and thus will perform no more writes. Thismeans that if the write query runs for a long time, the slaves will lagbehind the master for the time it takes for the write query to complete.Lag can be exacerbated by high read load. MediaWiki 's load balancer willstop sending reads to a slave when it is lagged by more than 30 seconds.If the load ratios are set incorrectly, or if there is too much loadgenerally, this may lead to a slave permanently hovering around 30seconds lag.If all slaves are lagged by more than 30 seconds, MediaWiki will stopwriting to the database. All edits and other write operations will berefused, with an error returned to the user. This gives the slaves achance to catch up. Before we had this mechanism, the slaves wouldregularly lag by several minutes, making review of recent editsdifficult.In addition to this, MediaWiki attempts to ensure that the user seesevents occurring on the wiki in chronological order. A few seconds of lagcan be tolerated, as long as the user sees a consistent picture fromsubsequent requests. This is done by saving the master binlog positionin the session, and then at the start of each request, waiting for theslave to catch up to that position before doing any reads from it. Ifthis wait times out, reads are allowed anyway, but the request isconsidered to be in "lagged slave mode". Lagged slave mode can bechecked by calling wfGetLB() ->getLaggedSlaveMode(). The onlypractical consequence at present is a warning displayed in the pagefooter.===Lag avoidance===To avoid excessive lag, queries which write large numbers of rows shouldbe split up, generally to write one row at a time. Multi-row INSERT ...SELECT queries are the worst offenders should be avoided altogether.Instead do the select first and then the insert.===Working with lag===Despite our best efforts, it 's not practical to guarantee a low-lagenvironment. 
Lag will usually be less than one second, but mayoccasionally be up to 30 seconds. For scalability, it 's very importantto keep load on the master low, so simply sending all your queries tothe master is not the answer. So when you have a genuine need forup-to-date data, the following approach is advised:1) Do a quick query to the master for a sequence number or timestamp 2) Run the full query on the slave and check if it matches the data you gotfrom the master 3) If it doesn 't, run the full query on the masterTo avoid swamping the master every time the slaves lag, use of thisapproach should be kept to a minimum. In most cases you should just readfrom the slave and let the user deal with the delay.------------------------------------------------------------------------ Lock contention------------------------------------------------------------------------Due to the high write rate on Wikipedia(and some other wikis), MediaWiki developers need to be very careful to structure their writesto avoid long-lasting locks. By default, MediaWiki opens a transactionat the first query, and commits it before the output is sent. Locks willbe held from the time when the query is done until the commit. So youcan reduce lock time by doing as much processing as possible before youdo your write queries.Often this approach is not good enough, and it becomes necessary toenclose small groups of queries in their own transaction. Use thefollowing syntax:$dbw=wfGetDB(DB_MASTER
UTF8_FFFF
const UTF8_FFFF
Definition: UtfNormalDefines.php:75
UTF8_HANGUL_LBASE
const UTF8_HANGUL_LBASE
Definition: UtfNormalDefines.php:53
$last
$last
Definition: profileinfo.php:365
UtfNormal\placebo
static placebo( $string)
This is just used for the benchmark, comparing how long it takes to iterate through a string without...
Definition: UtfNormal.php:753
UTF8_HANGUL_VEND
const UTF8_HANGUL_VEND
Definition: UtfNormalDefines.php:58
UtfNormal\UNORM_NFD
const UNORM_NFD
Definition: UtfNormal.php:53
UtfNormal\cleanUp
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
Definition: UtfNormal.php:79
character
</p > ! end ! test Bare pipe character(bug 52363) !! wikitext|!! html< p >|</p > !! end !! test Bare pipe character from a template(bug 52363) !! wikitext
Definition: parserTests.txt:918
NORMALIZE_ICU
const NORMALIZE_ICU
Definition: UtfNormal.php:31
UtfNormal\fastCombiningSort
static fastCombiningSort( $string)
Sorts combining characters into canonical order.
Definition: UtfNormal.php:570
$n
$n
Definition: RandomTest.php:76
UtfNormal\toNFD
static toNFD( $string)
Convert a UTF-8 string to normal form D, canonical decomposition.
Definition: UtfNormal.php:138
UTF8_OVERLONG_B
const UTF8_OVERLONG_B
Definition: UtfNormalDefines.php:68
UtfNormal\$utfCanonicalDecomp
static $utfCanonicalDecomp
Definition: UtfNormal.php:62
UtfNormal\$utfCompatibilityDecomp
static $utfCompatibilityDecomp
Definition: UtfNormal.php:65
UtfNormal\$utfCanonicalComp
static $utfCanonicalComp
Definition: UtfNormal.php:61
UtfNormal\UNORM_DEFAULT
const UNORM_DEFAULT
Definition: UtfNormal.php:58
NORMALIZE_INTL
const NORMALIZE_INTL
Definition: UtfNormal.php:32
UtfNormal\toNFC
static toNFC( $string)
Convert a UTF-8 string to normal form C, canonical composition.
Definition: UtfNormal.php:120
UTF8_OVERLONG_A
const UTF8_OVERLONG_A
Definition: UtfNormalDefines.php:67
UtfNormal\UNORM_NFKC
const UNORM_NFKC
Definition: UtfNormal.php:56
$out
$out
Definition: UtfNormalGenerate.php:167
UTF8_HANGUL_TEND
const UTF8_HANGUL_TEND
Definition: UtfNormalDefines.php:59
UNICODE_HANGUL_TCOUNT
const UNICODE_HANGUL_TCOUNT
Definition: UtfNormalDefines.php:37
UtfNormal\UNORM_NFKD
const UNORM_NFKD
Definition: UtfNormal.php:54
UTF8_HANGUL_LEND
const UTF8_HANGUL_LEND
Definition: UtfNormalDefines.php:57
UtfNormal
Unicode normalization routines for working with UTF-8 strings.
Definition: UtfNormal.php:48
UTF8_HANGUL_FIRST
const UTF8_HANGUL_FIRST
Definition: UtfNormalDefines.php:50
UTF8_HANGUL_TBASE
const UTF8_HANGUL_TBASE
Definition: UtfNormalDefines.php:55
UTF8_SURROGATE_FIRST
const UTF8_SURROGATE_FIRST
Definition: UtfNormalDefines.php:61
UTF8_FFFE
const UTF8_FFFE
Definition: UtfNormalDefines.php:74
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
UtfNormal\toNFKD
static toNFKD( $string)
Convert a UTF-8 string to normal form KD, compatibility decomposition.
Definition: UtfNormal.php:176
list
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
UNICODE_HANGUL_NCOUNT
const UNICODE_HANGUL_NCOUNT
Definition: UtfNormalDefines.php:38
UtfNormal\NFD
static NFD( $string)
Definition: UtfNormal.php:471
see
Some information about database access in MediaWiki By Tim January Database layout For information about the MediaWiki database such as a description of the tables and their please see
Definition: database.txt:2
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
UtfNormal\UNORM_NFC
const UNORM_NFC
Definition: UtfNormal.php:55
UtfNormal\replaceForNativeNormalize
static replaceForNativeNormalize( $string)
Function to replace some characters that we don't want but most of the native normalize functions kee...
Definition: UtfNormal.php:768
UtfNormal\UNORM_FCD
const UNORM_FCD
Definition: UtfNormal.php:57
UTF8_HANGUL_LAST
const UTF8_HANGUL_LAST
Definition: UtfNormalDefines.php:51
UNICODE_HANGUL_FIRST
const UNICODE_HANGUL_FIRST
Definition: UtfNormalDefines.php:28
UtfNormal\fastDecompose
static fastDecompose( $string, $map)
Perform decomposition of a UTF-8 string into either D or KD form (depending on which decomposition ma...
Definition: UtfNormal.php:510
UTF8_HANGUL_VBASE
const UTF8_HANGUL_VBASE
Definition: UtfNormalDefines.php:54
UtfNormal\UNORM_NONE
const UNORM_NONE
For using the ICU wrapper.
Definition: UtfNormal.php:52
UTF8_OVERLONG_C
const UTF8_OVERLONG_C
Definition: UtfNormalDefines.php:69
UTF8_REPLACEMENT
const UTF8_REPLACEMENT
Definition: UtfNormalDefines.php:64
UtfNormal\$utfCheckNFC
static $utfCheckNFC
Definition: UtfNormal.php:67
UtfNormal\NFC
static NFC( $string)
Definition: UtfNormal.php:462
UNICODE_HANGUL_VCOUNT
const UNICODE_HANGUL_VCOUNT
Definition: UtfNormalDefines.php:36
UTF8_MAX
const UTF8_MAX
Definition: UtfNormalDefines.php:63
UtfNormal\NFKD
static NFKD( $string)
Definition: UtfNormal.php:492
in
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
are
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types are
Definition: contenthandler.txt:5
used
you don t have to do a grep find to see where the $wgReverseTitle variable is used
Definition: hooks.txt:117
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
UtfNormal\quickIsNFC
static quickIsNFC( $string)
Returns true if the string is definitely in NFC.
Definition: UtfNormal.php:203
that
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if that
Definition: deferred.txt:11
$t
$t
Definition: testCompression.php:65
UtfNormal\toNFKC
static toNFKC( $string)
Convert a UTF-8 string to normal form KC, compatibility composition.
Definition: UtfNormal.php:157
UtfNormal\loadData
static loadData()
Load the basic composition data if necessary.
Definition: UtfNormal.php:191
UtfNormal\quickIsNFCVerify
static quickIsNFCVerify(&$string)
Returns true if the string is definitely in NFC.
Definition: UtfNormal.php:243