24require_once __DIR__ .
'/../Maintenance.php';
54 parent::__construct();
55 $this->
addOption(
'data-dir',
'A directory on the local filesystem ' .
56 'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
58 $this->
addOption(
'debug-output',
'Filename for sending debug output to',
63 $this->dataDir = $this->
getOption(
'data-dir',
'.' );
65 $allkeysPresent = file_exists(
"{$this->dataDir}/allkeys.txt" );
66 $ucdallPresent = file_exists(
"{$this->dataDir}/ucd.all.grouped.xml" );
70 $allkeysURL =
"http://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
71 $ucdallURL =
"http://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";
73 if ( !$allkeysPresent || !$ucdallPresent ) {
79 if ( !$allkeysPresent ) {
80 $error .=
"Unable to find allkeys.txt. "
81 .
"Download it and specify its location with --data-dir=<DIR>. "
84 if ( !$ucdallPresent ) {
85 $error .=
"Unable to find ucd.all.grouped.xml. "
86 .
"Download it, unzip, and specify its location with --data-dir=<DIR>. "
90 $versionKnown =
false;
94 $error .=
"As MediaWiki could not determine the version of ICU library used by your PHP's "
95 .
"intl extension it can't suggest which file version to download. "
96 .
"This can be caused by running a very old version of intl or PHP < 5.3.7. "
97 .
"If you are sure everything is all right, find out the ICU version "
98 .
"by running phpinfo(), check what is the Unicode version it is using "
99 .
"at http://site.icu-project.org/download, then try finding appropriate data file(s) at:";
100 } elseif ( version_compare( $icuVersion,
"4.0",
"<" ) ) {
102 $error .=
"You are using outdated version of ICU ($icuVersion), intended for "
103 . ( $unicodeVersion ?
"Unicode $unicodeVersion" :
"an unknown version of Unicode" )
104 .
"; this file might not be avalaible for it, and it's not supported by MediaWiki. "
105 .
" You are on your own; consider upgrading PHP's intl extension or try "
106 .
"one of the files available at:";
107 } elseif ( version_compare( $icuVersion,
"51.0",
">=" ) ) {
109 $error .=
"You are using ICU $icuVersion, released after this script was last updated. "
110 .
"Check what is the Unicode version it is using at http://site.icu-project.org/download . "
111 .
"It can't be guaranteed everything will work, but appropriate file(s) should "
112 .
"be available at:";
115 $versionKnown =
true;
116 $error .=
"You are using ICU $icuVersion, intended for "
117 . ( $unicodeVersion ?
"Unicode $unicodeVersion" :
"an unknown version of Unicode" )
118 .
". Appropriate file(s) should be available at:";
122 if ( $versionKnown && $unicodeVersion ) {
123 $allkeysURL = str_replace(
"<Unicode version>",
"$unicodeVersion.0", $allkeysURL );
124 $ucdallURL = str_replace(
"<Unicode version>",
"$unicodeVersion.0", $ucdallURL );
127 if ( !$allkeysPresent ) {
128 $error .=
"* $allkeysURL\n";
130 if ( !$ucdallPresent ) {
131 $error .=
"* $ucdallURL\n";
134 $this->
error( $error );
138 $debugOutFileName = $this->
getOption(
'debug-output' );
139 if ( $debugOutFileName ) {
140 $this->debugOutFile = fopen( $debugOutFileName,
'w' );
141 if ( !$this->debugOutFile ) {
142 $this->
error(
"Unable to open debug output file for writing" );
151 $uxr =
new UcdXmlReader(
"{$this->dataDir}/ucd.all.grouped.xml" );
152 $uxr->readChars( [ $this,
'charCallback' ] );
159 $category = substr( $data[
'gc'], 0, 1 );
160 if ( strpos(
'LNPS', $category ) ===
false
161 && $data[
'cp'] !==
'0020'
165 $cp = hexdec( $data[
'cp'] );
175 if ( $data[
'block'] ==
'Hangul Syllables' ) {
180 if ( $data[
'UIdeo'] ===
'Y' ) {
181 if ( $data[
'block'] ==
'CJK Unified Ideographs'
182 || $data[
'block'] ==
'CJK Compatibility Ideographs'
191 $a =
$base + ( $cp >> 15 );
192 $b = ( $cp & 0x7fff ) | 0x8000;
194 $this->weights[$cp] = sprintf(
".%04X.%04X", $a, $b );
196 if ( $data[
'dm'] !==
'#' ) {
197 $this->mappedChars[$cp] =
true;
200 if ( $cp % 4096 == 0 ) {
201 print "{$data['cp']}\n";
206 $file = fopen(
"{$this->dataDir}/allkeys.txt",
'r' );
208 $this->
error(
"Unable to open allkeys.txt" );
212 $outFile = fopen(
"$IP/serialized/first-letters-root.ser",
'w' );
214 $this->
error(
"Unable to open output file first-letters-root.ser" );
218 $goodTertiaryChars = [];
223 while (
false !== (
$line = fgets( $file ) ) ) {
226 if ( !preg_match(
'/^([0-9A-F]+)\s*;\s*([^#]*)/',
$line, $m ) ) {
230 $cp = hexdec( $m[1] );
231 $allWeights = trim( $m[2] );
235 if ( !isset( $this->weights[$cp] ) ) {
240 preg_match_all(
'/[*.]([0-9A-F]+)/', $weightStr, $m );
241 if ( !empty( $m[1] ) ) {
242 if ( $m[1][0] !==
'0000' ) {
243 $primary .=
'.' . $m[1][0];
245 if ( $m[1][2] !==
'0000' ) {
246 $tertiary .=
'.' . $m[1][2];
250 $this->weights[$cp] = $primary;
251 if ( $tertiary ===
'.0008'
252 || $tertiary ===
'.000E'
254 $goodTertiaryChars[$cp] =
true;
261 asort( $this->weights, SORT_STRING );
262 $prevWeight = reset( $this->weights );
264 foreach ( $this->weights as $cp => $weight ) {
265 if ( $weight !== $prevWeight ) {
266 $this->
groups[$prevWeight] = $group;
267 $prevWeight = $weight;
268 if ( isset( $this->
groups[$weight] ) ) {
269 $group = $this->
groups[$weight];
277 $this->
groups[$prevWeight] = $group;
286 foreach ( $this->
groups as $weight => $group ) {
287 if ( preg_match(
'/(\.[0-9A-F]*)\./', $weight, $m ) ) {
288 if ( isset( $this->
groups[$m[1]] ) ) {
289 unset( $this->
groups[$weight] );
294 ksort( $this->
groups, SORT_STRING );
299 $tertiaryCollator =
new Collator(
'root' );
300 $primaryCollator =
new Collator(
'root' );
301 $primaryCollator->setStrength( Collator::PRIMARY );
303 foreach ( $this->
groups as $weight => $group ) {
304 $uncomposedChars = [];
306 foreach ( $group as $cp ) {
307 if ( isset( $goodTertiaryChars[$cp] ) ) {
310 if ( !isset( $this->mappedChars[$cp] ) ) {
311 $uncomposedChars[] = $cp;
314 $x = array_intersect( $goodChars, $uncomposedChars );
316 $x = $uncomposedChars;
323 $tertiaryCollator->sort( $x );
326 $char = UtfNormal\Utils::codepointToUtf8( $cp );
327 $headerChars[] = $char;
328 if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
338 if ( $this->debugOutFile ) {
339 fwrite( $this->debugOutFile, sprintf(
"%05X %s %s (%s)\n", $cp, $weight, $char,
340 implode(
' ', array_map(
'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
344 print "Out of order: $numOutOfOrder / " . count( $headerChars ) .
"\n";
346 fwrite( $outFile,
serialize( $headerChars ) );
364 $this->currentBlock = reset( $this->blocks );
368 while (
$xml->name !==
'repertoire' &&
$xml->next() );
370 while (
$xml->read() ) {
371 if (
$xml->nodeType == XMLReader::ELEMENT ) {
372 if (
$xml->name ===
'group' ) {
374 } elseif (
$xml->name ===
'char' ) {
377 } elseif (
$xml->nodeType === XMLReader::END_ELEMENT ) {
378 if (
$xml->name ===
'group' ) {
379 $this->groupAttrs = [];
387 $this->xml =
new XMLReader;
388 $this->xml->open( $this->fileName );
390 throw new MWException( __METHOD__ .
": unable to open {$this->fileName}" );
392 while ( $this->xml->name !==
'ucd' && $this->xml->read() );
405 while ( $this->xml->moveToNextAttribute() ) {
406 $attrs[$this->xml->name] = $this->xml->value;
414 if ( isset( $attrs[
'cp'] ) ) {
415 $first =
$last = hexdec( $attrs[
'cp'] );
417 $first = hexdec( $attrs[
'first-cp'] );
418 $last = hexdec( $attrs[
'last-cp'] );
419 unset( $attrs[
'first-cp'] );
420 unset( $attrs[
'last-cp'] );
423 for ( $cp = $first; $cp <=
$last; $cp++ ) {
424 $hexCp = sprintf(
"%04X", $cp );
425 foreach ( [
'na',
'na1' ] as $nameProp ) {
426 if ( isset( $attrs[$nameProp] ) ) {
427 $attrs[$nameProp] = str_replace(
'#', $hexCp, $attrs[$nameProp] );
431 while ( $this->currentBlock ) {
432 if ( $cp < $this->currentBlock[0] ) {
434 } elseif ( $cp <= $this->currentBlock[1] ) {
435 $attrs[
'block'] =
key( $this->blocks );
438 $this->currentBlock = next( $this->blocks );
442 $attrs[
'cp'] = $hexCp;
443 call_user_func( $this->callback, $attrs );
448 if ( $this->blocks ) {
453 while (
$xml->name !==
'blocks' &&
$xml->read() );
455 while (
$xml->read() ) {
456 if (
$xml->nodeType == XMLReader::ELEMENT ) {
457 if (
$xml->name ===
'block' ) {
459 $first = hexdec( $attrs[
'first-cp'] );
460 $last = hexdec( $attrs[
'last-cp'] );
461 $this->blocks[$attrs[
'name']] = [ $first,
$last ];
Generate first letter data files for Collation.php.
__construct()
Default constructor.
$weights
The primary weights, indexed by codepoint.
execute()
Do the actual work.
$dataDir
The directory with source data files in it.
$mappedChars
A hashtable keyed by codepoint, where presence indicates that a character has a decomposition mapping...
const NORMAL_UPPERCASE
Important tertiary weights from UTS #10 section 7.2.
static getICUVersion()
Return the version of ICU library used by PHP's intl extension, or false when the extension is not in...
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
static getUnicodeVersionForICU()
Return the version of Unicode appropriate for the version of ICU library currently in use,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
readAttributes()
Read the attributes of the current element node and return them as an array.
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database key
do that in ParserLimitReportFormat instead use this to modify the parameters of the image all existing parser cache entries will be invalid To avoid you'll need to handle that somehow (e.g. with the RejectParserCacheValue hook) because MediaWiki won't do it for you. & $defaults error
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist Do not use this to implement individual filters if they are compatible with the ChangesListFilter and ChangesListFilterGroup structure use sub classes of those in conjunction with the ChangesListSpecialPageStructuredFilters hook This hook can be used to implement filters that do not implement that or custom behavior that is not an individual filter e.g. Watchlist and Watchlist you will want to construct new ChangesListBooleanFilter or ChangesListStringOptionsFilter objects When constructing you specify which group they belong to You can reuse existing groups (accessed through $special->getFilterGroup)
require_once RUN_MAINTENANCE_IF_MAIN