24require_once __DIR__ .
'/../Maintenance.php';
54 parent::__construct();
55 $this->
addOption(
'data-dir',
'A directory on the local filesystem ' .
56 'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
58 $this->
addOption(
'debug-output',
'Filename for sending debug output to',
63 $this->dataDir = $this->
getOption(
'data-dir',
'.' );
65 $allkeysPresent = file_exists(
"{$this->dataDir}/allkeys.txt" );
66 $ucdallPresent = file_exists(
"{$this->dataDir}/ucd.all.grouped.xml" );
70 $allkeysURL =
"http://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
71 $ucdallURL =
"http://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";
73 if ( !$allkeysPresent || !$ucdallPresent ) {
79 if ( !$allkeysPresent ) {
80 $error .=
"Unable to find allkeys.txt. "
81 .
"Download it and specify its location with --data-dir=<DIR>. "
84 if ( !$ucdallPresent ) {
85 $error .=
"Unable to find ucd.all.grouped.xml. "
86 .
"Download it, unzip, and specify its location with --data-dir=<DIR>. "
90 $versionKnown =
false;
94 $error .=
"As MediaWiki could not determine the version of ICU library used by your PHP's "
95 .
"intl extension it can't suggest which file version to download. "
96 .
"This can be caused by running a very old version of intl or PHP < 5.3.7. "
97 .
"If you are sure everything is all right, find out the ICU version "
98 .
"by running phpinfo(), check what is the Unicode version it is using "
99 .
"at http://site.icu-project.org/download, then try finding appropriate data file(s) at:";
100 } elseif ( version_compare( $icuVersion,
"4.0",
"<" ) ) {
102 $error .=
"You are using outdated version of ICU ($icuVersion), intended for "
103 . ( $unicodeVersion ?
"Unicode $unicodeVersion" :
"an unknown version of Unicode" )
104 .
"; this file might not be avalaible for it, and it's not supported by MediaWiki. "
105 .
" You are on your own; consider upgrading PHP's intl extension or try "
106 .
"one of the files available at:";
107 } elseif ( version_compare( $icuVersion,
"51.0",
">=" ) ) {
109 $error .=
"You are using ICU $icuVersion, released after this script was last updated. "
110 .
"Check what is the Unicode version it is using at http://site.icu-project.org/download . "
111 .
"It can't be guaranteed everything will work, but appropriate file(s) should "
112 .
"be available at:";
115 $versionKnown =
true;
116 $error .=
"You are using ICU $icuVersion, intended for "
117 . ( $unicodeVersion ?
"Unicode $unicodeVersion" :
"an unknown version of Unicode" )
118 .
". Appropriate file(s) should be available at:";
122 if ( $versionKnown && $unicodeVersion ) {
123 $allkeysURL = str_replace(
"<Unicode version>",
"$unicodeVersion.0", $allkeysURL );
124 $ucdallURL = str_replace(
"<Unicode version>",
"$unicodeVersion.0", $ucdallURL );
127 if ( !$allkeysPresent ) {
128 $error .=
"* $allkeysURL\n";
130 if ( !$ucdallPresent ) {
131 $error .=
"* $ucdallURL\n";
137 $debugOutFileName = $this->
getOption(
'debug-output' );
138 if ( $debugOutFileName ) {
139 $this->debugOutFile = fopen( $debugOutFileName,
'w' );
140 if ( !$this->debugOutFile ) {
141 $this->
fatalError(
"Unable to open debug output file for writing" );
149 $uxr =
new UcdXmlReader(
"{$this->dataDir}/ucd.all.grouped.xml" );
150 $uxr->readChars( [ $this,
'charCallback' ] );
157 $category = substr( $data[
'gc'], 0, 1 );
158 if ( strpos(
'LNPS', $category ) ===
false
159 && $data[
'cp'] !==
'0020'
163 $cp = hexdec( $data[
'cp'] );
173 if ( $data[
'block'] ==
'Hangul Syllables' ) {
178 if ( $data[
'UIdeo'] ===
'Y' ) {
179 if ( $data[
'block'] ==
'CJK Unified Ideographs'
180 || $data[
'block'] ==
'CJK Compatibility Ideographs'
189 $a =
$base + ( $cp >> 15 );
190 $b = ( $cp & 0x7fff ) | 0x8000;
192 $this->weights[$cp] = sprintf(
".%04X.%04X", $a, $b );
194 if ( $data[
'dm'] !==
'#' ) {
195 $this->mappedChars[$cp] =
true;
198 if ( $cp % 4096 == 0 ) {
199 print "{$data['cp']}\n";
204 $file = fopen(
"{$this->dataDir}/allkeys.txt",
'r' );
206 $this->
fatalError(
"Unable to open allkeys.txt" );
209 $outFile = fopen(
"$IP/serialized/first-letters-root.ser",
'w' );
211 $this->
fatalError(
"Unable to open output file first-letters-root.ser" );
214 $goodTertiaryChars = [];
219 while (
false !== (
$line = fgets( $file ) ) ) {
222 if ( !preg_match(
'/^([0-9A-F]+)\s*;\s*([^#]*)/',
$line, $m ) ) {
226 $cp = hexdec( $m[1] );
227 $allWeights = trim( $m[2] );
231 if ( !isset( $this->weights[$cp] ) ) {
236 preg_match_all(
'/[*.]([0-9A-F]+)/', $weightStr, $m );
237 if ( !empty( $m[1] ) ) {
238 if ( $m[1][0] !==
'0000' ) {
239 $primary .=
'.' . $m[1][0];
241 if ( $m[1][2] !==
'0000' ) {
242 $tertiary .=
'.' . $m[1][2];
246 $this->weights[$cp] = $primary;
247 if ( $tertiary ===
'.0008'
248 || $tertiary ===
'.000E'
250 $goodTertiaryChars[$cp] =
true;
257 asort( $this->weights, SORT_STRING );
258 $prevWeight = reset( $this->weights );
260 foreach ( $this->weights as $cp => $weight ) {
261 if ( $weight !== $prevWeight ) {
262 $this->
groups[$prevWeight] = $group;
263 $prevWeight = $weight;
264 if ( isset( $this->
groups[$weight] ) ) {
265 $group = $this->
groups[$weight];
273 $this->
groups[$prevWeight] = $group;
282 foreach ( $this->
groups as $weight => $group ) {
283 if ( preg_match(
'/(\.[0-9A-F]*)\./', $weight, $m ) ) {
284 if ( isset( $this->
groups[$m[1]] ) ) {
285 unset( $this->
groups[$weight] );
290 ksort( $this->
groups, SORT_STRING );
295 $tertiaryCollator =
new Collator(
'root' );
296 $primaryCollator =
new Collator(
'root' );
297 $primaryCollator->setStrength( Collator::PRIMARY );
299 foreach ( $this->
groups as $weight => $group ) {
300 $uncomposedChars = [];
302 foreach ( $group as $cp ) {
303 if ( isset( $goodTertiaryChars[$cp] ) ) {
306 if ( !isset( $this->mappedChars[$cp] ) ) {
307 $uncomposedChars[] = $cp;
310 $x = array_intersect( $goodChars, $uncomposedChars );
312 $x = $uncomposedChars;
319 $tertiaryCollator->sort( $x );
322 $char = UtfNormal\Utils::codepointToUtf8( $cp );
323 $headerChars[] = $char;
324 if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
329 if ( $this->debugOutFile ) {
330 fwrite( $this->debugOutFile, sprintf(
"%05X %s %s (%s)\n", $cp, $weight, $char,
331 implode(
' ', array_map(
'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
335 print "Out of order: $numOutOfOrder / " . count( $headerChars ) .
"\n";
337 fwrite( $outFile,
serialize( $headerChars ) );
355 $this->currentBlock = reset( $this->blocks );
359 while (
$xml->name !==
'repertoire' &&
$xml->next() );
361 while (
$xml->read() ) {
362 if (
$xml->nodeType == XMLReader::ELEMENT ) {
363 if (
$xml->name ===
'group' ) {
365 } elseif (
$xml->name ===
'char' ) {
368 } elseif (
$xml->nodeType === XMLReader::END_ELEMENT ) {
369 if (
$xml->name ===
'group' ) {
370 $this->groupAttrs = [];
378 $this->xml =
new XMLReader;
379 $this->xml->open( $this->fileName );
381 throw new MWException( __METHOD__ .
": unable to open {$this->fileName}" );
383 while ( $this->xml->name !==
'ucd' && $this->xml->read() );
396 while ( $this->xml->moveToNextAttribute() ) {
397 $attrs[$this->xml->name] = $this->xml->value;
405 if ( isset( $attrs[
'cp'] ) ) {
406 $first =
$last = hexdec( $attrs[
'cp'] );
408 $first = hexdec( $attrs[
'first-cp'] );
409 $last = hexdec( $attrs[
'last-cp'] );
410 unset( $attrs[
'first-cp'] );
411 unset( $attrs[
'last-cp'] );
414 for ( $cp = $first; $cp <=
$last; $cp++ ) {
415 $hexCp = sprintf(
"%04X", $cp );
416 foreach ( [
'na',
'na1' ] as $nameProp ) {
417 if ( isset( $attrs[$nameProp] ) ) {
418 $attrs[$nameProp] = str_replace(
'#', $hexCp, $attrs[$nameProp] );
422 while ( $this->currentBlock ) {
423 if ( $cp < $this->currentBlock[0] ) {
425 } elseif ( $cp <= $this->currentBlock[1] ) {
426 $attrs[
'block'] =
key( $this->blocks );
429 $this->currentBlock = next( $this->blocks );
433 $attrs[
'cp'] = $hexCp;
434 call_user_func( $this->callback, $attrs );
439 if ( $this->blocks ) {
444 while (
$xml->name !==
'blocks' &&
$xml->read() );
446 while (
$xml->read() ) {
447 if (
$xml->nodeType == XMLReader::ELEMENT ) {
448 if (
$xml->name ===
'block' ) {
450 $first = hexdec( $attrs[
'first-cp'] );
451 $last = hexdec( $attrs[
'last-cp'] );
452 $this->blocks[$attrs[
'name']] = [ $first,
$last ];
Generate first letter data files for Collation.php.
__construct()
Default constructor.
$weights
The primary weights, indexed by codepoint.
execute()
Do the actual work.
$dataDir
The directory with source data files in it.
$mappedChars
A hashtable keyed by codepoint, where presence indicates that a character has a decomposition mapping...
const NORMAL_UPPERCASE
Important tertiary weights from UTS #10 section 7.2.
static getICUVersion()
Return the version of ICU library used by PHP's intl extension, or false when the extension is not in...
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
static getUnicodeVersionForICU()
Return the version of Unicode appropriate for the version of ICU library currently in use,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
readAttributes()
Read the attributes of the current element node and return them as an array.
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database key
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist Do not use this to implement individual filters if they are compatible with the ChangesListFilter and ChangesListFilterGroup structure use sub classes of those in conjunction with the ChangesListSpecialPageStructuredFilters hook This hook can be used to implement filters that do not implement that or custom behavior that is not an individual filter e g Watchlist and Watchlist you will want to construct new ChangesListBooleanFilter or ChangesListStringOptionsFilter objects When constructing you specify which group they belong to You can reuse existing groups(accessed through $special->getFilterGroup)
while(( $__line=Maintenance::readconsole()) !==false) print
require_once RUN_MAINTENANCE_IF_MAIN