MediaWiki  master
generateNormalizerDataAr.php
Go to the documentation of this file.
1 <?php
24 require_once __DIR__ . '/../Maintenance.php';
25 
27 
/**
 * Set up the script: register its description and the optional
 * --unicode-data-file option, which takes an argument and is not required.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Generate the normalizer data file for Arabic' );

	// Help text for the option; assembled first to keep the call below short.
	$dataFileHelp = 'The local location of the data file '
		. 'from https://unicode.org/Public/6.0.0/ucd/UnicodeData.txt';
	$this->addOption( 'unicode-data-file', $dataFileHelp, false, true );
}
47 
/**
 * Report the database access type of this script.
 *
 * Always returns the Maintenance::DB_NONE access-type constant.
 */
public function getDbType() {
	return Maintenance::DB_NONE;
}
51 
/**
 * Generate languages/data/normalize-ar.php from UnicodeData.txt.
 *
 * Reads the data file given by --unicode-data-file (or ./UnicodeData.txt when
 * the option is absent), collects every code point in the Arabic Presentation
 * Forms A/B ranges that has a decomposition mapping, and writes the resulting
 * "presentation form => decomposed sequence" map as a static PHP array.
 *
 * Terminates the script through fatalError() when the data file cannot be
 * found or opened, or when the output file cannot be written. Lines that
 * cannot be parsed are reported through error() and skipped.
 */
public function execute() {
	if ( !$this->hasOption( 'unicode-data-file' ) ) {
		// No explicit location given: fall back to the current directory.
		$dataFile = 'UnicodeData.txt';
		if ( !file_exists( $dataFile ) ) {
			$this->fatalError( "Unable to find UnicodeData.txt. Please specify " .
				"its location with --unicode-data-file=<FILE>" );
		}
	} else {
		$dataFile = $this->getOption( 'unicode-data-file' );
		if ( !file_exists( $dataFile ) ) {
			$this->fatalError( 'Unable to find the specified data file.' );
		}
	}

	$file = fopen( $dataFile, 'r' );
	if ( !$file ) {
		$this->fatalError( 'Unable to open the data file.' );
	}

	// For the file format, see https://www.unicode.org/reports/tr44/
	$fieldNames = [
		'Code',
		'Name',
		'General_Category',
		'Canonical_Combining_Class',
		'Bidi_Class',
		'Decomposition_Type_Mapping',
		'Numeric_Type_Value_6',
		'Numeric_Type_Value_7',
		'Numeric_Type_Value_8',
		'Bidi_Mirrored',
		'Unicode_1_Name',
		'ISO_Comment',
		'Simple_Uppercase_Mapping',
		'Simple_Lowercase_Mapping',
		'Simple_Titlecase_Mapping'
	];
	$fieldCount = count( $fieldNames );

	$pairs = [];
	$lineNum = 0;

	// The finally clause guarantees the handle is released even if a
	// helper called inside the loop throws.
	try {
		while ( ( $line = fgets( $file ) ) !== false ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// A truncated record would otherwise cause undefined-offset
				// notices in the mapping loop below.
				$this->error( "Unexpected number of fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = [];
			foreach ( $fieldNames as $number => $name ) {
				$data[$name] = $numberedData[$number];
			}

			// base_convert() returns a string; cast it so the range checks
			// below are plain integer comparisons.
			$code = (int)base_convert( $data['Code'], 16, 10 );
			if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ) # Arabic presentation forms B
			) {
				if ( $data['Decomposition_Type_Mapping'] === '' ) {
					// No decomposition
					continue;
				}
				// Expected shape: "<tag> XXXX XXXX ..."; $m[2] captures the
				// space-separated hex sequence after the tag.
				if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
					$data['Decomposition_Type_Mapping'], $m )
				) {
					$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
					$this->error( $line );
					continue;
				}

				$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
				$dest = UtfNormal\Utils::hexSequenceToUtf8( $m[2] );
				$pairs[$source] = $dest;
			}
		}
	} finally {
		fclose( $file );
	}

	global $IP;
	$outputFile = "$IP/languages/data/normalize-ar.php";
	$writer = new StaticArrayWriter();
	$written = file_put_contents( $outputFile, $writer->create(
		$pairs,
		'File created by generateNormalizerDataAr.php'
	) );
	if ( $written === false ) {
		// Do not report success when nothing reached the disk.
		$this->fatalError( "Unable to write $outputFile" );
	}

	echo "ar: " . count( $pairs ) . " pairs written.\n";
}
140 }
141 
// Name the maintenance class defined in this file, then include the
// RUN_MAINTENANCE_IF_MAIN bootstrap file.
$maintClass = GenerateNormalizerDataAr::class;
require_once RUN_MAINTENANCE_IF_MAIN;
Generates the normalizer data file for Arabic.
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
const DB_NONE
Constants for DB access type.
Definition: Maintenance.php:91
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
error( $err, $die=0)
Throw an error to the user.
$IP
Definition: WebStart.php:41
getOption( $name, $default=null)
Get an option, or return the default.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:86
$source
hasOption( $name)
Checks to see if a particular option exists.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
$line
Definition: mcc.php:119
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Format a static PHP array to be written to a file.