MediaWiki master
languageNameIndexer.php
Go to the documentation of this file.
1<?php
19// @codeCoverageIgnoreStart
20require_once __DIR__ . '/Maintenance.php';
21// @codeCoverageIgnoreEnd
22
23use MediaWiki\Extension\CLDR\LanguageNames;
30
/**
 * Set the script description and declare the extensions this script needs.
 *
 * Each extension is only registered as a requirement when the extension
 * registry does not already report it as loaded.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Script to create language names index.' );

	$registry = ExtensionRegistry::getInstance();
	foreach ( [ 'cldr', 'UniversalLanguageSelector' ] as $extensionName ) {
		if ( $registry->isLoaded( $extensionName ) ) {
			continue;
		}
		$this->requireExtension( $extensionName );
	}
}
44
/**
 * Build the language name search index and write it to disk.
 *
 * Steps:
 *  - collect autonyms from the language data shipped with UniversalLanguageSelector;
 *  - collect names from LanguageNames::getNames() for every language code known to MediaWiki;
 *  - split every name into words and file each word into a bucket chosen by
 *    LanguageNameSearch::getIndex(), as a 'prefix' entry (first word) or an
 *    'infix' entry (later words of at least three characters);
 *  - add hand-maintained aliases for languages that are otherwise hard to find;
 *  - sort everything, print bucket statistics and hand the result to generateFile().
 *
 * Terminates via fatalError() when $wgExtraLanguageNames is not empty or
 * when no index entries could be produced.
 */
public function execute() {
	// Avoid local configuration leaking to this script
	if ( $this->getConfig()->get( MainConfigNames::ExtraLanguageNames ) !== [] ) {
		$this->fatalError( 'You have entries in $wgExtraLanguageNames. Needs to be empty for this script.' );
	}

	$languageNames = [];
	// Add languages from language-data
	$ulsLanguages = $this->getLanguageData()[ 'languages' ];
	foreach ( $ulsLanguages as $languageCode => $languageEntry ) {
		// Redirects have only one item
		if ( isset( $languageEntry[ 2 ] ) ) {
			$languageNames[ 'autonyms' ][ $languageCode ] = $languageEntry[ 2 ];
		}
	}

	// Languages and their names in different languages from Names.php and the cldr extension
	// This comes after $ulsLanguages so that for example the als/gsw mixup is using the code
	// used in the Wikimedia world.
	$mwLanguages = $this->getServiceContainer()->getLanguageNameUtils()
		->getLanguageNames( LanguageNameUtils::AUTONYMS, LanguageNameUtils::ALL );
	foreach ( array_keys( $mwLanguages ) as $languageCode ) {
		// This method is in the CLDR extension
		// @phan-suppress-next-line PhanUndeclaredClassMethod
		$languageNames[ $languageCode ] = LanguageNames::getNames( $languageCode, 0, 2 );
	}

	$buckets = [];
	foreach ( $languageNames as $translations ) {
		foreach ( $translations as $targetLanguage => $translation ) {
			$translation = mb_strtolower( $translation );
			$translation = trim( $translation );

			// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
			// The original name is still shown, but avoid us creating entries such as
			// "(dialekti" or "alpeve)".
			$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
			// preg_split() returns false on failure (e.g. malformed UTF-8); treat that as "no words".
			$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY ) ?: [];

			foreach ( $words as $index => $word ) {
				$bucket = LanguageNameSearch::getIndex( $word );

				$type = 'prefix';
				$display = $translation;
				if ( $index > 0 ) {
					// Avoid creating infix entries for short strings like punctuation, articles, prepositions...
					if ( mb_strlen( $word ) < 3 ) {
						continue;
					}

					$type = 'infix';
					$display = "$word — $translation";
				}
				$buckets[$bucket][$type][$display] = $targetLanguage;
			}
		}
	}

	// Some languages don't have a conveniently searchable name in CLDR.
	// For example, the name of Western Punjabi doesn't start with
	// the string "punjabi" in any language, so it cannot be found
	// by people who search in English.
	// To resolve this, some languages are added here locally.
	$specialLanguages = [
		// Abron / Brong / Bono (T369464)
		'abr' => [ 'bono', 'brong' ],
		// Acholi (T376060)
		'ach' => [ 'leb acoli' ],
		// Hadhrami Arabic (T397355)
		'ayh' => [ 'حضرمية' ],
		// Catalan, sometimes searched as "Valencià"
		'ca' => [ 'valencia' ],
		// Compatibility with the old name and other Chinese varieties
		'cdo' => [ 'chinese min dong' ],
		// Alternate names for Anufo in linguistic literature
		'cko' => [ 'chakosi', 'chokosi', 'tchokossi' ],
		// Dolgan (T395396)
		'dlg' => [ 'һака' ],
		// Older name, see T375891
		'dtp' => [ 'bundu-liwan, dusun' ],
		// Spanish, the transliteration of the autonym is often used for searching
		'es' => [ 'castellano' ],
		// Armenian, the transliteration of the autonym is often used for searching
		'hy' => [ 'hayeren' ],
		// Japanese, the transliteration of the autonym is often used for searching
		'ja' => [ 'nihongo', 'にほんご' ],
		// Javanese (T393746)
		'jv-java' => [ 'jawa hanacaraka' ],
		// Georgian, the transliteration of the autonym is often used for searching
		'ka' => [ 'kartuli', 'qartuli' ],
		// Lango (Uganda; T376054).
		// The second alias helps avoid ambiguity with
		// other languages named "Lango" and also
		// with "Langi"
		'laj' => [ 'leb lango', 'lango, leb' ],
		// Chiluvale (T368856)
		'lue' => [ 'luvale, chi-' ],
		// Shan (T377856)
		'shn' => [ 'ၽႃႇသႃႇတႆး', 'လိၵ်ႈတႆး' ],
		// Tigrinya: variant names in Hebrew,
		// to ensure they can be found in different spellings
		'ti' => [
			'טגריניה',
			'טגרינית',
			'טיגריניה',
			'טיגרינית',
			'תגריניה',
			'תגרינית',
			'תיגריניה',
		],
		// Tigre: variant names in Hebrew,
		// to ensure they can be found in different spellings
		'tig' => [
			'טגרה',
			'טגרית',
			'טיגרה',
			'תגרה',
			'תגרית',
			'תיגרה',
			'תיגרית',
		],
		// Mon, renamed in core MediaWiki's Names.php (T352776)
		'mnw' => [ 'ဘာသာ မန်' ],
		// Palembang, also known as "Musi".
		// Writing this as two words ensures that it has a unique key,
		// so that Moore (mos), which is known as "musi" in one of the languages,
		// can also be found
		'mui' => [ 'musi palembang' ],
		// Western Punjabi, doesn't start with the word "Punjabi" in any language
		'pnb' => [ 'punjabi western' ],
		// Tai Nuea (T367377)
		'tdd' => [ 'ᥖᥭᥰᥖᥬᥳᥑᥨᥒᥰ' ],
		// Waale (T368046) - support alternate spellings of the name
		'wlx' => [ 'waali', 'waalii' ],
		// Simplified and Traditional Chinese, because zh-hans and zh-hant
		// are not mapped to any English name
		'zh-hans' => [ 'chinese simplified' ],
		'zh-hant' => [ 'chinese traditional' ],
		// Compatibility with the old name and other Chinese varieties
		'zh-min-nan' => [ 'chinese min nan' ],
	];

	foreach ( $specialLanguages as $targetLanguage => $translations ) {
		foreach ( $translations as $translation ) {
			$bucket = LanguageNameSearch::getIndex( $translation );
			$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
		}
	}

	// min(), max() and the average below cannot work on an empty list, and an
	// empty index would be useless anyway.
	if ( $buckets === [] ) {
		$this->fatalError( 'No language names were collected. Refusing to generate an empty index.' );
	}

	$lengths = [];
	// Sorting the bucket contents gives two benefits:
	// - more consistent output across environments
	// - shortest matches appear first, especially exact matches
	// Sort buckets by index
	ksort( $buckets );
	foreach ( $buckets as &$bucketTypes ) {
		$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
		// Ensure 'prefix' is before 'infix';
		krsort( $bucketTypes );
		// Ensure each bucket has entries sorted
		foreach ( $bucketTypes as &$entries ) {
			ksort( $entries );
		}
		// Break the reference so that later writes cannot modify the last element by accident.
		unset( $entries );
	}
	unset( $bucketTypes );

	$count = count( $buckets );
	$min = min( $lengths );
	$max = max( $lengths );
	// A median is only meaningful on sorted data; $lengths is in bucket-key order.
	$sortedLengths = $lengths;
	sort( $sortedLengths );
	$middle = intdiv( $count, 2 );
	$median = $count % 2 === 1
		? $sortedLengths[$middle]
		: ( $sortedLengths[$middle - 1] + $sortedLengths[$middle] ) / 2;
	$avg = array_sum( $lengths ) / $count;
	$this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" );
	$this->output( " - largest has $max entries\n - median size is $median entries\n" );
	$this->output( " - average size is $avg entries\n" );

	$this->generateFile( $buckets );
}
221
/**
 * Load the language data shipped with the UniversalLanguageSelector extension.
 *
 * The data is stored in a JavaScript file, so the value assigned to
 * `$.uls.data` is cut out with a regular expression and decoded as JSON.
 *
 * @return array Decoded data, guaranteed to contain an array under the 'languages' key
 * @throws LogicException If the file cannot be read, the assignment cannot be
 *  located, the JSON cannot be decoded, or the 'languages' list is missing
 */
private function getLanguageData(): array {
	$file = __DIR__ . '/../extensions/UniversalLanguageSelector/lib/jquery.uls/src/jquery.uls.data.js';
	// Check readability up front so that a missing file is reported as such,
	// instead of surfacing later as a misleading "syntax error".
	$contents = is_readable( $file ) ? file_get_contents( $file ) : false;
	if ( $contents === false ) {
		throw new LogicException( "Unable to read $file" );
	}
	if ( !preg_match( '/.*\$\.uls\.data\s*=\s*(.*?)\s*}\s*\(\s*jQuery\s*\)/s', $contents, $matches ) ) {
		throw new LogicException( 'Syntax error in jquery.uls.data.js?' );
	}
	$json = $matches[ 1 ];
	$data = json_decode( $json, true );
	if ( !$data ) {
		throw new LogicException( 'json_decode failed. Syntax error in jquery.uls.data.js?' );
	}
	// The caller indexes the result with 'languages' and iterates over it.
	if ( !is_array( $data ) || !isset( $data[ 'languages' ] ) || !is_array( $data[ 'languages' ] ) ) {
		throw new LogicException( 'No "languages" list found in jquery.uls.data.js' );
	}
	return $data;
}
238
/**
 * Write the index to languages/data/LanguageNameSearchData.json.
 *
 * The buckets are wrapped together with two metadata keys that mark the
 * file as generated. The script is terminated via fatalError() when the
 * data cannot be encoded or the file cannot be written, so that a failed
 * run never leaves a truncated data file behind unnoticed.
 *
 * @param array $buckets Search index as built by execute()
 */
private function generateFile( array $buckets ): void {
	// Add metadata to indicate this is a generated file
	$data = [
		'_comment' => 'This file is generated by a script!',
		'_generator' => 'maintenance/languageNameIndexer.php',
		'buckets' => $buckets
	];
	$json = FormatJson::encode( $data, "\t", FormatJson::ALL_OK );
	if ( $json === false ) {
		// Without this check, false . "\n" would overwrite the file with a single newline.
		$this->fatalError( 'Failed to encode the language name index as JSON.' );
	}

	$file = __DIR__ . '/../languages/data/LanguageNameSearchData.json';
	if ( file_put_contents( $file, $json . "\n" ) === false ) {
		$this->fatalError( "Failed to write $file" );
	}
}
252}
253
254// @codeCoverageIgnoreStart
// Name the class defined in this file, then include the file named by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably that file runs $maintClass only when this script is the entry point - confirm.
255$maintClass = LanguageNameIndexer::class;
256require_once RUN_MAINTENANCE_IF_MAIN;
257// @codeCoverageIgnoreEnd
execute()
Do the actual work.
__construct()
Default constructor.
JSON formatter wrapper class.
Cross-Language Language name search.
A service that provides utilities to do with language names and codes.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
output( $out, $channel=null)
Throw some output to the user.
requireExtension( $name)
Indicate that the specified extension must be loaded before the script can run.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Load JSON files, and uses a Processor to extract information.