Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
groupStatistics.php
Go to the documentation of this file.
1<?php
17use MediaWiki\Languages\LanguageNameUtils;
18
// Resolve $IP, the MediaWiki installation path: the MW_INSTALL_PATH environment
// variable wins; otherwise fall back to three directories above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$dir = __DIR__;
	$IP = $dir . '/../../..';
}
require_once $IP . '/maintenance/Maintenance.php';
27
28class GroupStatistics extends Maintenance {
41 'en' => [ 1, 1500, 'multiple' ],
42 'zh-hans' => [ 2, 1300, 'asia' ],
43 'zh-hant' => [ 2, 1300, 'asia' ],
44 'hi' => [ 3, 550, 'asia' ],
45 'ar' => [ 4, 530, 'multiple' ],
46 'es' => [ 5, 500, 'multiple' ],
47 'ms' => [ 6, 300, 'asia' ],
48 'pt' => [ 7, 290, 'multiple' ],
49 'pt-br' => [ 7, 290, 'america' ],
50 'ru' => [ 8, 278, 'multiple' ],
51 'id' => [ 9, 250, 'asia' ],
52 'bn' => [ 10, 230, 'asia' ],
53 'fr' => [ 11, 200, 'multiple' ],
54 'de' => [ 12, 185, 'europe' ],
55 'ja' => [ 13, 132, 'asia' ],
56 'fa' => [ 14, 107, 'asia' ],
57 'pnb' => [ 15, 104, 'asia' ], // Most spoken variant
58 'tl' => [ 16, 90, 'asia' ],
59 'mr' => [ 17, 90, 'asia' ],
60 'vi' => [ 18, 80, 'asia' ],
61 'jv' => [ 19, 80, 'asia' ],
62 'te' => [ 20, 80, 'asia' ],
63 'ko' => [ 21, 78, 'asia' ],
64 'wuu' => [ 22, 77, 'asia' ],
65 'arz' => [ 23, 76, 'africa' ],
66 'th' => [ 24, 73, 'asia' ],
67 'yue' => [ 25, 71, 'asia' ],
68 'tr' => [ 26, 70, 'multiple' ],
69 'it' => [ 27, 70, 'europe' ],
70 'ta' => [ 28, 66, 'asia' ],
71 'ur' => [ 29, 60, 'asia' ],
72 'my' => [ 30, 52, 'asia' ],
73 'sw' => [ 31, 50, 'africa' ],
74 'nan' => [ 32, 49, 'asia' ],
75 'kn' => [ 33, 47, 'asia' ],
76 'gu' => [ 34, 46, 'asia' ],
77 'uk' => [ 35, 45, 'europe' ],
78 'pl' => [ 36, 43, 'europe' ],
79 'sd' => [ 37, 41, 'asia' ],
80 'ha' => [ 38, 39, 'africa' ],
81 'ml' => [ 39, 37, 'asia' ],
82 'gan-hans' => [ 40, 35, 'asia' ],
83 'gan-hant' => [ 40, 35, 'asia' ],
84 'hak' => [ 41, 34, 'asia' ],
85 'or' => [ 42, 31, 'asia' ],
86 'ne' => [ 43, 30, 'asia' ],
87 'ro' => [ 44, 28, 'europe' ],
88 'su' => [ 45, 27, 'asia' ],
89 'az' => [ 46, 27, 'asia' ],
90 'nl' => [ 47, 27, 'europe' ],
91 'zu' => [ 48, 26, 'africa' ],
92 'ps' => [ 49, 26, 'asia' ],
93 'ckb' => [ 50, 26, 'asia' ],
94 'ku-latn' => [ 50, 26, 'asia' ],
95 ];
107 'wikimedia' => [
108 // 'core-0-mostused' => 40,
109 'core' => 50,
110 'ext-0-wikimedia' => 50
111 ],
112 'fundraiser' => [
113 'ext-di-di' => 16,
114 'ext-di-pfpg' => 84,
115 ],
116 'mediawiki' => [
117 // 'core-0-mostused' => 30,
118 'core' => 50,
119 'ext-0-wikimedia' => 25,
120 'ext-0-all' => 25
121 ]
122 ];
129 // Codes containing a dash
130 'bat-smg' => 'bat-smg',
131 'cbk-zam' => 'cbk-zam',
132 'map-bms' => 'map-bms',
133 'nds-nl' => 'nds-nl',
134 'roa-rup' => 'roa-rup',
135 'roa-tara' => 'roa-tara',
136
137 // Remaps
138 'be-tarask' => 'be-x-old',
139 'gsw' => 'als',
140 'ike-cans' => 'iu',
141 'ike-latn' => 'iu',
142 'lzh' => 'zh-classical',
143 'nan' => 'zh-min-nan',
144 'vro' => 'fiu-vro',
145 'yue' => 'zh-yue',
146
147 // Ignored language codes. See reason.
148 'als' => '', // gsw
149 'be-x-old' => '', // be-tarask
150 'crh' => '', // crh-*
151 'de-at' => '', // de
152 'de-ch' => '', // de
153 'de-formal' => '', // de, not reporting formal form
154 'dk' => '', // da
155 'en-au' => '', // en
156 'en-ca' => '', // no MW code
157 'en-gb' => '', // no MW code
158 'es-419' => '', // no MW code
159 'fiu-vro' => '', // vro
160 'gan' => '', // gan-*
161 'got' => '', // extinct. not reporting formal form
162 'hif' => '', // hif-*
163 'hu-formal' => '', // not reporting
164 'iu' => '', // ike-*
165 'kk' => '', // kk-*
166 'kk-cn' => '', // kk-arab
167 'kk-kz' => '', // kk-cyrl
168 'kk-tr' => '', // kk-latn
169 'ko-kp' => '', // ko
170 'ku' => '', // ku-*
171 'ku-arab' => '', // ckb
172 'nb' => '', // no
173 'nl-be' => '', // no MW code
174 'nl-informal' => '', // nl, not reporting informal form
175 'ruq' => '', // ruq-*
176 'simple' => '', // en
177 'sr' => '', // sr-*
178 'tg' => '', // tg-*
179 'tp' => '', // tokipona
180 'tt' => '', // tt-*
181 'ug' => '', // ug-*
182 'zh' => '', // zh-*
183 'zh-classical' => '', // lzh
184 'zh-cn' => '', // zh
185 'zh-sg' => '', // zh
186 'zh-hk' => '', // zh
187 'zh-min-nan' => '', // nan
188 'zh-mo' => '', // zh
189 'zh-my' => '', // zh
190 'zh-tw' => '', // zh
191 'zh-yue' => '', // yue
192 ];
193
/**
 * Register the script description and every command line option, then
 * declare the dependency on the Translate extension.
 *
 * Options are registered in the order listed below. None of them is
 * required; the boolean in each entry says whether the option takes a value.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription(
		'Script to generate statistics about the localisation ' .
		'level of one or more message groups.'
	);

	// [ option name, help text, takes a value? ]
	$optionTable = [
		[ 'groups', '(optional) Comma separated list of groups', true ],
		[
			'output',
			'(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' .
				'text: Text with tabs. Default: default',
			true
		],
		[ 'skiplanguages', '(optional) Comma separated list of languages to be skipped', true ],
		[ 'skipzero', '(optional) Skip languages that do not have any localisation at all', false ],
		[
			'legenddetail',
			'(optional) Page name for legend to be transcluded at the top of the details table',
			true
		],
		[
			'legendsummary',
			'(optional) Page name for legend to be transcluded at the top of the summary table',
			true
		],
		[ 'fuzzy', '(optional) Add column for fuzzy counts', false ],
		[
			'speakers',
			'(optional) Add column for number of speakers (est.). ' .
				'Only valid when combined with "most"',
			false
		],
		[
			'nol10n',
			'(optional) Do not add localised language name if I18ntags is installed',
			false
		],
		[
			'continent',
			'(optional) Add a continent column. Only available when output is ' .
				'"wiki" or not specified.',
			false
		],
		[
			'summary',
			'(optional) Add a summary with counts and scores per continent category ' .
				'and totals. Only available for a valid "most" value.',
			true
		],
		[
			'wmfscore',
			'Only output WMF language code and weighted score for all ' .
				'language codes for weighing group "wikimedia" in CSV. This ' .
				'report must keep a stable layout as it is used/will be ' .
				'used in the Wikimedia statistics.',
			false
		],
		[
			'most',
			'(optional) "mediawiki" or "wikimedia". Report on the 50 most ' .
				'spoken languages. Skipzero is ignored. If a valid scope is ' .
				'defined, the group list and fuzzy are ignored and the ' .
				'localisation levels are weighted and reported.',
			true
		],
	];

	foreach ( $optionTable as [ $optionName, $helpText, $takesValue ] ) {
		$this->addOption( $optionName, $helpText, false, $takesValue );
	}

	$this->requireExtension( 'Translate' );
}
276
/**
 * Collect translation statistics for the selected message groups and print
 * them through the selected output engine.
 *
 * Modes, as driven by the command line options:
 *  - plain: one row per language, one column per group given in --groups
 *    (plus a fuzzy column per group when --fuzzy is set);
 *  - --most=<scope>: only languages listed in $mostSpokenLanguages are
 *    reported; when <scope> is a key of $localisedWeights the groups come
 *    from that scope and a weighted score column is added (--groups and
 *    --fuzzy are ignored);
 *  - --wmfscore: forces the "wikimedia" scope and prints only
 *    "<wiki code>;<score>;" lines, averaging the scores of all language
 *    codes that map to the same wiki code through $wikimediaCodeMap.
 *
 * Terminates through fatalError() when none of the requested groups exists.
 */
public function execute() {
	$output = $this->getOption( 'output', 'default' );

	// Select an output engine
	switch ( $output ) {
		case 'wiki':
			$out = new WikiStatsOutput();
			break;
		case 'text':
			$out = new TextStatsOutput();
			break;
		case 'csv':
			$out = new CsvStatsOutput();
			break;
		default:
			$out = new TranslateStatsOutput();
	}

	$skipLanguages = [];
	if ( $this->hasOption( 'skiplanguages' ) ) {
		$skipLanguages = array_map(
			'trim',
			explode( ',', $this->getOption( 'skiplanguages' ) )
		);
	}

	$most = $this->getOption( 'most' );
	$wmfscore = $this->hasOption( 'wmfscore' );

	// Pick the single weighting scope used for group selection, weights and
	// the total. --wmfscore always means "wikimedia", even if --most names
	// another scope; mixing two scopes would pair groups with foreign weights.
	$scoreScope = null;
	if ( $wmfscore ) {
		$scoreScope = 'wikimedia';
	} elseif ( $most && isset( $this->localisedWeights[$most] ) ) {
		$scoreScope = $most;
	}
	$reportScore = $scoreScope !== null;

	// Localised column labels only make sense for wiki-style output
	$l10n = ( $output === 'wiki' || $output === 'default' ) &&
		!$this->hasOption( 'nol10n' );

	// Fuzzy columns are ignored in score mode: the header never shows them
	// there and every row column must line up with exactly one weight.
	$showFuzzy = !$reportScore && $this->hasOption( 'fuzzy' );
	$showSpeakers = $most && $this->hasOption( 'speakers' );
	$showContinent = $this->hasOption( 'continent' );
	// The summary is bucketed by continent, which is only known for the
	// languages in $mostSpokenLanguages, i.e. when --most filters the rows.
	$summarise = $reportScore && $most && $this->hasOption( 'summary' );

	// Get groups from input
	if ( $reportScore ) {
		$reqGroups = array_keys( $this->localisedWeights[$scoreScope] );
	} else {
		$reqGroups = array_map(
			'trim',
			explode( ',', (string)$this->getOption( 'groups', '' ) )
		);
	}

	// List of all groups
	$allGroups = MessageGroups::singleton()->getGroups();

	// Keep the valid groups and, in score mode, the weight of each kept group
	// in the same order so that $weights[$i] belongs to the i-th column.
	$groups = [];
	$weights = [];
	foreach ( $reqGroups as $requestedId ) {
		if ( $requestedId === '' ) {
			// Missing --groups or a stray comma; reported below as "No groups given"
			continue;
		}

		// Page translation group ids use spaces which are not nice on command line
		$id = str_replace( '_', ' ', $requestedId );
		if ( !isset( $allGroups[$id] ) ) {
			$this->output( "Unknown group: $id\n" );
			continue;
		}

		$groups[$id] = $allGroups[$id];
		if ( $reportScore ) {
			$weights[] = $this->localisedWeights[$scoreScope][$requestedId];
		}
	}

	if ( $wmfscore ) {
		// The WMF score report is always CSV
		$out = new CsvStatsOutput();
	}

	if ( !count( $groups ) ) {
		$this->fatalError( 'No groups given' );
	}

	// List of all languages.
	$languages = Utilities::getLanguageNames( LanguageNameUtils::AUTONYMS );
	// Default sorting order by language code, users can sort wiki output.
	ksort( $languages );

	if ( $this->hasOption( 'legenddetail' ) ) {
		$out->addFreeText( '{{' . $this->getOption( 'legenddetail' ) . "}}\n" );
	}

	$totalWeight = $reportScore ? array_sum( $this->localisedWeights[$scoreScope] ) : 0;

	if ( !$wmfscore ) {
		// Output headers
		$out->heading();

		$out->blockstart();

		if ( $most ) {
			$out->element( ( $l10n ? '{{int:translate-gs-pos}}' : 'Pos.' ), true );
		}

		$out->element( ( $l10n ? '{{int:translate-gs-code}}' : 'Code' ), true );
		$out->element( ( $l10n ? '{{int:translate-page-language}}' : 'Language' ), true );
		if ( $showContinent ) {
			$out->element( ( $l10n ? '{{int:translate-gs-continent}}' : 'Continent' ), true );
		}

		if ( $showSpeakers ) {
			$out->element( ( $l10n ? '{{int:translate-gs-speakers}}' : 'Speakers' ), true );
		}

		if ( $reportScore ) {
			$out->element(
				( $l10n ? '{{int:translate-gs-score}}' : 'Score' ) . ' (' . $totalWeight . ')',
				true
			);
		}

		foreach ( $groups as $g ) {
			// Add unprocessed description of group as heading
			if ( $reportScore ) {
				$gid = $g->getId();
				$heading = $g->getLabel() . ' (' . $this->localisedWeights[$scoreScope][$gid] . ')';
			} else {
				$heading = $g->getLabel();
			}
			$out->element( $heading, true );
			if ( $showFuzzy ) {
				$out->element( ( $l10n ? '{{int:translate-percentage-fuzzy}}' : 'Fuzzy' ), true );
			}
		}

		$out->blockend();
	}

	// Decide once which languages are reported; the statistics pass and the
	// output pass below must agree on this list.
	$reportLanguages = [];
	foreach ( $languages as $code => $name ) {
		// Skip list (not applied when --most is given)
		if ( !$most && in_array( (string)$code, $skipLanguages, true ) ) {
			continue;
		}

		// Codes mapped to '' are not reported in the WMF score
		if ( $wmfscore && ( $this->wikimediaCodeMap[$code] ?? null ) === '' ) {
			continue;
		}

		// If --most is set, skip all other
		if ( $most && !isset( $this->mostSpokenLanguages[$code] ) ) {
			continue;
		}

		$reportLanguages[$code] = $name;
	}

	// $rows[code] is a list of [ invert, count, total ] columns, one per group
	// (two per group when fuzzy columns are shown).
	$rows = [];
	foreach ( $reportLanguages as $code => $name ) {
		$rows[$code] = [];
	}

	foreach ( $groups as $groupName => $g ) {
		$stats = MessageGroupStats::forGroup( $groupName );

		foreach ( $reportLanguages as $code => $name ) {
			$total = $stats[$code][MessageGroupStats::TOTAL];
			$translated = $stats[$code][MessageGroupStats::TRANSLATED];
			$fuzzy = $stats[$code][MessageGroupStats::FUZZY];

			$rows[$code][] = [ false, $translated, $total ];

			if ( $showFuzzy ) {
				$rows[$code][] = [ true, $fuzzy, $total ];
			}
		}
	}

	// continent => [ number of languages, sum of their scores ]
	$summary = [];
	// wiki code => [ 'sum' => sum of scores, 'count' => number of variants ]
	$wmfscores = [];

	foreach ( $reportLanguages as $code => $name ) {
		$columns = $rows[$code];

		$allZero = true;
		foreach ( $columns as $fields ) {
			if ( (int)$fields[1] !== 0 ) {
				$allZero = false;
				break;
			}
		}

		// Skip dummy languages if requested
		if ( $allZero && $this->hasOption( 'skipzero' ) ) {
			continue;
		}

		// Row cells are only written in table mode; --wmfscore prints its own
		// fixed "code;score;" lines at the very end.
		if ( !$wmfscore ) {
			$out->blockstart();

			// Fill language position field
			if ( $most ) {
				$out->element( $this->mostSpokenLanguages[$code][0] );
			}

			// Fill language code field
			$out->element( $code );

			// Fill language name field
			if ( $l10n && function_exists( 'efI18nTagsInit' ) ) {
				$out->element( '{{#languagename:' . $code . '}}' );
			} else {
				$out->element( $name );
			}

			// Fill continent field
			if ( $showContinent ) {
				// Without --most the language may be missing from the table;
				// keep the column but leave the cell empty in that case.
				$region = $this->mostSpokenLanguages[$code][2] ?? null;
				if ( $region === null ) {
					$continent = '';
				} elseif ( $region === 'multiple' ) {
					$continent = ( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
				} else {
					$continent = $l10n ?
						'{{int:timezoneregion-' . $region . '}}' :
						ucfirst( $region );
				}

				$out->element( $continent );
			}

			// Fill speakers field
			if ( $showSpeakers ) {
				$out->element( number_format( $this->mostSpokenLanguages[$code][1] ) );
			}
		}

		// Fill the score field
		if ( $reportScore ) {
			// Start with 0 points
			$score = 0;

			foreach ( $columns as $i => $fields ) {
				[ , $upper, $total ] = $fields;
				// A group without messages contributes nothing (and must not
				// divide by zero). Otherwise weigh the translated ratio.
				if ( $total > 0 ) {
					$score += ( $weights[$i] * $upper ) / $total;
				}
			}

			// Report a round numbers
			$score = number_format( $score, 0 );

			if ( $summarise ) {
				$continent = $this->mostSpokenLanguages[$code][2];
				if ( !isset( $summary[$continent] ) ) {
					$summary[$continent] = [ 0, 0 ];
				}
				$summary[$continent][0]++;
				$summary[$continent][1] += (int)$score;
			}

			if ( $wmfscore ) {
				// Multiple variants can be used for the same wiki. Keep the sum
				// and the count so the average is computed once, at output
				// time, without rounding the running value at every step.
				$wmfcode = $this->wikimediaCodeMap[$code] ?? explode( '-', $code, 2 )[0];

				if ( !isset( $wmfscores[$wmfcode] ) ) {
					$wmfscores[$wmfcode] = [ 'sum' => 0, 'count' => 0 ];
				}
				$wmfscores[$wmfcode]['sum'] += (int)$score;
				$wmfscores[$wmfcode]['count']++;
			} else {
				$out->element( $score );
			}
		}

		// Fill fields for groups
		if ( !$wmfscore ) {
			foreach ( $columns as $fields ) {
				[ $invert, $upper, $total ] = $fields;
				$c = $out->formatPercent( $upper, $total, $invert );
				$out->element( $c );
			}

			$out->blockend();
		}
	}

	$out->footer();

	if ( $summarise ) {
		if ( $this->hasOption( 'legendsummary' ) ) {
			$out->addFreeText( '{{' . $this->getOption( 'legendsummary' ) . "}}\n" );
		}

		$out->summaryheading();

		$out->blockstart();

		$out->element( $l10n ? '{{int:translate-gs-continent}}' : 'Continent', true );
		$out->element( $l10n ? '{{int:translate-gs-count}}' : 'Count', true );
		$out->element( $l10n ? '{{int:translate-gs-avgscore}}' : 'Avg. score', true );

		$out->blockend();

		ksort( $summary );

		$totals = [ 0, 0 ];

		foreach ( $summary as $key => $values ) {
			$out->blockstart();

			if ( $key === 'multiple' ) {
				$out->element( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
			} else {
				$out->element( $l10n ? '{{int:timezoneregion-' . $key . '}}' : ucfirst( $key ) );
			}
			$out->element( $values[0] );
			// Every bucket holds at least one language, so the count is never 0
			$out->element( number_format( $values[1] / $values[0] ) );

			$out->blockend();

			$totals[0] += $values[0];
			$totals[1] += $values[1];
		}

		$out->blockstart();
		$out->element( $l10n ? '{{int:translate-gs-total}}' : 'Total' );
		$out->element( $totals[0] );
		// No language may have been scored at all (e.g. everything skipped)
		$out->element( number_format( $totals[0] > 0 ? $totals[1] / $totals[0] : 0 ) );
		$out->blockend();

		$out->footer();
	}

	// Custom output
	if ( $wmfscore ) {
		ksort( $wmfscores );

		foreach ( $wmfscores as $wmfcode => $tally ) {
			echo $wmfcode . ';' . number_format( $tally['sum'] / $tally['count'] ) . ";\n";
		}
	}
}
673}
674
// Store this script's class name in $maintClass, then include the file named by
// the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that file is MediaWiki's maintenance bootstrap, which
// runs $maintClass when the script is invoked directly — confirm against core.
675$maintClass = GroupStatistics::class;
676require_once RUN_MAINTENANCE_IF_MAIN;
$wikimediaCodeMap
Code map to map localisation codes to Wikimedia project codes.
$mostSpokenLanguages
Array of the most spoken languages in the world.
$localisedWeights
Variable with key-value pairs with a named index and an array of key-value pairs where the key is a M...
Factory class for accessing message groups individually by id or all of them as a list.
This class aims to provide efficient mechanism for fetching translation completion stats.
Essentially random collection of helper functions, similar to GlobalFunctions.php.
Definition Utilities.php:31
Provides heading, summaryheading and free text addition for stats output in wiki format.