Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
groupStatistics.php
Go to the documentation of this file.
1<?php
16use MediaWiki\Languages\LanguageNameUtils;
17
// Locate the MediaWiki installation root ($IP): prefer the MW_INSTALL_PATH
// environment variable, otherwise fall back to three directories above this file.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$dir = __DIR__;
	$IP = "$dir/../../..";
}
// Load the base class for maintenance scripts from the resolved installation.
require_once "$IP/maintenance/Maintenance.php";
26
27class GroupStatistics extends Maintenance {
40 'en' => [ 1, 1500, 'multiple' ],
41 'zh-hans' => [ 2, 1300, 'asia' ],
42 'zh-hant' => [ 2, 1300, 'asia' ],
43 'hi' => [ 3, 550, 'asia' ],
44 'ar' => [ 4, 530, 'multiple' ],
45 'es' => [ 5, 500, 'multiple' ],
46 'ms' => [ 6, 300, 'asia' ],
47 'pt' => [ 7, 290, 'multiple' ],
48 'pt-br' => [ 7, 290, 'america' ],
49 'ru' => [ 8, 278, 'multiple' ],
50 'id' => [ 9, 250, 'asia' ],
51 'bn' => [ 10, 230, 'asia' ],
52 'fr' => [ 11, 200, 'multiple' ],
53 'de' => [ 12, 185, 'europe' ],
54 'ja' => [ 13, 132, 'asia' ],
55 'fa' => [ 14, 107, 'asia' ],
56 'pnb' => [ 15, 104, 'asia' ], // Most spoken variant
57 'tl' => [ 16, 90, 'asia' ],
58 'mr' => [ 17, 90, 'asia' ],
59 'vi' => [ 18, 80, 'asia' ],
60 'jv' => [ 19, 80, 'asia' ],
61 'te' => [ 20, 80, 'asia' ],
62 'ko' => [ 21, 78, 'asia' ],
63 'wuu' => [ 22, 77, 'asia' ],
64 'arz' => [ 23, 76, 'africa' ],
65 'th' => [ 24, 73, 'asia' ],
66 'yue' => [ 25, 71, 'asia' ],
67 'tr' => [ 26, 70, 'multiple' ],
68 'it' => [ 27, 70, 'europe' ],
69 'ta' => [ 28, 66, 'asia' ],
70 'ur' => [ 29, 60, 'asia' ],
71 'my' => [ 30, 52, 'asia' ],
72 'sw' => [ 31, 50, 'africa' ],
73 'nan' => [ 32, 49, 'asia' ],
74 'kn' => [ 33, 47, 'asia' ],
75 'gu' => [ 34, 46, 'asia' ],
76 'uk' => [ 35, 45, 'europe' ],
77 'pl' => [ 36, 43, 'europe' ],
78 'sd' => [ 37, 41, 'asia' ],
79 'ha' => [ 38, 39, 'africa' ],
80 'ml' => [ 39, 37, 'asia' ],
81 'gan-hans' => [ 40, 35, 'asia' ],
82 'gan-hant' => [ 40, 35, 'asia' ],
83 'hak' => [ 41, 34, 'asia' ],
84 'or' => [ 42, 31, 'asia' ],
85 'ne' => [ 43, 30, 'asia' ],
86 'ro' => [ 44, 28, 'europe' ],
87 'su' => [ 45, 27, 'asia' ],
88 'az' => [ 46, 27, 'asia' ],
89 'nl' => [ 47, 27, 'europe' ],
90 'zu' => [ 48, 26, 'africa' ],
91 'ps' => [ 49, 26, 'asia' ],
92 'ckb' => [ 50, 26, 'asia' ],
93 'ku-latn' => [ 50, 26, 'asia' ],
94 ];
106 'wikimedia' => [
107 // 'core-0-mostused' => 40,
108 'core' => 50,
109 'ext-0-wikimedia' => 50
110 ],
111 'fundraiser' => [
112 'ext-di-di' => 16,
113 'ext-di-pfpg' => 84,
114 ],
115 'mediawiki' => [
116 // 'core-0-mostused' => 30,
117 'core' => 50,
118 'ext-0-wikimedia' => 25,
119 'ext-0-all' => 25
120 ]
121 ];
128 // Codes containing a dash
129 'bat-smg' => 'bat-smg',
130 'cbk-zam' => 'cbk-zam',
131 'map-bms' => 'map-bms',
132 'nds-nl' => 'nds-nl',
133 'roa-rup' => 'roa-rup',
134 'roa-tara' => 'roa-tara',
135
136 // Remaps
137 'be-tarask' => 'be-x-old',
138 'gsw' => 'als',
139 'ike-cans' => 'iu',
140 'ike-latn' => 'iu',
141 'lzh' => 'zh-classical',
142 'nan' => 'zh-min-nan',
143 'vro' => 'fiu-vro',
144 'yue' => 'zh-yue',
145
146 // Ignored language codes. See reason.
147 'als' => '', // gsw
148 'be-x-old' => '', // be-tarask
149 'crh' => '', // crh-*
150 'de-at' => '', // de
151 'de-ch' => '', // de
152 'de-formal' => '', // de, not reporting formal form
153 'dk' => '', // da
154 'en-au' => '', // en
155 'en-ca' => '', // no MW code
156 'en-gb' => '', // no MW code
157 'es-419' => '', // no MW code
158 'fiu-vro' => '', // vro
159 'gan' => '', // gan-*
160 'got' => '', // extinct. not reporting formal form
161 'hif' => '', // hif-*
162 'hu-formal' => '', // not reporting
163 'iu' => '', // ike-*
164 'kk' => '', // kk-*
165 'kk-cn' => '', // kk-arab
166 'kk-kz' => '', // kk-cyrl
167 'kk-tr' => '', // kk-latn
168 'ko-kp' => '', // ko
169 'ku' => '', // ku-*
170 'ku-arab' => '', // ckb
171 'nb' => '', // no
172 'nl-be' => '', // no MW code
173 'nl-informal' => '', // nl, not reporting informal form
174 'ruq' => '', // ruq-*
175 'simple' => '', // en
176 'sr' => '', // sr-*
177 'tg' => '', // tg-*
178 'tp' => '', // tokipona
179 'tt' => '', // tt-*
180 'ug' => '', // ug-*
181 'zh' => '', // zh-*
182 'zh-classical' => '', // lzh
183 'zh-cn' => '', // zh
184 'zh-sg' => '', // zh
185 'zh-hk' => '', // zh
186 'zh-min-nan' => '', // nan
187 'zh-mo' => '', // zh
188 'zh-my' => '', // zh
189 'zh-tw' => '', // zh
190 'zh-yue' => '', // yue
191 ];
192
/**
 * Register the script description, every supported command line option
 * and the dependency on the Translate extension.
 *
 * NOTE(review): "summary" is registered as taking a value, but execute()
 * only checks for its presence via hasOption() — confirm this is intended.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription(
		'Script to generate statistics about the localisation level of one or more message groups.'
	);

	// Option name => [ help text, whether the option takes a value ].
	// No option is required; the registration order below is preserved.
	$optionSpecs = [
		'groups' => [
			'(optional) Comma separated list of groups',
			true,
		],
		'output' => [
			'(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' .
				'text: Text with tabs. Default: default',
			true,
		],
		'skiplanguages' => [
			'(optional) Comma separated list of languages to be skipped',
			true,
		],
		'skipzero' => [
			'(optional) Skip languages that do not have any localisation at all',
			false,
		],
		'legenddetail' => [
			'(optional) Page name for legend to be transcluded at the top of the details table',
			true,
		],
		'legendsummary' => [
			'(optional) Page name for legend to be transcluded at the top of the summary table',
			true,
		],
		'fuzzy' => [
			'(optional) Add column for fuzzy counts',
			false,
		],
		'speakers' => [
			'(optional) Add column for number of speakers (est.). ' .
				'Only valid when combined with "most"',
			false,
		],
		'nol10n' => [
			'(optional) Do not add localised language name if I18ntags is installed',
			false,
		],
		'continent' => [
			'(optional) Add a continent column. Only available when output is ' .
				'"wiki" or not specified.',
			false,
		],
		'summary' => [
			'(optional) Add a summary with counts and scores per continent category ' .
				'and totals. Only available for a valid "most" value.',
			true,
		],
		'wmfscore' => [
			'Only output WMF language code and weighted score for all ' .
				'language codes for weighing group "wikimedia" in CSV. This ' .
				'report must keep a stable layout as it is used/will be ' .
				'used in the Wikimedia statistics.',
			false,
		],
		'most' => [
			'(optional) "mediawiki" or "wikimedia". Report on the 50 most ' .
				'spoken languages. Skipzero is ignored. If a valid scope is ' .
				'defined, the group list and fuzzy are ignored and the ' .
				'localisation levels are weighted and reported.',
			true,
		],
	];

	foreach ( $optionSpecs as $optionName => [ $helpText, $takesValue ] ) {
		$this->addOption( $optionName, $helpText, false /*required*/, $takesValue /*has arg*/ );
	}

	$this->requireExtension( 'Translate' );
}
275
/**
 * Build and print the localisation statistics report.
 *
 * Steps:
 *  1. Pick the output engine and read the command line options.
 *  2. Resolve the message groups to report on. When a score is reported
 *     (valid --most scope, or --wmfscore) the groups and their weights come
 *     from $this->localisedWeights; otherwise from --groups.
 *  3. Collect translated/fuzzy/total counts per language and group.
 *  4. Emit one row per language, optionally followed by a per-continent
 *     summary, or (for --wmfscore) a "code;score;" list averaged per
 *     Wikimedia project code.
 *
 * Calls fatalError() when no valid group was requested.
 */
public function execute() {
	$output = $this->getOption( 'output', 'default' );

	// Select an output engine
	switch ( $output ) {
		case 'wiki':
			$out = new WikiStatsOutput();
			break;
		case 'text':
			$out = new TextStatsOutput();
			break;
		case 'csv':
			$out = new CsvStatsOutput();
			break;
		default:
			$out = new TranslateStatsOutput();
	}

	$skipLanguages = [];
	if ( $this->hasOption( 'skiplanguages' ) ) {
		$skipLanguages = array_map(
			'trim',
			explode( ',', $this->getOption( 'skiplanguages' ) )
		);
	}

	$most = $this->getOption( 'most' );
	$wmfscore = $this->hasOption( 'wmfscore' );

	// Scope of $this->localisedWeights that supplies both the groups and
	// their weights. --wmfscore always uses "wikimedia", so groups and
	// weights cannot come from two different scopes.
	$weightScope = null;
	if ( $wmfscore ) {
		$weightScope = 'wikimedia';
	} elseif ( $most && isset( $this->localisedWeights[$most] ) ) {
		$weightScope = $most;
	}
	$reportScore = $weightScope !== null;

	// The fuzzy column is ignored whenever a score is reported; otherwise the
	// row cells would no longer line up with the headers and the weights.
	$withFuzzy = !$reportScore && $this->hasOption( 'fuzzy' );

	// check if l10n should be done
	$l10n = ( $output === 'wiki' || $output === 'default' ) && !$this->hasOption( 'nol10n' );

	// Get groups from input
	if ( $reportScore ) {
		$reqGroups = array_keys( $this->localisedWeights[$weightScope] );
	} else {
		// Default to '' so explode() never receives null when --groups is omitted.
		$reqGroups = array_map( 'trim', explode( ',', (string)$this->getOption( 'groups', '' ) ) );
	}

	// List of all groups
	$allGroups = MessageGroups::singleton()->getGroups();

	// Get list of valid groups. $weights is built alongside so that
	// $weights[$i] always belongs to the i-th valid group, even when one of
	// the weighted groups does not exist on this wiki.
	$groups = [];
	$weights = [];
	foreach ( $reqGroups as $reqId ) {
		// Page translation group ids use spaces which are not nice on command line
		$id = str_replace( '_', ' ', (string)$reqId );
		if ( $id === '' ) {
			continue;
		}

		if ( isset( $allGroups[$id] ) ) {
			$groups[$id] = $allGroups[$id];
			if ( $reportScore ) {
				$weights[] = $this->localisedWeights[$weightScope][$reqId];
			}
		} else {
			$this->output( "Unknown group: $id\n" );
		}
	}

	$wmfscores = [];
	if ( $wmfscore ) {
		// The WMF score report is always CSV
		$out = new CsvStatsOutput();
	}

	if ( !count( $groups ) ) {
		$this->fatalError( 'No groups given' );
	}

	// List of all languages.
	$languages = Utilities::getLanguageNames( LanguageNameUtils::AUTONYMS );
	// Default sorting order by language code, users can sort wiki output.
	ksort( $languages );

	if ( $this->hasOption( 'legenddetail' ) ) {
		$out->addFreeText( '{{' . $this->getOption( 'legenddetail' ) . "}}\n" );
	}

	// Maximum attainable score: the sum of all weights declared for the scope
	$totalWeight = $reportScore ? array_sum( $this->localisedWeights[$weightScope] ) : 0;

	$showContinent = $this->hasOption( 'continent' );
	$showSpeakers = $most && $this->hasOption( 'speakers' );

	if ( !$wmfscore ) {
		// Output headers
		$out->heading();

		$out->blockstart();

		if ( $most ) {
			$out->element( ( $l10n ? '{{int:translate-gs-pos}}' : 'Pos.' ), true );
		}

		$out->element( ( $l10n ? '{{int:translate-gs-code}}' : 'Code' ), true );
		$out->element( ( $l10n ? '{{int:translate-page-language}}' : 'Language' ), true );
		if ( $showContinent ) {
			$out->element( ( $l10n ? '{{int:translate-gs-continent}}' : 'Continent' ), true );
		}

		if ( $showSpeakers ) {
			$out->element( ( $l10n ? '{{int:translate-gs-speakers}}' : 'Speakers' ), true );
		}

		if ( $reportScore ) {
			$out->element(
				( $l10n ? '{{int:translate-gs-score}}' : 'Score' ) . ' (' . $totalWeight . ')',
				true
			);
		}

		foreach ( $groups as $g ) {
			// Add unprocessed description of group as heading
			if ( $reportScore ) {
				$gid = $g->getId();
				$heading = $g->getLabel() . ' (' . $this->localisedWeights[$weightScope][$gid] . ')';
			} else {
				$heading = $g->getLabel();
			}
			$out->element( $heading, true );
			if ( $withFuzzy ) {
				$out->element( ( $l10n ? '{{int:translate-percentage-fuzzy}}' : 'Fuzzy' ), true );
			}
		}

		$out->blockend();
	}

	$rows = [];
	foreach ( $languages as $code => $name ) {
		// Skip list
		if ( in_array( $code, $skipLanguages ) ) {
			continue;
		}
		$rows[$code] = [];
	}

	foreach ( $groups as $groupName => $g ) {
		$stats = MessageGroupStats::forGroup( $groupName );

		// Perform the statistic calculations on every language
		foreach ( $languages as $code => $name ) {
			if ( $this->shouldSkipLanguage( $code, $most, $wmfscore, $skipLanguages ) ) {
				continue;
			}

			$total = $stats[$code][MessageGroupStats::TOTAL];
			$translated = $stats[$code][MessageGroupStats::TRANSLATED];
			$fuzzy = $stats[$code][MessageGroupStats::FUZZY];

			// Each column is [ invert, count, total ]
			$rows[$code][] = [ false, $translated, $total ];

			if ( $withFuzzy ) {
				$rows[$code][] = [ true, $fuzzy, $total ];
			}
		}
	}

	// init summary array: continent => [ language count, summed score ]
	$summarise = $this->hasOption( 'summary' );
	$summary = [];

	foreach ( $languages as $code => $name ) {
		if ( $this->shouldSkipLanguage( $code, $most, $wmfscore, $skipLanguages ) ) {
			continue;
		}

		$columns = $rows[$code] ?? [];

		$allZero = true;
		foreach ( $columns as $fields ) {
			if ( (int)$fields[1] !== 0 ) {
				$allZero = false;
				break;
			}
		}

		// Skip dummy languages if requested
		if ( $allZero && $this->hasOption( 'skipzero' ) ) {
			continue;
		}

		// Output the row
		if ( !$wmfscore ) {
			$out->blockstart();
		}

		// Fill language position field
		if ( $most ) {
			$out->element( $this->mostSpokenLanguages[$code][0] );
		}

		// Fill language name field
		if ( !$wmfscore ) {
			// Fill language code field
			$out->element( $code );

			if ( $l10n && function_exists( 'efI18nTagsInit' ) ) {
				$out->element( '{{#languagename:' . $code . '}}' );
			} else {
				$out->element( $name );
			}
		}

		// Continent is only known for the most spoken languages; without
		// --most any other language reaches this point with no entry.
		$continentKey = $this->mostSpokenLanguages[$code][2] ?? null;

		// Fill continent field
		if ( $showContinent ) {
			if ( $continentKey === null ) {
				// Keep the cell so the row stays aligned with the header
				$continent = '';
			} elseif ( $continentKey === 'multiple' ) {
				$continent = ( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
			} else {
				$continent = $l10n ?
					'{{int:timezoneregion-' . $continentKey . '}}' :
					ucfirst( $continentKey );
			}

			$out->element( $continent );
		}

		// Fill speakers field
		if ( $showSpeakers ) {
			$out->element( number_format( $this->mostSpokenLanguages[$code][1] ) );
		}

		// Fill the score field
		if ( $reportScore ) {
			// Start with 0 points
			$score = 0;

			foreach ( $columns as $i => $fields ) {
				[ , $upper, $total ] = $fields;
				// Weigh the score and add it to the current score. A group
				// without messages contributes nothing instead of dividing by zero.
				if ( $total > 0 ) {
					$score += ( $weights[$i] * $upper ) / $total;
				}
			}

			// Report a rounded number
			$score = number_format( $score, 0 );

			if ( $summarise && $continentKey !== null ) {
				if ( isset( $summary[$continentKey] ) ) {
					$newcount = $summary[$continentKey][0] + 1;
					$newscore = $summary[$continentKey][1] + (int)$score;
				} else {
					$newcount = 1;
					$newscore = (int)$score;
				}

				$summary[$continentKey] = [ $newcount, $newscore ];
			}

			if ( $wmfscore ) {
				// Multiple variants can be used for the same wiki.
				// Store the scores in an array and output them later
				// when they can be averaged.
				$wmfcode = $this->wikimediaCodeMap[$code] ?? explode( '-', $code, 2 )[0];

				if ( isset( $wmfscores[$wmfcode] ) ) {
					$count = $wmfscores[$wmfcode]['count'] + 1;
					$tmpWmfScore = (int)$wmfscores[$wmfcode]['score'];
					$tmpWmfCount = (int)$wmfscores[$wmfcode]['count'];
					$score = ( ( $tmpWmfCount * $tmpWmfScore ) + (int)$score ) / $count;
					$wmfscores[$wmfcode] = [ 'score' => $score, 'count' => $count ];
				} else {
					$wmfscores[$wmfcode] = [ 'score' => $score, 'count' => 1 ];
				}
			} else {
				$out->element( $score );
			}
		}

		// Fill fields for groups
		if ( !$wmfscore ) {
			foreach ( $columns as $fields ) {
				[ $invert, $upper, $total ] = $fields;
				$c = $out->formatPercent( $upper, $total, $invert );
				$out->element( $c );
			}

			$out->blockend();
		}
	}

	$out->footer();

	if ( $reportScore && $summarise ) {
		if ( $this->hasOption( 'legendsummary' ) ) {
			$out->addFreeText( '{{' . $this->getOption( 'legendsummary' ) . "}}\n" );
		}

		$out->summaryheading();

		$out->blockstart();

		$out->element( $l10n ? '{{int:translate-gs-continent}}' : 'Continent', true );
		$out->element( $l10n ? '{{int:translate-gs-count}}' : 'Count', true );
		$out->element( $l10n ? '{{int:translate-gs-avgscore}}' : 'Avg. score', true );

		$out->blockend();

		ksort( $summary );

		// [ number of languages, summed score ]
		$totals = [ 0, 0 ];

		foreach ( $summary as $key => $values ) {
			$out->blockstart();

			if ( $key === 'multiple' ) {
				$out->element( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
			} else {
				$out->element( $l10n ? '{{int:timezoneregion-' . $key . '}}' : ucfirst( $key ) );
			}
			$out->element( $values[0] );
			$out->element( number_format( $values[1] / $values[0] ) );

			$out->blockend();

			$totals[0] += $values[0];
			$totals[1] += $values[1];
		}

		$out->blockstart();
		$out->element( $l10n ? '{{int:translate-gs-total}}' : 'Total' );
		$out->element( $totals[0] );
		// No language summarised: report 0 instead of dividing by zero
		$out->element( number_format( $totals[0] > 0 ? $totals[1] / $totals[0] : 0 ) );
		$out->blockend();

		$out->footer();
	}

	// Custom output
	if ( $wmfscore ) {
		ksort( $wmfscores );

		foreach ( $wmfscores as $code => $stats ) {
			echo $code . ';' . number_format( (float)$stats['score'] ) . ";\n";
		}
	}
}

/**
 * Decide whether a language is left out of the report.
 *
 * @param string $code Language code
 * @param string|null $most Value of the "most" option
 * @param bool $wmfscore Whether the "wmfscore" option is set
 * @param string[] $skipLanguages Codes given via "skiplanguages"
 * @return bool True when the language must be skipped
 */
private function shouldSkipLanguage( $code, $most, $wmfscore, array $skipLanguages ): bool {
	// The skip list only applies when --most is not used
	if ( !$most && in_array( $code, $skipLanguages ) ) {
		return true;
	}

	// For the WMF score, codes mapped to an empty string are not reported
	if ( $wmfscore && ( $this->wikimediaCodeMap[$code] ?? null ) === '' ) {
		return true;
	}

	// If --most is set, skip all languages outside the most spoken list
	return $most && !isset( $this->mostSpokenLanguages[$code] );
}
672}
673
// Store this script's class name in $maintClass, then include the file named by
// the RUN_MAINTENANCE_IF_MAIN constant. That constant is defined outside this file;
// presumably it runs the script when invoked from the command line — confirm in
// maintenance/Maintenance.php.
674$maintClass = GroupStatistics::class;
675require_once RUN_MAINTENANCE_IF_MAIN;
$wikimediaCodeMap
Code map to map localisation codes to Wikimedia project codes.
$mostSpokenLanguages
Array of the most spoken languages in the world.
$localisedWeights
Variable with key-value pairs with a named index and an array of key-value pairs where the key is a M...
Factory class for accessing message groups individually by id or all of them as a list.
Essentially random collection of helper functions, similar to GlobalFunctions.php.
Definition Utilities.php:31
const TOTAL
Array index.
const FUZZY
Array index.
const TRANSLATED
Array index.
static forGroup( $id, $flags=0)
Returns stats for all languages in given group.
Provides heading, summaryheading and free text addition for stats output in wiki format.