Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
groupStatistics.php
Go to the documentation of this file.
1<?php
17use MediaWiki\Languages\LanguageNameUtils;
18use MediaWiki\Maintenance\Maintenance;
19
20// Standard boilerplate to define $IP
// Prefer the install path from the environment; otherwise assume this script
// lives three directory levels below the MediaWiki root.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$dir = __DIR__;
	$IP = "$dir/../../..";
}
require_once "$IP/maintenance/Maintenance.php";
28
29class GroupStatistics extends Maintenance {
/**
 * Languages reported on when the "most" option is used, keyed by language code.
 *
 * Each value is a three element list that execute() reads as:
 *  - index 0: the number printed in the position column,
 *  - index 1: the number printed in the speakers column (presumably an
 *    estimate in millions of speakers; the unit is not stated here - TODO confirm),
 *  - index 2: the continent key, or 'multiple'.
 *
 * Some codes share a position and speaker figure (for example 'zh-hans' and
 * 'zh-hant', or 'pt' and 'pt-br').
 */
42 public $mostSpokenLanguages = [
43 'en' => [ 1, 1500, 'multiple' ],
44 'zh-hans' => [ 2, 1300, 'asia' ],
45 'zh-hant' => [ 2, 1300, 'asia' ],
46 'hi' => [ 3, 550, 'asia' ],
47 'ar' => [ 4, 530, 'multiple' ],
48 'es' => [ 5, 500, 'multiple' ],
49 'ms' => [ 6, 300, 'asia' ],
50 'pt' => [ 7, 290, 'multiple' ],
51 'pt-br' => [ 7, 290, 'america' ],
52 'ru' => [ 8, 278, 'multiple' ],
53 'id' => [ 9, 250, 'asia' ],
54 'bn' => [ 10, 230, 'asia' ],
55 'fr' => [ 11, 200, 'multiple' ],
56 'de' => [ 12, 185, 'europe' ],
57 'ja' => [ 13, 132, 'asia' ],
58 'fa' => [ 14, 107, 'asia' ],
59 'pnb' => [ 15, 104, 'asia' ], // Most spoken variant
60 'tl' => [ 16, 90, 'asia' ],
61 'mr' => [ 17, 90, 'asia' ],
62 'vi' => [ 18, 80, 'asia' ],
63 'jv' => [ 19, 80, 'asia' ],
64 'te' => [ 20, 80, 'asia' ],
65 'ko' => [ 21, 78, 'asia' ],
66 'wuu' => [ 22, 77, 'asia' ],
67 'arz' => [ 23, 76, 'africa' ],
68 'th' => [ 24, 73, 'asia' ],
69 'yue' => [ 25, 71, 'asia' ],
70 'tr' => [ 26, 70, 'multiple' ],
71 'it' => [ 27, 70, 'europe' ],
72 'ta' => [ 28, 66, 'asia' ],
73 'ur' => [ 29, 60, 'asia' ],
74 'my' => [ 30, 52, 'asia' ],
75 'sw' => [ 31, 50, 'africa' ],
76 'nan' => [ 32, 49, 'asia' ],
77 'kn' => [ 33, 47, 'asia' ],
78 'gu' => [ 34, 46, 'asia' ],
79 'uk' => [ 35, 45, 'europe' ],
80 'pl' => [ 36, 43, 'europe' ],
81 'sd' => [ 37, 41, 'asia' ],
82 'ha' => [ 38, 39, 'africa' ],
83 'ml' => [ 39, 37, 'asia' ],
84 'gan-hans' => [ 40, 35, 'asia' ],
85 'gan-hant' => [ 40, 35, 'asia' ],
86 'hak' => [ 41, 34, 'asia' ],
87 'or' => [ 42, 31, 'asia' ],
88 'ne' => [ 43, 30, 'asia' ],
89 'ro' => [ 44, 28, 'europe' ],
90 'su' => [ 45, 27, 'asia' ],
91 'az' => [ 46, 27, 'asia' ],
92 'nl' => [ 47, 27, 'europe' ],
93 'zu' => [ 48, 26, 'africa' ],
94 'ps' => [ 49, 26, 'asia' ],
95 'ckb' => [ 50, 26, 'asia' ],
96 'ku-latn' => [ 50, 26, 'asia' ],
97 ];
/**
 * Score weights per weighing scope.
 *
 * The outer key is the scope name accepted by the "most" option (the
 * "wmfscore" option always uses 'wikimedia'). The inner map goes from message
 * group id to the weight that group contributes to the score; execute() sums
 * the inner values to obtain the maximum score shown in the score heading.
 */
109 public $localisedWeights = [
110 'wikimedia' => [
111 // 'core-0-mostused' => 40,
112 'core' => 50,
113 'ext-0-wikimedia' => 50
114 ],
115 'fundraiser' => [
116 'ext-di-di' => 16,
117 'ext-di-pfpg' => 84,
118 ],
119 'mediawiki' => [
120 // 'core-0-mostused' => 30,
121 'core' => 50,
122 'ext-0-wikimedia' => 25,
123 'ext-0-all' => 25
124 ]
125 ];
/**
 * Language code mapping used by the "wmfscore" report.
 *
 * Keys are the language codes iterated by execute(); values are the code
 * printed in the report. An empty string marks a code that execute() skips
 * entirely in wmfscore mode. Codes that are not listed here are reported under
 * the part of the code before the first dash.
 */
132 public $wikimediaCodeMap = [
133 // Codes containing a dash
134 'bat-smg' => 'bat-smg',
135 'cbk-zam' => 'cbk-zam',
136 'map-bms' => 'map-bms',
137 'nds-nl' => 'nds-nl',
138 'roa-rup' => 'roa-rup',
139 'roa-tara' => 'roa-tara',
140
141 // Remaps
142 'be-tarask' => 'be-x-old',
143 'gsw' => 'als',
144 'ike-cans' => 'iu',
145 'ike-latn' => 'iu',
146 'lzh' => 'zh-classical',
147 'nan' => 'zh-min-nan',
148 'vro' => 'fiu-vro',
149 'yue' => 'zh-yue',
150
151 // Ignored language codes. See reason.
152 'als' => '', // gsw
153 'be-x-old' => '', // be-tarask
154 'crh' => '', // crh-*
155 'de-at' => '', // de
156 'de-ch' => '', // de
157 'de-formal' => '', // de, not reporting formal form
158 'dk' => '', // da
159 'en-au' => '', // en
160 'en-ca' => '', // no MW code
161 'en-gb' => '', // no MW code
162 'es-419' => '', // no MW code
163 'fiu-vro' => '', // vro
164 'gan' => '', // gan-*
165 'got' => '', // extinct. not reporting formal form
166 'hif' => '', // hif-*
167 'hu-formal' => '', // not reporting
168 'iu' => '', // ike-*
169 'kk' => '', // kk-*
170 'kk-cn' => '', // kk-arab
171 'kk-kz' => '', // kk-cyrl
172 'kk-tr' => '', // kk-latn
173 'ko-kp' => '', // ko
174 'ku' => '', // ku-*
175 'ku-arab' => '', // ckb
176 'nb' => '', // no
177 'nl-be' => '', // no MW code
178 'nl-informal' => '', // nl, not reporting informal form
179 'ruq' => '', // ruq-*
180 'simple' => '', // en
181 'sr' => '', // sr-*
182 'tg' => '', // tg-*
183 'tp' => '', // tokipona
184 'tt' => '', // tt-*
185 'ug' => '', // ug-*
186 'zh' => '', // zh-*
187 'zh-classical' => '', // lzh
188 'zh-cn' => '', // zh
189 'zh-sg' => '', // zh
190 'zh-hk' => '', // zh
191 'zh-min-nan' => '', // nan
192 'zh-mo' => '', // zh
193 'zh-my' => '', // zh
194 'zh-tw' => '', // zh
195 'zh-yue' => '', // yue
196 ];
197
/**
 * Set the script description, register every command line option and
 * declare the dependency on the Translate extension.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Script to generate statistics about the localisation ' .
		'level of one or more message groups.' );

	// One entry per option, in the order they are registered:
	// [ option name, help text, whether the option takes a value ].
	// None of the options is required.
	$optionTable = [
		[
			'groups',
			'(optional) Comma separated list of groups',
			true
		],
		[
			'output',
			'(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' .
				'text: Text with tabs. Default: default',
			true
		],
		[
			'skiplanguages',
			'(optional) Comma separated list of languages to be skipped',
			true
		],
		[
			'skipzero',
			'(optional) Skip languages that do not have any localisation at all',
			false
		],
		[
			'legenddetail',
			'(optional) Page name for legend to be transcluded at the top of the details table',
			true
		],
		[
			'legendsummary',
			'(optional) Page name for legend to be transcluded at the top of the summary table',
			true
		],
		[
			'fuzzy',
			'(optional) Add column for fuzzy counts',
			false
		],
		[
			'speakers',
			'(optional) Add column for number of speakers (est.). ' .
				'Only valid when combined with "most"',
			false
		],
		[
			'nol10n',
			'(optional) Do not add localised language name if I18ntags is installed',
			false
		],
		[
			'continent',
			'(optional) Add a continent column. Only available when output is ' .
				'"wiki" or not specified.',
			false
		],
		[
			'summary',
			'(optional) Add a summary with counts and scores per continent category ' .
				'and totals. Only available for a valid "most" value.',
			true
		],
		[
			'wmfscore',
			'Only output WMF language code and weighted score for all ' .
				'language codes for weighing group "wikimedia" in CSV. This ' .
				'report must keep a stable layout as it is used/will be ' .
				'used in the Wikimedia statistics.',
			false
		],
		[
			'most',
			'(optional) "mediawiki" or "wikimedia". Report on the 50 most ' .
				'spoken languages. Skipzero is ignored. If a valid scope is ' .
				'defined, the group list and fuzzy are ignored and the ' .
				'localisation levels are weighted and reported.',
			true
		],
	];

	foreach ( $optionTable as [ $optionName, $helpText, $takesValue ] ) {
		$this->addOption( $optionName, $helpText, false, $takesValue );
	}

	$this->requireExtension( 'Translate' );
}
280
/**
 * Build the localisation statistics selected on the command line and print them.
 *
 * Three report shapes are produced:
 *  - a plain per-group table (one translated column per group, plus a fuzzy
 *    column when "fuzzy" is given),
 *  - a weighted score table when "most" names a scope in $localisedWeights,
 *  - the "wmfscore" report: one "code;score;" line per WMF language code,
 *    always using the 'wikimedia' weights.
 *
 * Exits through fatalError() when none of the requested groups exists.
 */
public function execute() {
	$output = $this->getOption( 'output', 'default' );
	$wmfscore = $this->hasOption( 'wmfscore' );

	// Select an output engine
	switch ( $output ) {
		case 'wiki':
			$out = new WikiStatsOutput();
			break;
		case 'text':
			$out = new TextStatsOutput();
			break;
		case 'csv':
			$out = new CsvStatsOutput();
			break;
		default:
			$out = new TranslateStatsOutput();
	}
	if ( $wmfscore ) {
		// The wmfscore report is always produced with the CSV engine
		$out = new CsvStatsOutput();
	}

	$skipLanguages = [];
	if ( $this->hasOption( 'skiplanguages' ) ) {
		$skipLanguages = array_map(
			'trim',
			explode( ',', $this->getOption( 'skiplanguages' ) )
		);
	}

	// Decide which set of weights applies, if any. wmfscore takes precedence
	// so that the group list and the weights always come from the same scope.
	$most = $this->getOption( 'most' );
	$scope = null;
	if ( $wmfscore ) {
		$scope = 'wikimedia';
	} elseif ( $most && isset( $this->localisedWeights[$most] ) ) {
		$scope = $most;
	}
	$reportScore = $scope !== null;

	// Localised headings are only used for wiki style output
	$l10n = ( $output === 'wiki' || $output === 'default' ) &&
		!$this->hasOption( 'nol10n' );

	// Fuzzy columns are never part of a score report. Collecting them anyway
	// would put two columns per group into a row while there is only one
	// weight per group.
	$showFuzzy = !$reportScore && $this->hasOption( 'fuzzy' );
	$showContinent = $this->hasOption( 'continent' );
	$showSpeakers = $most && $this->hasOption( 'speakers' );
	$skipZero = $this->hasOption( 'skipzero' );
	$summarise = $reportScore && $this->hasOption( 'summary' );

	// Get groups from input
	if ( $reportScore ) {
		$reqGroups = array_keys( $this->localisedWeights[$scope] );
	} else {
		// Default to an empty string so that a missing option ends in the
		// "No groups given" error instead of passing null to explode()
		$reqGroups = array_map( 'trim', explode( ',', $this->getOption( 'groups', '' ) ) );
	}

	// List of all groups
	$allGroups = MessageGroups::singleton()->getGroups();

	// Get list of valid groups, remembering the weight of each accepted group
	// so that weights cannot shift when a configured group does not exist
	$groups = [];
	$groupWeights = [];
	foreach ( $reqGroups as $requestedId ) {
		// Page translation group ids use spaces which are not nice on command line
		$id = str_replace( '_', ' ', $requestedId );
		if ( !isset( $allGroups[$id] ) ) {
			$this->output( "Unknown group: $id\n" );
			continue;
		}

		$groups[$id] = $allGroups[$id];
		if ( $reportScore ) {
			$groupWeights[$id] = $this->localisedWeights[$scope][$requestedId];
		}
	}

	if ( !count( $groups ) ) {
		$this->fatalError( 'No groups given' );
	}

	// Positional weights, in the same order as the columns of each row
	$weights = array_values( $groupWeights );
	$totalWeight = $reportScore ? array_sum( $this->localisedWeights[$scope] ) : 0;

	// List of all languages.
	$languages = Utilities::getLanguageNames( LanguageNameUtils::AUTONYMS );
	// Default sorting order by language code, users can sort wiki output.
	ksort( $languages );

	if ( $this->hasOption( 'legenddetail' ) ) {
		$out->addFreeText( '{{' . $this->getOption( 'legenddetail' ) . "}}\n" );
	}

	if ( !$wmfscore ) {
		// Output headers
		$out->heading();

		$out->blockstart();

		if ( $most ) {
			$out->element( ( $l10n ? '{{int:translate-gs-pos}}' : 'Pos.' ), true );
		}

		$out->element( ( $l10n ? '{{int:translate-gs-code}}' : 'Code' ), true );
		$out->element( ( $l10n ? '{{int:translate-page-language}}' : 'Language' ), true );
		if ( $showContinent ) {
			$out->element( ( $l10n ? '{{int:translate-gs-continent}}' : 'Continent' ), true );
		}

		if ( $showSpeakers ) {
			$out->element( ( $l10n ? '{{int:translate-gs-speakers}}' : 'Speakers' ), true );
		}

		if ( $reportScore ) {
			$out->element(
				( $l10n ? '{{int:translate-gs-score}}' : 'Score' ) . ' (' . $totalWeight . ')',
				true
			);
		}

		foreach ( $groups as $id => $g ) {
			// Add unprocessed description of group as heading
			$heading = $g->getLabel();
			if ( $reportScore ) {
				$heading .= ' (' . $groupWeights[$id] . ')';
			}
			$out->element( $heading, true );
			if ( $showFuzzy ) {
				$out->element( ( $l10n ? '{{int:translate-percentage-fuzzy}}' : 'Fuzzy' ), true );
			}
		}

		$out->blockend();
	}

	// Work out once which languages are part of the report
	$reportLanguages = [];
	foreach ( $languages as $code => $name ) {
		// The skip list is not applied when --most is used
		if ( !$most && in_array( (string)$code, $skipLanguages, true ) ) {
			continue;
		}

		// Do not calculate if we do not need it for anything.
		if ( $wmfscore && ( $this->wikimediaCodeMap[$code] ?? null ) === '' ) {
			continue;
		}

		// If --most is set, skip all other
		if ( $most && !isset( $this->mostSpokenLanguages[$code] ) ) {
			continue;
		}

		$reportLanguages[$code] = $name;
	}

	// Collect one [ invert, count, total ] column per group (and per fuzzy
	// column when requested) for every reported language
	$rows = array_fill_keys( array_keys( $reportLanguages ), [] );
	foreach ( array_keys( $groups ) as $groupName ) {
		$stats = MessageGroupStats::forGroup( $groupName );

		foreach ( array_keys( $reportLanguages ) as $code ) {
			$total = $stats[$code][MessageGroupStats::TOTAL];
			$translated = $stats[$code][MessageGroupStats::TRANSLATED];
			$fuzzy = $stats[$code][MessageGroupStats::FUZZY];

			$rows[$code][] = [ false, $translated, $total ];

			if ( $showFuzzy ) {
				$rows[$code][] = [ true, $fuzzy, $total ];
			}
		}
	}

	// continent key => [ number of languages, sum of their scores ]
	$summary = [];
	// WMF code => [ 'sum' => sum of scores, 'count' => number of variants ]
	$wmfscores = [];

	foreach ( $reportLanguages as $code => $name ) {
		$columns = $rows[$code];

		// Skip dummy languages if requested
		if ( $skipZero ) {
			$allZero = true;
			foreach ( $columns as $fields ) {
				if ( (int)$fields[1] !== 0 ) {
					$allZero = false;
					break;
				}
			}
			if ( $allZero ) {
				continue;
			}
		}

		// Null for languages that are not listed in $mostSpokenLanguages
		$continent = $this->mostSpokenLanguages[$code][2] ?? null;

		// Row prefix. Nothing of this belongs in the wmfscore report, which
		// only consists of the code;score; lines printed at the very end.
		if ( !$wmfscore ) {
			$out->blockstart();

			// Fill language position field
			if ( $most ) {
				$out->element( $this->mostSpokenLanguages[$code][0] );
			}

			// Fill language code field
			$out->element( $code );

			// Fill language name field
			if ( $l10n && function_exists( 'efI18nTagsInit' ) ) {
				$out->element( '{{#languagename:' . $code . '}}' );
			} else {
				$out->element( $name );
			}

			// Fill continent field
			if ( $showContinent ) {
				$out->element( $this->getContinentLabel( $continent, $l10n ) );
			}

			// Fill speakers field
			if ( $showSpeakers ) {
				$out->element( number_format( $this->mostSpokenLanguages[$code][1] ) );
			}
		}

		// Fill the score field
		if ( $reportScore ) {
			$score = 0;
			foreach ( $columns as $i => $fields ) {
				[ , $upper, $total ] = $fields;
				// A group without any messages cannot contribute to the
				// score; dividing by its zero total would throw on PHP 8
				if ( (int)$total === 0 ) {
					continue;
				}
				// Weigh the score and add it to the current score
				$score += ( $weights[$i] * $upper ) / $total;
			}

			// Report round numbers
			$score = (int)round( $score );

			if ( $summarise && $continent !== null ) {
				if ( !isset( $summary[$continent] ) ) {
					$summary[$continent] = [ 0, 0 ];
				}
				$summary[$continent][0]++;
				$summary[$continent][1] += $score;
			}

			if ( $wmfscore ) {
				// Multiple variants can be used for the same wiki.
				// Keep the sum and the count so the average can be taken
				// once at the end without rounding in between.
				$wmfcode = $this->wikimediaCodeMap[$code] ?? explode( '-', $code, 2 )[0];

				if ( !isset( $wmfscores[$wmfcode] ) ) {
					$wmfscores[$wmfcode] = [ 'sum' => 0, 'count' => 0 ];
				}
				$wmfscores[$wmfcode]['sum'] += $score;
				$wmfscores[$wmfcode]['count']++;
			} else {
				$out->element( number_format( $score ) );
			}
		}

		// Fill fields for groups
		if ( !$wmfscore ) {
			foreach ( $columns as $fields ) {
				[ $invert, $upper, $total ] = $fields;
				$out->element( $out->formatPercent( $upper, $total, $invert ) );
			}

			$out->blockend();
		}
	}

	$out->footer();

	if ( $summarise ) {
		if ( $this->hasOption( 'legendsummary' ) ) {
			$out->addFreeText( '{{' . $this->getOption( 'legendsummary' ) . "}}\n" );
		}

		$out->summaryheading();

		$out->blockstart();

		$out->element( $l10n ? '{{int:translate-gs-continent}}' : 'Continent', true );
		$out->element( $l10n ? '{{int:translate-gs-count}}' : 'Count', true );
		$out->element( $l10n ? '{{int:translate-gs-avgscore}}' : 'Avg. score', true );

		$out->blockend();

		ksort( $summary );

		$totals = [ 0, 0 ];

		foreach ( $summary as $key => $values ) {
			$out->blockstart();

			$out->element( $this->getContinentLabel( (string)$key, $l10n ) );
			$out->element( $values[0] );
			$out->element( number_format( $values[1] / $values[0] ) );

			$out->blockend();

			$totals[0] += $values[0];
			$totals[1] += $values[1];
		}

		$out->blockstart();
		$out->element( $l10n ? '{{int:translate-gs-total}}' : 'Total' );
		$out->element( $totals[0] );
		// No language may have been scored at all; report 0 in that case
		$out->element( number_format( $totals[0] > 0 ? $totals[1] / $totals[0] : 0 ) );
		$out->blockend();

		$out->footer();
	}

	// Custom output
	if ( $wmfscore ) {
		ksort( $wmfscores );

		foreach ( $wmfscores as $code => $stats ) {
			echo $code . ';' . number_format( $stats['sum'] / $stats['count'] ) . ";\n";
		}
	}
}

/**
 * Turn a continent key from $mostSpokenLanguages into the text of a table cell.
 *
 * @param string|null $continent Continent key, 'multiple', or null when the
 *  language has no entry in $mostSpokenLanguages
 * @param bool $l10n Whether to return a wikitext message transclusion
 *  instead of plain English text
 * @return string Empty string when the continent is unknown
 */
private function getContinentLabel( ?string $continent, bool $l10n ): string {
	if ( $continent === null ) {
		return '';
	}

	if ( $continent === 'multiple' ) {
		return $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple';
	}

	return $l10n ? '{{int:timezoneregion-' . $continent . '}}' : ucfirst( $continent );
}
677}
678
// Record this script's class name in $maintClass, then include the file named
// by the RUN_MAINTENANCE_IF_MAIN constant (presumably the runner that executes
// $maintClass when this file is the entry point - confirm against
// MediaWiki's maintenance/Maintenance.php).
679$maintClass = GroupStatistics::class;
680require_once RUN_MAINTENANCE_IF_MAIN;
Factory class for accessing message groups individually by id or all of them as a list.
This class aims to provide an efficient mechanism for fetching translation completion stats.
An essentially random collection of helper functions, similar to GlobalFunctions.php.
Definition Utilities.php:31
Provides heading, summaryheading and free text addition for stats output in wiki format.