Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
groupStatistics.php
Go to the documentation of this file.
<?php
// Locate the MediaWiki installation ($IP): without an MW_INSTALL_PATH
// environment variable, fall back to three directories above this script.
if ( getenv( 'MW_INSTALL_PATH' ) === false ) {
	$dir = __DIR__;
	$IP = "{$dir}/../../..";
} else {
	$IP = getenv( 'MW_INSTALL_PATH' );
}
require_once "{$IP}/maintenance/Maintenance.php";
22
23class GroupStatistics extends Maintenance {
36 'en' => [ 1, 1500, 'multiple' ],
37 'zh-hans' => [ 2, 1300, 'asia' ],
38 'zh-hant' => [ 2, 1300, 'asia' ],
39 'hi' => [ 3, 550, 'asia' ],
40 'ar' => [ 4, 530, 'multiple' ],
41 'es' => [ 5, 500, 'multiple' ],
42 'ms' => [ 6, 300, 'asia' ],
43 'pt' => [ 7, 290, 'multiple' ],
44 'pt-br' => [ 7, 290, 'america' ],
45 'ru' => [ 8, 278, 'multiple' ],
46 'id' => [ 9, 250, 'asia' ],
47 'bn' => [ 10, 230, 'asia' ],
48 'fr' => [ 11, 200, 'multiple' ],
49 'de' => [ 12, 185, 'europe' ],
50 'ja' => [ 13, 132, 'asia' ],
51 'fa' => [ 14, 107, 'asia' ],
52 'pnb' => [ 15, 104, 'asia' ], // Most spoken variant
53 'tl' => [ 16, 90, 'asia' ],
54 'mr' => [ 17, 90, 'asia' ],
55 'vi' => [ 18, 80, 'asia' ],
56 'jv' => [ 19, 80, 'asia' ],
57 'te' => [ 20, 80, 'asia' ],
58 'ko' => [ 21, 78, 'asia' ],
59 'wuu' => [ 22, 77, 'asia' ],
60 'arz' => [ 23, 76, 'africa' ],
61 'th' => [ 24, 73, 'asia' ],
62 'yue' => [ 25, 71, 'asia' ],
63 'tr' => [ 26, 70, 'multiple' ],
64 'it' => [ 27, 70, 'europe' ],
65 'ta' => [ 28, 66, 'asia' ],
66 'ur' => [ 29, 60, 'asia' ],
67 'my' => [ 30, 52, 'asia' ],
68 'sw' => [ 31, 50, 'africa' ],
69 'nan' => [ 32, 49, 'asia' ],
70 'kn' => [ 33, 47, 'asia' ],
71 'gu' => [ 34, 46, 'asia' ],
72 'uk' => [ 35, 45, 'europe' ],
73 'pl' => [ 36, 43, 'europe' ],
74 'sd' => [ 37, 41, 'asia' ],
75 'ha' => [ 38, 39, 'africa' ],
76 'ml' => [ 39, 37, 'asia' ],
77 'gan-hans' => [ 40, 35, 'asia' ],
78 'gan-hant' => [ 40, 35, 'asia' ],
79 'hak' => [ 41, 34, 'asia' ],
80 'or' => [ 42, 31, 'asia' ],
81 'ne' => [ 43, 30, 'asia' ],
82 'ro' => [ 44, 28, 'europe' ],
83 'su' => [ 45, 27, 'asia' ],
84 'az' => [ 46, 27, 'asia' ],
85 'nl' => [ 47, 27, 'europe' ],
86 'zu' => [ 48, 26, 'africa' ],
87 'ps' => [ 49, 26, 'asia' ],
88 'ckb' => [ 50, 26, 'asia' ],
89 'ku-latn' => [ 50, 26, 'asia' ],
90 ];
102 'wikimedia' => [
103 // 'core-0-mostused' => 40,
104 'core' => 50,
105 'ext-0-wikimedia' => 50
106 ],
107 'fundraiser' => [
108 'ext-di-di' => 16,
109 'ext-di-pfpg' => 84,
110 ],
111 'mediawiki' => [
112 // 'core-0-mostused' => 30,
113 'core' => 50,
114 'ext-0-wikimedia' => 25,
115 'ext-0-all' => 25
116 ]
117 ];
124 // Codes containing a dash
125 'bat-smg' => 'bat-smg',
126 'cbk-zam' => 'cbk-zam',
127 'map-bms' => 'map-bms',
128 'nds-nl' => 'nds-nl',
129 'roa-rup' => 'roa-rup',
130 'roa-tara' => 'roa-tara',
131
132 // Remaps
133 'be-tarask' => 'be-x-old',
134 'gsw' => 'als',
135 'ike-cans' => 'iu',
136 'ike-latn' => 'iu',
137 'lzh' => 'zh-classical',
138 'nan' => 'zh-min-nan',
139 'vro' => 'fiu-vro',
140 'yue' => 'zh-yue',
141
142 // Ignored language codes. See reason.
143 'als' => '', // gsw
144 'be-x-old' => '', // be-tarask
145 'crh' => '', // crh-*
146 'de-at' => '', // de
147 'de-ch' => '', // de
148 'de-formal' => '', // de, not reporting formal form
149 'dk' => '', // da
150 'en-au' => '', // en
151 'en-ca' => '', // no MW code
152 'en-gb' => '', // no MW code
153 'es-419' => '', // no MW code
154 'fiu-vro' => '', // vro
155 'gan' => '', // gan-*
156 'got' => '', // extinct. not reporting formal form
157 'hif' => '', // hif-*
158 'hu-formal' => '', // not reporting
159 'iu' => '', // ike-*
160 'kk' => '', // kk-*
161 'kk-cn' => '', // kk-arab
162 'kk-kz' => '', // kk-cyrl
163 'kk-tr' => '', // kk-latn
164 'ko-kp' => '', // ko
165 'ku' => '', // ku-*
166 'ku-arab' => '', // ckb
167 'nb' => '', // no
168 'nl-be' => '', // no MW code
169 'nl-informal' => '', // nl, not reporting informal form
170 'ruq' => '', // ruq-*
171 'simple' => '', // en
172 'sr' => '', // sr-*
173 'tg' => '', // tg-*
174 'tp' => '', // tokipona
175 'tt' => '', // tt-*
176 'ug' => '', // ug-*
177 'zh' => '', // zh-*
178 'zh-classical' => '', // lzh
179 'zh-cn' => '', // zh
180 'zh-sg' => '', // zh
181 'zh-hk' => '', // zh
182 'zh-min-nan' => '', // nan
183 'zh-mo' => '', // zh
184 'zh-my' => '', // zh
185 'zh-tw' => '', // zh
186 'zh-yue' => '', // yue
187 ];
188
/**
 * Set up the script description and register every command line option.
 *
 * All options are optional. Options whose second table field is true take
 * a value (--name=value); the others are plain flags.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription(
		'Script to generate statistics about the localisation ' .
		'level of one or more message groups.'
	);

	// Option name => [ help text, takes a value? ], in registration order.
	$optionTable = [
		'groups' => [
			'(optional) Comma separated list of groups',
			true,
		],
		'output' => [
			'(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' .
				'text: Text with tabs. Default: default',
			true,
		],
		'skiplanguages' => [
			'(optional) Comma separated list of languages to be skipped',
			true,
		],
		'skipzero' => [
			'(optional) Skip languages that do not have any localisation at all',
			false,
		],
		'legenddetail' => [
			'(optional) Page name for legend to be transcluded at the top of the details table',
			true,
		],
		'legendsummary' => [
			'(optional) Page name for legend to be transcluded at the top of the summary table',
			true,
		],
		'fuzzy' => [
			'(optional) Add column for fuzzy counts',
			false,
		],
		'speakers' => [
			'(optional) Add column for number of speakers (est.). ' .
				'Only valid when combined with "most"',
			false,
		],
		'nol10n' => [
			'(optional) Do not add localised language name if I18ntags is installed',
			false,
		],
		'continent' => [
			'(optional) Add a continent column. Only available when output is ' .
				'"wiki" or not specified.',
			false,
		],
		'summary' => [
			'(optional) Add a summary with counts and scores per continent category ' .
				'and totals. Only available for a valid "most" value.',
			true,
		],
		'wmfscore' => [
			'Only output WMF language code and weighted score for all ' .
				'language codes for weighing group "wikimedia" in CSV. This ' .
				'report must keep a stable layout as it is used/will be ' .
				'used in the Wikimedia statistics.',
			false,
		],
		'most' => [
			'(optional) "mediawiki" or "wikimedia". Report on the 50 most ' .
				'spoken languages. Skipzero is ignored. If a valid scope is ' .
				'defined, the group list and fuzzy are ignored and the ' .
				'localisation levels are weighted and reported.',
			true,
		],
	];

	foreach ( $optionTable as $optionName => $spec ) {
		// Third argument: none of the options is required.
		$this->addOption( $optionName, $spec[0], false, $spec[1] );
	}

	$this->requireExtension( 'Translate' );
}
271
/**
 * Collect translation statistics for the selected message groups and print
 * them through the chosen output engine.
 *
 * Modes, as driven by the command line options:
 *  - plain: one percentage column per group given with --groups (plus a
 *    fuzzy column per group with --fuzzy);
 *  - --most=<scope>: only the languages listed in $mostSpokenLanguages; when
 *    <scope> is a key of $localisedWeights the groups of that scope are used
 *    and a weighted score column is added;
 *  - --wmfscore: prints only "<wmf code>;<score>;" lines, using the
 *    "wikimedia" weights, averaged over all language codes that map to the
 *    same Wikimedia code.
 *
 * Terminates through fatalError() when none of the requested groups exists.
 */
public function execute() {
	$output = $this->getOption( 'output', 'default' );
	$most = $this->getOption( 'most' );
	$wmfscore = $this->hasOption( 'wmfscore' );

	// Select an output engine. --wmfscore always uses the CSV engine.
	if ( $wmfscore ) {
		$out = new CsvStatsOutput();
	} else {
		switch ( $output ) {
			case 'wiki':
				$out = new WikiStatsOutput();
				break;
			case 'text':
				$out = new TextStatsOutput();
				break;
			case 'csv':
				$out = new CsvStatsOutput();
				break;
			default:
				$out = new TranslateStatsOutput();
		}
	}

	// Language codes to skip, stored as array keys for cheap lookups.
	$skipLanguages = [];
	if ( $this->hasOption( 'skiplanguages' ) ) {
		$skipLanguages = array_fill_keys(
			array_map( 'trim', explode( ',', $this->getOption( 'skiplanguages' ) ) ),
			true
		);
	}

	// Weighing scope: the key of $localisedWeights that provides both the
	// group list and the weights. --wmfscore is defined in terms of the
	// "wikimedia" scope, so it takes precedence; this keeps the groups and
	// the weights from ever coming from two different scopes.
	$scope = null;
	if ( $wmfscore ) {
		$scope = 'wikimedia';
	} elseif ( $most && isset( $this->localisedWeights[$most] ) ) {
		$scope = $most;
	}
	$reportScore = $scope !== null;

	// Fuzzy columns are ignored whenever scores are reported: they have no
	// weight and no header, so they must not end up in the rows either.
	$includeFuzzy = !$reportScore && $this->hasOption( 'fuzzy' );

	// check if l10n should be done
	$l10n = ( $output === 'wiki' || $output === 'default' ) &&
		!$this->hasOption( 'nol10n' );

	// Get groups from input
	if ( $reportScore ) {
		$reqGroups = array_keys( $this->localisedWeights[$scope] );
	} else {
		// Default to '' so that a missing --groups never passes null to explode().
		$reqGroups = array_map( 'trim', explode( ',', $this->getOption( 'groups', '' ) ) );
	}

	// List of all groups
	$allGroups = MessageGroups::singleton()->getGroups();

	// Get list of valid groups. $weights is filled in the same order as
	// $groups, so $weights[$i] always belongs to the i-th score column even
	// when some group of the scope does not exist on this wiki.
	$groups = [];
	$weights = [];
	foreach ( $reqGroups as $reqId ) {
		// Page translation group ids use spaces which are not nice on command line
		$id = str_replace( '_', ' ', $reqId );
		if ( $id === '' ) {
			continue;
		}

		if ( !isset( $allGroups[$id] ) ) {
			$this->output( "Unknown group: $id\n" );
			continue;
		}

		if ( $reportScore && !isset( $groups[$id] ) ) {
			$weights[] = $this->localisedWeights[$scope][$reqId];
		}
		$groups[$id] = $allGroups[$id];
	}

	if ( !count( $groups ) ) {
		$this->fatalError( 'No groups given' );
	}

	// List of all languages.
	$languages = TranslateUtils::getLanguageNames( null );
	// Default sorting order by language code, users can sort wiki output.
	ksort( $languages );

	if ( $this->hasOption( 'legenddetail' ) ) {
		$out->addFreeText( '{{' . $this->getOption( 'legenddetail' ) . "}}\n" );
	}

	// The maximum score is the sum of all weights of the scope.
	$totalWeight = $reportScore ? array_sum( $this->localisedWeights[$scope] ) : 0;

	$showContinent = $this->hasOption( 'continent' );
	$showSpeakers = $most && $this->hasOption( 'speakers' );

	if ( !$wmfscore ) {
		// Output headers
		$out->heading();

		$out->blockstart();

		if ( $most ) {
			$out->element( ( $l10n ? '{{int:translate-gs-pos}}' : 'Pos.' ), true );
		}

		$out->element( ( $l10n ? '{{int:translate-gs-code}}' : 'Code' ), true );
		$out->element( ( $l10n ? '{{int:translate-page-language}}' : 'Language' ), true );
		if ( $showContinent ) {
			$out->element( ( $l10n ? '{{int:translate-gs-continent}}' : 'Continent' ), true );
		}

		if ( $showSpeakers ) {
			$out->element( ( $l10n ? '{{int:translate-gs-speakers}}' : 'Speakers' ), true );
		}

		if ( $reportScore ) {
			$out->element(
				( $l10n ? '{{int:translate-gs-score}}' : 'Score' ) . ' (' . $totalWeight . ')',
				true
			);
		}

		foreach ( $groups as $g ) {
			// Add unprocessed description of group as heading
			if ( $reportScore ) {
				$gid = $g->getId();
				$heading = $g->getLabel() . ' (' . $this->localisedWeights[$scope][$gid] . ')';
			} else {
				$heading = $g->getLabel();
			}
			$out->element( $heading, true );
			if ( $includeFuzzy ) {
				$out->element( ( $l10n ? '{{int:translate-percentage-fuzzy}}' : 'Fuzzy' ), true );
			}
		}

		$out->blockend();
	}

	// Decide once which languages are reported; the statistics pass and the
	// output pass below must agree on this list.
	$reportLanguages = [];
	foreach ( $languages as $code => $name ) {
		// Skip list. NOTE(review): the skip list is not applied when --most is
		// given; kept as found - confirm whether that is intended.
		if ( !$most && isset( $skipLanguages[$code] ) ) {
			continue;
		}

		// Codes mapped to '' are ignored in the Wikimedia report.
		if ( $wmfscore && ( $this->wikimediaCodeMap[$code] ?? null ) === '' ) {
			continue;
		}

		// If --most is set, skip all other
		if ( $most && !isset( $this->mostSpokenLanguages[$code] ) ) {
			continue;
		}

		$reportLanguages[$code] = $name;
	}

	// One list of [ isFuzzy, count, total ] columns per reported language.
	$rows = [];
	foreach ( $reportLanguages as $code => $name ) {
		$rows[$code] = [];
	}

	foreach ( $groups as $groupName => $g ) {
		$stats = MessageGroupStats::forGroup( $groupName );

		// Perform the statistic calculations on every language
		foreach ( $reportLanguages as $code => $name ) {
			$total = $stats[$code][MessageGroupStats::TOTAL];
			$translated = $stats[$code][MessageGroupStats::TRANSLATED];
			$fuzzy = $stats[$code][MessageGroupStats::FUZZY];

			$rows[$code][] = [ false, $translated, $total ];

			if ( $includeFuzzy ) {
				$rows[$code][] = [ true, $fuzzy, $total ];
			}
		}
	}

	// Per continent: [ number of languages, sum of scores ].
	$summarise = $reportScore && $this->hasOption( 'summary' );
	$summary = [];

	// Per Wikimedia code: sum of scores and number of contributing codes.
	$wmfscores = [];

	$skipZero = $this->hasOption( 'skipzero' );

	foreach ( $reportLanguages as $code => $name ) {
		$columns = $rows[$code];

		$allZero = true;
		foreach ( $columns as $fields ) {
			if ( (int)$fields[1] !== 0 ) {
				$allZero = false;
				break;
			}
		}

		// Skip dummy languages if requested
		if ( $allZero && $skipZero ) {
			continue;
		}

		// Calculate the weighted score before anything is printed.
		$score = 0;
		if ( $reportScore ) {
			foreach ( $columns as $i => $fields ) {
				list( , $upper, $total ) = $fields;
				// A group without messages contributes nothing (and must not
				// trigger a division by zero).
				if ( $total ) {
					$score += ( $weights[$i] * $upper ) / $total;
				}
			}

			// Report a round number
			$score = (int)round( $score );

			// Only languages with a known continent can be summarised.
			if ( $summarise && isset( $this->mostSpokenLanguages[$code] ) ) {
				$continent = $this->mostSpokenLanguages[$code][2];
				if ( !isset( $summary[$continent] ) ) {
					$summary[$continent] = [ 0, 0 ];
				}
				$summary[$continent][0]++;
				$summary[$continent][1] += $score;
			}
		}

		if ( $wmfscore ) {
			// Multiple variants can be used for the same wiki.
			// Store the scores and output them later when they can be
			// averaged. Keeping sum and count avoids the rounding drift of
			// a truncated running average.
			$wmfcode = $this->wikimediaCodeMap[$code] ?? explode( '-', $code, 2 )[0];

			if ( !isset( $wmfscores[$wmfcode] ) ) {
				$wmfscores[$wmfcode] = [ 'sum' => 0, 'count' => 0 ];
			}
			$wmfscores[$wmfcode]['sum'] += $score;
			$wmfscores[$wmfcode]['count']++;

			// No table row is printed in this mode.
			continue;
		}

		// Output the row
		$out->blockstart();

		// Fill language position field
		if ( $most ) {
			$out->element( $this->mostSpokenLanguages[$code][0] );
		}

		// Fill language code field
		$out->element( $code );

		// Fill language name field
		if ( $l10n && function_exists( 'efI18nTagsInit' ) ) {
			$out->element( '{{#languagename:' . $code . '}}' );
		} else {
			$out->element( $name );
		}

		// Fill continent field
		if ( $showContinent ) {
			$region = $this->mostSpokenLanguages[$code][2] ?? null;
			if ( $region === null ) {
				// No continent data for this language (possible without --most).
				$continent = '';
			} elseif ( $region === 'multiple' ) {
				$continent = ( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
			} else {
				$continent = $l10n ?
					'{{int:timezoneregion-' . $region . '}}' :
					ucfirst( $region );
			}

			$out->element( $continent );
		}

		// Fill speakers field
		if ( $showSpeakers ) {
			$out->element( number_format( $this->mostSpokenLanguages[$code][1] ) );
		}

		// Fill the score field
		if ( $reportScore ) {
			$out->element( (string)$score );
		}

		// Fill fields for groups
		foreach ( $columns as $fields ) {
			list( $invert, $upper, $total ) = $fields;
			$c = $out->formatPercent( $upper, $total, $invert );
			$out->element( $c );
		}

		$out->blockend();
	}

	$out->footer();

	if ( $summarise ) {
		if ( $this->hasOption( 'legendsummary' ) ) {
			$out->addFreeText( '{{' . $this->getOption( 'legendsummary' ) . "}}\n" );
		}

		$out->summaryheading();

		$out->blockstart();

		$out->element( $l10n ? '{{int:translate-gs-continent}}' : 'Continent', true );
		$out->element( $l10n ? '{{int:translate-gs-count}}' : 'Count', true );
		$out->element( $l10n ? '{{int:translate-gs-avgscore}}' : 'Avg. score', true );

		$out->blockend();

		ksort( $summary );

		$totals = [ 0, 0 ];

		foreach ( $summary as $key => $values ) {
			$out->blockstart();

			if ( $key === 'multiple' ) {
				$out->element( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
			} else {
				$out->element( $l10n ? '{{int:timezoneregion-' . $key . '}}' : ucfirst( $key ) );
			}
			$out->element( $values[0] );
			$out->element( number_format( $values[1] / $values[0] ) );

			$out->blockend();

			$totals[0] += $values[0];
			$totals[1] += $values[1];
		}

		$out->blockstart();
		$out->element( $l10n ? '{{int:translate-gs-total}}' : 'Total' );
		$out->element( $totals[0] );
		// Nothing may have been summarised at all; report 0 instead of dividing by zero.
		$out->element( number_format( $totals[0] ? $totals[1] / $totals[0] : 0 ) );
		$out->blockend();

		$out->footer();
	}

	// Custom output
	if ( $wmfscore ) {
		ksort( $wmfscores );

		foreach ( $wmfscores as $code => $stats ) {
			echo $code . ';' . number_format( $stats['sum'] / $stats['count'] ) . ";\n";
		}
	}
}
668}
669
// Name the maintenance class defined in this file and include the file that
// the RUN_MAINTENANCE_IF_MAIN constant points to.
// NOTE(review): presumably that file runs $maintClass when this script is the
// entry point - it is defined outside this file, confirm there.
670$maintClass = GroupStatistics::class;
671require_once RUN_MAINTENANCE_IF_MAIN;
$wikimediaCodeMap
Code map to map localisation codes to Wikimedia project codes.
$mostSpokenLanguages
Array of the most spoken languages in the world.
$localisedWeights
Variable with key-value pairs with a named index and an array of key-value pairs where the key is a message group ID and the value is the weight of that group.
const TOTAL
Array index.
const FUZZY
Array index.
const TRANSLATED
Array index.
static forGroup( $id, $flags=0)
Returns stats for all languages in given group.
Provides heading, summaryheading and free text addition for stats output in wiki format.
static getLanguageNames( $code)
Get translated language names for the languages generally supported for translation in the current wiki.