Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
GettextFFS.php
Go to the documentation of this file.
1<?php
17use MediaWiki\Logger\LoggerFactory;
18use MediaWiki\MediaWikiServices;
19
/**
 * @var bool Set by read() for the duration of a single read and handed to
 * parseGettextData(), where it decides whether a fuzzy header may switch
 * parsing into pot mode. Always reset to false when read() finishes.
 */
private $allowPotMode = false;
/**
 * @var bool Set through setOfflineMode(). When true, the export adds the
 * message key as msgctxt, documentation comments, and the X-Language-Code /
 * X-Message-Group / POT-Creation-Date headers.
 */
protected $offlineMode = false;
27
/**
 * Query the capabilities of this FFS regarding fuzzy translations.
 *
 * @return string Always the string 'yes'.
 */
public function supportsFuzzy() {
	$fuzzySupport = 'yes';

	return $fuzzySupport;
}
31
/**
 * Return the commonly used file extensions for the Gettext format.
 *
 * @return string[] Template extension first, then translation extension.
 */
public function getFileExtensions() {
	$extensions = [ '.pot', '.po' ];

	return $extensions;
}
35
/**
 * Switch offline mode on or off for subsequent exports.
 *
 * The value is stored as-is; the writing code only checks its truthiness.
 *
 * @param bool $value
 */
public function setOfflineMode( $value ) {
	$this->offlineMode = $value;
}
40
/**
 * @inheritDoc
 *
 * Pot mode is only permitted while reading the group's source language, and
 * the permission is always withdrawn again before this method returns or
 * throws.
 */
public function read( $code ) {
	// This is somewhat hacky, but pot mode should only ever be used for the source language.
	// See https://phabricator.wikimedia.org/T230361
	$sourceLanguage = $this->getGroup()->getSourceLanguage();
	$this->allowPotMode = ( $sourceLanguage === $code );

	try {
		$result = parent::read( $code );
	} finally {
		// Runs on success and on exception alike
		$this->allowPotMode = false;
	}

	return $result;
}
53
/**
 * Parse the contents of a Gettext file.
 *
 * @param string $data Raw file contents.
 * @return array The structure returned by parseGettext(), with an added
 *  AUTHORS list and with empty-string entries removed from MESSAGES.
 */
public function readFromVariable( $data ) {
	// Authors are stored as "# Author: Name" comment lines
	$authorMatches = [];
	preg_match_all( '/^#\s*Author:\s*(.*)$/m', $data, $authorMatches );

	// Messages and everything else
	$result = $this->parseGettext( $data );
	$result['AUTHORS'] = $authorMatches[1];

	// Entries whose value is the empty string are dropped; keys are preserved
	$result['MESSAGES'] = array_filter(
		$result['MESSAGES'],
		static function ( $translation ) {
			return $translation !== '';
		}
	);

	return $result;
}
76
/**
 * Parse Gettext data using this instance's group and FILES configuration.
 *
 * Reads the optional 'CtxtAsKey' (default false) and 'keyAlgorithm'
 * (default 'simple') settings from $this->extra and delegates to
 * parseGettextData().
 *
 * @param string $data Raw file contents.
 * @return array See parseGettextData().
 */
public function parseGettext( $data ) {
	return self::parseGettextData(
		$data,
		$this->extra['CtxtAsKey'] ?? false,
		$this->group->getMangler(),
		$this->extra['keyAlgorithm'] ?? 'simple',
		$this->allowPotMode
	);
}
87
/**
 * Parses gettext file as string into internal representation.
 *
 * @param string $data Raw file contents.
 * @param bool $useCtxtAsKey Use msgctxt as the message key instead of a
 *  key generated by generateKeyFromItem().
 * @param StringMangler $mangler Applied to every message key.
 * @param string $keyAlgorithm Passed on to generateKeyFromItem().
 * @param bool $allowPotMode Whether a fuzzy header may switch parsing into
 *  pot mode, where the msgid is used as the message value.
 * @return array Array with keys MESSAGES (key => text) and EXTRA, the
 *  latter holding TEMPLATE (key => parsed section), METADATA and HEADERS.
 * @throws GettextParseException If the first section has no msgstr block.
 */
public static function parseGettextData(
	$data,
	$useCtxtAsKey,
	StringMangler $mangler,
	$keyAlgorithm,
	bool $allowPotMode
) {
	// Normalise newlines, to make processing easier
	$data = str_replace( "\r\n", "\n", $data );

	/* Delimit the file into sections, which are separated by two newlines.
	 * We are permissive and accept more than two. This parsing method isn't
	 * efficient wrt memory, but was easy to implement */
	$sections = preg_split( '/\n{2,}/', $data );

	/* First one isn't an actual message: it is the header, where msgid is
	 * empty and only the tags in msgstr are of interest. */
	$headerSection = array_shift( $sections );
	$headerText = self::expectKeyword( 'msgstr', $headerSection );
	if ( $headerText === null ) {
		$message = "Gettext file header was not found:\n\n$data";
		throw new GettextParseException( $message );
	}

	$headers = self::parseHeaderTags( self::formatForWiki( $headerText, 'trim' ) );

	// A fuzzy header marks a template file, but pot mode is used only when the caller allows it
	$headerIsFuzzy = in_array( 'fuzzy', self::parseFlags( $headerSection ), true );
	$potmode = $headerIsFuzzy ? $allowPotMode : false;

	// Extract some metadata from headers for easier use
	$metadata = [];
	if ( isset( $headers['X-Language-Code'] ) ) {
		$metadata['code'] = $headers['X-Language-Code'];
	}

	if ( isset( $headers['X-Message-Group'] ) ) {
		$metadata['group'] = $headers['X-Message-Group'];
	}

	/* At this stage we are only interested how many plurals forms we should
	 * be expecting when parsing the rest of this file. */
	$pluralCount = false;
	if ( $potmode ) {
		$pluralCount = 2;
	} elseif ( isset( $headers['Plural-Forms'] ) ) {
		$pluralCount = GettextPlural::getPluralCount( $headers['Plural-Forms'] );
	}

	$metadata['plural'] = $pluralCount;

	// Then parse the messages
	$template = [];
	$messages = [];
	foreach ( $sections as $section ) {
		$item = self::parseGettextSection( $section, $pluralCount );
		if ( $item === false ) {
			continue;
		}

		if ( $useCtxtAsKey ) {
			// parseGettextSection() always sets 'ctxt' and uses false for "no msgctxt",
			// so isset() cannot detect a missing context; compare against false.
			if ( ( $item['ctxt'] ?? false ) === false ) {
				error_log( "ctxt missing for: $section" );
				continue;
			}
			$key = $item['ctxt'];
		} else {
			$key = self::generateKeyFromItem( $item, $keyAlgorithm );
		}

		$key = $mangler->mangle( $key );
		$messages[$key] = $potmode ? $item['id'] : $item['str'];
		$template[$key] = $item;
	}

	return [
		'MESSAGES' => $messages,
		'EXTRA' => [
			'TEMPLATE' => $template,
			'METADATA' => $metadata,
			'HEADERS' => $headers,
		],
	];
}
191
/**
 * Parse a single message section of a Gettext file.
 *
 * @param string $section Text of one section (the file split on blank lines).
 * @param int|false $pluralCount Number of msgstr[N] forms to read for plural
 *  messages; with false no forms are read.
 * @return array|false False for blank or obsolete (#~) sections, otherwise an
 *  array with keys 'ctxt' (string, or false when absent), 'id', 'str',
 *  'flags' (without 'fuzzy') and 'comments' (marker character => lines).
 * @throws MWException If msgid, or msgstr of a non-plural message, is missing.
 */
public static function parseGettextSection( $section, $pluralCount ) {
	if ( trim( $section ) === '' ) {
		return false;
	}

	/* These inactive sections are of no interest to us. Multiline mode
	 * is needed because there may be flags or other annoying stuff
	 * before the commented out sections.
	 */
	if ( preg_match( '/^#~/m', $section ) ) {
		return false;
	}

	$item = [
		'ctxt' => false,
		'id' => '',
		'str' => '',
		'flags' => [],
		'comments' => [],
	];

	$match = self::expectKeyword( 'msgid', $section );
	if ( $match === null ) {
		throw new MWException( "Unable to parse msgid:\n\n$section" );
	}
	$item['id'] = self::formatForWiki( $match );

	$match = self::expectKeyword( 'msgctxt', $section );
	if ( $match !== null ) {
		$item['ctxt'] = self::formatForWiki( $match );
	}

	$match = self::expectKeyword( 'msgid_plural', $section );
	if ( $match !== null ) {
		// Plural message: singular and plural definitions are flattened into one id
		$plural = self::formatForWiki( $match );
		$item['id'] = GettextPlural::flatten( [ $item['id'], $plural ] );

		// Keep the translation empty if no form has translation
		$pluralMessageText = self::processGettextPluralMessage( $pluralCount, $section );
		if ( $pluralMessageText !== '' ) {
			$item['str'] = $pluralMessageText;
		}
	} else {
		$match = self::expectKeyword( 'msgstr', $section );
		if ( $match === null ) {
			throw new MWException( "Unable to parse msgstr:\n\n$section" );
		}
		$item['str'] = self::formatForWiki( $match );
	}

	// Parse flags; fuzzy is carried in the translation text rather than in the flag list
	$flags = self::parseFlags( $section );
	foreach ( $flags as $key => $flag ) {
		if ( $flag === 'fuzzy' ) {
			$item['str'] = TRANSLATE_FUZZY . $item['str'];
			unset( $flags[$key] );
		}
	}
	$item['flags'] = $flags;

	// Rest of the comments, grouped by the marker character that follows '#'
	$matches = [];
	if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) {
		foreach ( $matches as $match ) {
			$type = $match[1];
			$text = $match[2];

			// Flag lines were handled above
			if ( $type === ',' ) {
				continue;
			}

			// "[Wiki]" documentation lines are written by formatDocumentation() on
			// export and must not be imported as comments. The prefix is part of the
			// comment text, not of the one-character marker.
			if ( strpos( $text, '[Wiki]' ) === 0 ) {
				continue;
			}

			$item['comments'][$type][] = $text;
		}
	}

	return $item;
}
271
/**
 * Collect the msgstr[N] forms of a plural message.
 *
 * @param int|false $pluralCount Number of forms to look for.
 * @param string $section Text of the message section.
 * @return string The forms combined with GettextPlural::flatten(), or an
 *  empty string when every form is empty or missing.
 */
public static function processGettextPluralMessage( $pluralCount, $section ) {
	$forms = [];
	$hasTranslation = false;

	for ( $i = 0; $i < $pluralCount; $i++ ) {
		$raw = self::expectKeyword( "msgstr\\[$i\\]", $section );

		if ( $raw === null ) {
			// Missing forms are kept as empty placeholders so indexes stay aligned
			$forms[] = '';
			error_log( "Plural $i not found, expecting total of $pluralCount for $section" );
			continue;
		}

		$form = self::formatForWiki( $raw );
		$forms[] = $form;
		if ( strlen( $form ) > 0 ) {
			$hasTranslation = true;
		}
	}

	return $hasTranslation ? GettextPlural::flatten( $forms ) : '';
}
292
/**
 * Extract the flags from the first "#," line of a section.
 *
 * @param string $section Text of the section.
 * @return string[] Trimmed flag names, or an empty array if there is no
 *  flag line.
 */
public static function parseFlags( $section ) {
	$found = [];
	if ( !preg_match( '/^#,(.*)$/mu', $section, $found ) ) {
		return [];
	}

	$rawFlags = explode( ',', $found[1] );

	return array_map( 'trim', $rawFlags );
}
301
/**
 * Find the quoted text block that follows a keyword such as msgid, msgstr,
 * msgid_plural or msgctxt.
 *
 * @param string $name Keyword; inserted into the regular expression
 *  unescaped, so callers must escape special characters themselves.
 * @param string $section Text of the section.
 * @return string|null The raw block including quotes and continuation
 *  lines, or null if the keyword was not found.
 */
public static function expectKeyword( $name, $section ) {
	// One quoted string on the keyword line, then any number of quoted continuation lines
	$poformat = '".*"\n?(^".*"$\n?)*';

	$found = [];
	$matched = preg_match( "/^$name\s($poformat)/mx", $section, $found );

	return $matched ? $found[1] : null;
}
315
/**
 * Generates unique key for each message.
 *
 * The key is "<hash>-<snippet>": a SHA-1 of the context and id, followed by
 * the id truncated to 30 with spaces replaced by underscores.
 *
 * @param array $item Parsed section with at least 'id' and 'ctxt'.
 * @param string $algorithm 'simple' shortens the hash to six characters;
 *  any other value selects the legacy behaviour (full hash, snippet reduced
 *  to title-legal characters).
 * @return string
 */
public static function generateKeyFromItem( array $item, $algorithm = 'simple' ) {
	$lang = MediaWikiServices::getInstance()->getLanguageFactory()->getLanguage( 'en' );

	/* Messages with msgctxt as empty string should be different
	 * from messages without any msgctxt. To avoid BC break make
	 * the empty ctxt a special case */
	$hashInput = $item['ctxt'] === ''
		? $item['id'] . 'MSGEMPTYCTXT'
		: $item['ctxt'] . $item['id'];
	$hash = sha1( $hashInput );

	if ( $algorithm === 'simple' ) {
		$hash = substr( $hash, 0, 6 );
		$snippet = $item['id'];
	} else { // legacy
		$legalChars = Title::legalChars();
		$snippet = preg_replace( "/[^$legalChars]/", ' ', $item['id'] );
		$snippet = preg_replace( "/[:&%\/_]/", ' ', $snippet );
		$snippet = preg_replace( '/ {2,}/', ' ', $snippet );
	}

	// Both algorithms finish the snippet the same way
	$snippet = $lang->truncateForDatabase( $snippet, 30, '' );
	$snippet = str_replace( ' ', '_', trim( $snippet ) );

	return "$hash-$snippet";
}
351
/**
 * This parses the Gettext text block format: removes the surrounding quotes
 * of each line, joins the lines and resolves C-style escapes.
 *
 * @param string $data Raw block as returned by expectKeyword().
 * @param string $whitespace What to do with trailing whitespace: 'mark'
 *  appends a backslash, 'trim' removes the whitespace.
 * @return string
 * @throws MWException For any other $whitespace value, but only when the
 *  text actually ends in whitespace.
 */
public static function formatForWiki( $data, $whitespace = 'mark' ) {
	$quotePattern = '/(^"|"$\n?)/m';
	$text = stripcslashes( preg_replace( $quotePattern, '', $data ) );

	// Nothing more to do unless the text ends in whitespace
	if ( !preg_match( '/\s$/', $text ) ) {
		return $text;
	}

	if ( $whitespace === 'mark' ) {
		return $text . '\\';
	}

	if ( $whitespace === 'trim' ) {
		return rtrim( $text );
	}

	// @todo Only triggered if there is trailing whitespace
	throw new MWException( 'Unknown action for whitespace' );
}
380
/**
 * Split the header block into "Name: value" tags.
 *
 * Lines without a colon cannot be split into a name and a value. They are
 * reported with error_log() (blank lines silently) and skipped, instead of
 * being destructured anyway, which read an undefined offset and stored a
 * meaningless tag.
 *
 * @param string $headers Header text, one tag per line.
 * @return string[] Map of trimmed tag name to trimmed value.
 */
public static function parseHeaderTags( $headers ) {
	$tags = [];
	foreach ( explode( "\n", $headers ) as $line ) {
		if ( strpos( $line, ':' ) === false ) {
			if ( trim( $line ) !== '' ) {
				error_log( __METHOD__ . ": $line" );
			}
			continue;
		}

		// Limit of 2 keeps colons inside the value intact
		[ $key, $value ] = explode( ':', $line, 2 );
		$tags[trim( $key )] = trim( $value );
	}

	return $tags;
}
393
/**
 * Produce the Gettext file contents for a message collection.
 *
 * Comments, flags and contexts are taken from the existing file of the
 * group's source language (the template) and from the existing file of the
 * target language, when those can be read.
 *
 * @param MessageCollection $collection
 * @return string
 */
protected function writeReal( MessageCollection $collection ) {
	// The template is the file of the group's source language, which is not necessarily 'en'
	$sourceLanguage = $this->group->getSourceLanguage();
	// ?: rather than ?? so that any falsy "nothing read" result becomes an empty array
	$pot = $this->read( $sourceLanguage ) ?: [];
	$code = $collection->code;
	$template = $this->read( $code ) ?: [];
	$output = $this->doGettextHeader( $collection, $template['EXTRA'] ?? [] );

	$pluralRule = GettextPlural::getPluralRule( $code );
	if ( !$pluralRule ) {
		// Fall back to the English rule so that plural messages can still be exported
		$pluralRule = GettextPlural::getPluralRule( 'en' );
		LoggerFactory::getInstance( 'Translate' )->warning(
			"T235180: Missing Gettext plural rule for '{languagecode}'",
			[ 'languagecode' => $code ]
		);
	}
	$pluralCount = GettextPlural::getPluralCount( $pluralRule );

	foreach ( $collection as $key => $m ) {
		$transTemplate = $template['EXTRA']['TEMPLATE'][$key] ?? [];
		$potTemplate = $pot['EXTRA']['TEMPLATE'][$key] ?? [];

		$output .= $this->formatMessageBlock( $key, $m, $transTemplate, $potTemplate, $pluralCount );
	}

	return $output;
}
421
/**
 * Build the comment block and the header entry (empty msgid) of the file.
 *
 * Header fields start from the HEADERS of the previously parsed file, if
 * any, and are then overwritten or extended in a fixed order, which is also
 * the order in which they are written.
 *
 * @param MessageCollection $collection
 * @param array $template The EXTRA part of a previous parse; only HEADERS
 *  is used here.
 * @return string Header text ending in a blank line.
 */
protected function doGettextHeader( MessageCollection $collection, $template ) {
	global $wgSitename;

	$code = $collection->code;
	$name = Utilities::getLanguageName( $code );
	$native = Utilities::getLanguageName( $code, $code );
	$authors = $this->doAuthors( $collection );
	$extra = isset( $this->extra['header'] )
		? "# --\n" . $this->extra['header']
		: '';

	$output =
		<<<EOT
# Translation of {$this->group->getLabel()} to $name ($native)
# Exported from $wgSitename
#
$authors$extra
EOT;

	// Make sure there is no empty line before msgid
	$output = trim( $output ) . "\n";

	$specs = $template['HEADERS'] ?? [];

	$exportTime = self::formatTime( wfTimestampNow() );
	$specs['PO-Revision-Date'] = $exportTime;
	if ( $this->offlineMode ) {
		$specs['POT-Creation-Date'] = $exportTime;
	} elseif ( $this->group instanceof MessageGroupBase ) {
		$specs['X-POT-Import-Date'] = self::formatTime( wfTimestamp( TS_MW, $this->getPotTime() ) );
	}
	$specs['Content-Type'] = 'text/plain; charset=UTF-8';
	$specs['Content-Transfer-Encoding'] = '8bit';
	$specs['Language'] = LanguageCode::bcp47( $this->group->mapCode( $code ) );
	// Hook handlers receive $specs by reference; the fields below are set afterwards
	Hooks::run( 'Translate:GettextFFS:headerFields', [ &$specs, $this->group, $code ] );
	$specs['X-Generator'] = $this->getGenerator();

	if ( $this->offlineMode ) {
		$specs['X-Language-Code'] = $code;
		$specs['X-Message-Group'] = $this->group->getId();
	}

	$specs['Plural-Forms'] = GettextPlural::getPluralRule( $code )
		?: GettextPlural::getPluralRule( 'en' );

	// The header entry: empty msgid, msgstr made of one quoted line per field
	$output .= "msgid \"\"\nmsgstr \"\"\n\"\"\n";

	foreach ( $specs as $field => $fieldValue ) {
		$output .= self::escape( "$field: $fieldValue\n" ) . "\n";
	}

	return $output . "\n";
}
481
/**
 * Format the collection's authors as "# Author: ..." comment lines.
 *
 * @param MessageCollection $collection
 * @return string One line per author left after filterAuthors(); empty
 *  string if there are none.
 */
protected function doAuthors( MessageCollection $collection ) {
	$authors = $this->filterAuthors( $collection->getAuthors(), $collection->code );

	$lines = '';
	foreach ( $authors as $author ) {
		$lines .= "# Author: $author\n";
	}

	return $lines;
}
493
/**
 * Format one message as a Gettext entry: comments, flags, msgctxt, msgid
 * and msgstr (or their plural variants), followed by a blank line.
 *
 * @param string $key Message key; written as msgctxt in offline mode.
 * @param object $m Message providing getTags(), definition() and
 *  translation().
 * @param array $trans Parsed entry of the existing translation file, or [].
 * @param array $pot Parsed entry of the template file, or []. Takes
 *  precedence over $trans for comments, flags and context.
 * @param int $pluralCount Number of plural forms of the target language.
 * @return string
 */
protected function formatMessageBlock( $key, $m, $trans, $pot, $pluralCount ) {
	$header = $this->formatDocumentation( $key );
	$content = '';

	$comments = self::chainGetter( 'comments', $pot, $trans, [] );
	foreach ( $comments as $type => $typecomments ) {
		foreach ( $typecomments as $comment ) {
			$header .= "#$type $comment\n";
		}
	}

	$flags = self::chainGetter( 'flags', $pot, $trans, [] );
	$flags = array_merge( $m->getTags(), $flags );

	if ( $this->offlineMode ) {
		$content .= 'msgctxt ' . self::escape( $key ) . "\n";
	} else {
		$ctxt = self::chainGetter( 'ctxt', $pot, $trans, false );
		if ( $ctxt !== false ) {
			$content .= 'msgctxt ' . self::escape( $ctxt ) . "\n";
		}
	}

	$msgid = $m->definition();
	// Presumably translation() is null for untranslated messages (verify against the
	// message class); normalise to '' so the string functions below never get null.
	$msgstr = $m->translation() ?? '';
	if ( strpos( $msgstr, TRANSLATE_FUZZY ) !== false ) {
		$msgstr = str_replace( TRANSLATE_FUZZY, '', $msgstr );
		// Might also be marked fuzzy in the file; duplicates are removed below
		$flags[] = 'fuzzy';
	}

	if ( GettextPlural::hasPlural( $msgid ) ) {
		$forms = GettextPlural::unflatten( $msgid, 2 );
		$content .= 'msgid ' . self::escape( $forms[0] ) . "\n";
		$content .= 'msgid_plural ' . self::escape( $forms[1] ) . "\n";

		try {
			$forms = GettextPlural::unflatten( $msgstr, $pluralCount );
			foreach ( $forms as $index => $form ) {
				$content .= "msgstr[$index] " . self::escape( $form ) . "\n";
			}
		} catch ( GettextPluralException $e ) {
			// Export empty forms and flag the entry rather than failing the whole export
			$flags[] = 'invalid-plural';
			for ( $i = 0; $i < $pluralCount; $i++ ) {
				$content .= "msgstr[$i] \"\"\n";
			}
		}
	} else {
		$content .= 'msgid ' . self::escape( $msgid ) . "\n";
		$content .= 'msgstr ' . self::escape( $msgstr ) . "\n";
	}

	if ( $flags ) {
		sort( $flags );
		$header .= '#, ' . implode( ', ', array_unique( $flags ) ) . "\n";
	}

	// An entry without any comment lines still gets a bare "#" line
	$output = $header ?: "#\n";
	$output .= $content . "\n";

	return $output;
}
564
/**
 * Return the first set, non-null value for a key from two arrays.
 *
 * @param string $key
 * @param array $a Checked first.
 * @param array $b Checked second.
 * @param mixed $default Returned when neither array has the key.
 * @return mixed
 */
protected static function chainGetter( $key, $a, $b, $default ) {
	foreach ( [ $a, $b ] as $source ) {
		if ( isset( $source[$key] ) ) {
			return $source[$key];
		}
	}

	return $default;
}
575
/**
 * Format a timestamp the way Gettext date headers expect it, with a fixed
 * +0000 offset.
 *
 * @param string $time Timestamp accepted by Language::sprintfDate().
 * @return string
 */
protected static function formatTime( $time ) {
	$languageFactory = MediaWikiServices::getInstance()->getLanguageFactory();
	$english = $languageFactory->getLanguage( 'en' );

	return $english->sprintfDate( 'xnY-xnm-xnd xnH:xni:xns+0000', $time );
}
581
/**
 * Timestamp of the group's message cache for its source language.
 *
 * @return string The cache timestamp, or the current time if the cache
 *  does not exist.
 */
protected function getPotTime() {
	$sourceLanguage = $this->group->getSourceLanguage();
	$cache = $this->group->getMessageGroupCache( $sourceLanguage );

	if ( $cache->exists() ) {
		return $cache->getTimestamp();
	}

	return wfTimestampNow();
}
587
/**
 * Value for the X-Generator header: MediaWiki and Translate versions.
 *
 * @return string
 */
protected function getGenerator() {
	return sprintf(
		'MediaWiki %s; Translate %s',
		SpecialVersion::getVersion(),
		Utilities::getVersion()
	);
}
592
/**
 * Message documentation as "#. [Wiki] ..." comment lines.
 *
 * @param string $key Message key.
 * @return string Empty unless offline mode is on, a documentation language
 *  is configured, and the documentation content is a string.
 */
protected function formatDocumentation( $key ) {
	global $wgTranslateDocumentationLanguageCode;

	if ( !$this->offlineMode || !$wgTranslateDocumentationLanguageCode ) {
		return '';
	}

	$documentation = Utilities::getMessageContent(
		$key,
		$wgTranslateDocumentationLanguageCode,
		$this->group->getNamespace()
	);
	if ( !is_string( $documentation ) ) {
		return '';
	}

	// explode() always yields at least one line, so every line gets the prefix
	$prefix = '#. [Wiki] ';

	return $prefix . implode( "\n" . $prefix, explode( "\n", $documentation ) ) . "\n";
}
618
/**
 * Turn a string into a quoted Gettext string literal.
 *
 * @param string $line
 * @return string The text in double quotes, with backslashes and quotes
 *  escaped and newlines written as \n.
 */
protected static function escape( $line ) {
	// There may be \ as a last character, for keeping trailing whitespace
	$unmarked = preg_replace( '/(\s)\\\\$/', '\1', $line );
	$escaped = str_replace( "\n", '\n', addcslashes( $unmarked, '\\"' ) );

	return '"' . $escaped . '"';
}
628
/**
 * Allows to skip writing the export output into a file: two outputs that
 * differ only in their "*-Date" header lines are treated as the same.
 *
 * @param string $a
 * @param string $b
 * @return bool True if the contents differ after removing the date lines.
 */
public function shouldOverwrite( $a, $b ) {
	$regex = '/^"(.+)-Date: \d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\+\d\d\d\d\\\\n"$/m';

	// preg_replace() accepts an array of subjects and returns them in the same order
	$stripped = preg_replace( $regex, '', [ $a, $b ] );

	return $stripped[0] !== $stripped[1];
}
637
/**
 * Return a data structure that will be merged with the base schema. It
 * describes the FILES options read by this class: header, keyAlgorithm and
 * CtxtAsKey.
 *
 * @return array
 */
public static function getExtraSchema(): array {
	$fileOptions = [
		'header' => [
			'_type' => 'text',
		],
		'keyAlgorithm' => [
			'_type' => 'enum',
			'_values' => [ 'simple', 'legacy' ],
		],
		'CtxtAsKey' => [
			'_type' => 'boolean',
		],
	];

	return [
		'root' => [
			'_type' => 'array',
			'_children' => [
				'FILES' => [
					'_type' => 'array',
					'_children' => $fileOptions,
				],
			],
		],
	];
}
664
/**
 * Compare two message texts, treating plural messages as equal when their
 * individual plural forms are equal.
 *
 * @param string $a
 * @param string $b
 * @return bool
 */
public function isContentEqual( $a, $b ) {
	if ( $a === $b ) {
		return true;
	}

	try {
		$parsedA = GettextPlural::parsePluralForms( $a );
		$parsedB = GettextPlural::parsePluralForms( $b );
	} catch ( GettextPluralException $e ) {
		// Something failed, invalid syntax?
		return false;
	}

	$expectedPluralCount = count( $parsedA[1] );

	// if they have the different number of plural forms, just fail
	if ( $expectedPluralCount !== count( $parsedB[1] ) ) {
		return false;
	}

	// GettextPlural::unflatten() will return an empty array when $expectedPluralCount is 0
	// So if they do not have translations and are different strings, they are not equal
	if ( $expectedPluralCount === 0 ) {
		return false;
	}

	$formsA = GettextPlural::unflatten( $a, $expectedPluralCount );
	$formsB = GettextPlural::unflatten( $b, $expectedPluralCount );

	return $formsA === $formsB;
}
695}
New-style FFS class that implements support for gettext file format.
static generateKeyFromItem(array $item, $algorithm='simple')
Generates unique key for each message.
getFileExtensions()
Return the commonly used file extensions for these formats.
readFromVariable( $data)
setOfflineMode( $value)
static parseGettextData( $data, $useCtxtAsKey, StringMangler $mangler, $keyAlgorithm, bool $allowPotMode)
Parses gettext file as string into internal representation.
supportsFuzzy()
Query the capabilities of this FFS.
static formatForWiki( $data, $whitespace='mark')
This parses the Gettext text block format.
writeReal(MessageCollection $collection)
read( $code)
@inheritDoc
Exception thrown when a Gettext file could not be parsed, such as when missing required headers.
Identifies Gettext plural exceptions.
This file contains the class for core message collections implementation.
getAuthors()
Lists all translators that have contributed to the latest revisions of each translation.
Essentially random collection of helper functions, similar to GlobalFunctions.php.
Definition Utilities.php:30
This class implements some basic functions that wrap around the YAML message group configurations.
filterAuthors(array $authors, $code)
Remove excluded authors.
shouldOverwrite( $a, $b)
Allows to skip writing the export output into a file.
$extra
Stores the FILES section of the YAML configuration, which can be accessed for extra FFS class specific options.
Definition SimpleFFS.php:36
Message groups are usually configured in YAML, though the actual storage format does not matter,...
static getExtraSchema()
Return a data structure that will be merged with the base schema.
Interface that key-mangling classes must implement.