Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
GettextFFS.php
Go to the documentation of this file.
1<?php
14use MediaWiki\Logger\LoggerFactory;
15
21 private $allowPotMode = false;
22 protected $offlineMode = false;
23
/**
 * Query the fuzzy-handling capability of this file format.
 *
 * @return string Always the literal string 'yes'.
 */
public function supportsFuzzy() {
	$capability = 'yes';

	return $capability;
}
27
/**
 * Return the commonly used file extensions for this format.
 *
 * @return string[] Template extension first, then the translation extension.
 */
public function getFileExtensions() {
	$extensions = [];
	$extensions[] = '.pot';
	$extensions[] = '.po';

	return $extensions;
}
31
/**
 * Store the offline-mode flag. Other methods of this class read it to
 * decide which headers and msgctxt values to write on export.
 *
 * @param mixed $value New flag value; stored as given (presumably a bool).
 */
public function setOfflineMode( $value ) {
	$mode = $value;
	$this->offlineMode = $mode;
}
36
/**
 * Read the file for the given language through the parent implementation,
 * allowing pot mode only while the group's source language is being read.
 *
 * @param string $code Language code to read.
 * @return mixed Whatever parent::read() returns.
 */
public function read( $code ) {
	// This is somewhat hacky, but pot mode should only ever be used for the source language.
	// See https://phabricator.wikimedia.org/T230361
	$sourceLanguage = $this->getGroup()->getSourceLanguage();
	$this->allowPotMode = ( $sourceLanguage === $code );

	try {
		$contents = parent::read( $code );
	} finally {
		// Always reset the flag, even when the parent reader throws.
		$this->allowPotMode = false;
	}

	return $contents;
}
49
/**
 * Parse raw gettext file contents into the internal array format.
 *
 * Adds an 'AUTHORS' entry collected from "# Author:" comment lines and
 * removes every message whose parsed value is the empty string.
 *
 * @param string $data Raw file contents.
 * @return array Parsed data as produced by parseGettext(), plus 'AUTHORS'.
 */
public function readFromVariable( $data ) {
	# Authors first
	$authorMatches = [];
	preg_match_all( '/^#\s*Author:\s*(.*)$/m', $data, $authorMatches );
	$authors = $authorMatches[1];

	# Then messages and everything else
	$result = $this->parseGettext( $data );
	$result['AUTHORS'] = $authors;

	# Drop entries that have no value; array_filter keeps the message keys
	$result['MESSAGES'] = array_filter(
		$result['MESSAGES'],
		static function ( $translation ) {
			return $translation !== '';
		}
	);

	return $result;
}
72
/**
 * Parse gettext data using this instance's group mangler and the
 * 'CtxtAsKey' / 'keyAlgorithm' settings found in $this->extra.
 *
 * @param string $data Raw file contents.
 * @return array Result of parseGettextData().
 */
public function parseGettext( $data ) {
	$mangler = $this->group->getMangler();

	// Both settings are optional; fall back to the defaults when absent.
	$useCtxtAsKey = $this->extra['CtxtAsKey'] ?? false;
	$keyAlgorithm = $this->extra['keyAlgorithm'] ?? 'simple';

	return self::parseGettextData(
		$data,
		$useCtxtAsKey,
		$mangler,
		$keyAlgorithm,
		$this->allowPotMode
	);
}
83
/**
 * Parses gettext file as string into internal representation.
 *
 * @param string $data Raw file contents.
 * @param bool $useCtxtAsKey Use each item's msgctxt as the message key
 *  instead of generating a key from the item.
 * @param StringMangler $mangler Applied to every key before it is stored.
 * @param string $keyAlgorithm Passed on to generateKeyFromItem().
 * @param bool $allowPotMode Whether a fuzzy header may switch on pot mode,
 *  in which msgids are used as the message values.
 * @return array Array with keys 'MESSAGES' (key => text) and 'EXTRA'
 *  (containing 'TEMPLATE', 'METADATA' and 'HEADERS').
 * @throws GettextParseException When the header section has no msgstr.
 */
public static function parseGettextData(
	$data,
	$useCtxtAsKey,
	StringMangler $mangler,
	$keyAlgorithm,
	bool $allowPotMode
) {
	$potmode = false;

	// Normalise newlines, to make processing easier
	$data = str_replace( "\r\n", "\n", $data );

	/* Delimit the file into sections, which are separated by two newlines.
	 * We are permissive and accept more than two. This parsing method isn't
	 * efficient wrt memory, but was easy to implement */
	$sections = preg_split( '/\n{2,}/', $data );

	/* First one isn't an actual message. Since this is the header section,
	 * we are only interested in the tags and msgid is empty. Somewhere we
	 * should extract the header comments too */
	$headerSection = array_shift( $sections );
	$match = self::expectKeyword( 'msgstr', $headerSection );
	if ( $match === null ) {
		$message = "Gettext file header was not found:\n\n$data";
		throw new GettextParseException( $message );
	}

	$headerBlock = self::formatForWiki( $match, 'trim' );
	$headers = self::parseHeaderTags( $headerBlock );

	// Check for pot-mode by checking if the header is fuzzy
	$flags = self::parseFlags( $headerSection );
	if ( in_array( 'fuzzy', $flags, true ) ) {
		$potmode = $allowPotMode;
	}

	$template = [];
	$messages = [];

	// Extract some metadata from headers for easier use
	$metadata = [];
	if ( isset( $headers['X-Language-Code'] ) ) {
		$metadata['code'] = $headers['X-Language-Code'];
	}

	if ( isset( $headers['X-Message-Group'] ) ) {
		$metadata['group'] = $headers['X-Message-Group'];
	}

	/* At this stage we are only interested how many plurals forms we should
	 * be expecting when parsing the rest of this file. */
	$pluralCount = false;
	if ( $potmode ) {
		$pluralCount = 2;
	} elseif ( isset( $headers['Plural-Forms'] ) ) {
		$pluralCount = GettextPlural::getPluralCount( $headers['Plural-Forms'] );
	}

	$metadata['plural'] = $pluralCount;

	// Then parse the messages
	foreach ( $sections as $section ) {
		$item = self::parseGettextSection( $section, $pluralCount );
		if ( $item === false ) {
			continue;
		}

		if ( $useCtxtAsKey ) {
			// parseGettextSection() always sets 'ctxt' and uses false when the
			// section has no msgctxt, so isset() alone cannot detect a missing
			// context; compare against false explicitly.
			if ( ( $item['ctxt'] ?? false ) === false ) {
				error_log( "ctxt missing for: $section" );
				continue;
			}
			$key = $item['ctxt'];
		} else {
			$key = self::generateKeyFromItem( $item, $keyAlgorithm );
		}

		$key = $mangler->mangle( $key );
		$messages[$key] = $potmode ? $item['id'] : $item['str'];
		$template[$key] = $item;
	}

	return [
		'MESSAGES' => $messages,
		'EXTRA' => [
			'TEMPLATE' => $template,
			'METADATA' => $metadata,
			'HEADERS' => $headers,
		],
	];
}
187
/**
 * Parse one section (a single message entry) of a gettext file.
 *
 * @param string $section Text of the section.
 * @param int|false $pluralCount Number of msgstr[N] forms to look for in
 *  plural messages.
 * @return array|false False for blank or obsolete (#~) sections, otherwise
 *  an array with keys 'ctxt' (string, or false when there is no msgctxt),
 *  'id', 'str', 'flags' and 'comments'.
 * @throws MWException When msgid, or msgstr of a non-plural message, is missing.
 */
public static function parseGettextSection( $section, $pluralCount ) {
	if ( trim( $section ) === '' ) {
		return false;
	}

	/* These inactive sections are of no interest to us. Multiline mode
	 * is needed because there may be flags or other annoying stuff
	 * before the commented out sections.
	 */
	if ( preg_match( '/^#~/m', $section ) ) {
		return false;
	}

	$item = [
		'ctxt' => false,
		'id' => '',
		'str' => '',
		'flags' => [],
		'comments' => [],
	];

	$match = self::expectKeyword( 'msgid', $section );
	if ( $match === null ) {
		throw new MWException( "Unable to parse msgid:\n\n$section" );
	}
	$item['id'] = self::formatForWiki( $match );

	$match = self::expectKeyword( 'msgctxt', $section );
	if ( $match !== null ) {
		$item['ctxt'] = self::formatForWiki( $match );
	}

	$match = self::expectKeyword( 'msgid_plural', $section );
	if ( $match !== null ) {
		// Plural message: singular and plural msgid are flattened into one string
		$plural = self::formatForWiki( $match );
		$item['id'] = GettextPlural::flatten( [ $item['id'], $plural ] );

		// Keep the translation empty if no form has translation
		$pluralMessageText = self::processGettextPluralMessage( $pluralCount, $section );
		if ( $pluralMessageText !== '' ) {
			$item['str'] = $pluralMessageText;
		}
	} else {
		$match = self::expectKeyword( 'msgstr', $section );
		if ( $match === null ) {
			throw new MWException( "Unable to parse msgstr:\n\n$section" );
		}
		$item['str'] = self::formatForWiki( $match );
	}

	// Parse flags; "fuzzy" is moved from the flag list into the translation text
	$flags = self::parseFlags( $section );
	foreach ( $flags as $key => $flag ) {
		if ( $flag === 'fuzzy' ) {
			$item['str'] = TRANSLATE_FUZZY . $item['str'];
			unset( $flags[$key] );
		}
	}
	$item['flags'] = $flags;

	// Rest of the comments, grouped by the character that follows '#'
	$matches = [];
	if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) {
		foreach ( $matches as $match ) {
			// $match[1] is the single marker character and $match[2] the comment
			// text. Flag lines (#,) are handled above. Lines whose text starts
			// with "[Wiki]" are documentation written by formatDocumentation()
			// on export and must not be read back in as template comments.
			if ( $match[1] !== ',' && strpos( $match[2], '[Wiki]' ) !== 0 ) {
				$item['comments'][$match[1]][] = $match[2];
			}
		}
	}

	return $item;
}
267
/**
 * Collect the msgstr[0] .. msgstr[$pluralCount - 1] forms of a plural
 * message and flatten them into a single string.
 *
 * A missing form is logged and treated as an empty string.
 *
 * @param int|false $pluralCount Number of plural forms to look for.
 * @param string $section Text of the section.
 * @return string Flattened forms, or '' when every form is empty.
 */
public static function processGettextPluralMessage( $pluralCount, $section ) {
	$forms = [];
	$totalLength = 0;

	$index = 0;
	while ( $index < $pluralCount ) {
		$found = self::expectKeyword( "msgstr\\[$index\\]", $section );

		if ( $found === null ) {
			$form = '';
			error_log( "Plural $index not found, expecting total of $pluralCount for $section" );
		} else {
			$form = self::formatForWiki( $found );
		}

		$forms[] = $form;
		$totalLength += strlen( $form );
		$index++;
	}

	if ( $totalLength > 0 ) {
		return GettextPlural::flatten( $forms );
	}

	return '';
}
288
/**
 * Extract the flags from the first "#," line of a section.
 *
 * @param string $section Text of the section.
 * @return string[] Trimmed flag names; empty when there is no flag line.
 */
public static function parseFlags( $section ) {
	$found = [];
	if ( !preg_match( '/^#,(.*)$/mu', $section, $found ) ) {
		return [];
	}

	$flags = [];
	foreach ( explode( ',', $found[1] ) as $rawFlag ) {
		$flags[] = trim( $rawFlag );
	}

	return $flags;
}
297
/**
 * Find the quoted text block that follows a keyword in a section.
 *
 * @param string $name Keyword as a regular expression fragment, such as
 *  msgid, msgstr, msgid_plural or msgctxt.
 * @param string $section Text of the section.
 * @return string|null The raw block, still quoted and possibly spanning
 *  several lines, or null when the keyword is not found.
 */
public static function expectKeyword( $name, $section ) {
	/* Catches the multiline textblock that comes after keywords msgid,
	 * msgstr, msgid_plural, msgctxt.
	 */
	$poformat = '".*"\n?(^".*"$\n?)*';
	$pattern = "/^$name\s($poformat)/mx";

	$captured = [];
	$found = preg_match( $pattern, $section, $captured );

	return $found ? $captured[1] : null;
}
311
/**
 * Generates unique key for each message.
 *
 * The key has the form "<hash>-<snippet>": a SHA-1 hash of the context and
 * msgid, followed by a short snippet of the msgid.
 *
 * @param array $item Parsed item; reads the 'ctxt' and 'id' entries.
 * @param string $algorithm 'simple' shortens the hash to six characters;
 *  any other value selects the legacy snippet clean-up with the full hash.
 * @return string
 */
public static function generateKeyFromItem( array $item, $algorithm = 'simple' ) {
	$lang = Language::factory( 'en' );

	/* Messages with msgctxt as empty string should be different
	 * from messages without any msgctxt. To avoid BC break make
	 * the empty ctxt a special case */
	$hashInput = $item['ctxt'] === ''
		? $item['id'] . 'MSGEMPTYCTXT'
		: $item['ctxt'] . $item['id'];
	$hash = sha1( $hashInput );

	$snippet = $item['id'];
	if ( $algorithm === 'simple' ) {
		$hash = substr( $hash, 0, 6 );
	} else { // legacy
		$legalChars = Title::legalChars();
		$snippet = preg_replace( "/[^$legalChars]/", ' ', $snippet );
		$snippet = preg_replace( "/[:&%\/_]/", ' ', $snippet );
		$snippet = preg_replace( '/ {2,}/', ' ', $snippet );
	}

	// Both algorithms end with the same truncation and space replacement
	$snippet = $lang->truncateForDatabase( $snippet, 30, '' );
	$snippet = str_replace( ' ', '_', trim( $snippet ) );

	return "$hash-$snippet";
}
347
/**
 * This parses the Gettext text block format.
 *
 * Removes the surrounding quotes of every line, joins the lines and
 * decodes C-style escapes.
 *
 * @param string $data Quoted text block.
 * @param string $whitespace What to do when the result ends in whitespace:
 *  'mark' appends a backslash, 'trim' removes the trailing whitespace.
 * @return string
 * @throws MWException For any other $whitespace value, but only when the
 *  text actually ends in whitespace.
 */
public static function formatForWiki( $data, $whitespace = 'mark' ) {
	$unquoted = preg_replace( '/(^"|"$\n?)/m', '', $data );
	$text = stripcslashes( $unquoted );

	// Nothing more to do unless the text ends in whitespace
	if ( !preg_match( '/\s$/', $text ) ) {
		return $text;
	}

	if ( $whitespace === 'mark' ) {
		return $text . '\\';
	}

	if ( $whitespace === 'trim' ) {
		return rtrim( $text );
	}

	// @todo Only triggered if there is trailing whitespace
	throw new MWException( 'Unknown action for whitespace' );
}
376
/**
 * Split a header block into a map of tag name => tag value.
 *
 * Each line is split at its first colon and both parts are trimmed. A line
 * without a colon is logged and stored as a tag with an empty value.
 *
 * @param string $headers Header block, one "Name: value" pair per line.
 * @return string[] Tag values keyed by tag name.
 */
public static function parseHeaderTags( $headers ) {
	$tags = [];
	foreach ( explode( "\n", $headers ) as $line ) {
		$parts = explode( ':', $line, 2 );
		if ( count( $parts ) < 2 ) {
			error_log( __METHOD__ . ": $line" );
		}

		// Default the value explicitly: destructuring a one-element array
		// raises an undefined-offset warning and passes null to trim().
		$tags[trim( $parts[0] )] = trim( $parts[1] ?? '' );
	}

	return $tags;
}
389
/**
 * Build the complete gettext file contents for a message collection.
 *
 * The existing source-language file and the existing file of the target
 * language are both read so that comments, flags, contexts and headers
 * found in them can be carried over into the output.
 *
 * @param MessageCollection $collection Messages to export.
 * @return string File contents: header followed by one block per message.
 */
protected function writeReal( MessageCollection $collection ) {
	// The template comes from the group's source language, which is not
	// necessarily English.
	$pot = $this->read( $this->group->getSourceLanguage() ) ?? [];
	$code = $collection->code;
	$template = $this->read( $code ) ?? [];
	$output = $this->doGettextHeader( $collection, $template['EXTRA'] ?? [] );

	$pluralRule = GettextPlural::getPluralRule( $code );
	if ( !$pluralRule ) {
		// Fall back to the English rule, but leave a trace in the logs
		$pluralRule = GettextPlural::getPluralRule( 'en' );
		LoggerFactory::getInstance( 'Translate' )->warning(
			"T235180: Missing Gettext plural rule for '{languagecode}'",
			[ 'languagecode' => $code ]
		);
	}
	$pluralCount = GettextPlural::getPluralCount( $pluralRule );

	foreach ( $collection as $key => $m ) {
		$transTemplate = $template['EXTRA']['TEMPLATE'][$key] ?? [];
		$potTemplate = $pot['EXTRA']['TEMPLATE'][$key] ?? [];

		$output .= $this->formatMessageBlock( $key, $m, $transTemplate, $potTemplate, $pluralCount );
	}

	return $output;
}
417
/**
 * Build the comment block and the header entry (the empty msgid with its
 * header fields) that start an exported file.
 *
 * @param MessageCollection $collection Collection being exported.
 * @param array $template EXTRA data of the existing file; its 'HEADERS'
 *  entry, when present, is the starting point for the header fields.
 * @return string Header text, terminated by an empty line.
 */
protected function doGettextHeader( MessageCollection $collection, $template ) {
	global $wgSitename;

	$code = $collection->code;
	$name = TranslateUtils::getLanguageName( $code );
	$native = TranslateUtils::getLanguageName( $code, $code );
	$authors = $this->doAuthors( $collection );
	$extra = isset( $this->extra['header'] )
		? "# --\n" . $this->extra['header']
		: '';

	$label = $this->group->getLabel();
	$comment = "# Translation of $label to $name ($native)\n"
		. "# Exported from $wgSitename\n"
		. "#\n"
		. $authors . $extra;

	// Make sure there is no empty line before msgid
	$output = trim( $comment ) . "\n";

	$specs = $template['HEADERS'] ?? [];

	$timestamp = wfTimestampNow();
	$now = self::formatTime( $timestamp );
	$specs['PO-Revision-Date'] = $now;
	if ( $this->offlineMode ) {
		$specs['POT-Creation-Date'] = $now;
	} elseif ( $this->group instanceof MessageGroupBase ) {
		$potTime = wfTimestamp( TS_MW, $this->getPotTime() );
		$specs['X-POT-Import-Date'] = self::formatTime( $potTime );
	}
	$specs['Content-Type'] = 'text/plain; charset=UTF-8';
	$specs['Content-Transfer-Encoding'] = '8bit';
	$specs['Language'] = LanguageCode::bcp47( $this->group->mapCode( $code ) );
	Hooks::run( 'Translate:GettextFFS:headerFields', [ &$specs, $this->group, $code ] );
	// Set after the hook has run, so the hook cannot change the generator
	$specs['X-Generator'] = $this->getGenerator();

	if ( $this->offlineMode ) {
		$specs['X-Language-Code'] = $code;
		$specs['X-Message-Group'] = $this->group->getId();
	}

	$specs['Plural-Forms'] = GettextPlural::getPluralRule( $code )
		?: GettextPlural::getPluralRule( 'en' );

	// The header is an entry with empty msgid whose msgstr lists the fields
	$output .= "msgid \"\"\nmsgstr \"\"\n\"\"\n";

	foreach ( $specs as $field => $fieldValue ) {
		$output .= self::escape( "$field: $fieldValue\n" ) . "\n";
	}

	return $output . "\n";
}
476
/**
 * Produce the "# Author: ..." comment lines for a collection.
 *
 * @param MessageCollection $collection Collection being exported.
 * @return string One line per author left after filterAuthors(); '' when
 *  there are none.
 */
protected function doAuthors( MessageCollection $collection ) {
	$authors = $this->filterAuthors( $collection->getAuthors(), $collection->code );

	$lines = [];
	foreach ( $authors as $author ) {
		$lines[] = "# Author: $author\n";
	}

	return implode( '', $lines );
}
488
/**
 * Format one message as a gettext entry: comments, flags, optional msgctxt,
 * msgid and msgstr, or their plural variants.
 *
 * @param string $key Message key; written as msgctxt in offline mode.
 * @param mixed $m Message object; getTags(), definition() and translation()
 *  are called on it.
 * @param array $trans Template item from the existing translation file.
 * @param array $pot Template item from the source file; takes precedence
 *  over $trans.
 * @param int $pluralCount Number of plural forms to write.
 * @return string The entry followed by an empty line.
 */
protected function formatMessageBlock( $key, $m, $trans, $pot, $pluralCount ) {
	$header = $this->formatDocumentation( $key );

	$comments = self::chainGetter( 'comments', $pot, $trans, [] );
	foreach ( $comments as $type => $typecomments ) {
		foreach ( $typecomments as $comment ) {
			$header .= "#$type $comment\n";
		}
	}

	$templateFlags = self::chainGetter( 'flags', $pot, $trans, [] );
	$flags = array_merge( $m->getTags(), $templateFlags );

	$content = '';
	if ( $this->offlineMode ) {
		$content .= 'msgctxt ' . self::escape( $key ) . "\n";
	} else {
		$ctxt = self::chainGetter( 'ctxt', $pot, $trans, false );
		if ( $ctxt !== false ) {
			$content .= 'msgctxt ' . self::escape( $ctxt ) . "\n";
		}
	}

	$msgid = $m->definition();
	$msgstr = $m->translation();
	if ( strpos( $msgstr, TRANSLATE_FUZZY ) !== false ) {
		$msgstr = str_replace( TRANSLATE_FUZZY, '', $msgstr );
		// Might be fuzzy in the file already; duplicates are removed below
		$flags[] = 'fuzzy';
	}

	if ( !GettextPlural::hasPlural( $msgid ) ) {
		$content .= 'msgid ' . self::escape( $msgid ) . "\n";
		$content .= 'msgstr ' . self::escape( $msgstr ) . "\n";
	} else {
		$idForms = GettextPlural::unflatten( $msgid, 2 );
		$content .= 'msgid ' . self::escape( $idForms[0] ) . "\n";
		$content .= 'msgid_plural ' . self::escape( $idForms[1] ) . "\n";

		try {
			$strForms = GettextPlural::unflatten( $msgstr, $pluralCount );
			foreach ( $strForms as $index => $form ) {
				$content .= "msgstr[$index] " . self::escape( $form ) . "\n";
			}
		} catch ( GettextPluralException $e ) {
			// The translation cannot be split: flag it and write empty forms
			$flags[] = 'invalid-plural';
			for ( $i = 0; $i < $pluralCount; $i++ ) {
				$content .= "msgstr[$i] \"\"\n";
			}
		}
	}

	if ( $flags ) {
		sort( $flags );
		$header .= '#, ' . implode( ', ', array_unique( $flags ) ) . "\n";
	}

	// An entry without any comment or flag line gets a bare "#" line instead
	if ( !$header ) {
		$header = "#\n";
	}

	return $header . $content . "\n";
}
559
/**
 * Look up a key in the first array, then in the second, and fall back to a
 * default when neither has a non-null value for it.
 *
 * @param string $key Key to look up.
 * @param array $a Preferred source.
 * @param array $b Secondary source.
 * @param mixed $default Value used when both lookups fail.
 * @return mixed
 */
protected static function chainGetter( $key, $a, $b, $default ) {
	if ( isset( $a[$key] ) ) {
		return $a[$key];
	}

	if ( isset( $b[$key] ) ) {
		return $b[$key];
	}

	return $default;
}
570
/**
 * Format a timestamp for the date fields of the file header, using the
 * English language object and a fixed "+0000" offset suffix.
 *
 * @param string $time Timestamp handed to Language::sprintfDate().
 * @return string
 */
protected static function formatTime( $time ) {
	$format = 'xnY-xnm-xnd xnH:xni:xns+0000';

	return Language::factory( 'en' )->sprintfDate( $format, $time );
}
576
/**
 * Get the timestamp of the group's message cache for the source language,
 * or the current time when that cache does not exist.
 *
 * @return mixed Timestamp as returned by the cache or by wfTimestampNow().
 */
protected function getPotTime() {
	$sourceLanguage = $this->group->getSourceLanguage();
	$cache = $this->group->getMessageGroupCache( $sourceLanguage );

	if ( !$cache->exists() ) {
		return wfTimestampNow();
	}

	return $cache->getTimestamp();
}
582
/**
 * Build the generator string from the MediaWiki and Translate versions.
 *
 * @return string
 */
protected function getGenerator() {
	$mediaWikiVersion = SpecialVersion::getVersion();
	$translateVersion = TranslateUtils::getVersion();

	return "MediaWiki $mediaWikiVersion; Translate $translateVersion";
}
587
/**
 * Produce "#. [Wiki] ..." comment lines holding the documentation of a
 * message.
 *
 * @param string $key Message key.
 * @return string One comment line per documentation line; '' when not in
 *  offline mode, when no documentation language code is configured, or
 *  when the loaded content is not a string.
 */
protected function formatDocumentation( $key ) {
	global $wgTranslateDocumentationLanguageCode;

	$code = $wgTranslateDocumentationLanguageCode;
	if ( !$this->offlineMode || !$code ) {
		return '';
	}

	$documentation = TranslateUtils::getMessageContent( $key, $code, $this->group->getNamespace() );
	if ( !is_string( $documentation ) ) {
		return '';
	}

	$commentLines = array_map(
		static function ( $line ) {
			return "#. [Wiki] $line\n";
		},
		explode( "\n", $documentation )
	);

	return implode( '', $commentLines );
}
613
/**
 * Turn a string into a double-quoted gettext string.
 *
 * Backslashes and double quotes are escaped and newlines are written as
 * the two characters "\n".
 *
 * @param string $line Text to escape.
 * @return string Quoted text.
 */
protected static function escape( $line ) {
	// There may be \ as a last character, for keeping trailing whitespace
	$unmarked = preg_replace( '/(\s)\\\\$/', '\1', $line );
	$escaped = addcslashes( $unmarked, '\\"' );

	return '"' . str_replace( "\n", '\n', $escaped ) . '"';
}
623
/**
 * Allows to skip writing the export output into a file: compares two file
 * contents while ignoring the quoted "...-Date:" header lines.
 *
 * @param string $a First file contents.
 * @param string $b Second file contents.
 * @return bool True when the contents differ in something other than those
 *  date header lines.
 */
public function shouldOverwrite( $a, $b ) {
	$dateLine = '/^"(.+)-Date: \d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\+\d\d\d\d\\\\n"$/m';

	$withoutDatesA = preg_replace( $dateLine, '', $a );
	$withoutDatesB = preg_replace( $dateLine, '', $b );

	return $withoutDatesA !== $withoutDatesB;
}
632
/**
 * Return a data structure that will be merged with the base schema.
 *
 * Describes the settings this class accepts in the FILES section:
 * 'header', 'keyAlgorithm' and 'CtxtAsKey'.
 *
 * @return array
 */
public static function getExtraSchema() {
	$filesChildren = [
		'header' => [
			'_type' => 'text',
		],
		'keyAlgorithm' => [
			'_type' => 'enum',
			'_values' => [ 'simple', 'legacy' ],
		],
		'CtxtAsKey' => [
			'_type' => 'boolean',
		],
	];

	$files = [
		'_type' => 'array',
		'_children' => $filesChildren,
	];

	return [
		'root' => [
			'_type' => 'array',
			'_children' => [
				'FILES' => $files,
			],
		],
	];
}
659
/**
 * Checks whether two strings are equal.
 *
 * Strings that are not identical are compared as plural messages: they are
 * equal when they have the same non-zero number of plural forms and
 * unflatten to the same list of forms.
 *
 * @param string $a
 * @param string $b
 * @return bool
 */
public function isContentEqual( $a, $b ) {
	if ( $a === $b ) {
		return true;
	}

	try {
		$parsedA = GettextPlural::parsePluralForms( $a );
		$parsedB = GettextPlural::parsePluralForms( $b );
	} catch ( GettextPluralException $e ) {
		// Something failed, invalid syntax?
		return false;
	}

	$expectedPluralCount = count( $parsedA[1] );

	// if they have the different number of plural forms, just fail
	if ( $expectedPluralCount !== count( $parsedB[1] ) ) {
		return false;
	}

	// GettextPlural::unflatten() will return an empty array when $expectedPluralCount is 0
	// So if they do not have translations and are different strings, they are not equal
	if ( $expectedPluralCount === 0 ) {
		return false;
	}

	$formsA = GettextPlural::unflatten( $a, $expectedPluralCount );
	$formsB = GettextPlural::unflatten( $b, $expectedPluralCount );

	return $formsA === $formsB;
}
690}
New-style FFS class that implements support for gettext file format.
static generateKeyFromItem(array $item, $algorithm='simple')
Generates unique key for each message.
isContentEqual( $a, $b)
Checks whether two strings are equal.
getFileExtensions()
Return the commonly used file extensions for these formats.
formatMessageBlock( $key, $m, $trans, $pot, $pluralCount)
static getExtraSchema()
Return a data structure that will be merged with the base schema.
readFromVariable( $data)
setOfflineMode( $value)
static parseGettextData( $data, $useCtxtAsKey, StringMangler $mangler, $keyAlgorithm, bool $allowPotMode)
Parses gettext file as string into internal representation.
static chainGetter( $key, $a, $b, $default)
shouldOverwrite( $a, $b)
Allows skipping the write of the export output into a file.
supportsFuzzy()
Query the capabilities of this FFS.
static formatForWiki( $data, $whitespace='mark')
This parses the Gettext text block format.
writeReal(MessageCollection $collection)
read( $code)
@inheritDoc
Exception thrown when a Gettext file could not be parsed, such as when missing required headers.
Identifies Gettext plural exceptions.
Core message collection class.
getAuthors()
Lists all translators that have contributed to the latest revisions of each translation.
This class implements some basic functions that wrap around the YAML message group configurations.
filterAuthors(array $authors, $code)
Remove excluded authors.
$extra
Stores the FILES section of the YAML configuration, which can be accessed for extra FFS class specifi...
Definition SimpleFFS.php:35
static getMessageContent( $key, $language, $namespace=NS_MEDIAWIKI)
Loads page content without side effects.
Interface that key-mangling classes must implement.
Message groups are usually configured in YAML, though the actual storage format does not matter,...