Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 194 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
ExternalMessageSourceStateComparator | |
0.00% |
0 / 194 |
|
0.00% |
0 / 9 |
2970 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
processGroup | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
processLanguage | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
addMessageUpdateChanges | |
0.00% |
0 / 61 |
|
0.00% |
0 / 1 |
420 | |||
checkNonSourceAdditionsForRename | |
0.00% |
0 / 41 |
|
0.00% |
0 / 1 |
56 | |||
findAndMarkSourceRenames | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
42 | |||
addNonSourceRenames | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
matchRenames | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
20 | |||
hasCacheEntry | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
110 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace MediaWiki\Extension\Translate\Synchronization; |
5 | |
6 | use FileBasedMessageGroup; |
7 | use MediaWiki\Extension\Translate\LogNames; |
8 | use MediaWiki\Extension\Translate\MessageGroupProcessing\MessageGroupCache; |
9 | use MediaWiki\Extension\Translate\MessageLoading\Message; |
10 | use MediaWiki\Extension\Translate\MessageLoading\MessageCollection; |
11 | use MediaWiki\Extension\Translate\MessageSync\MessageSourceChange; |
12 | use MediaWiki\Extension\Translate\Utilities\StringComparators\StringComparator; |
13 | use MediaWiki\Extension\Translate\Utilities\Utilities; |
14 | use MediaWiki\Logger\LoggerFactory; |
15 | use MediaWiki\Page\PageStore; |
16 | use MediaWiki\Revision\RevisionLookup; |
17 | use MediaWiki\Utils\MWTimestamp; |
18 | use RuntimeException; |
19 | |
20 | /** |
21 | * Finds external changes for file based message groups. |
22 | * |
23 | * @author Niklas Laxström |
24 | * @license GPL-2.0-or-later |
25 | */ |
26 | class ExternalMessageSourceStateComparator { |
/** Computes the similarity between two message contents when evaluating renames */
private StringComparator $stringComparator;
/** Used to look up the first and the latest revision of a translation page */
private RevisionLookup $revisionLookup;
/** Used to resolve a message's link target into a page identity */
private PageStore $pageStore;

/**
 * @param StringComparator $stringComparator Similarity calculator for message contents
 * @param RevisionLookup $revisionLookup Source of revision data for translation pages
 * @param PageStore $pageStore Resolves message link targets to pages
 */
public function __construct(
	StringComparator $stringComparator,
	RevisionLookup $revisionLookup,
	PageStore $pageStore
) {
	$this->pageStore = $pageStore;
	$this->revisionLookup = $revisionLookup;
	$this->stringComparator = $stringComparator;
}
40 | |
/**
 * Collects the differences between the external files of a group and the wiki state.
 *
 * The returned MessageSourceChange object organises the modifications as follows:
 * - First level: the language code
 * - Second level: the type of modification,
 *   - addition (new message in the file)
 *   - deletion (message in wiki not present in the file)
 *   - change (difference in content)
 *   - rename (message key is modified)
 * - Third level: a list of modifications
 *   - Each modification stores,
 *     - key (the message key)
 *     - content (the message content in external source, null for deletions)
 *     - matched_to (present in case of renames, key of the matched message)
 *     - similarity (present in case of renames, similarity % with the matched message)
 *     - previous_state (present in case of renames, state of the message before rename)
 *
 * @param FileBasedMessageGroup $group Group whose files are compared against the wiki
 * @return MessageSourceChange All modifications found, for every processed language
 */
public function processGroup( FileBasedMessageGroup $group ): MessageSourceChange {
	$result = new MessageSourceChange();

	$languageMap = $group->getTranslatableLanguages();
	if ( $languageMap === null ) {
		// The group does not restrict its languages: fall back to all known languages
		$languageMap = Utilities::getLanguageNames( 'en' );
	}

	// The source language always goes first, even when it is not part of the
	// translatable languages of the group. The other languages are processed
	// after it, so that its changes are already recorded when they are handled.
	$sourceCode = $group->getSourceLanguage();
	$this->processLanguage( $group, $sourceCode, $result );

	// Everything except the source language, which has been handled above
	$otherLanguages = array_diff_key( $languageMap, [ $sourceCode => true ] );
	foreach ( array_keys( $otherLanguages ) as $languageCode ) {
		$this->processLanguage( $group, $languageCode, $result );
	}

	return $result;
}
76 | |
/**
 * Records the changes of a single language, but only when its message cache is not valid.
 *
 * @param FileBasedMessageGroup $group
 * @param string $language Language code to process
 * @param MessageSourceChange $changes Accumulator that receives the modifications found
 */
private function processLanguage(
	FileBasedMessageGroup $group,
	string $language,
	MessageSourceChange $changes
): void {
	$groupCache = $group->getMessageGroupCache( $language );

	// isValid() reports why the cache is not valid through this by-reference argument
	$invalidReason = 0;
	if ( $groupCache->isValid( $invalidReason ) ) {
		return;
	}

	$this->addMessageUpdateChanges( $group, $language, $changes, $invalidReason, $groupCache );

	$pendingModifications = $changes->getModificationsForLanguage( $language );
	if ( $pendingModifications === [] ) {
		// File and wiki state match, so refresh the cache right away. Otherwise
		// the cache would become outdated compared to the file state and give
		// false positive conflicts later.
		$groupCache->create();
	}
}
95 | |
/**
 * The detective work happens here. Three sources of information are available:
 * - the current message state in the file
 * - the current message state in the wiki
 * - the cached message state from when the cache was last built
 *   (usually after an export from the wiki)
 *
 * From these we try to guess what has driven the file state and the wiki state
 * out of sync, and compile the list of events that would bring them back in
 * sync. The event types are addition, deletion, (content) change and key
 * rename. The events are stored in $changes for later processing by a
 * translation administrator, who decides which actions to take.
 *
 * @param FileBasedMessageGroup $group
 * @param string $language Language code being compared
 * @param MessageSourceChange $changes Accumulator that receives the events
 * @param int $reason Why the cache is not valid; MessageGroupCache::NO_CACHE disables
 *  all comparisons against the cache
 * @param MessageGroupCache $cache Cache of the group for $language
 * @throws RuntimeException If the file of the source language does not exist
 */
protected function addMessageUpdateChanges(
	FileBasedMessageGroup $group,
	string $language,
	MessageSourceChange $changes,
	int $reason,
	MessageGroupCache $cache
): void {
	// Before the first import initCollection gives an empty list
	$wikiCollection = $group->initCollection( $language );
	$wikiCollection->filter( MessageCollection::FILTER_HAS_TRANSLATION, MessageCollection::INCLUDE_MATCHING );
	$wikiCollection->loadTranslations();
	$keysInWiki = $wikiCollection->getMessageKeys();

	$sourceLanguage = $group->getSourceLanguage();
	$isSourceLanguage = $language === $sourceLanguage;

	// Read through the file format support to by-pass cached message definitions
	$fileFormat = $group->getFFS();
	if ( $isSourceLanguage && !$fileFormat->exists( $language ) ) {
		$path = $group->getSourceFilePath( $language );
		throw new RuntimeException( "Source message file for {$group->getId()} does not exist: $path" );
	}

	$fileData = $fileFormat->read( $language );
	if ( $fileData === false ) {
		// There is no file for this language
		return;
	}

	if ( !isset( $fileData['MESSAGES'] ) ) {
		// The file was read, but the result is not usable
		$id = $group->getId();
		$ffsClass = get_class( $fileFormat );

		error_log( "$id has an FFS ($ffsClass) - it didn't return cake for $language" );

		return;
	}

	$fileMessages = $fileData['MESSAGES'];
	$keysInFile = array_keys( $fileMessages );

	$hasCache = $reason !== MessageGroupCache::NO_CACHE;
	$fuzzyAware = $fileFormat->supportsFuzzy() === 'yes';
	$handledAsRenames = [];

	// Pass 1: keys present on both sides, looking for content changes
	foreach ( array_intersect( $keysInFile, $keysInWiki ) as $key ) {
		$fileContent = $fileMessages[$key];
		/** @var Message $wikiMessage */
		$wikiMessage = $wikiCollection[$key];

		// @todo: Fuzzy checking can also be moved to $fileFormat->isContentEqual();
		// Without fuzzy support in the file format, fuzziness is not a difference...
		$wikiContent = str_replace( TRANSLATE_FUZZY, '', $wikiMessage->translation() );
		// ...and with it, make sure there is exactly one fuzzy marker as prefix
		if ( $fuzzyAware && $wikiMessage->hasTag( 'fuzzy' ) ) {
			$wikiContent = TRANSLATE_FUZZY . $wikiContent;
		}

		if ( $fileFormat->isContentEqual( $fileContent, $wikiContent ) ) {
			// File and wiki agree, nothing to record
			continue;
		}

		if ( $hasCache ) {
			// Consult the interim cache to tell whether the change was made in
			// the wiki, in the file or in both.
			$cachedContent = $cache->get( $key );

			/* The following situations are ignored:
			 * 1. The string in the wiki has been changed since the last export,
			 *    that is: file === cache && cache !== wiki
			 * 2. The cache entry is missing because the string was translated on
			 *    translatewiki.net, exported and then updated on translatewiki.net
			 *    again (hasCacheEntry returns false and nothing is skipped).
			 */
			$changedOnlyInWiki = $this->hasCacheEntry( $cache, $wikiCollection, $key )
				&& !$fileFormat->isContentEqual( $wikiContent, $cachedContent )
				&& $fileFormat->isContentEqual( $fileContent, $cachedContent );
			if ( $changedOnlyInWiki ) {
				continue;
			}
		}

		// For a non-source language, treat this key as a potentially OLD key and
		// check whether the source language has a rename matched to it. The key
		// of the matching message is then the new, renamed key.
		$sourceRename = $isSourceLanguage ? null : $changes->getMatchedMessage( $sourceLanguage, $key );
		if ( $sourceRename === null ) {
			$changes->addChange( $language, $key, $fileContent );
			continue;
		}

		// The source language renamed this key, and here the content changed
		// under the old key. Record it as a rename instead of a change, so that
		// the key gets renamed first and the content updated afterwards.
		$this->addNonSourceRenames(
			$changes, $key, $sourceRename['key'], $fileContent, $wikiContent, $language
		);
		$handledAsRenames[] = $key;
	}

	$changes->removeChanges( $language, $handledAsRenames );

	// Pass 2: keys that only exist in the file are additions
	foreach ( array_diff( $keysInFile, $keysInWiki ) as $key ) {
		$changes->addAddition( $language, $key, $fileMessages[$key] );
	}

	// Pass 3: keys that only exist in the wiki may be deletions. Without a cache
	// they are not considered deleted, since they probably have not been exported
	// yet. For example, translations of a new language exported for the first time.
	if ( $hasCache ) {
		foreach ( array_diff( $keysInWiki, $keysInFile ) as $key ) {
			// A message that never existed in the cache must be newly made in the wiki
			if ( $cache->get( $key ) !== false ) {
				$changes->addDeletion( $language, $key, $wikiCollection[$key]->translation() );
			}
		}
	}

	// Pass 4: rename detection
	if ( $isSourceLanguage ) {
		$this->findAndMarkSourceRenames( $changes, $language );
		return;
	}

	$this->checkNonSourceAdditionsForRename(
		$changes, $sourceLanguage, $language, $wikiCollection, $keysInWiki
	);
}
251 | |
/**
 * For non source languages, we look at additions and see if they have been
 * added as renames in the source language. Every addition that corresponds to
 * a source language rename is converted into a rename for the target language,
 * and the addition (and the deletion of the old key, if recorded) is removed.
 *
 * @param MessageSourceChange $changes Accumulator holding the source language changes
 *  and the additions/deletions of the target language; modified in place
 * @param string $sourceLanguage
 * @param string $targetLanguage
 * @param MessageCollection $wiki Wiki state of the target language
 * @param string[] $wikiKeys Message keys present in $wiki
 */
private function checkNonSourceAdditionsForRename(
	MessageSourceChange $changes,
	string $sourceLanguage,
	string $targetLanguage,
	MessageCollection $wiki,
	array $wikiKeys
): void {
	$additions = $changes->getAdditions( $targetLanguage );
	if ( $additions === [] ) {
		return;
	}

	// Key => true map of $wikiKeys, built on first use. An array key lookup is
	// used instead of a loose in_array(), because loose comparison can report
	// numeric-looking keys as present when they are not (e.g. '1e3' vs 1000),
	// which would lead to calling translation() on a missing message.
	$wikiKeyLookup = null;

	$additionsToRemove = [];
	$deletionsToRemove = [];
	foreach ( $additions as $addedMsg ) {
		$addedMsgKey = $addedMsg['key'];

		// Check if this key is renamed in source.
		$renamedSourceMsg = $changes->findMessage(
			$sourceLanguage, $addedMsgKey, [ MessageSourceChange::RENAME ]
		);

		if ( $renamedSourceMsg === null ) {
			continue;
		}

		// Since this key is new, and is present in the renames for the source language,
		// we will add it as a rename.
		$deletedSource = $changes->getMatchedMessage( $sourceLanguage, $renamedSourceMsg['key'] );
		if ( $deletedSource === null ) {
			continue;
		}
		$deletedMsgKey = $deletedSource['key'];
		$deletedMsg = $changes->findMessage(
			$targetLanguage, $deletedMsgKey, [ MessageSourceChange::DELETION ]
		);

		// Sometimes when the cache does not have the translations, the deleted message
		// is not added in the translations. It is also possible that for this non-source
		// language the key has not been removed.
		if ( $deletedMsg === null ) {
			$wikiKeyLookup ??= array_fill_keys( $wikiKeys, true );

			// Fall back to the current wiki translation, or to an empty string
			// when the wiki does not have the old key either.
			$content = '';
			if ( isset( $wikiKeyLookup[ $deletedMsgKey ] ) ) {
				$content = $wiki[ $deletedMsgKey ]->translation();
			}
			$deletedMsg = [
				'key' => $deletedMsgKey,
				'content' => $content
			];
		}

		$similarityPercent = $this->stringComparator->getSimilarity(
			$addedMsg['content'], $deletedMsg['content']
		);

		$changes->addRename( $targetLanguage, [
			'key' => $addedMsgKey,
			'content' => $addedMsg['content']
		], [
			'key' => $deletedMsgKey,
			'content' => $deletedMsg['content']
		], $similarityPercent );

		$deletionsToRemove[] = $deletedMsgKey;
		$additionsToRemove[] = $addedMsgKey;
	}

	$changes->removeAdditions( $targetLanguage, $additionsToRemove );
	$changes->removeDeletions( $targetLanguage, $deletionsToRemove );
}
331 | |
/**
 * Detects renames in the source language and records them in the changes.
 *
 * A rename shows up as one addition plus one deletion with similar content, so
 * the content of every added message is compared with the content of every
 * deleted message. Pairs that are similar enough are handed to matchRenames().
 *
 * @param MessageSourceChange $changes Accumulator, modified in place
 * @param string $sourceLanguage
 */
private function findAndMarkSourceRenames( MessageSourceChange $changes, string $sourceLanguage ): void {
	$removedMessages = $changes->getDeletions( $sourceLanguage );
	$addedMessages = $changes->getAdditions( $sourceLanguage );
	if ( $removedMessages === [] || $addedMessages === [] ) {
		// A rename needs at least one message on each side
		return;
	}

	// Similarity of each candidate pair, indexed by "<added key>|<deleted key>":
	// [ A1|D1 => 1.0, A1|D2 => 0.95, A2|D1 => 0.95 ]
	$candidates = [];
	foreach ( $addedMessages as $added ) {
		foreach ( $removedMessages as $removed ) {
			$similarity = $this->stringComparator->getSimilarity( $added['content'], $removed['content'] );
			if ( !$changes->areStringsSimilar( $similarity ) ) {
				continue;
			}

			$pairKey = $added['key'] . '|' . $removed['key'];
			$candidates[ $pairKey ] = $similarity;
		}
	}

	$this->matchRenames( $changes, $candidates, $sourceLanguage );
}
366 | |
/**
 * Records a rename for a non source language.
 *
 * @param MessageSourceChange $changes Accumulator, modified in place
 * @param string $key Old message key, still in use in the wiki
 * @param string $renameKey New message key
 * @param string $sourceContent Content from the file, stored with the new key
 * @param string $wikiContent Content from the wiki, stored with the old key
 * @param string $language Language code the rename is recorded for
 */
private function addNonSourceRenames(
	MessageSourceChange $changes,
	string $key,
	string $renameKey,
	string $sourceContent,
	string $wikiContent,
	string $language
): void {
	$similarity = $this->stringComparator->getSimilarity( $sourceContent, $wikiContent );

	$changes->addRename(
		$language,
		[ 'key' => $renameKey, 'content' => $sourceContent ],
		[ 'key' => $key, 'content' => $wikiContent ],
		$similarity
	);
}
391 | |
/**
 * Decides which added message is paired with which deleted message.
 *
 * The candidate pairs are visited from the highest to the lowest similarity,
 * and a pair is accepted as a rename only when neither of its keys has been
 * used by an earlier pair. Accepted pairs are recorded as renames and removed
 * from the additions and deletions.
 *
 * @param MessageSourceChange $changes Accumulator, modified in place
 * @param array $trackRename Similarity indexed by "<added key>|<deleted key>"
 * @param string $language Language code the renames are recorded for
 */
private function matchRenames( MessageSourceChange $changes, array $trackRename, string $language ): void {
	// Best matches first
	arsort( $trackRename, SORT_NUMERIC );

	// Keys that are already part of a rename. A key lookup is cheaper than
	// storing the keys as values and searching for them.
	$usedKeys = [];
	$matchedAdditions = [];
	$matchedDeletions = [];

	foreach ( $trackRename as $pairKey => $similarity ) {
		[ $addKey, $deleteKey ] = explode( '|', $pairKey, 2 );

		$addKeyTaken = isset( $usedKeys[ $addKey ] );
		if ( $addKeyTaken || isset( $usedKeys[ $deleteKey ] ) ) {
			// One side has already been paired with a better match
			continue;
		}

		$usedKeys[ $addKey ] = true;
		$usedKeys[ $deleteKey ] = true;

		$addMsg = $changes->findMessage( $language, $addKey, [ MessageSourceChange::ADDITION ] );
		$deleteMsg = $changes->findMessage( $language, $deleteKey, [ MessageSourceChange::DELETION ] );

		$changes->addRename( $language, $addMsg, $deleteMsg, $similarity );

		// @phan-suppress-next-line PhanTypeArraySuspiciousNullable
		$matchedAdditions[] = $addMsg['key'];
		// @phan-suppress-next-line PhanTypeArraySuspiciousNullable
		$matchedDeletions[] = $deleteMsg['key'];
	}

	$changes->removeAdditions( $language, $matchedAdditions );
	$changes->removeDeletions( $language, $matchedDeletions );
}
428 | |
/**
 * Checks if the cache has an entry for the given key.
 *
 * When the entry is missing, the revision history of the translation page is
 * compared with the cache update time to classify the miss as expected or
 * unexpected. The classification only affects logging (info vs. warning): the
 * method returns false in both cases and does not throw.
 *
 * @param MessageGroupCache $cache
 * @param MessageCollection $collection Collection that contains $messageKey
 * @param string $messageKey
 * @return bool True if the cache has an entry for the key, false otherwise
 */
private function hasCacheEntry(
	MessageGroupCache $cache,
	MessageCollection $collection,
	string $messageKey
): bool {
	if ( $cache->get( $messageKey ) !== false ) {
		return true;
	}

	// Normalise the cache update time to TS_MW, when one is available
	$cacheUpdateTime = $cache->getUpdateTimestamp();
	if ( $cacheUpdateTime !== false ) {
		$cacheUpdateTime = MWTimestamp::convert( TS_MW, $cacheUpdateTime );
	}

	$page = $this->pageStore->getPageForLink( $collection->keys()[ $messageKey ] );
	$firstRevision = $this->revisionLookup->getFirstRevision( $page );
	$lastRevision = $this->revisionLookup->getRevisionByTitle( $page );

	$logger = LoggerFactory::getInstance( LogNames::GROUP_SYNCHRONIZATION );

	// The cache legitimately lacks the message after this sequence of events:
	// 1. New translation was added for a message on translatewiki.net
	// 2. Translation was exported
	// 3. Translation was updated on translatewiki.net
	// That is: first revision < cache update time < latest revision
	$isExpectedMiss = $cacheUpdateTime !== false
		&& $firstRevision && $firstRevision->getTimestamp() < $cacheUpdateTime
		&& $lastRevision && $cacheUpdateTime < $lastRevision->getTimestamp();

	$context = [
		'messageKey' => $messageKey,
		'language' => $collection->getLanguage(),
		'cacheUpdateTime' => $cacheUpdateTime,
		'oldestRevisionTs' => $firstRevision ? $firstRevision->getTimestamp() : 'N/A',
		'latestRevisionTs' => $lastRevision ? $lastRevision->getTimestamp() : 'N/A'
	];

	if ( $isExpectedMiss ) {
		$logger->info(
			'Expected cache miss for {messageKey} in language: {language}. Cache update time: {cacheUpdateTime}',
			$context
		);
	} else {
		$logger->warning(
			'Unexpected cache miss for {messageKey} in language: {language}. Cache update time: {cacheUpdateTime}',
			$context
		);
	}

	return false;
}
487 | |
488 | } |