Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
93.15% |
68 / 73 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
TranslationSplitter | |
93.15% |
68 / 73 |
|
40.00% |
2 / 5 |
15.07 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
splitIntoSectionTranslations | |
93.33% |
42 / 45 |
|
0.00% |
0 / 1 |
8.02 | |||
validateMwSectionNumbers | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
validateSourceSectionKeys | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
searchForTargetTitleInCorporaUnits | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace ContentTranslation\Service; |
6 | |
7 | use ContentTranslation\DTO\TranslationUnitDTO; |
8 | use ContentTranslation\Entity\SectionTranslation; |
9 | use ContentTranslation\Manager\TranslationCorporaManager; |
10 | use ContentTranslation\Store\SectionTranslationStore; |
11 | use ContentTranslation\Translation; |
12 | use DOMDocument; |
13 | |
/**
 * Service that splits a translation (a row of the cx_translations table) into section
 * translations (rows for the cx_section_translations table).
 *
 * One section translation is created for every group of parallel corpora units (cx_corpora
 * table) that share the same mw section number. The group with section number 0 is treated as
 * the lead section and gets a dummy title instead of a fetched one.
 *
 * @author Nik Gkountas
 */
class TranslationSplitter {
	/** Placeholder used as both source and target title for the lead section (section number 0). */
	private const LEAD_SECTION_DUMMY_TITLE = '__LEAD_SECTION__';

	private TranslationCorporaManager $corporaManager;
	private SectionTitleFetcher $sectionTitleFetcher;

	/**
	 * @param TranslationCorporaManager $corporaManager Used to load the translation units of a translation
	 * @param SectionTitleFetcher $sectionTitleFetcher Used to fetch the section titles of the source revision
	 */
	public function __construct( TranslationCorporaManager $corporaManager, SectionTitleFetcher $sectionTitleFetcher ) {
		$this->corporaManager = $corporaManager;
		$this->sectionTitleFetcher = $sectionTitleFetcher;
	}

	/**
	 * Build one SectionTranslation per mw section number found in the corpora units of the
	 * given translation.
	 *
	 * An empty array is returned when:
	 * - the translation has no corpora units,
	 * - at least one unit has no (null) mw section number,
	 * - no source section titles could be fetched, or
	 * - a non-lead unit refers to a section number that is missing from the fetched titles.
	 *
	 * @param Translation $translation
	 * @return SectionTranslation[]
	 */
	public function splitIntoSectionTranslations( Translation $translation ): array {
		$translationId = $translation->getTranslationId();
		$translationUnits = $this->corporaManager->getTranslationUnitDTOsByTranslationId( $translationId );

		if ( !$translationUnits || !$this->validateMwSectionNumbers( $translationUnits ) ) {
			// TODO: Should we throw an exception or log something here?
			return [];
		}

		$translationUnitsBySections = [];
		foreach ( $translationUnits as $unit ) {
			// @phan-suppress-next-line PhanTypeMismatchDimAssignment False positive
			$translationUnitsBySections[$unit->getMwSectionNumber()][] = $unit;
		}

		// The revision of the first unit is the one used to fetch the source section titles.
		$revision = array_values( $translationUnits )[0]->getRevision();
		$sourceSectionTitles = $this->sectionTitleFetcher->fetchSectionTitles(
			$translation->getSourceLanguage(),
			null,
			$revision
		);

		if ( !$sourceSectionTitles ) {
			return [];
		}

		// The lead section (0) never needs a fetched title, so it is excluded from the validation.
		$mwSectionNumbers = array_filter(
			array_keys( $translationUnitsBySections ),
			static function ( int $value ) {
				return $value !== 0;
			}
		);

		if ( !$this->validateSourceSectionKeys( $mwSectionNumbers, array_keys( $sourceSectionTitles ) ) ) {
			// TODO: Throw an exception (maybe a custom one) or log something here
			return [];
		}

		$draftStatusIndex = array_search(
			SectionTranslationStore::TRANSLATION_STATUS_DRAFT,
			SectionTranslationStore::TRANSLATION_STATUSES
		);
		// Identical for every new section translation, so it is encoded only once.
		$initialProgress = json_encode( [ "any" => null, "mt" => null, "human" => null ] );

		$newSectionTranslations = [];

		foreach ( $translationUnitsBySections as $mwSectionNumber => $translationUnitDTOs ) {
			$translationUnitDTO = $translationUnitDTOs[0];
			// It's guaranteed that the "$mwSectionNumber" index exists inside $sourceSectionTitles,
			// as the non-lead section numbers have been validated against its keys above.
			if ( $mwSectionNumber === 0 ) {
				$sourceSectionTitle = self::LEAD_SECTION_DUMMY_TITLE;
				$targetSectionTitle = self::LEAD_SECTION_DUMMY_TITLE;
			} else {
				$sourceSectionTitle = $sourceSectionTitles[$mwSectionNumber];
				// Fall back to the source title when no <h2> exists in the translated contents.
				$targetSectionTitle =
					$this->searchForTargetTitleInCorporaUnits( $translationUnitDTOs ) ?? $sourceSectionTitle;
			}

			$newSectionTranslations[] = new SectionTranslation(
				null,
				$translationId,
				$translationUnitDTO->getBaseSectionId(),
				$sourceSectionTitle,
				$targetSectionTitle,
				$draftStatusIndex,
				$initialProgress
			);
		}

		return $newSectionTranslations;
	}

	/**
	 * Given an array of TranslationUnitDTO objects, this method returns a boolean
	 * indicating whether all translation units refer to valid (not-null) mw section
	 * numbers (that is when sectionId is in the "$revision_$mwSectionNumber_$subSectionNumber" form).
	 * An empty array is considered valid.
	 *
	 * @param TranslationUnitDTO[] $translationUnits
	 * @return bool
	 */
	private function validateMwSectionNumbers( array $translationUnits ): bool {
		foreach ( $translationUnits as $translationUnit ) {
			if ( $translationUnit->getMwSectionNumber() === null ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Given an array of integers representing the mw section numbers of corpora translation units,
	 * and an array of integers representing the section keys of the section titles fetched from the API,
	 * this method returns a boolean indicating whether every given section number belongs to the
	 * list of keys returned by the API.
	 *
	 * @param int[] $mwSectionNumbers
	 * @param int[] $sectionKeys
	 * @return bool
	 */
	private function validateSourceSectionKeys( array $mwSectionNumbers, array $sectionKeys ): bool {
		return !array_diff( $mwSectionNumbers, $sectionKeys );
	}

	/**
	 * Given an array of translation unit DTOs, this method concatenates the contents of these DTOs
	 * and searches for the first <h2> element inside them. First level section titles are only
	 * contained inside <h2> elements, so only such elements are searched for.
	 *
	 * For each unit the user blob is preferred; the MT blob is only used when there is no user blob.
	 *
	 * The libxml "internal errors" flag is process-global, so it is restored to its previous
	 * value (and the errors buffered while parsing are discarded) before this method returns.
	 *
	 * @param TranslationUnitDTO[] $translationUnitDTOs
	 * @return string|null Text content of the first <h2> element, or null if there is none
	 */
	private function searchForTargetTitleInCorporaUnits( array $translationUnitDTOs ): ?string {
		$translatedContent = array_reduce(
			$translationUnitDTOs,
			static function ( string $html, TranslationUnitDTO $translationUnitDTO ) {
				$blob = $translationUnitDTO->getUserBlob() ?? $translationUnitDTO->getMtBlob();

				return $html . ( $blob['content'] ?? '' );
			},
			""
		);

		if ( $translatedContent === '' ) {
			// Nothing to parse, hence no <h2> element can be found.
			return null;
		}

		// Translated contents are HTML fragments that libxml may complain about; collect the
		// parse errors internally instead of emitting PHP warnings.
		$previousUseInternalErrors = libxml_use_internal_errors( true );
		try {
			$doc = new DOMDocument();
			// The explicit charset makes libxml treat the fragment as UTF-8.
			$doc->loadHTML(
				'<!doctype html><html><head><meta charset="UTF-8"/></head><body>' .
				$translatedContent .
				'</body></html>'
			);
			$h2Element = $doc->getElementsByTagName( 'h2' )->item( 0 );

			return $h2Element !== null ? $h2Element->textContent : null;
		} finally {
			// Do not leak global libxml state, and do not let the error buffer grow on every call.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousUseInternalErrors );
		}
	}
}