Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
93.06% |
67 / 72 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
| TranslationSplitter | |
93.06% |
67 / 72 |
|
40.00% |
2 / 5 |
15.08 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| splitIntoSectionTranslations | |
93.33% |
42 / 45 |
|
0.00% |
0 / 1 |
8.02 | |||
| validateMwSectionNumbers | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
| validateSourceSectionKeys | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| searchForTargetTitleInCorporaUnits | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | declare( strict_types = 1 ); |
| 4 | |
| 5 | namespace ContentTranslation\Service; |
| 6 | |
| 7 | use ContentTranslation\DTO\TranslationUnitDTO; |
| 8 | use ContentTranslation\Entity\SectionTranslation; |
| 9 | use ContentTranslation\Manager\TranslationCorporaManager; |
| 10 | use ContentTranslation\Store\SectionTranslationStore; |
| 11 | use ContentTranslation\Translation; |
| 12 | use DOMDocument; |
| 13 | |
/**
 * Service that splits a translation (a row of the cx_translations table) into section
 * translations (rows for the cx_section_translations table).
 *
 * One SectionTranslation is created for each article section that has translation units
 * in the translation parallel corpora (cx_corpora table). Units that belong to the lead
 * section (mw section number 0) also produce a SectionTranslation, which is stored under
 * a placeholder title instead of a real section title.
 *
 * @author Nik Gkountas
 */
class TranslationSplitter {
	/** Placeholder used as both source and target title for the lead section. */
	private const LEAD_SECTION_DUMMY_TITLE = '__LEAD_SECTION__';

	public function __construct(
		private readonly TranslationCorporaManager $corporaManager,
		private readonly SectionTitleFetcher $sectionTitleFetcher
	) {
	}

	/**
	 * Build the section translations that correspond to the corpora units of the given translation.
	 *
	 * An empty array is returned when the translation has no corpora units, when any unit lacks
	 * a mw section number, when no source section titles could be fetched, or when a unit refers
	 * to a section number that is not among the fetched source section titles.
	 *
	 * @param Translation $translation
	 * @return SectionTranslation[]
	 */
	public function splitIntoSectionTranslations( Translation $translation ): array {
		$translationUnits = $this->corporaManager->getTranslationUnitDTOsByTranslationId(
			$translation->getTranslationId()
		);

		if ( !$translationUnits || !$this->validateMwSectionNumbers( $translationUnits ) ) {
			// TODO: Should we throw an exception or log something here?
			return [];
		}

		$translationUnitsBySections = [];
		foreach ( $translationUnits as $unit ) {
			// @phan-suppress-next-line PhanTypeMismatchDimAssignment False positive
			$translationUnitsBySections[$unit->getMwSectionNumber()][] = $unit;
		}

		// The revision of the first unit is used to fetch the source section titles.
		$revision = array_values( $translationUnits )[0]->getRevision();
		$sourceSectionTitles = $this->sectionTitleFetcher->fetchSectionTitles(
			$translation->getSourceLanguage(),
			null,
			$revision
		);

		if ( !$sourceSectionTitles ) {
			return [];
		}

		// The lead section (number 0) is excluded from the key validation below,
		// because its title is never looked up in $sourceSectionTitles.
		$mwSectionNumbers = array_filter(
			array_keys( $translationUnitsBySections ),
			static fn ( int $value ): bool => $value !== 0
		);

		if ( !$this->validateSourceSectionKeys( $mwSectionNumbers, array_keys( $sourceSectionTitles ) ) ) {
			// TODO: Throw an exception (maybe a custom one) or log something here
			return [];
		}

		$draftStatusIndex = array_search(
			SectionTranslationStore::TRANSLATION_STATUS_DRAFT,
			SectionTranslationStore::TRANSLATION_STATUSES,
			true
		);

		// Every new section translation starts with the same empty progress payload,
		// so it is encoded once instead of once per section.
		$emptyProgress = json_encode( [ 'any' => null, 'mt' => null, 'human' => null ] );

		$newSectionTranslations = [];

		foreach ( $translationUnitsBySections as $mwSectionNumber => $translationUnitDTOs ) {
			$translationUnitDTO = $translationUnitDTOs[0];

			if ( $mwSectionNumber === 0 ) {
				// Lead section: a placeholder is used for both the source and the target title.
				$sourceSectionTitle = self::LEAD_SECTION_DUMMY_TITLE;
				$targetSectionTitle = self::LEAD_SECTION_DUMMY_TITLE;
			} else {
				// The "$mwSectionNumber" key is guaranteed to exist inside $sourceSectionTitles,
				// as the source section keys have already been validated above.
				$sourceSectionTitle = $sourceSectionTitles[$mwSectionNumber];
				// Fall back to the source title when no translated <h2> title is found.
				$targetSectionTitle =
					$this->searchForTargetTitleInCorporaUnits( $translationUnitDTOs ) ?? $sourceSectionTitle;
			}

			$newSectionTranslations[] = new SectionTranslation(
				null,
				$translation->getTranslationId(),
				$translationUnitDTO->getBaseSectionId(),
				$sourceSectionTitle,
				$targetSectionTitle,
				$draftStatusIndex,
				$emptyProgress
			);
		}

		return $newSectionTranslations;
	}

	/**
	 * Given an array of TranslationUnitDTO objects, this method returns a boolean
	 * indicating whether all translation units refer to valid (not-null) mw section
	 * numbers (that is when sectionId is in the "$revision_$mwSectionNumber_$subSectionNumber" form)
	 *
	 * @param TranslationUnitDTO[] $translationUnits
	 * @return bool
	 */
	private function validateMwSectionNumbers( array $translationUnits ): bool {
		foreach ( $translationUnits as $translationUnit ) {
			if ( $translationUnit->getMwSectionNumber() === null ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Given an array of integers representing the mw section numbers of corpora translation units,
	 * and an array of integers representing the section keys of the section titles fetched from the API,
	 * this method returns a boolean indicating whether every corpora translation unit has a section
	 * number that belongs to the list of keys returned by the API.
	 *
	 * @param int[] $mwSectionNumbers
	 * @param int[] $sectionKeys
	 * @return bool
	 */
	private function validateSourceSectionKeys( array $mwSectionNumbers, array $sectionKeys ): bool {
		return !array_diff( $mwSectionNumbers, $sectionKeys );
	}

	/**
	 * Given an array of translation unit DTOs, this method concatenates the contents of these DTOs
	 * and searches for a first level section title inside these contents. Such section titles are only
	 * contained inside <h2> elements, so only such elements are searched, and the text of the first
	 * one found is returned.
	 *
	 * For each unit the user blob is preferred, and the MT blob is used when there is no user blob.
	 *
	 * @param TranslationUnitDTO[] $translationUnitDTOs
	 * @return string|null Text content of the first <h2> element, or null when there is none
	 */
	private function searchForTargetTitleInCorporaUnits( array $translationUnitDTOs ): ?string {
		$translatedContent = array_reduce(
			$translationUnitDTOs,
			static function ( string $html, TranslationUnitDTO $translationUnitDTO ): string {
				$blob = $translationUnitDTO->getUserBlob() ?? $translationUnitDTO->getMtBlob();

				return $html . ( $blob['content'] ?? '' );
			},
			''
		);

		if ( $translatedContent === '' ) {
			// Nothing to parse, so there cannot be any <h2> element.
			return null;
		}

		// Buffer libxml parse errors instead of emitting warnings for the HTML fragment.
		// The setting is process-global, so the previous value is remembered and restored.
		$previousSetting = libxml_use_internal_errors( true );
		try {
			$doc = new DOMDocument();
			$doc->loadHTML(
				'<!doctype html><html><head><meta charset="UTF-8"/></head><body>' .
				$translatedContent .
				'</body></html>'
			);

			// item( 0 ) is null when the list is empty, which makes the method return null.
			return $doc->getElementsByTagName( 'h2' )->item( 0 )?->textContent;
		} finally {
			// Drop the errors buffered while parsing, so that they do not accumulate.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousSetting );
		}
	}
}