Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
77.98% |
85 / 109 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
TranslatablePageParser | |
77.98% |
85 / 109 |
|
75.00% |
6 / 8 |
28.65 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
containsMarkup | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
cleanupTags | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
parse | |
71.43% |
30 / 42 |
|
0.00% |
0 / 1 |
9.49 | |||
parseSection | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
3 | |||
parseUnit | |
55.56% |
15 / 27 |
|
0.00% |
0 / 1 |
9.16 | |||
armourNowiki | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
unarmourNowiki | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace MediaWiki\Extension\Translate\PageTranslation; |
5 | |
6 | use MediaWiki\Extension\Translate\Utilities\ParsingPlaceholderFactory; |
7 | |
/**
 * Generates ParserOutput from text or removes all tags from a text.
 *
 * All public entry points first hide \<nowiki> spans behind placeholders (see
 * armourNowiki()), so translate tags and unit markers inside \<nowiki> are never
 * interpreted as markup.
 *
 * @author Niklas Laxström
 * @license GPL-2.0-or-later
 * @since 2020.08
 */
class TranslatablePageParser {
	/** Source of the placeholders that stand in for nowiki spans, sections and units. */
	private ParsingPlaceholderFactory $placeholderFactory;

	public function __construct( ParsingPlaceholderFactory $placeholderFactory ) {
		$this->placeholderFactory = $placeholderFactory;
	}

	/**
	 * Checks whether the text has an opening or closing translate tag outside of
	 * \<nowiki> spans. The tag name must be followed by a space or by ">".
	 */
	public function containsMarkup( string $text ): bool {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		return preg_match( '~</?translate[ >]~', $text ) !== 0;
	}

	/**
	 * Remove all opening and closing translate tags following the same whitespace rules as the
	 * regular parsing. This doesn't try to parse the page, so it can handle unbalanced tags.
	 *
	 * Translation unit markers (\<!--T:...-->) are removed as well, and the result is passed
	 * through TranslationUnit::getTextForTrans(). Content of \<nowiki> spans is left untouched.
	 */
	public function cleanupTags( string $text ): string {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		// Tags: one newline directly after an opening tag or directly before a closing
		// tag is removed together with the tag.
		$text = preg_replace( '~<translate( nowrap)?>\n?~s', '', $text );
		$text = preg_replace( '~\n?</translate>~s', '', $text );

		// Markers: headers and the rest
		$ic = preg_quote( TranslationUnit::UNIT_MARKER_INVALID_CHARS, '~' );
		// A marker at the end of a heading line: keep the heading, drop the space and marker
		$text = preg_replace( "~(^=.*=) <!--T:[^$ic]+-->$~um", '\1', $text );
		// Any other marker, with at most one following newline or space
		$text = preg_replace( "~<!--T:[^$ic]+-->[\n ]?~um", '', $text );

		// Remove variables
		$unit = new TranslationUnit( $text );
		$text = $unit->getTextForTrans();

		return $this->unarmourNowiki( $nowiki, $text );
	}

	/**
	 * Parses the text into a template, sections and translation units.
	 *
	 * Every \<translate>...\</translate> block outside of \<nowiki> is cut out of the text
	 * and replaced with a placeholder. The content of each block is split into translation
	 * units with parseSection(). The returned ParserOutput is constructed from the remaining
	 * text (with nowiki spans restored), the Section objects keyed by their placeholder, and
	 * the translation units keyed by their placeholder.
	 *
	 * @throws ParsingFailure If translate tags are nested or unbalanced, or if a translation
	 *  unit marker is rejected by parseUnit().
	 */
	public function parse( string $text ): ParserOutput {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		$sections = [];
		$tagPlaceHolders = [];
		// Lazy match: an opening tag up to the nearest closing tag
		$re = '~(<translate(?: nowrap)?>)(.*?)</translate>~s';
		$matches = [];

		// The loop ends when there is no (more) match, and also if preg_match fails (false).
		while ( preg_match( $re, $text, $matches, PREG_OFFSET_CAPTURE ) === 1 ) {
			$contentWithTags = $matches[0][0];
			$contentWithoutTags = $matches[2][0];
			// Byte offsets in $text of the whole match, including the surrounding tags
			$offsetStart = $matches[0][1];
			$offsetEnd = $offsetStart + strlen( $contentWithTags );

			// Replace the whole match with a placeholder
			$ph = $this->placeholderFactory->make();
			$text = substr( $text, 0, $offsetStart ) . $ph . substr( $text, $offsetEnd );

			// Because the match is lazy, a second opening tag before the first closing tag
			// ends up inside the captured content.
			if ( preg_match( '~<translate( nowrap)?>~', $contentWithoutTags ) !== 0 ) {
				throw new ParsingFailure(
					'Nested tags',
					[ 'pt-parse-nested', $contentWithoutTags ]
				);
			}

			$openTag = $matches[1][0];
			$canWrap = $openTag !== '<translate nowrap>';

			// Parse the content inside the tags
			$contentWithoutTags = $this->unarmourNowiki( $nowiki, $contentWithoutTags );
			$parse = $this->parseSection( $contentWithoutTags, $canWrap );

			// Update list of sections and the template with the results
			$sections += $parse['sections'];
			$tagPlaceHolders[$ph] = new Section( $openTag, $parse['template'], '</translate>' );
		}

		// Version of the remaining text for error messages: parsed sections shown as [...]
		$prettyTemplate = $text;
		foreach ( $tagPlaceHolders as $ph => $value ) {
			$prettyTemplate = str_replace( $ph, '[...]', $prettyTemplate );
		}

		// All balanced pairs are gone by now, so any tag still present has no counterpart
		if ( preg_match( '~<translate( nowrap)?>~', $text ) !== 0 ) {
			throw new ParsingFailure(
				'Unmatched opening tag',
				[ 'pt-parse-open', $prettyTemplate ]
			);
		} elseif ( str_contains( $text, '</translate>' ) ) {
			throw new ParsingFailure(
				"Unmatched closing tag",
				[ 'pt-parse-close', $prettyTemplate ]
			);
		}

		$text = $this->unarmourNowiki( $nowiki, $text );

		return new ParserOutput( $text, $tagPlaceHolders, $sections );
	}

	/**
	 * Splits the content marked with \<translate> tags into translation units, which are
	 * separated with two or more newlines. Extra whitespace is captured in the template and
	 * is not included in the translation units.
	 *
	 * @internal
	 * @param string $text Content between the opening and the closing translate tag
	 * @param bool $canWrap Passed to TranslationUnit::setCanWrap() of every unit
	 * @return array{template:string,sections:TranslationUnit[]} The template has one
	 *  placeholder per unit; the units are keyed by that placeholder.
	 * @throws ParsingFailure See parseUnit()
	 */
	public function parseSection( string $text, bool $canWrap ): array {
		$flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE;
		// The delimiters (leading, separating and trailing whitespace) are captured so
		// that they can be copied to the template verbatim.
		$parts = preg_split( '~(^\s*|\s*\n\n\s*|\s*$)~', $text, -1, $flags );

		// Content without any newline is flagged as inline on every unit
		$inline = !str_contains( $text, "\n" );

		$template = '';
		$sections = [];

		foreach ( $parts as $part ) {
			if ( trim( $part ) === '' ) {
				// Whitespace only: belongs to the template
				$template .= $part;
				continue;
			}

			$ph = $this->placeholderFactory->make();
			$unit = $this->parseUnit( $part );
			$unit->setIsInline( $inline );
			$unit->setCanWrap( $canWrap );
			$sections[$ph] = $unit;
			$template .= $ph;
		}

		return [
			'template' => $template,
			'sections' => $sections,
		];
	}

	/**
	 * Checks if this unit already contains a translation unit marker (\<!--T:id-->). If it
	 * does, the marker is removed from the content and its id is used for the unit. If
	 * there is none, the unit gets the id TranslationUnit::NEW_UNIT_ID, which is meant to
	 * be replaced with a real value later.
	 *
	 * @internal
	 * @throws ParsingFailure If there are multiple markers, the marker is in an unsupported
	 *  position, or there is no content besides the marker
	 */
	public function parseUnit( string $content ): TranslationUnit {
		$re = '~<!--T:(.*?)-->~';
		$matches = [];
		$count = preg_match_all( $re, $content, $matches, PREG_SET_ORDER );

		if ( $count > 1 ) {
			throw new ParsingFailure(
				'Multiple translation unit markers',
				[ 'pt-shake-multiple', $content ]
			);
		}

		if ( $count !== 1 ) {
			// If no id given in the source, default to a new section id
			return new TranslationUnit( $content, TranslationUnit::NEW_UNIT_ID );
		}

		[ /*full*/, $id ] = $matches[0];

		// Currently handle only these two standard places.
		// Is this too strict?
		$markerAtStart = '~^<!--T:(.*?)-->( |\n)~'; // Normal sections
		$markerAtLineEnd = '~\s*<!--T:(.*?)-->$~m'; // Sections with title
		$content = preg_replace( $markerAtStart, '', $content );
		$content = preg_replace( $markerAtLineEnd, '', $content );

		// A marker that survived both removals is somewhere else in the content
		if ( preg_match( $re, $content ) === 1 ) {
			throw new ParsingFailure(
				'Translation unit marker is in unsupported position',
				[ 'pt-shake-position', $content ]
			);
		} elseif ( trim( $content ) === '' ) {
			throw new ParsingFailure(
				'Translation unit has no content besides marker',
				[ 'pt-shake-empty', $id ]
			);
		}

		return new TranslationUnit( $content, $id );
	}

	/**
	 * Replaces every \<nowiki>...\</nowiki> span, tags included, with a placeholder.
	 *
	 * Identical spans share one placeholder, because each replacement covers all
	 * occurrences of the matched span.
	 *
	 * @internal
	 * @param array &$holders Receives the mapping of placeholder => original span; pass
	 *  the same array to unarmourNowiki() to restore the text
	 * @param string $text
	 * @return string Text with the nowiki spans replaced
	 */
	public function armourNowiki( array &$holders, string $text ): string {
		$re = '~(<nowiki>)(.*?)(</nowiki>)~s';
		$matches = [];

		while ( preg_match( $re, $text, $matches ) ) {
			$ph = $this->placeholderFactory->make();
			$text = str_replace( $matches[0], $ph, $text );
			$holders[$ph] = $matches[0];
		}

		return $text;
	}

	/**
	 * Restores the nowiki spans hidden by armourNowiki().
	 *
	 * @internal
	 * @param array $holders Mapping of placeholder => original span
	 * @param string $text
	 * @return string Text with every placeholder replaced by its original span
	 */
	public function unarmourNowiki( array $holders, string $text ): string {
		return strtr( $text, $holders );
	}
}