Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
145 / 145 |
|
100.00% |
11 / 11 |
CRAP | |
100.00% |
1 / 1 |
WikitextContentCleaner | |
100.00% |
145 / 145 |
|
100.00% |
11 / 11 |
45 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getLatestNumberOfReplacements | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setSourceWikiLanguageTemplate | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
cleanWikitext | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
cleanHeadings | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
cleanTemplates | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
5 | |||
parseTemplate | |
100.00% |
52 / 52 |
|
100.00% |
1 / 1 |
17 | |||
scanFormatSnippet | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
scanValue | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
renameTemplateParameters | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
8 | |||
addRequiredTemplateParameters | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | namespace FileImporter\Services\Wikitext; |
4 | |
5 | use FileImporter\Data\WikitextConversions; |
6 | |
/**
 * Rewrites wikitext with the help of a {@see WikitextConversions} rule set: heading texts are
 * swapped, templates flagged as obsolete are removed, and other templates are renamed together
 * with their parameters.
 *
 * @license GPL-2.0-or-later
 * @author Thiemo Kreuz
 */
class WikitextContentCleaner {

	/** @var int Number of templates touched (removed or replaced) by the last cleanTemplates() run */
	private int $latestNumberOfReplacements = 0;
	private WikitextConversions $wikitextConversions;
	/** @var string|null Name of a language template to wrap parameters in, e.g. "de" for the {{de|…}} template */
	private ?string $sourceWikiLanguageTemplate = null;

	public function __construct( WikitextConversions $conversions ) {
		$this->wikitextConversions = $conversions;
	}

	/**
	 * @return int Number of templates that were removed or replaced during the most recent
	 *  {@see cleanWikitext} call, 0 if it was never called
	 */
	public function getLatestNumberOfReplacements(): int {
		return $this->latestNumberOfReplacements;
	}

	/**
	 * @param string $template Name of a language template to wrap parameters in, e.g. "de" for
	 *  the {{de|…}} template
	 */
	public function setSourceWikiLanguageTemplate( string $template ): void {
		$this->sourceWikiLanguageTemplate = $template;
	}

	/**
	 * Runs the heading cleanup first, then the template cleanup, and trims the result.
	 *
	 * @param string $wikitext
	 *
	 * @return string
	 */
	public function cleanWikitext( string $wikitext ): string {
		$wikitext = $this->cleanHeadings( $wikitext );
		$wikitext = $this->cleanTemplates( $wikitext );
		return trim( $wikitext );
	}

	/**
	 * Passes the trimmed text of every "== … ==" style heading line through
	 * {@see WikitextConversions::swapHeading}. The equal signs and the whitespace around the
	 * heading text are left untouched.
	 *
	 * @param string $wikitext
	 *
	 * @return string
	 */
	private function cleanHeadings( string $wikitext ): string {
		$result = preg_replace_callback(
			'/^
				# Group 1
				(
					# Group 2 captures any opening equal signs, the extra + avoids backtracking
					(=++)
					# Consume horizontal whitespace
					\h*+
				)
				# The ungreedy group 3 will capture the trimmed heading
				(.*?)
				# Look-ahead for what group 2 captured
				(?=\h*\2\h*$)
			/mx',
			function ( array $matches ): string {
				return $matches[1] . $this->wikitextConversions->swapHeading( $matches[3] );
			},
			$wikitext
		);

		// preg_replace_callback() returns null on PCRE errors; keep the input instead of
		// violating the declared return type
		return $result ?? $wikitext;
	}

	/**
	 * Removes obsolete templates and renames all templates {@see WikitextConversions} knows a
	 * replacement for, including their parameters. Resets and updates the replacement counter.
	 *
	 * @param string $wikitext
	 *
	 * @return string
	 */
	private function cleanTemplates( string $wikitext ): string {
		$this->latestNumberOfReplacements = 0;

		preg_match_all(
			// This intentionally only searches for the start of each template
			'/(?<!{){{\s*+([^{|}]+?)\s*(?=\||}})/s',
			$wikitext,
			$matches,
			PREG_OFFSET_CAPTURE
		);

		// Replacements must be applied in reverse order to not mess with the captured offsets!
		// The ?? guards against a failed preg_match_all() that left no capture groups behind.
		for ( $i = count( $matches[1] ?? [] ); $i-- > 0; ) {
			[ $oldTemplateName, $offset ] = $matches[1][$i];

			$isObsolete = $this->wikitextConversions->isObsoleteTemplate( $oldTemplateName );
			$newTemplateName = $this->wikitextConversions->swapTemplate( $oldTemplateName );
			if ( !$isObsolete && !$newTemplateName ) {
				continue;
			}

			$endOfTemplateName = (int)$offset + strlen( $oldTemplateName );
			$parseResult = $this->parseTemplate( $wikitext, $endOfTemplateName );

			$this->latestNumberOfReplacements++;

			if ( $isObsolete ) {
				// Cut out everything from the opening {{ to the end the parser found
				$start = $matches[0][$i][1];
				$wikitext = substr_replace( $wikitext, '', $start, $parseResult['end'] - $start );
				continue;
			}
			'@phan-var string $newTemplateName';

			// The three edits below go from the back of the template to the front (parameters,
			// then the position right after the name, then the name itself), so each edit only
			// shifts text behind the positions that are still to be used
			$wikitext = $this->renameTemplateParameters(
				$wikitext,
				$parseResult['parameters'],
				$this->wikitextConversions->getTemplateParameters( $oldTemplateName ),
				$this->sourceWikiLanguageTemplate
			);

			$wikitext = $this->addRequiredTemplateParameters(
				$wikitext,
				$this->wikitextConversions->getRequiredTemplateParameters( $oldTemplateName ),
				$parseResult['parameters'],
				$endOfTemplateName
			);

			$wikitext = substr_replace(
				$wikitext,
				$newTemplateName,
				$offset,
				strlen( $oldTemplateName )
			);
		}

		// Collapse any amount of line breaks to a maximum of two (= one empty line).
		// preg_replace() returns null on PCRE errors; keep the uncollapsed text in that case.
		return preg_replace( '/\n\s*\n\s*\n/', "\n\n", $wikitext ) ?? $wikitext;
	}

	/**
	 * @suppress PhanTypeInvalidDimOffset false positive with $p being -1
	 * @param string $wikitext
	 * @param int $startPosition Must be after the opening {{, and before or exactly at the first |
	 *
	 * @return array Parse result in the following format:
	 * [
	 *     'end' => int Absolute position right after the closing }} of the template, never
	 *         larger than the length of the wikitext (relevant for cut-off templates),
	 *     'parameters' => [
	 *         [
	 *             'offset' => absolute position of the parameter name in the wikitext, or where the
	 *                 parameter name needs to be placed for unnamed parameters,
	 *             'number' => positive integer number, only present for unnamed parameters,
	 *             'name' => optional string name of the parameter,
	 *             'format' => string Pattern made of the | and = separators including the
	 *                 whitespace around them, with "_" as placeholder for the parameter name,
	 *             'valueOffset' => int Absolute position of the value's first non-whitespace
	 *                 character in the wikitext
	 *             'value' => string Trimmed value, might be an empty string
	 *         ],
	 *         …
	 *     ]
	 * ]
	 */
	private function parseTemplate( string $wikitext, int $startPosition ): array {
		$max = strlen( $wikitext );
		// Templates can be nested, but links can not
		$inWikiLink = false;
		$nesting = 0;
		$params = [];
		$p = -1;
		$number = 0;

		for ( $i = $startPosition; $i < $max; $i++ ) {
			// Optimization: Skip over irrelevant chars without slow loop
			$i += strcspn( $wikitext, '=[]{|}', $i );

			$currentChar = $wikitext[$i] ?? null;
			$currentPair = substr( $wikitext, $i, 2 );

			if ( $currentPair === '[[' ) {
				$inWikiLink = true;
			} elseif ( $currentPair === ']]' ) {
				$inWikiLink = false;
			} elseif ( $currentPair === '{{' ) {
				$nesting++;
				// Skip the second bracket, it can't be the start of another pair
				$i++;
			} elseif ( $currentPair === '}}' || $currentPair === '}' ) {
				// A lone "}" can only show up as a pair at the very end of the wikitext
				if ( !$nesting ) {
					if ( isset( $params[$p] ) ) {
						$this->scanValue( $wikitext, $i, $params[$p] );
					}

					// Note this parser intentionally accepts incomplete, cut-off templates
					$max = min( $max, $i + 2 );
					break;
				}

				$nesting--;
				// Skip the second bracket, it can't be the end of another pair
				$i++;
			} elseif ( $currentChar === '|' && !$inWikiLink && !$nesting ) {
				// A new parameter starts, which means the previous one is complete
				if ( isset( $params[$p] ) ) {
					$this->scanValue( $wikitext, $i, $params[$p] );
				}

				// Assume an unnamed parameter until an = proves otherwise. The value is
				// preliminary and gets finalized by scanValue() at the next | or }}.
				$params[++$p] = [
					'number' => ++$number,
					'offset' => $i + 1,
					'format' => $this->scanFormatSnippet( $wikitext, $i ) . '_=',
					'valueOffset' => $i + 1,
					'value' => substr( $wikitext, $i + 1 ),
				];
			} elseif ( $currentChar === '='
				&& !$nesting
				&& isset( $params[$p] )
				&& !isset( $params[$p]['name'] )
			) {
				// First = in the current parameter: it is a named one, give the number back
				unset( $params[$p]['number'] );
				$number--;

				$offset = $params[$p]['offset'];
				$name = rtrim( substr( $wikitext, $offset, $i - $offset ) );
				$params[$p]['name'] = ltrim( $name );
				// Skip (optional) whitespace between | and the parameter name
				$params[$p]['offset'] += strlen( $name ) - strlen( $params[$p]['name'] );
				// @phan-suppress-next-line PhanTypeMismatchArgumentInternal "format" is guaranteed
				$params[$p]['format'] = rtrim( $params[$p]['format'], '=' )
					. $this->scanFormatSnippet( $wikitext, $i );
				$params[$p]['valueOffset'] = $i + 1;
			}
		}

		return [
			'end' => $max,
			'parameters' => $params,
		];
	}

	/**
	 * @param string $wikitext
	 * @param int $offset
	 *
	 * @return string Substring from $wikitext including the character at $offset, and all
	 *  whitespace left and right
	 */
	private function scanFormatSnippet( string $wikitext, int $offset ): string {
		$from = $offset;
		while ( $from > 0 && ctype_space( $wikitext[$from - 1] ) ) {
			$from--;
		}

		$to = $offset + 1;
		$max = strlen( $wikitext );
		while ( $to < $max && ctype_space( $wikitext[$to] ) ) {
			$to++;
		}

		return substr( $wikitext, $from, $to - $from );
	}

	/**
	 * Finalizes the "valueOffset" and "value" elements of a parameter once its end is known.
	 *
	 * @param string $wikitext
	 * @param int $end Absolute position of the first character after the value
	 * @param array &$param Parameter as created by {@see parseTemplate}, updated in place
	 */
	private function scanValue( string $wikitext, int $end, array &$param ): void {
		$offset = $param['valueOffset'];

		// To not place replacements for empty values in the next line, we skip horizontal
		// whitespace only
		if ( preg_match( '/(?!\h)/u', $wikitext, $matches, PREG_OFFSET_CAPTURE, $offset ) ) {
			$newOffset = $matches[0][1];
		} else {
			// The /u modifier makes preg_match() fail on invalid UTF-8. Skip spaces and tabs
			// byte-wise instead of continuing with a broken offset.
			$newOffset = $offset + strspn( $wikitext, " \t", $offset );
		}

		$param['valueOffset'] = $newOffset;
		$param['value'] = rtrim( substr( $wikitext, $newOffset, $end - $newOffset ) );
	}

	/**
	 * @param string $wikitext
	 * @param array[] $parameters "parameters" list as returned by {@see parseTemplate}
	 * @param array[] $replacements Array mapping old to new parameters, as returned by
	 *  {@see WikitextConversions::getTemplateParameters}
	 * @param string|null $languageTemplate Name of a language template to wrap parameters in,
	 *  e.g. "de" for the {{de|…}} template
	 *
	 * @return string
	 */
	private function renameTemplateParameters(
		string $wikitext,
		array $parameters,
		array $replacements,
		?string $languageTemplate
	): string {
		if ( $replacements === [] ) {
			return $wikitext;
		}

		// Replacements must be applied in reverse order to not mess with the captured offsets!
		for ( $i = count( $parameters ); $i-- > 0; ) {
			$from = $parameters[$i]['name'] ?? $parameters[$i]['number'];

			if ( isset( $replacements[$from] ) ) {
				if ( $languageTemplate !== null && $replacements[$from]['addLanguageTemplate'] ) {
					// Don't wrap values that already start with the language template, or with
					// any other template that has a two-letter lowercase name
					$regex = '/\{\{\s*(' . preg_quote( $languageTemplate, '/' ) . '|[a-z]{2})\s*\|/A';
					$start = $parameters[$i]['valueOffset'];
					if ( !preg_match( $regex, $wikitext, $matches, 0, $start ) ) {
						$end = $start + strlen( $parameters[$i]['value'] );
						// Close first, then open, so $start is still valid for the second edit
						$wikitext = substr_replace( $wikitext, '}}', $end, 0 );
						$wikitext = substr_replace( $wikitext, '{{' . $languageTemplate . '|', $start, 0 );
					}
				}

				$to = $replacements[$from]['target'];
				$offset = $parameters[$i]['offset'];
				if ( isset( $parameters[$i]['name'] ) ) {
					$wikitext = substr_replace( $wikitext, $to, $offset, strlen( $from ) );
				} else {
					// Insert parameter name when the source parameter was unnamed
					$wikitext = substr_replace( $wikitext, $to . '=', $offset, 0 );
				}
			}
		}

		return $wikitext;
	}

	/**
	 * Inserts all required parameters the template does not contain yet, formatted like the
	 * template's first parameter.
	 *
	 * @param string $wikitext
	 * @param string[] $required List of parameter name => string value pairs
	 * @param array[] $parameters "parameters" list as returned by {@see parseTemplate}
	 * @param int $offset Exact position where to insert the new parameter
	 *
	 * @return string
	 */
	private function addRequiredTemplateParameters(
		string $wikitext,
		array $required,
		array $parameters,
		int $offset
	): string {
		if ( !$required ) {
			return $wikitext;
		}

		// Drop everything that is already present, no matter with what value
		foreach ( $parameters as $param ) {
			$name = $param['name'] ?? $param['number'];
			unset( $required[$name] );
		}

		// Mimic the separators and whitespace of the first existing parameter
		$format = $parameters[0]['format'] ?? '|_=';
		$newWikitext = '';
		foreach ( $required as $name => $value ) {
			$newWikitext .= str_replace( '_', $name, $format ) . $value;
		}

		return substr_replace( $wikitext, $newWikitext, $offset, 0 );
	}

}