Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
100.00% |
145 / 145 |
|
100.00% |
11 / 11 |
CRAP | |
100.00% |
1 / 1 |
| WikitextContentCleaner | |
100.00% |
145 / 145 |
|
100.00% |
11 / 11 |
45 | |
100.00% |
1 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getLatestNumberOfReplacements | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| setSourceWikiLanguageTemplate | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| cleanWikitext | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| cleanHeadings | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
| cleanTemplates | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
5 | |||
| parseTemplate | |
100.00% |
52 / 52 |
|
100.00% |
1 / 1 |
17 | |||
| scanFormatSnippet | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
| scanValue | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| renameTemplateParameters | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
8 | |||
| addRequiredTemplateParameters | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace FileImporter\Services\Wikitext; |
| 4 | |
| 5 | use FileImporter\Data\WikitextConversions; |
| 6 | |
| 7 | /** |
| 8 | * @license GPL-2.0-or-later |
| 9 | * @author Thiemo Kreuz |
| 10 | */ |
class WikitextContentCleaner {

	/** Number of templates that were replaced or removed by the most recent template cleanup */
	private int $latestNumberOfReplacements = 0;
	/** Name of a language template to wrap parameters in, e.g. "de" for the {{de|…}} template */
	private ?string $sourceWikiLanguageTemplate = null;

	public function __construct(
		private readonly WikitextConversions $wikitextConversions,
	) {
	}

	/**
	 * @return int Number of template replacements/removals counted during the last
	 *  {@see cleanWikitext} call, 0 if it was never called
	 */
	public function getLatestNumberOfReplacements(): int {
		return $this->latestNumberOfReplacements;
	}

	/**
	 * @param string $template Name of the language template that should be wrapped around
	 *  parameter values that are configured to get one, e.g. "de"
	 */
	public function setSourceWikiLanguageTemplate( string $template ): void {
		$this->sourceWikiLanguageTemplate = $template;
	}

	/**
	 * Runs the heading cleanup first, then the template cleanup, and trims the result.
	 */
	public function cleanWikitext( string $wikitext ): string {
		$wikitext = $this->cleanHeadings( $wikitext );
		$wikitext = $this->cleanTemplates( $wikitext );
		return trim( $wikitext );
	}

	/**
	 * Passes the trimmed text of every "== heading ==" style line through
	 * {@see WikitextConversions::swapHeading}, leaving the equal signs untouched.
	 */
	private function cleanHeadings( string $wikitext ): string {
		$cleaned = preg_replace_callback(
			'/^
				# Group 1
				(
					# Group 2 captures any opening equal signs, the extra + avoids backtracking
					(=++)
					# Consume horizontal whitespace
					\h*+
				)
				# The ungreedy group 3 will capture the trimmed heading
				(.*?)
				# Look-ahead for what group 2 captured
				(?=\h*\2\h*$)
			/mx',
			function ( array $matches ): string {
				return $matches[1] . $this->wikitextConversions->swapHeading( $matches[3] );
			},
			$wikitext
		);

		// preg_replace_callback() returns null when PCRE fails (e.g. backtrack limit reached).
		// Keep the headings as they are in this case instead of violating the return type.
		return $cleaned ?? $wikitext;
	}

	/**
	 * Removes obsolete templates, renames templates and their parameters, and adds required
	 * parameters, all according to the WikitextConversions. Resets and updates the replacement
	 * counter.
	 */
	private function cleanTemplates( string $wikitext ): string {
		$this->latestNumberOfReplacements = 0;

		preg_match_all(
			// This intentionally only searches for the start of each template
			'/(?<!{){{\s*+([^{|}]+?)\s*(?=\||}})/s',
			$wikitext,
			$matches,
			PREG_OFFSET_CAPTURE
		);
		// Guard against a failed match that leaves the capture groups unset
		$templateNames = $matches[1] ?? [];

		// Replacements must be applied in reverse order to not mess with the captured offsets!
		for ( $i = count( $templateNames ); $i--; ) {
			[ $oldTemplateName, $offset ] = $templateNames[$i];

			$isObsolete = $this->wikitextConversions->isObsoleteTemplate( $oldTemplateName );
			$newTemplateName = $this->wikitextConversions->swapTemplate( $oldTemplateName );
			if ( !$isObsolete && !$newTemplateName ) {
				continue;
			}

			$endOfTemplateName = (int)$offset + strlen( $oldTemplateName );
			$parseResult = $this->parseTemplate( $wikitext, $endOfTemplateName );

			$this->latestNumberOfReplacements++;

			if ( $isObsolete ) {
				// Cut out everything from the opening {{ to the end of the template
				$start = $matches[0][$i][1];
				$wikitext = substr_replace( $wikitext, '', $start, $parseResult['end'] - $start );
				continue;
			}
			'@phan-var string $newTemplateName';

			// The following three steps work from the back to the front of the template, so
			// the offsets used by later steps are not shifted by earlier ones.
			$wikitext = $this->renameTemplateParameters(
				$wikitext,
				$parseResult['parameters'],
				$this->wikitextConversions->getTemplateParameters( $oldTemplateName ),
				$this->sourceWikiLanguageTemplate
			);

			$wikitext = $this->addRequiredTemplateParameters(
				$wikitext,
				$this->wikitextConversions->getRequiredTemplateParameters( $oldTemplateName ),
				$parseResult['parameters'],
				$endOfTemplateName
			);

			$wikitext = substr_replace(
				$wikitext,
				$newTemplateName,
				$offset,
				strlen( $oldTemplateName )
			);
		}

		// Collapse any amount of line breaks to a maximum of two (= one empty line)
		// preg_replace() returns null on PCRE errors, keep the uncollapsed text in this case
		return preg_replace( '/\n\s*\n\s*\n/', "\n\n", $wikitext ) ?? $wikitext;
	}

	/**
	 * @suppress PhanTypeInvalidDimOffset false positive with $p being -1
	 * @param string $wikitext
	 * @param int $startPosition Must be after the opening {{, and before or exactly at the first |
	 *
	 * @return array{end: int, parameters: array[]} Parse result in the following format:
	 * [
	 *     'end' => absolute position right after the closing }} of the template, or the length
	 *         of the wikitext when the template is cut off,
	 *     'parameters' => [
	 *         [
	 *             'offset' => absolute position of the parameter name in the wikitext, or where the
	 *                 parameter name needs to be placed for unnamed parameters,
	 *             'number' => positive integer number, only present for unnamed parameters,
	 *             'name' => optional string name of the parameter,
	 *             'format' => string The separators and whitespace found around the parameter
	 *                 name, with "_" as the placeholder for the name, e.g. "|_="
	 *             'valueOffset' => int Absolute position of the value's first non-whitespace
	 *                 character in the wikitext
	 *             'value' => string Trimmed value, might be an empty string
	 *         ],
	 *         …
	 *     ]
	 * ]
	 */
	private function parseTemplate( string $wikitext, int $startPosition ): array {
		$max = strlen( $wikitext );
		// Templates can be nested, but links can not
		$inWikiLink = false;
		$nesting = 0;
		$params = [];
		$p = -1;
		$number = 0;

		for ( $i = $startPosition; $i < $max; $i++ ) {
			// Optimization: Skip over irrelevant chars without slow loop
			$i += strcspn( $wikitext, '=[]{|}', $i );

			$currentChar = $wikitext[$i] ?? null;
			$currentPair = substr( $wikitext, $i, 2 );

			if ( $currentPair === '[[' ) {
				$inWikiLink = true;
			} elseif ( $currentPair === ']]' ) {
				$inWikiLink = false;
			} elseif ( $currentPair === '{{' ) {
				$nesting++;
				// Skip the second bracket, it can't be the start of another pair
				$i++;
			} elseif ( $currentPair === '}}' || $currentPair === '}' ) {
				// A pair of only "}" means this is the very last character of the wikitext
				if ( !$nesting ) {
					if ( isset( $params[$p] ) ) {
						$this->scanValue( $wikitext, $i, $params[$p] );
					}

					// Note this parser intentionally accepts incomplete, cut-off templates
					$max = min( $max, $i + 2 );
					break;
				}

				$nesting--;
				// Skip the second bracket, it can't be the end of another pair
				$i++;
			} elseif ( $currentChar === '|' && !$inWikiLink && !$nesting ) {
				// The previous parameter ends here, finalize its value
				if ( isset( $params[$p] ) ) {
					$this->scanValue( $wikitext, $i, $params[$p] );
				}

				// Every parameter starts as an unnamed one. "valueOffset" and "value" are
				// preliminary and get refined when the end of the parameter is found.
				$params[++$p] = [
					'number' => ++$number,
					'offset' => $i + 1,
					'format' => $this->scanFormatSnippet( $wikitext, $i ) . '_=',
					'valueOffset' => $i + 1,
					'value' => substr( $wikitext, $i + 1 ),
				];
			} elseif ( $currentChar === '='
				&& !$nesting
				&& isset( $params[$p] )
				&& !isset( $params[$p]['name'] )
			) {
				// First equal sign in this parameter, i.e. it turns out to be a named one
				unset( $params[$p]['number'] );
				$number--;

				$offset = $params[$p]['offset'];
				$name = rtrim( substr( $wikitext, $offset, $i - $offset ) );
				$params[$p]['name'] = ltrim( $name );
				// Skip (optional) whitespace between | and the parameter name
				$params[$p]['offset'] += strlen( $name ) - strlen( $params[$p]['name'] );
				// @phan-suppress-next-line PhanTypeMismatchArgumentInternal "format" is guaranteed
				$params[$p]['format'] = rtrim( $params[$p]['format'], '=' )
					. $this->scanFormatSnippet( $wikitext, $i );
				$params[$p]['valueOffset'] = $i + 1;
			}
		}

		return [
			'end' => $max,
			'parameters' => $params,
		];
	}

	/**
	 * @return string Substring from $wikitext including the character at $offset, and all
	 *  whitespace left and right
	 */
	private function scanFormatSnippet( string $wikitext, int $offset ): string {
		$from = $offset;
		while ( $from > 0 && ctype_space( $wikitext[$from - 1] ) ) {
			$from--;
		}

		$to = $offset + 1;
		$max = strlen( $wikitext );
		while ( $to < $max && ctype_space( $wikitext[$to] ) ) {
			$to++;
		}

		return substr( $wikitext, $from, $to - $from );
	}

	/**
	 * Finalizes the "valueOffset" and "value" elements of a parameter when its end is known.
	 *
	 * @param string $wikitext
	 * @param int $end Absolute position of the character that terminates the value
	 * @param array &$param Single parameter as created by {@see parseTemplate}, updated in place
	 */
	private function scanValue( string $wikitext, int $end, array &$param ): void {
		$offset = $param['valueOffset'];

		// To not place replacements for empty values in the next line, we skip horizontal
		// whitespace only
		if ( preg_match( '/(?!\h)/u', $wikitext, $matches, PREG_OFFSET_CAPTURE, $offset ) ) {
			$newOffset = $matches[0][1];
		} else {
			// The /u modifier makes preg_match() fail when the wikitext is not valid UTF-8.
			// Fall back to a byte-wise scan for space and tab instead of using a bogus offset.
			$newOffset = $offset + strspn( $wikitext, " \t", $offset );
		}

		$param['valueOffset'] = $newOffset;
		$param['value'] = rtrim( substr( $wikitext, $newOffset, $end - $newOffset ) );
	}

	/**
	 * @param string $wikitext
	 * @param array[] $parameters "parameters" list as returned by {@see parseTemplate}
	 * @param array[] $replacements Array mapping old to new parameters, as returned by
	 *  {@see WikitextConversions::getTemplateParameters}
	 * @param string|null $languageTemplate Name of a language template to wrap parameters in,
	 *  e.g. "de" for the {{de|…}} template
	 *
	 * @return string Wikitext with the parameter names replaced, and values wrapped where needed
	 */
	private function renameTemplateParameters(
		string $wikitext,
		array $parameters,
		array $replacements,
		?string $languageTemplate
	): string {
		if ( $replacements === [] ) {
			return $wikitext;
		}

		// Replacements must be applied in reverse order to not mess with the captured offsets!
		for ( $i = count( $parameters ); $i--; ) {
			$from = $parameters[$i]['name'] ?? $parameters[$i]['number'];

			if ( isset( $replacements[$from] ) ) {
				if ( $languageTemplate !== null && $replacements[$from]['addLanguageTemplate'] ) {
					// Don't wrap values that already start with the language template, or with
					// any other template with a two-letter lowercase name
					$regex = '/\{\{\s*(' . preg_quote( $languageTemplate, '/' ) . '|[a-z]{2})\s*\|/A';
					$start = $parameters[$i]['valueOffset'];
					if ( !preg_match( $regex, $wikitext, $matches, 0, $start ) ) {
						// Close first, so $start is still valid for the opening part
						$end = $start + strlen( $parameters[$i]['value'] );
						$wikitext = substr_replace( $wikitext, '}}', $end, 0 );
						$wikitext = substr_replace( $wikitext, '{{' . $languageTemplate . '|', $start, 0 );
					}
				}

				$to = $replacements[$from]['target'];
				$offset = $parameters[$i]['offset'];
				if ( isset( $parameters[$i]['name'] ) ) {
					$wikitext = substr_replace( $wikitext, $to, $offset, strlen( $from ) );
				} else {
					// Insert parameter name when the source parameter was unnamed
					$wikitext = substr_replace( $wikitext, $to . '=', $offset, 0 );
				}
			}
		}

		return $wikitext;
	}

	/**
	 * @param string $wikitext
	 * @param string[] $required List of parameter name => string value pairs
	 * @param array[] $parameters "parameters" list as returned by {@see parseTemplate}
	 * @param int $offset Exact position where to insert the new parameter
	 *
	 * @return string Wikitext with all required parameters added that are not already present
	 */
	private function addRequiredTemplateParameters(
		string $wikitext,
		array $required,
		array $parameters,
		int $offset
	): string {
		if ( !$required ) {
			return $wikitext;
		}

		// Whatever is already present in the template doesn't need to be added
		foreach ( $parameters as $param ) {
			$name = $param['name'] ?? $param['number'];
			unset( $required[$name] );
		}

		// Mimic the formatting of the first existing parameter, if there is one
		$format = $parameters[0]['format'] ?? '|_=';
		$newWikitext = '';
		foreach ( $required as $name => $value ) {
			$newWikitext .= str_replace( '_', $name, $format ) . $value;
		}

		return substr_replace( $wikitext, $newWikitext, $offset, 0 );
	}

}