Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 179 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
TestUtils | |
0.00% |
0 / 179 |
|
0.00% |
0 / 12 |
3080 | |
0.00% |
0 / 1 |
encodeXml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
normalizeAbout | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
normalizeOut | |
0.00% |
0 / 57 |
|
0.00% |
0 / 1 |
72 | |||
stripParsoidIds | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cleanSpans | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
unwrapSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
newlineAround | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
normalizeIEWVisitor | |
0.00% |
0 / 50 |
|
0.00% |
0 / 1 |
650 | |||
unwrapSpansAndNormalizeIEW | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
normalizePhpOutput | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
normalizeHTML | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
6 | |||
colorString | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\ParserTests; |
5 | |
6 | use Error; |
7 | use Exception; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Html2Wt\DOMNormalizer; |
13 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
14 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
15 | use Wikimedia\Parsoid\Mocks\MockEnv; |
16 | use Wikimedia\Parsoid\Utils\ContentUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
18 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
19 | use Wikimedia\Parsoid\Utils\DOMUtils; |
20 | use Wikimedia\Parsoid\Utils\Utils; |
21 | use Wikimedia\Parsoid\Utils\WTUtils; |
22 | |
23 | /** |
24 | * This class contains helper functions which should not be directly used |
25 | * outside of Parsoid. |
26 | * |
27 | * Per T332457, most of the code in Wikimedia\Parsoid\ParserTests is |
28 | * "for use in parser test runners only", including the core parser |
29 | * test runner, but this file is "more internal" than that: core's |
30 | * parser test runner should not use these helpers directly. |
31 | * |
32 | * @internal |
33 | */ |
class TestUtils {
	/**
	 * Lazily created console colorizer; stays unset (null) when the optional
	 * php-console-color library cannot be instantiated.
	 *
	 * @var mixed
	 */
	private static $consoleColor;

	/**
	 * Little helper function for encoding XML entities.
	 *
	 * Currently a pass-through: no replacement for the original entity
	 * encoder has been wired in, so the input is returned unchanged.
	 *
	 * @param string $str
	 * @return string
	 */
	public static function encodeXml( string $str ): string {
		// PORT-FIXME: Find replacement
		// return entities::encodeXML( $str );
		return $str;
	}

	/**
	 * Strip the actual about id from the string, keeping the `about="#mwt`
	 * prefix (optionally backslash-escaped, single- or double-quoted) and
	 * dropping only the digits that follow it.
	 *
	 * @param string $str
	 * @return string
	 */
	public static function normalizeAbout( string $str ): string {
		return preg_replace( "/(about=\\\\?[\"']#mwt)\d+/", '$1', $str );
	}

	/**
	 * Specialized normalization of the PHP parser & Parsoid output, to ignore
	 * a few known-ok differences in parser test runs.
	 *
	 * This code is also used by the Parsoid round-trip testing code.
	 *
	 * If parsoidOnly is true-ish, we allow more markup through (like property
	 * and typeof attributes), for better checking of parsoid-only test cases.
	 *
	 * @param Element|string $domBody
	 * @param array $options
	 *  - parsoidOnly (bool) Is this test Parsoid Only? Optional. Default: false
	 *  - preserveIEW (bool) Should inter-element WS be preserved? Optional. Default: false
	 *  - hackyNormalize (bool) Apply the normalizer to the html. Optional. Default: false
	 *  - externallinktarget (mixed) Only consulted when parsoidOnly is set. When
	 *    absent or falsy, "nofollow" / "noreferrer noopener" rel values are
	 *    stripped from the output. Optional. Default: not set
	 * @return string
	 */
	public static function normalizeOut( $domBody, array $options = [] ): string {
		$parsoidOnly = !empty( $options['parsoidOnly'] );
		$preserveIEW = !empty( $options['preserveIEW'] );

		if ( !empty( $options['hackyNormalize'] ) ) {
			// Mock env obj
			//
			// FIXME: This is ugly.
			// (a) The normalizer shouldn't need the full env.
			//     Pass options and a logger instead?
			// (b) DOM diff code is using page-id for some reason.
			//     That feels like a carryover of 2013 era code.
			//     If possible, get rid of it and diff-mark dependency
			//     on the env object.
			$mockEnv = new MockEnv( [] );
			$mockSerializer = new WikitextSerializer( $mockEnv, [] );
			$mockState = new SerializerState( $mockSerializer, [ 'selserMode' => false ] );
			if ( is_string( $domBody ) ) {
				// Careful about the lifetime of this document
				$doc = ContentUtils::createDocument( $domBody );
				$domBody = DOMCompat::getBody( $doc );
			}
			DOMDataUtils::visitAndLoadDataAttribs( $domBody, [ 'markNew' => true ] );
			( new DOMNormalizer( $mockState ) )->normalize( $domBody );
			DOMDataUtils::visitAndStoreDataAttribs( $domBody );
		} elseif ( is_string( $domBody ) ) {
			$domBody = DOMCompat::getBody( DOMUtils::parseHTML( $domBody ) );
		}

		$stripTypeof = $parsoidOnly ?
			'/^mw:Placeholder$/' :
			'/^mw:(?:DisplaySpace|Placeholder|Nowiki|Transclusion|Entity)$/';
		$domBody = self::unwrapSpansAndNormalizeIEW( $domBody, $stripTypeof, $parsoidOnly, $preserveIEW );
		$out = ContentUtils::toXML( $domBody, [ 'innerXML' => true ] );
		// NOTE that we use a slightly restricted regexp for "attribute"
		// which works for the output of DOM serialization.  For example,
		// we know that attribute values will be surrounded with double quotes,
		// not unquoted or quoted with single quotes.  The serialization
		// algorithm is given by:
		// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
		//
		// NOTE(review): this pattern is unanchored and every part of it is
		// optional, so it matches any input and the exception below is
		// effectively unreachable. Anchoring it as written would reject
		// closing tags, so the behavior is deliberately left unchanged here.
		if ( !preg_match( '#[^<]*(<\w+(\s+[^\0-\cZ\s"\'>/=]+(="[^"]*")?)*/?>[^<]*)*#u', $out ) ) {
			throw new Error( 'normalizeOut input is not in standard serialized form' );
		}

		// Eliminate a source of indeterminacy from leaked strip markers
		$out = preg_replace( '/UNIQ-.*?-QINU/u', '', $out );

		// Normalize COINS ids -- they aren't stable
		$out = preg_replace( '/\s?id=[\'"]coins_\d+[\'"]/iu', '', $out );

		// maplink extension
		$out = preg_replace( '/\s?data-overlays=\'[^\']*\'/u', '', $out );

		// unnecessary attributes, we don't need to check these.
		$unnecessaryAttribs = 'data-parsoid|prefix|about|rev|datatype|inlist|usemap|vocab';
		if ( $parsoidOnly ) {
			// Common regexp prefix: " <attr>=" for any of the attributes above.
			$attribPrefixRE = "/ ($unnecessaryAttribs)=";
			$out = preg_replace( $attribPrefixRE . '\\\\?"[^\"]*\\\\?"/u', '', $out );
			$out = preg_replace( $attribPrefixRE . "\\\\?'[^\']*\\\\?'/u", '', $out ); // single-quoted variant
			// Values delimited by a literal "&apos;" entity in the serialized string.
			$out = preg_replace( $attribPrefixRE . '&apos;.*?&apos;/u', '', $out ); // apos variant
			// The option is optional: use empty() so that a missing key does
			// not raise an "Undefined array key" warning.
			if ( empty( $options['externallinktarget'] ) ) {
				$out = preg_replace( '/ nofollow/', '', $out );
				$out = str_replace( ' rel="nofollow"', '', $out );
				$out = preg_replace( '/ noreferrer noopener/', '', $out );
			}

			// strip self-closed <nowiki /> because we frequently test WTS
			// <nowiki> insertion by providing an html/parsoid section with the
			// <meta> tags stripped out, allowing the html2wt test to verify that
			// the <nowiki> is correctly added during WTS, while still allowing
			// the html2html and wt2html versions of the test to pass as a
			// validity check.  If <meta>s were not stripped, these tests would all
			// have to be modified and split up.  Not worth it at this time.
			// (see commit 689b22431ad690302420d049b10e689de6b7d426)
			$out = preg_replace( '#<span typeof="mw:Nowiki"></span>#', '', $out );

			return $out;
		}

		// Normalize headings by stripping out Parsoid-added ids so that we don't
		// have to add these ids to every parser test that uses headings.
		// We will test the id generation scheme separately via mocha tests.
		$out = preg_replace( '/(<h[1-6].*?) id="[^\"]*"([^>]*>)/u', '$1$2', $out );
		// strip meta/link elements
		$out = preg_replace(
			'#</?(?:meta|link)(?: [^\0-\cZ\s"\'>/=]+(?:=(?:"[^"]*"|\'[^\']*\'))?)*/?>#u',
			'', $out );
		// Ignore troublesome attributes.
		// In addition to attributes listed above, strip other Parsoid-inserted attributes
		// since these won't be present in legacy parser output.
		$attribTroubleRE = "/ ($unnecessaryAttribs|data-mw|resource|rel|property|class)=\\\\?";
		$out = preg_replace( $attribTroubleRE . '"[^"]*\\\\?"/u', '', $out );
		$out = preg_replace( $attribTroubleRE . "'[^']*\\\\?'/u", '', $out ); // single-quoted variant
		// strip typeof last
		$out = preg_replace( '/ typeof="[^\"]*"/u', '', $out );
		$out = self::stripParsoidIds( $out );
		// drop the opening tag of any span that still carries an about attribute
		$out = preg_replace( '/<span[^>]+about="[^"]*"[^>]*>/u', '', $out );
		// strip empty spans
		$out = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $out );
		$out = preg_replace( '#<span>\s*</span>#u', '', $out );
		// strip leading "./" and "../" path segments from hrefs
		$out = preg_replace( '#(href=")(?:\.?\./)+#u', '$1', $out );
		// replace unnecessary URL escaping
		$out = preg_replace_callback( '/ href="[^"]*"/u', static function ( $m ) {
			return Utils::decodeURI( $m[0] );
		}, $out );
		// strip thumbnail size prefixes
		return preg_replace(
			'#(src="[^"]*?)/thumb(/[0-9a-f]/[0-9a-f]{2}/[^/]+)/[0-9]+px-[^"/]+(?=")#u', '$1$2',
			$out
		);
	}

	/**
	 * Strip Parsoid ID attributes (id="mwXX", used to associate NodeData) from an HTML string
	 *
	 * Only ids of the form "mw" followed by at least two word or hyphen
	 * characters are removed; other id attributes are left alone.
	 *
	 * @param string $s
	 * @return string
	 */
	public static function stripParsoidIds( string $s ): string {
		return preg_replace( '/ id="mw([-\w]{2,})"/u', '', $s );
	}

	/**
	 * Unwrap every direct <span> child of $node whose typeof attribute
	 * matches $stripSpanTypeof. Does nothing when no pattern is given.
	 *
	 * @param Node $node Parent whose children are inspected
	 * @param ?string $stripSpanTypeof Regular expression matched against typeof
	 */
	private static function cleanSpans(
		Node $node, ?string $stripSpanTypeof
	): void {
		if ( !$stripSpanTypeof ) {
			return;
		}

		for ( $child = $node->firstChild; $child; $child = $next ) {
			// Grab the sibling first: unwrapping removes $child from the tree.
			$next = $child->nextSibling;
			if ( $child instanceof Element && DOMCompat::nodeName( $child ) === 'span' &&
				preg_match( $stripSpanTypeof, DOMCompat::getAttribute( $child, 'typeof' ) ?? '' )
			) {
				self::unwrapSpan( $node, $child, $stripSpanTypeof );
			}
		}
	}

	/**
	 * Replace $node by its own children inside $parent, after first
	 * unwrapping any matching spans among those children.
	 *
	 * @param Node $parent Parent of $node
	 * @param Node $node The span to unwrap
	 * @param ?string $stripSpanTypeof Regular expression matched against typeof
	 */
	private static function unwrapSpan(
		Node $parent, Node $node, ?string $stripSpanTypeof
	): void {
		// first recurse to unwrap any spans in the immediate children.
		self::cleanSpans( $node, $stripSpanTypeof );
		// now unwrap this span.
		DOMUtils::migrateChildren( $node, $parent, $node );
		$parent->removeChild( $node );
	}

	/**
	 * Whether $node is one of the elements that normalizeIEWVisitor
	 * surrounds with newlines (and strips whitespace inside of).
	 *
	 * @param ?Node $node
	 * @return bool False for null or for any node name not in the list
	 */
	private static function newlineAround( ?Node $node ): bool {
		return $node && preg_match(
			'/^(body|caption|div|dd|dt|li|p|table|tr|td|th|tbody|dl|ol|ul|h[1-6])$/D',
			DOMCompat::nodeName( $node )
		);
	}

	/**
	 * Recursive worker for unwrapSpansAndNormalizeIEW(). Mutates $node in place.
	 *
	 * @param Node $node
	 * @param array $opts Keys read here: preserveIEW, parsoidOnly,
	 *   stripSpanTypeof, stripLeadingWS, stripTrailingWS, inPRE
	 * @return Node The same $node that was passed in
	 */
	private static function normalizeIEWVisitor(
		Node $node, array $opts
	): Node {
		if ( DOMCompat::nodeName( $node ) === 'pre' ) {
			// Preserve newlines in <pre> tags
			$opts['inPRE'] = true;
		}
		if ( !$opts['preserveIEW'] && $node instanceof Text ) {
			if ( !$opts['inPRE'] ) {
				// collapse each whitespace run to a single space
				$node->data = preg_replace( '/\s+/u', ' ', $node->data );
			}
			if ( $opts['stripLeadingWS'] ) {
				$node->data = preg_replace( '/^\s+/u', '', $node->data, 1 );
			}
			if ( $opts['stripTrailingWS'] ) {
				$node->data = preg_replace( '/\s+$/uD', '', $node->data, 1 );
			}
		}
		// unwrap certain SPAN nodes
		self::cleanSpans( $node, $opts['stripSpanTypeof'] );
		// now remove comment nodes
		if ( !$opts['parsoidOnly'] ) {
			for ( $child = $node->firstChild; $child; $child = $next ) {
				$next = $child->nextSibling;
				if ( $child instanceof Comment ) {
					$node->removeChild( $child );
				}
			}
		}
		// reassemble text nodes split by a comment or span, if necessary
		if ( $node instanceof Element ) {
			DOMCompat::normalize( $node );
		}
		// now recurse.
		if ( DOMCompat::nodeName( $node ) === 'pre' ) {
			// hack, since PHP adds a newline before </pre>
			$opts['stripLeadingWS'] = false;
			$opts['stripTrailingWS'] = true;
		} elseif (
			DOMCompat::nodeName( $node ) === 'span' &&
			DOMUtils::matchTypeOf( $node, '/^mw:/' )
		) {
			// SPAN is transparent; pass the strip parameters down to kids
		} else {
			$opts['stripLeadingWS'] = $opts['stripTrailingWS'] = self::newlineAround( $node );
		}
		$child = $node->firstChild;
		// Skip over the empty mw:FallbackId <span> and strip leading WS
		// on the other side of it.
		if ( $child && DOMUtils::isHeading( $node ) && WTUtils::isFallbackIdSpan( $child ) ) {
			$child = $child->nextSibling;
		}
		for ( ; $child; $child = $next ) {
			$next = $child->nextSibling;
			$newOpts = $opts;
			// only the last child may have its trailing whitespace stripped
			$newOpts['stripTrailingWS'] = $opts['stripTrailingWS'] && !$child->nextSibling;
			self::normalizeIEWVisitor( $child, $newOpts );
			// only the first visited child may have its leading whitespace stripped
			$opts['stripLeadingWS'] = false;
		}

		if ( $opts['inPRE'] || $opts['preserveIEW'] ) {
			return $node;
		}

		// now add newlines around appropriate nodes.
		for ( $child = $node->firstChild; $child; $child = $next ) {
			$prev = $child->previousSibling;
			$next = $child->nextSibling;
			if ( self::newlineAround( $child ) ) {
				if ( $prev instanceof Text ) {
					$prev->data = preg_replace( '/\s*$/uD', "\n", $prev->data, 1 );
				} else {
					$prev = $node->ownerDocument->createTextNode( "\n" );
					$node->insertBefore( $prev, $child );
				}
				if ( $next instanceof Text ) {
					$next->data = preg_replace( '/^\s*/u', "\n", $next->data, 1 );
				} else {
					// The inserted text node becomes $next, so it is the
					// node the loop visits after $child.
					$next = $node->ownerDocument->createTextNode( "\n" );
					$node->insertBefore( $next, $child->nextSibling );
				}
			}
		}
		return $node;
	}

	/**
	 * Normalize newlines in IEW to spaces instead.
	 *
	 * Works on a deep clone of $body; the node passed in is not modified.
	 *
	 * @param Element $body The document body node to normalize.
	 * @param ?string $stripSpanTypeof Regular expression to strip typeof attributes
	 * @param bool $parsoidOnly
	 * @param bool $preserveIEW
	 * @return Element
	 */
	public static function unwrapSpansAndNormalizeIEW(
		Element $body, ?string $stripSpanTypeof = null, bool $parsoidOnly = false, bool $preserveIEW = false
	): Element {
		$opts = [
			'preserveIEW' => $preserveIEW,
			'parsoidOnly' => $parsoidOnly,
			'stripSpanTypeof' => $stripSpanTypeof,
			'stripLeadingWS' => true,
			'stripTrailingWS' => true,
			'inPRE' => false
		];
		// clone body first, since we're going to destructively mutate it.
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return self::normalizeIEWVisitor( $body->cloneNode( true ), $opts );
	}

	/**
	 * Strip some php output we aren't generating.
	 *
	 * Replaces each mw-headline <span> (plus an optional trailing
	 * mw-editsection block) by the headline's inner content.
	 *
	 * @param string $html
	 * @return string
	 */
	public static function normalizePhpOutput( string $html ): string {
		return preg_replace(
			// do not expect section editing for now
			'/<span[^>]+class="mw-headline"[^>]*>(.*?)<\/span> '
			. '*(<span class="mw-editsection"><span class="mw-editsection-bracket">'
			. '\[<\/span>.*?<span class="mw-editsection-bracket">\]<\/span><\/span>)?/u',
			'$1',
			$html
		);
	}

	/**
	 * Normalize the expected parser output by parsing it using a HTML5 parser and
	 * re-serializing it to HTML. Ideally, the parser would normalize inter-tag
	 * whitespace for us. For now, we fake that by simply stripping all newlines.
	 *
	 * If an Exception is thrown along the way, the failure is logged and the
	 * original $source is returned unchanged.
	 *
	 * @param string $source
	 * @return string
	 */
	public static function normalizeHTML( string $source ): string {
		try {
			$body = self::unwrapSpansAndNormalizeIEW( DOMCompat::getBody( DOMUtils::parseHTML( $source ) ) );
			$html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );

			// a few things we ignore for now..
			//  .replace(/\/wiki\/Main_Page/g, 'Main Page')
			// do not expect a toc for now
			$html = preg_replace(
				'/<div[^>]+?id="toc"[^>]*>\s*<div id="toctitle"[^>]*>[\s\S]+?<\/div>[\s\S]+?<\/div>\s*/u',
				'',
				$html );
			$html = self::normalizePhpOutput( $html );
			// remove empty span tags
			$html = preg_replace( '/(\s)<span>\s*<\/span>\s*/u', '$1', $html );
			$html = preg_replace( '/<span>\s*<\/span>/u', '', $html );
			// general class and titles, typically on links
			$html = preg_replace( '/ (class|rel|about|typeof)="[^"]*"/', '', $html );
			// strip red link markup, we do not check if a page exists yet.
			// Serialized markup writes "&" inside attribute values as "&amp;",
			// so accept both the escaped and the bare form of the separator.
			$html = preg_replace(
				"#/index.php\\?title=([^']+?)&(?:amp;)?action=edit&(?:amp;)?redlink=1#", '/wiki/$1', $html );
			// strip red link title info
			$html = preg_replace(
				"/ \\((?:page does not exist|encara no existeix|bet ele jaratılmaǵan|lonkásá ezalí tɛ̂)\\)/",
				'', $html );
			// the expected html has some extra space in tags, strip it
			$html = preg_replace( '/<a +href/', '<a href', $html );
			$html = preg_replace( '#href="/wiki/#', 'href="', $html );
			$html = preg_replace( '/" +>/', '">', $html );
			// parsoid always add a page name to lonely fragments
			$html = preg_replace( '/href="#/', 'href="Main Page#', $html );
			// replace unnecessary URL escaping
			$html = preg_replace_callback( '/ href="[^"]*"/',
				static function ( $m ) {
					return Utils::decodeURI( $m[0] );
				},
				$html );
			// strip empty spans
			$html = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $html );
			return preg_replace( '#<span>\s*</span>#u', '', $html );
		} catch ( Exception $e ) {
			error_log( 'normalizeHTML failed on ' . $source . ' with the following error: ' . $e );
			return $source;
		}
	}

	/**
	 * Wrap $string in console color codes when the optional
	 * php-console-color library is installed and reports color support;
	 * otherwise return $string unchanged.
	 *
	 * @param string $string
	 * @param string $color
	 * @param bool $inverse When true, the 'reverse' style is applied as well
	 * @return string
	 * @suppress PhanUndeclaredClassMethod
	 * @suppress UnusedSuppression
	 */
	public static function colorString(
		string $string, string $color, bool $inverse = false
	): string {
		// Keep the typed parameter a string; build the style list separately.
		$style = $inverse ? [ $color, 'reverse' ] : $color;

		if ( !self::$consoleColor ) {
			// Attempt to instantiate this class to determine if the
			// (optional) php-console-color library is installed.
			try {
				self::$consoleColor = new \PHP_Parallel_Lint\PhpConsoleColor\ConsoleColor();
			} catch ( Error $e ) {
				/* fall back to no-color mode */
			}
		}

		if ( self::$consoleColor && self::$consoleColor->isSupported() ) {
			return self::$consoleColor->apply( $style, $string );
		}
		return $string;
	}
}