Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 180 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
TestUtils | |
0.00% |
0 / 180 |
|
0.00% |
0 / 12 |
3080 | |
0.00% |
0 / 1 |
encodeXml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
normalizeAbout | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
normalizeOut | |
0.00% |
0 / 58 |
|
0.00% |
0 / 1 |
72 | |||
stripParsoidIds | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cleanSpans | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
unwrapSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
newlineAround | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
normalizeIEWVisitor | |
0.00% |
0 / 50 |
|
0.00% |
0 / 1 |
650 | |||
unwrapSpansAndNormalizeIEW | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
normalizePhpOutput | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
normalizeHTML | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
6 | |||
colorString | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\ParserTests; |
5 | |
6 | use Error; |
7 | use Exception; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Html2Wt\DOMNormalizer; |
13 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
14 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
15 | use Wikimedia\Parsoid\Mocks\MockEnv; |
16 | use Wikimedia\Parsoid\Utils\ContentUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
18 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
19 | use Wikimedia\Parsoid\Utils\DOMUtils; |
20 | use Wikimedia\Parsoid\Utils\Utils; |
21 | use Wikimedia\Parsoid\Utils\WTUtils; |
22 | |
23 | /** |
24 | * This class contains helper functions which should not be directly used |
25 | * outside of Parsoid. |
26 | * |
27 | * Per T332457, most of the code in Wikimedia\Parsoid\ParserTests is |
28 | * "for use in parser test runners only", including the core parser |
29 | * test runner, but this file is "more internal" than that: core's |
30 | * parser test runner should not use these helpers directly. |
31 | * |
32 | * @internal |
33 | */ |
class TestUtils {
	/**
	 * Lazily-created console colorizer.
	 *
	 * null  => not yet probed,
	 * false => the optional php-console-color library is unavailable,
	 * else  => the ConsoleColor instance.
	 *
	 * @var mixed
	 */
	private static $consoleColor;

	/**
	 * Little helper function for encoding XML entities.
	 *
	 * Currently a pass-through: the port has no replacement for the
	 * original entity encoder yet, so the input is returned unchanged.
	 *
	 * @param string $str
	 * @return string
	 */
	public static function encodeXml( string $str ): string {
		// PORT-FIXME: Find replacement
		// return entities::encodeXML( $str );
		return $str;
	}

	/**
	 * Strip the actual about id from the string, i.e. drop the digits
	 * following `about="#mwt` (the quote may be single or double, and may
	 * be preceded by a backslash).
	 *
	 * @param string $str
	 * @return string
	 */
	public static function normalizeAbout( string $str ): string {
		return preg_replace( "/(about=\\\\?[\"']#mwt)\d+/", '$1', $str );
	}

	/**
	 * Specialized normalization of the PHP parser & Parsoid output, to ignore
	 * a few known-ok differences in parser test runs.
	 *
	 * This code is also used by the Parsoid round-trip testing code.
	 *
	 * If parsoidOnly is true-ish, we allow more markup through (like property
	 * and typeof attributes), for better checking of parsoid-only test cases.
	 *
	 * @param Element|string $domBody
	 * @param array $options
	 *  - parsoidOnly (bool) Is this test Parsoid Only? Optional. Default: false
	 *  - preserveIEW (bool) Should inter-element WS be preserved? Optional. Default: false
	 *  - hackyNormalize (bool) Apply the normalizer to the html. Optional. Default: false
	 *  - externallinktarget (mixed) Only consulted when parsoidOnly is set. When
	 *    empty or absent, "nofollow" / "noreferrer noopener" rel values are
	 *    stripped from the output. Optional.
	 * @return string
	 */
	public static function normalizeOut( $domBody, array $options = [] ): string {
		$parsoidOnly = !empty( $options['parsoidOnly'] );
		$preserveIEW = !empty( $options['preserveIEW'] );

		if ( !empty( $options['hackyNormalize'] ) ) {
			// Mock env obj
			//
			// FIXME: This is ugly.
			// (a) The normalizer shouldn't need the full env.
			//     Pass options and a logger instead?
			// (b) DOM diff code is using page-id for some reason.
			//     That feels like a carryover of 2013 era code.
			//     If possible, get rid of it and diff-mark dependency
			//     on the env object.
			$mockEnv = new MockEnv( [] );
			$mockSerializer = new WikitextSerializer( $mockEnv, [] );
			$mockState = new SerializerState( $mockSerializer, [ 'selserMode' => false ] );
			if ( is_string( $domBody ) ) {
				// Careful about the lifetime of this document
				$doc = ContentUtils::createAndLoadDocument( $domBody );
				$domBody = DOMCompat::getBody( $doc );
			} else {
				DOMDataUtils::visitAndLoadDataAttribs( $domBody, [ 'markNew' => true ] );
			}
			( new DOMNormalizer( $mockState ) )->normalize( $domBody );
			DOMDataUtils::visitAndStoreDataAttribs( $domBody );
			DOMDataUtils::getBag( $domBody->ownerDocument )->loaded = false;
		} elseif ( is_string( $domBody ) ) {
			$domBody = DOMCompat::getBody( DOMUtils::parseHTML( $domBody ) );
		}

		// In parsoid-only mode only placeholder spans are unwrapped; otherwise
		// a wider set of Parsoid-specific wrapper spans is unwrapped too.
		$stripTypeof = $parsoidOnly ?
			'/^mw:Placeholder$/' :
			'/^mw:(?:DisplaySpace|Placeholder|Nowiki|Transclusion|Entity)$/';
		$domBody = self::unwrapSpansAndNormalizeIEW( $domBody, $stripTypeof, $parsoidOnly, $preserveIEW );
		$out = ContentUtils::toXML( $domBody, [ 'innerXML' => true ] );
		// NOTE that we use a slightly restricted regexp for "attribute"
		// which works for the output of DOM serialization. For example,
		// we know that attribute values will be surrounded with double quotes,
		// not unquoted or quoted with single quotes. The serialization
		// algorithm is given by:
		// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
		if ( !preg_match( '#[^<]*(<\w+(\s+[^\0-\cZ\s"\'>/=]+(="[^"]*")?)*/?>[^<]*)*#u', $out ) ) {
			throw new Error( 'normalizeOut input is not in standard serialized form' );
		}

		// Eliminate a source of indeterminacy from leaked strip markers
		$out = preg_replace( '/UNIQ-.*?-QINU/u', '', $out );

		// Normalize COINS ids -- they aren't stable
		$out = preg_replace( '/\s?id=[\'"]coins_\d+[\'"]/iu', '', $out );

		// maplink extension
		$out = preg_replace( '/\s?data-overlays=\'[^\']*\'/u', '', $out );

		// unnecessary attributes, we don't need to check these.
		$unnecessaryAttribs = 'data-parsoid|prefix|about|rev|datatype|inlist|usemap|vocab';
		if ( $parsoidOnly ) {
			$unnecessaryAttribs = "/ ($unnecessaryAttribs)=";
			$out = preg_replace( $unnecessaryAttribs . '\\\\?"[^\"]*\\\\?"/u', '', $out );
			$out = preg_replace( $unnecessaryAttribs . "\\\\?'[^\']*\\\\?'/u", '', $out ); // single-quoted variant
			$out = preg_replace( $unnecessaryAttribs . '&apos;.*?&apos;/u', '', $out ); // apos variant
			// The option is optional: test for emptiness instead of reading
			// the key directly, so an absent key does not raise a warning.
			if ( empty( $options['externallinktarget'] ) ) {
				$out = preg_replace( '/ nofollow/', '', $out );
				$out = str_replace( ' rel="nofollow"', '', $out );
				$out = preg_replace( '/ noreferrer noopener/', '', $out );
			}

			// strip self-closed <nowiki /> because we frequently test WTS
			// <nowiki> insertion by providing an html/parsoid section with the
			// <meta> tags stripped out, allowing the html2wt test to verify that
			// the <nowiki> is correctly added during WTS, while still allowing
			// the html2html and wt2html versions of the test to pass as a
			// validity check. If <meta>s were not stripped, these tests would all
			// have to be modified and split up. Not worth it at this time.
			// (see commit 689b22431ad690302420d049b10e689de6b7d426)
			$out = preg_replace( '#<span typeof="mw:Nowiki"></span>#', '', $out );

			return $out;
		}

		// Normalize headings by stripping out Parsoid-added ids so that we don't
		// have to add these ids to every parser test that uses headings.
		// We will test the id generation scheme separately via mocha tests.
		$out = preg_replace( '/(<h[1-6].*?) id="[^\"]*"([^>]*>)/u', '$1$2', $out );
		// strip meta/link elements
		$out = preg_replace(
			'#</?(?:meta|link)(?: [^\0-\cZ\s"\'>/=]+(?:=(?:"[^"]*"|\'[^\']*\'))?)*/?>#u',
			'', $out );
		// Ignore troublesome attributes.
		// In addition to attributes listed above, strip other Parsoid-inserted attributes
		// since these won't be present in legacy parser output.
		$attribTroubleRE = "/ ($unnecessaryAttribs|data-mw|resource|rel|property|class)=\\\\?";
		$out = preg_replace( $attribTroubleRE . '"[^"]*\\\\?"/u', '', $out );
		$out = preg_replace( $attribTroubleRE . "'[^']*\\\\?'/u", '', $out ); // single-quoted variant
		// strip typeof last
		$out = preg_replace( '/ typeof="[^\"]*"/u', '', $out );
		$out = self::stripParsoidIds( $out );
		// drop opening tags of spans carrying an about attribute
		$out = preg_replace( '/<span[^>]+about="[^"]*"[^>]*>/u', '', $out );
		// drop empty spans, keeping one leading whitespace char where present
		$out = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $out );
		$out = preg_replace( '#<span>\s*</span>#u', '', $out );
		// strip leading "./" and "../" path segments from hrefs
		$out = preg_replace( '#(href=")(?:\.?\./)+#u', '$1', $out );
		// replace unnecessary URL escaping
		$out = preg_replace_callback( '/ href="[^"]*"/u', static function ( $m ) {
			return Utils::decodeURI( $m[0] );
		}, $out );
		// strip thumbnail size prefixes
		return preg_replace(
			'#(src="[^"]*?)/thumb(/[0-9a-f]/[0-9a-f]{2}/[^/]+)/[0-9]+px-[^"/]+(?=")#u', '$1$2',
			$out
		);
	}

	/**
	 * Strip Parsoid ID attributes (id="mwXX", used to associate NodeData) from an HTML string
	 * @param string $s
	 * @return string
	 */
	public static function stripParsoidIds( string $s ): string {
		return preg_replace( '/ id="mw([-\w]{2,})"/u', '', $s );
	}

	/**
	 * Unwrap every direct <span> child of $node whose typeof attribute
	 * matches $stripSpanTypeof. Nested matching spans are handled through
	 * the mutual recursion with unwrapSpan().
	 *
	 * @param Node $node Parent whose children are inspected
	 * @param ?string $stripSpanTypeof Regular expression matched against the
	 *   typeof attribute; when empty/null nothing is done
	 */
	private static function cleanSpans(
		Node $node, ?string $stripSpanTypeof
	): void {
		if ( !$stripSpanTypeof ) {
			return;
		}

		for ( $child = $node->firstChild; $child; $child = $next ) {
			// Grab the successor first: unwrapping removes $child from the tree.
			$next = $child->nextSibling;
			if ( $child instanceof Element && DOMCompat::nodeName( $child ) === 'span' &&
				preg_match( $stripSpanTypeof, DOMCompat::getAttribute( $child, 'typeof' ) ?? '' )
			) {
				self::unwrapSpan( $node, $child, $stripSpanTypeof );
			}
		}
	}

	/**
	 * Replace $node by its own children inside $parent.
	 *
	 * @param Node $parent Parent of $node
	 * @param Node $node The span to unwrap
	 * @param ?string $stripSpanTypeof Passed through to cleanSpans() for the
	 *   span's own children
	 */
	private static function unwrapSpan(
		Node $parent, Node $node, ?string $stripSpanTypeof
	): void {
		// first recurse to unwrap any spans in the immediate children.
		self::cleanSpans( $node, $stripSpanTypeof );
		// now unwrap this span.
		DOMUtils::migrateChildren( $node, $parent, $node );
		$parent->removeChild( $node );
	}

	/**
	 * Whether $node is one of the elements that get a newline placed
	 * before and after them during IEW normalization.
	 *
	 * @param ?Node $node
	 * @return bool false for null
	 */
	private static function newlineAround( ?Node $node ): bool {
		return $node && preg_match(
			'/^(body|caption|div|dd|dt|li|p|table|tr|td|th|tbody|dl|ol|ul|h[1-6])$/D',
			DOMCompat::nodeName( $node )
		);
	}

	/**
	 * Recursive worker for unwrapSpansAndNormalizeIEW(): normalizes
	 * whitespace in text nodes, unwraps selected spans, optionally removes
	 * comments, and finally surrounds block-ish children with newlines.
	 * Mutates $node in place.
	 *
	 * @param Node $node
	 * @param array $opts Keys read here:
	 *  - preserveIEW (bool) leave whitespace untouched
	 *  - parsoidOnly (bool) when false, comment children are removed
	 *  - stripSpanTypeof (?string) regexp handed to cleanSpans()
	 *  - stripLeadingWS / stripTrailingWS (bool) trim text at that edge
	 *  - inPRE (bool) set once a <pre> is entered; disables WS collapsing
	 * @return Node The same $node that was passed in
	 */
	private static function normalizeIEWVisitor(
		Node $node, array $opts
	): Node {
		if ( DOMCompat::nodeName( $node ) === 'pre' ) {
			// Preserve newlines in <pre> tags
			$opts['inPRE'] = true;
		}
		if ( !$opts['preserveIEW'] && $node instanceof Text ) {
			if ( !$opts['inPRE'] ) {
				$node->data = preg_replace( '/\s+/u', ' ', $node->data );
			}
			if ( $opts['stripLeadingWS'] ) {
				$node->data = preg_replace( '/^\s+/u', '', $node->data, 1 );
			}
			if ( $opts['stripTrailingWS'] ) {
				$node->data = preg_replace( '/\s+$/uD', '', $node->data, 1 );
			}
		}
		// unwrap certain SPAN nodes
		self::cleanSpans( $node, $opts['stripSpanTypeof'] );
		// now remove comment nodes
		if ( !$opts['parsoidOnly'] ) {
			for ( $child = $node->firstChild; $child; $child = $next ) {
				$next = $child->nextSibling;
				if ( $child instanceof Comment ) {
					$node->removeChild( $child );
				}
			}
		}
		// reassemble text nodes split by a comment or span, if necessary
		if ( $node instanceof Element ) {
			DOMCompat::normalize( $node );
		}
		// now recurse.
		if ( DOMCompat::nodeName( $node ) === 'pre' ) {
			// hack, since PHP adds a newline before </pre>
			$opts['stripLeadingWS'] = false;
			$opts['stripTrailingWS'] = true;
		} elseif (
			DOMCompat::nodeName( $node ) === 'span' &&
			DOMUtils::matchTypeOf( $node, '/^mw:/' )
		) {
			// SPAN is transparent; pass the strip parameters down to kids
		} else {
			$opts['stripLeadingWS'] = $opts['stripTrailingWS'] = self::newlineAround( $node );
		}
		$child = $node->firstChild;
		// Skip over the empty mw:FallbackId <span> and strip leading WS
		// on the other side of it.
		if ( $child && DOMUtils::isHeading( $node ) && WTUtils::isFallbackIdSpan( $child ) ) {
			$child = $child->nextSibling;
		}
		for ( ; $child; $child = $next ) {
			$next = $child->nextSibling;
			$newOpts = $opts;
			// Only the last child may have its trailing WS stripped.
			$newOpts['stripTrailingWS'] = $opts['stripTrailingWS'] && !$child->nextSibling;
			self::normalizeIEWVisitor( $child, $newOpts );
			// Only the first visited child may have its leading WS stripped.
			$opts['stripLeadingWS'] = false;
		}

		if ( $opts['inPRE'] || $opts['preserveIEW'] ) {
			return $node;
		}

		// now add newlines around appropriate nodes.
		for ( $child = $node->firstChild; $child; $child = $next ) {
			$prev = $child->previousSibling;
			$next = $child->nextSibling;
			if ( self::newlineAround( $child ) ) {
				if ( $prev instanceof Text ) {
					$prev->data = preg_replace( '/\s*$/uD', "\n", $prev->data, 1 );
				} else {
					$prev = $node->ownerDocument->createTextNode( "\n" );
					$node->insertBefore( $prev, $child );
				}
				if ( $next instanceof Text ) {
					$next->data = preg_replace( '/^\s*/u', "\n", $next->data, 1 );
				} else {
					$next = $node->ownerDocument->createTextNode( "\n" );
					$node->insertBefore( $next, $child->nextSibling );
				}
			}
		}
		return $node;
	}

	/**
	 * Normalize newlines in IEW to spaces instead.
	 *
	 * Works on a deep clone of $body; the node passed in is not modified.
	 *
	 * @param Element $body The document body node to normalize.
	 * @param ?string $stripSpanTypeof Regular expression to strip typeof attributes
	 * @param bool $parsoidOnly
	 * @param bool $preserveIEW
	 * @return Element
	 */
	public static function unwrapSpansAndNormalizeIEW(
		Element $body, ?string $stripSpanTypeof = null, bool $parsoidOnly = false, bool $preserveIEW = false
	): Element {
		$opts = [
			'preserveIEW' => $preserveIEW,
			'parsoidOnly' => $parsoidOnly,
			'stripSpanTypeof' => $stripSpanTypeof,
			'stripLeadingWS' => true,
			'stripTrailingWS' => true,
			'inPRE' => false
		];
		// clone body first, since we're going to destructively mutate it.
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return self::normalizeIEWVisitor( $body->cloneNode( true ), $opts );
	}

	/**
	 * Strip some php output we aren't generating: the mw-headline span
	 * wrapper is replaced by its contents, and a directly following
	 * mw-editsection span (if any) is dropped.
	 *
	 * @param string $html
	 * @return string
	 */
	public static function normalizePhpOutput( string $html ): string {
		return preg_replace(
			// do not expect section editing for now
			'/<span[^>]+class="mw-headline"[^>]*>(.*?)<\/span> '
			. '*(<span class="mw-editsection"><span class="mw-editsection-bracket">'
			. '\[<\/span>.*?<span class="mw-editsection-bracket">\]<\/span><\/span>)?/u',
			'$1',
			$html
		);
	}

	/**
	 * Normalize the expected parser output by parsing it using a HTML5 parser and
	 * re-serializing it to HTML. Ideally, the parser would normalize inter-tag
	 * whitespace for us. For now, we fake that by simply stripping all newlines.
	 *
	 * If an Exception is thrown along the way, the failure is logged and
	 * the unmodified $source is returned.
	 *
	 * @param string $source
	 * @return string
	 */
	public static function normalizeHTML( string $source ): string {
		try {
			$body = self::unwrapSpansAndNormalizeIEW( DOMCompat::getBody( DOMUtils::parseHTML( $source ) ) );
			$html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );

			// a few things we ignore for now..
			// .replace(/\/wiki\/Main_Page/g, 'Main Page')
			// do not expect a toc for now
			$html = preg_replace(
				'/<div[^>]+?id="toc"[^>]*>\s*<div id="toctitle"[^>]*>[\s\S]+?<\/div>[\s\S]+?<\/div>\s*/u',
				'',
				$html );
			$html = self::normalizePhpOutput( $html );
			// remove empty span tags
			$html = preg_replace( '/(\s)<span>\s*<\/span>\s*/u', '$1', $html );
			$html = preg_replace( '/<span>\s*<\/span>/u', '', $html );
			// general class and titles, typically on links
			$html = preg_replace( '/ (class|rel|about|typeof)="[^"]*"/', '', $html );
			// strip red link markup, we do not check if a page exists yet
			// NOTE(review): if the serialized HTML escapes "&" as "&amp;" this
			// pattern may need "&amp;" instead of a bare "&" -- confirm.
			$html = preg_replace(
				"#/index.php\\?title=([^']+?)&action=edit&redlink=1#", '/wiki/$1', $html );
			// strip red link title info
			$html = preg_replace(
				"/ \\((?:page does not exist|encara no existeix|bet ele jaratılmaǵan|lonkásá ezalí tɛ̂)\\)/",
				'', $html );
			// the expected html has some extra space in tags, strip it
			$html = preg_replace( '/<a +href/', '<a href', $html );
			$html = preg_replace( '#href="/wiki/#', 'href="', $html );
			$html = preg_replace( '/" +>/', '">', $html );
			// parsoid always add a page name to lonely fragments
			$html = preg_replace( '/href="#/', 'href="Main Page#', $html );
			// replace unnecessary URL escaping
			$html = preg_replace_callback( '/ href="[^"]*"/',
				static function ( $m ) {
					return Utils::decodeURI( $m[0] );
				},
				$html );
			// strip empty spans
			$html = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $html );
			return preg_replace( '#<span>\s*</span>#u', '', $html );
		} catch ( Exception $e ) {
			error_log( 'normalizeHTML failed on ' . $source . ' with the following error: ' . $e );
			return $source;
		}
	}

	/**
	 * Wrap $string in console color codes when the optional
	 * php-console-color library is installed and reports color support;
	 * otherwise return $string unchanged.
	 *
	 * @param string $string
	 * @param string $color
	 * @param bool $inverse When true, the 'reverse' style is applied as well
	 * @return string
	 * @suppress PhanUndeclaredClassMethod
	 * @suppress UnusedSuppression
	 */
	public static function colorString(
		string $string, string $color, bool $inverse = false
	): string {
		$style = $inverse ? [ $color, 'reverse' ] : $color;

		if ( self::$consoleColor === null ) {
			// Attempt to instantiate this class to determine if the
			// (optional) php-console-color library is installed.
			try {
				self::$consoleColor = new \PHP_Parallel_Lint\PhpConsoleColor\ConsoleColor();
			} catch ( Error $e ) {
				// Fall back to no-color mode, and remember the failure so
				// the lookup is not repeated on every call.
				self::$consoleColor = false;
			}
		}

		if ( self::$consoleColor && self::$consoleColor->isSupported() ) {
			return self::$consoleColor->apply( $style, $string );
		}
		return $string;
	}
}