Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 178 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
TestUtils | |
0.00% |
0 / 178 |
|
0.00% |
0 / 11 |
2970 | |
0.00% |
0 / 1 |
encodeXml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
normalizeAbout | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
normalizeOut | |
0.00% |
0 / 56 |
|
0.00% |
0 / 1 |
72 | |||
cleanSpans | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
unwrapSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
newlineAround | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
normalizeIEWVisitor | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
650 | |||
unwrapSpansAndNormalizeIEW | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
normalizePhpOutput | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
normalizeHTML | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
6 | |||
colorString | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\ParserTests; |
5 | |
6 | use Error; |
7 | use Exception; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Html2Wt\DOMNormalizer; |
13 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
14 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
15 | use Wikimedia\Parsoid\Mocks\MockEnv; |
16 | use Wikimedia\Parsoid\Utils\ContentUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
18 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
19 | use Wikimedia\Parsoid\Utils\DOMUtils; |
20 | use Wikimedia\Parsoid\Utils\Utils; |
21 | use Wikimedia\Parsoid\Utils\WTUtils; |
22 | |
23 | class TestUtils { |
24 | /** @var mixed */ |
25 | private static $consoleColor; |
26 | |
/**
 * Placeholder for XML entity encoding.
 *
 * The intended encoder call is still commented out (see the PORT-FIXME
 * below), so the input is currently handed back without any changes.
 *
 * @param string $str Text that should eventually be entity-encoded.
 * @return string The very same string, unmodified.
 */
public static function encodeXml( string $str ): string {
	// PORT-FIXME: Find replacement
	// return entities::encodeXML( $str );
	return $str;
}
38 | |
/**
 * Drop the numeric part of about ids so that strings can be compared
 * regardless of the concrete id that was assigned.
 *
 * Every `about=` attribute whose value starts with `#mwt` followed by
 * digits loses those digits. Both quote styles are accepted, as is an
 * optional backslash in front of the opening quote.
 *
 * @param string $str
 * @return string
 */
public static function normalizeAbout( string $str ): string {
	$aboutIdPattern = "/(about=\\\\?[\"']#mwt)\d+/";
	return preg_replace( $aboutIdPattern, '$1', $str );
}
47 | |
/**
 * Specialized normalization of the PHP parser & Parsoid output, to ignore
 * a few known-ok differences in parser test runs.
 *
 * This code is also used by the Parsoid round-trip testing code.
 *
 * If parsoidOnly is true-ish, we allow more markup through (like property
 * and typeof attributes), for better checking of parsoid-only test cases.
 *
 * @param Element|string $domBody Body element, or an HTML string that is
 *   parsed into a document first.
 * @param array $options
 *  - parsoidOnly (bool) Is this test Parsoid Only? Optional. Default: false
 *  - preserveIEW (bool) Should inter-element WS be preserved? Optional. Default: false
 *  - hackyNormalize (bool) Apply the normalizer to the html. Optional. Default: false
 *  - externallinktarget (mixed) Only consulted when parsoidOnly is set. When
 *    absent or falsy, " nofollow" and " noreferrer noopener" are removed from
 *    the output. Optional.
 * @return string
 * @throws Error When the serialized-form sanity check fails.
 */
public static function normalizeOut( $domBody, array $options = [] ): string {
	$parsoidOnly = !empty( $options['parsoidOnly'] );
	$preserveIEW = !empty( $options['preserveIEW'] );

	if ( !empty( $options['hackyNormalize'] ) ) {
		// Mock env obj
		//
		// FIXME: This is ugly.
		// (a) The normalizer shouldn't need the full env.
		//     Pass options and a logger instead?
		// (b) DOM diff code is using page-id for some reason.
		//     That feels like a carryover of 2013 era code.
		//     If possible, get rid of it and diff-mark dependency
		//     on the env object.
		$mockEnv = new MockEnv( [] );
		$mockSerializer = new WikitextSerializer( $mockEnv, [] );
		$mockState = new SerializerState( $mockSerializer, [ 'selserMode' => false ] );
		if ( is_string( $domBody ) ) {
			// Careful about the lifetime of this document
			$doc = ContentUtils::createDocument( $domBody );
			$domBody = DOMCompat::getBody( $doc );
		}
		DOMDataUtils::visitAndLoadDataAttribs( $domBody, [ 'markNew' => true ] );
		( new DOMNormalizer( $mockState ) )->normalize( $domBody );
		DOMDataUtils::visitAndStoreDataAttribs( $domBody );
	} elseif ( is_string( $domBody ) ) {
		$domBody = DOMCompat::getBody( DOMUtils::parseHTML( $domBody ) );
	}

	// Spans whose typeof matches this pattern get unwrapped; Parsoid-only
	// tests keep everything except placeholders.
	$stripTypeof = $parsoidOnly ?
		'/^mw:Placeholder$/' :
		'/^mw:(?:DisplaySpace|Placeholder|Nowiki|Transclusion|Entity)$/';
	$domBody = self::unwrapSpansAndNormalizeIEW( $domBody, $stripTypeof, $parsoidOnly, $preserveIEW );
	$out = ContentUtils::toXML( $domBody, [ 'innerXML' => true ] );
	// NOTE that we use a slightly restricted regexp for "attribute"
	// which works for the output of DOM serialization.  For example,
	// we know that attribute values will be surrounded with double quotes,
	// not unquoted or quoted with single quotes.  The serialization
	// algorithm is given by:
	// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
	//
	// NOTE(review): the pattern is unanchored and every part of it may match
	// the empty string, so this check appears to fire only when preg_match()
	// itself fails (e.g. malformed UTF-8 under /u). Confirm whether anchoring
	// was intended before relying on it as a format validation.
	if ( !preg_match( '#[^<]*(<\w+(\s+[^\0-\cZ\s"\'>/=]+(="[^"]*")?)*/?>[^<]*)*#u', $out ) ) {
		throw new Error( 'normalizeOut input is not in standard serialized form' );
	}

	// Eliminate a source of indeterminacy from leaked strip markers
	$out = preg_replace( '/UNIQ-.*?-QINU/u', '', $out );

	// Normalize COINS ids -- they aren't stable
	$out = preg_replace( '/\s?id=[\'"]coins_\d+[\'"]/iu', '', $out );

	// maplink extension
	$out = preg_replace( '/\s?data-overlays=\'[^\']*\'/u', '', $out );

	// unnecessary attributes, we don't need to check these.
	$unnecessaryAttribs = 'data-parsoid|prefix|about|rev|datatype|inlist|usemap|vocab';
	if ( $parsoidOnly ) {
		$unnecessaryAttribs = "/ ($unnecessaryAttribs)=";
		$out = preg_replace( $unnecessaryAttribs . '\\\\?"[^\"]*\\\\?"/u', '', $out );
		$out = preg_replace( $unnecessaryAttribs . "\\\\?'[^\']*\\\\?'/u", '', $out ); // single-quoted variant
		// Values delimited by the literal entity text "&apos;" rather than
		// by a quote character.
		$out = preg_replace( $unnecessaryAttribs . '&apos;.*?&apos;/u', '', $out ); // apos variant
		// empty() rather than a bare index read: the option is optional and
		// must not raise an undefined-key warning when it is left out.
		if ( empty( $options['externallinktarget'] ) ) {
			$out = preg_replace( '/ nofollow/', '', $out );
			$out = preg_replace( '/ noreferrer noopener/', '', $out );
		}

		// strip self-closed <nowiki /> because we frequently test WTS
		// <nowiki> insertion by providing an html/parsoid section with the
		// <meta> tags stripped out, allowing the html2wt test to verify that
		// the <nowiki> is correctly added during WTS, while still allowing
		// the html2html and wt2html versions of the test to pass as a
		// validity check.  If <meta>s were not stripped, these tests would all
		// have to be modified and split up.  Not worth it at this time.
		// (see commit 689b22431ad690302420d049b10e689de6b7d426)
		$out = preg_replace( '#<span typeof="mw:Nowiki"></span>#', '', $out );

		// Parsoid-only tests stop here; the remaining, more aggressive
		// stripping only applies when comparing against legacy parser output.
		return $out;
	}

	// Normalize headings by stripping out Parsoid-added ids so that we don't
	// have to add these ids to every parser test that uses headings.
	// We will test the id generation scheme separately via mocha tests.
	$out = preg_replace( '/(<h[1-6].*?) id="[^\"]*"([^>]*>)/u', '$1$2', $out );
	// strip meta/link elements
	$out = preg_replace(
		'#</?(?:meta|link)(?: [^\0-\cZ\s"\'>/=]+(?:=(?:"[^"]*"|\'[^\']*\'))?)*/?>#u',
		'', $out );
	// Ignore troublesome attributes.
	// In addition to attributes listed above, strip other Parsoid-inserted attributes
	// since these won't be present in legacy parser output.
	$attribTroubleRE = "/ ($unnecessaryAttribs|data-mw|resource|rel|property|class)=\\\\?";
	$out = preg_replace( $attribTroubleRE . '"[^"]*\\\\?"/u', '', $out );
	$out = preg_replace( $attribTroubleRE . "'[^']*\\\\?'/u", '', $out ); // single-quoted variant
	// strip typeof last
	$out = preg_replace( '/ typeof="[^\"]*"/u', '', $out );
	// replace mwt ids
	$out = preg_replace( '/ id="mw((t\d+)|([\w-]{2,}))"/u', '', $out );
	// drop opening tags of spans that carry an about attribute
	$out = preg_replace( '/<span[^>]+about="[^"]*"[^>]*>/u', '', $out );
	// remove empty spans, keeping one preceding whitespace character if any
	$out = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $out );
	$out = preg_replace( '#<span>\s*</span>#u', '', $out );
	// strip leading "./" and "../" segments from hrefs
	$out = preg_replace( '#(href=")(?:\.?\./)+#u', '$1', $out );
	// replace unnecessary URL escaping
	$out = preg_replace_callback( '/ href="[^"]*"/u', static function ( $m ) {
		return Utils::decodeURI( $m[0] );
	}, $out );
	// strip thumbnail size prefixes
	return preg_replace(
		'#(src="[^"]*?)/thumb(/[0-9a-f]/[0-9a-f]{2}/[^/]+)/[0-9]+px-[^"/]+(?=")#u', '$1$2',
		$out
	);
}
174 | |
/**
 * Unwrap the direct <span> children of a node whose typeof attribute
 * matches the given pattern.
 *
 * Nothing happens when no (or an empty) pattern is supplied.
 *
 * @param Node $node Parent whose immediate children are inspected.
 * @param ?string $stripSpanTypeof Regular expression tested against each
 *   span's typeof attribute.
 */
private static function cleanSpans(
	Node $node, ?string $stripSpanTypeof
): void {
	if ( !$stripSpanTypeof ) {
		return;
	}

	$current = $node->firstChild;
	while ( $current ) {
		// Remember the sibling up front: unwrapping removes $current from $node.
		$following = $current->nextSibling;
		$isSpan = $current instanceof Element && DOMCompat::nodeName( $current ) === 'span';
		if ( $isSpan && preg_match( $stripSpanTypeof, $current->getAttribute( 'typeof' ) ?? '' ) ) {
			self::unwrapSpan( $node, $current, $stripSpanTypeof );
		}
		$current = $following;
	}
}
197 | |
/**
 * Replace a span by its own children.
 *
 * @param Node $parent Parent of the span being unwrapped.
 * @param Node $node The span to unwrap; it is removed from $parent.
 * @param ?string $stripSpanTypeof Pattern handed on to cleanSpans() for
 *   the span's immediate children.
 */
private static function unwrapSpan(
	Node $parent, Node $node, ?string $stripSpanTypeof
): void {
	// Deal with matching spans directly inside this one before touching it.
	self::cleanSpans( $node, $stripSpanTypeof );
	// Hand the children over to the parent (presumably placed in front of
	// $node, which is passed as the reference node), then drop the span.
	DOMUtils::migrateChildren( $node, $parent, $node );
	$parent->removeChild( $node );
}
212 | |
/**
 * Tell whether a node is one of the elements that get a newline placed
 * on either side of them (body, table parts, list parts, div, p, h1-h6).
 *
 * @param ?Node $node
 * @return bool False for null or for any node whose name is not in the list.
 */
private static function newlineAround( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$matched = preg_match(
		'/^(body|caption|div|dd|dt|li|p|table|tr|td|th|tbody|dl|ol|ul|h[1-6])$/D',
		DOMCompat::nodeName( $node )
	);
	return $matched === 1;
}
223 | |
/**
 * Recursive worker behind unwrapSpansAndNormalizeIEW(): normalizes
 * whitespace, unwraps selected spans and removes comments, modifying the
 * given subtree in place.
 *
 * @param Node $node Root of the subtree to process.
 * @param array $opts Traversal state; the keys read here are
 *  - preserveIEW (bool) leave text node whitespace and newlines alone
 *  - parsoidOnly (bool) keep comment nodes
 *  - stripSpanTypeof (?string) pattern passed to cleanSpans()
 *  - stripLeadingWS / stripTrailingWS (bool) trim that side of a text node
 *  - inPRE (bool) set once a <pre> has been entered
 * @return Node The node that was passed in.
 */
private static function normalizeIEWVisitor(
	Node $node, array $opts
): Node {
	$nodeName = DOMCompat::nodeName( $node );
	$isPre = $nodeName === 'pre';
	if ( $isPre ) {
		// Preserve newlines in <pre> tags
		$opts['inPRE'] = true;
	}

	if ( !$opts['preserveIEW'] && $node instanceof Text ) {
		if ( !$opts['inPRE'] ) {
			// collapse every whitespace run into a single space
			$node->data = preg_replace( '/\s+/u', ' ', $node->data );
		}
		if ( $opts['stripLeadingWS'] ) {
			$node->data = preg_replace( '/^\s+/u', '', $node->data, 1 );
		}
		if ( $opts['stripTrailingWS'] ) {
			$node->data = preg_replace( '/\s+$/uD', '', $node->data, 1 );
		}
	}

	// unwrap certain SPAN nodes
	self::cleanSpans( $node, $opts['stripSpanTypeof'] );

	// now remove comment nodes
	if ( !$opts['parsoidOnly'] ) {
		$kid = $node->firstChild;
		while ( $kid ) {
			$after = $kid->nextSibling;
			if ( $kid instanceof Comment ) {
				$node->removeChild( $kid );
			}
			$kid = $after;
		}
	}

	// reassemble text nodes split by a comment or span, if necessary
	if ( $node instanceof Element ) {
		DOMCompat::normalize( $node );
	}

	// Work out the strip flags for the children.
	if ( $isPre ) {
		// hack, since PHP adds a newline before </pre>
		$opts['stripLeadingWS'] = false;
		$opts['stripTrailingWS'] = true;
	} elseif ( !(
		$nodeName === 'span' &&
		preg_match( '/^mw[:]/', $node->getAttribute( 'typeof' ) ?? '' )
	) ) {
		// Anything but a transparent mw:* SPAN decides the flags itself;
		// such a SPAN simply passes the inherited strip parameters down.
		$stripAtEdges = self::newlineAround( $node );
		$opts['stripLeadingWS'] = $stripAtEdges;
		$opts['stripTrailingWS'] = $stripAtEdges;
	}

	// now recurse.
	$kid = $node->firstChild;
	// Skip over the empty mw:FallbackId <span> and strip leading WS
	// on the other side of it.
	if ( preg_match( '/^h[1-6]$/D', $nodeName ) &&
		$kid && WTUtils::isFallbackIdSpan( $kid )
	) {
		$kid = $kid->nextSibling;
	}
	while ( $kid ) {
		$after = $kid->nextSibling;
		$kidOpts = $opts;
		// only the last child may have its trailing whitespace stripped
		$kidOpts['stripTrailingWS'] = $opts['stripTrailingWS'] && !$after;
		self::normalizeIEWVisitor( $kid, $kidOpts );
		// only the first visited child may have its leading whitespace stripped
		$opts['stripLeadingWS'] = false;
		$kid = $after;
	}

	if ( $opts['inPRE'] || $opts['preserveIEW'] ) {
		return $node;
	}

	// now add newlines around appropriate nodes.
	$kid = $node->firstChild;
	while ( $kid ) {
		$before = $kid->previousSibling;
		$after = $kid->nextSibling;
		if ( self::newlineAround( $kid ) ) {
			if ( $before instanceof Text ) {
				$before->data = preg_replace( '/\s*$/uD', "\n", $before->data, 1 );
			} else {
				$node->insertBefore( $node->ownerDocument->createTextNode( "\n" ), $kid );
			}
			if ( $after instanceof Text ) {
				$after->data = preg_replace( '/^\s*/u', "\n", $after->data, 1 );
			} else {
				// The walk continues from the freshly inserted text node.
				$after = $node->ownerDocument->createTextNode( "\n" );
				$node->insertBefore( $after, $kid->nextSibling );
			}
		}
		$kid = $after;
	}
	return $node;
}
318 | |
/**
 * Produce a whitespace-normalized copy of a body element, with selected
 * spans unwrapped along the way.
 *
 * The element passed in is left untouched; all work happens on a deep clone.
 *
 * @param Element $body The document body node to normalize.
 * @param ?string $stripSpanTypeof Regular expression selecting, by typeof
 *   attribute, the spans to unwrap. Null disables unwrapping.
 * @param bool $parsoidOnly Forwarded to the visitor, which keeps comment
 *   nodes when this is true.
 * @param bool $preserveIEW Forwarded to the visitor, which leaves text node
 *   whitespace alone when this is true.
 * @return Element The normalized clone.
 */
public static function unwrapSpansAndNormalizeIEW(
	Element $body, ?string $stripSpanTypeof = null, bool $parsoidOnly = false, bool $preserveIEW = false
): Element {
	// Work on a deep copy, since the visitor destructively mutates its input.
	$workingCopy = $body->cloneNode( true );
	$visitorOpts = [
		'preserveIEW' => $preserveIEW,
		'parsoidOnly' => $parsoidOnly,
		'stripSpanTypeof' => $stripSpanTypeof,
		'stripLeadingWS' => true,
		'stripTrailingWS' => true,
		'inPRE' => false
	];
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return self::normalizeIEWVisitor( $workingCopy, $visitorOpts );
}
343 | |
/**
 * Remove heading decoration found in php parser output that we do not
 * generate ourselves.
 *
 * Each mw-headline span is replaced by its contents, and a section edit
 * link directly following it (if any) is dropped together with the spaces
 * in between.
 *
 * @param string $html
 * @return string
 */
public static function normalizePhpOutput( string $html ): string {
	// the headline wrapper; capture group 1 is the heading text that is kept
	$headline = '<span[^>]+class="mw-headline"[^>]*>(.*?)<\/span> *';
	// do not expect section editing for now
	$editSection = '(<span class="mw-editsection"><span class="mw-editsection-bracket">'
		. '\[<\/span>.*?<span class="mw-editsection-bracket">\]<\/span><\/span>)?';

	return preg_replace( '/' . $headline . $editSection . '/u', '$1', $html );
}
360 | |
/**
 * Normalize the expected parser output by parsing it using a HTML5 parser and
 * re-serializing it to HTML, then stripping markup that is not compared
 * (table of contents, section edit links, red link decoration, a few
 * attributes, empty spans).
 *
 * Inter-element whitespace is normalized by unwrapSpansAndNormalizeIEW()
 * before the string-level clean-ups run.
 *
 * @param string $source
 * @return string The normalized HTML, or $source unchanged when an
 *   Exception is raised along the way (the failure is logged).
 */
public static function normalizeHTML( string $source ): string {
	try {
		$body = self::unwrapSpansAndNormalizeIEW( DOMCompat::getBody( DOMUtils::parseHTML( $source ) ) );
		$html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );

		// a few things we ignore for now..
		//  .replace(/\/wiki\/Main_Page/g, 'Main Page')
		// do not expect a toc for now
		$html = preg_replace(
			'/<div[^>]+?id="toc"[^>]*>\s*<div id="toctitle"[^>]*>[\s\S]+?<\/div>[\s\S]+?<\/div>\s*/u',
			'',
			$html );
		$html = self::normalizePhpOutput( $html );
		// remove empty span tags
		$html = preg_replace( '/(\s)<span>\s*<\/span>\s*/u', '$1', $html );
		$html = preg_replace( '/<span>\s*<\/span>/u', '', $html );
		// general class and titles, typically on links
		$html = preg_replace( '/ (class|rel|about|typeof)="[^"]*"/', '', $html );
		// strip red link markup, we do not check if a page exists yet
		// NOTE(review): serialized attribute values would normally carry '&'
		// as '&amp;'; confirm that matching a bare '&' is what is intended.
		$html = preg_replace(
			"#/index.php\\?title=([^']+?)&action=edit&redlink=1#", '/wiki/$1', $html );
		// strip red link title info
		$html = preg_replace(
			"/ \\((?:page does not exist|encara no existeix|bet ele jaratılmaǵan|lonkásá ezalí tɛ̂)\\)/",
			'', $html );
		// the expected html has some extra space in tags, strip it
		$html = preg_replace( '/<a +href/', '<a href', $html );
		$html = preg_replace( '#href="/wiki/#', 'href="', $html );
		$html = preg_replace( '/" +>/', '">', $html );
		// parsoid always add a page name to lonely fragments
		$html = preg_replace( '/href="#/', 'href="Main Page#', $html );
		// replace unnecessary URL escaping
		$html = preg_replace_callback( '/ href="[^"]*"/',
			static function ( $m ) {
				return Utils::decodeURI( $m[0] );
			},
			$html );
		// strip empty spans (again: the replacements above may have emptied more)
		$html = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $html );
		return preg_replace( '#<span>\s*</span>#u', '', $html );
	} catch ( Exception $e ) {
		// Best effort: report the problem and fall back to the raw input.
		// The separating space keeps the source from running into "on".
		error_log( 'normalizeHTML failed on ' . $source . ' with the following error: ' . $e );
		return $source;
	}
}
414 | |
/**
 * Wrap a string in console color codes when the optional
 * php-console-color library is installed and reports color support;
 * otherwise hand the string back as is.
 *
 * @param string $string Text to colorize.
 * @param string $color Style name passed to the color library.
 * @param bool $inverse When true, the 'reverse' style is applied as well.
 * @return string
 * @suppress PhanUndeclaredClassMethod
 * @suppress UnusedSuppression
 */
public static function colorString(
	string $string, string $color, bool $inverse = false
): string {
	$style = $inverse ? [ $color, 'reverse' ] : $color;

	if ( !self::$consoleColor ) {
		// Attempt to instantiate this class to determine if the
		// (optional) php-console-color library is installed.
		try {
			self::$consoleColor = new \JakubOnderka\PhpConsoleColor\ConsoleColor();
		} catch ( Error $e ) {
			/* fall back to no-color mode */
		}
	}

	if ( !self::$consoleColor || !self::$consoleColor->isSupported() ) {
		return $string;
	}
	return self::$consoleColor->apply( $style, $string );
}
446 | } |