Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 193 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
| TestUtils | |
0.00% |
0 / 193 |
|
0.00% |
0 / 14 |
5112 | |
0.00% |
0 / 1 |
| encodeXml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeAbout | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeOut | |
0.00% |
0 / 57 |
|
0.00% |
0 / 1 |
72 | |||
| stripParsoidIds | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| cleanSpans | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
42 | |||
| unwrapSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| newlineAround | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| normalizeIEWVisitor | |
0.00% |
0 / 47 |
|
0.00% |
0 / 1 |
650 | |||
| unwrapSpansAndNormalizeIEW | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
| normalizePhpOutput | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
90 | |||
| normalizeHTML | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
6 | |||
| colorString | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
72 | |||
| filterDsr | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
| filterNodeDsr | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\ParserTests; |
| 5 | |
| 6 | use Error; |
| 7 | use Exception; |
| 8 | use Wikimedia\Parsoid\DOM\Comment; |
| 9 | use Wikimedia\Parsoid\DOM\Element; |
| 10 | use Wikimedia\Parsoid\DOM\Node; |
| 11 | use Wikimedia\Parsoid\DOM\Text; |
| 12 | use Wikimedia\Parsoid\Html2Wt\DOMNormalizer; |
| 13 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
| 14 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
| 15 | use Wikimedia\Parsoid\Mocks\MockEnv; |
| 16 | use Wikimedia\Parsoid\Utils\ContentUtils; |
| 17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 18 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 19 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 20 | use Wikimedia\Parsoid\Utils\Utils; |
| 21 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 22 | |
| 23 | /** |
| 24 | * This class contains helper functions which should not be directly used |
| 25 | * outside of Parsoid. |
| 26 | * |
| 27 | * Per T332457, most of the code in Wikimedia\Parsoid\ParserTests is |
| 28 | * "for use in parser test runners only", including the core parser |
| 29 | * test runner, but this file is "more internal" than that: core's |
| 30 | * parser test runner should not use these helpers directly. |
| 31 | * |
| 32 | * @internal |
| 33 | */ |
| 34 | class TestUtils { |
| 35 | /** @var mixed */ |
| 36 | private static $consoleColor; |
| 37 | /** @var 'auto'|bool Color mode. */ |
| 38 | public static bool|string $colorMode = 'auto'; |
| 39 | |
/**
 * Placeholder for XML entity encoding.
 *
 * No encoder has been wired into this port yet, so the input is handed
 * back exactly as received.
 *
 * @param string $str Text that would be entity-encoded.
 * @return string The same string, unmodified.
 */
public static function encodeXml( string $str ): string {
	// PORT-FIXME: Find replacement
	// return entities::encodeXML( $str );
	$passThrough = $str;
	return $passThrough;
}
| 51 | |
/**
 * Remove the numeric part of about ids so that generated ids do not
 * show up as differences: `about="#mwt123"` becomes `about="#mwt"`.
 *
 * Both single- and double-quoted values are handled, including the case
 * where the quote character is preceded by a backslash.
 *
 * @param string $str
 * @return string
 */
public static function normalizeAbout( string $str ): string {
	$aboutIdPattern = "/(about=\\\\?[\"']#mwt)\d+/";
	// Keep everything up to and including "#mwt"; drop the digits.
	return preg_replace( $aboutIdPattern, '$1', $str );
}
| 60 | |
/**
 * Specialized normalization of the PHP parser & Parsoid output, to ignore
 * a few known-ok differences in parser test runs.
 *
 * This code is also used by the Parsoid round-trip testing code.
 *
 * If parsoidOnly is true-ish, we allow more markup through (like property
 * and typeof attributes), for better checking of parsoid-only test cases.
 *
 * @param Element|string $domBody A body element, or an HTML string that
 *   is parsed into one.
 * @param array $options
 *  - parsoidOnly (bool) Is this test Parsoid Only? Optional. Default: false
 *  - preserveIEW (bool) Should inter-element WS be preserved? Optional. Default: false
 *  - hackyNormalize (bool) Apply the normalizer to the html. Optional. Default: false
 *  - externallinktarget (mixed) Only consulted when parsoidOnly is set. When
 *    absent or falsy, "nofollow" and "noreferrer noopener" rel values are
 *    stripped from the output. Optional.
 * @return string
 * @throws Error If the serialized output fails the serialized-form check.
 */
public static function normalizeOut( $domBody, array $options = [] ): string {
	$parsoidOnly = !empty( $options['parsoidOnly'] );
	$preserveIEW = !empty( $options['preserveIEW'] );

	if ( !empty( $options['hackyNormalize'] ) ) {
		// Mock env obj
		//
		// FIXME: This is ugly.
		// (a) The normalizer shouldn't need the full env.
		//     Pass options and a logger instead?
		// (b) DOM diff code is using page-id for some reason.
		//     That feels like a carryover of 2013 era code.
		//     If possible, get rid of it and diff-mark dependency
		//     on the env object.
		$mockEnv = new MockEnv( [] );
		$mockSerializer = new WikitextSerializer( $mockEnv, [] );
		$mockState = new SerializerState( $mockSerializer, [ 'selserMode' => false ] );
		if ( is_string( $domBody ) ) {
			// Careful about the lifetime of this document
			$doc = ContentUtils::createAndLoadDocument( $domBody );
			$domBody = DOMCompat::getBody( $doc );
		} else {
			DOMDataUtils::visitAndLoadDataAttribs( $domBody, [ 'markNew' => true ] );
		}
		( new DOMNormalizer( $mockState ) )->normalize( $domBody );
		DOMDataUtils::visitAndStoreDataAttribs( $domBody );
		DOMDataUtils::getBag( $domBody->ownerDocument )->loaded = false;
	} elseif ( is_string( $domBody ) ) {
		$domBody = DOMCompat::getBody( DOMUtils::parseHTML( $domBody ) );
	}

	// Spans whose typeof matches this pattern get unwrapped; the
	// parsoid-only mode keeps more of them.
	$stripTypeof = $parsoidOnly ?
		'/^mw:Placeholder$/' :
		'/^mw:(?:DisplaySpace|Placeholder|Nowiki|Transclusion|Entity)$/';
	$domBody = self::unwrapSpansAndNormalizeIEW( $domBody, $stripTypeof, $parsoidOnly, $preserveIEW );
	$out = ContentUtils::toXML( $domBody, [ 'innerXML' => true ] );
	// NOTE that we use a slightly restricted regexp for "attribute"
	// which works for the output of DOM serialization.  For example,
	// we know that attribute values will be surrounded with double quotes,
	// not unquoted or quoted with single quotes.  The serialization
	// algorithm is given by:
	// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
	//
	// NOTE(review): the pattern is unanchored and its repeated group is
	// optional, so it can match an empty prefix of any input; the check
	// appears to fail only when preg_match() itself returns false (e.g.
	// invalid UTF-8 under /u). Confirm whether anchoring was intended.
	if ( !preg_match( '#[^<]*(<\w+(\s+[^\0-\cZ\s"\'>/=]+(="[^"]*")?)*/?>[^<]*)*#u', $out ) ) {
		throw new Error( 'normalizeOut input is not in standard serialized form' );
	}

	// Eliminate a source of indeterminacy from leaked strip markers
	$out = preg_replace( '/UNIQ-.*?-QINU/u', '', $out );

	// Normalize COINS ids -- they aren't stable
	$out = preg_replace( '/\s?id=[\'"]coins_\d+[\'"]/iu', '', $out );

	// maplink extension
	$out = preg_replace( '/\s?data-overlays=\'[^\']*\'/u', '', $out );

	// unnecessary attributes, we don't need to check these.
	$unnecessaryAttribs = 'data-parsoid|prefix|about|rev|datatype|inlist|usemap|vocab';
	if ( $parsoidOnly ) {
		$unnecessaryAttribs = "/ ($unnecessaryAttribs)=";
		$out = preg_replace( $unnecessaryAttribs . '\\\\?"[^\"]*\\\\?"/u', '', $out );
		$out = preg_replace( $unnecessaryAttribs . "\\\\?'[^\']*\\\\?'/u", '', $out ); // single-quoted variant
		// Values delimited by the literal entity text "&apos;" rather
		// than by a quote character.
		$out = preg_replace( $unnecessaryAttribs . '&apos;.*?&apos;/u', '', $out ); // apos variant
		// empty() rather than a bare read: the option is optional and a
		// missing key must not raise an "Undefined array key" warning.
		if ( empty( $options['externallinktarget'] ) ) {
			$out = preg_replace( '/ nofollow/', '', $out );
			$out = str_replace( ' rel="nofollow"', '', $out );
			$out = preg_replace( '/ noreferrer noopener/', '', $out );
		}

		// strip self-closed <nowiki /> because we frequently test WTS
		// <nowiki> insertion by providing an html/parsoid section with the
		// <meta> tags stripped out, allowing the html2wt test to verify that
		// the <nowiki> is correctly added during WTS, while still allowing
		// the html2html and wt2html versions of the test to pass as a
		// validity check.  If <meta>s were not stripped, these tests would all
		// have to be modified and split up.  Not worth it at this time.
		// (see commit 689b22431ad690302420d049b10e689de6b7d426)
		$out = preg_replace( '#<span typeof="mw:Nowiki"></span>#', '', $out );

		return $out;
	}

	// strip meta/link elements
	$out = preg_replace(
		'#</?(?:meta|link)(?: [^\0-\cZ\s"\'>/=]+(?:=(?:"[^"]*"|\'[^\']*\'))?)*/?>#u',
		'', $out );
	// Ignore troublesome attributes.
	// In addition to attributes listed above, strip other Parsoid-inserted attributes
	// since these won't be present in legacy parser output.
	$attribTroubleRE = "/ ($unnecessaryAttribs|data-mw|resource|rel|property|class)=\\\\?";
	$out = preg_replace( $attribTroubleRE . '"[^"]*\\\\?"/u', '', $out );
	$out = preg_replace( $attribTroubleRE . "'[^']*\\\\?'/u", '', $out ); // single-quoted variant
	// strip typeof last
	$out = preg_replace( '/ typeof="[^\"]*"/u', '', $out );
	$out = self::stripParsoidIds( $out );
	$out = preg_replace( '/<span[^>]+about="[^"]*"[^>]*>/u', '', $out );
	// Drop empty spans; when one follows whitespace, also swallow the
	// whitespace after it so only the leading whitespace remains.
	$out = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $out );
	$out = preg_replace( '#<span>\s*</span>#u', '', $out );
	// Remove leading "./" and "../" segments from href values.
	$out = preg_replace( '#(href=")(?:\.?\./)+#u', '$1', $out );
	// replace unnecessary URL escaping
	$out = preg_replace_callback( '/ href="[^"]*"/u', static function ( $m ) {
		return Utils::decodeURI( $m[0] );
	}, $out );
	// strip thumbnail size prefixes
	return preg_replace(
		'#(src="[^"]*?)/thumb(/[0-9a-f]/[0-9a-f]{2}/[^/]+)/[0-9]+px-[^"/]+(?=")#u', '$1$2',
		$out
	);
}
| 185 | |
/**
 * Remove Parsoid-generated id attributes from an HTML string.
 *
 * Matches ` id="mw…"` where at least two word characters or hyphens
 * follow the "mw" prefix (these ids are used to associate NodeData).
 *
 * @param string $s
 * @return string
 */
public static function stripParsoidIds( string $s ): string {
	$parsoidIdAttr = '/ id="mw([-\w]{2,})"/u';
	return preg_replace( $parsoidIdAttr, '', $s );
}
| 194 | |
/**
 * Unwrap every direct <span> child of $node whose typeof attribute
 * matches the given pattern.
 *
 * @param Node $node Parent whose children are inspected.
 * @param ?string $stripSpanTypeof Regular expression applied to the
 *   typeof attribute; a null or otherwise falsy value disables the pass.
 */
private static function cleanSpans(
	Node $node, ?string $stripSpanTypeof
): void {
	if ( !$stripSpanTypeof ) {
		return;
	}

	$current = $node->firstChild;
	while ( $current ) {
		// Remember the successor first: unwrapping detaches $current.
		$following = $current->nextSibling;
		$isSpan = $current instanceof Element && DOMUtils::nodeName( $current ) === 'span';
		if ( $isSpan ) {
			$typeof = DOMCompat::getAttribute( $current, 'typeof' ) ?? '';
			if ( preg_match( $stripSpanTypeof, $typeof ) ) {
				self::unwrapSpan( $node, $current, $stripSpanTypeof );
			}
		}
		$current = $following;
	}
}
| 211 | |
/**
 * Replace a span with its own children, after cleaning matching spans
 * among those children.
 *
 * @param Node $parent Parent of $node; receives the migrated children.
 * @param Node $node The span being unwrapped.
 * @param ?string $stripSpanTypeof Pattern forwarded to cleanSpans().
 */
private static function unwrapSpan( Node $parent, Node $node, ?string $stripSpanTypeof ): void {
	// Inner spans go first, so that what gets hoisted is already clean.
	self::cleanSpans( $node, $stripSpanTypeof );

	// Hoist the children to sit immediately before the span ...
	DOMUtils::migrateChildren( $node, $parent, $node );

	// ... and then discard the now-empty wrapper.
	$parent->removeChild( $node );
}
| 221 | |
/**
 * Whether the node is one of the elements that get newlines placed
 * around them (body, caption, div, dd, dt, li, p, table, tr, td, th,
 * tbody, dl, ol, ul, h1-h6).
 *
 * @param ?Node $node
 * @return bool False for a null node or any other node name.
 */
private static function newlineAround( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMUtils::nodeName( $node );
	return (bool)preg_match(
		'/^(body|caption|div|dd|dt|li|p|table|tr|td|th|tbody|dl|ol|ul|h[1-6])$/D',
		$name
	);
}
| 228 | |
/**
 * Recursive worker for unwrapSpansAndNormalizeIEW(): normalizes
 * whitespace in text nodes, unwraps selected spans, optionally removes
 * comments, and puts newlines around selected elements. The tree is
 * modified in place.
 *
 * @param Node $node Node to process.
 * @param array $opts Traversal state:
 *  - preserveIEW (bool) leave text node whitespace untouched
 *  - parsoidOnly (bool) when false, comment children are removed
 *  - stripSpanTypeof (?string) pattern handed to cleanSpans()
 *  - stripLeadingWS (bool) trim leading whitespace of a text node
 *  - stripTrailingWS (bool) trim trailing whitespace of a text node
 *  - inPRE (bool) set once a <pre> has been entered
 * @return Node The node that was passed in.
 */
private static function normalizeIEWVisitor( Node $node, array $opts ): Node {
	$tagName = DOMUtils::nodeName( $node );
	$isPre = ( $tagName === 'pre' );
	if ( $isPre ) {
		// Preserve newlines in <pre> tags
		$opts['inPRE'] = true;
	}

	if ( !$opts['preserveIEW'] && $node instanceof Text ) {
		// Outside <pre>, each whitespace run collapses to one space.
		if ( !$opts['inPRE'] ) {
			$node->data = preg_replace( '/\s+/u', ' ', $node->data );
		}
		if ( $opts['stripLeadingWS'] ) {
			$node->data = preg_replace( '/^\s+/u', '', $node->data, 1 );
		}
		if ( $opts['stripTrailingWS'] ) {
			$node->data = preg_replace( '/\s+$/uD', '', $node->data, 1 );
		}
	}

	// unwrap certain SPAN nodes
	self::cleanSpans( $node, $opts['stripSpanTypeof'] );

	// now remove comment nodes
	if ( !$opts['parsoidOnly'] ) {
		$kid = $node->firstChild;
		while ( $kid ) {
			$after = $kid->nextSibling;
			if ( $kid instanceof Comment ) {
				$node->removeChild( $kid );
			}
			$kid = $after;
		}
	}

	// reassemble text nodes split by a comment or span, if necessary
	if ( $node instanceof Element ) {
		DOMCompat::normalize( $node );
	}

	// Decide which strip flags the children inherit.
	if ( $isPre ) {
		// hack, since PHP adds a newline before </pre>
		$opts['stripLeadingWS'] = false;
		$opts['stripTrailingWS'] = true;
	} elseif ( !( $tagName === 'span' && DOMUtils::matchTypeOf( $node, '/^mw:/' ) ) ) {
		// An mw:* SPAN is transparent and hands its flags to its kids
		// unchanged; every other node recomputes them here.
		$aroundBlock = self::newlineAround( $node );
		$opts['stripLeadingWS'] = $aroundBlock;
		$opts['stripTrailingWS'] = $aroundBlock;
	}

	// now recurse.
	$kid = $node->firstChild;
	// Skip over the empty mw:FallbackId <span> and strip leading WS
	// on the other side of it.
	if ( $kid && DOMUtils::isHeading( $node ) && WTUtils::isFallbackIdSpan( $kid ) ) {
		$kid = $kid->nextSibling;
	}
	while ( $kid ) {
		$after = $kid->nextSibling;
		$kidOpts = $opts;
		// Trailing whitespace is stripped only in the last child.
		$kidOpts['stripTrailingWS'] = $opts['stripTrailingWS'] && !$after;
		self::normalizeIEWVisitor( $kid, $kidOpts );
		// Leading whitespace is stripped only in the first visited child.
		$opts['stripLeadingWS'] = false;
		$kid = $after;
	}

	if ( $opts['inPRE'] || $opts['preserveIEW'] ) {
		return $node;
	}

	// now add newlines around appropriate nodes.
	$kid = $node->firstChild;
	while ( $kid ) {
		$before = $kid->previousSibling;
		$after = $kid->nextSibling;
		if ( self::newlineAround( $kid ) ) {
			if ( $before instanceof Text ) {
				$before->data = preg_replace( '/\s*$/uD', "\n", $before->data, 1 );
			} else {
				$node->insertBefore( $node->ownerDocument->createTextNode( "\n" ), $kid );
			}
			if ( $after instanceof Text ) {
				$after->data = preg_replace( '/^\s*/u', "\n", $after->data, 1 );
			} else {
				// The walk continues from this freshly inserted text node.
				$after = $node->ownerDocument->createTextNode( "\n" );
				$node->insertBefore( $after, $kid->nextSibling );
			}
		}
		$kid = $after;
	}
	return $node;
}
| 314 | |
/**
 * Produce a normalized copy of a body element: selected spans are
 * unwrapped and inter-element whitespace is normalized. The element
 * passed in is left untouched.
 *
 * @param Element $body The document body node to normalize.
 * @param ?string $stripSpanTypeof Regular expression matched against the
 *   typeof attribute of spans; matching spans are unwrapped. Null disables
 *   span unwrapping.
 * @param bool $parsoidOnly When false, comment nodes are removed as well.
 * @param bool $preserveIEW When true, text node whitespace is left as is.
 * @return Element The normalized clone.
 */
public static function unwrapSpansAndNormalizeIEW(
	Element $body, ?string $stripSpanTypeof = null, bool $parsoidOnly = false, bool $preserveIEW = false
): Element {
	// The visitor mutates the tree it walks, so hand it a deep copy.
	$copy = $body->cloneNode( true );
	$visitorOpts = [
		'inPRE' => false,
		'stripLeadingWS' => true,
		'stripTrailingWS' => true,
		'stripSpanTypeof' => $stripSpanTypeof,
		'parsoidOnly' => $parsoidOnly,
		'preserveIEW' => $preserveIEW,
	];
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return self::normalizeIEWVisitor( $copy, $visitorOpts );
}
| 339 | |
/**
 * Remove, in place, pieces of PHP parser output that are not being
 * generated on the other side of the comparison: section edit links,
 * heading wrappers and the table of contents.
 *
 * @param Element $body Body element to clean up.
 */
public static function normalizePhpOutput( Element $body ): void {
	// Do not expect section editing for now
	foreach ( DOMCompat::querySelectorAll( $body, '.mw-editsection' ) as $editSection ) {
		DOMCompat::remove( $editSection );
	}

	// Parsoid adds heading wrappers in an OutputTransform stage
	foreach ( DOMCompat::querySelectorAll( $body, '.mw-heading' ) as $wrapper ) {
		$kids = DOMUtils::childNodes( $wrapper );
		foreach ( $kids as $idx => $kid ) {
			if ( $kid instanceof Text ) {
				// Remove the nls: one leading newline from a first text
				// child, one trailing newline from a last text child.
				if ( $idx === 0 ) {
					$kid->data = preg_replace( "/^\n/uD", '', $kid->data );
				}
				if ( $idx === ( count( $kids ) - 1 ) ) {
					$kid->data = preg_replace( "/\n$/uD", '', $kid->data );
				}
			}
			// Move the child out, in front of the wrapper.
			$wrapper->parentNode->insertBefore( $kid, $wrapper );
		}
		DOMCompat::remove( $wrapper );
	}

	// Do not expect a toc for now
	$toc = DOMCompat::querySelector( $body, '#toc' );
	if ( $toc ) {
		DOMCompat::remove( $toc );
	}
}
| 369 | |
/**
 * Normalize the expected parser output by parsing it using a HTML5 parser and
 * re-serializing it to HTML, then applying a series of textual clean-ups.
 *
 * Inter-element whitespace is normalized by unwrapSpansAndNormalizeIEW()
 * and PHP-only output is removed by normalizePhpOutput() before the body
 * is serialized.
 *
 * @param string $source HTML to normalize.
 * @return string The normalized HTML. If an Exception is thrown along the
 *   way it is logged via error_log() and $source is returned unchanged.
 */
public static function normalizeHTML( string $source ): string {
	try {
		$body = self::unwrapSpansAndNormalizeIEW( DOMCompat::getBody( DOMUtils::parseHTML( $source ) ) );
		self::normalizePhpOutput( $body );
		$html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );

		// a few things we ignore for now..
		// .replace(/\/wiki\/Main_Page/g, 'Main Page')

		// remove empty span tags
		$html = preg_replace( '/(\s)<span>\s*<\/span>\s*/u', '$1', $html );
		$html = preg_replace( '/<span>\s*<\/span>/u', '', $html );
		// general class and titles, typically on links
		$html = preg_replace( '/ (class|rel|about|typeof)="[^"]*"/', '', $html );
		// strip red link markup, we do not check if a page exists yet
		$html = preg_replace(
			"#/index.php\\?title=([^']+?)&action=edit&redlink=1#", '/wiki/$1', $html );
		// strip red link title info
		$html = preg_replace(
			"/ \\((?:page does not exist|encara no existeix|bet ele jaratılmaǵan|lonkásá ezalí tɛ̂)\\)/",
			'', $html );
		// the expected html has some extra space in tags, strip it
		$html = preg_replace( '/<a +href/', '<a href', $html );
		$html = preg_replace( '#href="/wiki/#', 'href="', $html );
		$html = preg_replace( '/" +>/', '">', $html );
		// parsoid always add a page name to lonely fragments
		$html = preg_replace( '/href="#/', 'href="Main Page#', $html );
		// replace unnecessary URL escaping
		$html = preg_replace_callback( '/ href="[^"]*"/',
			static function ( $m ) {
				return Utils::decodeURI( $m[0] );
			},
			$html );
		// strip empty spans again: removing class/rel/about/typeof above
		// can leave behind spans that only now have no attributes.
		$html = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $html );
		return preg_replace( '#<span>\s*</span>#u', '', $html );
	} catch ( Exception $e ) {
		// Best effort: report the failure and fall back to the raw input.
		error_log( 'normalizeHTML failed on ' . $source . ' with the following error: ' . $e );
		return $source;
	}
}
| 419 | |
/**
 * Wrap a string in console color codes when coloring is available and
 * enabled; otherwise return it unchanged.
 *
 * Coloring relies on the optional php-console-color library. With a
 * boolean color mode the setting is taken as is; in any other mode
 * ('auto') color is used when the library reports support.
 *
 * @param string $string Text to colorize.
 * @param string $color Style name handed to the console color library.
 * @param bool $inverse Additionally apply the 'reverse' style.
 * @return string
 * @suppress PhanUndeclaredClassMethod
 * @suppress UnusedSuppression
 */
public static function colorString(
	string $string, string $color, bool $inverse = false
): string {
	$style = $inverse ? [ $color, 'reverse' ] : $color;

	if ( !self::$consoleColor ) {
		// Attempt to instantiate this class to determine if the
		// (optional) php-console-color library is installed.
		try {
			self::$consoleColor = new \PHP_Parallel_Lint\PhpConsoleColor\ConsoleColor();
		} catch ( Error ) {
			/* fall back to no-color mode */
		}
	}

	$console = self::$consoleColor;
	if ( !$console ) {
		// Library unavailable: nothing can be colored.
		return $string;
	}

	// 'auto' color mode: use color if a tty.
	$enabled = is_bool( self::$colorMode ) ? self::$colorMode : $console->isSupported();
	if ( !$enabled ) {
		return $string;
	}

	$console->setForceStyle( true );
	return $console->apply( $style, $string );
}
| 455 | |
/**
 * Test normalization for a complete document: drop the DSR information
 * from data-parsoid on every element, then serialize the body. Any
 * data-parsoid attribute left as `{}` is removed from the result.
 *
 * @param string $raw HTML of the document.
 * @return string Serialized inner XML of the body.
 */
public static function filterDsr( string $raw ): string {
	$doc = ContentUtils::createAndLoadDocument( $raw );
	foreach ( DOMUtils::childNodes( $doc ) as $topLevel ) {
		if ( !( $topLevel instanceof Element ) ) {
			continue;
		}
		self::filterNodeDsr( $topLevel );
	}
	$serialized = ContentUtils::ppToXML( DOMCompat::getBody( $doc ), [ 'innerXML' => true ] );
	// Drop data-parsoid attributes that ended up empty.
	return preg_replace( '/\sdata-parsoid="{}"/', '', $serialized );
}
| 473 | |
/**
 * Test normalization for a single element: unset the dsr entry of its
 * data-parsoid and of all descendant elements' data-parsoid.
 *
 * @param Element $el Root of the subtree to process.
 */
public static function filterNodeDsr( Element $el ): void {
	$dataParsoid = DOMDataUtils::getDataParsoid( $el );
	unset( $dataParsoid->dsr );
	// XXX: could also set TempData::IS_NEW if !$dp->isModified(),
	// rather than relying on the preg_replace in filterDsr().
	foreach ( DOMUtils::childNodes( $el ) as $kid ) {
		if ( !( $kid instanceof Element ) ) {
			continue;
		}
		self::filterNodeDsr( $kid );
	}
}
| 488 | } |