Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 208 |
|
0.00% |
0 / 16 |
CRAP | |
0.00% |
0 / 1 |
| TestUtils | |
0.00% |
0 / 208 |
|
0.00% |
0 / 16 |
6162 | |
0.00% |
0 / 1 |
| encodeXml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeAbout | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeOut | |
0.00% |
0 / 55 |
|
0.00% |
0 / 1 |
56 | |||
| stripParsoidIds | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| cleanSpans | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
42 | |||
| unwrapSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| newlineAround | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| newlineBefore | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| newlineAfter | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| normalizeIEWVisitor | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
812 | |||
| unwrapSpansAndNormalizeIEW | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
| normalizePhpOutput | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
90 | |||
| normalizeHTML | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
6 | |||
| colorString | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
72 | |||
| filterDsr | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| filterNodeDsr | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
30 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\ParserTests; |
| 5 | |
| 6 | use Error; |
| 7 | use Exception; |
| 8 | use Wikimedia\Parsoid\Core\DOMCompat; |
| 9 | use Wikimedia\Parsoid\DOM\Comment; |
| 10 | use Wikimedia\Parsoid\DOM\Element; |
| 11 | use Wikimedia\Parsoid\DOM\Node; |
| 12 | use Wikimedia\Parsoid\DOM\Text; |
| 13 | use Wikimedia\Parsoid\Html2Wt\DOMNormalizer; |
| 14 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
| 15 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
| 16 | use Wikimedia\Parsoid\Mocks\MockEnv; |
| 17 | use Wikimedia\Parsoid\NodeData\TempData; |
| 18 | use Wikimedia\Parsoid\Utils\ContentUtils; |
| 19 | use Wikimedia\Parsoid\Utils\CounterType; |
| 20 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 21 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 22 | use Wikimedia\Parsoid\Utils\Utils; |
| 23 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 24 | |
| 25 | /** |
| 26 | * This class contains helper functions which should not be directly used |
| 27 | * outside of Parsoid. |
| 28 | * |
| 29 | * Per T332457, most of the code in Wikimedia\Parsoid\ParserTests is |
| 30 | * "for use in parser test runners only", including the core parser |
| 31 | * test runner, but this file is "more internal" than that: core's |
| 32 | * parser test runner should not use these helpers directly. |
| 33 | * |
| 34 | * @internal |
| 35 | */ |
| 36 | class TestUtils { |
| 37 | /** @var mixed */ |
| 38 | private static $consoleColor; |
| 39 | /** @var 'auto'|bool Color mode. */ |
| 40 | public static bool|string $colorMode = 'auto'; |
| 41 | |
/**
 * Placeholder for XML entity encoding.
 *
 * No replacement for the former `entities::encodeXML` call has been
 * chosen yet (see the PORT-FIXME below), so the input is handed back
 * exactly as received.
 *
 * @param string $str Text that should eventually be entity-encoded.
 * @return string Currently the unmodified input.
 */
public static function encodeXml( string $str ): string {
	// PORT-FIXME: Find replacement
	// return entities::encodeXML( $str );
	$encoded = $str;
	return $encoded;
}
| 53 | |
/**
 * Replace concrete transclusion about-ids with a stable placeholder.
 *
 * Every `about="…"` / `about='…'` attribute (the opening quote may be
 * preceded by a backslash) whose value matches the
 * CounterType::TRANSCLUSION_ABOUT pattern has that match replaced by the
 * enum case's backing value.
 *
 * @param string $str Serialized markup to normalize.
 * @return string
 */
public static function normalizeAbout( string $str ): string {
	$aboutCounter = CounterType::TRANSCLUSION_ABOUT;
	// Group 1 keeps the attribute name and its opening quote intact.
	$pattern = "/(about=\\\\?[\"'])" . $aboutCounter->getRE() . "/";
	$replacement = '$1' . $aboutCounter->value;
	return preg_replace( $pattern, $replacement, $str );
}
| 66 | |
/**
 * Specialized normalization of the PHP parser & Parsoid output, to ignore
 * a few known-ok differences in parser test runs.
 *
 * This code is also used by the Parsoid round-trip testing code.
 *
 * If parsoidOnly is true-ish, we allow more markup through (like property
 * and typeof attributes), for better checking of parsoid-only test cases.
 *
 * @param Element|string $domBody Body element, or an HTML string which is
 *   parsed into a body element before normalization.
 * @param array $options
 *  - parsoidOnly (bool) Is this test Parsoid Only? Optional. Default: false
 *  - preserveIEW (bool) Should inter-element WS be preserved? Optional. Default: false
 *  - hackyNormalize (bool) Apply the normalizer to the html. Optional. Default: false
 * @return string Normalized inner XML of the body.
 * @throws Error If the serialized output fails the sanity-check pattern.
 */
public static function normalizeOut( $domBody, array $options = [] ): string {
	$parsoidOnly = !empty( $options['parsoidOnly'] );
	$preserveIEW = !empty( $options['preserveIEW'] );

	if ( !empty( $options['hackyNormalize'] ) ) {
		// Mock env obj
		//
		// FIXME: This is ugly.
		// (a) The normalizer shouldn't need the full env.
		//     Pass options and a logger instead?
		// (b) DOM diff code is using page-id for some reason.
		//     That feels like a carryover of 2013 era code.
		//     If possible, get rid of it and diff-mark dependency
		//     on the env object.
		$mockEnv = new MockEnv( [] );
		$mockSerializer = new WikitextSerializer( $mockEnv, [] );
		$mockState = new SerializerState( $mockSerializer, [ 'selserMode' => false ] );
		if ( is_string( $domBody ) ) {
			// Careful about the lifetime of this document
			$doc = ContentUtils::createAndLoadDocument( $domBody );
			$domBody = DOMCompat::getBody( $doc );
		}
		( new DOMNormalizer( $mockState ) )->normalize( $domBody );
		DOMDataUtils::visitAndStoreDataAttribs( $domBody );
		DOMDataUtils::getBag( $domBody->ownerDocument )->loaded = false;
	} elseif ( is_string( $domBody ) ) {
		$domBody = DOMCompat::getBody( DOMUtils::parseHTML( $domBody ) );
	}

	// Parsoid-only tests keep most typed spans; otherwise a wider set of
	// mw:* wrapper spans is unwrapped.
	$stripTypeof = $parsoidOnly ?
		'/^mw:Placeholder$/' :
		'/^mw:(?:DisplaySpace|Placeholder|Nowiki|Transclusion|Entity)$/';
	$domBody = self::unwrapSpansAndNormalizeIEW( $domBody, $stripTypeof, $parsoidOnly, $preserveIEW );
	$out = ContentUtils::toXML( $domBody, [
		'innerXML' => true,
		# don't treat attribute order as significant
		'sortAttrs' => true,
	] );
	// NOTE that we use a slightly restricted regexp for "attribute"
	// which works for the output of DOM serialization. For example,
	// we know that attribute values will be surrounded with double quotes,
	// not unquoted or quoted with single quotes. The serialization
	// algorithm is given by:
	// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
	//
	// NOTE(review): the pattern is unanchored and every part of it is
	// optional, so it looks like this can only fail on a PCRE error (for
	// example invalid UTF-8 under /u) -- confirm whether anchoring was
	// intended before tightening it.
	if ( !preg_match( '#[^<]*(<\w+(\s+[^\0-\cZ\s"\'>/=]+(="[^"]*")?)*/?>[^<]*)*#u', $out ) ) {
		throw new Error( 'normalizeOut input is not in standard serialized form' );
	}

	// Eliminate a source of indeterminacy from leaked strip markers
	$out = preg_replace( '/UNIQ-.*?-QINU/u', '', $out );

	// FIXME(T415591): Normalize fragments markers leaking into src strings
	$out = preg_replace( '/fragment:\d+}}/u', 'fragment}}', $out );

	// Normalize COINS ids -- they aren't stable
	$out = preg_replace( '/\s?id=[\'"]coins_\d+[\'"]/iu', '', $out );

	// maplink extension
	$out = preg_replace( '/\s?data-overlays=\'[^\']*\'/u', '', $out );

	// unnecessary attributes, we don't need to check these.
	$unnecessaryAttribs = 'data-parsoid|prefix|about|rev|datatype|inlist|usemap|vocab';
	if ( $parsoidOnly ) {
		// Common regexp prefix: " <attr>=", the value delimiter varies below.
		$attribPrefixRE = "/ ($unnecessaryAttribs)=";
		// double-quoted variant (quotes optionally backslash-escaped)
		$out = preg_replace( $attribPrefixRE . '\\\\?"[^\"]*\\\\?"/u', '', $out );
		// single-quoted variant
		$out = preg_replace( $attribPrefixRE . "\\\\?'[^\']*\\\\?'/u", '', $out );
		// apos variant: the value is delimited by the literal &apos; entity
		$out = preg_replace( $attribPrefixRE . '&apos;.*?&apos;/u', '', $out );

		// strip self-closed <nowiki /> because we frequently test WTS
		// <nowiki> insertion by providing an html/parsoid section with the
		// <meta> tags stripped out, allowing the html2wt test to verify that
		// the <nowiki> is correctly added during WTS, while still allowing
		// the html2html and wt2html versions of the test to pass as a
		// validity check.  If <meta>s were not stripped, these tests would all
		// have to be modified and split up.  Not worth it at this time.
		// (see commit 689b22431ad690302420d049b10e689de6b7d426)
		$out = preg_replace( '#<span typeof="mw:Nowiki"></span>#', '', $out );

		return $out;
	}

	// strip meta/link elements
	$out = preg_replace(
		'#</?(?:meta|link)(?: [^\0-\cZ\s"\'>/=]+(?:=(?:"[^"]*"|\'[^\']*\'))?)*/?>#u',
		'', $out );
	// Ignore troublesome attributes.
	// In addition to attributes listed above, strip other Parsoid-inserted attributes
	// since these won't be present in legacy parser output.
	$attribTroubleRE = "/ ($unnecessaryAttribs|data-mw|resource|rel|property|class)=\\\\?";
	$out = preg_replace( $attribTroubleRE . '"[^"]*\\\\?"/u', '', $out );
	$out = preg_replace( $attribTroubleRE . "'[^']*\\\\?'/u", '', $out ); // single-quoted variant
	// strip typeof last
	$out = preg_replace( '/ typeof="[^\"]*"/u', '', $out );
	$out = self::stripParsoidIds( $out );
	$out = preg_replace( '/<span[^>]+about="[^"]*"[^>]*>/u', '', $out );
	$out = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $out );
	$out = preg_replace( '#<span>\s*</span>#u', '', $out );
	$out = preg_replace( '#(href=")(?:\.?\./)+#u', '$1', $out );
	// replace unnecessary URL escaping
	$out = preg_replace_callback( '/ href="[^"]*"/u', static function ( $m ) {
		return Utils::decodeURI( $m[0] );
	}, $out );
	// strip thumbnail size prefixes
	return preg_replace(
		'#(src="[^"]*?)/thumb(/[0-9a-f]/[0-9a-f]{2}/[^/]+)/[0-9]+px-[^"/]+(?=")#u', '$1$2',
		$out
	);
}
| 191 | |
/**
 * Remove Parsoid node-data `id` attributes from an HTML string.
 *
 * An attribute is removed when it has the form ` id="…"` with a value
 * matching the CounterType::NODE_DATA_ID pattern; either quote may be
 * preceded by any number of backslashes.
 *
 * @param string $s HTML string to clean.
 * @return string
 */
public static function stripParsoidIds( string $s ): string {
	$nodeIdRE = CounterType::NODE_DATA_ID->getRE();
	$pattern = '/ id=\\\\*"' . $nodeIdRE . '\\\\*"/u';
	return preg_replace( $pattern, '', $s );
}
| 204 | |
/**
 * Unwrap every direct <span> child of $node whose `typeof` attribute
 * matches the given regular expression.
 *
 * @param Node $node Parent whose immediate children are inspected.
 * @param ?string $stripSpanTypeof Regular expression tested against the
 *   span's `typeof` value; a null or empty value disables the pass.
 */
private static function cleanSpans(
	Node $node, ?string $stripSpanTypeof
): void {
	if ( !$stripSpanTypeof ) {
		return;
	}

	$current = $node->firstChild;
	while ( $current ) {
		// Capture the sibling up front: unwrapping removes $current from the tree.
		$following = $current->nextSibling;
		$isSpan = $current instanceof Element && DOMUtils::nodeName( $current ) === 'span';
		if ( $isSpan ) {
			$typeof = DOMCompat::getAttribute( $current, 'typeof' ) ?? '';
			if ( preg_match( $stripSpanTypeof, $typeof ) ) {
				self::unwrapSpan( $node, $current, $stripSpanTypeof );
			}
		}
		$current = $following;
	}
}
| 221 | |
/**
 * Replace a span by its own children.
 *
 * @param Node $parent Parent of the span being unwrapped.
 * @param Node $node The span to unwrap.
 * @param ?string $stripSpanTypeof Regular expression forwarded to
 *   cleanSpans() for the span's own children.
 */
private static function unwrapSpan(
	Node $parent, Node $node, ?string $stripSpanTypeof
): void {
	// Matching spans nested directly inside this one are unwrapped first.
	self::cleanSpans( $node, $stripSpanTypeof );
	// Move the children into the parent (using the span itself as the
	// reference node), then drop the span.
	DOMUtils::migrateChildren( $node, $parent, $node );
	$parent->removeChild( $node );
}
| 231 | |
/**
 * Whether a node should get a newline on both sides.
 *
 * @param ?Node $node
 * @return bool True when $node is non-null and its node name is one of the
 *   listed block-level names (body, caption, div, dd, dt, li, p, table,
 *   tr, td, th, tbody, dl, ol, ul, h1-h6).
 */
private static function newlineAround( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$blockNameRE = '/^(body|caption|div|dd|dt|li|p|table|tr|td|th|tbody|dl|ol|ul|h[1-6])$/D';
	return (bool)preg_match( $blockNameRE, DOMUtils::nodeName( $node ) );
}
| 238 | |
/**
 * Whether a node should get a newline before it only.
 *
 * No node currently qualifies, so this always answers false.
 *
 * @param ?Node $node Unused.
 * @return bool Always false.
 */
private static function newlineBefore( ?Node $node ): bool {
	$needsNewline = false;
	return $needsNewline;
}
| 242 | |
/**
 * Whether a node should get a newline after it only.
 *
 * @param ?Node $node
 * @return bool True when $node is non-null and is a <br>.
 */
private static function newlineAfter( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	return DOMUtils::nodeName( $node ) === 'br';
}
| 246 | |
/**
 * Recursive worker for unwrapSpansAndNormalizeIEW(): normalizes whitespace
 * in text nodes, unwraps selected spans, drops comments (unless
 * parsoidOnly), and finally re-inserts newlines around block-level
 * children. The subtree is modified in place.
 *
 * @param Node $node Subtree root to process.
 * @param array $opts Traversal state with the keys
 *  - preserveIEW (bool) leave whitespace untouched
 *  - parsoidOnly (bool) keep comment nodes
 *  - stripSpanTypeof (?string) regexp handed to cleanSpans()
 *  - stripLeadingWS (bool) / stripTrailingWS (bool) trim this text node
 *  - inPRE (bool) currently inside a <pre>
 * @return Node The same $node.
 */
private static function normalizeIEWVisitor(
	Node $node, array $opts
): Node {
	$nodeName = DOMUtils::nodeName( $node );
	$isPre = $nodeName === 'pre';
	if ( $isPre ) {
		// Preserve newlines in <pre> tags
		$opts['inPRE'] = true;
	}

	if ( $node instanceof Text && !$opts['preserveIEW'] ) {
		if ( !$opts['inPRE'] ) {
			// Collapse every whitespace run to a single space.
			$node->data = preg_replace( '/\s+/u', ' ', $node->data );
		}
		if ( $opts['stripLeadingWS'] ) {
			$node->data = preg_replace( '/^\s+/u', '', $node->data, 1 );
		}
		if ( $opts['stripTrailingWS'] ) {
			$node->data = preg_replace( '/\s+$/uD', '', $node->data, 1 );
		}
	}

	// unwrap certain SPAN nodes
	self::cleanSpans( $node, $opts['stripSpanTypeof'] );

	// now remove comment nodes
	if ( !$opts['parsoidOnly'] ) {
		$kid = $node->firstChild;
		while ( $kid ) {
			$after = $kid->nextSibling;
			if ( $kid instanceof Comment ) {
				$node->removeChild( $kid );
			}
			$kid = $after;
		}
	}

	// reassemble text nodes split by a comment or span, if necessary
	if ( $node instanceof Element ) {
		DOMCompat::normalize( $node );
	}

	// Decide which strip flags the children inherit.
	if ( $isPre ) {
		// hack, since PHP adds a newline before </pre>
		$opts['stripLeadingWS'] = false;
		$opts['stripTrailingWS'] = true;
	} elseif ( !( $nodeName === 'span' && DOMUtils::matchTypeOf( $node, '/^mw:/' ) ) ) {
		// An mw:* SPAN is transparent and passes its own strip parameters
		// down to its kids; every other node derives them from its name.
		$trimInside = self::newlineAround( $node );
		$opts['stripLeadingWS'] = $trimInside;
		$opts['stripTrailingWS'] = $trimInside;
	}

	// now recurse.
	$kid = $node->firstChild;
	// Skip over the empty mw:FallbackId <span> and strip leading WS
	// on the other side of it.
	if ( $kid && DOMUtils::isHeading( $node ) && WTUtils::isFallbackIdSpan( $kid ) ) {
		$kid = $kid->nextSibling;
	}
	while ( $kid ) {
		$after = $kid->nextSibling;
		$kidOpts = $opts;
		// Only the last child may have its trailing whitespace stripped.
		$kidOpts['stripTrailingWS'] = $opts['stripTrailingWS'] && !$after;
		self::normalizeIEWVisitor( $kid, $kidOpts );
		// Only the first visited child may have its leading whitespace stripped.
		$opts['stripLeadingWS'] = false;
		$kid = $after;
	}

	if ( $opts['inPRE'] || $opts['preserveIEW'] ) {
		return $node;
	}

	// now add newlines around appropriate nodes.
	$kid = $node->firstChild;
	while ( $kid ) {
		$wantBoth = self::newlineAround( $kid );
		$wantBefore = $wantBoth || self::newlineBefore( $kid );
		$wantAfter = $wantBoth || self::newlineAfter( $kid );

		$before = $kid->previousSibling;
		if ( $wantBefore ) {
			if ( $before instanceof Text ) {
				// Swap the trailing whitespace (possibly none) for one newline.
				$before->data = preg_replace( '/\s*$/uD', "\n", $before->data, 1 );
			} else {
				$before = $node->ownerDocument->createTextNode( "\n" );
				$node->insertBefore( $before, $kid );
			}
		}

		$after = $kid->nextSibling;
		if ( $wantAfter ) {
			if ( $after instanceof Text ) {
				// Swap the leading whitespace (possibly none) for one newline.
				$after->data = preg_replace( '/^\s*/u', "\n", $after->data, 1 );
			} else {
				$after = $node->ownerDocument->createTextNode( "\n" );
				$node->insertBefore( $after, $kid->nextSibling );
			}
		}
		// Continue from the following sibling, which may be the text node
		// that was just inserted.
		$kid = $after;
	}
	return $node;
}
| 337 | |
/**
 * Unwrap selected spans and normalize inter-element whitespace on a deep
 * copy of the given body; the element passed in is left untouched.
 *
 * @param Element $body The document body node to normalize.
 * @param ?string $stripSpanTypeof Regular expression matched against the
 *   `typeof` of <span> elements to decide which ones get unwrapped; null
 *   disables span unwrapping.
 * @param bool $parsoidOnly When false, comment nodes are removed.
 * @param bool $preserveIEW When true, whitespace is left as-is.
 * @return Element The normalized clone.
 */
public static function unwrapSpansAndNormalizeIEW(
	Element $body, ?string $stripSpanTypeof = null, bool $parsoidOnly = false, bool $preserveIEW = false
): Element {
	// clone body first, since we're going to destructively mutate it.
	$workingCopy = $body->cloneNode( true );
	$visitorOpts = [
		'inPRE' => false,
		'stripLeadingWS' => true,
		'stripTrailingWS' => true,
		'stripSpanTypeof' => $stripSpanTypeof,
		'parsoidOnly' => $parsoidOnly,
		'preserveIEW' => $preserveIEW,
	];
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return self::normalizeIEWVisitor( $workingCopy, $visitorOpts );
}
| 362 | |
/**
 * Strip some php output we aren't generating: section-edit links,
 * `.mw-heading` wrappers (their contents are kept) and the `#toc` element.
 * The body is modified in place.
 *
 * @param Element $body
 */
public static function normalizePhpOutput( Element $body ): void {
	// Do not expect section editing for now
	foreach ( DOMCompat::querySelectorAll( $body, '.mw-editsection' ) as $editSection ) {
		DOMCompat::remove( $editSection );
	}

	// Parsoid adds heading wrappers in an OutputTransform stage
	foreach ( DOMCompat::querySelectorAll( $body, '.mw-heading' ) as $wrapper ) {
		$kids = DOMUtils::childNodes( $wrapper );
		foreach ( $kids as $idx => $kid ) {
			// Remove the nls
			if ( $kid instanceof Text ) {
				if ( $idx === 0 ) {
					$kid->data = preg_replace( "/^\n/uD", '', $kid->data );
				}
				if ( $idx === ( count( $kids ) - 1 ) ) {
					$kid->data = preg_replace( "/\n$/uD", '', $kid->data );
				}
			}
			// Hoist the child out so it sits right before the wrapper.
			$wrapper->parentNode->insertBefore( $kid, $wrapper );
		}
		DOMCompat::remove( $wrapper );
	}

	// Do not expect a toc for now
	$toc = DOMCompat::querySelector( $body, '#toc' );
	if ( !$toc ) {
		return;
	}
	DOMCompat::remove( $toc );
}
| 392 | |
/**
 * Normalize the expected parser output by parsing it using a HTML5 parser and
 * re-serializing it to HTML. Ideally, the parser would normalize inter-tag
 * whitespace for us. For now, we fake that by simply stripping all newlines.
 *
 * On top of the DOM-level normalization, a series of textual substitutions
 * removes markup that is not compared (empty spans, class/rel/about/typeof
 * attributes, red-link decoration, /wiki/ prefixes, URL escaping).
 *
 * @param string $source Expected HTML.
 * @return string Normalized HTML; if an Exception is raised the failure is
 *   logged and $source is returned unchanged.
 */
public static function normalizeHTML( string $source ): string {
	try {
		$body = self::unwrapSpansAndNormalizeIEW( DOMCompat::getBody( DOMUtils::parseHTML( $source ) ) );
		self::normalizePhpOutput( $body );
		$html = ContentUtils::toXML( $body, [
			'innerXML' => true,
			# don't treat attribute order as significant
			'sortAttrs' => true,
		] );

		// a few things we ignore for now..
		// .replace(/\/wiki\/Main_Page/g, 'Main Page')

		// remove empty span tags
		$html = preg_replace( '/(\s)<span>\s*<\/span>\s*/u', '$1', $html );
		$html = preg_replace( '/<span>\s*<\/span>/u', '', $html );
		// general class and titles, typically on links
		$html = preg_replace( '/ (class|rel|about|typeof)="[^"]*"/', '', $html );
		// strip red link markup, we do not check if a page exists yet.
		// $html is serialized markup, so the query-string separators show
		// up as "&amp;"; the bare "&" form is still accepted as well.
		$html = preg_replace(
			"#/index.php\\?title=([^']+?)&(?:amp;)?action=edit&(?:amp;)?redlink=1#", '/wiki/$1', $html );
		// strip red link title info
		$html = preg_replace(
			"/ \\((?:page does not exist|encara no existeix|bet ele jaratılmaǵan|lonkásá ezalí tɛ̂)\\)/",
			'', $html );
		// the expected html has some extra space in tags, strip it
		$html = preg_replace( '/<a +href/', '<a href', $html );
		$html = preg_replace( '#href="/wiki/#', 'href="', $html );
		$html = preg_replace( '/" +>/', '">', $html );
		// parsoid always add a page name to lonely fragments
		$html = preg_replace( '/href="#/', 'href="Main Page#', $html );
		// replace unnecessary URL escaping
		$html = preg_replace_callback( '/ href="[^"]*"/',
			static function ( $m ) {
				return Utils::decodeURI( $m[0] );
			},
			$html );
		// strip empty spans again: removing attributes above may have
		// produced new bare <span></span> pairs
		$html = preg_replace( '#(\s)<span>\s*</span>\s*#u', '$1', $html );
		return preg_replace( '#<span>\s*</span>#u', '', $html );
	} catch ( Exception $e ) {
		error_log( 'normalizeHTML failed on ' . $source . ' with the following error: ' . $e );
		return $source;
	}
}
| 446 | |
/**
 * Wrap a string in console color codes when coloring is available and enabled.
 *
 * Coloring relies on the optional php-console-color library. Without it,
 * or when coloring is switched off, the string is returned unchanged.
 * With self::$colorMode set to a bool that value decides; otherwise
 * ('auto') the library's isSupported() answer is used.
 *
 * @param string $string Text to colorize.
 * @param string $color Style name handed to the console color library.
 * @param bool $inverse Additionally apply the 'reverse' style.
 * @return string
 * @suppress PhanUndeclaredClassMethod
 * @suppress UnusedSuppression
 */
public static function colorString(
	string $string, string $color, bool $inverse = false
): string {
	$style = $inverse ? [ $color, 'reverse' ] : $color;

	if ( !self::$consoleColor ) {
		// Attempt to instantiate this class to determine if the
		// (optional) php-console-color library is installed.
		try {
			self::$consoleColor = new \PHP_Parallel_Lint\PhpConsoleColor\ConsoleColor();
		} catch ( Error ) {
			/* fall back to no-color mode */
		}
	}

	$console = self::$consoleColor;
	if ( !$console ) {
		return $string;
	}

	// 'auto' color mode: use color if a tty.
	$useColor = is_bool( self::$colorMode ) ? self::$colorMode : $console->isSupported();
	if ( !$useColor ) {
		return $string;
	}

	$console->setForceStyle( true );
	return $console->apply( $style, $string );
}
| 482 | |
/**
 * Removes DSR from data-parsoid for test normalization of a complete
 * document: the document is loaded from $raw, every element reachable from
 * its top-level element children is run through filterNodeDsr(), and the
 * body's inner XML is returned pretty-printed.
 *
 * @param string $raw Serialized document.
 * @param bool $removeDataParsoid Mark data-parsoid as discardable on every
 *   element, not only where it ends up empty.
 * @return string
 */
public static function filterDsr( string $raw, bool $removeDataParsoid = false ): string {
	$doc = ContentUtils::createAndLoadDocument( $raw );
	foreach ( DOMUtils::childNodes( $doc ) as $topLevel ) {
		if ( !( $topLevel instanceof Element ) ) {
			continue;
		}
		self::filterNodeDsr( $topLevel, $removeDataParsoid );
	}
	$body = DOMCompat::getBody( $doc );
	return ContentUtils::ppToXML( $body, [ 'innerXML' => true ] );
}
| 497 | |
/**
 * Removes DSR from data-parsoid for test normalization of an element and,
 * recursively, of all its element descendants.
 *
 * @param Element $el Root of the subtree to process.
 * @param bool $removeDataParsoid When true the data-parsoid of every
 *   element is flagged TempData::DISCARDABLE_DP; otherwise only
 *   data-parsoid that is empty once dsr is gone gets the flag.
 */
public static function filterNodeDsr( Element $el, bool $removeDataParsoid = false ): void {
	$dataParsoid = DOMDataUtils::getDataParsoid( $el );
	unset( $dataParsoid->dsr );

	$discard = $dataParsoid->isEmpty() || $removeDataParsoid;
	if ( $discard ) {
		$dataParsoid->getTemp()->setFlag( TempData::DISCARDABLE_DP );
	}

	foreach ( DOMUtils::childNodes( $el ) as $descendant ) {
		if ( !( $descendant instanceof Element ) ) {
			continue;
		}
		self::filterNodeDsr( $descendant, $removeDataParsoid );
	}
}
| 514 | } |