Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
31.67% |
70 / 221 |
|
43.08% |
28 / 65 |
CRAP | |
0.00% |
0 / 1 |
| DOMUtils | |
31.67% |
70 / 221 |
|
43.08% |
28 / 65 |
7619.84 | |
0.00% |
0 / 1 |
| parseHTML | |
84.62% |
11 / 13 |
|
0.00% |
0 / 1 |
5.09 | |||
| visitDOM | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
| migrateChildren | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| childNodes | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| migrateChildrenBetweenDocs | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| assertElt | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| isRemexBlockNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
| isWikitextBlockNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isFormattingElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isQuoteElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isBody | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isRemoved | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| pathToRoot | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| nodeDepth | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| pathToSibling | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
| inSiblingOrder | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
| isAncestorOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
| findAncestorOfName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
| hasNameOrHasAncestorOfName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| matchNameAndTypeOf | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| hasNameAndTypeOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| matchTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| matchRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| matchMultivalAttr | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
| hasTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasClass | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| hasValueInMultivalAttr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
| addTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| addRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| addValueToMultivalAttr | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
42 | |||
| removeValueFromMultivalAttr | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
| removeTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| removeRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| isFosterablePosition | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| isHeading | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isList | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isListOrListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isNestedInListItem | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| isNestedListOrListItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| isMarkerMeta | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| hasElementChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
| hasBlockElementDescendant | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
5 | |||
| isIEW | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isDocumentFragment | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| atTheTop | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| allChildrenAreTextOrComments | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
| treeHasElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
| isTableTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| selectMediaElt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| findHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| addHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
| extractInlinedContentVersion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| addAttributes | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
| appendToHead | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
| getFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| setFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| parseHTMLToFragment | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| isRawTextElement | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasBlockTag | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
| isMetaDataTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| stripPWrapper | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| nodeName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| isNewlineWrappingSpan | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
4 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Utils; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\Parsoid\Core\ClientError; |
| 8 | use Wikimedia\Parsoid\Core\DOMCompat; |
| 9 | use Wikimedia\Parsoid\DOM\Comment; |
| 10 | use Wikimedia\Parsoid\DOM\Document; |
| 11 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 12 | use Wikimedia\Parsoid\DOM\DOMParser; |
| 13 | use Wikimedia\Parsoid\DOM\Element; |
| 14 | use Wikimedia\Parsoid\DOM\Node; |
| 15 | use Wikimedia\Parsoid\DOM\Text; |
| 16 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 17 | use Wikimedia\Parsoid\Wt2Html\TreeBuilder\ParsoidDOMBuilder; |
| 18 | use Wikimedia\Parsoid\Wt2Html\XHtmlSerializer; |
| 19 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer; |
| 20 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; |
| 21 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
| 22 | |
| 23 | /** |
| 24 | * DOM utilities for querying the DOM. This is largely independent of Parsoid |
| 25 | * although some Parsoid details (TokenUtils, inline content version) |
| 26 | * have snuck in. |
| 27 | */ |
| 28 | class DOMUtils { |
| 29 | |
/**
 * Parse an HTML string and return the resulting document.
 *
 * Input that does not start with a doctype, <html> or <body> tag is
 * treated as a body fragment and gets '<body>' prepended before parsing.
 *
 * @note The resulting document is not "prepared and loaded"; use
 *  ContentUtils::prepareAndLoadDocument() instead if that's what
 *  you need.
 * @note $validateXMLNames is only consulted on the Remex code path; the
 *  Dodo path returns before the check is reached.
 *
 * @param string $html HTML source to parse
 * @param bool $validateXMLNames If true, throw when the DOM builder
 *  reports that it had to coerce a name
 * @return Document
 * @throws ClientError if $validateXMLNames is set and a name was coerced
 */
public static function parseHTML(
	string $html, bool $validateXMLNames = false
): Document {
	$startsWithDocumentTag = preg_match( '/^<(?:!doctype|html|body)/i', $html );
	if ( !$startsWithDocumentTag ) {
		// Make sure that we parse fragments in the body. Otherwise comments,
		// link and meta tags end up outside the html element or in the head
		// elements.
		$html = '<body>' . $html;
	}

	if ( DOMCompat::isUsingDodo() ) {
		$parser = new DOMParser();
		return $parser->parseFromString( $html, 'text/html' );
	}

	// If DOMCompat::isUsing84Dom use Remex to parse.
	$remexOptions = [ 'ignoreErrors' => true ];
	$domBuilder = new ParsoidDOMBuilder; // our DOMBuilder, not remex's
	$tokenizer = new Tokenizer(
		new Dispatcher( new TreeBuilder( $domBuilder, $remexOptions ) ),
		$html,
		$remexOptions
	);
	$tokenizer->execute( [] );

	if ( $validateXMLNames && $domBuilder->isCoerced() ) {
		throw new ClientError( 'Encountered a name invalid in XML.' );
	}

	$frag = $domBuilder->getFragment();
	'@phan-var Document $frag'; // @var Document $frag
	return $frag;
}
| 63 | |
/**
 * Pre-order walk of the subtree rooted at $node: $handler is invoked on
 * $node first and then, recursively, on each of its children.
 *
 * This is a simplified version of the DOMTraverser.
 * Consider using that before making this more complex.
 *
 * FIXME: Move to DOMTraverser OR create a new class?
 * @param Node $node Root of the subtree to visit
 * @param callable $handler Called as $handler( $node, ...$args )
 * @param mixed ...$args Extra arguments forwarded to every handler call
 */
public static function visitDOM( Node $node, callable $handler, ...$args ): void {
	$handler( $node, ...$args );
	for ( $child = $node->firstChild; $child; $child = $following ) {
		// Read the next sibling before recursing so the walk continues
		// from the right place even if the handler moves or removes $child.
		$following = $child->nextSibling;
		self::visitDOM( $child, $handler, ...$args );
	}
}
| 82 | |
/**
 * Move every child of $from into $to, inserting each one before
 * $beforeNode. When $beforeNode is null the children are appended
 * at the end of $to.
 *
 * @param Node $from Source node. Children will be removed.
 * @param Node $to Destination node. Children of $from will be added here
 * @param ?Node $beforeNode Add the children before this node.
 */
public static function migrateChildren(
	Node $from, Node $to, ?Node $beforeNode = null
): void {
	$child = $from->firstChild;
	while ( $child ) {
		// Inserting the node elsewhere detaches it from $from, so the
		// next iteration sees a new first child.
		$to->insertBefore( $child, $beforeNode );
		$child = $from->firstChild;
	}
}
| 97 | |
/**
 * Collect the children of $n into a plain PHP array.
 *
 * Many DOM implementations will de-optimize the representation of a
 * Node if `$node->childNodes` is accessed, converting the linked list
 * of node children to an array which is then expensive to mutate.
 * This method walks `->firstChild`/`->nextSibling` instead, avoiding
 * that de-optimization. The returned array is a snapshot taken at call
 * time, so callers may mutate the tree while iterating over it.
 *
 * @param Node $n
 * @return list<Node> the child nodes
 */
public static function childNodes( Node $n ): array {
	$children = [];
	$cursor = $n->firstChild;
	while ( $cursor !== null ) {
		$children[] = $cursor;
		$cursor = $cursor->nextSibling;
	}
	return $children;
}
| 117 | |
/**
 * Add the children of $from to $to, before $beforeNode, where $from and
 * $to may belong to different documents.
 *
 * If both nodes share an owner document this simply delegates to
 * migrateChildren() (the children are moved). Otherwise each child is
 * imported into the destination document with importNode( ..., true )
 * and the imported node is inserted; the walk over $from continues via
 * the original child's nextSibling.
 *
 * If $beforeNode is null, the nodes are appended at the end.
 * @param Node $from
 * @param Node $to
 * @param ?Node $beforeNode
 */
public static function migrateChildrenBetweenDocs(
	Node $from, Node $to, ?Node $beforeNode = null
): void {
	$destDoc = $to->ownerDocument;
	if ( $from->ownerDocument === $destDoc ) {
		self::migrateChildren( $from, $to, $beforeNode );
		return;
	}

	for ( $child = $from->firstChild; $child; $child = $child->nextSibling ) {
		$imported = $destDoc->importNode( $child, true );
		$to->insertBefore( $imported, $beforeNode );
	}
}
| 141 | |
/**
 * Assert that $node is a DOM Element (and therefore not null).
 * This is primarily to help phan analyze variable types.
 *
 * @phan-assert Element $node
 *
 * @param ?Node $node
 * @return true Always returns true
 */
public static function assertElt( ?Node $node ): bool {
	$isElement = $node instanceof Element;
	Assert::invariant( $isElement, "Expected an element" );
	return true;
}
| 155 | |
/**
 * Determine whether $node is an Element whose name is not listed in
 * Consts::$HTML['OnlyInlineElements'] and which isMetaDataTag() does
 * not report as a metadata tag.
 */
public static function isRemexBlockNode( ?Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	if ( isset( Consts::$HTML['OnlyInlineElements'][self::nodeName( $node )] ) ) {
		return false;
	}
	// This is a superset of \\MediaWiki\Tidy\RemexCompatMunger::$metadataElements
	return !self::isMetaDataTag( $node );
}
| 162 | |
/**
 * Determine whether $node is non-null and its node name is accepted by
 * TokenUtils::isWikitextBlockTag().
 */
public static function isWikitextBlockNode( ?Node $node ): bool {
	if ( $node === null ) {
		return false;
	}
	return (bool)TokenUtils::isWikitextBlockTag( self::nodeName( $node ) );
}
| 166 | |
/**
 * Determine whether this is a formatting DOM element, i.e. whether its
 * node name is a key of Consts::$HTML['FormattingTags'].
 * Returns false for null.
 */
public static function isFormattingElt( ?Node $node ): bool {
	if ( $node === null ) {
		return false;
	}
	$name = self::nodeName( $node );
	return isset( Consts::$HTML['FormattingTags'][$name] );
}
| 173 | |
/**
 * Determine whether this is a quote DOM element, i.e. whether its node
 * name is a key of Consts::$WTQuoteTags. Returns false for null.
 */
public static function isQuoteElt( ?Node $node ): bool {
	if ( $node === null ) {
		return false;
	}
	$name = self::nodeName( $node );
	return isset( Consts::$WTQuoteTags[$name] );
}
| 180 | |
/**
 * Determine whether this is the <body> DOM element, i.e. a non-null
 * node whose node name is 'body'.
 */
public static function isBody( ?Node $node ): bool {
	if ( $node === null ) {
		return false;
	}
	return self::nodeName( $node ) === 'body';
}
| 187 | |
/**
 * Determine whether $node should be treated as removed: either it is
 * null, or it is still a Node object but its `nodeType` property is
 * not set.
 */
public static function isRemoved( ?Node $node ): bool {
	$hasNodeType = $node !== null && isset( $node->nodeType );
	return !$hasNodeType;
}
| 194 | |
/**
 * Build the path from a node up to the topmost ancestor reachable
 * through parentNode.
 *
 * @param Node $node
 * @return Node[] $node itself first, followed by each successive
 *  ancestor; the last entry is the node that has no parent
 */
public static function pathToRoot( Node $node ): array {
	$path = [ $node ];
	for ( $ancestor = $node->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		$path[] = $ancestor;
	}
	return $path;
}
| 209 | |
/**
 * Count the parentNode links between $node and the node at the top of
 * its tree.
 * Root document is at depth 0, <html> at 1, <body> at 2.
 */
public static function nodeDepth( Node $node ): int {
	$depth = 0;
	for ( $ancestor = $node->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		$depth++;
	}
	return $depth;
}
| 222 | |
/**
 * Collect the siblings from $node up to, but not including, $sibling.
 *
 * If $sibling is never reached in the chosen direction, the result
 * contains every sibling from $node to the end of the list on that side.
 *
 * @param Node $node Starting node (included in the result unless it is $sibling)
 * @param Node $sibling Node at which to stop (never included)
 * @param bool $left indicates whether to go backwards, use previousSibling instead of nextSibling.
 * @return Node[]
 */
public static function pathToSibling( Node $node, Node $sibling, bool $left ): array {
	$visited = [];
	$current = $node;
	while ( $current && $current !== $sibling ) {
		$visited[] = $current;
		if ( $left ) {
			$current = $current->previousSibling;
		} else {
			$current = $current->nextSibling;
		}
	}
	return $visited;
}
| 240 | |
/**
 * Check whether node `n1` comes before node `n2` in their parent's
 * children list, by walking nextSibling from `n1` until `n2` is found.
 * Also returns true when both arguments are the same node.
 *
 * @param Node $n1 The node you expect to come first.
 * @param Node $n2 Expected later sibling.
 * @return bool
 */
public static function inSiblingOrder( Node $n1, Node $n2 ): bool {
	for ( $current = $n1; $current !== null; $current = $current->nextSibling ) {
		if ( $current === $n2 ) {
			return true;
		}
	}
	return false;
}
| 255 | |
/**
 * Check that node 'n1' is an ancestor of node 'n2' in the DOM, by
 * walking parentNode upwards from 'n2'. Returns true if n1 === n2.
 *
 * @param Node $n1 the suspected ancestor.
 * @param Node $n2 the suspected descendant.
 * @return bool
 */
public static function isAncestorOf( Node $n1, Node $n2 ): bool {
	for ( $current = $n2; $current !== null; $current = $current->parentNode ) {
		if ( $current === $n1 ) {
			return true;
		}
	}
	return false;
}
| 270 | |
/**
 * Find the closest proper ancestor of $node whose node name is $name.
 * $node itself is never considered.
 *
 * @param Node $node
 * @param string $name Node name to compare against self::nodeName()
 * @return ?Element The matching ancestor, or null if there is none
 */
public static function findAncestorOfName( Node $node, string $name ): ?Element {
	for ( $node = $node->parentNode; $node; $node = $node->parentNode ) {
		if ( self::nodeName( $node ) === $name ) {
			'@phan-var Element $node'; // @var Element $node
			return $node;
		}
	}
	return null;
}
| 282 | |
/**
 * Check whether $node is named $name, or failing that, whether
 * findAncestorOfName() finds an ancestor with that name.
 */
public static function hasNameOrHasAncestorOfName( Node $node, string $name ): bool {
	if ( self::nodeName( $node ) === $name ) {
		return true;
	}
	return self::findAncestorOfName( $node, $name ) !== null;
}
| 289 | |
/**
 * Match a node against both a node name and a `typeof` pattern.
 * The `typeof` attribute is only examined when the node name equals
 * $name.
 *
 * @param Node $n The node to test
 * @param string $name The expected nodeName of $n
 * @param string $typeRe Regular expression matching the expected value of
 *   `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchNameAndTypeOf( Node $n, string $name, string $typeRe ): ?string {
	if ( self::nodeName( $n ) !== $name ) {
		return null;
	}
	return self::matchTypeOf( $n, $typeRe );
}
| 304 | |
/**
 * Determine whether the node has the given nodeName and carries the
 * given `typeof` value. The type is a literal string: it is quoted and
 * anchored before being handed to matchNameAndTypeOf().
 *
 * @param Node $n
 * @param string $name node name to test for
 * @param string $type Expected value of "typeof" attribute (literal string)
 * @return bool True if the node matches.
 */
public static function hasNameAndTypeOf( Node $n, string $name, string $type ): bool {
	$literalTypeRe = '/^' . preg_quote( $type, '/' ) . '$/';
	$matched = self::matchNameAndTypeOf( $n, $name, $literalTypeRe );
	return $matched !== null;
}
| 319 | |
/**
 * Match the node's `typeof` attribute against a regular expression.
 * Thin wrapper around matchMultivalAttr() for the 'typeof' attribute.
 *
 * @param Node $n The node to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchTypeOf( Node $n, string $typeRe ): ?string {
	$matchedType = self::matchMultivalAttr( $n, 'typeof', $typeRe );
	return $matchedType;
}
| 332 | |
/**
 * Match the node's `rel` attribute against a regular expression.
 * Thin wrapper around matchMultivalAttr() for the 'rel' attribute.
 *
 * @param Node $n The node to test
 * @param string $relRe Regular expression matching the expected value of
 *   the `rel` attribute.
 * @return ?string The matching `rel` value, or `null` if there is
 *   no match.
 */
public static function matchRel( Node $n, string $relRe ): ?string {
	$matchedRel = self::matchMultivalAttr( $n, 'rel', $relRe );
	return $matchedRel;
}
| 345 | |
/**
 * Find the first value of a space-separated attribute that matches a
 * regular expression.
 *
 * The attribute is split on single spaces; empty pieces (produced by
 * consecutive spaces) are skipped. Non-Element nodes and missing or
 * empty attributes never match.
 *
 * @param Node $n The node to test
 * @param string $attrName the attribute to test (typically 'rel' or 'typeof')
 * @param string $valueRe Regular expression matching the expected value of
 *   the attribute.
 * @return ?string The matching attribute value, or `null` if there is
 *   no match.
 */
private static function matchMultivalAttr( Node $n, string $attrName, string $valueRe ): ?string {
	$raw = $n instanceof Element ? DOMCompat::getAttribute( $n, $attrName ) : null;
	if ( $raw === null || $raw === '' ) {
		return null;
	}

	foreach ( explode( ' ', $raw ) as $token ) {
		if ( $token !== '' ) {
			$matched = preg_match( $valueRe, $token );
			// preg_match() returns false only when the pattern itself is broken.
			Assert::invariant( $matched !== false, "Bad regexp" );
			if ( $matched === 1 ) {
				return $token;
			}
		}
	}
	return null;
}
| 376 | |
/**
 * Determine whether the node's `typeof` attribute contains the given
 * value. Thin wrapper around hasValueInMultivalAttr().
 *
 * @param Node $n
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string.
 * @return bool True if the node matches.
 */
public static function hasTypeOf( Node $n, string $type ): bool {
	$found = self::hasValueInMultivalAttr( $n, 'typeof', $type );
	return $found;
}
| 388 | |
/**
 * Determine whether the node's `rel` attribute contains the given
 * value. Thin wrapper around hasValueInMultivalAttr().
 *
 * @param Node $n
 * @param string $rel Expected value of "rel" attribute, as a literal string.
 * @return bool True if the node matches.
 */
public static function hasRel( Node $n, string $rel ): bool {
	$found = self::hasValueInMultivalAttr( $n, 'rel', $rel );
	return $found;
}
| 399 | |
/**
 * Test whether one of the whitespace-separated tokens of the element's
 * `class` attribute matches $regex in its entirety.
 *
 * $regex is embedded in a larger pattern that anchors the match to
 * token boundaries (start/end of the attribute or whitespace). It is
 * wrapped in a non-capturing group so that a top-level alternation
 * such as "foo|bar" is anchored on both sides for every alternative;
 * without the group the pattern would read "token starts with foo" OR
 * "token ends with bar" and would wrongly accept classes like "foobaz".
 *
 * The pattern uses `{`...`}` as delimiters and $regex is not escaped.
 *
 * @param Element $element
 * @param string $regex Partial regular expression, e.g. "foo|bar"
 * @return bool True if some class token matches; false otherwise,
 *   including when the element has no `class` attribute
 */
public static function hasClass( Element $element, string $regex ): bool {
	$classAttr = DOMCompat::getAttribute( $element, 'class' ) ?? '';
	$pattern = '{(?<=^|\s)(?:' . $regex . ')(?=\s|$)}';
	return (bool)preg_match( $pattern, $classAttr );
}
| 409 | |
/**
 * Determine whether a space-separated attribute of the node contains
 * the given literal value.
 *
 * Non-Element nodes and missing or empty attributes never match. An
 * attribute that is exactly equal to $value is accepted without
 * splitting; otherwise the attribute is split on single spaces and
 * searched with a strict comparison.
 *
 * @param Node $n
 * @param string $attrName name of the attribute to check (typically 'typeof', 'rel')
 * @param string $value Expected value of the $attrName attribute, as a literal string.
 * @return bool True if the node matches
 */
private static function hasValueInMultivalAttr( Node $n, string $attrName, string $value ): bool {
	if ( $n instanceof Element ) {
		$raw = DOMCompat::getAttribute( $n, $attrName );
		if ( $raw !== null && $raw !== '' ) {
			// Exact equality is the cheap common case; fall back to a
			// token search only when the attribute holds something else.
			return $raw === $value || in_array( $value, explode( ' ', $raw ), true );
		}
	}
	return false;
}
| 432 | |
/**
 * Add a type to the typeof attribute. This method should almost always
 * be used instead of `setAttribute`, to ensure we don't overwrite existing
 * typeof information. Delegates to addValueToMultivalAttr().
 *
 * @param Element $node node
 * @param string $type type
 * @param bool $prepend If true, adds value to start, rather than end.
 *    Use of this option in new code is discouraged.
 */
public static function addTypeOf( Element $node, string $type, bool $prepend = false ): void {
	$attr = 'typeof';
	self::addValueToMultivalAttr( $node, $attr, $type, $prepend );
}
| 446 | |
/**
 * Add a value to the rel attribute. This method should almost always
 * be used instead of `setAttribute`, to ensure we don't overwrite existing
 * rel information. The value is always added at the end.
 *
 * @param Element $node
 * @param string $rel
 */
public static function addRel( Element $node, string $rel ): void {
	$attr = 'rel';
	self::addValueToMultivalAttr( $node, $attr, $rel, false );
}
| 455 | |
/**
 * Add a value to a multivalue attribute (typeof, rel). This method should
 * almost always be used instead of `setAttribute`, to ensure we don't
 * overwrite existing multivalue information.
 *
 * The new value is trimmed first; an empty value is ignored. If the
 * attribute already lists the value, nothing changes. Otherwise the
 * value is joined to the existing attribute text with a single space
 * (the existing text is used as stored, i.e. not trimmed).
 *
 * @param Element $node
 * @param string $attr
 * @param string $value
 * @param bool $prepend If true, adds value to start, rather than end
 */
private static function addValueToMultivalAttr(
	Element $node, string $attr, string $value, bool $prepend = false
): void {
	$value = trim( $value );
	if ( $value === '' ) {
		return;
	}

	$existing = DOMCompat::getAttribute( $node, $attr );
	$existingTrimmed = $existing === null ? '' : trim( $existing );
	if ( $existingTrimmed !== '' ) {
		$tokens = explode( ' ', $existingTrimmed );
		if ( in_array( $value, $tokens, true ) ) {
			// Already present: leave the attribute untouched.
			return;
		}
		if ( $prepend ) {
			$value = $value . ' ' . $existing;
		} else {
			$value = $existing . ' ' . $value;
		}
	}
	$node->setAttribute( $attr, $value );
}
| 483 | |
/**
 * Remove a value from a multiple-valued (space-separated) attribute.
 *
 * The attribute is split on single spaces. Empty pieces, which arise
 * from leading, trailing or repeated spaces, are discarded together
 * with every occurrence of $value, so that:
 *  - the attribute is removed (rather than left empty or blank) once
 *    no real value remains, and
 *  - the rewritten attribute never gains stray spaces.
 * Skipping empty pieces matches how matchMultivalAttr() reads these
 * attributes.
 *
 * Nothing happens when the attribute is absent or the empty string.
 *
 * @param Element $node node
 * @param string $attr The attribute name
 * @param string $value The value to remove (trimmed before comparison)
 */
private static function removeValueFromMultivalAttr(
	Element $node, string $attr, string $value
): void {
	$oldValue = DOMCompat::getAttribute( $node, $attr );
	if ( $oldValue === null || $oldValue === '' ) {
		return;
	}

	$value = trim( $value );
	$remaining = [];
	foreach ( explode( ' ', $oldValue ) as $token ) {
		if ( $token !== '' && $token !== $value ) {
			$remaining[] = $token;
		}
	}

	if ( count( $remaining ) > 0 ) {
		$node->setAttribute( $attr, implode( ' ', $remaining ) );
	} else {
		$node->removeAttribute( $attr );
	}
}
| 505 | |
/**
 * Remove a type from the typeof attribute.
 * Delegates to removeValueFromMultivalAttr().
 *
 * @param Element $node
 * @param string $type
 */
public static function removeTypeOf( Element $node, string $type ): void {
	$attr = 'typeof';
	self::removeValueFromMultivalAttr( $node, $attr, $type );
}
| 512 | |
/**
 * Remove a value from the rel attribute.
 * Delegates to removeValueFromMultivalAttr().
 *
 * @param Element $node
 * @param string $rel
 */
public static function removeRel( Element $node, string $rel ): void {
	$attr = 'rel';
	self::removeValueFromMultivalAttr( $node, $attr, $rel );
}
| 519 | |
/**
 * Check whether `node` is in a fosterable position, i.e. whether the
 * name of its parent is a key of Consts::$HTML['FosterablePosition'].
 *
 * Returns false for null and for a node without a parent: a node with
 * no parent has no position to speak of, and the parent must not be
 * handed to nodeName() as null.
 * NOTE(review): nodeName() is defined outside this view; every other
 * caller here guards against null before calling it, so it is assumed
 * to require a non-null Node -- confirm.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isFosterablePosition( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	$parent = $n->parentNode;
	if ( $parent === null ) {
		return false;
	}
	return isset( Consts::$HTML['FosterablePosition'][self::nodeName( $parent )] );
}
| 526 | |
/**
 * Check whether `node` is a heading, i.e. a non-null node whose name
 * is h1 through h6.
 */
public static function isHeading( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return preg_match( '/^h[1-6]$/D', self::nodeName( $n ) ) === 1;
}
| 533 | |
/**
 * Check whether `node` is a list, i.e. a non-null node whose name is a
 * key of Consts::$HTML['ListTags'].
 */
public static function isList( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	$name = self::nodeName( $n );
	return isset( Consts::$HTML['ListTags'][$name] );
}
| 540 | |
/**
 * Check whether `node` is a list item, i.e. a non-null node whose name
 * is a key of Consts::$HTML['ListItemTags'].
 */
public static function isListItem( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	$name = self::nodeName( $n );
	return isset( Consts::$HTML['ListItemTags'][$name] );
}
| 547 | |
/**
 * Check whether `node` is a list or list item, as decided by isList()
 * and isListItem().
 */
public static function isListOrListItem( ?Node $n ): bool {
	if ( self::isList( $n ) ) {
		return true;
	}
	return self::isListItem( $n );
}
| 554 | |
/**
 * Check whether `node` is nested in a list item, i.e. whether any of
 * its proper ancestors satisfies isListItem(). The node itself is not
 * tested.
 *
 * The signature accepts null, so null is handled explicitly and yields
 * false instead of reading a property of null.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isNestedInListItem( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	for ( $ancestor = $n->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		if ( self::isListItem( $ancestor ) ) {
			return true;
		}
	}
	return false;
}
| 568 | |
/**
 * Check whether `node` is a list or list item that is itself nested
 * inside a list item.
 */
public static function isNestedListOrListItem( ?Node $n ): bool {
	if ( !self::isListOrListItem( $n ) ) {
		return false;
	}
	return self::isNestedInListItem( $n );
}
| 575 | |
/**
 * Check a node to see whether it's a <meta> carrying the given typeof.
 * Delegates to hasNameAndTypeOf() with the node name 'meta'.
 *
 * @param Node $n
 * @param string $type Expected `typeof` value, as a literal string
 * @return bool
 */
public static function isMarkerMeta( Node $n, string $type ): bool {
	$isMarker = self::hasNameAndTypeOf( $n, 'meta', $type );
	return $isMarker;
}
| 582 | |
/**
 * Check whether a node has any direct children that are elements.
 */
public static function hasElementChild( Node $node ): bool {
	$child = $node->firstChild;
	while ( $child ) {
		if ( $child instanceof Element ) {
			return true;
		}
		$child = $child->nextSibling;
	}
	return false;
}
| 594 | |
/**
 * Check if a node has a block-level element descendant, at any depth.
 * Only Element children are examined; block-ness is decided by
 * isWikitextBlockNode().
 */
public static function hasBlockElementDescendant( Node $node ): bool {
	$child = $node->firstChild;
	while ( $child ) {
		if ( $child instanceof Element ) {
			// Either the child is block-level itself ...
			if ( self::isWikitextBlockNode( $child ) ) {
				return true;
			}
			// ... or something further down its subtree is.
			if ( self::hasBlockElementDescendant( $child ) ) {
				return true;
			}
		}
		$child = $child->nextSibling;
	}
	return false;
}
| 609 | |
/**
 * Is a node representing inter-element whitespace, i.e. a Text node
 * whose value is empty or consists solely of whitespace?
 */
public static function isIEW( ?Node $node ): bool {
	if ( !( $node instanceof Text ) ) {
		return false;
	}
	// ws-only
	return preg_match( '/^\s*$/D', $node->nodeValue ) === 1;
}
| 617 | |
/**
 * Is a node a document fragment? Decided by comparing its nodeType
 * with XML_DOCUMENT_FRAG_NODE; null is never a fragment.
 */
public static function isDocumentFragment( ?Node $node ): bool {
	if ( $node === null ) {
		return false;
	}
	return $node->nodeType === XML_DOCUMENT_FRAG_NODE;
}
| 624 | |
/**
 * Is a node at the top, i.e. either the <body> element or a document
 * fragment?
 */
public static function atTheTop( ?Node $node ): bool {
	if ( self::isBody( $node ) ) {
		return true;
	}
	return self::isDocumentFragment( $node );
}
| 631 | |
/**
 * Are all direct children of this node text or comment nodes?
 * A node without children yields true.
 */
public static function allChildrenAreTextOrComments( Node $node ): bool {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $child instanceof Text || $child instanceof Comment ) {
			continue;
		}
		return false;
	}
	return true;
}
| 645 | |
/**
 * Check if the DOM subtree rooted at $node contains an element whose
 * node name is $tagName. By default, the root node is not checked.
 *
 * Only Element children are descended into; each of them is checked
 * together with its own subtree.
 *
 * @param Node $node The DOM node whose tree should be checked
 * @param string $tagName Tag name to look for
 * @param bool $checkRoot Should the root be checked?
 * @return bool
 */
public static function treeHasElement( Node $node, string $tagName, bool $checkRoot = false ): bool {
	if ( $checkRoot && self::nodeName( $node ) === $tagName ) {
		return true;
	}

	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $child instanceof Element && self::treeHasElement( $child, $tagName, true ) ) {
			return true;
		}
	}
	return false;
}
| 671 | |
/**
 * Is node a table tag (table, tbody, td, tr, etc.)? Decided by looking
 * up its node name in Consts::$HTML['TableTags'].
 */
public static function isTableTag( Node $node ): bool {
	$name = self::nodeName( $node );
	return isset( Consts::$HTML['TableTags'][$name] );
}
| 678 | |
/**
 * Returns a media element (img, video or audio) nested in `node`, as
 * selected by DOMCompat::querySelector(), or null when there is none.
 */
public static function selectMediaElt( Element $node ): ?Element {
	$mediaSelector = 'img, video, audio';
	return DOMCompat::querySelector( $node, $mediaSelector );
}
| 685 | |
/**
 * Extract http-equiv headers from the HTML, including content-language and
 * vary headers, if present.
 *
 * Every <meta> element carrying both `http-equiv` and `content`
 * contributes one entry, keyed by the lower-cased `http-equiv` value.
 * When the same header appears more than once, the last one wins.
 *
 * @param Document $doc
 * @return array<string,string>
 */
public static function findHttpEquivHeaders( Document $doc ): array {
	$headers = [];
	$metas = DOMCompat::querySelectorAll( $doc, 'meta[http-equiv][content]' );
	foreach ( $metas as $meta ) {
		$headerName = strtolower( DOMCompat::getAttribute( $meta, 'http-equiv' ) );
		$headers[$headerName] = DOMCompat::getAttribute( $meta, 'content' );
	}
	return $headers;
}
| 703 | |
/**
 * Add or replace http-equiv headers in the HTML <head>.
 * This is used for content-language and vary headers, among possible
 * others.
 *
 * For each header an existing <meta http-equiv=...> is reused if one
 * is found; otherwise a new one is created through appendToHead().
 * Its `content` attribute is then set to the new value.
 *
 * @param Document $doc The HTML document to update
 * @param array<string,string|string[]> $headers An array mapping HTTP
 *   header names (which are case-insensitive) to new values. If an
 *   array of values is provided, they will be joined with commas.
 */
public static function addHttpEquivHeaders( Document $doc, array $headers ): void {
	foreach ( $headers as $key => $value ) {
		$content = is_array( $value ) ? implode( ',', $value ) : $value;

		// HTTP header names are case-insensitive; hence the "i" suffix
		// on this selector query.
		$selector = "meta[http-equiv=\"{$key}\"i]";
		$meta = DOMCompat::querySelector( $doc, $selector )
			// This also ensures there is a <head> element.
			?? self::appendToHead( $doc, 'meta', [ 'http-equiv' => $key ] );

		$meta->setAttribute( 'content', $content );
	}
}
| 729 | |
/**
 * Read the content version inlined in the document, if any.
 *
 * Queries for a <meta> element whose property attribute is either
 * "mw:htmlVersion" or "mw:html:version" and returns the value of its
 * content attribute as reported by DOMCompat::getAttribute().
 *
 * @param Document $doc
 * @return ?string null when no such <meta> element is found
 */
public static function extractInlinedContentVersion( Document $doc ): ?string {
	$selector = 'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]';
	$meta = DOMCompat::querySelector( $doc, $selector );
	if ( !$meta ) {
		return null;
	}
	return DOMCompat::getAttribute( $meta, 'content' );
}
| 735 | |
/**
 * Set a batch of attributes on an element.
 *
 * Entries whose value is null are skipped. The 'id' key is routed through
 * DOMCompat::setIdAttribute(); every other key is set with setAttribute().
 *
 * @param Element $elt Element to modify
 * @param array $attrs Map of attribute name to value (null values are ignored)
 */
public static function addAttributes( Element $elt, array $attrs ): void {
	foreach ( $attrs as $name => $val ) {
		if ( $val === null ) {
			// Nothing to set for this attribute.
			continue;
		}
		if ( $name === 'id' ) {
			DOMCompat::setIdAttribute( $elt, $val );
			continue;
		}
		$elt->setAttribute( $name, $val );
	}
}
| 753 | |
/**
 * Create an element with the given attrs and append it to the document head.
 *
 * When the document has no head, one is created and inserted into the
 * document element before whatever DOMCompat::getBody() returns. When the
 * document has no document element either, an <html> element is created
 * and appended to the document first.
 *
 * @param Document $document
 * @param string $tagName Tag name of the element to create
 * @param array $attrs Attributes for the new element, applied with self::addAttributes()
 * @return Element The newly-appended Element
 */
public static function appendToHead( Document $document, string $tagName, array $attrs = [] ): Element {
	$elt = $document->createElement( $tagName );
	self::addAttributes( $elt, $attrs );

	$head = DOMCompat::getHead( $document );
	if ( $head ) {
		$head->appendChild( $elt );
		return $elt;
	}

	// No <head> yet: make sure there is a root element to hold one.
	if ( !$document->documentElement ) {
		$root = $document->createElement( 'html' );
		$document->appendChild( $root );
	}

	$head = $document->createElement( 'head' );
	$body = DOMCompat::getBody( $document );
	$document->documentElement->insertBefore( $head, $body );

	$head->appendChild( $elt );
	return $elt;
}
| 779 | |
/**
 * innerHTML and outerHTML are not defined on DocumentFragment.
 *
 * Defined similarly to DOMCompat::getInnerHTML(): the fragment is passed to
 * XHtmlSerializer::serialize() with the 'innerXML' option enabled, and the
 * 'html' entry of the result is returned.
 *
 * @param DocumentFragment $frag
 * @return string
 */
public static function getFragmentInnerHTML( DocumentFragment $frag ): string {
	$serialized = XHtmlSerializer::serialize( $frag, [ 'innerXML' => true ] );
	return $serialized['html'];
}
| 790 | |
/**
 * innerHTML and outerHTML are not defined on DocumentFragment.
 *
 * The HTML is set as the inner HTML of a temporary <body> element created
 * in the fragment's owner document; that element and the fragment are then
 * handed to self::migrateChildren().
 *
 * @see DOMCompat::setInnerHTML() for the Element version
 * @param DocumentFragment $frag
 * @param string $html
 */
public static function setFragmentInnerHTML( DocumentFragment $frag, string $html ): void {
	$doc = $frag->ownerDocument;
	// FIXME: This should be an HTML5 template element
	$wrapper = $doc->createElement( 'body' );
	DOMCompat::setInnerHTML( $wrapper, $html );
	self::migrateChildren( $wrapper, $frag );
}
| 801 | |
/**
 * Create a new DocumentFragment in $doc and fill it from an HTML string
 * using self::setFragmentInnerHTML().
 *
 * @param Document $doc Document that creates the fragment
 * @param string $html
 * @return DocumentFragment The newly created fragment
 */
public static function parseHTMLToFragment( Document $doc, string $html ): DocumentFragment {
	$fragment = $doc->createDocumentFragment();
	self::setFragmentInnerHTML( $fragment, $html );
	return $fragment;
}
| 807 | |
/**
 * Check whether the node's name (per self::nodeName()) is a key of
 * Consts::$HTML['RawTextElements'].
 *
 * @param Node $node
 * @return bool
 */
public static function isRawTextElement( Node $node ): bool {
	$name = self::nodeName( $node );
	return isset( Consts::$HTML['RawTextElements'][$name] );
}
| 811 | |
/**
 * Is $n a block tag OR does the subtree rooted at $n have a block tag in it?
 *
 * "Block tag" here means self::isRemexBlockNode() returns true. The search
 * recurses through the children in sibling order and stops at the first hit.
 *
 * @param Node $n Root of the subtree to inspect
 * @return bool
 */
public static function hasBlockTag( Node $n ): bool {
	if ( self::isRemexBlockNode( $n ) ) {
		return true;
	}
	for ( $child = $n->firstChild; $child; $child = $child->nextSibling ) {
		if ( self::hasBlockTag( $child ) ) {
			return true;
		}
	}
	return false;
}
| 828 | |
/**
 * Check whether the element's name (per self::nodeName()) is a key of
 * Consts::$HTML['MetaDataTags'].
 *
 * @param Element $node
 * @return bool
 */
public static function isMetaDataTag( Element $node ): bool {
	$name = self::nodeName( $node );
	return isset( Consts::$HTML['MetaDataTags'][$name] );
}
| 832 | |
/**
 * Strip a paragraph wrapper, if any, before parsing HTML to DOM.
 *
 * Two pieces are removed independently of each other:
 *  - a "<p>" at the very start of the string;
 *  - a newline followed by "</p>" at the end of the string, together with
 *    any run of whitespace or Utils::COMMENT_REGEXP_FRAGMENT matches that
 *    trails it.
 *
 * @param string $ret HTML string to unwrap
 * @return string
 */
public static function stripPWrapper( string $ret ): string {
	// The D modifier anchors "$" to the true end of the subject.
	$pattern = '#(^<p>)|(\n</p>(' . Utils::COMMENT_REGEXP_FRAGMENT . '|\s)*$)#D';
	return preg_replace( $pattern, '', $ret );
}
| 839 | |
/**
 * Return the lower-case version of the node name.
 *
 * A \DOMNode's name is returned unchanged; for any other node
 * implementation the name is passed through strtolower().
 *
 * FIXME: HTML says this should be upper-case, but we are tailoring
 * this to the PHP7.x DOM libraries that return lower-case names.
 * @see DOMCompat::nodeName()
 *
 * @param Node $node
 * @return string
 */
public static function nodeName( Node $node ): string {
	// We will transition to DOMCompat::nodeName() once we move to
	// PHP 8.4 in production, which uses uppercase node names.
	if ( $node instanceof \DOMNode ) {
		return $node->nodeName;
	}
	return strtolower( $node->nodeName );
}
| 851 | |
/**
 * Is $elt a <span> whose only child is a text node, and whose text content
 * consists solely of one or more newlines?
 *
 * @param Node $elt
 * @return bool
 */
public static function isNewlineWrappingSpan( Node $elt ): bool {
	if ( self::nodeName( $elt ) !== 'span' ) {
		return false;
	}
	// First and last child being the same node means there is at most one
	// child; the instanceof check then rules out the childless case.
	$only = $elt->firstChild;
	if ( $only !== $elt->lastChild || !( $only instanceof Text ) ) {
		return false;
	}
	return (bool)preg_match( "/^\n+$/", $elt->textContent );
}
| 858 | } |