Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
31.34% |
68 / 217 |
|
43.08% |
28 / 65 |
CRAP | |
0.00% |
0 / 1 |
| DOMUtils | |
31.34% |
68 / 217 |
|
43.08% |
28 / 65 |
7336.07 | |
0.00% |
0 / 1 |
| parseHTML | |
84.62% |
11 / 13 |
|
0.00% |
0 / 1 |
5.09 | |||
| visitDOM | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
| migrateChildren | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| childNodes | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| migrateChildrenBetweenDocs | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| assertElt | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| isRemexBlockNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
| isWikitextBlockNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isFormattingElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isQuoteElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isBody | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isRemoved | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| pathToRoot | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| nodeDepth | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| pathToSibling | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
| inSiblingOrder | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
| isAncestorOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
| findAncestorOfName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
| hasNameOrHasAncestorOfName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| matchNameAndTypeOf | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| hasNameAndTypeOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| matchTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| matchRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| matchMultivalAttr | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
| hasTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasClass | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| hasValueInMultivalAttr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
| addTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| addRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| addValueToMultivalAttr | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
42 | |||
| removeValueFromMultivalAttr | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
| removeTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| removeRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| isFosterablePosition | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| isHeading | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isList | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isListOrListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isNestedInListItem | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| isNestedListOrListItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| isMarkerMeta | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| hasElementChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
| hasBlockElementDescendant | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
5 | |||
| isIEW | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| isDocumentFragment | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| atTheTop | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| allChildrenAreTextOrComments | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
| treeHasElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
| isTableTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| selectMediaElt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| findHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| addHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
| extractInlinedContentVersion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| addAttributes | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
| appendToHead | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
| getFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| setFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| parseHTMLToFragment | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| isRawTextElement | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasBlockTag | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
| attributes | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| isMetaDataTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| stripPWrapper | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| nodeName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Utils; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\Parsoid\Core\ClientError; |
| 8 | use Wikimedia\Parsoid\DOM\Comment; |
| 9 | use Wikimedia\Parsoid\DOM\Document; |
| 10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 11 | use Wikimedia\Parsoid\DOM\DOMParser; |
| 12 | use Wikimedia\Parsoid\DOM\Element; |
| 13 | use Wikimedia\Parsoid\DOM\Node; |
| 14 | use Wikimedia\Parsoid\DOM\Text; |
| 15 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 16 | use Wikimedia\Parsoid\Wt2Html\TreeBuilder\ParsoidDOMBuilder; |
| 17 | use Wikimedia\Parsoid\Wt2Html\XHtmlSerializer; |
| 18 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer; |
| 19 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; |
| 20 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
| 21 | |
| 22 | /** |
| 23 | * DOM utilities for querying the DOM. This is largely independent of Parsoid |
| 24 | * although some Parsoid details (TokenUtils, inline content version) |
| 25 | * have snuck in. |
| 26 | */ |
| 27 | class DOMUtils { |
| 28 | |
| 29 | /** |
| 30 | * Parse HTML, return the tree. |
| 31 | * |
| 32 | * @note The resulting document is not "prepared and loaded"; use |
| 33 | * ContentUtils::prepareAndLoadDocument() instead if that's what |
| 34 | * you need. |
| 35 | */ |
| 36 | public static function parseHTML( |
| 37 | string $html, bool $validateXMLNames = false |
| 38 | ): Document { |
| 39 | if ( !preg_match( '/^<(?:!doctype|html|body)/i', $html ) ) { |
| 40 | // Make sure that we parse fragments in the body. Otherwise comments, |
| 41 | // link and meta tags end up outside the html element or in the head |
| 42 | // elements. |
| 43 | $html = '<body>' . $html; |
| 44 | } |
| 45 | if ( DOMCompat::isUsingDodo() ) { |
| 46 | return ( new DOMParser() )->parseFromString( $html, 'text/html' ); |
| 47 | } |
| 48 | // If DOMCompat::isUsing84Dom use Remex to parse. |
| 49 | |
| 50 | $domBuilder = new ParsoidDOMBuilder; // our DOMBuilder, not remex's |
| 51 | $treeBuilder = new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] ); |
| 52 | $dispatcher = new Dispatcher( $treeBuilder ); |
| 53 | $tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] ); |
| 54 | $tokenizer->execute( [] ); |
| 55 | if ( $validateXMLNames && $domBuilder->isCoerced() ) { |
| 56 | throw new ClientError( 'Encountered a name invalid in XML.' ); |
| 57 | } |
| 58 | $frag = $domBuilder->getFragment(); |
| 59 | '@phan-var Document $frag'; // @var Document $frag |
| 60 | return $frag; |
| 61 | } |
| 62 | |
| 63 | /** |
| 64 | * This is a simplified version of the DOMTraverser. |
| 65 | * Consider using that before making this more complex. |
| 66 | * |
| 67 | * FIXME: Move to DOMTraverser OR create a new class? |
| 68 | * @param Node $node |
| 69 | * @param callable $handler |
| 70 | * @param mixed ...$args |
| 71 | */ |
| 72 | public static function visitDOM( Node $node, callable $handler, ...$args ): void {
| 73 | $handler( $node, ...$args );
| 74 | $child = $node->firstChild;
| 75 | while ( $child !== null ) {
| 76 | $following = $child->nextSibling; // saved before recursing so the walk resumes from here
| 77 | self::visitDOM( $child, $handler, ...$args );
| 78 | $child = $following;
| 79 | }
| 80 | }
| 81 | |
| 82 | /** |
| 83 | * Move 'from'.childNodes to 'to' adding them before 'beforeNode' |
| 84 | * If 'beforeNode' is null, the nodes are appended at the end. |
| 85 | * @param Node $from Source node. Children will be removed. |
| 86 | * @param Node $to Destination node. Children of $from will be added here |
| 87 | * @param ?Node $beforeNode Add the children before this node. |
| 88 | */ |
| 89 | public static function migrateChildren(
| 90 | Node $from, Node $to, ?Node $beforeNode = null
| 91 | ): void {
| 92 | for ( $child = $from->firstChild; $child !== null; $child = $from->firstChild ) {
| 93 | $to->insertBefore( $child, $beforeNode );
| 94 | }
| 95 | }
| 96 | |
| 97 | /** |
| 98 | * Many DOM implementations will de-optimize the representation of a |
| 99 | * Node if `$node->childNodes` is accessed, converting the linked list |
| 100 | * of node children to an array which is then expensive to mutate. |
| 101 | * |
| 102 | * This method returns an array of child nodes, but uses the |
| 103 | * `->firstChild`/`->nextSibling` accessors to obtain it, avoiding |
| 104 | * deoptimization. This is also robust against concurrent mutation. |
| 105 | * |
| 106 | * @param Node $n |
| 107 | * @return list<Node> the child nodes |
| 108 | */ |
| 109 | public static function childNodes( Node $n ): array { |
| 110 | $result = []; |
| 111 | for ( $child = $n->firstChild; $child !== null; $child = $child->nextSibling ) { |
| 112 | $result[] = $child; |
| 113 | } |
| 114 | return $result; |
| 115 | } |
| 116 | |
| 117 | /** |
| 118 | * Copy 'from'.childNodes to 'to' adding them before 'beforeNode' |
| 119 | * 'from' and 'to' belong to different documents. |
| 120 | * |
| 121 | * If 'beforeNode' is null, the nodes are appended at the end. |
| 122 | * @param Node $from |
| 123 | * @param Node $to |
| 124 | * @param ?Node $beforeNode |
| 125 | */ |
| 126 | public static function migrateChildrenBetweenDocs(
| 127 | Node $from, Node $to, ?Node $beforeNode = null
| 128 | ): void {
| 129 | if ( $to->ownerDocument === $from->ownerDocument ) {
| 130 | self::migrateChildren( $from, $to, $beforeNode );
| 131 | return;
| 132 | }
| 133 | // Different documents: insert deep imports of each child of $from.
| 134 | $targetDoc = $to->ownerDocument;
| 135 | for ( $src = $from->firstChild; $src; $src = $src->nextSibling ) {
| 136 | $copy = $targetDoc->importNode( $src, true );
| 137 | $to->insertBefore( $copy, $beforeNode );
| 138 | }
| 139 | }
| 140 | |
| 141 | /** |
| 142 | * Assert that this is a DOM element node. |
| 143 | * This is primarily to help phan analyze variable types. |
| 144 | * |
| 145 | * @phan-assert Element $node |
| 146 | * |
| 147 | * @param ?Node $node |
| 148 | * @return true Always returns true |
| 149 | */ |
| 150 | public static function assertElt( ?Node $node ): bool { |
| 151 | Assert::invariant( $node instanceof Element, "Expected an element" ); |
| 152 | return true; |
| 153 | } |
| 154 | |
| 155 | public static function isRemexBlockNode( ?Node $node ): bool { |
| 156 | return $node instanceof Element && |
| 157 | !isset( Consts::$HTML['OnlyInlineElements'][self::nodeName( $node )] ) && |
| 158 | // This is a superset of \\MediaWiki\Tidy\RemexCompatMunger::$metadataElements |
| 159 | !self::isMetaDataTag( $node ); |
| 160 | } |
| 161 | |
| 162 | public static function isWikitextBlockNode( ?Node $node ): bool { |
| 163 | return $node && TokenUtils::isWikitextBlockTag( self::nodeName( $node ) ); |
| 164 | } |
| 165 | |
| 166 | /** |
| 167 | * Determine whether this is a formatting DOM element. |
| 168 | */ |
| 169 | public static function isFormattingElt( ?Node $node ): bool { |
| 170 | return $node && isset( Consts::$HTML['FormattingTags'][self::nodeName( $node )] ); |
| 171 | } |
| 172 | |
| 173 | /** |
| 174 | * Determine whether this is a quote DOM element. |
| 175 | */ |
| 176 | public static function isQuoteElt( ?Node $node ): bool { |
| 177 | return $node && isset( Consts::$WTQuoteTags[self::nodeName( $node )] ); |
| 178 | } |
| 179 | |
| 180 | /** |
| 181 | * Determine whether this is the <body> DOM element. |
| 182 | */ |
| 183 | public static function isBody( ?Node $node ): bool { |
| 184 | return $node && self::nodeName( $node ) === 'body'; |
| 185 | } |
| 186 | |
| 187 | /** |
| 188 | * Determine whether this is a removed DOM node: either null, or still a Node object but with no nodeType set |
| 189 | */ |
| 190 | public static function isRemoved( ?Node $node ): bool { |
| 191 | return !$node || !isset( $node->nodeType ); |
| 192 | } |
| 193 | |
| 194 | /** |
| 195 | * Build path from a node to the root of the document. |
| 196 | * |
| 197 | * @param Node $node |
| 198 | * @return Node[] Path including all nodes from $node to the root of the document |
| 199 | */ |
| 200 | public static function pathToRoot( Node $node ): array {
| 201 | $ancestry = [];
| 202 | // $node itself is always the first entry; then climb until there is no parent.
| 203 | for ( $cur = $node; $cur !== null; $cur = $cur->parentNode ) {
| 204 | $ancestry[] = $cur;
| 205 | }
| 206 | return $ancestry;
| 207 | }
| 208 | |
| 209 | /** |
| 210 | * Compute the edge length of the path from $node to the root. |
| 211 | * Root document is at depth 0, <html> at 1, <body> at 2. |
| 212 | */ |
| 213 | public static function nodeDepth( Node $node ): int {
| 214 | $depth = 0;
| 215 | // Count one edge for every ancestor above $node.
| 216 | for ( $up = $node->parentNode; $up !== null; $up = $up->parentNode ) {
| 217 | $depth++;
| 218 | }
| 219 | return $depth;
| 220 | }
| 221 | |
| 222 | /** |
| 223 | * Build path from a node to its passed-in sibling. |
| 224 | * Return will not include the passed-in sibling. |
| 225 | * |
| 226 | * @param Node $node |
| 227 | * @param Node $sibling |
| 228 | * @param bool $left indicates whether to go backwards, use previousSibling instead of nextSibling. |
| 229 | * @return Node[] |
| 230 | */ |
| 231 | public static function pathToSibling( Node $node, Node $sibling, bool $left ): array {
| 232 | $visited = [];
| 233 | // Stops at $sibling (excluded) or when the sibling chain runs out.
| 234 | for ( $cur = $node; $cur && $cur !== $sibling; $cur = $left ? $cur->previousSibling : $cur->nextSibling ) {
| 235 | $visited[] = $cur;
| 236 | }
| 237 | return $visited;
| 238 | }
| 239 | |
| 240 | /** |
| 241 | * Check whether a node `n1` comes before another node `n2` in |
| 242 | * their parent's children list. |
| 243 | * |
| 244 | * @param Node $n1 The node you expect to come first. |
| 245 | * @param Node $n2 Expected later sibling. |
| 246 | * @return bool |
| 247 | */ |
| 248 | public static function inSiblingOrder( Node $n1, Node $n2 ): bool {
| 249 | for ( $cur = $n1; $cur && $cur !== $n2; $cur = $cur->nextSibling ) {
| 250 | // Nothing to do: the loop header advances until $n2 or the end of the sibling list.
| 251 | }
| 252 | return $cur !== null;
| 253 | }
| 254 | |
| 255 | /** |
| 256 | * Check that a node 'n1' is an ancestor of another node 'n2' in |
| 257 | * the DOM. Returns true if n1 === n2. |
| 258 | * |
| 259 | * @param Node $n1 the suspected ancestor. |
| 260 | * @param Node $n2 the suspected descendant. |
| 261 | * @return bool |
| 262 | */ |
| 263 | public static function isAncestorOf( Node $n1, Node $n2 ): bool {
| 264 | for ( $cur = $n2; $cur && $cur !== $n1; $cur = $cur->parentNode ) {
| 265 | // Nothing to do: the loop header climbs until $n1 or past the topmost ancestor.
| 266 | }
| 267 | return $cur !== null;
| 268 | }
| 269 | |
| 270 | /** |
| 271 | * Find an ancestor of $node with nodeName $name. |
| 272 | */ |
| 273 | public static function findAncestorOfName( Node $node, string $name ): ?Element { |
| 274 | $node = $node->parentNode; |
| 275 | while ( $node && self::nodeName( $node ) !== $name ) { |
| 276 | $node = $node->parentNode; |
| 277 | } |
| 278 | '@phan-var Element $node'; // @var Element $node |
| 279 | return $node; |
| 280 | } |
| 281 | |
| 282 | /** |
| 283 | * Check whether $node has $name or has an ancestor named $name. |
| 284 | */ |
| 285 | public static function hasNameOrHasAncestorOfName( Node $node, string $name ): bool { |
| 286 | return self::nodeName( $node ) === $name || self::findAncestorOfName( $node, $name ) !== null; |
| 287 | } |
| 288 | |
| 289 | /** |
| 290 | * Determine whether the node matches the given nodeName and attribute value. |
| 291 | * Returns the matching "typeof" value if the node name matches and one of the node's "typeof" values matches $typeRe |
| 292 | * |
| 293 | * @param Node $n The node to test |
| 294 | * @param string $name The expected nodeName of $n |
| 295 | * @param string $typeRe Regular expression matching the expected value of |
| 296 | * `typeof` attribute. |
| 297 | * @return ?string The matching `typeof` value, or `null` if there is |
| 298 | * no match. |
| 299 | */ |
| 300 | public static function matchNameAndTypeOf( Node $n, string $name, string $typeRe ): ?string { |
| 301 | return self::nodeName( $n ) === $name ? self::matchTypeOf( $n, $typeRe ) : null; |
| 302 | } |
| 303 | |
| 304 | /** |
| 305 | * Determine whether the node matches the given nodeName and typeof |
| 306 | * attribute value; the typeof is given as string. |
| 307 | * |
| 308 | * @param Node $n |
| 309 | * @param string $name node name to test for |
| 310 | * @param string $type Expected value of "typeof" attribute (literal string) |
| 311 | * @return bool True if the node matches. |
| 312 | */ |
| 313 | public static function hasNameAndTypeOf( Node $n, string $name, string $type ): bool { |
| 314 | return self::matchNameAndTypeOf( |
| 315 | $n, $name, '/^' . preg_quote( $type, '/' ) . '$/' |
| 316 | ) !== null; |
| 317 | } |
| 318 | |
| 319 | /** |
| 320 | * Determine whether the node matches the given `typeof` attribute value. |
| 321 | * |
| 322 | * @param Node $n The node to test |
| 323 | * @param string $typeRe Regular expression matching the expected value of |
| 324 | * the `typeof` attribute. |
| 325 | * @return ?string The matching `typeof` value, or `null` if there is |
| 326 | * no match. |
| 327 | */ |
| 328 | public static function matchTypeOf( Node $n, string $typeRe ): ?string { |
| 329 | return self::matchMultivalAttr( $n, 'typeof', $typeRe ); |
| 330 | } |
| 331 | |
| 332 | /** |
| 333 | * Determine whether the node matches the given `rel` attribute value. |
| 334 | * |
| 335 | * @param Node $n The node to test |
| 336 | * @param string $relRe Regular expression matching the expected value of |
| 337 | * the `rel` attribute. |
| 338 | * @return ?string The matching `rel` value, or `null` if there is |
| 339 | * no match. |
| 340 | */ |
| 341 | public static function matchRel( Node $n, string $relRe ): ?string { |
| 342 | return self::matchMultivalAttr( $n, 'rel', $relRe ); |
| 343 | } |
| 344 | |
| 345 | /** |
| 346 | * Determine whether the node matches the given multivalue attribute value. |
| 347 | * |
| 348 | * @param Node $n The node to test |
| 349 | * @param string $attrName the attribute to test (typically 'rel' or 'typeof') |
| 350 | * @param string $valueRe Regular expression matching the expected value of |
| 351 | * the attribute. |
| 352 | * @return ?string The matching attribute value, or `null` if there is |
| 353 | * no match. |
| 354 | */ |
| 355 | private static function matchMultivalAttr( Node $n, string $attrName, string $valueRe ): ?string { |
| 356 | if ( !( $n instanceof Element ) ) { |
| 357 | return null; |
| 358 | } |
| 359 | $attrValue = DOMCompat::getAttribute( $n, $attrName ); |
| 360 | if ( $attrValue === null || $attrValue === '' ) { |
| 361 | return null; |
| 362 | } |
| 363 | foreach ( explode( ' ', $attrValue ) as $ty ) { |
| 364 | if ( $ty === '' ) { |
| 365 | continue; |
| 366 | } |
| 367 | $count = preg_match( $valueRe, $ty ); |
| 368 | Assert::invariant( $count !== false, "Bad regexp" ); |
| 369 | if ( $count ) { |
| 370 | return $ty; |
| 371 | } |
| 372 | } |
| 373 | return null; |
| 374 | } |
| 375 | |
| 376 | /** |
| 377 | * Determine whether the node matches the given typeof attribute value. |
| 378 | * |
| 379 | * @param Node $n |
| 380 | * @param string $type Expected value of "typeof" attribute, as a literal |
| 381 | * string. |
| 382 | * @return bool True if the node matches. |
| 383 | */ |
| 384 | public static function hasTypeOf( Node $n, string $type ): bool { |
| 385 | return self::hasValueInMultivalAttr( $n, 'typeof', $type ); |
| 386 | } |
| 387 | |
| 388 | /** |
| 389 | * Determine whether the node matches the given rel attribute value. |
| 390 | * |
| 391 | * @param Node $n |
| 392 | * @param string $rel Expected value of "rel" attribute, as a literal string. |
| 393 | * @return bool True if the node matches. |
| 394 | */ |
| 395 | public static function hasRel( Node $n, string $rel ): bool { |
| 396 | return self::hasValueInMultivalAttr( $n, 'rel', $rel ); |
| 397 | } |
| 398 | |
| 399 | /** Check whether a whole space-delimited token of the element's "class" attribute matches $regex. |
| 400 | * @param Element $element |
| 401 | * @param string $regex Partial regular expression, e.g. "foo|bar"; it is wrapped in (?:...) so both boundary assertions apply to every alternative |
| 402 | * @return bool True if some complete class token matches; false otherwise, including when there is no class attribute |
| 403 | */ |
| 404 | public static function hasClass( Element $element, string $regex ): bool { |
| 405 | $value = DOMCompat::getAttribute( $element, 'class' ); |
| 406 | return (bool)preg_match( '{(?<=^|\s)(?:' . $regex . ')(?=\s|$)}', $value ?? '' ); |
| 407 | } |
| 408 | |
| 409 | /** |
| 410 | * Determine whether the node matches the given attribute value for a multivalued attribute |
| 411 | * @param Node $n |
| 412 | * @param string $attrName name of the attribute to check (typically 'typeof', 'rel') |
| 413 | * @param string $value Expected value of the "$attrName" attribute, as a literal string. |
| 414 | * @return bool True if the node matches |
| 415 | */ |
| 416 | private static function hasValueInMultivalAttr( Node $n, string $attrName, string $value ): bool { |
| 417 | // fast path |
| 418 | if ( !( $n instanceof Element ) ) { |
| 419 | return false; |
| 420 | } |
| 421 | $attrValue = DOMCompat::getAttribute( $n, $attrName ); |
| 422 | if ( $attrValue === null || $attrValue === '' ) { |
| 423 | return false; |
| 424 | } |
| 425 | if ( $attrValue === $value ) { |
| 426 | return true; |
| 427 | } |
| 428 | // fallback |
| 429 | return in_array( $value, explode( ' ', $attrValue ), true ); |
| 430 | } |
| 431 | |
| 432 | /** |
| 433 | * Add a type to the typeof attribute. This method should almost always |
| 434 | * be used instead of `setAttribute`, to ensure we don't overwrite existing |
| 435 | * typeof information. |
| 436 | * |
| 437 | * @param Element $node node |
| 438 | * @param string $type type |
| 439 | * @param bool $prepend If true, adds value to start, rather than end. |
| 440 | * Use of this option in new code is discouraged. |
| 441 | */ |
| 442 | public static function addTypeOf( Element $node, string $type, bool $prepend = false ): void { |
| 443 | self::addValueToMultivalAttr( $node, 'typeof', $type, $prepend ); |
| 444 | } |
| 445 | |
| 446 | /** |
| 447 | * Add a type to the rel attribute. This method should almost always |
| 448 | * be used instead of `setAttribute`, to ensure we don't overwrite existing |
| 449 | * rel information. |
| 450 | */ |
| 451 | public static function addRel( Element $node, string $rel ): void { |
| 452 | self::addValueToMultivalAttr( $node, 'rel', $rel ); |
| 453 | } |
| 454 | |
| 455 | /** |
| 456 | * Add an element to a multivalue attribute (typeof, rel). This method should almost always |
| 457 | * be used instead of `setAttribute`, to ensure we don't overwrite existing |
| 458 | * multivalue information. |
| 459 | * |
| 460 | * @param Element $node |
| 461 | * @param string $attr |
| 462 | * @param string $value |
| 463 | * @param bool $prepend If true, adds value to start, rather than end |
| 464 | */ |
| 465 | private static function addValueToMultivalAttr( |
| 466 | Element $node, string $attr, string $value, bool $prepend = false |
| 467 | ): void { |
| 468 | $value = trim( $value ); |
| 469 | if ( $value === '' ) { |
| 470 | return; |
| 471 | } |
| 472 | $oldValue = DOMCompat::getAttribute( $node, $attr ); |
| 473 | if ( $oldValue !== null && trim( $oldValue ) !== '' ) { |
| 474 | $values = explode( ' ', trim( $oldValue ) ); |
| 475 | if ( in_array( $value, $values, true ) ) { |
| 476 | return; |
| 477 | } |
| 478 | $value = $prepend ? "$value $oldValue" : "$oldValue $value"; |
| 479 | } |
| 480 | $node->setAttribute( $attr, $value ); |
| 481 | } |
| 482 | |
| 483 | /** |
| 484 | * Remove a value from a multiple-valued attribute. |
| 485 | * |
| 486 | * @param Element $node node |
| 487 | * @param string $attr The attribute name |
| 488 | * @param string $value The value to remove |
| 489 | */ |
| 490 | private static function removeValueFromMultivalAttr( |
| 491 | Element $node, string $attr, string $value |
| 492 | ): void { |
| 493 | $oldValue = DOMCompat::getAttribute( $node, $attr ); |
| 494 | if ( $oldValue !== null && $oldValue !== '' ) { |
| 495 | $value = trim( $value ); |
| 496 | $types = array_diff( explode( ' ', $oldValue ), [ $value ] ); |
| 497 | if ( count( $types ) > 0 ) { |
| 498 | $node->setAttribute( $attr, implode( ' ', $types ) ); |
| 499 | } else { |
| 500 | $node->removeAttribute( $attr ); |
| 501 | } |
| 502 | } |
| 503 | } |
| 504 | |
| 505 | /** |
| 506 | * Remove a type from the typeof attribute. |
| 507 | */ |
| 508 | public static function removeTypeOf( Element $node, string $type ): void { |
| 509 | self::removeValueFromMultivalAttr( $node, 'typeof', $type ); |
| 510 | } |
| 511 | |
| 512 | /** |
| 513 | * Remove a type from the rel attribute. |
| 514 | */ |
| 515 | public static function removeRel( Element $node, string $rel ): void { |
| 516 | self::removeValueFromMultivalAttr( $node, 'rel', $rel ); |
| 517 | } |
| 518 | |
| 519 | /** |
| 520 | * Check whether `node` is in a fosterable position, i.e. its parent's name is listed in Consts::$HTML['FosterablePosition']. |
| 521 | * A null `node`, or one without a parent, is reported as not fosterable instead of passing null to nodeName(). */ |
| 522 | public static function isFosterablePosition( ?Node $n ): bool { |
| 523 | return $n && $n->parentNode && isset( Consts::$HTML['FosterablePosition'][self::nodeName( $n->parentNode )] ); |
| 524 | } |
| 525 | |
| 526 | /** |
| 527 | * Check whether `node` is a heading. |
| 528 | */ |
| 529 | public static function isHeading( ?Node $n ): bool { |
| 530 | return $n && preg_match( '/^h[1-6]$/D', self::nodeName( $n ) ); |
| 531 | } |
| 532 | |
| 533 | /** |
| 534 | * Check whether `node` is a list. |
| 535 | */ |
| 536 | public static function isList( ?Node $n ): bool { |
| 537 | return $n && isset( Consts::$HTML['ListTags'][self::nodeName( $n )] ); |
| 538 | } |
| 539 | |
| 540 | /** |
| 541 | * Check whether `node` is a list item. |
| 542 | */ |
| 543 | public static function isListItem( ?Node $n ): bool { |
| 544 | return $n && isset( Consts::$HTML['ListItemTags'][self::nodeName( $n )] ); |
| 545 | } |
| 546 | |
| 547 | /** |
| 548 | * Check whether `node` is a list or list item. |
| 549 | */ |
| 550 | public static function isListOrListItem( ?Node $n ): bool { |
| 551 | return self::isList( $n ) || self::isListItem( $n ); |
| 552 | } |
| 553 | |
| 554 | /** |
| 555 | * Check whether `node` is nested in a list item, i.e. has a list-item ancestor. Returns false for a null `node`. |
| 556 | */ |
| 557 | public static function isNestedInListItem( ?Node $n ): bool { |
| 558 | $parentNode = $n ? $n->parentNode : null; // $n is nullable: never read a property of null |
| 559 | while ( $parentNode ) { |
| 560 | if ( self::isListItem( $parentNode ) ) { |
| 561 | return true; |
| 562 | } |
| 563 | $parentNode = $parentNode->parentNode; |
| 564 | } |
| 565 | return false; |
| 566 | } |
| 567 | |
| 568 | /** |
| 569 | * Check whether `node` is a nested list or a list item. |
| 570 | */ |
| 571 | public static function isNestedListOrListItem( ?Node $n ): bool { |
| 572 | return self::isListOrListItem( $n ) && self::isNestedInListItem( $n ); |
| 573 | } |
| 574 | |
| 575 | /** |
| 576 | * Check a node to see whether it's a meta with some typeof. |
| 577 | */ |
| 578 | public static function isMarkerMeta( Node $n, string $type ): bool { |
| 579 | return self::hasNameAndTypeOf( $n, 'meta', $type ); |
| 580 | } |
| 581 | |
| 582 | /** |
| 583 | * Check whether a node has any children that are elements. |
| 584 | */ |
| 585 | public static function hasElementChild( Node $node ): bool { |
| 586 | for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) { |
| 587 | if ( $child instanceof Element ) { |
| 588 | return true; |
| 589 | } |
| 590 | } |
| 591 | return false; |
| 592 | } |
| 593 | |
| 594 | /** |
| 595 | * Check if a node has a block-level element descendant. |
| 596 | */ |
| 597 | public static function hasBlockElementDescendant( Node $node ): bool { |
| 598 | for ( $kid = $node->firstChild; $kid; $kid = $kid->nextSibling ) { |
| 599 | if ( !( $kid instanceof Element ) ) { |
| 600 | continue; |
| 601 | } |
| 602 | if ( self::isWikitextBlockNode( $kid ) || self::hasBlockElementDescendant( $kid ) ) { // block itself, or contains one deeper down |
| 603 | return true; |
| 604 | } |
| 605 | } |
| 606 | return false; |
| 607 | } |
| 608 | |
| 609 | /** |
| 610 | * Is a node representing inter-element whitespace? |
| 611 | */ |
| 612 | public static function isIEW( ?Node $node ): bool { |
| 613 | // ws-only |
| 614 | return $node instanceof Text && preg_match( '/^\s*$/D', $node->nodeValue ); |
| 615 | } |
| 616 | |
| 617 | /** |
| 618 | * Is a node a document fragment? |
| 619 | */ |
| 620 | public static function isDocumentFragment( ?Node $node ): bool { |
| 621 | return $node && $node->nodeType === XML_DOCUMENT_FRAG_NODE; |
| 622 | } |
| 623 | |
| 624 | /** |
| 625 | * Is a node at the top? |
| 626 | */ |
| 627 | public static function atTheTop( ?Node $node ): bool { |
| 628 | return self::isBody( $node ) || self::isDocumentFragment( $node ); |
| 629 | } |
| 630 | |
| 631 | /** |
| 632 | * Are all children of this node text or comment nodes? |
| 633 | */ |
| 634 | public static function allChildrenAreTextOrComments( Node $node ): bool { |
| 635 | // Only direct children are examined; any other node kind fails the check. |
| 636 | // A node without children passes. |
| 637 | for ( $kid = $node->firstChild; $kid; $kid = $kid->nextSibling ) { |
| 638 | if ( !( $kid instanceof Text ) && !( $kid instanceof Comment ) ) { |
| 639 | return false; |
| 640 | } |
| 641 | } |
| 642 | return true; |
| 643 | } |
| 644 | |
| 645 | /** |
| 646 | * Check if the dom-subtree rooted at node has an element with tag name 'tagName' |
| 647 | * By default, the root node is not checked. |
| 648 | * |
| 649 | * @param Node $node The DOM node whose tree should be checked |
| 650 | * @param string $tagName Tag name to look for |
| 651 | * @param bool $checkRoot Should the root be checked? |
| 652 | * @return bool |
| 653 | */ |
| 654 | public static function treeHasElement( Node $node, string $tagName, bool $checkRoot = false ): bool { |
| 655 | if ( $checkRoot && self::nodeName( $node ) === $tagName ) { |
| 656 | return true; |
| 657 | } |
| 658 | |
| 659 | // Each element child is the root of its own subtree, so it is |
| 660 | // name-checked itself ($checkRoot = true) before its descendants. |
| 661 | for ( $kid = $node->firstChild; $kid; $kid = $kid->nextSibling ) { |
| 662 | if ( $kid instanceof Element && |
| 663 | self::treeHasElement( $kid, $tagName, true ) |
| 664 | ) { |
| 665 | return true; |
| 666 | } |
| 667 | } |
| 668 | return false; |
| 669 | } |
| 670 | |
| 671 | /** |
| 672 | * Is node a table tag (table, tbody, td, tr, etc.)? |
| 673 | */ |
| 674 | public static function isTableTag( Node $node ): bool { |
| 675 | return isset( Consts::$HTML['TableTags'][self::nodeName( $node )] ); |
| 676 | } |
| 677 | |
| 678 | /** |
| 679 | * Returns a media element nested in `node` |
| 680 | */ |
| 681 | public static function selectMediaElt( Element $node ): ?Element { |
| 682 | return DOMCompat::querySelector( $node, 'img, video, audio' ); |
| 683 | } |
| 684 | |
| 685 | /** |
| 686 | * Extract http-equiv headers from the HTML, including content-language and |
| 687 | * vary headers, if present |
| 688 | * |
| 689 | * @param Document $doc |
| 690 | * @return array<string,string> |
| 691 | */ |
| 692 | public static function findHttpEquivHeaders( Document $doc ): array { |
| 693 | $elts = DOMCompat::querySelectorAll( $doc, 'meta[http-equiv][content]' ); |
| 694 | $r = []; |
| 695 | foreach ( $elts as $el ) { |
| 696 | $r[strtolower( |
| 697 | DOMCompat::getAttribute( $el, 'http-equiv' ) |
| 698 | )] = DOMCompat::getAttribute( $el, 'content' ); |
| 699 | } |
| 700 | return $r; |
| 701 | } |
| 702 | |
/**
 * Add or replace http-equiv headers in the HTML <head>.
 * Used for content-language and vary headers, among possible others.
 *
 * @param Document $doc The HTML document to update
 * @param array<string,string|string[]> $headers Map from HTTP header
 *   name (case-insensitive) to its new value. A value given as an
 *   array is joined with commas before being stored.
 */
public static function addHttpEquivHeaders( Document $doc, array $headers ): void {
	foreach ( $headers as $key => $content ) {
		$content = is_array( $content ) ? implode( ',', $content ) : $content;
		// HTTP header names are case-insensitive; hence the "i" suffix
		// on this selector query.
		$selector = "meta[http-equiv=\"{$key}\"i]";
		// When no matching <meta> exists, append a fresh one;
		// appendToHead() also ensures there is a <head> element.
		$meta = DOMCompat::querySelector( $doc, $selector )
			?: self::appendToHead( $doc, 'meta', [ 'http-equiv' => $key ] );
		$meta->setAttribute( 'content', $content );
	}
}
| 728 | |
/**
 * Read the content version inlined in the document via a
 * <meta property="mw:htmlVersion"> or <meta property="mw:html:version">
 * element.
 *
 * @param Document $doc
 * @return ?string The "content" attribute of the matched <meta>, or
 *   null when the query finds no such element
 */
public static function extractInlinedContentVersion( Document $doc ): ?string {
	$versionMeta = DOMCompat::querySelector(
		$doc,
		'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]'
	);
	if ( !$versionMeta ) {
		return null;
	}
	return DOMCompat::getAttribute( $versionMeta, 'content' );
}
| 734 | |
/**
 * Set each non-null entry of $attrs as an attribute on the element.
 *
 * @param Element $elt Element receiving the attributes
 * @param array $attrs Map of attribute name to value; entries whose
 *   value is null are skipped
 */
public static function addAttributes( Element $elt, array $attrs ): void {
	foreach ( $attrs as $name => $val ) {
		if ( $val === null ) {
			continue;
		}
		if ( $name === 'id' ) {
			// The id attribute is routed through DOMCompat rather than
			// a plain setAttribute() call.
			DOMCompat::setIdAttribute( $elt, $val );
			continue;
		}
		$elt->setAttribute( $name, $val );
	}
}
| 752 | |
/**
 * Create an element with the given attributes and append it to the
 * document head. A <head> element is created first, and inserted
 * before the body, when the document has none.
 *
 * @param Document $document
 * @param string $tagName Tag name of the element to create
 * @param array $attrs Attributes applied through addAttributes()
 * @return Element The newly-appended Element
 */
public static function appendToHead( Document $document, string $tagName, array $attrs = [] ): Element {
	$newElt = $document->createElement( $tagName );
	self::addAttributes( $newElt, $attrs );

	$head = DOMCompat::getHead( $document );
	if ( !$head ) {
		// No <head> yet: build one and place it ahead of the body.
		$head = $document->createElement( 'head' );
		$body = DOMCompat::getBody( $document );
		$document->documentElement->insertBefore( $head, $body );
	}

	$head->appendChild( $newElt );
	return $newElt;
}
| 775 | |
/**
 * innerHTML and outerHTML are not defined on DocumentFragment, so this
 * provides the fragment counterpart of DOMCompat::getInnerHTML().
 *
 * @param DocumentFragment $frag Fragment to serialize
 * @return string The 'html' entry of the XHtmlSerializer result,
 *   produced with the 'innerXML' option enabled
 */
public static function getFragmentInnerHTML( DocumentFragment $frag ): string {
	$serialized = XHtmlSerializer::serialize( $frag, [ 'innerXML' => true ] );
	return $serialized['html'];
}
| 786 | |
/**
 * innerHTML and outerHTML are not defined on DocumentFragment.
 *
 * The HTML is first set on a scratch <body> element created in the
 * fragment's owner document; that element and the fragment are then
 * handed to migrateChildren().
 *
 * @see DOMCompat::setInnerHTML() for the Element version
 * @param DocumentFragment $frag Destination fragment
 * @param string $html HTML source
 */
public static function setFragmentInnerHTML( DocumentFragment $frag, string $html ): void {
	// FIXME: This should be an HTML5 template element
	$ownerDoc = $frag->ownerDocument;
	$scratch = $ownerDoc->createElement( 'body' );
	DOMCompat::setInnerHTML( $scratch, $html );
	self::migrateChildren( $scratch, $frag );
}
| 797 | |
/**
 * Create a new DocumentFragment from $doc and populate it from an
 * HTML string via setFragmentInnerHTML().
 *
 * @param Document $doc Document used to create the fragment
 * @param string $html HTML source
 * @return DocumentFragment The populated fragment
 */
public static function parseHTMLToFragment( Document $doc, string $html ): DocumentFragment {
	$fragment = $doc->createDocumentFragment();
	self::setFragmentInnerHTML( $fragment, $html );
	return $fragment;
}
| 803 | |
/**
 * Report whether the node's name is listed in
 * Consts::$HTML['RawTextElements'].
 *
 * @param Node $node Node whose name is looked up
 * @return bool
 */
public static function isRawTextElement( Node $node ): bool {
	$tag = self::nodeName( $node );
	return isset( Consts::$HTML['RawTextElements'][$tag] );
}
| 807 | |
/**
 * Is $n a block tag, or does the subtree rooted at $n contain one?
 *
 * The check is isRemexBlockNode() on $n itself, followed by a
 * recursive descent over each child in sibling order; the search stops
 * at the first hit.
 *
 * @param Node $n Root of the subtree to inspect
 * @return bool
 */
public static function hasBlockTag( Node $n ): bool {
	if ( self::isRemexBlockNode( $n ) ) {
		return true;
	}
	for ( $child = $n->firstChild; $child; $child = $child->nextSibling ) {
		if ( self::hasBlockTag( $child ) ) {
			return true;
		}
	}
	return false;
}
| 824 | |
/**
 * Deprecated pass-through to DOMCompat::attributes(); records a
 * deprecation notice before delegating.
 *
 * @see DOMCompat::attributes()
 * @deprecated since 0.22; use DOMCompat::attributes()
 * @param Element $element
 * @return array Whatever DOMCompat::attributes() returns for $element
 */
public static function attributes( Element $element ): array {
	PHPUtils::deprecated( __METHOD__, "0.22" );
	$attrs = DOMCompat::attributes( $element );
	return $attrs;
}
| 833 | |
/**
 * Report whether the element's name is listed in
 * Consts::$HTML['MetaDataTags'].
 *
 * @param Element $node Element whose name is looked up
 * @return bool
 */
public static function isMetaDataTag( Element $node ): bool {
	$tag = self::nodeName( $node );
	return isset( Consts::$HTML['MetaDataTags'][$tag] );
}
| 837 | |
/**
 * Strip a paragraph wrapper, if any, before parsing HTML to DOM.
 *
 * Two pieces are removed: a "<p>" at the very start of the string, and
 * a newline followed by "</p>" at the very end, where the closing tag
 * may be trailed only by whitespace and by text matching
 * Utils::COMMENT_REGEXP_FRAGMENT (presumably HTML comments; confirm
 * against that constant).
 *
 * @param string $ret HTML that may carry the wrapper
 * @return string The input with the matched pieces removed
 */
public static function stripPWrapper( string $ret ): string {
	$openingP = '(^<p>)';
	$closingP = '(\n</p>(' . Utils::COMMENT_REGEXP_FRAGMENT . '|\s)*$)';
	// The D modifier keeps "$" from matching before a final newline.
	$pattern = '#' . $openingP . '|' . $closingP . '#D';
	return preg_replace( $pattern, '', $ret );
}
| 844 | |
/**
 * Return the lower-case version of the node name.
 *
 * For \DOMNode instances the name is returned exactly as the DOM
 * library reports it; for any other Node implementation it is passed
 * through strtolower().
 *
 * FIXME: HTML says this should be capitalized, but we are tailoring
 * this to the PHP7.x DOM libraries that return lower-case names.
 * @see DOMCompat::nodeName()
 * @param Node $node
 * @return string
 */
public static function nodeName( Node $node ): string {
	// We will transition to DOMCompat::nodeName() once we move to
	// PHP 8.4 in production, which uses uppercase node names.
	if ( $node instanceof \DOMNode ) {
		return $node->nodeName;
	}
	return strtolower( $node->nodeName );
}
| 856 | } |