Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
28.77% |
63 / 219 |
|
44.44% |
28 / 63 |
CRAP | |
0.00% |
0 / 1 |
DOMUtils | |
28.77% |
63 / 219 |
|
44.44% |
28 / 63 |
7850.55 | |
0.00% |
0 / 1 |
parseHTML | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
20 | |||
visitDOM | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
migrateChildren | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
migrateChildrenBetweenDocs | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
assertElt | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
isRemexBlockNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
isWikitextBlockNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isFormattingElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isQuoteElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isBody | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isRemoved | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
pathToRoot | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
nodeDepth | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
pathToSibling | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
inSiblingOrder | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
isAncestorOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
findAncestorOfName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
hasNameOrHasAncestorOfName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
matchNameAndTypeOf | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
hasNameAndTypeOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
matchTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
matchRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
matchMultivalAttr | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
hasTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasClass | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
hasValueInMultivalAttr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
addTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
addRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
addValueToMultivalAttr | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
42 | |||
removeValueFromMultivalAttr | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
removeTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
removeRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isFosterablePosition | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isHeading | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isList | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isListOrListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isNestedInListItem | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
isNestedListOrListItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isMarkerMeta | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
hasElementChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
hasBlockElementDescendant | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
5 | |||
isIEW | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isDocumentFragment | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
atTheTop | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
allChildrenAreTextOrComments | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
treeHasElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
isTableTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
selectMediaElt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
findHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
addHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
extractInlinedContentVersion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
addAttributes | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
appendToHead | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
getFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
setFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
parseHTMLToFragment | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isRawTextElement | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasBlockTag | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
attributes | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
isMetaDataTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stripPWrapper | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Core\ClientError; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Wikitext\Consts; |
15 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
16 | use Wikimedia\RemexHtml\DOM\DOMBuilder; |
17 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer; |
18 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; |
19 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
20 | |
21 | /** |
22 | * DOM utilities for querying the DOM. This is largely independent of Parsoid |
23 | * although some Parsoid details (TokenUtils, inline content version) |
24 | * have snuck in. |
25 | */ |
26 | class DOMUtils { |
27 | |
/**
 * Parse an HTML string with RemexHtml and return the resulting tree.
 *
 * Input that does not begin with a doctype, <html> or <body> tag is
 * prefixed with '<body>' so that it is parsed as body content; otherwise
 * comments, link and meta tags would end up outside the html element
 * or in the head element.
 *
 * @note The resulting document is not "prepared and loaded"; use
 *  ContentUtils::prepareAndLoadDocument() instead if that's what
 *  you need.
 *
 * @param string $html The HTML source to parse
 * @param bool $validateXMLNames When true, a ClientError is thrown if the
 *  DOM builder reports that it had to coerce a name.
 * @return Document
 */
public static function parseHTML(
	string $html, bool $validateXMLNames = false
): Document {
	$startsWithRootTag = preg_match( '/^<(?:!doctype|html|body)/i', $html );
	if ( !$startsWithRootTag ) {
		// Force fragment input to be parsed inside the body.
		$html = '<body>' . $html;
	}

	$builderOptions = [
		'suppressHtmlNamespace' => true,
	];
	// Anonymous DOMBuilder subclass whose only job is to hand out
	// documents created through DOMCompat.
	$domBuilder = new class( $builderOptions ) extends DOMBuilder {
		/** @inheritDoc */
		protected function createDocument(
			?string $doctypeName = null,
			?string $public = null,
			?string $system = null
		) {
			// @phan-suppress-next-line PhanTypeMismatchReturn
			return DOMCompat::newDocument( false );
		}
	};

	// Tokenizer -> Dispatcher -> TreeBuilder -> DOMBuilder pipeline.
	$tokenizer = new Tokenizer(
		new Dispatcher(
			new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] )
		),
		$html,
		[ 'ignoreErrors' => true ]
	);
	$tokenizer->execute( [] );

	if ( $validateXMLNames && $domBuilder->isCoerced() ) {
		throw new ClientError( 'Encountered a name invalid in XML.' );
	}

	$doc = $domBuilder->getFragment();
	'@phan-var Document $doc'; // @var Document $doc
	return $doc;
}
73 | |
/**
 * Pre-order walk over the subtree rooted at $node, invoking $handler
 * on every node visited (the root included).
 *
 * This is a simplified version of the DOMTraverser.
 * Consider using that before making this more complex.
 *
 * FIXME: Move to DOMTraverser OR create a new class?
 * @param Node $node Root of the subtree to visit
 * @param callable $handler Called as $handler( $node, ...$args )
 * @param mixed ...$args Extra arguments forwarded to $handler
 */
public static function visitDOM( Node $node, callable $handler, ...$args ): void {
	$handler( $node, ...$args );
	for ( $child = $node->firstChild; $child; $child = $following ) {
		// The next sibling is read before descending, so the walk does
		// not depend on $child's sibling link after it has been handled.
		$following = $child->nextSibling;
		self::visitDOM( $child, $handler, ...$args );
	}
}
92 | |
/**
 * Move every child of $from into $to, inserting each one before
 * $beforeNode. With a null $beforeNode the children are appended
 * at the end of $to.
 *
 * @param Node $from Source node. Children will be removed.
 * @param Node $to Destination node. Children of $from will be added here
 * @param ?Node $beforeNode Add the children before this node.
 */
public static function migrateChildren(
	Node $from, Node $to, ?Node $beforeNode = null
): void {
	$child = $from->firstChild;
	while ( $child ) {
		// Inserting the child elsewhere detaches it from $from, so the
		// next node to move is always $from's current first child.
		$to->insertBefore( $child, $beforeNode );
		$child = $from->firstChild;
	}
}
107 | |
/**
 * Transfer the children of $from into $to, inserting them before
 * $beforeNode (or appending when $beforeNode is null).
 *
 * When both nodes share an owner document this simply delegates to
 * migrateChildren(). Otherwise each child is deep-imported into $to's
 * document and the imported node is inserted; this method does not
 * itself remove anything from $from in that case.
 *
 * @param Node $from
 * @param Node $to
 * @param ?Node $beforeNode
 */
public static function migrateChildrenBetweenDocs(
	Node $from, Node $to, ?Node $beforeNode = null
): void {
	$targetDoc = $to->ownerDocument;
	if ( $targetDoc !== $from->ownerDocument ) {
		for ( $child = $from->firstChild; $child; $child = $child->nextSibling ) {
			$imported = $targetDoc->importNode( $child, true );
			$to->insertBefore( $imported, $beforeNode );
		}
		return;
	}
	// Same document: a plain move is sufficient.
	self::migrateChildren( $from, $to, $beforeNode );
}
131 | |
132 | // phpcs doesn't like @phan-assert... |
133 | // phpcs:disable MediaWiki.Commenting.FunctionAnnotations.UnrecognizedAnnotation |
134 | |
/**
 * Assert that $node is a DOM Element.
 * The check is delegated to Assert::invariant(); this helper exists
 * primarily to help phan analyze variable types.
 *
 * @param ?Node $node
 * @return bool Always returns true
 * @phan-assert Element $node
 */
public static function assertElt( ?Node $node ): bool {
	$isElement = $node instanceof Element;
	Assert::invariant( $isElement, "Expected an element" );
	return true;
}
147 | |
/**
 * True when $node is an Element that is neither listed in
 * Consts::$HTML['OnlyInlineElements'] nor a metadata tag.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isRemexBlockNode( ?Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	if ( isset( Consts::$HTML['OnlyInlineElements'][$name] ) ) {
		return false;
	}
	// This is a superset of \MediaWiki\Tidy\RemexCompatMunger::$metadataElements
	return !self::isMetaDataTag( $node );
}
158 | |
/**
 * True when $node is non-null and TokenUtils::isWikitextBlockTag()
 * accepts its node name.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isWikitextBlockNode( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	return (bool)TokenUtils::isWikitextBlockTag( $name );
}
166 | |
/**
 * Determine whether this is a formatting DOM element, i.e. whether its
 * node name is a key of Consts::$HTML['FormattingTags'].
 *
 * @param ?Node $node
 * @return bool False for a null node
 */
public static function isFormattingElt( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['FormattingTags'][$name] );
}
175 | |
/**
 * Determine whether this is a quote DOM element, i.e. whether its
 * node name is a key of Consts::$WTQuoteTags.
 *
 * @param ?Node $node
 * @return bool False for a null node
 */
public static function isQuoteElt( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$WTQuoteTags[$name] );
}
184 | |
/**
 * Determine whether this is the <body> DOM element, judged by
 * node name alone.
 *
 * @param ?Node $node
 * @return bool False for a null node
 */
public static function isBody( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	return DOMCompat::nodeName( $node ) === 'body';
}
193 | |
/**
 * Determine whether $node no longer refers to a usable DOM node:
 * either it is null, or its nodeType property is not set.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isRemoved( ?Node $node ): bool {
	$usable = $node && isset( $node->nodeType );
	return !$usable;
}
202 | |
/**
 * Collect $node and all of its ancestors, following parentNode links
 * until there is no parent.
 *
 * @param Node $node
 * @return Node[] Path including all nodes from $node to the root of the document
 */
public static function pathToRoot( Node $node ): array {
	$ancestors = [ $node ];
	for ( $cur = $node->parentNode; $cur; $cur = $cur->parentNode ) {
		$ancestors[] = $cur;
	}
	return $ancestors;
}
217 | |
/**
 * Count the parentNode links between $node and the top of its tree.
 * Root document is at depth 0, <html> at 1, <body> at 2.
 *
 * @param Node $node
 * @return int Number of ancestors of $node
 */
public static function nodeDepth( Node $node ): int {
	$depth = 0;
	for ( $up = $node->parentNode; $up; $up = $up->parentNode ) {
		$depth++;
	}
	return $depth;
}
232 | |
/**
 * Collect the nodes walked from $node towards $sibling along the
 * sibling chain. $sibling itself is never part of the result. If
 * $sibling is not encountered, the walk ends at the last sibling
 * in the chosen direction.
 *
 * @param Node $node Starting node (included in the result unless it is $sibling)
 * @param Node $sibling Node at which to stop
 * @param bool $left indicates whether to go backwards, use previousSibling instead of nextSibling.
 * @return Node[]
 */
public static function pathToSibling( Node $node, Node $sibling, bool $left ): array {
	$visited = [];
	$cur = $node;
	while ( $cur && $cur !== $sibling ) {
		$visited[] = $cur;
		if ( $left ) {
			$cur = $cur->previousSibling;
		} else {
			$cur = $cur->nextSibling;
		}
	}
	return $visited;
}
250 | |
/**
 * Check whether `n2` can be reached from `n1` by following
 * nextSibling links, i.e. `n1` comes before `n2` in their parent's
 * children list. Also true when both are the same node.
 *
 * @param Node $n1 The node you expect to come first.
 * @param Node $n2 Expected later sibling.
 * @return bool
 */
public static function inSiblingOrder( Node $n1, Node $n2 ): bool {
	for ( $cur = $n1; $cur; $cur = $cur->nextSibling ) {
		if ( $cur === $n2 ) {
			return true;
		}
	}
	// Ran off the end of the sibling list without meeting $n2.
	return false;
}
265 | |
/**
 * Check whether 'n1' is reached by walking up the parentNode chain
 * starting at 'n2', i.e. 'n1' is an ancestor of 'n2'.
 * Returns true if n1 === n2.
 *
 * @param Node $n1 The suspected ancestor
 * @param Node $n2 The suspected descendant
 * @return bool
 */
public static function isAncestorOf( Node $n1, Node $n2 ): bool {
	for ( $cur = $n2; $cur; $cur = $cur->parentNode ) {
		if ( $cur === $n1 ) {
			return true;
		}
	}
	// Reached the top of the tree without meeting $n1.
	return false;
}
282 | |
/**
 * Find the nearest proper ancestor of $node whose node name is $name.
 * $node itself is never considered.
 *
 * @param Node $node
 * @param string $name Node name to look for
 * @return ?Element The matching ancestor, or null if there is none
 */
public static function findAncestorOfName( Node $node, string $name ): ?Element {
	for ( $anc = $node->parentNode; $anc; $anc = $anc->parentNode ) {
		if ( DOMCompat::nodeName( $anc ) === $name ) {
			'@phan-var Element $anc'; // @var Element $anc
			return $anc;
		}
	}
	return null;
}
298 | |
/**
 * Check whether $node itself is named $name, or failing that,
 * whether one of its ancestors is.
 *
 * @param Node $node
 * @param string $name
 * @return bool
 */
public static function hasNameOrHasAncestorOfName( Node $node, string $name ): bool {
	if ( DOMCompat::nodeName( $node ) === $name ) {
		return true;
	}
	return self::findAncestorOfName( $node, $name ) !== null;
}
309 | |
/**
 * Match a node against both a node name and a `typeof` pattern.
 * The `typeof` attribute is only examined when the node name is $name.
 *
 * @param Node $n The node to test
 * @param string $name The expected nodeName of $n
 * @param string $typeRe Regular expression matching the expected value of
 *   `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchNameAndTypeOf( Node $n, string $name, string $typeRe ): ?string {
	if ( DOMCompat::nodeName( $n ) !== $name ) {
		return null;
	}
	return self::matchTypeOf( $n, $typeRe );
}
324 | |
/**
 * Determine whether the node has the given node name and carries the
 * given literal value in its `typeof` attribute.
 *
 * @param Node $n
 * @param string $name node name to test for
 * @param string $type Expected value of "typeof" attribute (literal string)
 * @return bool True if the node matches.
 */
public static function hasNameAndTypeOf( Node $n, string $name, string $type ): bool {
	// Anchor the quoted literal so only an exact typeof value matches.
	$exactTypeRe = '/^' . preg_quote( $type, '/' ) . '$/';
	$matched = self::matchNameAndTypeOf( $n, $name, $exactTypeRe );
	return $matched !== null;
}
339 | |
/**
 * Match the values of the node's `typeof` attribute against a pattern.
 *
 * @param Node $n The node to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchTypeOf( Node $n, string $typeRe ): ?string {
	$matched = self::matchMultivalAttr( $n, 'typeof', $typeRe );
	return $matched;
}
352 | |
/**
 * Match the values of the node's `rel` attribute against a pattern.
 *
 * @param Node $n The node to test
 * @param string $relRe Regular expression matching the expected value of
 *   the `rel` attribute.
 * @return ?string The matching `rel` value, or `null` if there is
 *   no match.
 */
public static function matchRel( Node $n, string $relRe ): ?string {
	$matched = self::matchMultivalAttr( $n, 'rel', $relRe );
	return $matched;
}
365 | |
/**
 * Return the first space-separated value of a multivalue attribute
 * that matches a regular expression.
 *
 * Non-Element nodes, and elements where the attribute is missing or
 * empty, never match. Empty tokens (from consecutive spaces) are skipped.
 *
 * @param Node $n The node to test
 * @param string $attrName the attribute to test (typically 'rel' or 'typeof')
 * @param string $valueRe Regular expression matching the expected value of
 *   the attribute.
 * @return ?string The matching attribute value, or `null` if there is
 *   no match.
 */
private static function matchMultivalAttr( Node $n, string $attrName, string $valueRe ): ?string {
	if ( $n instanceof Element ) {
		$raw = DOMCompat::getAttribute( $n, $attrName );
		$tokens = ( $raw === null || $raw === '' ) ? [] : explode( ' ', $raw );
		foreach ( $tokens as $token ) {
			if ( $token !== '' ) {
				$hits = preg_match( $valueRe, $token );
				// preg_match() yields false only when the pattern is broken.
				Assert::invariant( $hits !== false, "Bad regexp" );
				if ( $hits ) {
					return $token;
				}
			}
		}
	}
	return null;
}
396 | |
/**
 * Determine whether the node's `typeof` attribute contains the given
 * literal value.
 *
 * @param Node $n
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string.
 * @return bool True if the node matches.
 */
public static function hasTypeOf( Node $n, string $type ): bool {
	$found = self::hasValueInMultivalAttr( $n, 'typeof', $type );
	return $found;
}
408 | |
/**
 * Determine whether the node's `rel` attribute contains the given
 * literal value.
 *
 * @param Node $n
 * @param string $rel Expected value of "rel" attribute, as a literal string.
 * @return bool True if the node matches.
 */
public static function hasRel( Node $n, string $rel ): bool {
	$found = self::hasValueInMultivalAttr( $n, 'rel', $rel );
	return $found;
}
419 | |
/**
 * Test whether one of the whitespace-delimited tokens of the element's
 * `class` attribute matches a partial regular expression.
 *
 * The expression is wrapped in a non-capturing group before the
 * boundary assertions are attached. Without the group, a top-level
 * alternation such as "foo|bar" would expand to
 * `(?<=^|\s)foo` OR `bar(?=\s|$)`, so each alternative would only be
 * anchored on one side (e.g. class "foox" would wrongly match).
 *
 * @param Element $element
 * @param string $regex Partial regular expression, e.g. "foo|bar". It is
 *   embedded in a pattern that uses `{` and `}` as delimiters.
 * @return bool
 */
public static function hasClass( Element $element, string $regex ): bool {
	$classAttr = DOMCompat::getAttribute( $element, 'class' );
	// A missing class attribute is treated like an empty one.
	$pattern = '{(?<=^|\s)(?:' . $regex . ')(?=\s|$)}';
	return (bool)preg_match( $pattern, $classAttr ?? '' );
}
429 | |
/**
 * Determine whether a multivalued attribute of the node contains the
 * given literal value as one of its space-separated entries.
 *
 * @param Node $n
 * @param string $attrName name of the attribute to check (typically 'typeof', 'rel')
 * @param string $value Expected value of the $attrName attribute, as a literal string.
 * @return bool True if the node matches; always false for non-Element
 *   nodes and for a missing or empty attribute.
 */
private static function hasValueInMultivalAttr( Node $n, string $attrName, string $value ): bool {
	if ( !( $n instanceof Element ) ) {
		return false;
	}
	$current = DOMCompat::getAttribute( $n, $attrName );
	if ( $current === null || $current === '' ) {
		return false;
	}
	// Try the whole-attribute comparison first; split into tokens
	// only when that fails.
	return $current === $value ||
		in_array( $value, explode( ' ', $current ), true );
}
452 | |
/**
 * Add a type to the typeof attribute, keeping whatever values are
 * already there. This method should almost always be used instead of
 * `setAttribute`, to ensure we don't overwrite existing typeof
 * information.
 *
 * @param Element $node node
 * @param string $type type
 * @param bool $prepend If true, adds value to start, rather than end.
 *    Use of this option in new code is discouraged.
 */
public static function addTypeOf( Element $node, string $type, bool $prepend = false ): void {
	$attr = 'typeof';
	self::addValueToMultivalAttr( $node, $attr, $type, $prepend );
}
466 | |
/**
 * Add a value to the rel attribute, keeping whatever values are
 * already there. This method should almost always be used instead of
 * `setAttribute`, to ensure we don't overwrite existing rel information.
 *
 * @param Element $node node
 * @param string $rel The rel value to add (always added at the end)
 */
public static function addRel( Element $node, string $rel ): void {
	$attr = 'rel';
	self::addValueToMultivalAttr( $node, $attr, $rel, false );
}
478 | |
/**
 * Add a value to a multivalue attribute (typeof, rel). This method should
 * almost always be used instead of `setAttribute`, to ensure we don't
 * overwrite existing multivalue information.
 *
 * The value is trimmed first; an empty value is ignored, and a value
 * already present among the space-separated entries is not added again.
 *
 * @param Element $node
 * @param string $attr
 * @param string $value
 * @param bool $prepend If true, adds value to start, rather than end
 */
private static function addValueToMultivalAttr(
	Element $node, string $attr, string $value, bool $prepend = false
): void {
	$addition = trim( $value );
	if ( $addition === '' ) {
		return;
	}

	$existing = DOMCompat::getAttribute( $node, $attr );
	$hasExisting = $existing !== null && trim( $existing ) !== '';
	if ( !$hasExisting ) {
		// Nothing meaningful there yet: the new value becomes the attribute.
		$node->setAttribute( $attr, $addition );
		return;
	}

	$entries = explode( ' ', trim( $existing ) );
	if ( in_array( $addition, $entries, true ) ) {
		return;
	}

	if ( $prepend ) {
		$combined = "$addition $existing";
	} else {
		$combined = "$existing $addition";
	}
	$node->setAttribute( $attr, $combined );
}
506 | |
/**
 * Remove a value from a multiple-valued attribute.
 *
 * Does nothing when the attribute is missing or empty. If no entries
 * remain after the removal, the attribute itself is removed.
 *
 * @param Element $node node
 * @param string $attr The attribute name
 * @param string $value The value to remove (trimmed before comparison)
 */
private static function removeValueFromMultivalAttr(
	Element $node, string $attr, string $value
): void {
	$existing = DOMCompat::getAttribute( $node, $attr );
	if ( $existing === null || $existing === '' ) {
		return;
	}

	$unwanted = [ trim( $value ) ];
	$remaining = array_diff( explode( ' ', $existing ), $unwanted );
	if ( count( $remaining ) === 0 ) {
		$node->removeAttribute( $attr );
		return;
	}
	$node->setAttribute( $attr, implode( ' ', $remaining ) );
}
528 | |
/**
 * Remove a single type from the node's typeof attribute.
 *
 * @param Element $node node
 * @param string $type The typeof value to remove
 */
public static function removeTypeOf( Element $node, string $type ): void {
	$attr = 'typeof';
	self::removeValueFromMultivalAttr( $node, $attr, $type );
}
538 | |
/**
 * Remove a single value from the node's rel attribute.
 *
 * @param Element $node node
 * @param string $rel The rel value to remove
 */
public static function removeRel( Element $node, string $rel ): void {
	$attr = 'rel';
	self::removeValueFromMultivalAttr( $node, $attr, $rel );
}
548 | |
/**
 * Check whether `node` is in a fosterable position, i.e. whether the
 * node name of its parent is a key of Consts::$HTML['FosterablePosition'].
 *
 * A null node, or a node without a parent, is never in a fosterable
 * position. The parent is checked explicitly so that null is never
 * handed to DOMCompat::nodeName().
 * NOTE(review): nodeName() is assumed to require a non-null Node; confirm
 * against DOMCompat.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isFosterablePosition( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$parent = $n->parentNode;
	if ( !$parent ) {
		// Detached node (or tree root): there is no parent to inspect.
		return false;
	}
	return isset( Consts::$HTML['FosterablePosition'][DOMCompat::nodeName( $parent )] );
}
558 | |
/**
 * Check whether `node` is a heading, i.e. its node name is one of
 * h1 through h6.
 *
 * @param ?Node $n
 * @return bool False for a null node
 */
public static function isHeading( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$name = DOMCompat::nodeName( $n );
	return (bool)preg_match( '/^h[1-6]$/D', $name );
}
568 | |
/**
 * Check whether `node` is a list, i.e. its node name is a key of
 * Consts::$HTML['ListTags'].
 *
 * @param ?Node $n
 * @return bool False for a null node
 */
public static function isList( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$name = DOMCompat::nodeName( $n );
	return isset( Consts::$HTML['ListTags'][$name] );
}
578 | |
/**
 * Check whether `node` is a list item, i.e. its node name is a key of
 * Consts::$HTML['ListItemTags'].
 *
 * @param ?Node $n
 * @return bool False for a null node
 */
public static function isListItem( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$name = DOMCompat::nodeName( $n );
	return isset( Consts::$HTML['ListItemTags'][$name] );
}
588 | |
/**
 * Check whether `node` is either a list or a list item.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isListOrListItem( ?Node $n ): bool {
	if ( self::isList( $n ) ) {
		return true;
	}
	return self::isListItem( $n );
}
598 | |
/**
 * Check whether `node` is nested in a list item, i.e. whether any of
 * its ancestors is a list item. The node itself is not examined.
 *
 * A null node has no ancestors, so the answer is false; this is checked
 * up front because the signature accepts null and the parent chain
 * cannot be read from it.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isNestedInListItem( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	for ( $ancestor = $n->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		if ( self::isListItem( $ancestor ) ) {
			return true;
		}
	}
	return false;
}
615 | |
/**
 * Check whether `node` is a list or list item that is itself nested
 * inside a list item.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isNestedListOrListItem( ?Node $n ): bool {
	if ( !self::isListOrListItem( $n ) ) {
		return false;
	}
	return self::isNestedInListItem( $n );
}
625 | |
/**
 * Check whether a node is a <meta> carrying the given typeof value.
 *
 * @param Node $n
 * @param string $type Literal typeof value to look for
 * @return bool
 */
public static function isMarkerMeta( Node $n, string $type ): bool {
	$isMarker = self::hasNameAndTypeOf( $n, 'meta', $type );
	return $isMarker;
}
636 | |
/**
 * Check whether at least one direct child of the node is an Element.
 *
 * @param Node $node
 * @return bool
 */
public static function hasElementChild( Node $node ): bool {
	$child = $node->firstChild;
	while ( $child ) {
		if ( $child instanceof Element ) {
			return true;
		}
		$child = $child->nextSibling;
	}
	return false;
}
651 | |
/**
 * Check if a node has a block-level element anywhere below it.
 * Only Element children are examined and recursed into.
 *
 * @param Node $node
 * @return bool
 */
public static function hasBlockElementDescendant( Node $node ): bool {
	$child = $node->firstChild;
	while ( $child ) {
		if ( $child instanceof Element ) {
			// The child is itself a block-level node ...
			if ( self::isWikitextBlockNode( $child ) ) {
				return true;
			}
			// ... or contains one further down.
			if ( self::hasBlockElementDescendant( $child ) ) {
				return true;
			}
		}
		$child = $child->nextSibling;
	}
	return false;
}
669 | |
/**
 * Is a node representing inter-element whitespace, i.e. a Text node
 * whose value is empty or consists only of whitespace?
 *
 * @param ?Node $node
 * @return bool
 */
public static function isIEW( ?Node $node ): bool {
	if ( !( $node instanceof Text ) ) {
		return false;
	}
	// ws-only
	return (bool)preg_match( '/^\s*$/D', $node->nodeValue );
}
680 | |
/**
 * Is a node a document fragment, judged by its nodeType?
 *
 * @param ?Node $node
 * @return bool False for a null node
 */
public static function isDocumentFragment( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	return $node->nodeType === XML_DOCUMENT_FRAG_NODE;
}
690 | |
/**
 * Is a node at the top, i.e. either a document fragment or the
 * <body> element?
 *
 * @param ?Node $node
 * @return bool
 */
public static function atTheTop( ?Node $node ): bool {
	if ( self::isDocumentFragment( $node ) ) {
		return true;
	}
	return self::isBody( $node );
}
700 | |
/**
 * Are all direct children of this node text or comment nodes?
 * A node without children trivially satisfies this.
 *
 * @param Node $node
 * @return bool
 */
public static function allChildrenAreTextOrComments( Node $node ): bool {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		$isText = $child instanceof Text;
		$isComment = $child instanceof Comment;
		if ( !$isText && !$isComment ) {
			return false;
		}
	}
	return true;
}
717 | |
/**
 * Check if the dom-subtree rooted at node has an element with tag name 'tagName'
 * By default, the root node is not checked; descendants always are.
 *
 * @param Node $node The DOM node whose tree should be checked
 * @param string $tagName Tag name to look for
 * @param bool $checkRoot Should the root be checked?
 * @return bool
 */
public static function treeHasElement( Node $node, string $tagName, bool $checkRoot = false ): bool {
	if ( $checkRoot && DOMCompat::nodeName( $node ) === $tagName ) {
		return true;
	}

	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		// Each element child is searched as the root of its own subtree.
		if ( $child instanceof Element &&
			self::treeHasElement( $child, $tagName, true )
		) {
			return true;
		}
	}
	return false;
}
743 | |
/**
 * Is node a table tag (table, tbody, td, tr, etc.)? Decided by looking
 * up the node name in Consts::$HTML['TableTags'].
 *
 * @param Node $node
 * @return bool
 */
public static function isTableTag( Node $node ): bool {
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['TableTags'][$name] );
}
753 | |
/**
 * Returns a media element (img, video or audio) nested in `node`,
 * as selected by DOMCompat::querySelector().
 *
 * @param Element $node
 * @return Element|null
 */
public static function selectMediaElt( Element $node ): ?Element {
	$mediaSelector = 'img, video, audio';
	return DOMCompat::querySelector( $node, $mediaSelector );
}
763 | |
/**
 * Extract http-equiv headers from the HTML, including content-language and
 * vary headers, if present.
 *
 * Every <meta> that has both an http-equiv and a content attribute
 * contributes one entry; header names are lowercased, so a later
 * element overrides an earlier one with the same name.
 *
 * @param Document $doc
 * @return array<string,string> Map of lowercased header name to content
 */
public static function findHttpEquivHeaders( Document $doc ): array {
	$found = [];
	$metas = DOMCompat::querySelectorAll( $doc, 'meta[http-equiv][content]' );
	foreach ( $metas as $meta ) {
		$headerName = strtolower( DOMCompat::getAttribute( $meta, 'http-equiv' ) );
		$found[$headerName] = DOMCompat::getAttribute( $meta, 'content' );
	}
	return $found;
}
781 | |
/**
 * Add or replace http-equiv headers in the HTML <head>.
 * This is used for content-language and vary headers, among possible
 * others.
 *
 * For each header an existing matching <meta http-equiv> is reused if
 * one is found; otherwise a new one is appended to the head.
 *
 * @param Document $doc The HTML document to update
 * @param array<string,string|string[]> $headers An array mapping HTTP
 *   header names (which are case-insensitive) to new values. If an
 *   array of values is provided, they will be joined with commas.
 */
public static function addHttpEquivHeaders( Document $doc, array $headers ): void {
	foreach ( $headers as $headerName => $content ) {
		$content = is_array( $content ) ? implode( ',', $content ) : $content;
		// HTTP header names are case-insensitive; hence the "i" suffix
		// on this selector query.
		$selector = "meta[http-equiv=\"{$headerName}\"i]";
		// Falling back to appendToHead() also ensures there is a <head> element.
		$meta = DOMCompat::querySelector( $doc, $selector ) ?:
			self::appendToHead( $doc, 'meta', [ 'http-equiv' => $headerName ] );
		$meta->setAttribute( 'content', $content );
	}
}
807 | |
/**
 * Read the inlined HTML content version from the document's <meta> tags.
 *
 * Queries for a meta element whose property attribute is either
 * "mw:htmlVersion" or "mw:html:version" and returns its content attribute.
 *
 * @param Document $doc
 * @return string|null null when no such meta element is found
 */
public static function extractInlinedContentVersion( Document $doc ): ?string {
	$selector = 'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]';
	$meta = DOMCompat::querySelector( $doc, $selector );
	if ( !$meta ) {
		return null;
	}
	return DOMCompat::getAttribute( $meta, 'content' );
}
817 | |
/**
 * Set a batch of attributes on an element.
 *
 * Entries whose value is null are skipped. The 'id' key is routed through
 * DOMCompat::setIdAttribute(); every other key goes through the element's
 * own setAttribute().
 *
 * @param Element $elt Element to modify
 * @param array $attrs Map of attribute name to value
 */
public static function addAttributes( Element $elt, array $attrs ): void {
	foreach ( $attrs as $name => $val ) {
		if ( $val === null ) {
			// Nothing to set for this attribute.
			continue;
		}
		if ( $name === 'id' ) {
			DOMCompat::setIdAttribute( $elt, $val );
			continue;
		}
		$elt->setAttribute( $name, $val );
	}
}
835 | |
/**
 * Create an element with the given attributes and append it to the
 * document head. The head element itself is created first if the
 * document does not have one yet.
 *
 * @param Document $document
 * @param string $tagName Tag name of the element to create
 * @param array $attrs Attributes to set on the new element
 * @return Element The newly-appended Element
 */
public static function appendToHead( Document $document, string $tagName, array $attrs = [] ): Element {
	$newElt = $document->createElement( $tagName );
	self::addAttributes( $newElt, $attrs );

	$headElt = DOMCompat::getHead( $document );
	if ( !$headElt ) {
		// No <head> yet: create one and place it in front of whatever
		// DOMCompat::getBody() returns for this document.
		$headElt = $document->createElement( 'head' );
		$bodyElt = DOMCompat::getBody( $document );
		$document->documentElement->insertBefore( $headElt, $bodyElt );
	}

	$headElt->appendChild( $newElt );
	return $newElt;
}
858 | |
/**
 * Serialize the contents of a DocumentFragment to a string.
 *
 * innerHTML and outerHTML are not defined on DocumentFragment, so this
 * helper fills that gap; it is defined similarly to
 * DOMCompat::getInnerHTML().
 *
 * @param DocumentFragment $frag
 * @return string The 'html' entry of the XMLSerializer result
 */
public static function getFragmentInnerHTML(
	DocumentFragment $frag
): string {
	$serialized = XMLSerializer::serialize( $frag, [ 'innerXML' => true ] );
	return $serialized['html'];
}
874 | |
/**
 * Parse an HTML string and move the resulting nodes into a
 * DocumentFragment.
 *
 * innerHTML and outerHTML are not defined on DocumentFragment.
 * @see DOMCompat::setInnerHTML() for the Element version
 *
 * NOTE(review): nothing here removes children the fragment already has;
 * confirm against migrateChildren() whether the new nodes are added
 * alongside them.
 *
 * @param DocumentFragment $frag Fragment receiving the parsed nodes
 * @param string $html HTML source to parse
 */
public static function setFragmentInnerHTML(
	DocumentFragment $frag, string $html
): void {
	// FIXME: This should be an HTML5 template element
	// A temporary <body> element serves as the parsing container.
	$ownerDoc = $frag->ownerDocument;
	$scratch = $ownerDoc->createElement( 'body' );
	DOMCompat::setInnerHTML( $scratch, $html );
	self::migrateChildren( $scratch, $frag );
}
890 | |
/**
 * Parse an HTML string into a new DocumentFragment created from `doc`.
 *
 * @param Document $doc Document used to create the fragment
 * @param string $html HTML source to parse
 * @return DocumentFragment The fragment, filled via setFragmentInnerHTML()
 */
public static function parseHTMLToFragment(
	Document $doc, string $html
): DocumentFragment {
	$fragment = $doc->createDocumentFragment();
	self::setFragmentInnerHTML( $fragment, $html );

	return $fragment;
}
903 | |
/**
 * Check whether the node's name is listed in
 * Consts::$HTML['RawTextElements'].
 *
 * @param Node $node
 * @return bool
 */
public static function isRawTextElement( Node $node ): bool {
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['RawTextElements'][$name] );
}
911 | |
/**
 * Is 'n' a block tag, or does the subtree rooted at 'n' contain one?
 *
 * The node itself is tested with isRemexBlockNode(); if that fails, each
 * child is checked recursively and the search stops at the first hit.
 *
 * @param Node $n Root of the subtree to inspect
 * @return bool
 */
public static function hasBlockTag( Node $n ): bool {
	if ( self::isRemexBlockNode( $n ) ) {
		return true;
	}
	for ( $child = $n->firstChild; $child; $child = $child->nextSibling ) {
		if ( self::hasBlockTag( $child ) ) {
			return true;
		}
	}
	return false;
}
932 | |
/**
 * Build an associative array of an element's attributes, suitable for
 * serialization.
 *
 * The xmlns attribute is added explicitly when available, to work around
 * PHP's surprising behavior with it: HTML is *not* an XML document, but
 * various parts of PHP (including our misnamed XMLSerializer) pretend
 * that it is, sort of.
 *
 * @param Element $element
 * @return array<string,string> Map of attribute name to attribute value
 * @see https://phabricator.wikimedia.org/T235295
 */
public static function attributes( Element $element ): array {
	// The 'xmlns' attribute is "invisible" T235295
	$xmlns = DOMCompat::getAttribute( $element, 'xmlns' );
	$map = $xmlns !== null ? [ 'xmlns' => $xmlns ] : [];
	foreach ( $element->attributes as $attribute ) {
		$map[$attribute->name] = $attribute->value;
	}
	return $map;
}
957 | |
/**
 * Check whether the element's name is listed in
 * Consts::$HTML['MetaDataTags'].
 *
 * @param Element $node
 * @return bool
 */
public static function isMetaDataTag( Element $node ): bool {
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['MetaDataTags'][$name] );
}
965 | |
/**
 * Strip a paragraph wrapper, if any, before parsing HTML to DOM.
 *
 * Removes a "<p>" at the very start of the string, and a "\n</p>" at the
 * end together with any trailing run of whitespace and of whatever
 * Utils::COMMENT_REGEXP_FRAGMENT matches (presumably HTML comments).
 *
 * @param string $ret HTML string, possibly wrapped in a paragraph
 * @return string The string with the wrapper removed; the input is
 *   returned unchanged if the regular expression engine reports an error.
 */
public static function stripPWrapper( string $ret ): string {
	$stripped = preg_replace(
		'#(^<p>)|(\n</p>(' . Utils::COMMENT_REGEXP_FRAGMENT . '|\s)*$)#D',
		'',
		$ret
	);
	// preg_replace() returns null on a PCRE error (e.g. a backtrack limit
	// hit on a very large input). Returning that null would violate the
	// declared string return type, so fall back to the unstripped input.
	return $stripped ?? $ret;
}
972 | } |