Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
29.03% |
63 / 217 |
|
44.44% |
28 / 63 |
CRAP | |
0.00% |
0 / 1 |
DOMUtils | |
29.03% |
63 / 217 |
|
44.44% |
28 / 63 |
7659.83 | |
0.00% |
0 / 1 |
parseHTML | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
20 | |||
visitDOM | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
migrateChildren | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
migrateChildrenBetweenDocs | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
assertElt | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
isRemexBlockNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
isWikitextBlockNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isFormattingElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isQuoteElt | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isBody | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isRemoved | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
pathToRoot | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
nodeDepth | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
pathToSibling | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
inSiblingOrder | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
isAncestorOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
findAncestorOfName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
hasNameOrHasAncestorOfName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
matchNameAndTypeOf | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
hasNameAndTypeOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
matchTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
matchRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
matchMultivalAttr | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
hasTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasClass | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
hasValueInMultivalAttr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
addTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
addRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
addValueToMultivalAttr | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
42 | |||
removeValueFromMultivalAttr | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
removeTypeOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
removeRel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isFosterablePosition | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isHeading | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isList | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isListOrListItem | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isNestedInListItem | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
isNestedListOrListItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isMarkerMeta | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
hasElementChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
hasBlockElementDescendant | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
5 | |||
isIEW | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isDocumentFragment | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
atTheTop | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
allChildrenAreTextOrComments | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
treeHasElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
isTableTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
selectMediaElt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
findHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
addHttpEquivHeaders | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
extractInlinedContentVersion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
addAttributes | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
appendToHead | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
getFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
setFragmentInnerHTML | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
parseHTMLToFragment | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isRawTextElement | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasBlockTag | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
attributes | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
isMetaDataTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stripPWrapper | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Core\ClientError; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Wikitext\Consts; |
15 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
16 | use Wikimedia\RemexHtml\DOM\DOMBuilder; |
17 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer; |
18 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; |
19 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
20 | |
21 | /** |
22 | * DOM utilities for querying the DOM. This is largely independent of Parsoid |
23 | * although some Parsoid details (TokenUtils, inline content version) |
24 | * have snuck in. |
25 | */ |
26 | class DOMUtils { |
27 | |
/**
 * Parse an HTML string with Remex and return the resulting tree.
 *
 * Input that does not begin with a doctype, <html> or <body> tag is
 * prefixed with "<body>" before it is tokenized.
 *
 * @param string $html
 * @param bool $validateXMLNames When true, throw if the DOM builder
 *   reports (via isCoerced()) that a name had to be coerced.
 * @return Document
 * @throws ClientError When $validateXMLNames is set and a name was coerced.
 */
public static function parseHTML(
	string $html, bool $validateXMLNames = false
): Document {
	if ( !preg_match( '/^<(?:!doctype|html|body)/i', $html ) ) {
		// Make sure that we parse fragments in the body. Otherwise comments,
		// link and meta tags end up outside the html element or in the head
		// elements.
		$html = '<body>' . $html;
	}

	$domBuilder = new class( [
		'suppressHtmlNamespace' => true,
	] ) extends DOMBuilder
	{
		/** @inheritDoc */
		protected function createDocument(
			// Explicitly nullable: relying on "= null" to make a typed
			// parameter nullable is deprecated as of PHP 8.4.
			?string $doctypeName = null,
			?string $public = null,
			?string $system = null
		) {
			// @phan-suppress-next-line PhanTypeMismatchReturn
			return DOMCompat::newDocument( false );
		}
	};
	$treeBuilder = new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] );
	$dispatcher = new Dispatcher( $treeBuilder );
	$tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] );
	$tokenizer->execute( [] );
	if ( $validateXMLNames && $domBuilder->isCoerced() ) {
		throw new ClientError( 'Encountered a name invalid in XML.' );
	}
	$frag = $domBuilder->getFragment();
	'@phan-var Document $frag'; // @var Document $frag
	return $frag;
}
70 | |
/**
 * Pre-order walk: call $handler on $node, then recurse into each child.
 *
 * This is a simplified version of the DOMTraverser.
 * Consider using that before making this more complex.
 *
 * FIXME: Move to DOMTraverser OR create a new class?
 * @param Node $node Root of the subtree to visit
 * @param callable $handler Invoked as $handler( $node, ...$args )
 * @param mixed ...$args Extra arguments forwarded to every handler call
 */
public static function visitDOM( Node $node, callable $handler, ...$args ): void {
	$handler( $node, ...$args );
	for ( $child = $node->firstChild; $child; $child = $following ) {
		// Read the sibling link before descending, so the walk continues
		// from the sibling the child had prior to being visited.
		$following = $child->nextSibling;
		self::visitDOM( $child, $handler, ...$args );
	}
}
89 | |
/**
 * Move every child of $from into $to, inserting each before $beforeNode.
 * When $beforeNode is null the children are appended at the end.
 *
 * @param Node $from Source node. Children will be removed.
 * @param Node $to Destination node. Children of $from will be added here
 * @param ?Node $beforeNode Add the children before this node.
 */
public static function migrateChildren(
	Node $from, Node $to, ?Node $beforeNode = null
): void {
	// Always re-read firstChild: each insertion takes the current first
	// child out of $from.
	for ( $child = $from->firstChild; $child; $child = $from->firstChild ) {
		$to->insertBefore( $child, $beforeNode );
	}
}
104 | |
/**
 * Deep-copy every child of $from into $to (which belongs to a different
 * document), inserting each copy before $beforeNode.
 *
 * The copies are created with importNode() on $to's owner document.
 * When $beforeNode is null the copies are appended at the end.
 *
 * @param Node $from
 * @param Node $to
 * @param ?Node $beforeNode
 */
public static function migrateChildrenBetweenDocs(
	Node $from, Node $to, ?Node $beforeNode = null
): void {
	$destDoc = $to->ownerDocument;
	for ( $src = $from->firstChild; $src; $src = $src->nextSibling ) {
		$copy = $destDoc->importNode( $src, true );
		$to->insertBefore( $copy, $beforeNode );
	}
}
124 | |
125 | // phpcs doesn't like @phan-assert... |
126 | // phpcs:disable MediaWiki.Commenting.FunctionAnnotations.UnrecognizedAnnotation |
127 | |
/**
 * Assert (via Assert::invariant) that $node is a DOM Element.
 * This is primarily to help phan analyze variable types.
 *
 * @phan-assert Element $node
 * @param ?Node $node
 * @return bool Always returns true
 */
public static function assertElt( ?Node $node ): bool {
	$isElement = $node instanceof Element;
	Assert::invariant( $isElement, "Expected an element" );
	return true;
}
140 | |
/**
 * True when $node is an Element whose name is not listed in
 * Consts::$HTML['OnlyInlineElements'] and which isMetaDataTag() rejects.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isRemexBlockNode( ?Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	if ( isset( Consts::$HTML['OnlyInlineElements'][DOMCompat::nodeName( $node )] ) ) {
		return false;
	}
	// This is a superset of \\MediaWiki\Tidy\RemexCompatMunger::$metadataElements
	return !self::isMetaDataTag( $node );
}
151 | |
/**
 * True when $node is non-null and TokenUtils::isWikitextBlockTag()
 * accepts its node name.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isWikitextBlockNode( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	return (bool)TokenUtils::isWikitextBlockTag( $name );
}
159 | |
/**
 * True when $node is non-null and its node name is a key of
 * Consts::$HTML['FormattingTags'].
 *
 * @param ?Node $node
 * @return bool
 */
public static function isFormattingElt( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['FormattingTags'][$name] );
}
168 | |
/**
 * True when $node is non-null and its node name is a key of
 * Consts::$WTQuoteTags.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isQuoteElt( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$WTQuoteTags[$name] );
}
177 | |
/**
 * True when $node is non-null and its node name is 'body'.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isBody( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	return DOMCompat::nodeName( $node ) === 'body';
}
186 | |
/**
 * Treat a node as removed when it is null, or when the object still
 * exists but its nodeType property is not set.
 *
 * @param ?Node $node
 * @return bool
 */
public static function isRemoved( ?Node $node ): bool {
	if ( !$node ) {
		return true;
	}
	return !isset( $node->nodeType );
}
195 | |
/**
 * Collect $node and each successive parentNode, in that order, until a
 * node without a parent is reached.
 *
 * @param Node $node
 * @return Node[] Path including all nodes from $node to the root of the document
 */
public static function pathToRoot( Node $node ): array {
	$ancestors = [];
	for ( $cur = $node; $cur; $cur = $cur->parentNode ) {
		$ancestors[] = $cur;
	}
	return $ancestors;
}
210 | |
/**
 * Count the parentNode links between $node and the top of its tree.
 * Root document is at depth 0, <html> at 1, <body> at 2.
 *
 * @param Node $node
 * @return int
 */
public static function nodeDepth( Node $node ): int {
	$depth = 0;
	for ( $ancestor = $node->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		$depth++;
	}
	return $depth;
}
225 | |
/**
 * Collect the nodes from $node up to, but not including, $sibling,
 * stepping through the sibling list in the requested direction.
 * If $sibling is never reached, the walk stops at the end of the list.
 *
 * @param Node $node
 * @param Node $sibling
 * @param bool $left When true, step with previousSibling; otherwise nextSibling.
 * @return Node[]
 */
public static function pathToSibling( Node $node, Node $sibling, bool $left ): array {
	$visited = [];
	$cur = $node;
	while ( $cur && $cur !== $sibling ) {
		$visited[] = $cur;
		if ( $left ) {
			$cur = $cur->previousSibling;
		} else {
			$cur = $cur->nextSibling;
		}
	}
	return $visited;
}
243 | |
/**
 * Check whether `n2` can be reached from `n1` by following nextSibling
 * links. Returns true when both arguments are the same node.
 *
 * @param Node $n1 The node you expect to come first.
 * @param Node $n2 Expected later sibling.
 * @return bool
 */
public static function inSiblingOrder( Node $n1, Node $n2 ): bool {
	for ( $cur = $n1; $cur; $cur = $cur->nextSibling ) {
		if ( $cur === $n2 ) {
			return true;
		}
	}
	return false;
}
258 | |
/**
 * Check whether `n1` can be reached from `n2` by following parentNode
 * links. Returns true if n1 === n2.
 *
 * @param Node $n1 The suspected ancestor.
 * @param Node $n2 The suspected descendant.
 * @return bool
 */
public static function isAncestorOf( Node $n1, Node $n2 ): bool {
	for ( $cur = $n2; $cur; $cur = $cur->parentNode ) {
		if ( $cur === $n1 ) {
			return true;
		}
	}
	return false;
}
275 | |
/**
 * Return the nearest strict ancestor of $node whose node name is $name.
 * $node itself is never a candidate.
 *
 * @param Node $node
 * @param string $name
 * @return ?Element The matching ancestor, or null when there is none
 */
public static function findAncestorOfName( Node $node, string $name ): ?Element {
	$node = $node->parentNode;
	while ( $node ) {
		if ( DOMCompat::nodeName( $node ) === $name ) {
			break;
		}
		$node = $node->parentNode;
	}
	'@phan-var Element $node'; // @var Element $node
	return $node;
}
291 | |
/**
 * True when $node itself is named $name, or findAncestorOfName() finds
 * an ancestor with that name.
 *
 * @param Node $node
 * @param string $name
 * @return bool
 */
public static function hasNameOrHasAncestorOfName( Node $node, string $name ): bool {
	if ( DOMCompat::nodeName( $node ) === $name ) {
		return true;
	}
	return self::findAncestorOfName( $node, $name ) !== null;
}
302 | |
/**
 * Match a node against a node name and a `typeof` pattern.
 * When the node name differs from $name the result is null; otherwise
 * the result is whatever matchTypeOf() returns for $typeRe.
 *
 * @param Node $n The node to test
 * @param string $name The expected nodeName of $n
 * @param string $typeRe Regular expression matching the expected value of
 *   `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchNameAndTypeOf( Node $n, string $name, string $typeRe ): ?string {
	if ( DOMCompat::nodeName( $n ) !== $name ) {
		return null;
	}
	return self::matchTypeOf( $n, $typeRe );
}
317 | |
/**
 * Determine whether the node matches the given nodeName and typeof
 * attribute value; the typeof is given as string.
 *
 * The literal $type is quoted and anchored, then handed to
 * matchNameAndTypeOf().
 *
 * @param Node $n
 * @param string $name node name to test for
 * @param string $type Expected value of "typeof" attribute (literal string)
 * @return bool True if the node matches.
 */
public static function hasNameAndTypeOf( Node $n, string $name, string $type ): bool {
	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches before a trailing newline, so a token
	// "$type\n" would count as a literal match.
	$literalRe = '/^' . preg_quote( $type, '/' ) . '$/D';
	return self::matchNameAndTypeOf( $n, $name, $literalRe ) !== null;
}
332 | |
/**
 * Match the node's `typeof` attribute against a pattern.
 * Delegates to matchMultivalAttr() with the attribute name 'typeof'.
 *
 * @param Node $n The node to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchTypeOf( Node $n, string $typeRe ): ?string {
	$matched = self::matchMultivalAttr( $n, 'typeof', $typeRe );
	return $matched;
}
345 | |
/**
 * Match the node's `rel` attribute against a pattern.
 * Delegates to matchMultivalAttr() with the attribute name 'rel'.
 *
 * @param Node $n The node to test
 * @param string $relRe Regular expression matching the expected value of
 *   the `rel` attribute.
 * @return ?string The matching `rel` value, or `null` if there is
 *   no match.
 */
public static function matchRel( Node $n, string $relRe ): ?string {
	$matched = self::matchMultivalAttr( $n, 'rel', $relRe );
	return $matched;
}
358 | |
/**
 * Return the first space-separated token of an attribute that matches
 * a regular expression.
 *
 * Non-Element nodes, and missing or empty attributes, yield null. Empty
 * tokens (from consecutive spaces) are skipped.
 *
 * @param Node $n The node to test
 * @param string $attrName the attribute to test (typically 'rel' or 'typeof')
 * @param string $valueRe Regular expression matching the expected value of
 *   the attribute.
 * @return ?string The matching attribute value, or `null` if there is
 *   no match.
 */
private static function matchMultivalAttr( Node $n, string $attrName, string $valueRe ): ?string {
	$raw = $n instanceof Element ? DOMCompat::getAttribute( $n, $attrName ) : null;
	if ( $raw === null || $raw === '' ) {
		return null;
	}
	foreach ( explode( ' ', $raw ) as $token ) {
		if ( $token !== '' ) {
			$hits = preg_match( $valueRe, $token );
			// preg_match() returns false when the pattern itself is bad.
			Assert::invariant( $hits !== false, "Bad regexp" );
			if ( $hits ) {
				return $token;
			}
		}
	}
	return null;
}
389 | |
/**
 * Check whether the node's `typeof` attribute contains a literal value.
 * Delegates to hasValueInMultivalAttr() with the attribute name 'typeof'.
 *
 * @param Node $n
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string.
 * @return bool True if the node matches.
 */
public static function hasTypeOf( Node $n, string $type ): bool {
	$found = self::hasValueInMultivalAttr( $n, 'typeof', $type );
	return $found;
}
401 | |
/**
 * Check whether the node's `rel` attribute contains a literal value.
 * Delegates to hasValueInMultivalAttr() with the attribute name 'rel'.
 *
 * @param Node $n
 * @param string $rel Expected value of "rel" attribute, as a literal string.
 * @return bool True if the node matches.
 */
public static function hasRel( Node $n, string $rel ): bool {
	$found = self::hasValueInMultivalAttr( $n, 'rel', $rel );
	return $found;
}
412 | |
/**
 * Test whether the element's `class` attribute contains a
 * whitespace-delimited token matching $regex. A missing attribute is
 * treated as the empty string.
 *
 * @param Element $element
 * @param string $regex Partial regular expression, e.g. "foo|bar"
 * @return bool
 */
public static function hasClass( Element $element, string $regex ): bool {
	$value = DOMCompat::getAttribute( $element, 'class' );
	// Wrap the caller's pattern in a non-capturing group so that a
	// top-level alternation stays between the two boundary assertions.
	// Ungrouped, "foo|bar" would parse as "(?<=^|\s)foo" OR "bar(?=\s|$)"
	// and match partial class names such as "foobaz" or "bazbar".
	return (bool)preg_match(
		'{(?<=^|\s)(?:' . $regex . ')(?=\s|$)}', $value ?? ''
	);
}
422 | |
/**
 * Check whether a space-separated attribute contains a literal value.
 *
 * Non-Element nodes, and missing or empty attributes, never match.
 *
 * @param Node $n
 * @param string $attrName name of the attribute to check (typically 'typeof', 'rel')
 * @param string $value Expected value of the $attrName attribute, as a literal string.
 * @return bool True if the node matches
 */
private static function hasValueInMultivalAttr( Node $n, string $attrName, string $value ): bool {
	$raw = $n instanceof Element ? DOMCompat::getAttribute( $n, $attrName ) : null;
	if ( $raw === null || $raw === '' ) {
		return false;
	}
	// Compare the whole attribute first; only split it into tokens when
	// that comparison fails.
	return $raw === $value || in_array( $value, explode( ' ', $raw ), true );
}
445 | |
/**
 * Add a value to the element's `typeof` attribute by delegating to
 * addValueToMultivalAttr(). This method should almost always be used
 * instead of `setAttribute`, to ensure we don't overwrite existing
 * typeof information.
 *
 * @param Element $node node
 * @param string $type type
 * @param bool $prepend If true, adds value to start, rather than end.
 *   Use of this option in new code is discouraged.
 */
public static function addTypeOf( Element $node, string $type, bool $prepend = false ): void {
	$attr = 'typeof';
	self::addValueToMultivalAttr( $node, $attr, $type, $prepend );
}
459 | |
/**
 * Add a value to the element's `rel` attribute by delegating to
 * addValueToMultivalAttr(); the value goes at the end. This method
 * should almost always be used instead of `setAttribute`, to ensure we
 * don't overwrite existing rel information.
 *
 * @param Element $node node
 * @param string $rel type
 */
public static function addRel( Element $node, string $rel ): void {
	$attr = 'rel';
	self::addValueToMultivalAttr( $node, $attr, $rel );
}
471 | |
/**
 * Add a value to a space-separated attribute (typeof, rel) without
 * discarding what is already there.
 *
 * A value that trims to the empty string is ignored, and so is one that
 * is already among the attribute's tokens. Otherwise the trimmed value
 * is joined to the existing attribute text with a single space.
 *
 * @param Element $node
 * @param string $attr
 * @param string $value
 * @param bool $prepend If true, adds value to start, rather than end
 */
private static function addValueToMultivalAttr(
	Element $node, string $attr, string $value, bool $prepend = false
): void {
	$value = trim( $value );
	if ( $value === '' ) {
		return;
	}
	$oldValue = DOMCompat::getAttribute( $node, $attr );
	$existing = $oldValue === null ? '' : trim( $oldValue );
	if ( $existing !== '' ) {
		if ( in_array( $value, explode( ' ', $existing ), true ) ) {
			return;
		}
		// The stored text is reused as-is (not the trimmed copy).
		if ( $prepend ) {
			$value = $value . ' ' . $oldValue;
		} else {
			$value = $oldValue . ' ' . $value;
		}
	}
	$node->setAttribute( $attr, $value );
}
499 | |
/**
 * Remove a value from a space-separated attribute.
 *
 * Every token equal to the trimmed $value is dropped. If no tokens
 * remain the attribute itself is removed; a missing or empty attribute
 * is left alone.
 *
 * @param Element $node node
 * @param string $attr The attribute name
 * @param string $value The value to remove
 */
private static function removeValueFromMultivalAttr(
	Element $node, string $attr, string $value
): void {
	$oldValue = DOMCompat::getAttribute( $node, $attr );
	if ( $oldValue === null || $oldValue === '' ) {
		return;
	}
	$remaining = array_diff( explode( ' ', $oldValue ), [ trim( $value ) ] );
	if ( count( $remaining ) === 0 ) {
		$node->removeAttribute( $attr );
		return;
	}
	$node->setAttribute( $attr, implode( ' ', $remaining ) );
}
521 | |
/**
 * Remove a value from the element's `typeof` attribute by delegating to
 * removeValueFromMultivalAttr().
 *
 * @param Element $node node
 * @param string $type type
 */
public static function removeTypeOf( Element $node, string $type ): void {
	$attr = 'typeof';
	self::removeValueFromMultivalAttr( $node, $attr, $type );
}
531 | |
/**
 * Remove a value from the element's `rel` attribute by delegating to
 * removeValueFromMultivalAttr().
 *
 * @param Element $node node
 * @param string $rel rel
 */
public static function removeRel( Element $node, string $rel ): void {
	$attr = 'rel';
	self::removeValueFromMultivalAttr( $node, $attr, $rel );
}
541 | |
/**
 * Check whether `node` is in a fosterable position, i.e. whether its
 * parent's node name is a key of Consts::$HTML['FosterablePosition'].
 *
 * A null node, or a node without a parent, is never in a fosterable
 * position.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isFosterablePosition( ?Node $n ): bool {
	// Guard the parent as well as the node: a detached node has no
	// parent, and every other caller in this class only hands
	// DOMCompat::nodeName() a non-null node.
	$parent = $n ? $n->parentNode : null;
	return $parent &&
		isset( Consts::$HTML['FosterablePosition'][DOMCompat::nodeName( $parent )] );
}
551 | |
/**
 * Check whether `node` is a heading: non-null, with a node name of
 * h1 through h6.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isHeading( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$name = DOMCompat::nodeName( $n );
	return preg_match( '/^h[1-6]$/D', $name ) === 1;
}
561 | |
/**
 * Check whether `node` is a list: non-null, with a node name that is a
 * key of Consts::$HTML['ListTags'].
 *
 * @param ?Node $n
 * @return bool
 */
public static function isList( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$name = DOMCompat::nodeName( $n );
	return isset( Consts::$HTML['ListTags'][$name] );
}
571 | |
/**
 * Check whether `node` is a list item: non-null, with a node name that
 * is a key of Consts::$HTML['ListItemTags'].
 *
 * @param ?Node $n
 * @return bool
 */
public static function isListItem( ?Node $n ): bool {
	if ( !$n ) {
		return false;
	}
	$name = DOMCompat::nodeName( $n );
	return isset( Consts::$HTML['ListItemTags'][$name] );
}
581 | |
/**
 * Check whether `node` satisfies isList() or isListItem().
 *
 * @param ?Node $n
 * @return bool
 */
public static function isListOrListItem( ?Node $n ): bool {
	if ( self::isList( $n ) ) {
		return true;
	}
	return self::isListItem( $n );
}
591 | |
/**
 * Check whether `node` is nested in a list item, i.e. whether any of
 * its strict ancestors satisfies isListItem().
 *
 * A null node is not nested in anything and yields false.
 *
 * @param ?Node $n
 * @return bool
 */
public static function isNestedInListItem( ?Node $n ): bool {
	// The signature allows null, so do not read ->parentNode off it.
	$ancestor = $n ? $n->parentNode : null;
	while ( $ancestor ) {
		if ( self::isListItem( $ancestor ) ) {
			return true;
		}
		$ancestor = $ancestor->parentNode;
	}
	return false;
}
608 | |
/**
 * Check whether `node` is a list or list item (isListOrListItem()) that
 * is itself nested in a list item (isNestedInListItem()).
 *
 * @param ?Node $n
 * @return bool
 */
public static function isNestedListOrListItem( ?Node $n ): bool {
	if ( !self::isListOrListItem( $n ) ) {
		return false;
	}
	return self::isNestedInListItem( $n );
}
618 | |
/**
 * Check whether a node is a `meta` with the given literal typeof, by
 * delegating to hasNameAndTypeOf().
 *
 * @param Node $n
 * @param string $type
 * @return bool
 */
public static function isMarkerMeta( Node $n, string $type ): bool {
	$isMarker = self::hasNameAndTypeOf( $n, 'meta', $type );
	return $isMarker;
}
629 | |
/**
 * Check whether at least one direct child of $node is an Element.
 *
 * @param Node $node
 * @return bool
 */
public static function hasElementChild( Node $node ): bool {
	$child = $node->firstChild;
	while ( $child ) {
		if ( $child instanceof Element ) {
			return true;
		}
		$child = $child->nextSibling;
	}
	return false;
}
644 | |
/**
 * Check whether any Element below $node, at any depth, satisfies
 * isWikitextBlockNode().
 *
 * @param Node $node
 * @return bool
 */
public static function hasBlockElementDescendant( Node $node ): bool {
	$child = $node->firstChild;
	while ( $child ) {
		if ( $child instanceof Element ) {
			// Is a block-level node
			if ( self::isWikitextBlockNode( $child ) ) {
				return true;
			}
			// or has a block-level child or grandchild or..
			if ( self::hasBlockElementDescendant( $child ) ) {
				return true;
			}
		}
		$child = $child->nextSibling;
	}
	return false;
}
662 | |
/**
 * Is a node representing inter-element whitespace, i.e. a Text node
 * whose value is empty or consists solely of whitespace?
 *
 * @param ?Node $node
 * @return bool
 */
public static function isIEW( ?Node $node ): bool {
	if ( !( $node instanceof Text ) ) {
		return false;
	}
	// ws-only
	return preg_match( '/^\s*$/D', $node->nodeValue ) === 1;
}
673 | |
/**
 * Is a node a document fragment, i.e. non-null with a nodeType of
 * XML_DOCUMENT_FRAG_NODE?
 *
 * @param ?Node $node
 * @return bool
 */
public static function isDocumentFragment( ?Node $node ): bool {
	if ( !$node ) {
		return false;
	}
	return $node->nodeType === XML_DOCUMENT_FRAG_NODE;
}
683 | |
/**
 * Is a node at the top, i.e. either a document fragment
 * (isDocumentFragment()) or the body element (isBody())?
 *
 * @param ?Node $node
 * @return bool
 */
public static function atTheTop( ?Node $node ): bool {
	if ( self::isDocumentFragment( $node ) ) {
		return true;
	}
	return self::isBody( $node );
}
693 | |
/**
 * Are all direct children of this node Text or Comment nodes?
 * A node without children yields true.
 *
 * @param Node $node
 * @return bool
 */
public static function allChildrenAreTextOrComments( Node $node ): bool {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( !( $child instanceof Text ) && !( $child instanceof Comment ) ) {
			return false;
		}
	}
	return true;
}
710 | |
/**
 * Check if the dom-subtree rooted at node has an element with tag name 'tagName'.
 * By default, the root node is not checked; descendants always are.
 *
 * @param Node $node The DOM node whose tree should be checked
 * @param string $tagName Tag name to look for
 * @param bool $checkRoot Should the root be checked?
 * @return bool
 */
public static function treeHasElement( Node $node, string $tagName, bool $checkRoot = false ): bool {
	if ( $checkRoot && DOMCompat::nodeName( $node ) === $tagName ) {
		return true;
	}

	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		// Each element child is the root of its own subtree, so it is
		// checked along with its descendants.
		if ( $child instanceof Element && self::treeHasElement( $child, $tagName, true ) ) {
			return true;
		}
	}
	return false;
}
736 | |
/**
 * Is node a table tag (table, tbody, td, tr, etc.)? Decided by looking
 * up the node name in Consts::$HTML['TableTags'].
 *
 * @param Node $node
 * @return bool
 */
public static function isTableTag( Node $node ): bool {
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['TableTags'][$name] );
}
746 | |
/**
 * Returns the result of querying `node` for the selector
 * 'img, video, audio', or null when nothing matches.
 *
 * @param Element $node
 * @return Element|null
 */
public static function selectMediaElt( Element $node ): ?Element {
	$selector = 'img, video, audio';
	return DOMCompat::querySelector( $node, $selector );
}
756 | |
/**
 * Extract http-equiv headers from the HTML, including content-language and
 * vary headers, if present.
 *
 * Every <meta> carrying both `http-equiv` and `content` contributes one
 * entry, keyed by the lower-cased http-equiv value. Where a name occurs
 * more than once, the later element wins.
 *
 * @param Document $doc
 * @return array<string,string>
 */
public static function findHttpEquivHeaders( Document $doc ): array {
	$found = [];
	foreach ( DOMCompat::querySelectorAll( $doc, 'meta[http-equiv][content]' ) as $meta ) {
		$headerName = strtolower( DOMCompat::getAttribute( $meta, 'http-equiv' ) );
		$found[$headerName] = DOMCompat::getAttribute( $meta, 'content' );
	}
	return $found;
}
774 | |
/**
 * Add or replace http-equiv headers in the HTML <head>.
 * This is used for content-language and vary headers, among possible
 * others.
 *
 * For each header, an existing <meta http-equiv> with a matching name is
 * reused; otherwise a new one is created through appendToHead().
 *
 * @param Document $doc The HTML document to update
 * @param array<string,string|string[]> $headers An array mapping HTTP
 *   header names (which are case-insensitive) to new values. If an
 *   array of values is provided, they will be joined with commas.
 */
public static function addHttpEquivHeaders( Document $doc, array $headers ): void {
	foreach ( $headers as $key => $value ) {
		$content = is_array( $value ) ? implode( ',', $value ) : $value;
		// HTTP header names are case-insensitive; hence the "i" suffix
		// on this selector query.
		$selector = "meta[http-equiv=\"{$key}\"i]";
		$meta = DOMCompat::querySelector( $doc, $selector );
		if ( !$meta ) {
			// This also ensures there is a <head> element.
			$meta = self::appendToHead( $doc, 'meta', [ 'http-equiv' => $key ] );
		}
		$meta->setAttribute( 'content', $content );
	}
}
800 | |
/**
 * Read the content version inlined in the document, taken from the
 * `content` attribute of a <meta> element whose `property` is either
 * "mw:htmlVersion" or "mw:html:version".
 *
 * @param Document $doc
 * @return string|null The `content` attribute of the matching <meta>,
 *   or null when no such element is found
 */
public static function extractInlinedContentVersion( Document $doc ): ?string {
	$selector = 'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]';
	$meta = DOMCompat::querySelector( $doc, $selector );
	if ( !$meta ) {
		return null;
	}
	return DOMCompat::getAttribute( $meta, 'content' );
}
810 | |
/**
 * Set a batch of attributes on an element.
 *
 * Entries whose value is null are skipped. The `id` attribute is routed
 * through DOMCompat::setIdAttribute(); every other attribute is set
 * directly on the element.
 *
 * @param Element $elt Element to update
 * @param array $attrs Map of attribute name to value (null values are ignored)
 */
public static function addAttributes( Element $elt, array $attrs ): void {
	foreach ( $attrs as $name => $val ) {
		if ( $val === null ) {
			continue;
		}
		if ( $name === 'id' ) {
			DOMCompat::setIdAttribute( $elt, $val );
			continue;
		}
		$elt->setAttribute( $name, $val );
	}
}
828 | |
/**
 * Create an element with the given attributes and append it to the
 * document head. When the document has no head yet, a <head> element is
 * created and inserted into the document element before the body.
 *
 * @param Document $document
 * @param string $tagName Tag name of the element to create
 * @param array $attrs Attributes to set on the new element
 * @return Element The newly-appended Element
 */
public static function appendToHead( Document $document, string $tagName, array $attrs = [] ): Element {
	$newElt = $document->createElement( $tagName );
	self::addAttributes( $newElt, $attrs );

	$headElt = DOMCompat::getHead( $document );
	if ( !$headElt ) {
		$headElt = $document->createElement( 'head' );
		$bodyElt = DOMCompat::getBody( $document );
		$document->documentElement->insertBefore( $headElt, $bodyElt );
	}

	$headElt->appendChild( $newElt );
	return $newElt;
}
851 | |
/**
 * Serialize the contents of a DocumentFragment.
 *
 * innerHTML and outerHTML are not defined on DocumentFragment, so this
 * plays the role that DOMCompat::getInnerHTML() plays for elements.
 *
 * @param DocumentFragment $frag
 * @return string The 'html' entry of the XMLSerializer result, produced
 *   with the 'innerXML' option enabled
 */
public static function getFragmentInnerHTML( DocumentFragment $frag ): string {
	$serialized = XMLSerializer::serialize( $frag, [ 'innerXML' => true ] );
	return $serialized['html'];
}
867 | |
/**
 * Parse an HTML string and move the resulting nodes into a fragment.
 *
 * innerHTML and outerHTML are not defined on DocumentFragment.
 * The HTML is parsed into a scratch <body> element owned by the
 * fragment's document, and that element's children are then migrated
 * into the fragment. No explicit clearing of `$frag` is done here.
 *
 * @see DOMCompat::setInnerHTML() for the Element version
 *
 * @param DocumentFragment $frag Fragment receiving the parsed nodes
 * @param string $html HTML source to parse
 */
public static function setFragmentInnerHTML( DocumentFragment $frag, string $html ): void {
	// FIXME: This should be an HTML5 template element
	$scratch = $frag->ownerDocument->createElement( 'body' );
	DOMCompat::setInnerHTML( $scratch, $html );
	self::migrateChildren( $scratch, $frag );
}
883 | |
/**
 * Create a new DocumentFragment in `$doc` and fill it from an HTML string.
 *
 * @param Document $doc Document that will own the fragment
 * @param string $html HTML source to parse into the fragment
 * @return DocumentFragment The newly created fragment
 */
public static function parseHTMLToFragment( Document $doc, string $html ): DocumentFragment {
	$fragment = $doc->createDocumentFragment();
	self::setFragmentInnerHTML( $fragment, $html );
	return $fragment;
}
896 | |
/**
 * Check whether the node's name is listed in the 'RawTextElements'
 * table of Consts::$HTML.
 *
 * @param Node $node
 * @return bool
 */
public static function isRawTextElement( Node $node ): bool {
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['RawTextElements'][$name] );
}
904 | |
/**
 * Is 'n' a block tag, or does the subtree rooted at 'n' contain one?
 *
 * The check is a depth-first walk: the node itself is tested with
 * isRemexBlockNode(), then each child subtree in document order, and the
 * walk stops at the first hit.
 *
 * @param Node $n Root of the subtree to inspect
 * @return bool
 */
public static function hasBlockTag( Node $n ): bool {
	if ( self::isRemexBlockNode( $n ) ) {
		return true;
	}
	for ( $child = $n->firstChild; $child; $child = $child->nextSibling ) {
		if ( self::hasBlockTag( $child ) ) {
			return true;
		}
	}
	return false;
}
925 | |
/**
 * Build an associative array of an element's attributes, suitable for
 * serialization.
 *
 * The xmlns attribute is looked up explicitly and added first when it is
 * available. This works around PHP's surprising behavior with the xmlns
 * attribute: HTML is *not* an XML document, but various parts of PHP
 * (including our misnamed XMLSerializer) pretend that it is, sort of.
 * Entries from the element's attribute list are then added, overwriting
 * any earlier entry with the same name.
 *
 * @param Element $element
 * @return array<string,string> Map of attribute name to attribute value
 * @see https://phabricator.wikimedia.org/T235295
 */
public static function attributes( Element $element ): array {
	// The 'xmlns' attribute is "invisible" T235295
	$xmlns = DOMCompat::getAttribute( $element, 'xmlns' );
	$attrs = $xmlns !== null ? [ 'xmlns' => $xmlns ] : [];
	foreach ( $element->attributes as $attribute ) {
		$attrs[$attribute->name] = $attribute->value;
	}
	return $attrs;
}
950 | |
/**
 * Check whether the element's name is listed in the 'MetaDataTags'
 * table of Consts::$HTML.
 *
 * @param Element $node
 * @return bool
 */
public static function isMetaDataTag( Element $node ): bool {
	$name = DOMCompat::nodeName( $node );
	return isset( Consts::$HTML['MetaDataTags'][$name] );
}
958 | |
/**
 * Strip a paragraph wrapper, if any, before parsing HTML to DOM.
 *
 * Removes a `<p>` at the very start of the string and a `\n</p>` at the
 * very end, where the closing tag may be followed by any mix of
 * whitespace and text matching Utils::COMMENT_REGEXP_FRAGMENT
 * (presumably HTML comments). Each part is removed independently of
 * the other.
 *
 * @param string $ret HTML string that may be wrapped in a paragraph
 * @return string The string without the wrapper; the input is returned
 *   unchanged if the regular expression engine reports a failure
 */
public static function stripPWrapper( string $ret ): string {
	$pattern = '#(^<p>)|(\n</p>(' . Utils::COMMENT_REGEXP_FRAGMENT . '|\s)*$)#D';
	// preg_replace() returns null on a PCRE error (e.g. backtrack limit hit
	// on a large input). Fall back to the unmodified input instead of
	// violating the declared string return type with a TypeError.
	return preg_replace( $pattern, '', $ret ) ?? $ret;
}
965 | } |