Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
81.37% |
131 / 161 |
|
68.42% |
13 / 19 |
CRAP | |
0.00% |
0 / 1 |
DOMBuilder | |
81.37% |
131 / 161 |
|
68.42% |
13 / 19 |
81.52 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
rethrowIfNotDomException | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
getFragment | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
isCoerced | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
startDocument | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
createDocument | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
3.07 | |||
endDocument | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
insertNode | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
coerceName | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
createNode | |
78.57% |
33 / 42 |
|
0.00% |
0 / 1 |
13.42 | |||
characters | |
83.33% |
15 / 18 |
|
0.00% |
0 / 1 |
7.23 | |||
insertElement | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
endTag | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doctype | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
comment | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
error | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
mergeAttributes | |
54.84% |
17 / 31 |
|
0.00% |
0 / 1 |
28.57 | |||
removeNode | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
reparentChildren | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikimedia\RemexHtml\DOM; |
4 | |
5 | use Wikimedia\RemexHtml\HTMLData; |
6 | use Wikimedia\RemexHtml\Tokenizer\Attributes; |
7 | use Wikimedia\RemexHtml\TreeBuilder\Element; |
8 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
9 | use Wikimedia\RemexHtml\TreeBuilder\TreeHandler; |
10 | |
/**
 * A TreeHandler which constructs a DOMDocument.
 *
 * Note that this class permits third-party `DOMImplementation`s
 * (documents other than `\DOMDocument`, nodes other than `\DOMNode`,
 * etc) and so no enforced PHP type hints are used which name these
 * classes directly. For the sake of static type checking, the
 * types *in comments* are given as if the standard PHP `\DOM*`
 * classes are being used but at runtime everything is duck-typed.
 */
class DOMBuilder implements TreeHandler {

	/** @var string|null The name of the input document type */
	public $doctypeName;

	/** @var string|null The public ID */
	public $public;

	/** @var string|null The system ID */
	public $system;

	/**
	 * @var int The quirks mode. May be either TreeBuilder::NO_QUIRKS,
	 *   TreeBuilder::LIMITED_QUIRKS or TreeBuilder::QUIRKS to indicate
	 *   no-quirks mode, limited-quirks mode or quirks mode respectively.
	 */
	public $quirks;

	/** @var \DOMDocument */
	private $doc;

	/** @var callable|null */
	private $errorCallback;

	/** @var bool */
	private $suppressHtmlNamespace;

	/** @var bool */
	private $suppressIdAttribute;

	/** @var \DOMImplementation */
	private $domImplementation;

	/** @var class-string */
	private $domExceptionClass;

	/** @var bool */
	private $isFragment;

	/** @var bool */
	private $coerced = false;

	/**
	 * @param array $options An associative array of options:
	 *   - errorCallback : A function which is called on parse errors
	 *   - suppressHtmlNamespace : omit the namespace when creating HTML
	 *     elements. False by default.
	 *   - suppressIdAttribute : don't call the nonstandard
	 *     DOMElement::setIdAttribute() method while constructing elements.
	 *     False by default (this method is needed for efficient
	 *     DOMDocument::getElementById() calls).  Set to true if you are
	 *     using a W3C spec-compliant DOMImplementation and wish to avoid
	 *     nonstandard calls.
	 *   - domImplementation: The DOMImplementation object to use.  If this
	 *     parameter is missing or null, a new DOMImplementation object will
	 *     be constructed using the `domImplementationClass` option value.
	 *     You can use a third-party DOM implementation by passing in an
	 *     appropriately duck-typed object here.
	 *   - domImplementationClass: The string name of the DOMImplementation
	 *     class to use.  Defaults to `\DOMImplementation::class` but
	 *     you can use a third-party DOM implementation by passing
	 *     an alternative class name here.
	 *   - domExceptionClass: The string name of the DOMException
	 *     class to use.  Defaults to `\DOMException::class` but
	 *     you can use a third-party DOM implementation by passing
	 *     an alternative class name here.
	 */
	public function __construct( $options = [] ) {
		$options += [
			'suppressHtmlNamespace' => false,
			'suppressIdAttribute' => false,
			'errorCallback' => null,
			'domImplementation' => null,
			'domImplementationClass' => \DOMImplementation::class,
			'domExceptionClass' => \DOMException::class,
		];
		$this->errorCallback = $options['errorCallback'];
		$this->suppressHtmlNamespace = $options['suppressHtmlNamespace'];
		$this->suppressIdAttribute = $options['suppressIdAttribute'];
		$this->domExceptionClass = $options['domExceptionClass'];

		// Only instantiate the implementation class when no ready-made
		// object was supplied.
		$impl = $options['domImplementation'];
		if ( $impl === null ) {
			$implClass = $options['domImplementationClass'];
			$impl = new $implClass();
		}
		$this->domImplementation = $impl;
	}

	/**
	 * Return normally if $t is an instance of the configured DOM exception
	 * class, so that the caller can proceed with its fallback; otherwise
	 * rethrow $t unchanged.
	 *
	 * @param \Throwable $t
	 * @throws \Throwable
	 */
	private function rethrowIfNotDomException( \Throwable $t ) {
		if ( !is_a( $t, $this->domExceptionClass, false ) ) {
			throw $t;
		}
	}

	/**
	 * Get the constructed document or document fragment. In the fragment case,
	 * a DOMElement is returned, and the caller is expected to extract its
	 * inner contents, ignoring the wrapping element. This convention is
	 * convenient because the wrapping element gives libxml somewhere to put
	 * its namespace declarations. If we copied the children into a
	 * DOMDocumentFragment, libxml would invent new prefixes for the orphaned
	 * namespaces.
	 *
	 * @return \DOMNode
	 */
	public function getFragment() {
		return $this->isFragment ? $this->doc->documentElement : $this->doc;
	}

	/**
	 * Returns true if the document was coerced due to libxml limitations. We
	 * follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset".
	 *
	 * @return bool
	 */
	public function isCoerced() {
		return $this->coerced;
	}

	/**
	 * Start a new parse: record whether a fragment is being parsed (a
	 * non-null fragment namespace) and create a fresh, doctype-less document.
	 *
	 * @inheritDoc
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		$this->isFragment = $fragmentNamespace !== null;
		$this->doc = $this->createDocument();
	}

	/**
	 * Create an empty UTF-8 document, with a doctype node when a non-empty
	 * doctype name is given.
	 *
	 * An empty-string doctype name is not passed to the DOM implementation:
	 * the document is created without a doctype and flagged as coerced.
	 *
	 * @param string|null $doctypeName
	 * @param string|null $public
	 * @param string|null $system
	 * @return \DOMDocument
	 * @suppress PhanTypeMismatchArgumentInternalReal
	 *   Null args to DOMImplementation::createDocument
	 */
	protected function createDocument(
		?string $doctypeName = null,
		?string $public = null,
		?string $system = null
	) {
		$impl = $this->domImplementation;
		if ( $doctypeName === '' ) {
			$this->coerced = true;
			$doctypeName = null;
		}
		if ( $doctypeName === null ) {
			$doc = $impl->createDocument( null, '' );
		} else {
			$doctype = $impl->createDocumentType( $doctypeName, $public, $system );
			$doc = $impl->createDocument( null, '', $doctype );
		}
		$doc->encoding = 'UTF-8';
		return $doc;
	}

	/**
	 * Nothing to finalise; the document is available via getFragment().
	 *
	 * @inheritDoc
	 */
	public function endDocument( $pos ) {
	}

	/**
	 * Translate a (preposition, reference element) pair into the DOM parent
	 * node and the reference child to insert before (null meaning "append").
	 *
	 * @param int $preposition TreeBuilder::ROOT, TreeBuilder::BEFORE, or
	 *   anything else, which is treated as "under the reference element"
	 * @param Element|null $refElement
	 * @return array [ parent node, reference node or null ]
	 */
	private function resolveInsertionPoint( $preposition, $refElement ) {
		if ( $preposition === TreeBuilder::ROOT ) {
			return [ $this->doc, null ];
		}
		if ( $preposition === TreeBuilder::BEFORE ) {
			$refNode = $refElement->userData;
			return [ $refNode->parentNode, $refNode ];
		}
		return [ $refElement->userData, null ];
	}

	/**
	 * Insert a DOM node at the location described by $preposition and
	 * $refElement.
	 *
	 * @param int $preposition
	 * @param Element|null $refElement
	 * @param \DOMNode $node
	 */
	protected function insertNode( $preposition, $refElement, $node ) {
		[ $parent, $refNode ] = $this->resolveInsertionPoint( $preposition, $refElement );
		// @phan-suppress-next-line PhanTypeMismatchArgumentInternal
		$parent->insertBefore( $node, $refNode );
	}

	/**
	 * Replace unsupported characters with a code of the form U123456.
	 * Marks the document as coerced if the name had to be changed.
	 *
	 * @param string $name
	 * @return string
	 */
	private function coerceName( $name ) {
		$coercedName = DOMUtils::coerceName( $name );
		if ( $name !== $coercedName ) {
			$this->coerced = true;
		}
		return $coercedName;
	}

	/**
	 * Create an element with exactly the given name, either without a
	 * namespace or in the given namespace. May throw a DOM exception if the
	 * name is rejected by the DOM implementation.
	 *
	 * @param string|null $namespace
	 * @param string $name
	 * @param bool $noNS Whether to create the element without a namespace
	 * @return \DOMElement
	 */
	private function createElementNode( $namespace, $name, $noNS ) {
		if ( $noNS ) {
			return $this->doc->createElement( $name );
		}
		return $this->doc->createElementNS( $namespace, $name );
	}

	/**
	 * Set a null-namespace attribute whose local name contains a colon.
	 *
	 * The value is attached as an explicit text node child instead of being
	 * assigned to DOMAttr::$value, to work around the DOMAttr entity
	 * expansion bug (T324408). The text node is obtained from the document
	 * rather than via `new \DOMText`, so that third-party DOM
	 * implementations are not forced to depend on the PHP dom extension.
	 *
	 * If the DOM implementation rejects the name, the attribute is created
	 * again under a coerced name.
	 *
	 * @param \DOMElement $node The element receiving the attribute
	 * @param object $attr Attribute object as yielded by
	 *   Attributes::getObjects(); ->localName and ->value are read
	 * @return \DOMAttr|null|false The return value of setAttributeNodeNS(),
	 *   which mergeAttributes() treats as the attribute that was replaced
	 */
	private function setPrefixedNullNsAttribute( $node, $attr ) {
		$textNode = $this->doc->createTextNode( $attr->value );
		try {
			// FIXME: this apparently works to create a prefixed localName
			// in the null namespace, but this is probably taking advantage
			// of a bug in PHP's DOM library, and screws up in various
			// interesting ways. For example, attributes created in this
			// way can't be discovered via hasAttribute() or hasAttributeNS().
			$attrNode = $this->doc->createAttribute( $attr->localName );
			$attrNode->appendChild( $textNode );
			return $node->setAttributeNodeNS( $attrNode );
		} catch ( \Throwable $e ) {
			$this->rethrowIfNotDomException( $e );
			$attrNode = $this->doc->createAttribute(
				$this->coerceName( $attr->localName ) );
			$attrNode->appendChild( $textNode );
			return $node->setAttributeNodeNS( $attrNode );
		}
	}

	/**
	 * Create the DOM element for a TreeBuilder Element, copy its attributes
	 * onto it, and store the node in $element->userData.
	 *
	 * Element and attribute names rejected by the DOM implementation are
	 * retried with a coerced name.
	 *
	 * @param Element $element
	 * @return \DOMElement
	 */
	protected function createNode( Element $element ) {
		$noNS = $this->suppressHtmlNamespace && $element->namespace === HTMLData::NS_HTML;
		try {
			$node = $this->createElementNode( $element->namespace, $element->name, $noNS );
		} catch ( \Throwable $e ) {
			$this->rethrowIfNotDomException( $e );
			// Attempt to escape the name so that it is more acceptable
			$node = $this->createElementNode(
				$element->namespace,
				$this->coerceName( $element->name ),
				$noNS
			);
		}

		foreach ( $element->attrs->getObjects() as $attr ) {
			if ( $attr->namespaceURI === null
				&& strpos( $attr->localName, ':' ) !== false
			) {
				$this->setPrefixedNullNsAttribute( $node, $attr );
				continue;
			}
			try {
				$node->setAttributeNS(
					$attr->namespaceURI,
					$attr->qualifiedName,
					$attr->value );
			} catch ( \Throwable $e ) {
				$this->rethrowIfNotDomException( $e );
				$node->setAttributeNS(
					$attr->namespaceURI,
					$this->coerceName( $attr->qualifiedName ),
					$attr->value );
			}
		}
		if ( ( !$this->suppressIdAttribute ) && $node->hasAttribute( 'id' ) ) {
			// This is a call to a non-standard DOM method required by PHP in
			// order to implement DOMDocument::getElementById() efficiently.
			$node->setIdAttribute( 'id', true );
		}
		$element->userData = $node;
		return $node;
	}

	/**
	 * Insert the text substr( $text, $start, $length ) at the given location,
	 * appending to an immediately preceding text node when there is one.
	 * Text whose insertion location is the document itself is dropped.
	 *
	 * @inheritDoc
	 */
	public function characters( $preposition, $refElement, $text, $start, $length,
		$sourceStart, $sourceLength
	) {
		[ $parent, $refNode ] = $this->resolveInsertionPoint( $preposition, $refElement );
		// https://html.spec.whatwg.org/#insert-a-character
		// If the adjusted insertion location is in a Document node, then
		// return.
		if ( $parent === $this->doc ) {
			return;
		}
		$data = substr( $text, $start, $length );
		// If there is a Text node immediately before the adjusted insertion
		// location, then append data to that Text node's data.
		$prev = $refNode === null ? $parent->lastChild : $refNode->previousSibling;
		if ( $prev !== null && $prev->nodeType === XML_TEXT_NODE ) {
			'@phan-var \DOMCharacterData $prev'; /** @var \DOMCharacterData $prev */
			$prev->appendData( $data );
		} else {
			$node = $this->doc->createTextNode( $data );
			$this->insertNode( $preposition, $refElement, $node );
		}
	}

	/**
	 * Insert the DOM node for $element, creating it first unless
	 * $element->userData already holds one.
	 *
	 * @inheritDoc
	 */
	public function insertElement( $preposition, $refElement, Element $element, $void,
		$sourceStart, $sourceLength
	) {
		$node = $element->userData ?: $this->createNode( $element );
		$this->insertNode( $preposition, $refElement, $node );
	}

	/**
	 * No DOM work is needed when an element is closed.
	 *
	 * @inheritDoc
	 */
	public function endTag( Element $element, $sourceStart, $sourceLength ) {
	}

	/**
	 * Record the doctype details on the public properties. If nothing has
	 * been inserted into the document yet, the document is recreated so that
	 * it carries the doctype.
	 *
	 * @inheritDoc
	 */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
		if ( !$this->doc->firstChild ) {
			$this->doc = $this->createDocument( $name, $public, $system );
		}
		$this->doctypeName = $name;
		$this->public = $public;
		$this->system = $system;
		$this->quirks = $quirks;
	}

	/**
	 * Create a comment node and insert it at the given location.
	 *
	 * @inheritDoc
	 */
	public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
		$node = $this->doc->createComment( $text );
		$this->insertNode( $preposition, $refElement, $node );
	}

	/**
	 * Forward a parse error to the errorCallback option, if one was given.
	 *
	 * @inheritDoc
	 */
	public function error( $text, $pos ) {
		if ( $this->errorCallback ) {
			( $this->errorCallback )( $text, $pos );
		}
	}

	/**
	 * Add the given attributes to the existing node of $element, leaving any
	 * attribute that is already present on the node untouched.
	 *
	 * @inheritDoc
	 */
	public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
		$node = $element->userData;
		'@phan-var \DOMElement $node'; /** @var \DOMElement $node */
		foreach ( $attrs->getObjects() as $attr ) {
			if ( $attr->namespaceURI === null
				&& strpos( $attr->localName, ':' ) !== false
			) {
				// hasAttribute() cannot see attributes of this kind (see the
				// FIXME in setPrefixedNullNsAttribute()), so set the new
				// attribute unconditionally and use the return value of
				// setAttributeNodeNS() to detect a pre-existing one.
				$replaced = $this->setPrefixedNullNsAttribute( $node, $attr );
				if ( $replaced ) {
					// Put it back how it was
					$node->setAttributeNodeNS( $replaced );
				}
			} elseif ( $attr->namespaceURI === null ) {
				try {
					if ( !$node->hasAttribute( $attr->localName ) ) {
						$node->setAttribute( $attr->localName, $attr->value );
					}
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					$coercedName = $this->coerceName( $attr->localName );
					if ( !$node->hasAttribute( $coercedName ) ) {
						$node->setAttribute( $coercedName, $attr->value );
					}
				}
			} else {
				try {
					if ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) {
						$node->setAttributeNS( $attr->namespaceURI,
							$attr->localName, $attr->value );
					}
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					$coercedName = $this->coerceName( $attr->localName );
					if ( !$node->hasAttributeNS( $attr->namespaceURI, $coercedName ) ) {
						$node->setAttributeNS( $attr->namespaceURI, $coercedName, $attr->value );
					}
				}
			}
		}
	}

	/**
	 * Detach the node of $element from its parent.
	 *
	 * @inheritDoc
	 */
	public function removeNode( Element $element, $sourceStart ) {
		$node = $element->userData;
		$node->parentNode->removeChild( $node );
	}

	/**
	 * Insert $newParent as the last child of $element, then move every
	 * child that precedes it into $newParent, preserving order.
	 *
	 * @inheritDoc
	 */
	public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
		$this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
		$node = $element->userData;
		/** @var \DOMElement $newParentNode */
		$newParentNode = $newParent->userData;
		'@phan-var \DOMElement $newParentNode';
		// The new parent was appended last, so once it becomes the first
		// child all of the original children have been moved.
		while ( $node->firstChild !== $newParentNode ) {
			$firstChild = $node->firstChild;
			'@phan-var \DOMNode $firstChild';
			$newParentNode->appendChild( $firstChild );
		}
	}
}