Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 183 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
ContentUtils | |
0.00% |
0 / 183 |
|
0.00% |
0 / 11 |
4422 | |
0.00% |
0 / 1 |
toXML | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
ppToXML | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
createAndLoadDocument | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocumentFragment | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
stripUnnecessaryWrappersAndSyntheticNodes | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
processAttributeEmbeddedDom | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 | |||
processAttributeEmbeddedHTML | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
342 | |||
shiftDSR | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
132 | |||
convertOffsets | |
0.00% |
0 / 44 |
|
0.00% |
0 / 1 |
72 | |||
dumpNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
dumpDOM | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
56 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Closure; |
7 | use Wikimedia\Assert\Assert; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
13 | use Wikimedia\Parsoid\DOM\Element; |
14 | use Wikimedia\Parsoid\DOM\Node; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
17 | |
18 | /** |
19 | * These utilities are for processing content that's generated |
20 | * by parsing source input (ex: wikitext) |
21 | */ |
22 | class ContentUtils { |
23 | |
/**
 * Serialize a node with XMLSerializer and hand back only the markup.
 *
 * @param Node $node Node to serialize.
 * @param array $options Forwarded unchanged to XMLSerializer::serialize().
 * @return string The 'html' entry of the serializer result.
 */
public static function toXML( Node $node, array $options = [] ): string {
	$serialized = XMLSerializer::serialize( $node, $options );
	return $serialized['html'];
}
34 | |
/**
 * Data-object aware XML serializer, intended for the DOM post-processing
 * phase: data attributes are stored on the nodes first, then the node is
 * serialized with ::toXML().
 *
 * @param Node $node
 * @param array $options
 *   Data attribute options; see DOMDataUtils::storeDataAttribs() for
 *   details. The same array is also forwarded to ::toXML().
 *
 *   Set `$options['fragment']` to true when serializing a
 *   DocumentFragment that is not connected to the parent document.
 *   Without it the owning document's bag is flagged as not loaded,
 *   which would be wrong if only a fragment was serialized.
 *
 *   Eventually most places which serialize using the `fragment` option
 *   should be converted to store the DocumentFragment natively, instead
 *   of as a string (T348161).
 *
 * @return string
 */
public static function ppToXML( Node $node, array $options = [] ): string {
	DOMDataUtils::visitAndStoreDataAttribs( $node, $options );

	$serializingFragment = $options['fragment'] ?? false;
	if ( !$serializingFragment ) {
		// A Document has no ownerDocument; in that case $node is the document.
		$owner = $node->ownerDocument ?? $node;
		DOMDataUtils::getBag( $owner )->loaded = false;
	}

	return self::toXML( $node, $options );
}
61 | |
/**
 * Parse HTML into a fresh, prepared document and load the data
 * attributes found in its body.
 *
 * Don't use this inside of the parser pipeline: it shouldn't be necessary
 * to create new documents when parsing or serializing. A document lives
 * on the environment which can be used to create fragments. The bag added
 * as a dynamic property to the PHP wrapper around the libxml doc
 * is at risk of being GC-ed.
 *
 * @param string $html
 * @param array $options Options for DOMDataUtils::visitAndLoadDataAttribs().
 *   `markNew` and `validateXMLNames` default to true when not supplied;
 *   `validateXMLNames` is also passed to DOMUtils::parseHTML().
 * @return Document
 */
public static function createAndLoadDocument(
	string $html, array $options = []
): Document {
	// Array union: caller-supplied keys win, defaults only fill the gaps.
	$defaults = [ 'markNew' => true, 'validateXMLNames' => true ];
	$options = $options + $defaults;

	$document = DOMUtils::parseHTML( $html, $options['validateXMLNames'] );
	DOMDataUtils::prepareDoc( $document );

	$body = DOMCompat::getBody( $document );
	DOMDataUtils::visitAndLoadDataAttribs( $body, $options );

	DOMDataUtils::getBag( $document )->loaded = true;
	return $document;
}
88 | |
/**
 * Build a DocumentFragment owned by $doc from an HTML string and load
 * the data attributes of the resulting nodes.
 *
 * @param Document $doc Document used to create the fragment.
 * @param string $html Markup assigned as the fragment's inner HTML.
 * @param array $options Forwarded to DOMDataUtils::visitAndLoadDataAttribs().
 * @return DocumentFragment
 */
public static function createAndLoadDocumentFragment(
	Document $doc, string $html, array $options = []
): DocumentFragment {
	$fragment = $doc->createDocumentFragment();
	DOMUtils::setFragmentInnerHTML( $fragment, $html );
	DOMDataUtils::visitAndLoadDataAttribs( $fragment, $options );
	return $fragment;
}
103 | |
/**
 * Remove Parsoid-inserted section wrappers, annotation wrappers, and
 * synthetic nodes (fallback id spans with HTML4 ids for headings,
 * auto-generated TOC metas and possibly other such in the future)
 * from the subtree below $node.
 *
 * Synthetic nodes are dropped outright; wrappers are replaced by their
 * (already cleaned) children.
 *
 * @param Element $node Root of the subtree to clean; $node itself is kept.
 */
public static function stripUnnecessaryWrappersAndSyntheticNodes( Element $node ): void {
	$cursor = $node->firstChild;
	while ( $cursor ) {
		// Advance before touching $current: it may be detached below.
		$current = $cursor;
		$cursor = $current->nextSibling;

		if ( !( $current instanceof Element ) ) {
			continue;
		}

		// Auto-generated synthetic meta tags
		$isAutoGeneratedMeta = DOMCompat::nodeName( $current ) === 'meta' &&
			( DOMDataUtils::getDataMw( $current )->autoGenerated ?? false );
		// ... and <span typeof='mw:FallbackId' ...></span>
		if ( $isAutoGeneratedMeta || WTUtils::isFallbackIdSpan( $current ) ) {
			$current->parentNode->removeChild( $current );
			continue;
		}

		// Clean the subtree first so that only stripped content gets hoisted
		self::stripUnnecessaryWrappersAndSyntheticNodes( $current );

		// <section> tags and synthetic extended-annotation-region wrappers
		$isWrapper = WTUtils::isParsoidSectionTag( $current ) ||
			DOMUtils::hasTypeOf( $current, 'mw:ExtendedAnnRange' );
		if ( $isWrapper ) {
			$parent = $current->parentNode;
			DOMUtils::migrateChildren( $current, $parent, $current );
			$parent->removeChild( $current );
		}
	}
}
139 | |
/**
 * Run a processor over content that is embedded in an element's data
 * attributes and therefore doesn't otherwise show up in the DOM.
 *
 * Ex: inline media captions that aren't rendered, language variant markup,
 * attributes that are transcluded. More scenarios might be added later.
 *
 * HTML-string embeddings are converted to a DocumentFragment, processed,
 * and only re-serialized when the processor reports a modification.
 *
 * @param ParsoidExtensionAPI $extAPI
 * @param Element $elt The node whose data attributes need to be examined
 * @param callable(DocumentFragment):bool $proc
 *   The processor that will process the embedded HTML.
 *   This processor will be provided a DocumentFragment
 *   and is expected to return true if that fragment was modified.
 */
public static function processAttributeEmbeddedDom(
	ParsoidExtensionAPI $extAPI, Element $elt, callable $proc
): void {
	// Adapter that lets the string-based API drive the fragment-based processor
	$viaFragment = static function ( string $html ) use ( $extAPI, $proc ): string {
		$fragment = $extAPI->htmlToDom( $html );
		if ( $proc( $fragment ) ) {
			return $extAPI->domToHtml( $fragment, true, true );
		}
		// Untouched: keep the original string
		return $html;
	};
	// @phan-suppress-next-line PhanDeprecatedFunction internal use
	self::processAttributeEmbeddedHTML( $extAPI, $elt, $viaFragment );

	// Inline media: the caption from data-mw is handed to the processor as is
	if ( WTUtils::isInlineMedia( $elt ) ) {
		$mediaCaption = DOMDataUtils::getDataMw( $elt )->caption ?? null;
		if ( $mediaCaption !== null ) {
			$proc( $mediaCaption );
		}
	}

	// Extension tags that declare embedded DocumentFragments
	$tagName = WTUtils::getExtTagName( $elt );
	if ( $tagName ) {
		$tagConfig = $extAPI->getSiteConfig()->getExtTagConfig( $tagName );
		$tagEmbedsDom = $tagConfig['options']['wt2html']['embedsDomInAttributes'] ?? false;
		if ( $tagEmbedsDom ) {
			$extAPI->getSiteConfig()->getExtTagImpl( $tagName )
				->processAttributeEmbeddedDom( $extAPI, $elt, $proc );
		}
	}

	// PFragment handlers that declare embedded DocumentFragments
	$handlerKey = WTUtils::getPFragmentHandlerKey( $elt );
	if ( $handlerKey ) {
		$handlerConfig = $extAPI->getSiteConfig()->getPFragmentHandlerConfig( $handlerKey );
		$handlerEmbedsDom = $handlerConfig['options']['embedsDomInAttributes'] ?? false;
		if ( $handlerEmbedsDom ) {
			$extAPI->getSiteConfig()->getPFragmentHandlerImpl( $handlerKey )
				->processAttributeEmbeddedDom( $extAPI, $elt, $proc );
		}
	}
}
190 | |
/**
 * Run a string processor over HTML that is embedded in an element's
 * data attributes and therefore doesn't otherwise show up in the DOM.
 *
 * Ex: inline media captions that aren't rendered, language variant markup,
 * attributes that are transcluded. More scenarios might be added later.
 *
 * @deprecated
 *   Don't use this directly: use ::processAttributeEmbeddedDom().
 *   This method may omit content which is embedded natively as
 *   DocumentFragments instead of as HTML strings.
 *
 * @param ParsoidExtensionAPI $extAPI
 * @param Element $elt The node whose data attributes need to be examined
 * @param Closure $proc The processor that will process the embedded HTML
 *   Signature: (string) -> string
 *   This processor will be provided the HTML string as input
 *   and is expected to return a possibly modified string.
 */
public static function processAttributeEmbeddedHTML(
	ParsoidExtensionAPI $extAPI, Element $elt, Closure $proc
): void {
	// Elements without a typeof attribute are skipped entirely
	if ( !$elt->hasAttribute( 'typeof' ) ) {
		return;
	}

	// Expanded attributes
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:ExpandedAttrs$/' ) ) {
		$dataMw = DOMDataUtils::getDataMw( $elt );
		if ( $dataMw->attribs ?? null ) {
			foreach ( $dataMw->attribs as $attrib ) {
				// Both the key and the value of a DataMwAttrib may carry HTML
				foreach ( [ 'key', 'value' ] as $field ) {
					if ( is_string( $attrib->$field ) || !isset( $attrib->$field['html'] ) ) {
						continue;
					}
					$attrib->$field['html'] = $proc( $attrib->$field['html'] );
				}
			}
		}
	}

	// Language variant markup
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:LanguageVariant$/' ) ) {
		$variant = DOMDataUtils::getJSONAttribute( $elt, 'data-mw-variant', null );
		if ( $variant ) {
			if ( isset( $variant->disabled ) ) {
				$variant->disabled->t = $proc( $variant->disabled->t );
			}
			foreach ( $variant->twoway ?? [] as $rule ) {
				$rule->t = $proc( $rule->t );
			}
			foreach ( $variant->oneway ?? [] as $rule ) {
				$rule->f = $proc( $rule->f );
				$rule->t = $proc( $rule->t );
			}
			if ( isset( $variant->filter ) ) {
				$variant->filter->t = $proc( $variant->filter->t );
			}
			// Write the processed structure back onto the element
			DOMDataUtils::setJSONAttribute( $elt, 'data-mw-variant', $variant );
		}
	}

	// Extension tags that declare embedded HTML get to process it themselves
	$tagName = WTUtils::getExtTagName( $elt );
	if ( $tagName ) {
		$tagConfig = $extAPI->getSiteConfig()->getExtTagConfig( $tagName );
		$tagEmbedsHtml = $tagConfig['options']['wt2html']['embedsHTMLInAttributes'] ?? false;
		if ( $tagEmbedsHtml ) {
			$extAPI->getSiteConfig()->getExtTagImpl( $tagName )
				->processAttributeEmbeddedHTML( $extAPI, $elt, $proc );
		}
	}
}
267 | |
/**
 * Shift the DOM Source Range (DSR) of a DOM fragment.
 *
 * Walks $rootNode with DOMPostOrder::traverse() and, for every Element,
 * replaces `dsr`, the temporary `origDSR`, and `extTagOffsets` (each only
 * if set) with the result of $dsrFunc. The same conversion is applied to
 * content embedded in attributes and to sealed DOM fragments.
 *
 * @param Env $env Used to look up sealed DOM fragments.
 * @param Node $rootNode Root of the subtree to convert.
 * @param callable $dsrFunc Receives a clone of the current range and returns
 *   the replacement. Returning null removes the property altogether.
 * @param ParsoidExtensionAPI $extAPI Used to reach attribute-embedded content.
 * @return Node Returns the $rootNode passed in to allow chaining.
 * @throws UnreachableException If an unsealed mw:DOMFragment node is found.
 */
public static function shiftDSR(
	Env $env, Node $rootNode, callable $dsrFunc, ParsoidExtensionAPI $extAPI
): Node {
	// Captured by reference so the visitor can recurse into embedded content
	$convertNode = static function ( Node $node ) use (
		$env, $extAPI, $dsrFunc, &$convertNode
	): void {
		if ( !( $node instanceof Element ) ) {
			return;
		}
		$dp = DOMDataUtils::getDataParsoid( $node );
		if ( isset( $dp->dsr ) ) {
			$dp->dsr = $dsrFunc( clone $dp->dsr );
			// We don't need to setDataParsoid because dp is not a copy

			// This is a bit of a hack, but we use this function to
			// clear DSR properties as well. See below as well.
			if ( $dp->dsr === null ) {
				unset( $dp->dsr );
			}
		}
		$tmp = $dp->getTemp();
		if ( isset( $tmp->origDSR ) ) {
			// Even though tmp shouldn't escape Parsoid, go ahead and
			// convert to enable hybrid testing.
			$tmp->origDSR = $dsrFunc( clone $tmp->origDSR );
			if ( $tmp->origDSR === null ) {
				unset( $tmp->origDSR );
			}
		}
		if ( isset( $dp->extTagOffsets ) ) {
			$dp->extTagOffsets = $dsrFunc( clone $dp->extTagOffsets );
			if ( $dp->extTagOffsets === null ) {
				unset( $dp->extTagOffsets );
			}
		}

		// Handle embedded HTML in attributes
		self::processAttributeEmbeddedDom(
			$extAPI, $node,
			static function ( DocumentFragment $df ) use ( $convertNode ): bool {
				DOMPostOrder::traverse( $df, $convertNode );
				// Always report the fragment as modified
				return true;
			} );

		// DOMFragments will have already been unpacked when DSR shifting is run
		if ( DOMUtils::hasTypeOf( $node, 'mw:DOMFragment' ) ) {
			throw new UnreachableException( "Shouldn't encounter these nodes here." );
		}

		// However, extensions can choose to handle sealed fragments whenever
		// they want and so may be returned in subpipelines which could
		// subsequently be shifted
		if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment/sealed/\w+$#D' ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			if ( $dp->html ?? null ) {
				$domFragment = $env->getDOMFragment( $dp->html );
				DOMPostOrder::traverse( $domFragment, $convertNode );
			}
		}
	};
	DOMPostOrder::traverse( $rootNode, $convertNode );
	return $rootNode; // chainable
}
340 | |
/**
 * Convert DSR offsets in a Document between utf-8/ucs2/codepoint
 * indices.
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * Works in two passes over the document body: the first gathers every
 * distinct offset, the second rewrites each range from the converted values.
 * The environment's current offset type is set to $to in all cases.
 *
 * @see TokenUtils::convertTokenOffsets for a related function on tokens.
 *
 * @param Env $env
 * @param Document $doc The document to convert
 * @param string $from Offset type to convert from.
 * @param string $to Offset type to convert to.
 */
public static function convertOffsets(
	Env $env,
	Document $doc,
	string $from,
	string $to
): void {
	$env->setCurrentOffsetType( $to );
	if ( $from === $to ) {
		// Same unit on both sides: the offsets are already right
		return;
	}

	// $boxes maps each distinct offset to a box object; $values holds
	// references to the boxed values, so converting $values in place
	// also updates what $boxes resolves to.
	$boxes = [];
	$values = [];
	$record = static function ( int $offset ) use ( &$boxes, &$values ) {
		if ( array_key_exists( $offset, $boxes ) ) {
			return;
		}
		$box = (object)[ 'value' => $offset ];
		$boxes[$offset] = $box;
		$values[] =& $box->value;
	};

	// Pass 1: collect DSR offsets throughout the document
	$collectDSR = static function ( DomSourceRange $range ) use ( $record ) {
		if ( $range->start !== null ) {
			$record( $range->start );
			$record( $range->innerStart() );
		}
		if ( $range->end !== null ) {
			$record( $range->innerEnd() );
			$record( $range->end );
		}
		// Hand the range back unchanged; this pass only observes
		return $range;
	};
	$body = DOMCompat::getBody( $doc );
	$extAPI = new ParsoidExtensionAPI( $env );
	self::shiftDSR( $env, $body, $collectDSR, $extAPI );
	if ( $values === [] ) {
		return; /* nothing to do (shouldn't really happen) */
	}

	// Convert every collected offset
	TokenUtils::convertOffsets(
		$env->topFrame->getSrcText(), $from, $to, $values
	);

	// Pass 2: rebuild each range from the converted offsets. Widths are
	// recomputed as differences between converted outer and inner offsets.
	$applyDSR = static function ( DomSourceRange $range ) use ( $boxes ) {
		$newStart = $range->start;
		$newOpenWidth = $range->openWidth;
		if ( $newStart !== null ) {
			$newStart = $boxes[$newStart]->value;
			$newOpenWidth = $boxes[$range->innerStart()]->value - $newStart;
		}

		$newEnd = $range->end;
		$newCloseWidth = $range->closeWidth;
		if ( $newEnd !== null ) {
			$newEnd = $boxes[$newEnd]->value;
			$newCloseWidth = $newEnd - $boxes[$range->innerEnd()]->value;
		}

		return new DomSourceRange(
			$newStart, $newEnd, $newOpenWidth, $newCloseWidth
		);
	};
	self::shiftDSR( $env, $body, $applyDSR, $extAPI );
}
418 | |
/**
 * Serialize a single node for a dump, with `saveData` enabled unless the
 * caller's options already define that key.
 *
 * @param Node $node
 * @param array $options Forwarded to ::toXML().
 * @return string
 */
private static function dumpNode( Node $node, array $options ): string {
	// Array union keeps an existing 'saveData' entry from $options
	$serializerOptions = $options + [ 'saveData' => true ];
	return self::toXML( $node, $serializerOptions );
}
427 | |
/**
 * Dump the DOM with attributes.
 *
 * The result is the serialized $rootNode, optionally framed by a title
 * header and a dashed footer, and optionally followed by every entry of
 * the environment's DOM fragment map.
 *
 * @param Node $rootNode
 * @param string $title Text shown in the header line.
 * @param array $options Associative array of options:
 *   - dumpFragmentMap: Dump the fragment map from env (requires `env`)
 *   - quiet: Suppress separators
 *
 *   storeDataAttribs options:
 *   - discardDataParsoid
 *   - keepTmp
 *   - storeInPageBundle
 *   - storeDiffMark
 *   - env
 *   - idIndex
 *
 *   XMLSerializer options:
 *   - smartQuote
 *   - innerXML
 *   - captureOffsets
 *   - addDoctype
 * @return string The dump result
 */
public static function dumpDOM(
	Node $rootNode, string $title = '', array $options = []
): string {
	$dumpFragments = !empty( $options['dumpFragmentMap'] );
	$showSeparators = empty( $options['quiet'] );

	if ( $dumpFragments ) {
		Assert::invariant( isset( $options['env'] ), "env should be set" );
	}

	$chunks = [];
	if ( $showSeparators ) {
		$chunks[] = "----- {$title} -----\n";
	}
	$chunks[] = self::dumpNode( $rootNode, $options ) . "\n";

	// Dump cached fragments
	if ( $dumpFragments ) {
		foreach ( $options['env']->getDOMFragmentMap() as $k => $fragment ) {
			// An array entry is dumped via its first element
			$fragmentRoot = is_array( $fragment ) ? $fragment[0] : $fragment;
			$chunks[] = str_repeat( '=', 15 ) . "\n";
			$chunks[] = "FRAGMENT {$k}\n";
			$chunks[] = self::dumpNode( $fragmentRoot, $options ) . "\n";
		}
	}

	if ( $showSeparators ) {
		// Footer as wide as the header: title plus 2x5 dashes and 2 spaces
		$chunks[] = str_repeat( '-', mb_strlen( $title ) + 12 ) . "\n";
	}
	return implode( '', $chunks );
}
482 | |
483 | } |