Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 175 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
ContentUtils | |
0.00% |
0 / 175 |
|
0.00% |
0 / 12 |
3906 | |
0.00% |
0 / 1 |
toXML | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
ppToXML | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
createDocument | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocument | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocumentFragment | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
extractDpAndSerialize | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
stripUnnecessaryWrappersAndSyntheticNodes | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
processAttributeEmbeddedHTML | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
420 | |||
shiftDSR | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
132 | |||
convertOffsets | |
0.00% |
0 / 44 |
|
0.00% |
0 / 1 |
72 | |||
dumpNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
dumpDOM | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
56 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Closure; |
7 | use Wikimedia\Assert\Assert; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
13 | use Wikimedia\Parsoid\DOM\Element; |
14 | use Wikimedia\Parsoid\DOM\Node; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
17 | |
18 | /** |
19 | * These utilities are for processing content that's generated |
20 | * by parsing source input (ex: wikitext) |
21 | */ |
22 | class ContentUtils { |
23 | |
/**
 * Serialize a node to a string.
 *
 * Delegates to XMLSerializer::serialize() and hands back only the
 * 'html' entry of the serializer's result.
 *
 * @param Node $node Node to serialize.
 * @param array $options Passed through unchanged to XMLSerializer.
 * @return string
 */
public static function toXML( Node $node, array $options = [] ): string {
	$serialized = XMLSerializer::serialize( $node, $options );
	return $serialized['html'];
}
34 | |
/**
 * Data-object-aware serializer intended for the DOM post-processing phase.
 *
 * Runs DOMDataUtils::visitAndStoreDataAttribs() over the node and then
 * serializes it with toXML(). Both steps receive the same $options.
 *
 * @param Node $node Node to store data attributes on and serialize.
 * @param array $options Shared by the store step and the serializer.
 * @return string
 */
public static function ppToXML( Node $node, array $options = [] ): string {
	// The store step must run before serialization.
	DOMDataUtils::visitAndStoreDataAttribs( $node, $options );
	$xml = self::toXML( $node, $options );
	return $xml;
}
46 | |
/**
 * Parse an HTML string into a new, prepared Document.
 *
 * XXX: Don't use this outside of testing. It shouldn't be necessary
 * to create new documents when parsing or serializing. A document lives
 * on the environment which can be used to create fragments. The bag added
 * as a dynamic property to the PHP wrapper around the libxml doc
 * is at risk of being GC-ed.
 *
 * @param string $html HTML to parse; defaults to the empty string.
 * @param bool $validateXMLNames Forwarded to DOMUtils::parseHTML().
 * @return Document The parsed document, after DOMDataUtils::prepareDoc().
 */
public static function createDocument(
	string $html = '', bool $validateXMLNames = false
): Document {
	$document = DOMUtils::parseHTML( $html, $validateXMLNames );
	DOMDataUtils::prepareDoc( $document );
	return $document;
}
65 | |
/**
 * Create a document from HTML and load the data attributes of its body.
 *
 * XXX: Don't use this outside of testing. It shouldn't be necessary
 * to create new documents when parsing or serializing. A document lives
 * on the environment which can be used to create fragments. The bag added
 * as a dynamic property to the PHP wrapper around the libxml doc
 * is at risk of being GC-ed.
 *
 * @param string $html HTML to parse.
 * @param array $options Passed to DOMDataUtils::visitAndLoadDataAttribs().
 *   The 'validateXMLNames' entry (default false) is also forwarded to
 *   createDocument().
 * @return Document
 */
public static function createAndLoadDocument(
	string $html, array $options = []
): Document {
	$validateXMLNames = $options['validateXMLNames'] ?? false;
	$document = self::createDocument( $html, $validateXMLNames );

	$body = DOMCompat::getBody( $document );
	DOMDataUtils::visitAndLoadDataAttribs( $body, $options );

	return $document;
}
86 | |
/**
 * Build a DocumentFragment from an HTML string and load its data attributes.
 *
 * The fragment is created from $doc, filled via
 * DOMUtils::setFragmentInnerHTML(), and then visited by
 * DOMDataUtils::visitAndLoadDataAttribs().
 *
 * @param Document $doc Document used to create the fragment.
 * @param string $html HTML to set as the fragment's content.
 * @param array $options Passed to DOMDataUtils::visitAndLoadDataAttribs().
 * @return DocumentFragment
 */
public static function createAndLoadDocumentFragment(
	Document $doc, string $html, array $options = []
): DocumentFragment {
	$fragment = $doc->createDocumentFragment();

	DOMUtils::setFragmentInnerHTML( $fragment, $html );
	DOMDataUtils::visitAndLoadDataAttribs( $fragment, $options );

	return $fragment;
}
101 | |
/**
 * Extract the page bundle from the document, then serialize the node.
 *
 * The page bundle is pulled out (DOMDataUtils::extractPageBundle()) before
 * XMLSerializer runs. The serializer's result array is returned with one
 * extra 'pb' entry holding the page bundle, whose html, version, headers
 * and contentmodel fields are filled in here.
 *
 * @param Node $node Node to serialize. When it is a body element, the
 *   page bundle is extracted from its owner document; otherwise $node
 *   itself is handed to the extraction step.
 * @param array $options XMLSerializer options. The 'contentversion',
 *   'headers' and 'contentmodel' entries are copied onto the page bundle
 *   (null when absent).
 * @return array The XMLSerializer result plus a 'pb' key.
 */
public static function extractDpAndSerialize( Node $node, array $options = [] ): array {
	if ( DOMUtils::isBody( $node ) ) {
		$bundleSource = $node->ownerDocument;
	} else {
		$bundleSource = $node;
	}

	// Extraction has to precede serialization.
	$pageBundle = DOMDataUtils::extractPageBundle( $bundleSource );
	$result = XMLSerializer::serialize( $node, $options );

	$pageBundle->html = $result['html'];
	// Page bundle field => name of the option it is populated from
	$optionForField = [
		'version' => 'contentversion',
		'headers' => 'headers',
		'contentmodel' => 'contentmodel',
	];
	foreach ( $optionForField as $field => $optionName ) {
		$pageBundle->$field = $options[$optionName] ?? null;
	}

	$result['pb'] = $pageBundle;
	return $result;
}
120 | |
/**
 * Remove Parsoid-inserted wrappers and synthetic nodes below an element.
 *
 * Covers section wrappers, annotation wrappers, and synthetic nodes
 * (fallback id spans with HTML4 ids for headings, auto-generated TOC
 * metas and possibly other such in the future). Wrappers are unwrapped
 * (their children are kept in place); synthetic nodes are dropped.
 *
 * @param Element $node Element whose descendants are cleaned up in place.
 */
public static function stripUnnecessaryWrappersAndSyntheticNodes( Element $node ): void {
	// The successor is captured at the top of each iteration because
	// the current child may be detached from the tree below.
	for ( $child = $node->firstChild; $child !== null; $child = $following ) {
		$following = $child->nextSibling;

		if ( !( $child instanceof Element ) ) {
			continue;
		}

		// Auto-generated synthetic <meta> tags and
		// <span typeof='mw:FallbackId' ...></span> are removed outright.
		$isAutoGeneratedMeta = DOMCompat::nodeName( $child ) === 'meta' &&
			( DOMDataUtils::getDataMw( $child )->autoGenerated ?? false );
		if ( $isAutoGeneratedMeta || WTUtils::isFallbackIdSpan( $child ) ) {
			$child->parentNode->removeChild( $child );
			continue;
		}

		// Clean the subtree first, then decide whether this element
		// itself is a wrapper to strip.
		self::stripUnnecessaryWrappersAndSyntheticNodes( $child );

		// <section> tags and synthetic extended-annotation-region wrappers:
		// hoist the children in front of the wrapper, then drop the wrapper.
		$isWrapper = WTUtils::isParsoidSectionTag( $child ) ||
			DOMUtils::hasTypeOf( $child, 'mw:ExtendedAnnRange' );
		if ( $isWrapper ) {
			DOMUtils::migrateChildren( $child, $child->parentNode, $child );
			$child->parentNode->removeChild( $child );
		}
	}
}
156 | |
/**
 * Run a processor over HTML that is embedded in an element's data
 * attributes rather than present in the DOM.
 *
 * Extensions might be interested in examining such content. Handled here:
 * expanded (transcluded) attributes, language variant markup, inline
 * media captions, and extension tags whose config declares
 * 'embedsHTMLInAttributes'. More scenarios might be added later.
 *
 * Elements without a typeof attribute are left untouched.
 *
 * @param ParsoidExtensionAPI $extAPI
 * @param Element $elt The node whose data attributes need to be examined
 * @param Closure $proc The processor that will process the embedded HTML
 *   Signature: (string) -> string
 *   This processor will be provided the HTML string as input
 *   and is expected to return a possibly modified string.
 */
public static function processAttributeEmbeddedHTML(
	ParsoidExtensionAPI $extAPI, Element $elt, Closure $proc
): void {
	if ( !$elt->hasAttribute( 'typeof' ) ) {
		return;
	}

	// Expanded attributes: HTML may sit in either the key or the value
	// of each DataMwAttrib
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:ExpandedAttrs$/' ) ) {
		$expandedAttrs = DOMDataUtils::getDataMw( $elt )->attribs ?? null;
		if ( $expandedAttrs ) {
			foreach ( $expandedAttrs as $attrib ) {
				foreach ( [ 'key', 'value' ] as $field ) {
					if ( is_string( $attrib->$field ) || !isset( $attrib->{$field}['html'] ) ) {
						continue;
					}
					$attrib->{$field}['html'] = $proc( $attrib->{$field}['html'] );
				}
			}
		}
	}

	// Language variant markup, stored as JSON in data-mw-variant
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:LanguageVariant$/' ) ) {
		$variant = DOMDataUtils::getJSONAttribute( $elt, 'data-mw-variant', null );
		if ( $variant ) {
			if ( isset( $variant->disabled ) ) {
				$variant->disabled->t = $proc( $variant->disabled->t );
			}
			foreach ( $variant->twoway ?? [] as $rule ) {
				$rule->t = $proc( $rule->t );
			}
			foreach ( $variant->oneway ?? [] as $rule ) {
				$rule->f = $proc( $rule->f );
				$rule->t = $proc( $rule->t );
			}
			if ( isset( $variant->filter ) ) {
				$variant->filter->t = $proc( $variant->filter->t );
			}
			// Write the processed structure back onto the element
			DOMDataUtils::setJSONAttribute( $elt, 'data-mw-variant', $variant );
		}
	}

	// Inline media -- the caption lives inside the data-mw attribute
	if ( WTUtils::isInlineMedia( $elt ) ) {
		$mediaDataMw = DOMDataUtils::getDataMw( $elt );
		$existingCaption = $mediaDataMw->caption ?? null;
		if ( $existingCaption ) {
			$mediaDataMw->caption = $proc( $existingCaption );
		}
	}

	// Extension-specific embedded HTML: delegate to the tag's handler,
	// but only for extensions that opted in through their config
	$extTagName = WTUtils::getExtTagName( $elt );
	if ( !$extTagName ) {
		return;
	}
	$extConfig = $extAPI->getSiteConfig()->getExtTagConfig( $extTagName );
	if ( $extConfig['options']['wt2html']['embedsHTMLInAttributes'] ?? false ) {
		$tagHandler = $extAPI->getSiteConfig()->getExtTagImpl( $extTagName );
		$tagHandler->processAttributeEmbeddedHTML( $extAPI, $elt, $proc );
	}
}
237 | |
/**
 * Shift the DOM Source Range (DSR) of a DOM fragment.
 *
 * Every element under $rootNode is visited (post-order). For each one the
 * data-parsoid 'dsr' and 'extTagOffsets' properties and the temporary
 * 'origDSR' property are passed through $dsrFunc. HTML embedded in
 * attributes and in sealed DOM fragments is processed the same way.
 *
 * @param Env $env
 * @param Node $rootNode
 * @param callable $dsrFunc Called with a clone of each range. Its return
 *   value replaces the range; returning null removes the property.
 * @param ParsoidExtensionAPI $extAPI
 * @return Node Returns the $rootNode passed in to allow chaining.
 * @throws UnreachableException If an unsealed mw:DOMFragment node is found.
 */
public static function shiftDSR(
	Env $env, Node $rootNode, callable $dsrFunc, ParsoidExtensionAPI $extAPI
): Node {
	$doc = $rootNode->ownerDocument;

	// Run $dsrFunc over one range-valued property of $holder.
	// This is a bit of a hack, but this function is also used to clear
	// DSR properties: a null result unsets the property.
	// $holder is an object and is mutated in place, so callers don't
	// need to store it back (e.g. no setDataParsoid is required).
	$shiftProp = static function ( $holder, string $prop ) use ( $dsrFunc ): void {
		if ( !isset( $holder->$prop ) ) {
			return;
		}
		$holder->$prop = $dsrFunc( clone $holder->$prop );
		if ( $holder->$prop === null ) {
			unset( $holder->$prop );
		}
	};

	$convertString = static function ( $str ) {
		// Stub $convertString out to allow definition of a pair of
		// mutually-recursive functions. It is captured by reference
		// below and replaced with the real implementation afterwards.
		return $str;
	};
	$convertNode = static function ( Node $node ) use (
		$env, $extAPI, $shiftProp, &$convertString, &$convertNode
	) {
		if ( !( $node instanceof Element ) ) {
			return;
		}
		$dp = DOMDataUtils::getDataParsoid( $node );
		$shiftProp( $dp, 'dsr' );
		// Even though tmp shouldn't escape Parsoid, go ahead and
		// convert to enable hybrid testing.
		$shiftProp( $dp->getTemp(), 'origDSR' );
		$shiftProp( $dp, 'extTagOffsets' );

		// Handle embedded HTML in attributes
		self::processAttributeEmbeddedHTML( $extAPI, $node, $convertString );

		// DOMFragments will have already been unpacked when DSR shifting is run
		if ( DOMUtils::hasTypeOf( $node, 'mw:DOMFragment' ) ) {
			throw new UnreachableException( "Shouldn't encounter these nodes here." );
		}

		// However, extensions can choose to handle sealed fragments whenever
		// they want and so may be returned in subpipelines which could
		// subsequently be shifted
		if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment/sealed/\w+$#D' ) ) {
			// NOTE(review): data-parsoid is deliberately re-read here, as
			// before; extension handlers ran above -- confirm whether
			// reusing the earlier $dp would be safe.
			$dp = DOMDataUtils::getDataParsoid( $node );
			if ( $dp->html ?? null ) {
				$domFragment = $env->getDOMFragment( $dp->html );
				DOMPostOrder::traverse( $domFragment, $convertNode );
			}
		}
	};
	// Real implementation: parse the HTML string into a fragment of $doc,
	// shift every node in it, and serialize the fragment's contents back.
	$convertString = static function ( string $str ) use ( $doc, $convertNode ): string {
		$node = self::createAndLoadDocumentFragment( $doc, $str );
		DOMPostOrder::traverse( $node, $convertNode );
		return self::ppToXML( $node, [ 'innerXML' => true ] );
	};
	DOMPostOrder::traverse( $rootNode, $convertNode );
	return $rootNode; // chainable
}
315 | |
/**
 * Convert DSR offsets in a Document between utf-8/ucs2/codepoint
 * indices.
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * The work is done in two shiftDSR() passes over the document body: the
 * first only collects the distinct offsets in use, the second rewrites
 * every range with the converted values. The env's current offset type
 * is set to $to in all cases, including when $from equals $to.
 *
 * @see TokenUtils::convertTokenOffsets for a related function on tokens.
 *
 * @param Env $env
 * @param Document $doc The document to convert
 * @param string $from Offset type to convert from.
 * @param string $to Offset type to convert to.
 */
public static function convertOffsets(
	Env $env,
	Document $doc,
	string $from,
	string $to
): void {
	$env->setCurrentOffsetType( $to );
	if ( $from === $to ) {
		return; // Hey, that was easy!
	}

	// $boxes maps each original offset to an object holding its value.
	// $refs lists references to those values, so whatever is written
	// through $refs becomes visible through $boxes.
	$boxes = [];
	$refs = [];
	$remember = static function ( int $offset ) use ( &$boxes, &$refs ) {
		if ( array_key_exists( $offset, $boxes ) ) {
			return;
		}
		$holder = (object)[ 'value' => $offset ];
		$boxes[$offset] = $holder;
		$refs[] =& $holder->value;
	};

	// Pass 1: collect DSR offsets throughout the document; ranges are
	// returned unchanged.
	$gather = static function ( DomSourceRange $dsr ) use ( $remember ) {
		if ( $dsr->start !== null ) {
			$remember( $dsr->start );
			$remember( $dsr->innerStart() );
		}
		if ( $dsr->end !== null ) {
			$remember( $dsr->innerEnd() );
			$remember( $dsr->end );
		}
		return $dsr;
	};
	$body = DOMCompat::getBody( $doc );
	$extAPI = new ParsoidExtensionAPI( $env );
	self::shiftDSR( $env, $body, $gather, $extAPI );
	if ( !$refs ) {
		return; /* nothing to do (shouldn't really happen) */
	}

	// Now convert these offsets
	TokenUtils::convertOffsets(
		$env->topFrame->getSrcText(), $from, $to, $refs
	);

	// Pass 2: apply converted offsets. Widths are recomputed from the
	// converted outer and inner offsets; a null start/end keeps the
	// corresponding original width.
	$rewrite = static function ( DomSourceRange $dsr ) use ( $boxes ) {
		$hasStart = $dsr->start !== null;
		$newStart = $hasStart ? $boxes[$dsr->start]->value : null;
		$newOpenWidth = $hasStart
			? $boxes[$dsr->innerStart()]->value - $newStart
			: $dsr->openWidth;

		$hasEnd = $dsr->end !== null;
		$newEnd = $hasEnd ? $boxes[$dsr->end]->value : null;
		$newCloseWidth = $hasEnd
			? $newEnd - $boxes[$dsr->innerEnd()]->value
			: $dsr->closeWidth;

		return new DomSourceRange(
			$newStart, $newEnd, $newOpenWidth, $newCloseWidth
		);
	};
	self::shiftDSR( $env, $body, $rewrite, $extAPI );
}
393 | |
/**
 * Serialize a node for dumping, with 'saveData' enabled by default.
 *
 * An explicit 'saveData' entry in $options is respected as-is.
 *
 * @param Node $node
 * @param array $options Serializer options, see toXML().
 * @return string
 */
private static function dumpNode( Node $node, array $options ): string {
	if ( !array_key_exists( 'saveData', $options ) ) {
		$options['saveData'] = true;
	}
	return self::toXML( $node, $options );
}
402 | |
/**
 * Dump the DOM with attributes.
 *
 * The result is the serialized root node, optionally followed by every
 * fragment from the env's fragment map, and (unless 'quiet' is set)
 * framed by a title line and a closing dashed line.
 *
 * @param Node $rootNode
 * @param string $title
 * @param array $options Associative array of options:
 *   - dumpFragmentMap: Dump the fragment map from env
 *     (requires 'env' to be set)
 *   - quiet: Suppress separators
 *
 *   storeDataAttribs options:
 *   - discardDataParsoid
 *   - keepTmp
 *   - storeInPageBundle
 *   - storeDiffMark
 *   - env
 *   - idIndex
 *
 *   XMLSerializer options:
 *   - smartQuote
 *   - innerXML
 *   - captureOffsets
 *   - addDoctype
 * @return string The dump result
 */
public static function dumpDOM(
	Node $rootNode, string $title = '', array $options = []
): string {
	$withFragments = !empty( $options['dumpFragmentMap'] );
	$withSeparators = empty( $options['quiet'] );

	if ( $withFragments ) {
		Assert::invariant( isset( $options['env'] ), "env should be set" );
	}

	// Each chunk carries its own trailing newline.
	$chunks = [];
	if ( $withSeparators ) {
		$chunks[] = "----- {$title} -----\n";
	}
	$chunks[] = self::dumpNode( $rootNode, $options ) . "\n";

	// Dump cached fragments
	if ( $withFragments ) {
		foreach ( $options['env']->getDOMFragmentMap() as $k => $fragment ) {
			// A map entry may be an array, in which case its first
			// element is what gets dumped.
			$fragmentRoot = is_array( $fragment ) ? $fragment[0] : $fragment;
			$chunks[] = str_repeat( '=', 15 ) . "\n";
			$chunks[] = "FRAGMENT {$k}\n";
			$chunks[] = self::dumpNode( $fragmentRoot, $options ) . "\n";
		}
	}

	if ( $withSeparators ) {
		// Same width as the title line: the title plus 12 decoration chars
		$chunks[] = str_repeat( '-', mb_strlen( $title ) + 12 ) . "\n";
	}
	return implode( '', $chunks );
}
457 | |
458 | } |