Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 171 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
ContentUtils | |
0.00% |
0 / 171 |
|
0.00% |
0 / 12 |
3906 | |
0.00% |
0 / 1 |
toXML | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
ppToXML | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
createDocument | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocument | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocumentFragment | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
extractDpAndSerialize | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
stripUnnecessaryWrappersAndSyntheticNodes | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
processAttributeEmbeddedHTML | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
420 | |||
shiftDSR | |
0.00% |
0 / 39 |
|
0.00% |
0 / 1 |
132 | |||
convertOffsets | |
0.00% |
0 / 44 |
|
0.00% |
0 / 1 |
72 | |||
dumpNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
dumpDOM | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
56 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Closure; |
7 | use Wikimedia\Assert\Assert; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
13 | use Wikimedia\Parsoid\DOM\Element; |
14 | use Wikimedia\Parsoid\DOM\Node; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
17 | |
18 | /** |
19 | * These utilities are for processing content that's generated |
20 | * by parsing source input (ex: wikitext) |
21 | */ |
22 | class ContentUtils { |
23 | |
/**
 * Serialize a node to a string.
 *
 * Delegates to XMLSerializer::serialize() and hands back only the 'html'
 * entry of the array that call returns.
 *
 * @param Node $node Node to serialize.
 * @param array $options Options passed through unchanged to XMLSerializer.
 * @return string The 'html' entry of the serializer result.
 */
public static function toXML( Node $node, array $options = [] ): string {
	$serialized = XMLSerializer::serialize( $node, $options );
	return $serialized['html'];
}
34 | |
/**
 * Data-object-aware serializer, intended for the DOM post-processing phase.
 *
 * Has DOMDataUtils visit the node and store its data attributes (with the
 * same $options), then serializes the node through toXML().
 *
 * @param Node $node Node to store data attributes on and then serialize.
 * @param array $options Shared by the store step and the serializer.
 * @return string
 */
public static function ppToXML( Node $node, array $options = [] ): string {
	// Step 1: store the data attributes.
	DOMDataUtils::visitAndStoreDataAttribs( $node, $options );

	// Step 2: serialize the node.
	$xml = self::toXML( $node, $options );
	return $xml;
}
46 | |
/**
 * Parse an HTML string into a new Document and prepare it for use with
 * DOMDataUtils.
 *
 * XXX: Testing only. Parsing and serializing should not need to create
 * new documents: the environment already owns a document from which
 * fragments can be created. The bag that is attached as a dynamic property
 * to the PHP wrapper around the libxml doc is at risk of being GC-ed.
 *
 * @param string $html Markup to parse; defaults to the empty string.
 * @param bool $validateXMLNames Forwarded to DOMUtils::parseHTML().
 * @return Document
 */
public static function createDocument(
	string $html = '', bool $validateXMLNames = false
): Document {
	$document = DOMUtils::parseHTML( $html, $validateXMLNames );
	DOMDataUtils::prepareDoc( $document );

	return $document;
}
65 | |
/**
 * Build a Document from HTML via createDocument() and load the data
 * attributes found under its body.
 *
 * XXX: Testing only. Parsing and serializing should not need to create
 * new documents: the environment already owns a document from which
 * fragments can be created. The bag that is attached as a dynamic property
 * to the PHP wrapper around the libxml doc is at risk of being GC-ed.
 *
 * @param string $html Markup to parse.
 * @param array $options Passed to DOMDataUtils::visitAndLoadDataAttribs();
 *   the 'validateXMLNames' entry (default false) is also handed to
 *   createDocument().
 * @return Document
 */
public static function createAndLoadDocument(
	string $html, array $options = []
): Document {
	$validateXMLNames = $options['validateXMLNames'] ?? false;
	$document = self::createDocument( $html, $validateXMLNames );

	$body = DOMCompat::getBody( $document );
	DOMDataUtils::visitAndLoadDataAttribs( $body, $options );

	return $document;
}
86 | |
/**
 * Create a DocumentFragment on $doc, fill it from an HTML string and load
 * the data attributes of the resulting nodes.
 *
 * @param Document $doc Document used to create the fragment.
 * @param string $html Markup assigned as the fragment's inner HTML.
 * @param array $options Passed to DOMDataUtils::visitAndLoadDataAttribs().
 * @return DocumentFragment
 */
public static function createAndLoadDocumentFragment(
	Document $doc, string $html, array $options = []
): DocumentFragment {
	$fragment = $doc->createDocumentFragment();

	DOMUtils::setFragmentInnerHTML( $fragment, $html );
	DOMDataUtils::visitAndLoadDataAttribs( $fragment, $options );

	return $fragment;
}
101 | |
/**
 * Pull the data-parsoid script element out of the doc, then serialize.
 *
 * @param Node $node Node to serialize. When it is a body element the page
 *   bundle is extracted from its owner document, otherwise from $node itself.
 * @param array $options XMLSerializer options.
 * @return array The XMLSerializer::serialize() result with the extracted
 *   page bundle stored under the 'pb' key.
 */
public static function extractDpAndSerialize( Node $node, array $options = [] ): array {
	if ( DOMUtils::isBody( $node ) ) {
		$bundleSource = $node->ownerDocument;
	} else {
		$bundleSource = $node;
	}

	// Extraction runs before serialization, as the summary above describes.
	$pageBundle = DOMDataUtils::extractPageBundle( $bundleSource );

	$result = XMLSerializer::serialize( $node, $options );
	$result['pb'] = $pageBundle;

	return $result;
}
116 | |
/**
 * Remove Parsoid-inserted section wrappers, annotation wrappers and
 * synthetic nodes (fallback id spans with HTML4 ids for headings,
 * auto-generated TOC metas, and possibly other such in the future) from
 * the subtree below $node. The DOM is modified in place.
 *
 * @param Element $node Root whose descendants are cleaned up; $node itself
 *   is never removed.
 */
public static function stripUnnecessaryWrappersAndSyntheticNodes( Element $node ): void {
	for ( $child = $node->firstChild; $child; $child = $following ) {
		// Grab the successor up front: $child may be detached below.
		$following = $child->nextSibling;

		if ( !( $child instanceof Element ) ) {
			continue;
		}

		// Synthetic nodes are dropped outright: auto-generated meta tags
		// and <span typeof='mw:FallbackId' ...></span>
		$isAutoGeneratedMeta = DOMCompat::nodeName( $child ) === 'meta' &&
			( DOMDataUtils::getDataMw( $child )->autoGenerated ?? false );
		if ( $isAutoGeneratedMeta || WTUtils::isFallbackIdSpan( $child ) ) {
			$child->parentNode->removeChild( $child );
			continue;
		}

		// Clean the subtree first, so that whatever gets hoisted out of a
		// wrapper below has already been processed.
		self::stripUnnecessaryWrappersAndSyntheticNodes( $child );

		// Unwrap <section> tags and synthetic extended-annotation-region
		// wrappers: children move in front of the wrapper, which is then
		// removed.
		$isWrapper = WTUtils::isParsoidSectionTag( $child ) ||
			DOMUtils::hasTypeOf( $child, 'mw:ExtendedAnnRange' );
		if ( $isWrapper ) {
			DOMUtils::migrateChildren( $child, $child->parentNode, $child );
			$child->parentNode->removeChild( $child );
		}
	}
}
152 | |
/**
 * Run a processor over HTML that is embedded in an element's data
 * attributes and therefore does not otherwise show up in the DOM.
 *
 * Extensions might be interested in examining such content. Ex: inline
 * media captions that aren't rendered, language variant markup, attributes
 * that are transcluded. More scenarios might be added later.
 *
 * Elements without a 'typeof' attribute are ignored. The processed strings
 * are written back in place.
 *
 * @param ParsoidExtensionAPI $extAPI
 * @param Element $elt The node whose data attributes need to be examined
 * @param Closure $proc The processor that will process the embedded HTML
 *   Signature: (string) -> string
 *   This processor will be provided the HTML string as input
 *   and is expected to return a possibly modified string.
 */
public static function processAttributeEmbeddedHTML(
	ParsoidExtensionAPI $extAPI, Element $elt, Closure $proc
): void {
	if ( !$elt->hasAttribute( 'typeof' ) ) {
		return;
	}

	// 1. Expanded attributes: 'html' may sit in the key and/or the value
	// of each DataMwAttrib.
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:ExpandedAttrs$/' ) ) {
		$dataMw = DOMDataUtils::getDataMw( $elt );
		if ( $dataMw->attribs ?? null ) {
			foreach ( $dataMw->attribs as $attrib ) {
				foreach ( [ 'key', 'value' ] as $slot ) {
					$current = $attrib->$slot;
					if ( is_string( $current ) || !isset( $current['html'] ) ) {
						continue;
					}
					$attrib->$slot['html'] = $proc( $current['html'] );
				}
			}
		}
	}

	// 2. Language variant markup, stored as JSON in data-mw-variant.
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:LanguageVariant$/' ) ) {
		$variant = DOMDataUtils::getJSONAttribute( $elt, 'data-mw-variant', null );
		if ( $variant ) {
			if ( isset( $variant->disabled ) ) {
				$variant->disabled->t = $proc( $variant->disabled->t );
			}
			if ( isset( $variant->twoway ) ) {
				foreach ( $variant->twoway as $rule ) {
					$rule->t = $proc( $rule->t );
				}
			}
			if ( isset( $variant->oneway ) ) {
				foreach ( $variant->oneway as $rule ) {
					$rule->f = $proc( $rule->f );
					$rule->t = $proc( $rule->t );
				}
			}
			if ( isset( $variant->filter ) ) {
				$variant->filter->t = $proc( $variant->filter->t );
			}
			// Write the updated structure back to the attribute.
			DOMDataUtils::setJSONAttribute( $elt, 'data-mw-variant', $variant );
		}
	}

	// 3. Inline media: the caption is kept in data-mw.
	if ( WTUtils::isInlineMedia( $elt ) ) {
		$mediaMw = DOMDataUtils::getDataMw( $elt );
		if ( !empty( $mediaMw->caption ) ) {
			$mediaMw->caption = $proc( $mediaMw->caption );
		}
	}

	// 4. Extension-specific embedded HTML: delegate to the tag's own
	// handler when its config opts in via 'embedsHTMLInAttributes'.
	$extTagName = WTUtils::getExtTagName( $elt );
	if ( !$extTagName ) {
		return;
	}
	$extConfig = $extAPI->getSiteConfig()->getExtTagConfig( $extTagName );
	$optedIn = $extConfig['options']['wt2html']['embedsHTMLInAttributes'] ?? false;
	if ( $optedIn ) {
		$tagHandler = $extAPI->getSiteConfig()->getExtTagImpl( $extTagName );
		$tagHandler->processAttributeEmbeddedHTML( $extAPI, $elt, $proc );
	}
}
233 | |
/**
 * Shift the DOM Source Range (DSR) of a DOM fragment.
 *
 * Traverses $rootNode with DOMPostOrder and, for every Element, passes a
 * clone of each of the following through $dsrFunc and stores the result:
 *  - data-parsoid 'dsr'
 *  - data-parsoid temp 'origDSR'
 *  - data-parsoid 'extTagOffsets'
 * A null result from $dsrFunc unsets the property, so this function can
 * also be used to clear DSR information.
 *
 * HTML embedded in attributes (see processAttributeEmbeddedHTML()) is
 * parsed into a fragment, shifted the same way and re-serialized. The
 * cached fragment of a sealed DOMFragment wrapper (dp->html) is traversed
 * as well.
 *
 * @param Env $env Used to look up cached DOM fragments.
 * @param Node $rootNode Root of the subtree to process.
 * @param callable $dsrFunc Receives a clone of the current range and
 *   returns the replacement range, or null to drop it.
 * @param ParsoidExtensionAPI $extAPI
 * @return Node Returns the $rootNode passed in to allow chaining.
 * @throws UnreachableException When an unsealed mw:DOMFragment node is met.
 */
public static function shiftDSR(
	Env $env, Node $rootNode, callable $dsrFunc, ParsoidExtensionAPI $extAPI
): Node {
	$doc = $rootNode->ownerDocument;

	// Shared "shift, or clear when null" step for one range property.
	// $holder is modified in place.
	$shiftProp = static function ( object $holder, string $prop ) use ( $dsrFunc ): void {
		if ( !isset( $holder->$prop ) ) {
			return;
		}
		$holder->$prop = $dsrFunc( clone $holder->$prop );
		// This is a bit of a hack, but we use this function to
		// clear DSR properties as well.
		if ( $holder->$prop === null ) {
			unset( $holder->$prop );
		}
	};

	$convertString = static function ( $str ) {
		// Stub $convertString out to allow definition of a pair of
		// mutually-recursive functions.
		return $str;
	};

	// Both closures are captured by reference: $convertString is replaced
	// with the real implementation below, and $convertNode only exists
	// once this assignment has completed.
	$convertNode = static function ( Node $node ) use (
		$env, $extAPI, $shiftProp, &$convertString, &$convertNode
	) {
		if ( !( $node instanceof Element ) ) {
			return;
		}

		// We don't need to setDataParsoid because dp is not a copy
		$dp = DOMDataUtils::getDataParsoid( $node );
		$shiftProp( $dp, 'dsr' );
		// Even though tmp shouldn't escape Parsoid, go ahead and
		// convert to enable hybrid testing.
		$shiftProp( $dp->getTemp(), 'origDSR' );
		$shiftProp( $dp, 'extTagOffsets' );

		// Handle embedded HTML in attributes
		self::processAttributeEmbeddedHTML( $extAPI, $node, $convertString );

		// DOMFragments will have already been unpacked when DSR shifting is run
		if ( DOMUtils::hasTypeOf( $node, 'mw:DOMFragment' ) ) {
			throw new UnreachableException( "Shouldn't encounter these nodes here." );
		}

		// However, extensions can choose to handle sealed fragments whenever
		// they want and so may be returned in subpipelines which could
		// subsequently be shifted
		if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment/sealed/\w+$#D' ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			if ( $dp->html ?? null ) {
				$domFragment = $env->getDOMFragment( $dp->html );
				DOMPostOrder::traverse( $domFragment, $convertNode );
			}
		}
	};

	// Real implementation: parse the embedded HTML, shift it, serialize
	// it back to a string.
	$convertString = static function ( string $str ) use ( $doc, $convertNode ): string {
		$fragment = self::createAndLoadDocumentFragment( $doc, $str );
		DOMPostOrder::traverse( $fragment, $convertNode );
		return self::ppToXML( $fragment, [ 'innerXML' => true ] );
	};

	DOMPostOrder::traverse( $rootNode, $convertNode );
	return $rootNode; // chainable
}
311 | |
/**
 * Convert the DSR offsets in a Document between utf-8/ucs2/codepoint
 * indices.
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * The environment's current offset type is set to $to in every case, even
 * when no conversion turns out to be needed.
 *
 * Works in two shiftDSR() passes over the body: the first gathers every
 * distinct offset, the second rebuilds each range from the converted values.
 *
 * @see TokenUtils::convertTokenOffsets for a related function on tokens.
 *
 * @param Env $env
 * @param Document $doc The document to convert
 * @param string $from Offset type to convert from.
 * @param string $to Offset type to convert to.
 */
public static function convertOffsets(
	Env $env,
	Document $doc,
	string $from,
	string $to
): void {
	$env->setCurrentOffsetType( $to );
	if ( $from === $to ) {
		// Same type on both sides: nothing to convert.
		return;
	}

	// $boxes maps an original offset to an object holding its value;
	// $flat holds references to those same values, so that values written
	// through $flat can be read back through $boxes.
	$boxes = [];
	$flat = [];
	$remember = static function ( int $offset ) use ( &$boxes, &$flat ) {
		if ( array_key_exists( $offset, $boxes ) ) {
			return;
		}
		$holder = (object)[ 'value' => $offset ];
		$boxes[$offset] = $holder;
		$flat[] =& $holder->value;
	};

	// Pass 1: gather the DSR offsets found throughout the document.
	// The range itself is handed back untouched.
	$gather = static function ( DomSourceRange $range ) use ( $remember ) {
		if ( $range->start !== null ) {
			$remember( $range->start );
			$remember( $range->innerStart() );
		}
		if ( $range->end !== null ) {
			$remember( $range->innerEnd() );
			$remember( $range->end );
		}
		return $range;
	};
	$body = DOMCompat::getBody( $doc );
	$extAPI = new ParsoidExtensionAPI( $env );
	self::shiftDSR( $env, $body, $gather, $extAPI );

	if ( !$flat ) {
		// Nothing to do (shouldn't really happen).
		return;
	}

	// Convert the gathered offsets.
	TokenUtils::convertOffsets(
		$env->topFrame->getSrcText(), $from, $to, $flat
	);

	// Pass 2: rebuild every range from the converted values. Widths are
	// recomputed as differences between converted outer and inner offsets.
	$rebuild = static function ( DomSourceRange $range ) use ( $boxes ) {
		$newStart = $range->start;
		$newOpenWidth = $range->openWidth;
		if ( $newStart !== null ) {
			$newStart = $boxes[$range->start]->value;
			$newOpenWidth = $boxes[$range->innerStart()]->value - $newStart;
		}

		$newEnd = $range->end;
		$newCloseWidth = $range->closeWidth;
		if ( $newEnd !== null ) {
			$newEnd = $boxes[$range->end]->value;
			$newCloseWidth = $newEnd - $boxes[$range->innerEnd()]->value;
		}

		return new DomSourceRange(
			$newStart, $newEnd, $newOpenWidth, $newCloseWidth
		);
	};
	self::shiftDSR( $env, $body, $rebuild, $extAPI );
}
389 | |
/**
 * Serialize a node for dumping, with 'saveData' defaulting to true.
 *
 * @param Node $node Node to dump.
 * @param array $options Serializer options; a 'saveData' entry supplied by
 *   the caller takes precedence over the default.
 * @return string
 */
private static function dumpNode( Node $node, array $options ): string {
	$dumpOptions = $options + [ 'saveData' => true ];
	return self::toXML( $node, $dumpOptions );
}
398 | |
/**
 * Dump the DOM with attributes.
 *
 * The result is the serialized $rootNode, optionally followed by every
 * fragment in the env's DOM fragment map, and, unless 'quiet' is set,
 * wrapped in a title line and a closing dashed line of matching length.
 *
 * @param Node $rootNode
 * @param string $title Text shown in the opening separator line.
 * @param array $options Associative array of options:
 *   - dumpFragmentMap: Dump the fragment map from env ('env' must be set)
 *   - quiet: Suppress separators
 *
 *   storeDataAttribs options:
 *   - discardDataParsoid
 *   - keepTmp
 *   - storeInPageBundle
 *   - storeDiffMark
 *   - env
 *   - idIndex
 *
 *   XMLSerializer options:
 *   - smartQuote
 *   - innerXML
 *   - captureOffsets
 *   - addDoctype
 * @return string The dump result
 */
public static function dumpDOM(
	Node $rootNode, string $title = '', array $options = []
): string {
	$withFragments = !empty( $options['dumpFragmentMap'] );
	$withSeparators = empty( $options['quiet'] );

	if ( $withFragments ) {
		Assert::invariant( isset( $options['env'] ), "env should be set" );
	}

	$chunks = [];

	if ( $withSeparators ) {
		$chunks[] = "----- {$title} -----\n";
	}
	$chunks[] = self::dumpNode( $rootNode, $options ) . "\n";

	// Dump cached fragments
	if ( $withFragments ) {
		foreach ( $options['env']->getDOMFragmentMap() as $k => $fragment ) {
			// An array entry is dumped via its first element.
			$target = is_array( $fragment ) ? $fragment[0] : $fragment;
			$chunks[] = str_repeat( '=', 15 ) . "\n";
			$chunks[] = "FRAGMENT {$k}\n";
			$chunks[] = self::dumpNode( $target, $options ) . "\n";
		}
	}

	if ( $withSeparators ) {
		// 12 = the five dashes and one space on each side of the title.
		$chunks[] = str_repeat( '-', mb_strlen( $title ) + 12 ) . "\n";
	}

	return implode( '', $chunks );
}
453 | |
454 | } |