Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 165 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
ContentUtils | |
0.00% |
0 / 165 |
|
0.00% |
0 / 12 |
3540 | |
0.00% |
0 / 1 |
toXML | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
ppToXML | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
createDocument | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocument | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
createAndLoadDocumentFragment | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
extractDpAndSerialize | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
stripUnnecessaryWrappersAndSyntheticNodes | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
processAttributeEmbeddedHTML | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
420 | |||
shiftDSR | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
72 | |||
convertOffsets | |
0.00% |
0 / 44 |
|
0.00% |
0 / 1 |
72 | |||
dumpNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
dumpDOM | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
56 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Closure; |
7 | use Wikimedia\Assert\Assert; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
13 | use Wikimedia\Parsoid\DOM\Element; |
14 | use Wikimedia\Parsoid\DOM\Node; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
17 | |
18 | /** |
19 | * These utilities are for processing content that's generated |
20 | * by parsing source input (ex: wikitext) |
21 | */ |
22 | class ContentUtils { |
23 | |
/**
 * Serialize a node to an XML string.
 *
 * Runs XMLSerializer::serialize() on the node and returns the 'html'
 * entry of the result it produces.
 *
 * @param Node $node Node to serialize.
 * @param array $options XMLSerializer options.
 * @return string
 */
public static function toXML( Node $node, array $options = [] ): string {
	$serialized = XMLSerializer::serialize( $node, $options );
	return $serialized['html'];
}
34 | |
/**
 * Data-object aware XML serializer, to be used in the DOM
 * post-processing phase.
 *
 * The same $options array is handed both to
 * DOMDataUtils::visitAndStoreDataAttribs() and to self::toXML().
 *
 * @param Node $node
 * @param array $options
 * @return string
 */
public static function ppToXML( Node $node, array $options = [] ): string {
	// Store the data attributes on the tree before it is serialized.
	DOMDataUtils::visitAndStoreDataAttribs( $node, $options );
	$xml = self::toXML( $node, $options );
	return $xml;
}
46 | |
/**
 * Parse an HTML string into a new, prepared Document.
 *
 * XXX: Don't use this outside of testing. It shouldn't be necessary
 * to create new documents when parsing or serializing. A document lives
 * on the environment which can be used to create fragments. The bag added
 * as a dynamic property to the PHP wrapper around the libxml doc
 * is at risk of being GC-ed.
 *
 * @param string $html HTML to parse (defaults to the empty string).
 * @param bool $validateXMLNames Passed through to DOMUtils::parseHTML().
 * @return Document
 */
public static function createDocument(
	string $html = '', bool $validateXMLNames = false
): Document {
	$document = DOMUtils::parseHTML( $html, $validateXMLNames );
	// Prepare the freshly parsed document before handing it out.
	DOMDataUtils::prepareDoc( $document );
	return $document;
}
65 | |
/**
 * Create a Document from HTML and load the data attributes found
 * under its body.
 *
 * XXX: Don't use this outside of testing. It shouldn't be necessary
 * to create new documents when parsing or serializing. A document lives
 * on the environment which can be used to create fragments. The bag added
 * as a dynamic property to the PHP wrapper around the libxml doc
 * is at risk of being GC-ed.
 *
 * @param string $html
 * @param array $options Passed to DOMDataUtils::visitAndLoadDataAttribs().
 * @return Document
 */
public static function createAndLoadDocument(
	string $html, array $options = []
): Document {
	$document = self::createDocument( $html );
	$body = DOMCompat::getBody( $document );
	DOMDataUtils::visitAndLoadDataAttribs( $body, $options );
	return $document;
}
86 | |
/**
 * Build a DocumentFragment owned by $doc from an HTML string and load
 * the data attributes of the resulting nodes.
 *
 * @param Document $doc Document used to create the fragment.
 * @param string $html HTML to set as the fragment's inner HTML.
 * @param array $options Passed to DOMDataUtils::visitAndLoadDataAttribs().
 * @return DocumentFragment
 */
public static function createAndLoadDocumentFragment(
	Document $doc, string $html, array $options = []
): DocumentFragment {
	$fragment = $doc->createDocumentFragment();
	DOMUtils::setFragmentInnerHTML( $fragment, $html );
	DOMDataUtils::visitAndLoadDataAttribs( $fragment, $options );
	return $fragment;
}
101 | |
/**
 * Pull the data-parsoid script element out of the doc before serializing.
 *
 * The page bundle is extracted from the owner document when $node is a
 * body element, and from $node itself otherwise. The serializer output
 * is returned with the extracted bundle added under the 'pb' key.
 *
 * @param Node $node
 * @param array $options XMLSerializer options.
 * @return array XMLSerializer::serialize() result plus a 'pb' entry.
 */
public static function extractDpAndSerialize( Node $node, array $options = [] ): array {
	if ( DOMUtils::isBody( $node ) ) {
		$bundleSource = $node->ownerDocument;
	} else {
		$bundleSource = $node;
	}

	// Extraction happens before serialization.
	$pageBundle = DOMDataUtils::extractPageBundle( $bundleSource );

	$result = XMLSerializer::serialize( $node, $options );
	$result['pb'] = $pageBundle;
	return $result;
}
116 | |
/**
 * Strip Parsoid-inserted section wrappers, annotation wrappers, and synthetic nodes
 * (fallback id spans with HTML4 ids for headings, auto-generated TOC metas
 * and possibly other such in the future) from the DOM.
 *
 * Works recursively on the element children of $node; non-element
 * children are left alone.
 *
 * @param Element $node Root of the subtree to clean up (modified in place).
 */
public static function stripUnnecessaryWrappersAndSyntheticNodes( Element $node ): void {
	for ( $child = $node->firstChild; $child; $child = $following ) {
		// Grab the sibling first: $child may be removed from the tree below.
		$following = $child->nextSibling;

		if ( !( $child instanceof Element ) ) {
			continue;
		}

		// Auto-generated synthetic <meta> tags and
		// <span typeof='mw:FallbackId' ...></span> are dropped outright.
		$isAutoGeneratedMeta = DOMCompat::nodeName( $child ) === 'meta' &&
			( DOMDataUtils::getDataMw( $child )->autoGenerated ?? false );
		if ( $isAutoGeneratedMeta || WTUtils::isFallbackIdSpan( $child ) ) {
			$child->parentNode->removeChild( $child );
			continue;
		}

		// Clean the subtree first, then decide whether to unwrap this element.
		self::stripUnnecessaryWrappersAndSyntheticNodes( $child );

		// <section> tags and synthetic extended-annotation-region wrappers
		// are replaced by their children.
		$isWrapper = WTUtils::isParsoidSectionTag( $child ) ||
			DOMUtils::hasTypeOf( $child, 'mw:ExtendedAnnRange' );
		if ( $isWrapper ) {
			$parent = $child->parentNode;
			DOMUtils::migrateChildren( $child, $parent, $child );
			$parent->removeChild( $child );
		}
	}
}
152 | |
/**
 * Extensions might be interested in examining their content embedded
 * in data-mw attributes that don't otherwise show up in the DOM.
 *
 * Ex: inline media captions that aren't rendered, language variant markup,
 * attributes that are transcluded. More scenarios might be added later.
 *
 * Elements without a `typeof` attribute are skipped entirely. Otherwise
 * the following locations are run through $proc and updated in place:
 *  - `html` properties of the keys/values in data-mw `attribs`
 *    (mw:ExpandedAttrs),
 *  - the `t` / `f` fields of the `disabled`, `twoway`, `oneway` and
 *    `filter` entries of `data-mw-variant` (mw:LanguageVariant),
 *  - the data-mw `caption` of inline media,
 *  - whatever the extension tag handler chooses to process, when the
 *    extension config sets options.wt2html.embedsHTMLInAttributes.
 *
 * @param ParsoidExtensionAPI $extAPI
 * @param Element $elt The node whose data attributes need to be examined
 * @param Closure $proc The processor that will process the embedded HTML
 *        Signature: (string) -> string
 *        This processor will be provided the HTML string as input
 *        and is expected to return a possibly modified string.
 */
public static function processAttributeEmbeddedHTML(
	ParsoidExtensionAPI $extAPI, Element $elt, Closure $proc
): void {
	if ( !$elt->hasAttribute( 'typeof' ) ) {
		return;
	}

	// Expanded attributes
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:ExpandedAttrs$/' ) ) {
		$dmw = DOMDataUtils::getDataMw( $elt );
		if ( $dmw->attribs ?? null ) {
			// Iterate by value. Only objects (the entries carrying an
			// `html` property) are modified, and those are shared handles,
			// so a by-reference loop variable is unnecessary and would
			// leave a dangling reference into $dmw->attribs.
			foreach ( $dmw->attribs as $a ) {
				foreach ( $a as $kOrV ) {
					if ( !is_string( $kOrV ) && isset( $kOrV->html ) ) {
						$kOrV->html = $proc( $kOrV->html );
					}
				}
			}
		}
	}

	// Language variant markup
	if ( DOMUtils::matchTypeOf( $elt, '/^mw:LanguageVariant$/' ) ) {
		$dmwv = DOMDataUtils::getJSONAttribute( $elt, 'data-mw-variant', null );
		if ( $dmwv ) {
			if ( isset( $dmwv->disabled ) ) {
				$dmwv->disabled->t = $proc( $dmwv->disabled->t );
			}
			if ( isset( $dmwv->twoway ) ) {
				foreach ( $dmwv->twoway as $l ) {
					$l->t = $proc( $l->t );
				}
			}
			if ( isset( $dmwv->oneway ) ) {
				foreach ( $dmwv->oneway as $l ) {
					$l->f = $proc( $l->f );
					$l->t = $proc( $l->t );
				}
			}
			if ( isset( $dmwv->filter ) ) {
				$dmwv->filter->t = $proc( $dmwv->filter->t );
			}
			// The attribute was decoded from JSON, so write the result back.
			DOMDataUtils::setJSONAttribute( $elt, 'data-mw-variant', $dmwv );
		}
	}

	// Inline media -- look inside the data-mw attribute
	if ( WTUtils::isInlineMedia( $elt ) ) {
		$dmw = DOMDataUtils::getDataMw( $elt );
		$caption = $dmw->caption ?? null;
		if ( $caption ) {
			$dmw->caption = $proc( $caption );
		}
	}

	// Process extension-specific embedded HTML
	$extTagName = WTUtils::getExtTagName( $elt );
	if ( $extTagName ) {
		$extConfig = $extAPI->getSiteConfig()->getExtTagConfig( $extTagName );
		if ( $extConfig['options']['wt2html']['embedsHTMLInAttributes'] ?? false ) {
			$tagHandler = $extAPI->getSiteConfig()->getExtTagImpl( $extTagName );
			$tagHandler->processAttributeEmbeddedHTML( $extAPI, $elt, $proc );
		}
	}
}
232 | |
/**
 * Shift the DOM Source Range (DSR) of a DOM fragment.
 *
 * Every element under $rootNode is visited via DOMPostOrder::traverse().
 * For each one, $dsrFunc is applied to a clone of its `dsr`, of the
 * temporary `origDSR`, and of `extTagOffsets` (whichever are set), and
 * the result is stored back. HTML embedded in attributes and the content
 * of sealed DOM fragments are processed the same way, recursively.
 *
 * @param Env $env
 * @param Node $rootNode
 * @param callable $dsrFunc Receives a cloned range, returns the range to store.
 * @param ParsoidExtensionAPI $extAPI
 * @return Node Returns the $rootNode passed in to allow chaining.
 * @throws UnreachableException If an unsealed mw:DOMFragment node is found.
 */
public static function shiftDSR(
	Env $env, Node $rootNode, callable $dsrFunc, ParsoidExtensionAPI $extAPI
): Node {
	$ownerDoc = $rootNode->ownerDocument;

	// The two closures below call each other. $shiftNode is declared
	// up front and captured by reference so that $shiftHTML sees the
	// real visitor once it has been assigned.
	$shiftNode = null;

	// Parse an HTML string, shift every node in it, and serialize it back.
	$shiftHTML = function ( string $html ) use ( $ownerDoc, &$shiftNode ): string {
		$fragment = self::createAndLoadDocumentFragment( $ownerDoc, $html );
		DOMPostOrder::traverse( $fragment, $shiftNode );
		return self::ppToXML( $fragment, [ 'innerXML' => true ] );
	};

	// Per-node visitor.
	$shiftNode = static function ( Node $node ) use (
		$env, $extAPI, $dsrFunc, $shiftHTML, &$shiftNode
	) {
		if ( !( $node instanceof Element ) ) {
			return;
		}

		// The data-parsoid object is not a copy, so assigning to its
		// properties is enough; no setDataParsoid call is required.
		$dataParsoid = DOMDataUtils::getDataParsoid( $node );
		if ( isset( $dataParsoid->dsr ) ) {
			$dataParsoid->dsr = $dsrFunc( clone $dataParsoid->dsr );
		}

		// Even though tmp shouldn't escape Parsoid, go ahead and
		// convert to enable hybrid testing.
		$temp = $dataParsoid->getTemp();
		if ( isset( $temp->origDSR ) ) {
			$temp->origDSR = $dsrFunc( clone $temp->origDSR );
		}

		if ( isset( $dataParsoid->extTagOffsets ) ) {
			$dataParsoid->extTagOffsets = $dsrFunc( clone $dataParsoid->extTagOffsets );
		}

		// Handle embedded HTML in attributes
		self::processAttributeEmbeddedHTML( $extAPI, $node, $shiftHTML );

		// DOMFragments will have already been unpacked when DSR shifting is run
		if ( DOMUtils::hasTypeOf( $node, 'mw:DOMFragment' ) ) {
			throw new UnreachableException( "Shouldn't encounter these nodes here." );
		}

		// However, extensions can choose to handle sealed fragments whenever
		// they want and so may be returned in subpipelines which could
		// subsequently be shifted
		if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment/sealed/\w+$#D' ) ) {
			$sealedHtml = DOMDataUtils::getDataParsoid( $node )->html ?? null;
			if ( $sealedHtml ) {
				$sealedFragment = $env->getDOMFragment( $sealedHtml );
				DOMPostOrder::traverse( $sealedFragment, $shiftNode );
			}
		}
	};

	DOMPostOrder::traverse( $rootNode, $shiftNode );
	return $rootNode; // chainable
}
298 | |
/**
 * Convert DSR offsets in a Document between utf-8/ucs2/codepoint
 * indices.
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * The conversion is done in two passes over the document body using
 * self::shiftDSR(): the first pass collects every distinct offset, the
 * second rebuilds each range from the converted values.
 *
 * @see TokenUtils::convertTokenOffsets for a related function on tokens.
 *
 * @param Env $env Its current offset type is set to $to, even when no
 *   conversion is needed.
 * @param Document $doc The document to convert
 * @param string $from Offset type to convert from.
 * @param string $to Offset type to convert to.
 */
public static function convertOffsets(
	Env $env,
	Document $doc,
	string $from,
	string $to
): void {
	$env->setCurrentOffsetType( $to );
	if ( $from === $to ) {
		return; // Hey, that was easy!
	}

	// $boxes maps each original offset to an object holding its value;
	// $refs holds references to those same values so that they can be
	// rewritten in place by TokenUtils::convertOffsets().
	$boxes = [];
	$refs = [];
	$register = static function ( int $offset ) use ( &$boxes, &$refs ) {
		if ( array_key_exists( $offset, $boxes ) ) {
			return;
		}
		$box = (object)[ 'value' => $offset ];
		$boxes[$offset] = $box;
		$refs[] =& $box->value;
	};

	// Pass 1: collect DSR offsets throughout the document.
	// Ranges are returned as received.
	$gather = static function ( DomSourceRange $range ) use ( $register ) {
		if ( $range->start !== null ) {
			$register( $range->start );
			$register( $range->innerStart() );
		}
		if ( $range->end !== null ) {
			$register( $range->innerEnd() );
			$register( $range->end );
		}
		return $range;
	};

	$body = DOMCompat::getBody( $doc );
	$extAPI = new ParsoidExtensionAPI( $env );
	self::shiftDSR( $env, $body, $gather, $extAPI );

	if ( $refs === [] ) {
		return; /* nothing to do (shouldn't really happen) */
	}

	// Now convert these offsets
	$sourceText = $env->topFrame->getSrcText();
	TokenUtils::convertOffsets( $sourceText, $from, $to, $refs );

	// Pass 2: apply converted offsets. Widths are recomputed from the
	// converted outer and inner positions.
	$rewrite = static function ( DomSourceRange $range ) use ( $boxes ) {
		$start = $range->start;
		$end = $range->end;
		$openWidth = $range->openWidth;
		$closeWidth = $range->closeWidth;

		if ( $start !== null ) {
			$start = $boxes[$start]->value;
			$openWidth = $boxes[$range->innerStart()]->value - $start;
		}
		if ( $end !== null ) {
			$end = $boxes[$end]->value;
			$closeWidth = $end - $boxes[$range->innerEnd()]->value;
		}

		return new DomSourceRange( $start, $end, $openWidth, $closeWidth );
	};
	self::shiftDSR( $env, $body, $rewrite, $extAPI );
}
376 | |
/**
 * Serialize a single node for dumping.
 *
 * 'saveData' => true is added to the options unless the caller
 * already supplied a 'saveData' entry.
 *
 * @param Node $node
 * @param array $options
 * @return string
 */
private static function dumpNode( Node $node, array $options ): string {
	$dumpOptions = $options + [ 'saveData' => true ];
	return self::toXML( $node, $dumpOptions );
}
385 | |
/**
 * Dump the DOM with attributes.
 *
 * Unless 'quiet' is set, the dump is framed by a header line containing
 * the title and a footer line of dashes of the same length.
 *
 * @param Node $rootNode
 * @param string $title
 * @param array $options Associative array of options:
 *   - dumpFragmentMap: Dump the fragment map from env
 *     (requires the 'env' option to be set)
 *   - quiet: Suppress separators
 *
 * storeDataAttribs options:
 *   - discardDataParsoid
 *   - keepTmp
 *   - storeInPageBundle
 *   - storeDiffMark
 *   - env
 *   - idIndex
 *
 * XMLSerializer options:
 *   - smartQuote
 *   - innerXML
 *   - captureOffsets
 *   - addDoctype
 * @return string The dump result
 */
public static function dumpDOM(
	Node $rootNode, string $title = '', array $options = []
): string {
	$withFragments = !empty( $options['dumpFragmentMap'] );
	$withSeparators = empty( $options['quiet'] );

	if ( $withFragments ) {
		Assert::invariant( isset( $options['env'] ), "env should be set" );
	}

	$dump = $withSeparators ? "----- {$title} -----\n" : '';
	$dump .= self::dumpNode( $rootNode, $options ) . "\n";

	// Dump cached fragments
	if ( $withFragments ) {
		foreach ( $options['env']->getDOMFragmentMap() as $key => $entry ) {
			// Map entries are either a node or an array whose first item is one.
			$fragmentNode = is_array( $entry ) ? $entry[0] : $entry;
			$dump .= str_repeat( '=', 15 ) . "\n";
			$dump .= "FRAGMENT {$key}\n";
			$dump .= self::dumpNode( $fragmentNode, $options ) . "\n";
		}
	}

	if ( $withSeparators ) {
		// 12 = the dashes and spaces around the title in the header line.
		$dump .= str_repeat( '-', mb_strlen( $title ) + 12 ) . "\n";
	}
	return $dump;
}
440 | |
441 | } |