Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
33.87% |
42 / 124 |
|
33.33% |
5 / 15 |
CRAP | |
0.00% |
0 / 1 |
DomPageBundle | |
33.87% |
42 / 124 |
|
33.33% |
5 / 15 |
161.97 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
newEmpty | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
fromPageBundle | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
toDom | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
6 | |||
apply | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
42 | |||
toSingleDocument | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
fromSingleDocument | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
fromLoadedDocument | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
isSingleDocument | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
toSingleDocumentHtml | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
toInlineAttributeHtml | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
2.15 | |||
encodeForHeadElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeFromHeadElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
toJsonArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
newFromJsonArray | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Core; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\JsonCodec\JsonCodecable; |
8 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\Utils\DOMCompat; |
13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMUtils; |
15 | use Wikimedia\Parsoid\Utils\PHPUtils; |
16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
17 | |
/**
 * A page bundle stores an HTML DOM with separated data-parsoid and
 * data-mw content.  The data-parsoid and data-mw content is indexed
 * by the id attributes on individual nodes.  This content needs to
 * be loaded before the data-parsoid and/or data-mw information can be
 * used.
 *
 * Note that the parsoid/mw properties of the page bundle are in "serialized
 * array" form; that is, they are flat arrays appropriate for json-encoding
 * and do not contain DataParsoid or DataMw objects.
 *
 * A DomPageBundle is single-use: ::toDom() and ::toSingleDocument() hand
 * the wrapped document to the caller and then clear $doc, so the same
 * bundle cannot be converted twice.
 *
 * See PageBundle for a similar structure used where the HTML DOM has been
 * serialized into a string.
 */
class DomPageBundle implements JsonCodecable {
	use JsonCodecableTrait;

	/**
	 * The document, as a DOM.
	 *
	 * This is reset to null by ::toDom() and ::toSingleDocument() once the
	 * document has been returned to the caller.
	 */
	public ?Document $doc;

	/**
	 * A map from ID to the array serialization of DataParsoid for the Node
	 * with that ID.
	 *
	 * @var null|array{counter?:int,offsetType?:'byte'|'ucs2'|'char',ids:array<string,array>}
	 */
	public $parsoid;

	/**
	 * A map from ID to the array serialization of DataMw for the Node
	 * with that ID.
	 *
	 * @var null|array{ids:array<string,array>}
	 */
	public $mw;

	/** @var ?string */
	public $version;

	/**
	 * A map of HTTP headers: both name and value should be strings.
	 * @var array<string,string>|null
	 */
	public $headers;

	/** @var string|null */
	public $contentmodel;

	/**
	 * Wrap a document together with its separated page bundle data.
	 *
	 * The document must not already carry an embedded page bundle (a
	 * `mw-pagebundle` element); use ::fromSingleDocument() to unpack such
	 * a document instead.
	 *
	 * @param Document $doc The document, as a DOM
	 * @param ?array $parsoid Serialized data-parsoid map (see $parsoid)
	 * @param ?array $mw Serialized data-mw map (see $mw)
	 * @param ?string $version Content version
	 * @param ?array $headers Map of HTTP headers
	 * @param ?string $contentmodel Content model
	 */
	public function __construct(
		Document $doc, ?array $parsoid = null, ?array $mw = null,
		?string $version = null, ?array $headers = null,
		?string $contentmodel = null
	) {
		$this->doc = $doc;
		$this->parsoid = $parsoid;
		$this->mw = $mw;
		$this->version = $version;
		$this->headers = $headers;
		$this->contentmodel = $contentmodel;
		Assert::invariant(
			!self::isSingleDocument( $doc ),
			'single document should be unpacked before DomPageBundle created'
		);
	}

	/**
	 * Create a DomPageBundle for the given document whose data-parsoid
	 * and data-mw maps contain no ids yet.  The data-parsoid `counter`
	 * is initialized to -1.
	 *
	 * @param Document $doc The document, as a DOM
	 * @param ?string $version Content version
	 * @param ?array $headers Map of HTTP headers
	 * @param ?string $contentmodel Content model
	 * @return self
	 */
	public static function newEmpty(
		Document $doc,
		?string $version = null,
		?array $headers = null,
		?string $contentmodel = null
	): self {
		return new DomPageBundle(
			$doc,
			[
				'counter' => -1,
				'ids' => [],
			],
			[
				'ids' => [],
			],
			$version,
			$headers,
			$contentmodel
		);
	}

	/**
	 * Create a DomPageBundle from a PageBundle.
	 *
	 * This simply parses the HTML string from the PageBundle, preserving
	 * the metadata.
	 *
	 * @param PageBundle $pb
	 * @return DomPageBundle
	 */
	public static function fromPageBundle( PageBundle $pb ): DomPageBundle {
		return new DomPageBundle(
			DOMUtils::parseHTML( $pb->html ),
			$pb->parsoid,
			$pb->mw,
			$pb->version,
			$pb->headers,
			$pb->contentmodel
		);
	}

	/**
	 * Return a DOM from the contents of this page bundle.
	 *
	 * If `$load` is true (the default), the returned DOM will be prepared
	 * and loaded using `$options`.  Entries in `$options` take precedence
	 * over the `markNew` and `validateXMLNames` defaults, but cannot
	 * replace the `loadFromPageBundle` entry.
	 *
	 * If `$load` is false, any data-parsoid or data-mw information from this
	 * page bundle will be converted to inline attributes in the DOM.  This
	 * process is less efficient than preparing and loading the document
	 * directly from the DOM and should be avoided if possible.
	 *
	 * This DomPageBundle is invalidated by the call and must not be used
	 * again afterwards.
	 *
	 * @param bool $load Whether to prepare and load the returned document
	 * @param ?array $options Load options; only used when `$load` is true
	 * @return Document
	 */
	public function toDom( bool $load = true, ?array $options = null ): Document {
		$doc = $this->doc;
		// Fail with a clear message instead of an obscure TypeError further
		// down if this bundle was already converted.
		Assert::invariant(
			$doc !== null,
			'DomPageBundle has already been consumed'
		);
		if ( $load ) {
			$options ??= [];
			DOMDataUtils::prepareDoc( $doc );
			$body = DOMCompat::getBody( $doc );
			'@phan-var Element $body'; // assert non-null
			DOMDataUtils::visitAndLoadDataAttribs(
				$body,
				// Array union keeps the left-most value for duplicate keys.
				[
					'loadFromPageBundle' => $this,
				] + $options + [
					'markNew' => true,
					'validateXMLNames' => true,
				]
			);
			DOMDataUtils::getBag( $doc )->loaded = true;
		} else {
			self::apply( $doc, $this );
		}
		$this->doc = null; // Prevent reuse of the DomPageBundle
		return $doc;
	}

	/**
	 * Applies the `data-*` attributes JSON structure to the document.
	 * Leaves `id` attributes behind -- they are used by citation code to
	 * extract `<ref>` body from the DOM.
	 *
	 * For every element with an `id`, data-parsoid is set whenever the page
	 * bundle has an entry for that id; data-mw is only set if the element
	 * does not already have a `data-mw` attribute.
	 *
	 * @param Document $doc Document to annotate; must not contain an
	 *   embedded page bundle
	 * @param DomPageBundle $pb Page bundle supplying the data
	 */
	private static function apply( Document $doc, DomPageBundle $pb ): void {
		Assert::invariant(
			!self::isSingleDocument( $doc ),
			"conflicting page bundle found in document"
		);
		$apply = static function ( Node $node ) use ( $pb ): void {
			if ( $node instanceof Element ) {
				$id = DOMCompat::getAttribute( $node, 'id' );
				if ( $id === null ) {
					return;
				}
				if ( isset( $pb->parsoid['ids'][$id] ) ) {
					DOMDataUtils::setJSONAttribute(
						$node, 'data-parsoid', $pb->parsoid['ids'][$id]
					);
				}
				if ( isset( $pb->mw['ids'][$id] ) ) {
					// Only apply if it isn't already set.  This means
					// earlier applications of the pagebundle have higher
					// precedence, inline data being the highest.
					if ( !$node->hasAttribute( 'data-mw' ) ) {
						DOMDataUtils::setJSONAttribute(
							$node, 'data-mw', $pb->mw['ids'][$id]
						);
					}
				}
			}
		};
		DOMUtils::visitDOM(
			DOMCompat::getBody( $doc ), $apply
		);
		// For fragment bank representations, visit <template> nodes in the
		// <head> as well.
		DOMUtils::visitDOM(
			DOMCompat::getHead( $doc ), $apply
		);
	}

	/**
	 * Create a "PageBundle as single Document" by embedding page bundle
	 * information into a <script> element in the <head> of the DOM.
	 *
	 * This DomPageBundle is invalidated by the call and must not be used
	 * again afterwards.
	 *
	 * @see ::fromSingleDocument()
	 *
	 * @return Document
	 */
	public function toSingleDocument(): Document {
		$doc = $this->doc;
		// Fail with a clear message instead of a "member function on null"
		// error if this bundle was already converted.
		Assert::invariant(
			$doc !== null,
			'DomPageBundle has already been consumed'
		);
		$script = DOMUtils::appendToHead( $doc, 'script', [
			'id' => 'mw-pagebundle',
			'type' => 'application/x-mw-pagebundle',
		] );
		$script->appendChild( $doc->createTextNode( $this->encodeForHeadElement() ) );
		// Invalidate this DomPageBundle to prevent us from using it again.
		$this->doc = null;
		return $doc;
	}

	/**
	 * Return a DomPageBundle from a "PageBundle as single Document"
	 * representation, where some page bundle information has been embedded
	 * as a <script> element into the <head> of the DOM.
	 *
	 * The embedded element is removed from the document.
	 *
	 * @see ::toSingleDocument()
	 *
	 * @param Document $doc Document containing an element with id
	 *   `mw-pagebundle`
	 * @param array $options Optional `contentversion`, `headers` and
	 *   `contentmodel` for the new page bundle
	 * @return DomPageBundle
	 */
	public static function fromSingleDocument( Document $doc, array $options = [] ): DomPageBundle {
		$dpScriptElt = DOMCompat::getElementById( $doc, 'mw-pagebundle' );
		Assert::invariant( $dpScriptElt !== null, "no page bundle found" );
		$dpScriptElt->parentNode->removeChild( $dpScriptElt );
		return self::decodeFromHeadElement( $doc, $dpScriptElt->textContent, $options );
	}

	/**
	 * Create a new DomPageBundle from a "prepared and loaded" document.
	 *
	 * If a `pageBundle` key is present in the options, the
	 * version/headers/contentmodel will be initialized from that
	 * page bundle; any of those which are null there fall back to the
	 * `contentversion`, `headers` and `contentmodel` options respectively.
	 *
	 * @param Document $doc Should be "prepared and loaded"
	 * @param array $options store options
	 * @return DomPageBundle
	 */
	public static function fromLoadedDocument( Document $doc, array $options = [] ): DomPageBundle {
		$metadata = $options['pageBundle'] ?? null;
		$dpb = self::newEmpty(
			$doc,
			$metadata->version ?? $options['contentversion'] ?? null,
			$metadata->headers ?? $options['headers'] ?? null,
			$metadata->contentmodel ?? $options['contentmodel'] ?? null
		);
		DOMDataUtils::visitAndStoreDataAttribs(
			DOMCompat::getBody( $doc ),
			// Array union keeps the left-most value for duplicate keys, so
			// these two entries cannot be replaced by $options.
			[
				'storeInPageBundle' => $dpb,
				'outputContentVersion' => $dpb->version,
			] + $options
		);
		return $dpb;
	}

	/**
	 * Return true iff the given Document has page bundle information embedded
	 * as a <script id="mw-pagebundle"> element in its <head>.
	 *
	 * @param Document $doc
	 * @return bool
	 */
	public static function isSingleDocument( Document $doc ): bool {
		return DOMCompat::getElementById( $doc, 'mw-pagebundle' ) !== null;
	}

	/**
	 * Convert this DomPageBundle to "single document" form, where page bundle
	 * information is embedded in the <head> of the document.
	 *
	 * This DomPageBundle is invalidated by the call.
	 *
	 * @param array $options XMLSerializer options
	 * @return string an HTML string
	 */
	public function toSingleDocumentHtml( array $options = [] ): string {
		$doc = $this->toSingleDocument();
		return XMLSerializer::serialize( $doc, $options )['html'];
	}

	/**
	 * Convert this DomPageBundle to "inline attribute" form, where page bundle
	 * information is represented as inline JSON-valued attributes.
	 *
	 * If the `body_only` option is set, only the contents of the <body>
	 * are serialized.  This DomPageBundle is invalidated by the call.
	 *
	 * @param array $options XMLSerializer options
	 * @return string an HTML string
	 */
	public function toInlineAttributeHtml( array $options = [] ): string {
		$doc = $this->toDom( false );
		if ( $options['body_only'] ?? false ) {
			$node = DOMCompat::getBody( $doc );
			$options['innerXML'] = true;
		} else {
			$node = $doc;
		}
		return XMLSerializer::serialize( $node, $options )['html'];
	}

	/**
	 * Encode some page bundle properties for emitting as a <script> element
	 * in the <head> of a document.
	 *
	 * @return string JSON with `parsoid` and `mw` keys
	 */
	private function encodeForHeadElement(): string {
		// Note that $this->parsoid and $this->mw are already serialized arrays
		// so a naive jsonEncode is sufficient.  We don't need a codec.
		return PHPUtils::jsonEncode( [
			'parsoid' => $this->parsoid ?? [],
			'mw' => $this->mw ?? [],
		] );
	}

	/**
	 * Decode some page bundle properties from the contents of the <script>
	 * element embedded in a document.
	 *
	 * @param Document $doc Document the <script> element was taken from
	 * @param string $s Contents of the <script> element
	 * @param array $options Optional `contentversion`, `headers` and
	 *   `contentmodel` for the new page bundle
	 * @return DomPageBundle
	 */
	private static function decodeFromHeadElement(
		Document $doc, string $s, array $options = []
	): DomPageBundle {
		// Note that only 'parsoid' and 'mw' are encoded, so these will be
		// the only fields set in the decoded DomPageBundle
		$decoded = PHPUtils::jsonDecode( $s );
		return new DomPageBundle(
			$doc,
			$decoded['parsoid'] ?? null,
			$decoded['mw'] ?? null,
			$options['contentversion'] ?? null,
			$options['headers'] ?? null,
			$options['contentmodel'] ?? null
		);
	}

	// JsonCodecable -------------

	/** @inheritDoc */
	public function toJsonArray(): array {
		return PageBundle::fromDomPageBundle( $this )->toJsonArray();
	}

	/** @inheritDoc */
	public static function newFromJsonArray( array $json ): DomPageBundle {
		$pb = PageBundle::newFromJsonArray( $json );
		return self::fromPageBundle( $pb );
	}
}