Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
36.21% |
42 / 116 |
|
33.33% |
5 / 15 |
CRAP | |
0.00% |
0 / 1 |
DomPageBundle | |
36.21% |
42 / 116 |
|
33.33% |
5 / 15 |
147.65 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
newEmpty | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
fromPageBundle | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
toDom | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
apply | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
42 | |||
toSingleDocument | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
fromSingleDocument | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
fromLoadedDocument | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
isSingleDocument | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
toSingleDocumentHtml | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
toInlineAttributeHtml | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
2.15 | |||
encodeForHeadElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeFromHeadElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
toJsonArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
newFromJsonArray | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Core; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\JsonCodec\JsonCodecable; |
8 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\Utils\DOMCompat; |
13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMUtils; |
15 | use Wikimedia\Parsoid\Utils\PHPUtils; |
16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
17 | |
/**
 * A page bundle stores an HTML DOM with separated data-parsoid and
 * data-mw content.  The data-parsoid and data-mw content is indexed
 * by the id attributes on individual nodes.  This content needs to
 * be loaded before the data-parsoid and/or data-mw information can be
 * used.
 *
 * Note that the parsoid/mw properties of the page bundle are in "serialized
 * array" form; that is, they are flat arrays appropriate for json-encoding
 * and do not contain DataParsoid or DataMw objects.
 *
 * A DomPageBundle is single-use: ::toDom() and ::toSingleDocument() hand
 * the wrapped document back to the caller and null out $doc, after which
 * those methods fail with an invariant error if called again.
 *
 * See PageBundle for a similar structure used where the HTML DOM has been
 * serialized into a string.
 */
class DomPageBundle implements JsonCodecable {
	use JsonCodecableTrait;

	/**
	 * The document, as a DOM.
	 *
	 * Set to null once the bundle has been consumed by ::toDom() or
	 * ::toSingleDocument().
	 */
	public ?Document $doc;

	/**
	 * A map from ID to the array serialization of DataParsoid for the Node
	 * with that ID.
	 *
	 * @var null|array{counter?:int,offsetType?:'byte'|'ucs2'|'char',ids:array<string,array>}
	 */
	public $parsoid;

	/**
	 * A map from ID to the array serialization of DataMw for the Node
	 * with that ID.
	 *
	 * @var null|array{ids:array<string,array>}
	 */
	public $mw;

	/** @var ?string */
	public $version;

	/**
	 * A map of HTTP headers: both name and value should be strings.
	 * @var array<string,string>|null
	 */
	public $headers;

	/** @var string|null */
	public $contentmodel;

	/**
	 * @param Document $doc The DOM.  It must not contain an embedded
	 *   `mw-pagebundle` element; use ::fromSingleDocument() to unpack such
	 *   a document first.
	 * @param ?array $parsoid Serialized data-parsoid map, keyed by node ID
	 * @param ?array $mw Serialized data-mw map, keyed by node ID
	 * @param ?string $version
	 * @param ?array $headers Map of HTTP headers
	 * @param ?string $contentmodel
	 */
	public function __construct(
		Document $doc, ?array $parsoid = null, ?array $mw = null,
		?string $version = null, ?array $headers = null,
		?string $contentmodel = null
	) {
		$this->doc = $doc;
		$this->parsoid = $parsoid;
		$this->mw = $mw;
		$this->version = $version;
		$this->headers = $headers;
		$this->contentmodel = $contentmodel;
		Assert::invariant(
			!self::isSingleDocument( $doc ),
			'single document should be unpacked before DomPageBundle created'
		);
	}

	/**
	 * Create a DomPageBundle for the given document with empty
	 * data-parsoid and data-mw maps (and the data-parsoid counter set
	 * to -1).
	 *
	 * @param Document $doc
	 * @param ?string $version
	 * @param ?array $headers Map of HTTP headers
	 * @param ?string $contentmodel
	 * @return self
	 */
	public static function newEmpty(
		Document $doc,
		?string $version = null,
		?array $headers = null,
		?string $contentmodel = null
	): self {
		return new self(
			$doc,
			[
				'counter' => -1,
				'ids' => [],
			],
			[
				'ids' => [],
			],
			$version,
			$headers,
			$contentmodel
		);
	}

	/**
	 * Create a DomPageBundle from a PageBundle.
	 *
	 * This simply parses the HTML string from the PageBundle, preserving
	 * the metadata.
	 */
	public static function fromPageBundle( PageBundle $pb ): DomPageBundle {
		return new self(
			DOMUtils::parseHTML( $pb->html ),
			$pb->parsoid,
			$pb->mw,
			$pb->version,
			$pb->headers,
			$pb->contentmodel
		);
	}

	/**
	 * Return the wrapped document, or fail with a descriptive invariant
	 * error if this bundle has already been consumed.
	 *
	 * Without this check, reuse would surface as an opaque TypeError when
	 * the null document is passed to a helper expecting a Document.
	 */
	private function getDocOrFail(): Document {
		$doc = $this->doc;
		Assert::invariant(
			$doc !== null,
			'DomPageBundle has already been consumed and cannot be reused'
		);
		'@phan-var Document $doc'; // assert non-null
		return $doc;
	}

	/**
	 * Return a DOM from the contents of this page bundle.
	 *
	 * The data-parsoid and data-mw information from this page bundle is
	 * first written to the DOM as inline attributes.
	 *
	 * If `$load` is true (the default), the returned DOM is then prepared
	 * and loaded using `$options`.
	 *
	 * If `$load` is false, the inline attributes are left in place.  This
	 * process is less efficient than preparing and loading the document
	 * directly from the DOM and should be avoided if possible.
	 *
	 * This consumes the page bundle: it cannot be used again afterwards.
	 *
	 * @param bool $load Whether to prepare and load the document
	 * @param ?array $options Options passed to
	 *   DOMDataUtils::visitAndLoadDataAttribs(); when null, `markNew` and
	 *   `validateXMLNames` are both enabled.
	 * @return Document
	 */
	public function toDom( bool $load = true, ?array $options = null ): Document {
		$doc = $this->getDocOrFail();
		self::apply( $doc, $this );
		if ( $load ) {
			DOMDataUtils::prepareDoc( $doc );
			$body = DOMCompat::getBody( $doc );
			'@phan-var Element $body'; // assert non-null
			DOMDataUtils::visitAndLoadDataAttribs( $body, $options ?? [
				'markNew' => true,
				'validateXMLNames' => true,
			] );
		}
		$this->doc = null; // Prevent reuse of the DomPageBundle
		return $doc;
	}

	/**
	 * Applies the `data-*` attributes JSON structure to the document.
	 * Leaves `id` attributes behind -- they are used by citation code to
	 * extract `<ref>` body from the DOM.
	 *
	 * @param Document $doc doc
	 * @param DomPageBundle $pb page bundle
	 */
	private static function apply( Document $doc, DomPageBundle $pb ): void {
		Assert::invariant(
			!self::isSingleDocument( $doc ),
			"conflicting page bundle found in document"
		);
		$apply = static function ( Node $node ) use ( $pb ): void {
			if ( !( $node instanceof Element ) ) {
				return;
			}
			$id = DOMCompat::getAttribute( $node, 'id' );
			if ( $id === null ) {
				return;
			}
			if ( isset( $pb->parsoid['ids'][$id] ) ) {
				DOMDataUtils::setJSONAttribute(
					$node, 'data-parsoid', $pb->parsoid['ids'][$id]
				);
			}
			// Only apply data-mw if it isn't already set.  This means
			// earlier applications of the pagebundle have higher
			// precedence, inline data being the highest.
			if (
				isset( $pb->mw['ids'][$id] ) &&
				!$node->hasAttribute( 'data-mw' )
			) {
				DOMDataUtils::setJSONAttribute(
					$node, 'data-mw', $pb->mw['ids'][$id]
				);
			}
		};
		DOMUtils::visitDOM(
			DOMCompat::getBody( $doc ), $apply
		);
		// For template-bank representations, visit <template> nodes in the
		// <head> as well.
		DOMUtils::visitDOM(
			DOMCompat::getHead( $doc ), $apply
		);
	}

	/**
	 * Create a "PageBundle as single Document" by embedding page bundle
	 * information into a <script> element in the <head> of the DOM.
	 *
	 * This consumes the page bundle: it cannot be used again afterwards.
	 *
	 * @see ::fromSingleDocument()
	 */
	public function toSingleDocument(): Document {
		$doc = $this->getDocOrFail();
		$script = DOMUtils::appendToHead( $doc, 'script', [
			'id' => 'mw-pagebundle',
			'type' => 'application/x-mw-pagebundle',
		] );
		$script->appendChild( $doc->createTextNode( $this->encodeForHeadElement() ) );
		// Invalidate this DomPageBundle to prevent us from using it again.
		$this->doc = null;
		return $doc;
	}

	/**
	 * Return a DomPageBundle from a "PageBundle as single Document"
	 * representation, where some page bundle information has been embedded
	 * as a <script> element into the <head> of the DOM.
	 *
	 * The embedded element is removed from the document.
	 *
	 * @see ::toSingleDocument()
	 *
	 * @param Document $doc doc
	 * @param array $options Optional content version/headers/contentmodel
	 * @return DomPageBundle
	 */
	public static function fromSingleDocument( Document $doc, array $options = [] ): DomPageBundle {
		$dpScriptElt = DOMCompat::getElementById( $doc, 'mw-pagebundle' );
		Assert::invariant( $dpScriptElt !== null, "no page bundle found" );
		// Detach the element first so the DomPageBundle constructor's
		// "not a single document" invariant holds.
		$dpScriptElt->parentNode->removeChild( $dpScriptElt );
		return self::decodeFromHeadElement( $doc, $dpScriptElt->textContent, $options );
	}

	/**
	 * Create a new DomPageBundle from a "prepared and loaded" document.
	 *
	 * If a `pageBundle` key is present in the options, the
	 * version/headers/contentmodel will be initialized from that
	 * page bundle.
	 *
	 * @param Document $doc Should be "prepared and loaded"
	 * @param array $options store options
	 * @return DomPageBundle
	 */
	public static function fromLoadedDocument( Document $doc, array $options = [] ): DomPageBundle {
		$metadata = $options['pageBundle'] ?? null;
		$dpb = self::newEmpty(
			$doc,
			$metadata->version ?? $options['contentversion'] ?? null,
			$metadata->headers ?? $options['headers'] ?? null,
			$metadata->contentmodel ?? $options['contentmodel'] ?? null
		);
		// Array union: the two keys set here take precedence over any
		// same-named keys in $options.
		DOMDataUtils::visitAndStoreDataAttribs(
			DOMCompat::getBody( $doc ),
			[
				'storeInPageBundle' => $dpb,
				'outputContentVersion' => $dpb->version,
			] + $options
		);
		return $dpb;
	}

	/**
	 * Return true iff the given Document has page bundle information embedded
	 * as a <script id="mw-pagebundle"> element in its <head>.
	 */
	public static function isSingleDocument( Document $doc ): bool {
		return DOMCompat::getElementById( $doc, 'mw-pagebundle' ) !== null;
	}

	/**
	 * Convert this DomPageBundle to "single document" form, where page bundle
	 * information is embedded in the <head> of the document.
	 * @param array $options XMLSerializer options
	 * @return string an HTML string
	 */
	public function toSingleDocumentHtml( array $options = [] ): string {
		$doc = $this->toSingleDocument();
		return XMLSerializer::serialize( $doc, $options )['html'];
	}

	/**
	 * Convert this DomPageBundle to "inline attribute" form, where page bundle
	 * information is represented as inline JSON-valued attributes.
	 * @param array $options XMLSerializer options.  If `body_only` is set,
	 *   only the contents of the <body> are serialized.
	 * @return string an HTML string
	 */
	public function toInlineAttributeHtml( array $options = [] ): string {
		$doc = $this->toDom( false );
		if ( $options['body_only'] ?? false ) {
			$node = DOMCompat::getBody( $doc );
			$options['innerXML'] = true;
		} else {
			$node = $doc;
		}
		return XMLSerializer::serialize( $node, $options )['html'];
	}

	/**
	 * Encode some page bundle properties for emitting as a <script> element
	 * in the <head> of a document.
	 */
	private function encodeForHeadElement(): string {
		// Note that $this->parsoid and $this->mw are already serialized arrays
		// so a naive jsonEncode is sufficient.  We don't need a codec.
		return PHPUtils::jsonEncode( [ 'parsoid' => $this->parsoid ?? [], 'mw' => $this->mw ?? [] ] );
	}

	/**
	 * Decode some page bundle properties from the contents of the <script>
	 * element embedded in a document.
	 */
	private static function decodeFromHeadElement( Document $doc, string $s, array $options = [] ): DomPageBundle {
		// Note that only 'parsoid' and 'mw' are encoded, so these will be
		// the only fields set in the decoded DomPageBundle
		$decoded = PHPUtils::jsonDecode( $s );
		return new self(
			$doc,
			$decoded['parsoid'] ?? null,
			$decoded['mw'] ?? null,
			$options['contentversion'] ?? null,
			$options['headers'] ?? null,
			$options['contentmodel'] ?? null
		);
	}

	// JsonCodecable -------------

	/** @inheritDoc */
	public function toJsonArray(): array {
		return PageBundle::fromDomPageBundle( $this )->toJsonArray();
	}

	/** @inheritDoc */
	public static function newFromJsonArray( array $json ): DomPageBundle {
		$pb = PageBundle::newFromJsonArray( $json );
		return self::fromPageBundle( $pb );
	}
}