Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 122 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
PageBundle | |
0.00% |
0 / 122 |
|
0.00% |
0 / 14 |
756 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
newEmpty | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
toDom | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
toHtml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
validate | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 | |||
responseData | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
6 | |||
apply | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
42 | |||
encodeForHeadElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeFromHeadElement | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
fromDomPageBundle | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
12 | |||
toSingleDocumentHtml | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
toInlineAttributeHtml | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
toJsonArray | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
newFromJsonArray | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Core; |
5 | |
6 | use Composer\Semver\Semver; |
7 | use Wikimedia\JsonCodec\JsonCodecable; |
8 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\Utils\ContentUtils; |
13 | use Wikimedia\Parsoid\Utils\DOMCompat; |
14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
16 | use Wikimedia\Parsoid\Utils\PHPUtils; |
17 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
18 | |
/**
 * An HTML string bundled with its data-parsoid and (optionally) data-mw
 * content, which are kept separate from the markup.
 *
 * The separated data is keyed by the `id` attribute of the node it belongs
 * to, and has to be loaded before the data-parsoid and/or data-mw
 * information can be used.
 *
 * The `parsoid` and `mw` properties hold "serialized array" data: flat
 * arrays suitable for json-encoding, which do not contain DataParsoid or
 * DataMw objects.
 *
 * DomPageBundle is the analogous structure for the case where the HTML
 * string has already been parsed into a DOM.
 */
class PageBundle implements JsonCodecable {
	use JsonCodecableTrait;

	/** HTML string form of the document. */
	public string $html;

	/**
	 * Array serialization of DataParsoid for each Node, keyed by the
	 * Node's ID (under the 'ids' key).
	 *
	 * @var null|array{counter?:int,offsetType?:'byte'|'ucs2'|'char',ids:array<string,array>}
	 */
	public $parsoid;

	/**
	 * Array serialization of DataMw for each Node, keyed by the Node's ID
	 * (under the 'ids' key).
	 *
	 * @var null|array{ids:array<string,array>}
	 */
	public $mw;

	/** @var ?string */
	public $version;

	/**
	 * HTTP headers, as a map from header name to header value; both
	 * should be strings.
	 * @var array<string,string>|null
	 */
	public $headers;

	/** @var string|null */
	public $contentmodel;

	/**
	 * Store the given values on the new bundle as-is.
	 *
	 * @param string $html The document, as an HTML string
	 * @param ?array $parsoid Serialized data-parsoid map
	 * @param ?array $mw Serialized data-mw map
	 * @param ?string $version
	 * @param ?array $headers Map of HTTP header names to values
	 * @param ?string $contentmodel
	 */
	public function __construct(
		string $html, ?array $parsoid = null, ?array $mw = null,
		?string $version = null, ?array $headers = null,
		?string $contentmodel = null
	) {
		$this->html = $html;
		$this->parsoid = $parsoid;
		$this->mw = $mw;
		$this->version = $version;
		$this->headers = $headers;
		$this->contentmodel = $contentmodel;
	}

	/**
	 * Create a bundle for the given HTML whose data-parsoid and data-mw
	 * maps exist but contain no ids; the data-parsoid counter starts at -1.
	 *
	 * @param string $html
	 * @param ?string $version
	 * @param ?array $headers
	 * @param ?string $contentmodel
	 * @return self
	 */
	public static function newEmpty(
		string $html,
		?string $version = null,
		?array $headers = null,
		?string $contentmodel = null
	): self {
		$emptyParsoid = [
			'counter' => -1,
			'ids' => [],
		];
		$emptyMw = [
			'ids' => [],
		];
		return new self(
			$html, $emptyParsoid, $emptyMw, $version, $headers, $contentmodel
		);
	}

	/**
	 * Parse the HTML string and apply this bundle's data-parsoid and
	 * data-mw content to the resulting document.
	 *
	 * @return Document
	 */
	public function toDom(): Document {
		$document = DOMUtils::parseHTML( $this->html );
		self::apply( $document, $this );
		return $document;
	}

	/**
	 * Serialize the document produced by toDom() with ContentUtils::toXML().
	 *
	 * @return string
	 */
	public function toHtml(): string {
		$document = $this->toDom();
		return ContentUtils::toXML( $document );
	}

	/**
	 * Check if this pagebundle is valid.
	 *
	 * data-parsoid must be present and have an 'ids' entry. data-mw is
	 * only checked when $contentVersion satisfies '^999.0.0'.
	 *
	 * @param string $contentVersion Document content version to validate against.
	 * @param ?string &$errorMessage Error message will be returned here.
	 * @return bool
	 */
	public function validate(
		string $contentVersion, ?string &$errorMessage = null
	) {
		$parsoidOk = $this->parsoid && isset( $this->parsoid['ids'] );
		if ( !$parsoidOk ) {
			$errorMessage = 'Invalid data-parsoid was provided.';
			return false;
		}

		// Only this content version range requires data-mw.
		if ( !Semver::satisfies( $contentVersion, '^999.0.0' ) ) {
			return true;
		}

		if ( $this->mw && isset( $this->mw['ids'] ) ) {
			return true;
		}
		$errorMessage = 'Invalid data-mw was provided.';
		return false;
	}

	/**
	 * Build an array with 'contentmodel', 'html' and 'data-parsoid'
	 * entries, where the latter two each hold 'headers' and a 'body'.
	 * A 'data-mw' entry is added when the bundle's version satisfies
	 * '^999.0.0'. A missing version is treated as '0.0.0'.
	 *
	 * @return array
	 */
	public function responseData() {
		$version = $this->version ?? '0.0.0';

		$htmlType = 'text/html; charset=utf-8; '
			. 'profile="https://www.mediawiki.org/wiki/Specs/HTML/' . $version . '"';
		$parsoidType = 'application/json; charset=utf-8; '
			. 'profile="https://www.mediawiki.org/wiki/Specs/data-parsoid/' . $version . '"';

		// The bundle's own headers are merged last, so they win over the
		// default content-type on key collision.
		$htmlHeaders = array_merge(
			[ 'content-type' => $htmlType ],
			$this->headers ?? []
		);

		$response = [
			'contentmodel' => $this->contentmodel ?? '',
			'html' => [
				'headers' => $htmlHeaders,
				'body' => $this->html,
			],
			'data-parsoid' => [
				'headers' => [ 'content-type' => $parsoidType ],
				'body' => $this->parsoid,
			],
		];

		if ( Semver::satisfies( $version, '^999.0.0' ) ) {
			$mwType = 'application/json; charset=utf-8; '
				. 'profile="https://www.mediawiki.org/wiki/Specs/data-mw/' . $version . '"';
			$response['data-mw'] = [
				'headers' => [ 'content-type' => $mwType ],
				'body' => $this->mw,
			];
		}

		return $response;
	}

	/**
	 * Applies the `data-*` attributes JSON structure to the document.
	 * Leaves `id` attributes behind -- they are used by citation code to
	 * extract `<ref>` body from the DOM.
	 *
	 * @param Document $doc doc
	 * @param PageBundle $pb page bundle
	 */
	public static function apply( Document $doc, PageBundle $pb ): void {
		$annotate = static function ( Node $node ) use ( $pb ): void {
			if ( !( $node instanceof Element ) ) {
				return;
			}
			$id = DOMCompat::getAttribute( $node, 'id' );
			if ( $id === null ) {
				return;
			}

			$dataParsoid = $pb->parsoid['ids'][$id] ?? null;
			if ( $dataParsoid !== null ) {
				DOMDataUtils::setJSONAttribute( $node, 'data-parsoid', $dataParsoid );
			}

			// data-mw is only written when the node does not already
			// carry it: earlier applications of the pagebundle have
			// higher precedence, inline data being the highest.
			$dataMw = $pb->mw['ids'][$id] ?? null;
			if ( $dataMw !== null && !$node->hasAttribute( 'data-mw' ) ) {
				DOMDataUtils::setJSONAttribute( $node, 'data-mw', $dataMw );
			}
		};

		DOMUtils::visitDOM( DOMCompat::getBody( $doc ), $annotate );
	}

	/**
	 * Encode some of these properties for emitting in the <head> element of a doc
	 * @return string
	 */
	public function encodeForHeadElement(): string {
		// $this->parsoid and $this->mw are already serialized arrays, so
		// a plain jsonEncode is enough; no codec is required.
		$payload = [
			'parsoid' => $this->parsoid ?? [],
			'mw' => $this->mw ?? [],
		];
		return PHPUtils::jsonEncode( $payload );
	}

	/**
	 * Rebuild a PageBundle from a string produced for the <head> element.
	 *
	 * @param string $s JSON string
	 * @return PageBundle
	 */
	public static function decodeFromHeadElement( string $s ): PageBundle {
		// Only 'parsoid' and 'mw' are encoded, so those are the only
		// fields populated on the result; the html is the empty string.
		$fields = PHPUtils::jsonDecode( $s );
		$parsoid = $fields['parsoid'] ?? null;
		$mw = $fields['mw'] ?? null;
		return new self( '', $parsoid, $mw );
	}

	/**
	 * Convert a DomPageBundle to a PageBundle.
	 *
	 * The DOM held by the DomPageBundle is serialized using the given
	 * $options. The options may also supply fallback values for content
	 * version, headers, content model, and offsetType, used when the
	 * DomPageBundle does not already have them set.
	 *
	 * @param DomPageBundle $dpb
	 * @param array $options XMLSerializer options
	 * @return PageBundle
	 */
	public static function fromDomPageBundle( DomPageBundle $dpb, array $options = [] ): PageBundle {
		$bodyOnly = $options['body_only'] ?? false;
		if ( $bodyOnly ) {
			$root = DOMCompat::getBody( $dpb->doc );
			// Does not override an 'innerXML' option the caller passed.
			$options += [ 'innerXML' => true ];
		} else {
			$root = $dpb->doc;
		}
		$serialized = XMLSerializer::serialize( $root, $options );

		$version = $dpb->version ?? $options['contentversion'] ?? null;
		$headers = $dpb->headers ?? $options['headers'] ?? null;
		$contentmodel = $dpb->contentmodel ?? $options['contentmodel'] ?? null;

		$bundle = new self(
			$serialized['html'], $dpb->parsoid, $dpb->mw,
			$version, $headers, $contentmodel
		);
		if ( isset( $options['offsetType'] ) ) {
			$bundle->parsoid['offsetType'] ??= $options['offsetType'];
		}
		return $bundle;
	}

	/**
	 * Convert this PageBundle to "single document" form, where page bundle
	 * information is embedded in the <head> of the document.
	 * @param array $options XMLSerializer options
	 * @return string an HTML string
	 */
	public function toSingleDocumentHtml( array $options = [] ): string {
		$domBundle = DomPageBundle::fromPageBundle( $this );
		return $domBundle->toSingleDocumentHtml( $options );
	}

	/**
	 * Convert this PageBundle to "inline attribute" form, where page bundle
	 * information is represented as inline JSON-valued attributes.
	 * @param array $options XMLSerializer options
	 * @return string an HTML string
	 */
	public function toInlineAttributeHtml( array $options = [] ): string {
		$domBundle = DomPageBundle::fromPageBundle( $this );
		return $domBundle->toInlineAttributeHtml( $options );
	}

	// JsonCodecable -------------

	/** @inheritDoc */
	public function toJsonArray(): array {
		$fields = [
			'html' => $this->html,
			'parsoid' => $this->parsoid,
			'mw' => $this->mw,
			'version' => $this->version,
			'headers' => $this->headers,
			'contentmodel' => $this->contentmodel,
		];
		return $fields;
	}

	/** @inheritDoc */
	public static function newFromJsonArray( array $json ): PageBundle {
		$html = $json['html'] ?? '';
		$parsoid = $json['parsoid'] ?? null;
		$mw = $json['mw'] ?? null;
		$version = $json['version'] ?? null;
		$headers = $json['headers'] ?? null;
		$contentmodel = $json['contentmodel'] ?? null;
		return new self( $html, $parsoid, $mw, $version, $headers, $contentmodel );
	}
}