Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
34.92% |
44 / 126 |
|
33.33% |
5 / 15 |
CRAP | |
0.00% |
0 / 1 |
| DomPageBundle | |
34.92% |
44 / 126 |
|
33.33% |
5 / 15 |
155.41 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
| newEmpty | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
| fromPageBundle | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
| toDom | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
6 | |||
| apply | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
42 | |||
| toSingleDocument | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
| fromSingleDocument | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| fromLoadedDocument | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
1 | |||
| isSingleDocument | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| toSingleDocumentHtml | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| toInlineAttributeHtml | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
2.15 | |||
| encodeForHeadElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| decodeFromHeadElement | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
| toJsonArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| newFromJsonArray | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Core; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\JsonCodec\JsonCodecable; |
| 8 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
| 9 | use Wikimedia\Parsoid\DOM\Document; |
| 10 | use Wikimedia\Parsoid\DOM\Element; |
| 11 | use Wikimedia\Parsoid\DOM\Node; |
| 12 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 14 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 15 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 16 | use Wikimedia\Parsoid\Wt2Html\XMLSerializer; |
| 17 | |
| 18 | /** |
| 19 | * A page bundle stores an HTML DOM with separated data-parsoid and |
| 20 | * data-mw content. The data-parsoid and data-mw content is indexed |
| 21 | * by the id attributes on individual nodes. This content needs to |
| 22 | * be loaded before the data-parsoid and/or data-mw information can be |
| 23 | * used. |
| 24 | * |
| 25 | * Note that the parsoid/mw properties of the page bundle are in "serialized |
| 26 | * array" form; that is, they are flat arrays appropriate for json-encoding |
| 27 | * and do not contain DataParsoid or DataMw objects. |
| 28 | * |
| 29 | * See PageBundle for a similar structure used where the HTML DOM has been |
| 30 | * serialized into a string. |
| 31 | */ |
| 32 | class DomPageBundle implements JsonCodecable { |
| 33 | use JsonCodecableTrait; |
| 34 | |
/**
 * The document, as a DOM.
 *
 * This is reset to null by toDom() and toSingleDocument() once the
 * document has been handed to the caller, so that the same
 * DomPageBundle is not used twice.
 */
public ?Document $doc;

/**
 * A map from ID to the array serialization of DataParsoid for the Node
 * with that ID.
 *
 * newEmpty() initializes this to `[ 'counter' => -1, 'ids' => [] ]`.
 *
 * @var null|array{counter?:int,offsetType?:'byte'|'ucs2'|'char',ids:array<string,array>}
 */
public $parsoid;

/**
 * A map from ID to the array serialization of DataMw for the Node
 * with that ID.
 *
 * newEmpty() initializes this to `[ 'ids' => [] ]`.
 *
 * @var null|array{ids:array<string,array>}
 */
public $mw;

/**
 * Version string for the bundled content; populated from the
 * `contentversion` option when decoding a single document.
 * @var ?string
 */
public $version;

/**
 * A map of HTTP headers: both name and value should be strings.
 * @var array<string,string>|null
 */
public $headers;

/**
 * Content model associated with the bundle, if any.
 * @var string|null
 */
public $contentmodel;
| 65 | |
/**
 * Build a page bundle around an already-parsed document.
 *
 * The document must not contain an embedded page bundle (a
 * "single document"); such documents have to be unpacked first.
 *
 * @param Document $doc The HTML DOM
 * @param ?array $parsoid Serialized data-parsoid map, indexed by node id
 * @param ?array $mw Serialized data-mw map, indexed by node id
 * @param ?string $version Content version
 * @param ?array $headers Map of HTTP header names to values
 * @param ?string $contentmodel Content model
 */
public function __construct(
	Document $doc, ?array $parsoid = null, ?array $mw = null,
	?string $version = null, ?array $headers = null,
	?string $contentmodel = null
) {
	// Metadata describing the content.
	$this->contentmodel = $contentmodel;
	$this->headers = $headers;
	$this->version = $version;
	// The document itself and its out-of-band data maps.
	$this->mw = $mw;
	$this->parsoid = $parsoid;
	$this->doc = $doc;

	$hasEmbeddedBundle = self::isSingleDocument( $doc );
	Assert::invariant(
		!$hasEmbeddedBundle,
		'single document should be unpacked before DomPageBundle created'
	);
}
| 82 | |
/**
 * Create a page bundle for the given document whose data-parsoid and
 * data-mw maps contain no entries yet.
 *
 * @param Document $doc The HTML DOM
 * @param ?string $version Content version
 * @param ?array $headers Map of HTTP header names to values
 * @param ?string $contentmodel Content model
 * @return self
 */
public static function newEmpty(
	Document $doc,
	?string $version = null,
	?array $headers = null,
	?string $contentmodel = null
): self {
	// Start the data-parsoid map with a counter of -1 and no ids.
	$emptyParsoid = [
		'counter' => -1,
		'ids' => [],
	];
	$emptyMw = [
		'ids' => [],
	];
	return new self( $doc, $emptyParsoid, $emptyMw, $version, $headers, $contentmodel );
}
| 103 | |
/**
 * Convert a PageBundle (HTML held as a string) into a DomPageBundle.
 *
 * The HTML string is parsed into a DOM; the parsoid/mw maps and the
 * version/headers/contentmodel metadata are carried over unchanged.
 */
public static function fromPageBundle( PageBundle $pb ): DomPageBundle {
	$doc = DOMUtils::parseHTML( $pb->html );
	return new self(
		$doc,
		$pb->parsoid,
		$pb->mw,
		$pb->version,
		$pb->headers,
		$pb->contentmodel
	);
}
| 120 | |
/**
 * Return a DOM from the contents of this page bundle.
 *
 * If `$load` is true (the default), the returned DOM will be prepared
 * and loaded using `$options`. The `markNew` and `validateXMLNames`
 * options default to true but may be overridden through `$options`;
 * `loadFromPageBundle` is always set to this bundle.
 *
 * If `$load` is false, any data-parsoid or data-mw information from this
 * page bundle will be converted to inline attributes in the DOM. This
 * process is less efficient than preparing and loading the document
 * directly from the DOM and should be avoided if possible.
 *
 * This method consumes the page bundle: `$this->doc` is set to null
 * afterwards, and calling it again fails the invariant check below.
 *
 * @param bool $load Whether to prepare and load the document
 * @param ?array $options Extra options for loading data attributes
 * @return Document
 */
public function toDom( bool $load = true, ?array $options = null ): Document {
	$doc = $this->doc;
	// Fail with a clear message instead of an opaque TypeError when the
	// bundle has already been consumed.
	Assert::invariant(
		$doc !== null,
		'DomPageBundle has already been consumed and cannot be reused'
	);
	if ( $load ) {
		$options ??= [];
		DOMDataUtils::prepareDoc( $doc );
		$body = DOMCompat::getBody( $doc );
		'@phan-var Element $body'; // assert non-null
		DOMDataUtils::visitAndLoadDataAttribs(
			$body,
			[
				'loadFromPageBundle' => $this,
			] + $options + [
				'markNew' => true,
				'validateXMLNames' => true,
			]
		);
		DOMDataUtils::getBag( $doc )->loaded = true;
	} else {
		self::apply( $doc, $this );
	}
	$this->doc = null; // Prevent reuse of the DomPageBundle
	return $doc;
}
| 155 | |
/**
 * Copy the page bundle's data-parsoid / data-mw entries onto the
 * matching elements of the document as inline JSON attributes.
 *
 * The `id` attributes are left in place -- they are used by citation
 * code to extract `<ref>` body from the DOM.
 *
 * @param Document $doc doc
 * @param DomPageBundle $pb page bundle
 */
private static function apply( Document $doc, DomPageBundle $pb ): void {
	$hasEmbeddedBundle = self::isSingleDocument( $doc );
	Assert::invariant(
		!$hasEmbeddedBundle,
		"conflicting page bundle found in document"
	);
	$visitor = static function ( Node $node ) use ( $pb ): void {
		// Only elements carrying an id can have bundle data.
		if ( !( $node instanceof Element ) ) {
			return;
		}
		$id = DOMCompat::getAttribute( $node, 'id' );
		if ( $id === null ) {
			return;
		}
		$dp = $pb->parsoid['ids'][$id] ?? null;
		if ( $dp !== null ) {
			DOMDataUtils::setJSONAttribute( $node, 'data-parsoid', $dp );
		}
		$dmw = $pb->mw['ids'][$id] ?? null;
		// Only apply data-mw if it isn't already set. This means
		// earlier applications of the pagebundle have higher
		// precedence, inline data being the highest.
		if ( $dmw !== null && !$node->hasAttribute( 'data-mw' ) ) {
			DOMDataUtils::setJSONAttribute( $node, 'data-mw', $dmw );
		}
	};
	$body = DOMCompat::getBody( $doc );
	DOMUtils::visitDOM( $body, $visitor );
	// For fragment bank representations, visit <template> nodes in the
	// <head> as well.
	$head = DOMCompat::getHead( $doc );
	DOMUtils::visitDOM( $head, $visitor );
}
| 201 | |
/**
 * Create a "PageBundle as single Document" by embedding page bundle
 * information into a <script> element in the <head> of the DOM.
 *
 * The script element has id `mw-pagebundle` and type
 * `application/x-mw-pagebundle`; its text content is the JSON produced
 * by encodeForHeadElement().
 *
 * This method consumes the page bundle: `$this->doc` is set to null
 * afterwards, and calling it again fails the invariant check below.
 *
 * @see ::fromSingleDocument()
 * @return Document
 */
public function toSingleDocument(): Document {
	$doc = $this->doc;
	// Fail with a clear message instead of an opaque TypeError when the
	// bundle has already been consumed.
	Assert::invariant(
		$doc !== null,
		'DomPageBundle has already been consumed and cannot be reused'
	);
	$script = DOMUtils::appendToHead( $doc, 'script', [
		'id' => 'mw-pagebundle',
		'type' => 'application/x-mw-pagebundle',
	] );
	$script->appendChild( $doc->createTextNode( $this->encodeForHeadElement() ) );
	// Invalidate this DomPageBundle to prevent us from using it again.
	$this->doc = null;
	return $doc;
}
| 219 | |
/**
 * Return a DomPageBundle from a "PageBundle as single Document"
 * representation, where some page bundle information has been embedded
 * as a <script> element into the <head> of the DOM.
 *
 * The embedded `mw-pagebundle` element is removed from the document
 * before its contents are decoded.
 *
 * @see ::toSingleDocument()
 *
 * @param Document $doc doc
 * @param array $options Optional content version/headers/contentmodel
 * @return DomPageBundle
 */
public static function fromSingleDocument( Document $doc, array $options = [] ): DomPageBundle {
	$dpScriptElt = DOMCompat::getElementById( $doc, 'mw-pagebundle' );
	Assert::invariant( $dpScriptElt !== null, "no page bundle found" );
	// Detach the element first; its text content stays readable afterwards.
	$dpScriptElt->parentNode->removeChild( $dpScriptElt );
	return self::decodeFromHeadElement( $doc, $dpScriptElt->textContent, $options );
}
| 238 | |
/**
 * Build a DomPageBundle out of a "prepared and loaded" document by
 * storing its data attributes into a fresh, empty bundle.
 *
 * When the options contain a `pageBundle` entry, its
 * version/headers/contentmodel take precedence over the
 * `contentversion`/`headers`/`contentmodel` options.
 *
 * @param Document $doc Should be "prepared and loaded"
 * @param array $options store options
 * @return DomPageBundle
 */
public static function fromLoadedDocument( Document $doc, array $options = [] ): DomPageBundle {
	$source = $options['pageBundle'] ?? null;
	$version = $source->version ?? $options['contentversion'] ?? null;
	$headers = $source->headers ?? $options['headers'] ?? null;
	$contentmodel = $source->contentmodel ?? $options['contentmodel'] ?? null;
	$bundle = self::newEmpty( $doc, $version, $headers, $contentmodel );

	// XXX We can't create a full idIndex unless we can traverse
	// extension content, which requires an Env or a ParsoidExtensionAPI,
	// but as long as your extension content doesn't contain IDs beginning
	// with 'mw' you'll be fine.
	$context = $options['env'] ?? $options['extAPI'] ?? null;

	$body = DOMCompat::getBody( $doc );
	// The three keys set here win over same-named caller options.
	$storeOptions = [
		'storeInPageBundle' => $bundle,
		'outputContentVersion' => $bundle->version,
		'idIndex' => DOMDataUtils::usedIdIndex( $context, $doc ),
	] + $options;
	DOMDataUtils::visitAndStoreDataAttribs( $body, $storeOptions );
	return $bundle;
}
| 273 | |
/**
 * Tell whether the given Document carries embedded page bundle
 * information, i.e. contains an element with id "mw-pagebundle".
 */
public static function isSingleDocument( Document $doc ): bool {
	$embedded = DOMCompat::getElementById( $doc, 'mw-pagebundle' );
	return $embedded !== null;
}
| 281 | |
/**
 * Serialize this DomPageBundle as a "single document" HTML string, with
 * the page bundle information embedded in the <head> of the document.
 * @param array $options XMLSerializer options
 * @return string an HTML string
 */
public function toSingleDocumentHtml( array $options = [] ): string {
	$serialized = XMLSerializer::serialize( $this->toSingleDocument(), $options );
	return $serialized['html'];
}
| 292 | |
/**
 * Serialize this DomPageBundle in "inline attribute" form, where page
 * bundle information is represented as inline JSON-valued attributes.
 *
 * When the `body_only` option is truthy, only the contents of the
 * <body> are serialized (the `innerXML` option is forced to true).
 *
 * @param array $options XMLSerializer options
 * @return string an HTML string
 */
public function toInlineAttributeHtml( array $options = [] ): string {
	$doc = $this->toDom( false );
	// Serialize the whole document unless only the body was requested.
	$target = $doc;
	if ( $options['body_only'] ?? false ) {
		$target = DOMCompat::getBody( $doc );
		$options['innerXML'] = true;
	}
	$serialized = XMLSerializer::serialize( $target, $options );
	return $serialized['html'];
}
| 309 | |
/**
 * Produce the JSON text stored in the <script> element that is added to
 * the <head> of a document. Only the `parsoid` and `mw` properties are
 * encoded; a null property is encoded as an empty array.
 */
private function encodeForHeadElement(): string {
	// $this->parsoid and $this->mw are already serialized arrays, so a
	// plain jsonEncode is enough -- no codec is required.
	$payload = [
		'parsoid' => $this->parsoid ?? [],
		'mw' => $this->mw ?? [],
	];
	return PHPUtils::jsonEncode( $payload );
}
| 319 | |
/**
 * Rebuild a DomPageBundle from the JSON text found in the <script>
 * element embedded in a document.
 *
 * Only 'parsoid' and 'mw' are present in the encoded text; the
 * version, headers and content model come from `$options`
 * (`contentversion`, `headers`, `contentmodel`).
 */
private static function decodeFromHeadElement( Document $doc, string $s, array $options = [] ): DomPageBundle {
	$data = PHPUtils::jsonDecode( $s );
	$parsoid = $data['parsoid'] ?? null;
	$mw = $data['mw'] ?? null;
	$version = $options['contentversion'] ?? null;
	$headers = $options['headers'] ?? null;
	$contentmodel = $options['contentmodel'] ?? null;
	return new self( $doc, $parsoid, $mw, $version, $headers, $contentmodel );
}
| 337 | |
| 338 | // JsonCodecable ------------- |
| 339 | |
/** @inheritDoc */
public function toJsonArray(): array {
	// Delegate to the string-based PageBundle representation.
	$pageBundle = PageBundle::fromDomPageBundle( $this );
	return $pageBundle->toJsonArray();
}
| 344 | |
/** @inheritDoc */
public static function newFromJsonArray( array $json ): DomPageBundle {
	// Decode as a PageBundle first, then parse its HTML into a DOM.
	return self::fromPageBundle( PageBundle::newFromJsonArray( $json ) );
}
| 350 | } |