Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 98 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
| SectionMetadata | |
0.00% |
0 / 98 |
|
0.00% |
0 / 12 |
930 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
| setExtensionData | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
| appendExtensionData | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| getExtensionData | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| toArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| fromArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| fromLegacy | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
| toLegacy | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
6 | |||
| jsonSerialize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| toJsonArray | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
132 | |||
| newFromJsonArray | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| prettyPrint | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
20 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Core; |
| 5 | |
| 6 | use Wikimedia\JsonCodec\JsonCodecable; |
| 7 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
| 8 | use Wikimedia\Parsoid\Utils\CompatJsonCodec; |
| 9 | |
| 10 | /** |
| 11 | * Section metadata for generating TOC. |
| 12 | * |
| 13 | * This is not the complete data for the article section, just the |
| 14 | * information needed to generate the table of contents. |
| 15 | * |
| 16 | * For now, this schema matches whatever is generated by Parser.php. |
| 17 | * Parsoid will attempt to match this output for now. |
| 18 | * |
| 19 | * Parser.php::finalizeHeadings() is the authoritative source for how |
| 20 | * some of these properties are computed right now, especially for the |
| 21 | * $line, $anchor, and $linkAnchor properties below. |
| 22 | * |
| 23 | * Linker.php::tocLine() and ::makeHeadline() demonstrate how these |
| 24 | * properties are used to create headings and table of contents lines. |
| 25 | */ |
| 26 | class SectionMetadata implements \JsonSerializable, JsonCodecable { |
| 27 | use JsonCodecableTrait; |
| 28 | |
| 29 | /** |
| 30 | * The heading tag level: a 1 here means an <H1> tag was used, a |
| 31 | * 2 means an <H2> tag was used, etc. |
| 32 | */ |
| 33 | public int $hLevel; |
| 34 | |
| 35 | /** |
| 36 | * This is a one-indexed TOC level and the nesting level. |
| 37 | * So, if a page has a H2-H4-H6, then, those levels 2,4,6 |
| 38 | * correspond to TOC-levels 1,2,3. |
| 39 | */ |
| 40 | public int $tocLevel; |
| 41 | |
| 42 | /** |
| 43 | * HTML heading of the section. Only a narrow set of HTML tags are allowed here. |
| 44 | * |
| 45 | * This starts with the parsed headline seen in wikitext and |
| 46 | * - replaces links with link text |
| 47 | * - processes extension strip markers |
| 48 | * - removes style, script tags |
| 49 | * - strips all HTML tags except the following tags (from Parser.php) |
| 50 | * . <sup> and <sub> (T10393) |
| 51 | * . <i> (T28375) |
| 52 | * . <b> (r105284) |
| 53 | * . <bdi> (T74884) |
| 54 | * . <span dir="rtl"> and <span dir="ltr"> (T37167) |
| 55 | * . <s> and <strike> (T35715) |
| 56 | * . <q> (T251672) |
| 57 | * We strip any parameter from accepted tags, except dir="rtl|ltr" from <span>, |
| 58 | * to allow setting directionality in toc items. |
| 59 | * |
| 60 | * @note This should be converted into the proper html variant. |
| 61 | */ |
| 62 | public string $line; |
| 63 | |
| 64 | /** |
| 65 | * TOC number string (3.1.3, 4.5.2, etc.) |
| 66 | * |
| 67 | * @note This should be localized into the parser target language. |
| 68 | */ |
| 69 | public string $number; |
| 70 | |
| 71 | /** |
| 72 | * Section id (integer, assigned in depth first traversal order) |
| 73 | * Template generated sections get a "T-" prefix. |
| 74 | */ |
| 75 | public string $index; |
| 76 | |
| 77 | /** |
| 78 | * The title of the page that generated this heading. |
| 79 | * For template-generated sections, this will be the template title. |
| 80 | * This string is in "prefixed DB key" format. |
| 81 | */ |
| 82 | public ?string $fromTitle; |
| 83 | |
| 84 | /** |
| 85 | * Codepoint offset where the section shows up in wikitext; this is null |
| 86 | * if this section comes from a template, if it comes from a literal |
| 87 | * HTML <h_> tag, or otherwise doesn't correspond to a "preprocessor |
| 88 | * section". |
| 89 | * @note This is measured in codepoints, not bytes; you should use |
| 90 | * appropriate multi-byte aware string functions, *not* substr(). |
| 91 | * Similarly, in JavaScript, be careful not to confuse JavaScript |
| 92 | * UCS-2 "characters" with codepoints. |
| 93 | */ |
| 94 | public ?int $codepointOffset; |
| 95 | |
| 96 | /** |
| 97 | * Anchor attribute. |
| 98 | * |
| 99 | * This property is the "true" value of the ID attribute, and should be |
| 100 | * used when looking up a heading or setting an attribute, for example |
| 101 | * using Document.getElementById() or Element.setAttribute('id',...). |
| 102 | * |
| 103 | * This value is *not* HTML-entity escaped; if you are writing HTML |
| 104 | * as a literal string, you should still entity-escape ampersands and |
| 105 | * single/double quotes as appropriate. |
| 106 | * |
| 107 | * This value is *not* URL-escaped either; instead use the `linkAnchor` |
| 108 | * property if you are constructing a URL to target this section. |
| 109 | * |
| 110 | * The anchor attribute is based on the $line property, but does extra |
| 111 | * processing to turn it into a valid attribute: |
| 112 | * - strip all HTML tags, |
| 113 | * - normalizes section name |
| 114 | * - normalizes section name whitespace |
| 115 | * - decodes char references |
| 116 | * - makes it a valid HTML id attribute value |
| 117 | * (HTML5 / HTML4 based on $wgFragmentMode property) |
| 118 | * - dedupes (case-insensitively) identical anchors by adding "_$n" suffixes |
| 119 | */ |
| 120 | public string $anchor; |
| 121 | |
| 122 | /** |
| 123 | * Anchor URL fragment. |
| 124 | * |
| 125 | * This is very similar to the $anchor property, but is appropriately |
| 126 | * URL-escaped to make it appropriate to use in constructing a URL |
| 127 | * fragment link. You should almost always prepend a `#` symbol |
| 128 | * to `linkAnchor` if you are using it correctly. You are still |
| 129 | * responsible for HTML-escaping the resulting URL if you are emitting |
| 130 | * this as an HTML attribute. |
| 131 | */ |
| 132 | public string $linkAnchor; |
| 133 | |
| 134 | /** |
| 135 | * Arbitrary data attached to this section by extensions. This |
| 136 | * data will be stored and cached in the ParserOutput object along |
| 137 | * with the rest of the section data, and made available to external |
| 138 | * clients via the action API. |
| 139 | * |
| 140 | * This property is provided to overcome the unsafe practice of attaching |
| 141 | * extra information to a section by directly assigning member variables. |
| 142 | * |
| 143 | * See ParserOutput::setExtensionData() for more information on typical |
| 144 | * use. |
| 145 | */ |
| 146 | private array $extensionData; |
| 147 | |
| 148 | /** |
| 149 | * @param int $tocLevel One-indexed TOC level and the nesting level |
| 150 | * @param int $hLevel The heading tag level |
| 151 | * @param string $line Stripped headline text |
| 152 | * @param string $number TOC number string (3.1.3, 4.5.2, etc) |
| 153 | * @param string $index Section id |
| 154 | * @param ?string $fromTitle The title of the page or template that |
| 155 | * generated this heading, or null. |
| 156 | * @param ?int $codepointOffset Codepoint offset (# of characters) where the |
| 157 | * section shows up in wikitext, or null if this doesn't correspond to |
| 158 | * a "preprocessor section". (Be careful if using JavaScript, as |
| 159 | * JavaScript "characters" are UCS-2 encoded and don't correspond |
| 160 | * directly to code points.) |
| 161 | * @param string $anchor "True" value of the ID attribute |
| 162 | * @param string $linkAnchor URL-escaped value of the anchor, for use in |
| 163 | * constructing a URL fragment link |
| 164 | * @param ?array $extensionData Extension data passed in as an associative array |
| 165 | */ |
| 166 | public function __construct( |
| 167 | // This is a great candidate for named arguments in PHP 8.0+ |
| 168 | int $tocLevel = 0, |
| 169 | int $hLevel = -1, |
| 170 | string $line = '', |
| 171 | string $number = '', |
| 172 | string $index = '', |
| 173 | ?string $fromTitle = null, |
| 174 | ?int $codepointOffset = null, |
| 175 | string $anchor = '', |
| 176 | string $linkAnchor = '', |
| 177 | ?array $extensionData = null |
| 178 | ) { |
| 179 | $this->tocLevel = $tocLevel; |
| 180 | $this->line = $line; |
| 181 | $this->hLevel = $hLevel; |
| 182 | $this->number = $number; |
| 183 | $this->index = $index; |
| 184 | $this->fromTitle = $fromTitle; |
| 185 | $this->codepointOffset = $codepointOffset; |
| 186 | $this->anchor = $anchor; |
| 187 | $this->linkAnchor = $linkAnchor; |
| 188 | $this->extensionData = $extensionData ?? []; |
| 189 | } |
| 190 | |
| 191 | /** |
| 192 | * Attaches arbitrary data to this SectionMetadata object. This |
| 193 | * can be used to store some information about this section in the |
| 194 | * ParserOutput object for later use during page output. The data |
| 195 | * will be cached along with the ParserOutput object. |
| 196 | * |
| 197 | * This method is provided to overcome the unsafe practice of |
| 198 | * attaching extra information to a section by directly assigning |
| 199 | * member variables. |
| 200 | * |
| 201 | * See ParserOutput::setExtensionData() in core for further information |
| 202 | * about typical usage in hooks. |
| 203 | * |
| 204 | * Setting conflicting values for the same key is not allowed. |
| 205 | * If you call ::setExtensionData() multiple times with the same key |
| 206 | * on a SectionMetadata, it is expected that the value will be identical |
| 207 | * each time. If you want to collect multiple pieces of data under a |
| 208 | * single key, use ::appendExtensionData(). |
| 209 | * |
| 210 | * @note Only scalar values (numbers, strings) and arrays are |
| 211 | * supported as a value. (A future revision will allow anything |
| 212 | * that core's JsonCodec can handle.) Attempts to set other types |
| 213 | * as extension data values will break ParserCache for the page. |
| 214 | * |
| 215 | * @todo When more complex values than scalar values are supported, |
| 216 | * TOCData::__clone should be updated to take that into account. |
| 217 | * |
| 218 | * @param string $key The key for accessing the data. Extensions |
| 219 | * should take care to avoid conflicts in naming keys. It is |
| 220 | * suggested to use the extension's name as a prefix. Using |
| 221 | * the prefix `mw:` is reserved for core. |
| 222 | * |
| 223 | * @param mixed $value The value to set. |
| 224 | * Setting a value to null is equivalent to removing the value. |
| 225 | */ |
| 226 | public function setExtensionData( string $key, $value ): void { |
| 227 | if ( |
| 228 | array_key_exists( $key, $this->extensionData ) && |
| 229 | $this->extensionData[$key] !== $value |
| 230 | ) { |
| 231 | throw new \InvalidArgumentException( "Conflicting data for $key" ); |
| 232 | } |
| 233 | if ( $value === null ) { |
| 234 | unset( $this->extensionData[$key] ); |
| 235 | } else { |
| 236 | $this->extensionData[$key] = $value; |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | /** |
| 241 | * Appends arbitrary data to this SectionMetadata. This can be used |
| 242 | * to store some information about the section in the ParserOutput object for later |
| 243 | * use during page output. |
| 244 | * |
| 245 | * See ::setExtensionData() for more details on rationale and use. |
| 246 | * |
| 247 | * @param string $key The key for accessing the data. Extensions should take care to avoid |
| 248 | * conflicts in naming keys. It is suggested to use the extension's name as a prefix. |
| 249 | * |
| 250 | * @param int|string $value The value to append to the list. |
| 251 | * @return never This method is not yet implemented. |
| 252 | */ |
| 253 | public function appendExtensionData( string $key, $value ): void { |
| 254 | // This implementation would mirror that of |
| 255 | // ParserOutput::appendExtensionData, but let's defer implementing |
| 256 | // this until we're sure we need it. In particular, we might need |
| 257 | // to figure out how a merge on section data is expected to work |
| 258 | // before we can determine the right semantics for this. |
| 259 | throw new \InvalidArgumentException( "Not yet implemented" ); |
| 260 | } |
| 261 | |
| 262 | /** |
| 263 | * Gets extension data previously attached to this SectionMetadata. |
| 264 | * |
| 265 | * @param string $key The key to look up |
| 266 | * @return mixed|null The value(s) previously set for the given key using |
| 267 | * ::setExtensionData() or ::appendExtensionData(), or null if no |
| 268 | * value was set for this key. |
| 269 | */ |
| 270 | public function getExtensionData( $key ) { |
| 271 | $value = $this->extensionData[$key] ?? null; |
| 272 | return $value; |
| 273 | } |
| 274 | |
| 275 | /** |
| 276 | * Alias for ::toLegacy(), for backward compatibility only. |
| 277 | * @deprecated |
| 278 | * @return array |
| 279 | */ |
| 280 | public function toArray(): array { |
| 281 | return $this->toLegacy(); |
| 282 | } |
| 283 | |
| 284 | /** |
| 285 | * Alias for ::fromLegacy(), for backward compatibility only. |
| 286 | * @deprecated |
| 287 | * @param array $data |
| 288 | * @return SectionMetadata |
| 289 | */ |
| 290 | public static function fromArray( array $data ): SectionMetadata { |
| 291 | return self::fromLegacy( $data ); |
| 292 | } |
| 293 | |
| 294 | /** |
| 295 | * Create a new SectionMetadata object from an array in the legacy |
| 296 | * format returned by the action API. |
| 297 | * |
| 298 | * This is useful for backward-compatibility, but is expected to |
| 299 | * be replaced by conversion to/from JSON in the future. |
| 300 | * |
| 301 | * @param array $data Associative array with section metadata |
| 302 | * @return SectionMetadata |
| 303 | */ |
| 304 | public static function fromLegacy( array $data ): SectionMetadata { |
| 305 | return new SectionMetadata( |
| 306 | $data['toclevel'] ?? 0, |
| 307 | (int)( $data['level'] ?? -1 ), |
| 308 | $data['line'] ?? '', |
| 309 | $data['number'] ?? '', |
| 310 | $data['index'] ?? '', |
| 311 | ( $data['fromtitle'] ?? false ) ?: null, |
| 312 | $data['byteoffset'] ?? null, // T319141: actually "codepoint offset" |
| 313 | $data['anchor'] ?? '', |
| 314 | $data['linkAnchor'] ?? $data['anchor'] ?? '', |
| 315 | $data['extensionData'] ?? null |
| 316 | ); |
| 317 | } |
| 318 | |
| 319 | /** |
| 320 | * Return as associative array, in the format returned by the |
| 321 | * action API (including the order of fields and the value types). |
| 322 | * |
| 323 | * This is helpful as b/c support while we transition to objects. |
| 324 | * @return array |
| 325 | */ |
| 326 | public function toLegacy(): array { |
| 327 | $ret = [ |
| 328 | 'toclevel' => $this->tocLevel, |
| 329 | // cast $level to string in order to keep b/c for the parse api |
| 330 | 'level' => (string)$this->hLevel, |
| 331 | 'line' => $this->line, |
| 332 | 'number' => $this->number, |
| 333 | 'index' => $this->index, |
| 334 | 'fromtitle' => $this->fromTitle ?? false, |
| 335 | // T319141: legacy 'byteoffset' is actually "codepoint offset" |
| 336 | 'byteoffset' => $this->codepointOffset, |
| 337 | 'anchor' => $this->anchor, |
| 338 | 'linkAnchor' => $this->linkAnchor, |
| 339 | ]; |
| 340 | // Micro-opt: Output 'extensionData' conditionally to avoid bloat |
| 341 | if ( $this->extensionData ) { |
| 342 | $ret['extensionData'] = $this->extensionData; |
| 343 | } |
| 344 | return $ret; |
| 345 | } |
| 346 | |
| 347 | /** |
| 348 | * @inheritDoc |
| 349 | */ |
| 350 | public function jsonSerialize(): array { |
| 351 | return $this->toLegacy(); |
| 352 | } |
| 353 | |
| 354 | // JsonCodecable interface |
| 355 | |
| 356 | /** @inheritDoc */ |
| 357 | public function toJsonArray(): array { |
| 358 | $ret = []; |
| 359 | if ( $this->tocLevel !== 0 ) { |
| 360 | $ret['tocLevel'] = $this->tocLevel; |
| 361 | } |
| 362 | if ( $this->hLevel !== -1 ) { |
| 363 | $ret['hLevel'] = $this->hLevel; |
| 364 | } |
| 365 | if ( $this->line !== '' ) { |
| 366 | $ret['line'] = $this->line; |
| 367 | } |
| 368 | if ( $this->number !== '' ) { |
| 369 | $ret['number'] = $this->number; |
| 370 | } |
| 371 | if ( $this->index !== '' ) { |
| 372 | $ret['index'] = $this->index; |
| 373 | } |
| 374 | if ( $this->fromTitle !== null ) { |
| 375 | $ret['fromTitle'] = $this->fromTitle; |
| 376 | } |
| 377 | if ( $this->codepointOffset !== null ) { |
| 378 | $ret['codepointOffset'] = $this->codepointOffset; |
| 379 | } |
| 380 | if ( $this->anchor !== '' ) { |
| 381 | $ret['anchor'] = $this->anchor; |
| 382 | } |
| 383 | if ( $this->linkAnchor !== $this->anchor ) { |
| 384 | $ret['linkAnchor'] = $this->linkAnchor; |
| 385 | } |
| 386 | if ( $this->extensionData ) { |
| 387 | $ret['extensionData'] = $this->extensionData; |
| 388 | } |
| 389 | return $ret; |
| 390 | } |
| 391 | |
| 392 | /** @inheritDoc */ |
| 393 | public static function newFromJsonArray( array $json ) { |
| 394 | return new SectionMetadata( |
| 395 | $json['tocLevel'] ?? 0, |
| 396 | $json['hLevel'] ?? -1, |
| 397 | $json['line'] ?? '', |
| 398 | $json['number'] ?? '', |
| 399 | $json['index'] ?? '', |
| 400 | $json['fromTitle'] ?? null, |
| 401 | $json['codepointOffset'] ?? null, |
| 402 | $json['anchor'] ?? '', |
| 403 | $json['linkAnchor'] ?? $json['anchor'] ?? '', |
| 404 | $json['extensionData'] ?? null |
| 405 | ); |
| 406 | } |
| 407 | |
| 408 | // Pretty-printing |
| 409 | |
| 410 | /** |
| 411 | * For use in parser tests and wherever else humans might appreciate |
| 412 | * some formatting in the JSON encoded output. For now, nothing special. |
| 413 | * @param int $indent Additional indentation to apply (defaults to zero) |
| 414 | * @return string |
| 415 | */ |
| 416 | public function prettyPrint( int $indent = 0 ): string { |
| 417 | # Basic info |
| 418 | $buf = str_repeat( ' ', $indent + $this->tocLevel ) . "h{$this->hLevel}"; |
| 419 | $buf .= " index:{$this->index} toclevel:$this->tocLevel number:{$this->number}"; |
| 420 | |
| 421 | # Optional information |
| 422 | $title = $this->fromTitle ?? "NULL"; |
| 423 | $offset = $this->codepointOffset ?? "NULL"; |
| 424 | $buf .= " title:{$title} off:{$offset}"; |
| 425 | |
| 426 | # Anchors & link text |
| 427 | if ( $this->anchor === $this->linkAnchor ) { |
| 428 | $buf .= " anchor/linkAnchor:{$this->anchor}"; |
| 429 | } else { |
| 430 | $buf .= " anchor:{$this->anchor} linkAnchor:{$this->linkAnchor}"; |
| 431 | } |
| 432 | $line = $this->line; |
| 433 | if ( str_contains( $line, "\n" ) ) { |
| 434 | // Handle cases where $line has "funny" characters |
| 435 | $line = json_encode( $line ); |
| 436 | } |
| 437 | $buf .= " line:{$line}"; |
| 438 | |
| 439 | # Extension data |
| 440 | if ( $this->extensionData ) { |
| 441 | $codec = new CompatJsonCodec(); |
| 442 | $buf .= " ext:" . json_encode( $codec->toJsonArray( $this->extensionData ) ); |
| 443 | } |
| 444 | |
| 445 | return $buf; |
| 446 | } |
| 447 | } |