Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 98 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
SectionMetadata | |
0.00% |
0 / 98 |
|
0.00% |
0 / 12 |
930 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
setExtensionData | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
appendExtensionData | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getExtensionData | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
toArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fromArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fromLegacy | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
toLegacy | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
6 | |||
jsonSerialize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
toJsonArray | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
132 | |||
newFromJsonArray | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
prettyPrint | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Core; |
5 | |
6 | use Wikimedia\JsonCodec\JsonCodecable; |
7 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
8 | use Wikimedia\Parsoid\Utils\CompatJsonCodec; |
9 | |
/**
 * Section metadata for generating TOC.
 *
 * This is not the complete data for the article section, just the
 * information needed to generate the table of contents.
 *
 * For now, this schema matches whatever is generated by Parser.php.
 * Parsoid will attempt to match this output for now.
 *
 * Parser.php::finalizeHeadings() is the authoritative source for how
 * some of these properties are computed right now, especially for the
 * $line, $anchor, and $linkAnchor properties below.
 *
 * Linker.php::tocLine() and ::makeHeadline() demonstrate how these
 * properties are used to create headings and table of contents lines.
 */
class SectionMetadata implements \JsonSerializable, JsonCodecable {
	use JsonCodecableTrait;

	/**
	 * The heading tag level: a 1 here means an <H1> tag was used, a
	 * 2 means an <H2> tag was used, etc.
	 */
	public int $hLevel;

	/**
	 * This is a one-indexed TOC level and the nesting level.
	 * So, if a page has a H2-H4-H6, then, those levels 2,4,6
	 * correspond to TOC-levels 1,2,3.
	 */
	public int $tocLevel;

	/**
	 * HTML heading of the section. Only a narrow set of HTML tags are allowed here.
	 *
	 * This starts with the parsed headline seen in wikitext and
	 * - replaces links with link text
	 * - processes extension strip markers
	 * - removes style, script tags
	 * - strips all HTML tags except the following tags (from Parser.php)
	 *   . <sup> and <sub> (T10393)
	 *   . <i> (T28375)
	 *   . <b> (r105284)
	 *   . <bdi> (T74884)
	 *   . <span dir="rtl"> and <span dir="ltr"> (T37167)
	 *   . <s> and <strike> (T35715)
	 *   . <q> (T251672)
	 * We strip any parameter from accepted tags, except dir="rtl|ltr" from <span>,
	 * to allow setting directionality in toc items.
	 *
	 * @note This should be converted into the proper html variant.
	 */
	public string $line;

	/**
	 * TOC number string (3.1.3, 4.5.2, etc.)
	 *
	 * @note This should be localized into the parser target language.
	 */
	public string $number;

	/**
	 * Section id (integer, assigned in depth first traversal order)
	 * Template generated sections get a "T-" prefix.
	 */
	public string $index;

	/**
	 * The title of the page that generated this heading.
	 * For template-generated sections, this will be the template title.
	 */
	public ?string $fromTitle;

	/**
	 * Codepoint offset where the section shows up in wikitext; this is null
	 * if this section comes from a template, if it comes from a literal
	 * HTML <h_> tag, or otherwise doesn't correspond to a "preprocessor
	 * section".
	 * @note This is measured in codepoints, not bytes; you should use
	 * appropriate multi-byte aware string functions, *not* substr().
	 * Similarly, in JavaScript, be careful not to confuse JavaScript
	 * UCS-2 "characters" with codepoints.
	 */
	public ?int $codepointOffset;

	/**
	 * Anchor attribute.
	 *
	 * This property is the "true" value of the ID attribute, and should be
	 * used when looking up a heading or setting an attribute, for example
	 * using Document.getElementById() or Element.setAttribute('id',...).
	 *
	 * This value is *not* HTML-entity escaped; if you are writing HTML
	 * as a literal string, you should still entity-escape ampersands and
	 * single/double quotes as appropriate.
	 *
	 * This value is *not* URL-escaped either; instead use the `linkAnchor`
	 * property if you are constructing a URL to target this section.
	 *
	 * The anchor attribute is based on the $line property, but does extra
	 * processing to turn it into a valid attribute:
	 * - strip all HTML tags,
	 * - normalizes section name
	 * - normalizes section name whitespace
	 * - decodes char references
	 * - makes it a valid HTML id attribute value
	 *   (HTML5 / HTML4 based on $wgFragmentMode property)
	 * - dedupes (case-insensitively) identical anchors by adding "_$n" suffixes
	 */
	public string $anchor;

	/**
	 * Anchor URL fragment.
	 *
	 * This is very similar to the $anchor property, but is appropriately
	 * URL-escaped to make it appropriate to use in constructing a URL
	 * fragment link. You should almost always prepend a `#` symbol
	 * to `linkAnchor` if you are using it correctly.
	 */
	public string $linkAnchor;

	/**
	 * Arbitrary data attached to this section by extensions. This
	 * data will be stored and cached in the ParserOutput object along
	 * with the rest of the section data, and made available to external
	 * clients via the action API.
	 *
	 * This property (together with its accessors ::setExtensionData()
	 * and ::getExtensionData()) is provided to overcome the unsafe
	 * practice of attaching extra information to a section by directly
	 * assigning member variables.
	 *
	 * See ParserOutput::setExtensionData() for more information on typical
	 * use.
	 */
	private array $extensionData;

	/**
	 * @param int $tocLevel One-indexed TOC level and the nesting level
	 * @param int $hLevel The heading tag level
	 * @param string $line Stripped headline text
	 * @param string $number TOC number string (3.1.3, 4.5.2, etc)
	 * @param string $index Section id
	 * @param ?string $fromTitle The title of the page or template that
	 *  generated this heading, or null.
	 * @param ?int $codepointOffset Codepoint offset (# of characters) where the
	 *  section shows up in wikitext, or null if this doesn't correspond to
	 *  a "preprocessor section".  (Be careful if using JavaScript, as
	 *  JavaScript "characters" are UCS-2 encoded and don't correspond
	 *  directly to code points.)
	 * @param string $anchor "True" value of the ID attribute
	 * @param string $linkAnchor URL-escaped value of the anchor, for use in
	 *  constructing a URL fragment link
	 * @param ?array $extensionData Extension data passed in as an associative
	 *  array; null is treated as an empty array.
	 */
	public function __construct(
		// This is a great candidate for named arguments in PHP 8.0+
		int $tocLevel = 0,
		int $hLevel = -1,
		string $line = '',
		string $number = '',
		string $index = '',
		?string $fromTitle = null,
		?int $codepointOffset = null,
		string $anchor = '',
		string $linkAnchor = '',
		?array $extensionData = null
	) {
		$this->tocLevel = $tocLevel;
		$this->hLevel = $hLevel;
		$this->line = $line;
		$this->number = $number;
		$this->index = $index;
		$this->fromTitle = $fromTitle;
		$this->codepointOffset = $codepointOffset;
		$this->anchor = $anchor;
		$this->linkAnchor = $linkAnchor;
		$this->extensionData = $extensionData ?? [];
	}

	/**
	 * Attaches arbitrary data to this SectionMetadata object. This
	 * can be used to store some information about this section in the
	 * ParserOutput object for later use during page output. The data
	 * will be cached along with the ParserOutput object.
	 *
	 * This method is provided to overcome the unsafe practice of
	 * attaching extra information to a section by directly assigning
	 * member variables.
	 *
	 * See ParserOutput::setExtensionData() in core for further information
	 * about typical usage in hooks.
	 *
	 * Setting conflicting values for the same key is not allowed.
	 * If you call ::setExtensionData() multiple times with the same key
	 * on a SectionMetadata, it is expected that the value will be identical
	 * each time.  If you want to collect multiple pieces of data under a
	 * single key, use ::appendExtensionData().
	 *
	 * @note Only scalar values (numbers, strings, or arrays) are
	 * supported as a value. (A future revision will allow anything
	 * that core's JsonCodec can handle.)  Attempts to set other types
	 * as extension data values will break ParserCache for the page.
	 *
	 * @todo When more complex values than scalar values are supported,
	 * TOCData::__clone should be updated to take that into account.
	 *
	 * @param string $key The key for accessing the data. Extensions
	 *   should take care to avoid conflicts in naming keys. It is
	 *   suggested to use the extension's name as a prefix.  Using
	 *   the prefix `mw:` is reserved for core.
	 *
	 * @param mixed $value The value to set.
	 *   A null value is never stored: passing null for a key that holds
	 *   no data leaves the key unset.  Because the conflict check runs
	 *   first, passing null for a key that already holds a non-null value
	 *   is rejected like any other conflicting value; it does not remove
	 *   the existing data.
	 * @throws \InvalidArgumentException if $key already holds a value that
	 *   is not identical (===) to $value.
	 */
	public function setExtensionData( string $key, $value ): void {
		if (
			array_key_exists( $key, $this->extensionData ) &&
			$this->extensionData[$key] !== $value
		) {
			throw new \InvalidArgumentException( "Conflicting data for $key" );
		}
		if ( $value === null ) {
			unset( $this->extensionData[$key] );
		} else {
			$this->extensionData[$key] = $value;
		}
	}

	/**
	 * Appends arbitrary data to this SectionMetadata. This can be used
	 * to store some information about the section in the ParserOutput object for later
	 * use during page output.
	 *
	 * See ::setExtensionData() for more details on rationale and use.
	 *
	 * @param string $key The key for accessing the data. Extensions should take care to avoid
	 *   conflicts in naming keys. It is suggested to use the extension's name as a prefix.
	 *
	 * @param int|string $value The value to append to the list.
	 * @return never This method is not yet implemented.
	 * @throws \InvalidArgumentException always, since this is not yet implemented.
	 */
	public function appendExtensionData( string $key, $value ): void {
		// This implementation would mirror that of
		// ParserOutput::appendExtensionData, but let's defer implementing
		// this until we're sure we need it.  In particular, we might need
		// to figure out how a merge on section data is expected to work
		// before we can determine the right semantics for this.
		throw new \InvalidArgumentException( "Not yet implemented" );
	}

	/**
	 * Gets extension data previously attached to this SectionMetadata.
	 *
	 * @param string $key The key to look up
	 * @return mixed|null The value(s) previously set for the given key using
	 *   ::setExtensionData() or ::appendExtensionData(), or null if no
	 *   value was set for this key.
	 */
	public function getExtensionData( $key ) {
		return $this->extensionData[$key] ?? null;
	}

	/**
	 * Alias for ::toLegacy(), for b/c compatibility only.
	 * @deprecated
	 * @return array
	 */
	public function toArray(): array {
		return $this->toLegacy();
	}

	/**
	 * Alias for ::fromLegacy(), for b/c compatibility only.
	 * @deprecated
	 * @param array $data
	 * @return SectionMetadata
	 */
	public static function fromArray( array $data ): SectionMetadata {
		return self::fromLegacy( $data );
	}

	/**
	 * Create a new SectionMetadata object from an array in the legacy
	 * format returned by the action API.
	 *
	 * This is useful for backward-compatibility, but is expected to
	 * be replaced by conversion to/from JSON in the future.
	 *
	 * Missing keys fall back to the constructor defaults; a missing
	 * 'linkAnchor' falls back to the value of 'anchor'.
	 *
	 * @param array $data Associative array with section metadata
	 * @return SectionMetadata
	 */
	public static function fromLegacy( array $data ): SectionMetadata {
		// ::toLegacy() emits `false` (not null) when there is no title, so
		// falsy markers are mapped back to null here.  The string "0" must
		// be exempted: PHP treats it as falsy, but it is a non-empty title
		// string and would otherwise be lost in a toLegacy()/fromLegacy()
		// round trip.
		$fromTitle = $data['fromtitle'] ?? false;
		if ( $fromTitle !== '0' && !$fromTitle ) {
			$fromTitle = null;
		}
		return new SectionMetadata(
			$data['toclevel'] ?? 0,
			// ::toLegacy() emits 'level' as a string; cast it back to int
			(int)( $data['level'] ?? -1 ),
			$data['line'] ?? '',
			$data['number'] ?? '',
			$data['index'] ?? '',
			$fromTitle,
			$data['byteoffset'] ?? null, // T319141: actually "codepoint offset"
			$data['anchor'] ?? '',
			$data['linkAnchor'] ?? $data['anchor'] ?? '',
			$data['extensionData'] ?? null
		);
	}

	/**
	 * Return as associative array, in the format returned by the
	 * action API (including the order of fields and the value types).
	 *
	 * This is helpful as b/c support while we transition to objects.
	 * @return array
	 */
	public function toLegacy(): array {
		$ret = [
			'toclevel' => $this->tocLevel,
			// cast $level to string in order to keep b/c for the parse api
			'level' => (string)$this->hLevel,
			'line' => $this->line,
			'number' => $this->number,
			'index' => $this->index,
			// the legacy format represents "no title" as false, not null
			'fromtitle' => $this->fromTitle ?? false,
			// T319141: legacy 'byteoffset' is actually "codepoint offset"
			'byteoffset' => $this->codepointOffset,
			'anchor' => $this->anchor,
			'linkAnchor' => $this->linkAnchor,
		];
		// Micro-opt: Output 'extensionData' conditionally to avoid bloat
		if ( $this->extensionData ) {
			$ret['extensionData'] = $this->extensionData;
		}
		return $ret;
	}

	/**
	 * @inheritDoc
	 */
	public function jsonSerialize(): array {
		return $this->toLegacy();
	}

	// JsonCodecable interface

	/**
	 * Serialize to a compact array: every property that still has its
	 * constructor default is omitted, and 'linkAnchor' is omitted when
	 * it is identical to 'anchor'.  ::newFromJsonArray() restores the
	 * omitted values.
	 *
	 * @inheritDoc
	 */
	public function toJsonArray(): array {
		$ret = [];
		if ( $this->tocLevel !== 0 ) {
			$ret['tocLevel'] = $this->tocLevel;
		}
		if ( $this->hLevel !== -1 ) {
			$ret['hLevel'] = $this->hLevel;
		}
		if ( $this->line !== '' ) {
			$ret['line'] = $this->line;
		}
		if ( $this->number !== '' ) {
			$ret['number'] = $this->number;
		}
		if ( $this->index !== '' ) {
			$ret['index'] = $this->index;
		}
		if ( $this->fromTitle !== null ) {
			$ret['fromTitle'] = $this->fromTitle;
		}
		if ( $this->codepointOffset !== null ) {
			$ret['codepointOffset'] = $this->codepointOffset;
		}
		if ( $this->anchor !== '' ) {
			$ret['anchor'] = $this->anchor;
		}
		// newFromJsonArray() defaults linkAnchor to anchor, so only emit
		// it when the two differ.
		if ( $this->linkAnchor !== $this->anchor ) {
			$ret['linkAnchor'] = $this->linkAnchor;
		}
		if ( $this->extensionData ) {
			$ret['extensionData'] = $this->extensionData;
		}
		return $ret;
	}

	/**
	 * Rebuild a SectionMetadata from the array produced by
	 * ::toJsonArray(); absent keys take the constructor defaults and an
	 * absent 'linkAnchor' takes the value of 'anchor'.
	 *
	 * @inheritDoc
	 */
	public static function newFromJsonArray( array $json ) {
		return new SectionMetadata(
			$json['tocLevel'] ?? 0,
			$json['hLevel'] ?? -1,
			$json['line'] ?? '',
			$json['number'] ?? '',
			$json['index'] ?? '',
			$json['fromTitle'] ?? null,
			$json['codepointOffset'] ?? null,
			$json['anchor'] ?? '',
			$json['linkAnchor'] ?? $json['anchor'] ?? '',
			$json['extensionData'] ?? null
		);
	}

	// Pretty-printing

	/**
	 * For use in parser tests and wherever else humans might appreciate
	 * some formatting in the JSON encoded output.  For now, nothing special.
	 *
	 * The result is a single string of space-separated `name:value`
	 * fields, indented by $indent plus the TOC level.
	 *
	 * @param int $indent Additional indentation to apply (defaults to zero)
	 * @return string
	 */
	public function prettyPrint( int $indent = 0 ): string {
		// Basic info
		$buf = str_repeat( ' ', $indent + $this->tocLevel ) . "h{$this->hLevel}";
		$buf .= " index:{$this->index} toclevel:{$this->tocLevel} number:{$this->number}";

		// Optional information
		$title = $this->fromTitle ?? "NULL";
		$offset = $this->codepointOffset ?? "NULL";
		$buf .= " title:{$title} off:{$offset}";

		// Anchors & link text
		if ( $this->anchor === $this->linkAnchor ) {
			$buf .= " anchor/linkAnchor:{$this->anchor}";
		} else {
			$buf .= " anchor:{$this->anchor} linkAnchor:{$this->linkAnchor}";
		}
		$line = $this->line;
		if ( str_contains( $line, "\n" ) ) {
			// Handle cases where $line has "funny" characters: a line
			// containing a newline is JSON-encoded so that the newline
			// is emitted in escaped form.
			$line = json_encode( $line );
		}
		$buf .= " line:{$line}";

		// Extension data
		if ( $this->extensionData ) {
			$codec = new CompatJsonCodec();
			$buf .= " ext:" . json_encode( $codec->toJsonArray( $this->extensionData ) );
		}

		return $buf;
	}
}