Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 98 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
SectionMetadata | |
0.00% |
0 / 98 |
|
0.00% |
0 / 12 |
930 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
setExtensionData | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
appendExtensionData | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getExtensionData | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
toArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fromArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fromLegacy | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
toLegacy | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
6 | |||
jsonSerialize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
toJsonArray | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
132 | |||
newFromJsonArray | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
prettyPrint | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Core; |
5 | |
6 | use Wikimedia\JsonCodec\JsonCodecable; |
7 | use Wikimedia\JsonCodec\JsonCodecableTrait; |
8 | use Wikimedia\Parsoid\Utils\CompatJsonCodec; |
9 | |
10 | /** |
11 | * Section metadata for generating TOC. |
12 | * |
13 | * This is not the complete data for the article section, just the |
14 | * information needed to generate the table of contents. |
15 | * |
16 | * For now, this schema matches whatever is generated by Parser.php. |
17 | * Parsoid will attempt to match this output for now. |
18 | * |
19 | * Parser.php::finalizeHeadings() is the authoritative source for how |
20 | * some of these properties are computed right now, especially for the |
21 | * $line, $anchor, and $linkAnchor properties below. |
22 | * |
23 | * Linker.php::tocLine() and ::makeHeadline() demonstrate how these |
24 | * properties are used to create headings and table of contents lines. |
25 | */ |
26 | class SectionMetadata implements \JsonSerializable, JsonCodecable { |
27 | use JsonCodecableTrait; |
28 | |
29 | /** |
30 | * The heading tag level: a 1 here means an <H1> tag was used, a |
31 | * 2 means an <H2> tag was used, etc. |
32 | */ |
33 | public int $hLevel; |
34 | |
35 | /** |
36 | * This is a one-indexed TOC level and the nesting level. |
37 | * So, if a page has a H2-H4-H6, then, those levels 2,4,6 |
38 | * correspond to TOC-levels 1,2,3. |
39 | */ |
40 | public int $tocLevel; |
41 | |
42 | /** |
43 | * HTML heading of the section. Only a narrow set of HTML tags are allowed here. |
44 | * |
45 | * This starts with the parsed headline seen in wikitext and |
46 | * - replaces links with link text |
47 | * - processes extension strip markers |
48 | * - removes style, script tags |
49 | * - strips all HTML tags except the following tags (from Parser.php) |
50 | * . <sup> and <sub> (T10393) |
51 | * . <i> (T28375) |
52 | * . <b> (r105284) |
53 | * . <bdi> (T74884) |
54 | * . <span dir="rtl"> and <span dir="ltr"> (T37167) |
55 | * . <s> and <strike> (T35715) |
56 | * . <q> (T251672) |
57 | * We strip any parameter from accepted tags, except dir="rtl|ltr" from <span>, |
58 | * to allow setting directionality in toc items. |
59 | * |
60 | * @note This should be converted into the proper html variant. |
61 | */ |
62 | public string $line; |
63 | |
64 | /** |
65 | * TOC number string (3.1.3, 4.5.2, etc.) |
66 | * |
67 | * @note This should be localized into the parser target language. |
68 | */ |
69 | public string $number; |
70 | |
71 | /** |
72 | * Section id (integer, assigned in depth first traversal order) |
73 | * Template generated sections get a "T-" prefix. |
74 | */ |
75 | public string $index; |
76 | |
77 | /** |
78 | * The title of the page that generated this heading. |
79 | * For template-generated sections, this will be the template title. |
80 | * This string is in "prefixed DB key" format. |
81 | */ |
82 | public ?string $fromTitle; |
83 | |
84 | /** |
85 | * Codepoint offset where the section shows up in wikitext; this is null |
86 | * if this section comes from a template, if it comes from a literal |
87 | * HTML <h_> tag, or otherwise doesn't correspond to a "preprocessor |
88 | * section". |
89 | * @note This is measured in codepoints, not bytes; you should use |
90 | * appropriate multi-byte aware string functions, *not* substr(). |
91 | * Similarly, in JavaScript, be careful not to confuse JavaScript |
92 | * UCS-2 "characters" with codepoints. |
93 | */ |
94 | public ?int $codepointOffset; |
95 | |
96 | /** |
97 | * Anchor attribute. |
98 | * |
99 | * This property is the "true" value of the ID attribute, and should be |
100 | * used when looking up a heading or setting an attribute, for example |
101 | * using Document.getElementById() or Element.setAttribute('id',...). |
102 | * |
103 | * This value is *not* HTML-entity escaped; if you are writing HTML |
104 | * as a literal string, you should still entity-escape ampersands and |
105 | * single/double quotes as appropriate. |
106 | * |
107 | * This value is *not* URL-escaped either; instead use the `linkAnchor` |
108 | * property if you are constructing a URL to target this section. |
109 | * |
110 | * The anchor attribute is based on the $line property, but does extra |
111 | * processing to turn it into a valid attribute: |
112 | * - strip all HTML tags, |
113 | * - normalizes section name |
114 | * - normalizes section name whitespace |
115 | * - decodes char references |
116 | * - makes it a valid HTML id attribute value |
117 | * (HTML5 / HTML4 based on $wgFragmentMode property) |
118 | * - dedupes (case-insensitively) identical anchors by adding "_$n" suffixes |
119 | */ |
120 | public string $anchor; |
121 | |
122 | /** |
123 | * Anchor URL fragment. |
124 | * |
125 | * This is very similar to the $anchor property, but is appropriately |
126 | * URL-escaped to make it appropriate to use in constructing a URL |
127 | * fragment link. You should almost always prepend a `#` symbol |
128 | * to `linkAnchor` if you are using it correctly. You are still |
129 | * responsible for HTML-escaping the resulting URL if you are emitting |
130 | * this as an HTML attribute. |
131 | */ |
132 | public string $linkAnchor; |
133 | |
134 | /** |
135 | * Arbitrary data attached to this section by extensions. This |
136 | * data will be stored and cached in the ParserOutput object along |
137 | * with the rest of the section data, and made available to external |
138 | * clients via the action API. |
139 | * |
140 | * This method is provided to overcome the unsafe practice of attaching |
141 | * extra information to a section by directly assigning member variables. |
142 | * |
143 | * See ParserOutput::setExtensionData() for more information on typical |
144 | * use. |
145 | */ |
146 | private array $extensionData; |
147 | |
148 | /** |
149 | * @param int $tocLevel One-indexed TOC level and the nesting level |
150 | * @param int $hLevel The heading tag level |
151 | * @param string $line Stripped headline text |
152 | * @param string $number TOC number string (3.1.3, 4.5.2, etc) |
153 | * @param string $index Section id |
154 | * @param ?string $fromTitle The title of the page or template that |
155 | * generated this heading, or null. |
156 | * @param ?int $codepointOffset Codepoint offset (# of characters) where the |
157 | * section shows up in wikitext, or null if this doesn't correspond to |
158 | * a "preprocessor section". (Be careful if using JavaScript, as
159 | * JavaScript "characters" are UCS-2 encoded and don't correspond |
160 | * directly to code points.) |
161 | * @param string $anchor "True" value of the ID attribute |
162 | * @param string $linkAnchor URL-escaped value of the anchor, for use in |
163 | * constructing a URL fragment link |
164 | * @param ?array $extensionData Extension data passed in as an associative array |
165 | */ |
public function __construct(
	// Long positional list; callers on PHP 8.0+ may prefer named arguments.
	int $tocLevel = 0,
	int $hLevel = -1,
	string $line = '',
	string $number = '',
	string $index = '',
	?string $fromTitle = null,
	?int $codepointOffset = null,
	string $anchor = '',
	string $linkAnchor = '',
	?array $extensionData = null
) {
	// Heading / TOC nesting levels.
	$this->hLevel = $hLevel;
	$this->tocLevel = $tocLevel;

	// Display text and numbering.
	$this->line = $line;
	$this->number = $number;
	$this->index = $index;

	// Origin of the heading (both may be null).
	$this->fromTitle = $fromTitle;
	$this->codepointOffset = $codepointOffset;

	// ID attribute value and its URL-escaped counterpart.
	$this->anchor = $anchor;
	$this->linkAnchor = $linkAnchor;

	// A missing extension-data map is stored as an empty array.
	if ( $extensionData === null ) {
		$extensionData = [];
	}
	$this->extensionData = $extensionData;
}
190 | |
191 | /** |
192 | * Attaches arbitrary data to this SectionMetadata object. This |
193 | * can be used to store some information about this section in the |
194 | * ParserOutput object for later use during page output. The data |
195 | * will be cached along with the ParserOutput object. |
196 | * |
197 | * This method is provided to overcome the unsafe practice of |
198 | * attaching extra information to a section by directly assigning |
199 | * member variables. |
200 | * |
201 | * See ParserOutput::setExtensionData() in core for further information |
202 | * about typical usage in hooks. |
203 | * |
204 | * Setting conflicting values for the same key is not allowed. |
205 | * If you call ::setExtensionData() multiple times with the same key |
206 | * on a SectionMetadata, it is expected that the value will be identical
207 | * each time. If you want to collect multiple pieces of data under a |
208 | * single key, use ::appendExtensionData(). |
209 | * |
210 | * @note Only scalar values (numbers, strings, or arrays) are |
211 | * supported as a value. (A future revision will allow anything |
212 | * that core's JsonCodec can handle.) Attempts to set other types |
213 | * as extension data values will break ParserCache for the page. |
214 | * |
215 | * @todo When more complex values than scalar values are supported, |
216 | * TOCData::__clone should be updated to take that into account. |
217 | * |
218 | * @param string $key The key for accessing the data. Extensions |
219 | * should take care to avoid conflicts in naming keys. It is |
220 | * suggested to use the extension's name as a prefix. Using |
221 | * the prefix `mw:` is reserved for core. |
222 | * |
223 | * @param mixed $value The value to set. |
224 | * Setting a value to null is equivalent to removing the value. |
225 | */ |
public function setExtensionData( string $key, $value ): void {
	// The conflict check runs before the null handling below, so passing
	// null for a key that currently holds a different value also throws.
	$alreadyPresent = array_key_exists( $key, $this->extensionData );
	if ( $alreadyPresent && $this->extensionData[$key] !== $value ) {
		throw new \InvalidArgumentException( "Conflicting data for $key" );
	}

	if ( $value !== null ) {
		$this->extensionData[$key] = $value;
		return;
	}

	// Null is never stored; the key is dropped instead.
	unset( $this->extensionData[$key] );
}
239 | |
240 | /** |
241 | * Appends arbitrary data to this SectionMetadata. This can be used |
242 | * to store some information about the section in the ParserOutput object for later |
243 | * use during page output. |
244 | * |
245 | * See ::setExtensionData() for more details on rationale and use. |
246 | * |
247 | * @param string $key The key for accessing the data. Extensions should take care to avoid |
248 | * conflicts in naming keys. It is suggested to use the extension's name as a prefix. |
249 | * |
250 | * @param int|string $value The value to append to the list. |
251 | * @return never This method is not yet implemented. |
252 | */ |
public function appendExtensionData( string $key, $value ): void {
	// Intentionally unimplemented. A real implementation would follow
	// ParserOutput::appendExtensionData, but it is deferred until there is
	// a demonstrated need and until the expected merge semantics for
	// section data are settled.
	$message = "Not yet implemented";
	throw new \InvalidArgumentException( $message );
}
261 | |
262 | /** |
263 | * Gets extension data previously attached to this SectionMetadata. |
264 | * |
265 | * @param string $key The key to look up |
266 | * @return mixed|null The value(s) previously set for the given key using |
267 | * ::setExtensionData() or ::appendExtensionData(), or null if no |
268 | * value was set for this key. |
269 | */ |
public function getExtensionData( $key ) {
	// Keys that were never set yield null.
	return $this->extensionData[$key] ?? null;
}
274 | |
275 | /** |
276 | * Alias for ::toLegacy(), for b/c compatibility only.
277 | * @deprecated |
278 | * @return array |
279 | */ |
public function toArray(): array {
	// Pure delegation: the legacy array form is the only array form.
	$legacy = $this->toLegacy();
	return $legacy;
}
283 | |
284 | /** |
285 | * Alias for ::fromLegacy(), for b/c compatibility only.
286 | * @deprecated |
287 | * @param array $data |
288 | * @return SectionMetadata |
289 | */ |
public static function fromArray( array $data ): SectionMetadata {
	// Pure delegation to the legacy-format factory.
	$section = self::fromLegacy( $data );
	return $section;
}
293 | |
294 | /** |
295 | * Create a new SectionMetadata object from an array in the legacy |
296 | * format returned by the action API. |
297 | * |
298 | * This is useful for backward-compatibility, but is expected to |
299 | * be replaced by conversion to/from JSON in the future. |
300 | * |
301 | * @param array $data Associative array with section metadata |
302 | * @return SectionMetadata |
303 | */ |
public static function fromLegacy( array $data ): SectionMetadata {
	// 'level' is cast because toLegacy() emits it as a string.
	$hLevel = (int)( $data['level'] ?? -1 );

	// Any falsy 'fromtitle' (toLegacy() emits false for "none") becomes null.
	$fromTitle = $data['fromtitle'] ?? false;
	if ( !$fromTitle ) {
		$fromTitle = null;
	}

	// T319141: the legacy 'byteoffset' key actually holds a codepoint offset.
	$codepointOffset = $data['byteoffset'] ?? null;

	// A missing 'linkAnchor' falls back to 'anchor', then to ''.
	$anchor = $data['anchor'] ?? '';
	$linkAnchor = $data['linkAnchor'] ?? $anchor;

	return new SectionMetadata(
		$data['toclevel'] ?? 0,
		$hLevel,
		$data['line'] ?? '',
		$data['number'] ?? '',
		$data['index'] ?? '',
		$fromTitle,
		$codepointOffset,
		$anchor,
		$linkAnchor,
		$data['extensionData'] ?? null
	);
}
318 | |
319 | /** |
320 | * Return as associative array, in the format returned by the |
321 | * action API (including the order of fields and the value types). |
322 | * |
323 | * This is helpful as b/c support while we transition to objects. |
324 | * @return array |
325 | */ |
public function toLegacy(): array {
	// The legacy format reports "no title" as false rather than null.
	$fromTitle = $this->fromTitle === null ? false : $this->fromTitle;

	// Key order below is the legacy field order.
	$legacy = [
		'toclevel' => $this->tocLevel,
		// Emitted as a string for b/c with the parse API.
		'level' => (string)$this->hLevel,
		'line' => $this->line,
		'number' => $this->number,
		'index' => $this->index,
		'fromtitle' => $fromTitle,
		// T319141: legacy 'byteoffset' is actually a codepoint offset.
		'byteoffset' => $this->codepointOffset,
		'anchor' => $this->anchor,
		'linkAnchor' => $this->linkAnchor,
	];

	// Only emit 'extensionData' when there is some, to avoid bloat.
	if ( $this->extensionData !== [] ) {
		$legacy['extensionData'] = $this->extensionData;
	}

	return $legacy;
}
346 | |
347 | /** |
348 | * @inheritDoc |
349 | */ |
public function jsonSerialize(): array {
	// json_encode() on this object yields the legacy array form.
	$legacy = $this->toLegacy();
	return $legacy;
}
353 | |
354 | // JsonCodecable interface |
355 | |
356 | /** @inheritDoc */ |
public function toJsonArray(): array {
	// Each entry pairs the current value with the value that
	// newFromJsonArray() would supply if the key were absent; a field is
	// only emitted when the two differ. Entry order is the output key order.
	$fields = [
		'tocLevel' => [ $this->tocLevel, 0 ],
		'hLevel' => [ $this->hLevel, -1 ],
		'line' => [ $this->line, '' ],
		'number' => [ $this->number, '' ],
		'index' => [ $this->index, '' ],
		'fromTitle' => [ $this->fromTitle, null ],
		'codepointOffset' => [ $this->codepointOffset, null ],
		'anchor' => [ $this->anchor, '' ],
		// linkAnchor defaults to the anchor itself, not to ''.
		'linkAnchor' => [ $this->linkAnchor, $this->anchor ],
		'extensionData' => [ $this->extensionData, [] ],
	];

	$json = [];
	foreach ( $fields as $name => [ $current, $default ] ) {
		if ( $current !== $default ) {
			$json[$name] = $current;
		}
	}
	return $json;
}
391 | |
392 | /** @inheritDoc */ |
public static function newFromJsonArray( array $json ) {
	// toJsonArray() omits 'linkAnchor' when it equals 'anchor', so a missing
	// 'linkAnchor' is restored from the anchor (or '' if that is absent too).
	$anchor = $json['anchor'] ?? '';
	$linkAnchor = $json['linkAnchor'] ?? $anchor;

	// Every other absent key takes the constructor's own default value.
	return new SectionMetadata(
		$json['tocLevel'] ?? 0,
		$json['hLevel'] ?? -1,
		$json['line'] ?? '',
		$json['number'] ?? '',
		$json['index'] ?? '',
		$json['fromTitle'] ?? null,
		$json['codepointOffset'] ?? null,
		$anchor,
		$linkAnchor,
		$json['extensionData'] ?? null
	);
}
407 | |
408 | // Pretty-printing |
409 | |
410 | /** |
411 | * For use in parser tests and wherever else humans might appreciate |
412 | * some formatting in the JSON encoded output. For now, nothing special. |
413 | * @param int $indent Additional indentation to apply (defaults to zero) |
414 | * @return string |
415 | */ |
public function prettyPrint( int $indent = 0 ): string {
	// Collect the space-separated fragments, then join them once at the end.
	// The leading pad is $indent plus one extra space per TOC level.
	$parts = [
		str_repeat( ' ', $indent + $this->tocLevel ) . 'h' . $this->hLevel,
		'index:' . $this->index,
		'toclevel:' . $this->tocLevel,
		'number:' . $this->number,
		// Optional information: absent values are shown as the text NULL.
		'title:' . ( $this->fromTitle ?? "NULL" ),
		'off:' . ( $this->codepointOffset ?? "NULL" ),
	];

	// Anchors are collapsed into one fragment when they are identical.
	if ( $this->anchor !== $this->linkAnchor ) {
		$parts[] = 'anchor:' . $this->anchor;
		$parts[] = 'linkAnchor:' . $this->linkAnchor;
	} else {
		$parts[] = 'anchor/linkAnchor:' . $this->anchor;
	}

	// A heading containing a newline is JSON-encoded; otherwise it is
	// printed verbatim.
	$heading = str_contains( $this->line, "\n" )
		? json_encode( $this->line )
		: $this->line;
	$parts[] = 'line:' . $heading;

	// Extension data is only printed when there is some.
	if ( $this->extensionData ) {
		$jsonCodec = new CompatJsonCodec();
		$parts[] = 'ext:' . json_encode( $jsonCodec->toJsonArray( $this->extensionData ) );
	}

	return implode( ' ', $parts );
}
447 | } |