Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
40.14% |
116 / 289 |
|
40.74% |
11 / 27 |
CRAP | |
0.00% |
0 / 1 |
TokenUtils | |
40.14% |
116 / 289 |
|
40.74% |
11 / 27 |
6668.46 | |
0.00% |
0 / 1 |
getTokenType | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isWikitextBlockTag | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tagOpensBlockScope | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
tagClosesBlockScope | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
isTemplateToken | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isHTMLTag | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
7 | |||
hasDOMFragmentType | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isTableTag | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 | |||
isSolTransparentLinkTag | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
5 | |||
isBehaviorSwitch | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
30 | |||
isSolTransparent | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
90 | |||
isTranslationUnitMarker | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
isEmptyLineMetaToken | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
matchTypeOf | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
hasTypeOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
shiftTokenTSR | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
552 | |||
stripEOFTkFromTokens | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
convertOffsets | |
95.16% |
59 / 62 |
|
0.00% |
0 / 1 |
32 | |||
convertTokenOffsets | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
90 | |||
collectOffsets | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
156 | |||
isEntitySpanToken | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
4 | |||
newlinesToNlTks | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
tokensToString | |
50.00% |
19 / 38 |
|
0.00% |
0 / 1 |
82.50 | |||
kvToHash | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
tokenTrim | |
13.04% |
3 / 23 |
|
0.00% |
0 / 1 |
90.56 | |||
isAnnotationStartToken | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
isAnnotationEndToken | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | /** |
5 | * This file contains general utilities for: |
6 | * (a) querying token properties and token types |
7 | * (b) manipulating tokens, individually and as collections. |
8 | */ |
9 | |
10 | namespace Wikimedia\Parsoid\Utils; |
11 | |
12 | use Wikimedia\Assert\Assert; |
13 | use Wikimedia\Assert\UnreachableException; |
14 | use Wikimedia\Parsoid\Config\Env; |
15 | use Wikimedia\Parsoid\Core\DomSourceRange; |
16 | use Wikimedia\Parsoid\Tokens\CommentTk; |
17 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
18 | use Wikimedia\Parsoid\Tokens\EOFTk; |
19 | use Wikimedia\Parsoid\Tokens\KV; |
20 | use Wikimedia\Parsoid\Tokens\KVSourceRange; |
21 | use Wikimedia\Parsoid\Tokens\NlTk; |
22 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
23 | use Wikimedia\Parsoid\Tokens\SourceRange; |
24 | use Wikimedia\Parsoid\Tokens\TagTk; |
25 | use Wikimedia\Parsoid\Tokens\Token; |
26 | use Wikimedia\Parsoid\Wikitext\Consts; |
27 | |
28 | class TokenUtils { |
29 | public const SOL_TRANSPARENT_LINK_REGEX = |
30 | '/(?:^|\s)mw:PageProp\/(?:Category|redirect|Language)(?=$|\s)/D'; |
31 | |
/**
 * Return the type name of a token.
 *
 * Plain strings are reported as 'string'; anything else is asked for
 * its own type via getType().
 *
 * @param Token|string $token
 * @return string
 */
public static function getTokenType( $token ): string {
	if ( is_string( $token ) ) {
		return 'string';
	}
	return $token->getType();
}
40 | |
/**
 * Is the given tag name listed in Consts::$wikitextBlockElems?
 *
 * @param string $name Tag name to look up.
 * @return bool
 */
public static function isWikitextBlockTag( string $name ): bool {
	$blockElems = Consts::$wikitextBlockElems;
	return isset( $blockElems[$name] );
}
48 | |
/**
 * In the legacy parser, these block tags open block-tag scope.
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * A name qualifies if it is a key of either Consts::$blockElems or
 * Consts::$alwaysBlockElems.
 *
 * @param string $name
 * @return bool
 */
public static function tagOpensBlockScope( string $name ): bool {
	if ( isset( Consts::$blockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$alwaysBlockElems[$name] );
}
60 | |
/**
 * In the legacy parser, these block tags close block-tag scope.
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * A name qualifies if it is a key of either Consts::$antiBlockElems or
 * Consts::$neverBlockElems.
 *
 * @param string $name
 * @return bool
 */
public static function tagClosesBlockScope( string $name ): bool {
	if ( isset( Consts::$antiBlockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$neverBlockElems[$name] );
}
72 | |
/**
 * Is this a template token, i.e. a self-closing tag token named
 * 'template'?
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isTemplateToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	return $token->getName() === 'template';
}
81 | |
/**
 * Determine whether the current token was an HTML tag in wikitext,
 * i.e. a start, end or self-closing tag token whose data-parsoid
 * records `stx === 'html'`.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isHTMLTag( $token ): bool {
	// instanceof is already false for null and for strings, so no
	// separate truthiness / is_string() guard is needed. The class name
	// is spelled exactly as imported (SelfclosingTagTk).
	$isTagToken = $token instanceof TagTk ||
		$token instanceof EndTagTk ||
		$token instanceof SelfclosingTagTk;

	return $isTagToken && ( $token->dataParsoid->stx ?? null ) === 'html';
}
96 | |
/**
 * Is the token a DOMFragment type value?
 *
 * True when one of the token's `typeof` values is `mw:DOMFragment`,
 * optionally followed by `/sealed/<word>`.
 *
 * @param Token $token
 * @return bool
 */
public static function hasDOMFragmentType( Token $token ): bool {
	$matched = self::matchTypeOf( $token, '#^mw:DOMFragment(/sealed/\w+)?$#D' );
	return $matched !== null;
}
106 | |
/**
 * Is the token a table tag?
 *
 * Only start and end tag tokens are considered; the tag name must be a
 * key of Consts::$HTML['TableTags'].
 *
 * @param Token|string $token
 * @return bool
 */
public static function isTableTag( $token ): bool {
	if ( !( $token instanceof TagTk ) && !( $token instanceof EndTagTk ) ) {
		return false;
	}
	return isset( Consts::$HTML['TableTags'][$token->getName()] );
}
117 | |
/**
 * Determine if token is a transparent link tag.
 *
 * That is: a start, end or self-closing tag token named 'link' whose
 * `rel` attribute matches SOL_TRANSPARENT_LINK_REGEX.
 *
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparentLinkTag( $token ): bool {
	$isTagToken = $token instanceof SelfclosingTagTk ||
		$token instanceof TagTk ||
		$token instanceof EndTagTk;
	if ( !$isTagToken || $token->getName() !== 'link' ) {
		return false;
	}

	// A missing rel attribute is matched as the empty string.
	$rel = $token->getAttributeV( 'rel' ) ?? '';
	return (bool)preg_match( self::SOL_TRANSPARENT_LINK_REGEX, $rel );
}
133 | |
/**
 * Does this token represent a behavior switch?
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isBehaviorSwitch( Env $env, $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}

	$name = $token->getName();

	// Before BehaviorSwitchHandler (ie. PreHandler, etc.)
	if ( $name === 'behavior-switch' ) {
		return true;
	}

	// After BehaviorSwitchHandler
	// (ie. ListHandler, ParagraphWrapper, etc.)
	if ( $name !== 'meta' || !$token->hasAttribute( 'property' ) ) {
		return false;
	}
	return (bool)preg_match(
		$env->getSiteConfig()->bswPagePropRegexp(),
		$token->getAttributeV( 'property' ) ?? ''
	);
}
153 | |
/**
 * This should come close to matching
 * {@link WTUtils::emitsSolTransparentSingleLineWT},
 * without the single line caveat.
 *
 * The checks run in order: blank (space/tab only) strings, transparent
 * link tags, comments that are not translation-unit markers, behavior
 * switches, and finally meta tags that did not come from HTML syntax.
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparent( Env $env, $token ): bool {
	if ( is_string( $token ) ) {
		return (bool)preg_match( '/^[ \t]*$/D', $token );
	}
	if ( self::isSolTransparentLinkTag( $token ) ) {
		return true;
	}
	if ( $token instanceof CommentTk && !self::isTranslationUnitMarker( $env, $token ) ) {
		return true;
	}
	if ( self::isBehaviorSwitch( $env, $token ) ) {
		return true;
	}
	if ( !( $token instanceof SelfclosingTagTk ) || $token->getName() !== 'meta' ) {
		return false;
	}

	// only metas left
	$stx = $token->dataParsoid->stx ?? null;
	return $stx !== 'html';
}
177 | |
/**
 * HACK: Returns true if $token looks like a TU marker (<!--T:XXX-->) and if we could be in a
 * translate-annotated page.
 *
 * @param Env $env
 * @param CommentTk $token
 * @return bool
 */
public static function isTranslationUnitMarker( Env $env, CommentTk $token ): bool {
	if ( !$env->hasAnnotations ) {
		return false;
	}
	if ( !$env->getSiteConfig()->isAnnotationTag( 'translate' ) ) {
		return false;
	}
	// The comment text must start with "T:"
	return preg_match( '/^T:/', $token->value ) === 1;
}
190 | |
/**
 * Is the token an empty-line meta token, i.e. a self-closing `meta`
 * tag whose `typeof` attribute is exactly 'mw:EmptyLine'?
 *
 * @param Token|string $token
 * @return bool
 */
public static function isEmptyLineMetaToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	if ( $token->getName() !== 'meta' ) {
		return false;
	}
	return $token->getAttributeV( 'typeof' ) === 'mw:EmptyLine';
}
202 | |
/**
 * Determine whether the token matches the given `typeof` attribute value.
 *
 * The attribute is split on whitespace and each piece is tested against
 * $typeRe in order; the first piece that matches is returned.
 *
 * @param Token $t The token to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match.
 */
public static function matchTypeOf( Token $t, string $typeRe ): ?string {
	$typeof = $t->getAttributeV( 'typeof' );
	if ( $typeof === null ) {
		return null;
	}
	Assert::invariant( is_string( $typeof ), "Typeof is not simple" );

	$candidates = preg_split( '/\s+/', $typeof, -1, PREG_SPLIT_NO_EMPTY );
	foreach ( $candidates as $candidate ) {
		$matched = preg_match( $typeRe, $candidate );
		Assert::invariant( $matched !== false, "Bad regexp" );
		if ( $matched === 1 ) {
			return $candidate;
		}
	}
	return null;
}
227 | |
/**
 * Determine whether the token matches the given typeof attribute value.
 *
 * The literal is regex-quoted and anchored, then handed to
 * matchTypeOf().
 *
 * @param Token $t
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string.
 * @return bool True if the token matches.
 */
public static function hasTypeOf( Token $t, string $type ): bool {
	$exactRe = '/^' . preg_quote( $type, '/' ) . '$/D';
	return self::matchTypeOf( $t, $exactRe ) !== null;
}
241 | |
/**
 * Shift TSR of a token
 *
 * For every tag, self-closing tag, end tag, newline and comment token in
 * $tokens (and, recursively, in array-valued attribute keys/values), the
 * recorded source offsets are either shifted by $offset or, when $offset
 * is false/null, cleared. Tokens are objects, so they are updated in
 * place even though the array itself is passed by value.
 *
 * PORT-FIXME: In JS this was sometimes called with $offset=undefined, which meant do
 * nothing by default, except if there was a third parameter set to true, in which case it
 * meant the same thing as $offset = null. We can't pass in undefined in PHP, so this should
 * usually be handled with isset() in the caller. But isset() returns false if the variable is
 * null, so let's use false instead of null for whatever the previous code meant by a null
 * offset.
 *
 * @param array<Token|string> $tokens
 * @param int|false|null $offset Amount to shift by; 0 is a no-op, false
 *   (or null, accepted for JS b/c) clears the offsets instead.
 */
public static function shiftTokenTSR( array $tokens, $offset ): void {
	// Bail early if we can
	if ( $offset === 0 ) {
		return;
	}

	// JS b/c
	if ( $offset === null ) {
		$offset = false;
	}

	// Only these exact token classes carry offsets that need updating.
	$shiftable = [
		TagTk::class,
		SelfclosingTagTk::class,
		NlTk::class,
		CommentTk::class,
		EndTagTk::class,
	];

	// update/clear tsr
	// foreach (rather than 0..count-1 indexing) so that arrays with
	// gaps in their keys are handled without undefined-index errors.
	foreach ( $tokens as $t ) {
		if ( !is_object( $t ) || !in_array( get_class( $t ), $shiftable, true ) ) {
			continue;
		}

		$da = $t->dataParsoid;
		$tsr = $da->tsr;
		if ( $tsr ) {
			$da->tsr = $offset ? $tsr->offset( $offset ) : null;
		}

		if ( $offset && isset( $da->extTagOffsets ) ) {
			$da->extTagOffsets = $da->extTagOffsets->offset( $offset );
		}

		// SSS FIXME: offset will always be available in
		// chunky-tokenizer mode in which case we wont have
		// buggy offsets below. The null scenario is only
		// for when the token-stream-patcher attempts to
		// reparse a string -- it is likely to only patch up
		// small string fragments and the complicated use cases
		// below should not materialize.
		// CSA: token-stream-patcher shouldn't have problems
		// now that $frame->srcText is always accurate?

		// content offsets for ext-links
		if ( $offset && isset( $da->tmp->extLinkContentOffsets ) ) {
			$da->tmp->extLinkContentOffsets =
				$da->tmp->extLinkContentOffsets->offset( $offset );
		}

		// Process attributes
		if ( isset( $t->attribs ) ) {
			foreach ( $t->attribs as $a ) {
				if ( is_array( $a->k ) ) {
					self::shiftTokenTSR( $a->k, $offset );
				}
				if ( is_array( $a->v ) ) {
					self::shiftTokenTSR( $a->v, $offset );
				}

				// src offsets used to set mw:TemplateParams
				if ( !$offset ) {
					$a->srcOffsets = null;
				} elseif ( $a->srcOffsets !== null ) {
					$a->srcOffsets = $a->srcOffsets->offset( $offset );
				}
			}
		}
	}
}
332 | |
/**
 * Strip EOFTk token from token chunk.
 * The EOFTk is expected to be the last token of the chunk; only the
 * element at index count-1 is inspected.
 *
 * @param array &$tokens Modified in place.
 * @return array return the modified token array so that this call can be chained
 */
public static function stripEOFTkFromTokens( array &$tokens ): array {
	$lastIdx = count( $tokens ) - 1;
	if ( $lastIdx >= 0 && $tokens[$lastIdx] instanceof EOFTk ) {
		array_pop( $tokens );
	}
	return $tokens;
}
347 | |
/**
 * Convert string offsets
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * Offsets that are mid-Unicode character are "rounded" up to the next full
 * character, i.e. the output offset will always point to the start of a
 * Unicode code point (or just past the end of the string). Offsets outside
 * the string are "rounded" to 0 or just-past-the-end.
 *
 * The string is walked once, one code point at a time, while three
 * counters (byte, char, ucs2) advance in lock-step; each offset is
 * rewritten as soon as the walk reaches it in the $from unit.
 *
 * @note When constructing the array of offsets to pass to this method,
 *  populate it with references as `$offsets[] = &$var;`.
 *
 * @param string $s Unicode string the offsets are offsets into, UTF-8 encoded.
 * @param ('byte'|'ucs2'|'char') $from Offset type to convert from.
 * @param ('byte'|'ucs2'|'char') $to Offset type to convert to.
 * @param int[] $offsets References to the offsets to convert.
 * @throws \InvalidArgumentException If $from or $to is not a known
 *  offset type, or if an invalid UTF-8 lead byte is encountered in $s.
 */
public static function convertOffsets(
	string $s, string $from, string $to, array $offsets
): void {
	static $valid = [ 'byte', 'char', 'ucs2' ];
	if ( !in_array( $from, $valid, true ) ) {
		throw new \InvalidArgumentException( 'Invalid $from' );
	}
	if ( !in_array( $to, $valid, true ) ) {
		throw new \InvalidArgumentException( 'Invalid $to' );
	}

	$offsetCt = count( $offsets );
	if ( $offsetCt === 0 ) { // Nothing to do
		return;
	}
	// Process offsets in ascending order so one forward pass suffices.
	sort( $offsets, SORT_NUMERIC );
	$i = 0;

	// NOTE: these three names are looked up dynamically just below
	// ("{$from}Pos" / "{$to}Pos"); do not rename them.
	$bytePos = 0;
	$ucs2Pos = 0;
	$charPos = 0;

	$fromPos = &${$from . 'Pos'}; // @phan-suppress-current-line PhanPluginDollarDollar
	$toPos = &${$to . 'Pos'}; // @phan-suppress-current-line PhanPluginDollarDollar

	$byteLen = strlen( $s );
	while ( $bytePos < $byteLen ) {
		// Update offsets that we've reached
		while ( $offsets[$i] <= $fromPos ) {
			$offsets[$i] = $toPos;
			if ( ++$i >= $offsetCt ) {
				return;
			}
		}

		// Update positions: the lead byte tells us how long the
		// UTF-8 sequence is.
		++$charPos;
		$lead = ord( $s[$bytePos] );
		if ( $lead < 0x80 ) {
			// 0xxxxxxx: single byte
			++$bytePos;
			++$ucs2Pos;
		} elseif ( $lead >= 0xc0 && $lead < 0xe0 ) {
			// 110xxxxx: two-byte sequence
			$bytePos += 2;
			++$ucs2Pos;
		} elseif ( $lead >= 0xe0 && $lead < 0xf0 ) {
			// 1110xxxx: three-byte sequence
			$bytePos += 3;
			++$ucs2Pos;
		} elseif ( $lead >= 0xf0 && $lead < 0xf8 ) {
			// 11110xxx: four-byte sequence, counted as two ucs2 units
			$bytePos += 4;
			$ucs2Pos += 2;
		} else {
			// Continuation byte in lead position, or 0xf8 and above
			throw new \InvalidArgumentException( '$s is not UTF-8' );
		}
	}

	// Convert any offsets past the end of the string to the length of the
	// string.
	for ( ; $i < $offsetCt; ++$i ) {
		$offsets[$i] = $toPos;
	}
}
459 | |
/**
 * Convert offsets in a token array
 *
 * Works in three steps: (1) gather references to every start/end offset
 * reachable from $tokens, temporarily rewriting DomSourceRange widths as
 * absolute positions so they can be converted too; (2) convert all
 * gathered offsets in one pass; (3) turn the DomSourceRange widths back
 * into widths relative to the converted start/end.
 *
 * @see TokenUtils::convertOffsets()
 *
 * @param string $s The offset reference string
 * @param ('byte'|'ucs2'|'char') $from Offset type to convert from
 * @param ('byte'|'ucs2'|'char') $to Offset type to convert to
 * @param array<Token|string|array> $tokens
 */
public static function convertTokenOffsets(
	string $s, string $from, string $to, array $tokens
): void {
	$offsets = []; /* @var array<int> $offsets */
	self::collectOffsets( $tokens, static function ( $sr ) use ( &$offsets ) {
		if ( $sr instanceof DomSourceRange ) {
			// Adjust the widths to be actual character offsets
			if ( $sr->openWidth !== null ) {
				Assert::invariant( $sr->start !== null, "width w/o start" );
				$sr->openWidth = $sr->start + $sr->openWidth;
				$offsets[] =& $sr->openWidth;
			}
			if ( $sr->closeWidth !== null ) {
				Assert::invariant( $sr->end !== null, "width w/o end" );
				$sr->closeWidth = $sr->end - $sr->closeWidth;
				$offsets[] =& $sr->closeWidth;
			}
		}
		if ( $sr->start !== null ) {
			$offsets[] =& $sr->start;
		}
		if ( $sr->end !== null ) {
			$offsets[] =& $sr->end;
		}
	} );
	self::convertOffsets( $s, $from, $to, $offsets );
	// This second pass only touches the ranges themselves, so it does not
	// need to capture $offsets.
	self::collectOffsets( $tokens, static function ( $sr ) {
		if ( $sr instanceof DomSourceRange ) {
			// Adjust widths back from being character offsets
			if ( $sr->openWidth !== null ) {
				$sr->openWidth -= $sr->start;
			}
			if ( $sr->closeWidth !== null ) {
				$sr->closeWidth = $sr->end - $sr->closeWidth;
			}
		}
	} );
}
508 | |
/**
 * Recursively walk $input and invoke $offsetFunc on every SourceRange
 * (including DomSourceRange) found in it.
 *
 * Arrays are walked element by element; KVs contribute their key, value
 * and srcOffsets; Tokens contribute the offset-bearing data-parsoid
 * fields plus their attributes; KVSourceRanges contribute their key and
 * value ranges. Anything else (e.g. plain strings) is ignored.
 *
 * @param array<Token|string>|array<KV>|KV|Token|DomSourceRange|KVSourceRange|SourceRange|string $input
 * @param callable $offsetFunc
 */
private static function collectOffsets( $input, callable $offsetFunc ): void {
	if ( is_array( $input ) ) {
		foreach ( $input as $item ) {
			self::collectOffsets( $item, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof KV ) {
		self::collectOffsets( $input->k, $offsetFunc );
		self::collectOffsets( $input->v, $offsetFunc );
		if ( $input->srcOffsets ) {
			self::collectOffsets( $input->srcOffsets, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof Token ) {
		$dp = $input->dataParsoid ?? null;
		if ( isset( $dp->tsr ) ) {
			self::collectOffsets( $dp->tsr, $offsetFunc );
		}
		if ( isset( $dp->tmp->extLinkContentOffsets ) ) {
			self::collectOffsets( $dp->tmp->extLinkContentOffsets, $offsetFunc );
		}
		if ( isset( $dp->tokens ) ) {
			self::collectOffsets( $dp->tokens, $offsetFunc );
		}
		if ( isset( $dp->extTagOffsets ) ) {
			self::collectOffsets( $dp->extTagOffsets, $offsetFunc );
		}
		self::collectOffsets( $input->attribs, $offsetFunc );
		return;
	}

	if ( $input instanceof KVSourceRange ) {
		self::collectOffsets( $input->key, $offsetFunc );
		self::collectOffsets( $input->value, $offsetFunc );
		return;
	}

	if ( $input instanceof SourceRange ) {
		// This includes DomSourceRange
		$offsetFunc( $input );
	}
}
546 | |
/**
 * Tests whether token represents an HTML entity.
 * Think `<span typeof="mw:Entity">`.
 *
 * Only start tag tokens named 'span' with that typeof qualify.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isEntitySpanToken( $token ): bool {
	if ( !$token || !( $token instanceof TagTk ) ) {
		return false;
	}
	if ( $token->getName() !== 'span' ) {
		return false;
	}
	return self::hasTypeOf( $token, 'mw:Entity' );
}
559 | |
/**
 * Transform `"\n"` and `"\r\n"` in the input string to {@link NlTk} tokens.
 *
 * The string is split at each newline; the pieces (possibly empty) are
 * returned in order with one NlTk inserted between each adjacent pair.
 *
 * @param string $str
 * @return array (interspersed string and NlTk tokens)
 */
public static function newlinesToNlTks( string $str ): array {
	$pieces = preg_split( '/\n|\r\n/', $str );
	$lastIdx = count( $pieces ) - 1;

	$result = [];
	foreach ( $pieces as $idx => $piece ) {
		$result[] = $piece;
		// N pieces are separated by N-1 newlines: none after the last.
		if ( $idx < $lastIdx ) {
			$result[] = new NlTk( null );
		}
	}
	return $result;
}
576 | |
/**
 * Flatten/convert a token array into a string.
 *
 * Recognised $opts keys (all optional, tested with empty()):
 *  - 'retainNLs': when unset/falsy, NlTk tokens are dropped.
 *  - 'stripEmptyLineMeta': drop empty-line meta tokens.
 *  - 'includeEntities': emit the source of entity spans.
 *  - 'unpackDOMFragments': emit the text content of DOM fragments;
 *    this reads $opts['env'].
 *
 * @param string|Token|array<Token|string> $tokens
 * @param bool $strict Whether to abort as soon as we find a token we
 *   can't stringify.
 * @param array<string,bool|Env> $opts
 * @return string|array{0:string,1:Array<Token|string>}
 *   The stringified tokens. If $strict is true, returns a two-element
 *   array containing string prefix and the remainder of the tokens as
 *   soon as we encounter something we can't stringify.
 *
 * Unsure why phan is whining about $opts array accesses.
 * So for now, I am simply suppressing those warnings.
 */
public static function tokensToString( $tokens, bool $strict = false, array $opts = [] ) {
	if ( is_string( $tokens ) ) {
		return $tokens;
	}

	// A lone token is handled as a one-element list.
	if ( !is_array( $tokens ) ) {
		$tokens = [ $tokens ];
	}

	$keepNewlines = !empty( $opts['retainNLs'] );
	$text = '';
	$total = count( $tokens );
	// An index loop is required: some branches skip ahead in the list.
	for ( $i = 0; $i < $total; $i++ ) {
		$token = $tokens[$i];

		if ( $token === null ) {
			throw new UnreachableException( "No nulls expected." );
		}

		if ( $token instanceof KV ) {
			// Since this function is occasionally called on KV->v,
			// whose signature recursively includes KV[], a mismatch with
			// this function, we assert that those values are only
			// included in safe places that don't intend to stringify
			// their tokens.
			throw new UnreachableException( "No KVs expected." );
		}

		if ( is_string( $token ) ) {
			$text .= $token;
			continue;
		}

		if ( is_array( $token ) ) {
			Assert::invariant( !$strict, "strict case handled above" );
			$text .= self::tokensToString( $token, $strict, $opts );
			continue;
		}

		// strip comments and newlines
		if ( $token instanceof CommentTk || ( !$keepNewlines && $token instanceof NlTk ) ) {
			continue;
		}

		// If requested, strip empty line meta tokens too.
		if ( !empty( $opts['stripEmptyLineMeta'] ) && self::isEmptyLineMetaToken( $token ) ) {
			continue;
		}

		if ( !empty( $opts['includeEntities'] ) && self::isEntitySpanToken( $token ) ) {
			$text .= $token->dataParsoid->src;
			$i += 2; // Skip child and end tag.
			continue;
		}

		if ( $strict ) {
			// If strict, return accumulated string on encountering first non-text token
			return [ $text, array_slice( $tokens, $i ) ];
		}

		// This option shouldn't be used if the tokens have been
		// expanded to DOM
		$isFragmentWrapper = !empty( $opts['unpackDOMFragments'] ) &&
			( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) &&
			self::hasDOMFragmentType( $token );
		if ( !$isFragmentWrapper ) {
			// Any other token contributes nothing to the output.
			continue;
		}

		// Handle dom fragments
		$domFragment = $opts['env']->getDOMFragment(
			$token->dataParsoid->html
		);
		// Calling `env->removeDOMFragment()` here is case dependent
		// but should be rare enough when permissible that it can be
		// ignored.
		// FIXME: The correct thing to do would be to return
		// `$domFragment.innerHTML` for the current scenarios where
		// `unpackDOMFragments` is used (expanded attribute
		// values and reparses thereof) but we'd need to remove
		// the span wrapping and typeof annotation of extension
		// content and nowikis.  Since we're primarily expecting
		// to find <translate> and <nowiki> here, this will do.
		$text .= $domFragment->textContent;
		if ( $token instanceof TagTk ) {
			$i += 1; // Skip the EndTagTK
			Assert::invariant(
				$i >= $total || $tokens[$i] instanceof EndTagTk,
				"tag should be followed by endtag"
			);
		}
	}
	return $text;
}
663 | |
/**
 * Convert an array of key-value pairs into a hash of keys to values.
 * For duplicate keys, the last entry wins.
 *
 * Keys are stringified, trimmed and lower-cased; values are stringified
 * and then trimmed with tokenTrim().
 *
 * @param array<KV> $kvs
 * @return array<string,array<Token|string>>|array<string,string>
 */
public static function kvToHash( array $kvs ): array {
	$hash = [];
	foreach ( $kvs as $pair ) {
		$name = mb_strtolower( trim( self::tokensToString( $pair->k ) ) );
		// SSS FIXME: Temporary fix to handle extensions which use
		// entities in attribute values. We need more robust handling
		// of non-string template attribute values in general.
		$hash[$name] = self::tokenTrim( self::tokensToString( $pair->v ) );
	}
	return $hash;
}
682 | |
/**
 * Trim space and newlines from leading and trailing text tokens.
 *
 * A plain string is simply trim()med and any other non-array value is
 * returned untouched. For an array, leading and trailing NlTk tokens
 * are replaced by '' and whitespace is stripped from the adjoining
 * string tokens; each scan stops at the first string that is still
 * non-empty after stripping, or at the first non-string, non-NlTk
 * token.
 *
 * @param string|Token|(Token|string)[] $tokens
 * @return string|Token|(Token|string)[]
 */
public static function tokenTrim( $tokens ) {
	if ( is_string( $tokens ) ) {
		return trim( $tokens );
	}
	if ( !is_array( $tokens ) ) {
		return $tokens;
	}

	// strip leading space
	// Elements are written back by key rather than through a foreach
	// reference, so no reference outlives the loop or ends up inside
	// the returned array.
	foreach ( $tokens as $idx => $token ) {
		if ( $token instanceof NlTk ) {
			$tokens[$idx] = '';
		} elseif ( is_string( $token ) ) {
			$tokens[$idx] = preg_replace( '/^\s+/', '', $token, 1 );
			if ( $tokens[$idx] !== '' ) {
				break;
			}
		} else {
			break;
		}
	}

	// strip trailing space
	for ( $i = count( $tokens ) - 1; $i >= 0; $i-- ) {
		$token = $tokens[$i];
		if ( $token instanceof NlTk ) {
			$tokens[$i] = ''; // replace newline with empty
		} elseif ( is_string( $token ) ) {
			$tokens[$i] = preg_replace( '/\s+$/D', '', $token, 1 );
			if ( $tokens[$i] !== '' ) {
				break;
			}
		} else {
			break;
		}
	}

	return $tokens;
}
729 | |
/**
 * Checks whether the provided meta tag token is an annotation start token:
 * its typeof matches WTUtils::ANNOTATION_META_TYPE_REGEXP and does not
 * end in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationStartToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $annotationType === null ) {
		return false;
	}
	return !str_ends_with( $annotationType, '/End' );
}
739 | |
/**
 * Checks whether the provided meta tag token is an annotation end token:
 * its typeof matches WTUtils::ANNOTATION_META_TYPE_REGEXP and ends in
 * '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationEndToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $annotationType === null ) {
		return false;
	}
	return str_ends_with( $annotationType, '/End' );
}
749 | } |