Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
39.80% |
117 / 294 |
|
36.67% |
11 / 30 |
CRAP | |
0.00% |
0 / 1 |
TokenUtils | |
39.80% |
117 / 294 |
|
36.67% |
11 / 30 |
7170.72 | |
0.00% |
0 / 1 |
getTokenType | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
isWikitextBlockTag | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tagOpensBlockScope | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
tagClosesBlockScope | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
isTemplateToken | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
isTemplateArgToken | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isExtensionToken | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isHTMLTag | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
7 | |||
hasDOMFragmentType | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isTableTag | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 | |||
isSolTransparentLinkTag | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
5 | |||
isBehaviorSwitch | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
30 | |||
isSolTransparent | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
90 | |||
isAnnotationMetaToken | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isAnnotationStartToken | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
isAnnotationEndToken | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
isTranslationUnitMarker | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
isEmptyLineMetaToken | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
matchTypeOf | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
hasTypeOf | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
shiftTokenTSR | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
552 | |||
stripEOFTkFromTokens | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
convertOffsets | |
93.65% |
59 / 63 |
|
0.00% |
0 / 1 |
32.26 | |||
convertTokenOffsets | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
90 | |||
collectOffsets | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
156 | |||
isEntitySpanToken | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
4 | |||
newlinesToNlTks | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
tokensToString | |
50.00% |
19 / 38 |
|
0.00% |
0 / 1 |
82.50 | |||
kvToHash | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
tokenTrim | |
13.04% |
3 / 23 |
|
0.00% |
0 / 1 |
90.56 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\Core\DomSourceRange; |
10 | use Wikimedia\Parsoid\Tokens\CommentTk; |
11 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
12 | use Wikimedia\Parsoid\Tokens\EOFTk; |
13 | use Wikimedia\Parsoid\Tokens\KV; |
14 | use Wikimedia\Parsoid\Tokens\KVSourceRange; |
15 | use Wikimedia\Parsoid\Tokens\NlTk; |
16 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
17 | use Wikimedia\Parsoid\Tokens\SourceRange; |
18 | use Wikimedia\Parsoid\Tokens\TagTk; |
19 | use Wikimedia\Parsoid\Tokens\Token; |
20 | use Wikimedia\Parsoid\Wikitext\Consts; |
21 | |
22 | /** |
23 | * This class contains general utilities for: |
24 | * (a) querying token properties and token types |
25 | * (b) manipulating tokens, individually and as collections. |
26 | */ |
27 | class TokenUtils { |
28 | public const SOL_TRANSPARENT_LINK_REGEX = |
29 | '/(?:^|\s)mw:PageProp\/(?:Category|redirect|Language)(?=$|\s)/D'; |
30 | |
/**
 * Name the kind of a token-stream element.
 *
 * @param Token|string $token
 * @return string The literal 'string' for plain PHP strings, otherwise
 *   whatever the token's own getType() reports.
 */
public static function getTokenType( $token ): string {
	if ( is_string( $token ) ) {
		return 'string';
	}
	return $token->getType();
}
39 | |
/**
 * Check whether a tag name is a key of Consts::$wikitextBlockElems.
 *
 * @param string $name Tag name to look up
 * @return bool
 */
public static function isWikitextBlockTag( string $name ): bool {
	$blockTags = Consts::$wikitextBlockElems;
	return isset( $blockTags[$name] );
}
47 | |
/**
 * In the legacy parser, these block tags open block-tag scope.
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * The name qualifies when it is a key of either Consts::$blockElems or
 * Consts::$alwaysBlockElems.
 *
 * @param string $name Tag name to look up
 * @return bool
 */
public static function tagOpensBlockScope( string $name ): bool {
	if ( isset( Consts::$blockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$alwaysBlockElems[$name] );
}
59 | |
/**
 * In the legacy parser, these block tags close block-tag scope.
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * The name qualifies when it is a key of either Consts::$antiBlockElems or
 * Consts::$neverBlockElems.
 *
 * @param string $name Tag name to look up
 * @return bool
 */
public static function tagClosesBlockScope( string $name ): bool {
	if ( isset( Consts::$antiBlockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$neverBlockElems[$name] );
}
71 | |
/**
 * Is this a template token, i.e. a self-closing tag token named either
 * 'template' or 'templatearg'?
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isTemplateToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	$name = $token->getName();
	return $name === 'template' || $name === 'templatearg';
}
81 | |
/**
 * Is this a template argument token, i.e. a self-closing tag token named
 * 'templatearg'?
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isTemplateArgToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	return $token->getName() === 'templatearg';
}
90 | |
/**
 * Is this an extension token, i.e. a self-closing tag token named
 * 'extension'?
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isExtensionToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	return $token->getName() === 'extension';
}
99 | |
/**
 * Determine whether the current token was an HTML tag in wikitext.
 *
 * True only for start, end and self-closing tag tokens whose
 * `dataParsoid->stx` is set to exactly 'html'. Strings, null and every
 * other token type yield false.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isHTMLTag( $token ): bool {
	// The instanceof checks already reject null and strings, so no
	// separate truthiness / is_string() guard is needed.
	$isTagToken = $token instanceof TagTk ||
		$token instanceof EndTagTk ||
		$token instanceof SelfclosingTagTk;

	// `??` tolerates a missing dataParsoid or a missing stx property.
	return $isTagToken && ( $token->dataParsoid->stx ?? null ) === 'html';
}
114 | |
/**
 * Does the token carry a DOMFragment `typeof` value?
 *
 * Matches `mw:DOMFragment`, optionally followed by `/sealed/<word>`,
 * against each whitespace-separated `typeof` entry via matchTypeOf().
 *
 * @param Token $token
 * @return bool
 */
public static function hasDOMFragmentType( Token $token ): bool {
	$matched = self::matchTypeOf( $token, '#^mw:DOMFragment(/sealed/\w+)?$#D' );
	return $matched !== null;
}
124 | |
/**
 * Is the token a table tag?
 *
 * Only start and end tag tokens are considered; the tag name must be a
 * key of Consts::$HTML['TableTags'].
 *
 * @param Token|string $token
 * @return bool
 */
public static function isTableTag( $token ): bool {
	if ( $token instanceof TagTk || $token instanceof EndTagTk ) {
		return isset( Consts::$HTML['TableTags'][$token->getName()] );
	}
	return false;
}
135 | |
/**
 * Determine if token is a transparent link tag.
 *
 * That is: a start, end or self-closing tag token named 'link' whose
 * `rel` attribute matches SOL_TRANSPARENT_LINK_REGEX. A missing `rel`
 * attribute is treated as the empty string.
 *
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparentLinkTag( $token ): bool {
	$isTagToken = $token instanceof SelfclosingTagTk ||
		$token instanceof TagTk ||
		$token instanceof EndTagTk;
	if ( !$isTagToken || $token->getName() !== 'link' ) {
		return false;
	}

	$rel = $token->getAttributeV( 'rel' ) ?? '';
	return (bool)preg_match( self::SOL_TRANSPARENT_LINK_REGEX, $rel );
}
151 | |
/**
 * Does this token represent a behavior switch?
 *
 * Two self-closing tag shapes are recognised:
 *  - a 'behavior-switch' token, as seen before BehaviorSwitchHandler
 *    has run (ie. PreHandler, etc.);
 *  - a 'meta' token with a `property` attribute matching the site
 *    config's bswPagePropRegexp(), as seen after BehaviorSwitchHandler
 *    (ie. ListHandler, ParagraphWrapper, etc.).
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isBehaviorSwitch( Env $env, $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}

	$name = $token->getName();
	if ( $name === 'behavior-switch' ) {
		return true;
	}
	if ( $name !== 'meta' || !$token->hasAttribute( 'property' ) ) {
		return false;
	}

	$property = $token->getAttributeV( 'property' ) ?? '';
	return (bool)preg_match( $env->getSiteConfig()->bswPagePropRegexp(), $property );
}
171 | |
/**
 * This should come close to matching
 * {@link WTUtils::emitsSolTransparentSingleLineWT},
 * without the single line caveat.
 *
 * Checks are applied in this order:
 *  - strings: transparent when they hold only spaces and tabs;
 *  - transparent link tags;
 *  - comments that are not translation-unit markers;
 *  - behavior switches;
 *  - any remaining self-closing 'meta' whose `stx` is not 'html'.
 * Everything else is not transparent.
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparent( Env $env, $token ): bool {
	if ( is_string( $token ) ) {
		return (bool)preg_match( '/^[ \t]*$/D', $token );
	}
	if ( self::isSolTransparentLinkTag( $token ) ) {
		return true;
	}
	if ( $token instanceof CommentTk && !self::isTranslationUnitMarker( $env, $token ) ) {
		return true;
	}
	if ( self::isBehaviorSwitch( $env, $token ) ) {
		return true;
	}
	if ( !( $token instanceof SelfclosingTagTk ) || $token->getName() !== 'meta' ) {
		return false;
	}

	// Only metas are left at this point
	return ( $token->dataParsoid->stx ?? null ) !== 'html';
}
195 | |
/**
 * Does one of the token's `typeof` values match
 * WTUtils::ANNOTATION_META_TYPE_REGEXP?
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationMetaToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	return $annotationType !== null;
}
203 | |
/**
 * Checks whether the provided meta tag token is an annotation start token:
 * it has an annotation `typeof` value that does not end in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationStartToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $annotationType === null ) {
		return false;
	}
	return !str_ends_with( $annotationType, '/End' );
}
213 | |
/**
 * Checks whether the provided meta tag token is an annotation end token:
 * it has an annotation `typeof` value that ends in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationEndToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $annotationType === null ) {
		return false;
	}
	return str_ends_with( $annotationType, '/End' );
}
223 | |
/**
 * HACK: Returns true if $token looks like a TU marker (<!--T:XXX-->) and
 * if we could be in a translate-annotated page.
 *
 * All three conditions must hold: the environment has annotations,
 * 'translate' is an annotation tag on this site, and the comment value
 * starts with 'T:'.
 *
 * @param Env $env
 * @param CommentTk $token
 * @return bool
 */
public static function isTranslationUnitMarker( Env $env, CommentTk $token ): bool {
	if ( !$env->hasAnnotations ) {
		return false;
	}
	if ( !$env->getSiteConfig()->isAnnotationTag( 'translate' ) ) {
		return false;
	}
	return preg_match( '/^T:/', $token->value ) === 1;
}
236 | |
/**
 * Is the token an empty-line meta marker?
 *
 * True only for a self-closing 'meta' tag token whose `typeof` attribute
 * is exactly 'mw:EmptyLine'. The comparison is a strict equality, so a
 * multi-valued `typeof` does not match.
 *
 * @param Token|string $token
 * @return bool
 */
public static function isEmptyLineMetaToken( $token ): bool {
	return $token instanceof SelfclosingTagTk &&
		$token->getName() === 'meta' &&
		$token->getAttributeV( 'typeof' ) === 'mw:EmptyLine';
}
248 | |
/**
 * Determine whether the token matches the given `typeof` attribute value.
 *
 * The `typeof` attribute is split on whitespace and each entry is tested
 * against the regular expression in turn; the first entry that matches
 * is returned.
 *
 * @param Token $t The token to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match (including when the token has no `typeof` attribute).
 */
public static function matchTypeOf( Token $t, string $typeRe ): ?string {
	$typeOf = $t->getAttributeV( 'typeof' );
	if ( $typeOf === null ) {
		return null;
	}
	Assert::invariant( is_string( $typeOf ), "Typeof is not simple" );

	$candidates = preg_split( '/\s+/', $typeOf, -1, PREG_SPLIT_NO_EMPTY );
	foreach ( $candidates as $candidate ) {
		$matched = preg_match( $typeRe, $candidate );
		// preg_match() yields false only when the pattern itself is broken
		Assert::invariant( $matched !== false, "Bad regexp" );
		if ( $matched === 1 ) {
			return $candidate;
		}
	}
	return null;
}
273 | |
/**
 * Determine whether the token matches the given typeof attribute value.
 *
 * The literal is regex-quoted and anchored, so it must equal one whole
 * whitespace-separated `typeof` entry.
 *
 * @param Token $t
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string.
 * @return bool True if the token matches.
 */
public static function hasTypeOf( Token $t, string $type ): bool {
	$pattern = '/^' . preg_quote( $type, '/' ) . '$/D';
	return self::matchTypeOf( $t, $pattern ) !== null;
}
287 | |
/**
 * Shift (or clear) the source offsets recorded on a list of tokens.
 *
 * - An $offset of exactly 0 is a no-op.
 * - A non-zero integer $offset is passed to the offset() method of every
 *   recorded range (`tsr`, `extTagOffsets`, `tmp->extLinkContentOffsets`
 *   and attribute `srcOffsets`).
 * - `false` (or `null`, accepted for compatibility with the JS code this
 *   was ported from) clears `tsr` and attribute `srcOffsets` instead and
 *   leaves the other ranges alone.
 *
 * Only objects whose class is exactly TagTk, SelfclosingTagTk, NlTk,
 * CommentTk or EndTagTk are touched; strings and everything else are
 * skipped. Attribute keys and values that are arrays are processed
 * recursively.
 *
 * PORT-FIXME: In JS this was sometimes called with an undefined offset,
 * meaning "do nothing" unless a third parameter was true. PHP callers
 * are expected to handle that case themselves before calling.
 *
 * @param array<Token|string> $tokens
 * @param int|false $offset
 */
public static function shiftTokenTSR( array $tokens, $offset ): void {
	// Bail early if we can
	if ( $offset === 0 ) {
		return;
	}

	// JS b/c
	if ( $offset === null ) {
		$offset = false;
	}

	$shiftableClasses = [
		TagTk::class,
		SelfclosingTagTk::class,
		NlTk::class,
		CommentTk::class,
		EndTagTk::class,
	];

	$tokenCount = count( $tokens );
	for ( $idx = 0; $idx < $tokenCount; $idx++ ) {
		$tok = $tokens[$idx];
		if ( !is_object( $tok ) || !in_array( get_class( $tok ), $shiftableClasses, true ) ) {
			continue;
		}

		// update/clear tsr
		$dp = $tok->dataParsoid;
		$oldTsr = $dp->tsr;
		if ( $oldTsr ) {
			$dp->tsr = $offset ? $oldTsr->offset( $offset ) : null;
		}

		// SSS FIXME: offset will always be available in chunky-tokenizer
		// mode, in which case we won't have buggy offsets below. The null
		// scenario is only for when the token-stream-patcher attempts to
		// reparse a string -- it is likely to only patch up small string
		// fragments and the complicated use cases below should not
		// materialize.
		// CSA: token-stream-patcher shouldn't have problems now that
		// $frame->srcText is always accurate?
		if ( $offset ) {
			if ( isset( $dp->extTagOffsets ) ) {
				$dp->extTagOffsets = $dp->extTagOffsets->offset( $offset );
			}
			// content offsets for ext-links
			if ( isset( $dp->tmp->extLinkContentOffsets ) ) {
				$dp->tmp->extLinkContentOffsets =
					$dp->tmp->extLinkContentOffsets->offset( $offset );
			}
		}

		// Process attributes
		if ( $tok->attribs === null ) {
			continue;
		}
		$attrCount = count( $tok->attribs );
		for ( $j = 0; $j < $attrCount; $j++ ) {
			$kv = $tok->attribs[$j];
			if ( is_array( $kv->k ) ) {
				self::shiftTokenTSR( $kv->k, $offset );
			}
			if ( is_array( $kv->v ) ) {
				self::shiftTokenTSR( $kv->v, $offset );
			}

			// src offsets used to set mw:TemplateParams
			if ( !$offset ) {
				$kv->srcOffsets = null;
			} elseif ( $kv->srcOffsets !== null ) {
				$kv->srcOffsets = $kv->srcOffsets->offset( $offset );
			}
		}
	}
}
378 | |
/**
 * Strip EOFTk token from token chunk.
 * The EOFTk is expected to be the last token of the chunk; only the
 * final element is inspected.
 *
 * @param array &$tokens Modified in place
 * @return array return the modified token array so that this call can be chained
 */
public static function stripEOFTkFromTokens( array &$tokens ): array {
	$lastIdx = count( $tokens ) - 1;
	if ( $lastIdx >= 0 && $tokens[$lastIdx] instanceof EOFTk ) {
		array_pop( $tokens );
	}
	return $tokens;
}
393 | |
/**
 * Convert string offsets
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * Offsets that are mid-Unicode character are "rounded" up to the next full
 * character, i.e. the output offset will always point to the start of a
 * Unicode code point (or just past the end of the string). Offsets outside
 * the string are "rounded" to 0 or just-past-the-end.
 *
 * The string is walked once from the start while byte, code point and
 * UTF-16 positions are tracked side by side; the offsets are sorted first
 * so that each can be rewritten as soon as the walk reaches it.
 *
 * @note When constructing the array of offsets to pass to this method,
 * populate it with references as `$offsets[] = &$var;`. Results are
 * written back through those references; nothing is returned.
 *
 * @param string $s Unicode string the offsets are offsets into, UTF-8 encoded.
 * @param ('byte'|'ucs2'|'char') $from Offset type to convert from.
 * @param ('byte'|'ucs2'|'char') $to Offset type to convert to.
 * @param int[] $offsets References to the offsets to convert.
 * @throws \InvalidArgumentException If $from or $to is not a known offset
 *   type, or if a byte that cannot start a UTF-8 sequence is found where
 *   a character should begin.
 */
public static function convertOffsets(
	string $s, string $from, string $to, array $offsets
): void {
	static $valid = [ 'byte', 'char', 'ucs2' ];
	if ( !in_array( $from, $valid, true ) ) {
		throw new \InvalidArgumentException( 'Invalid $from' );
	}
	if ( !in_array( $to, $valid, true ) ) {
		throw new \InvalidArgumentException( 'Invalid $to' );
	}

	$offsetCt = count( $offsets );
	if ( $offsetCt === 0 ) { // Nothing to do
		return;
	}
	sort( $offsets, SORT_NUMERIC );

	// Current position in the string, in each of the three units
	$pos = [ 'byte' => 0, 'ucs2' => 0, 'char' => 0 ];
	$fromPos = &$pos[$from];
	$toPos = &$pos[$to];

	$next = 0; // index of the first offset not converted yet
	$byteLen = strlen( $s );
	while ( $pos['byte'] < $byteLen ) {
		// Update offsets that we've reached
		while ( $offsets[$next] <= $fromPos ) {
			$offsets[$next] = $toPos;
			if ( ++$next >= $offsetCt ) {
				return;
			}
		}

		// Update positions. The top five bits of the lead byte give the
		// length of the UTF-8 sequence.
		$pos['char']++;
		$lead = ord( $s[$pos['byte']] ) & 0xf8;
		if ( $lead < 0x80 ) {
			// 0xxxxxxx: single byte
			$pos['byte']++;
			$pos['ucs2']++;
		} elseif ( $lead >= 0xc0 && $lead <= 0xd8 ) {
			// 110xxxxx: two bytes
			$pos['byte'] += 2;
			$pos['ucs2']++;
		} elseif ( $lead === 0xe0 || $lead === 0xe8 ) {
			// 1110xxxx: three bytes
			$pos['byte'] += 3;
			$pos['ucs2']++;
		} elseif ( $lead === 0xf0 ) {
			// 11110xxx: four bytes, a surrogate pair in UTF-16
			$pos['byte'] += 4;
			$pos['ucs2'] += 2;
		} else {
			throw new \InvalidArgumentException(
				bin2hex( $s ) . " (dumped via php bin2hex) is not valid UTF-8" );
		}
	}

	// Convert any offsets past the end of the string to the length of the
	// string.
	for ( ; $next < $offsetCt; $next++ ) {
		$offsets[$next] = $toPos;
	}
}
506 | |
/**
 * Convert offsets in a token array
 *
 * Runs in three steps: collect references to every offset reachable from
 * the tokens, convert them all with a single convertOffsets() call, then
 * restore DomSourceRange widths. Widths are lengths rather than positions,
 * so they are temporarily rewritten as absolute positions for the
 * conversion and turned back into widths afterwards.
 *
 * @see TokenUtils::convertOffsets()
 *
 * @param string $s The offset reference string
 * @param ('byte'|'ucs2'|'char') $from Offset type to convert from
 * @param ('byte'|'ucs2'|'char') $to Offset type to convert to
 * @param array<Token|string|array> $tokens
 */
public static function convertTokenOffsets(
	string $s, string $from, string $to, array $tokens
): void {
	/** @var int[] $offsets */
	$offsets = [];

	$gather = static function ( $sr ) use ( &$offsets ) {
		if ( $sr instanceof DomSourceRange ) {
			// Adjust the widths to be actual character offsets
			if ( $sr->openWidth !== null ) {
				Assert::invariant( $sr->start !== null, "width w/o start" );
				$sr->openWidth += $sr->start;
				$offsets[] =& $sr->openWidth;
			}
			if ( $sr->closeWidth !== null ) {
				Assert::invariant( $sr->end !== null, "width w/o end" );
				$sr->closeWidth = $sr->end - $sr->closeWidth;
				$offsets[] =& $sr->closeWidth;
			}
		}
		if ( $sr->start !== null ) {
			$offsets[] =& $sr->start;
		}
		if ( $sr->end !== null ) {
			$offsets[] =& $sr->end;
		}
	};

	$restoreWidths = static function ( $sr ) {
		if ( !( $sr instanceof DomSourceRange ) ) {
			return;
		}
		// Adjust widths back from being character offsets
		if ( $sr->openWidth !== null ) {
			$sr->openWidth -= $sr->start;
		}
		if ( $sr->closeWidth !== null ) {
			$sr->closeWidth = $sr->end - $sr->closeWidth;
		}
	};

	self::collectOffsets( $tokens, $gather );
	self::convertOffsets( $s, $from, $to, $offsets );
	self::collectOffsets( $tokens, $restoreWidths );
}
555 | |
/**
 * Recursively walk a token structure and invoke $offsetFunc on every
 * SourceRange (including DomSourceRange) found in it.
 *
 * Arrays are walked element by element. KVs contribute their key, value
 * and srcOffsets. Tokens contribute their dataParsoid `tsr`,
 * `tmp->extLinkContentOffsets`, `tokens` and `extTagOffsets` (when set)
 * followed by their attributes. KVSourceRanges contribute their key and
 * value ranges. Anything else (strings, null, ...) is ignored.
 *
 * @param array<Token|string>|array<KV>|KV|Token|DomSourceRange|KVSourceRange|SourceRange|string $input
 * @param callable $offsetFunc Called with each SourceRange
 */
private static function collectOffsets( $input, callable $offsetFunc ): void {
	if ( is_array( $input ) ) {
		foreach ( $input as $item ) {
			self::collectOffsets( $item, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof KV ) {
		self::collectOffsets( $input->k, $offsetFunc );
		self::collectOffsets( $input->v, $offsetFunc );
		if ( $input->srcOffsets ) {
			self::collectOffsets( $input->srcOffsets, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof Token ) {
		$dp = $input->dataParsoid ?? null;
		if ( isset( $dp->tsr ) ) {
			self::collectOffsets( $dp->tsr, $offsetFunc );
		}
		if ( isset( $dp->tmp->extLinkContentOffsets ) ) {
			self::collectOffsets( $dp->tmp->extLinkContentOffsets, $offsetFunc );
		}
		if ( isset( $dp->tokens ) ) {
			self::collectOffsets( $dp->tokens, $offsetFunc );
		}
		if ( isset( $dp->extTagOffsets ) ) {
			self::collectOffsets( $dp->extTagOffsets, $offsetFunc );
		}
		self::collectOffsets( $input->attribs, $offsetFunc );
		return;
	}

	if ( $input instanceof KVSourceRange ) {
		self::collectOffsets( $input->key, $offsetFunc );
		self::collectOffsets( $input->value, $offsetFunc );
		return;
	}

	if ( $input instanceof SourceRange ) {
		// This includes DomSourceRange
		$offsetFunc( $input );
	}
}
593 | |
/**
 * Tests whether token represents an HTML entity.
 * Think `<span typeof="mw:Entity">`.
 *
 * Only a start tag token named 'span' with an `mw:Entity` typeof
 * qualifies.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isEntitySpanToken( $token ): bool {
	if ( !( $token instanceof TagTk ) || $token->getName() !== 'span' ) {
		return false;
	}
	return self::hasTypeOf( $token, 'mw:Entity' );
}
606 | |
/**
 * Transform `"\n"` and `"\r\n"` in the input string to {@link NlTk} tokens.
 *
 * The string is split on newlines and one NlTk is placed between each
 * pair of adjacent pieces, so N newlines give N + 1 string pieces (some
 * possibly empty) and N NlTk tokens.
 *
 * @param string $str
 * @return array (interspersed string and NlTk tokens)
 */
public static function newlinesToNlTks( string $str ): array {
	$pieces = preg_split( '/\n|\r\n/', $str );
	// The final piece has no newline after it
	$lastPiece = array_pop( $pieces );

	$result = [];
	foreach ( $pieces as $piece ) {
		$result[] = $piece;
		$result[] = new NlTk( null );
	}
	$result[] = $lastPiece;
	return $result;
}
623 | |
/**
 * Flatten/convert a token array into a string.
 *
 * Strings are concatenated, nested arrays are flattened recursively, and
 * comments are always dropped. Recognised $opts keys:
 *  - 'retainNLs': when empty (the default), NlTk tokens are dropped.
 *  - 'stripEmptyLineMeta': also drop empty-line meta tokens.
 *  - 'includeEntities': for an entity span, append its `dataParsoid->src`
 *    and skip the two tokens that follow it (child and end tag).
 *  - 'unpackDOMFragments': for a DOM fragment token, append the text
 *    content of the fragment fetched from $opts['env']. This option
 *    shouldn't be used if the tokens have been expanded to DOM.
 * In non-strict mode any other token is silently skipped.
 *
 * @param string|Token|array<Token|string> $tokens
 * @param bool $strict Whether to abort as soon as we find a token we
 *   can't stringify.
 * @param array<string,bool|Env> $opts
 * @return string|array{0:string,1:Array<Token|string>}
 *   The stringified tokens. If $strict is true, returns a two-element
 *   array containing string prefix and the remainder of the tokens as
 *   soon as we encounter something we can't stringify.
 * @throws UnreachableException If a null or a KV is found among the tokens
 */
public static function tokensToString( $tokens, bool $strict = false, array $opts = [] ) {
	if ( is_string( $tokens ) ) {
		return $tokens;
	}
	if ( !is_array( $tokens ) ) {
		$tokens = [ $tokens ];
	}

	$retainNLs = !empty( $opts['retainNLs'] );
	$stripEmptyLineMeta = !empty( $opts['stripEmptyLineMeta'] );
	$includeEntities = !empty( $opts['includeEntities'] );
	$unpackDOMFragments = !empty( $opts['unpackDOMFragments'] );

	$text = '';
	$tokenCount = count( $tokens );
	for ( $idx = 0; $idx < $tokenCount; $idx++ ) {
		$tok = $tokens[$idx];

		if ( $tok === null ) {
			throw new UnreachableException( "No nulls expected." );
		}
		if ( $tok instanceof KV ) {
			// Since this function is occasionally called on KV->v,
			// whose signature recursively includes KV[], a mismatch with
			// this function, we assert that those values are only
			// included in safe places that don't intend to stringify
			// their tokens.
			throw new UnreachableException( "No KVs expected." );
		}

		if ( is_string( $tok ) ) {
			$text .= $tok;
			continue;
		}
		if ( is_array( $tok ) ) {
			Assert::invariant( !$strict, "strict case handled above" );
			$text .= self::tokensToString( $tok, $strict, $opts );
			continue;
		}

		// strip comments and newlines
		if ( $tok instanceof CommentTk || ( !$retainNLs && $tok instanceof NlTk ) ) {
			continue;
		}
		// If requested, strip empty line meta tokens too.
		if ( $stripEmptyLineMeta && self::isEmptyLineMetaToken( $tok ) ) {
			continue;
		}
		if ( $includeEntities && self::isEntitySpanToken( $tok ) ) {
			$text .= $tok->dataParsoid->src;
			$idx += 2; // Skip child and end tag.
			continue;
		}

		if ( $strict ) {
			// If strict, return accumulated string on encountering first non-text token
			return [ $text, array_slice( $tokens, $idx ) ];
		}

		$isFragmentToken = $unpackDOMFragments &&
			( $tok instanceof TagTk || $tok instanceof SelfclosingTagTk ) &&
			self::hasDOMFragmentType( $tok );
		if ( !$isFragmentToken ) {
			continue;
		}

		// Handle dom fragments
		$domFragment = $opts['env']->getDOMFragment(
			$tok->dataParsoid->html
		);
		// Calling `env->removeDOMFragment()` here is case dependent
		// but should be rare enough when permissible that it can be
		// ignored.
		// FIXME: The correct thing to do would be to return
		// `$domFragment.innerHTML` for the current scenarios where
		// `unpackDOMFragments` is used (expanded attribute
		// values and reparses thereof) but we'd need to remove
		// the span wrapping and typeof annotation of extension
		// content and nowikis. Since we're primarily expecting
		// to find <translate> and <nowiki> here, this will do.
		$text .= $domFragment->textContent;
		if ( $tok instanceof TagTk ) {
			$idx += 1; // Skip the EndTagTK
			Assert::invariant(
				$idx >= $tokenCount || $tokens[$idx] instanceof EndTagTk,
				"tag should be followed by endtag"
			);
		}
	}
	return $text;
}
710 | |
/**
 * Convert an array of key-value pairs into a hash of keys to values.
 * For duplicate keys, the last entry wins.
 *
 * Keys are stringified, trimmed and lower-cased; values are stringified
 * and trimmed.
 *
 * @param array<KV> $kvs
 * @return array<string,array<Token|string>>|array<string,string>
 */
public static function kvToHash( array $kvs ): array {
	$hash = [];
	foreach ( $kvs as $kv ) {
		$name = mb_strtolower( trim( self::tokensToString( $kv->k ) ) );
		// SSS FIXME: Temporary fix to handle extensions which use
		// entities in attribute values. We need more robust handling
		// of non-string template attribute values in general.
		$hash[$name] = self::tokenTrim( self::tokensToString( $kv->v ) );
	}
	return $hash;
}
729 | |
/**
 * Trim space and newlines from leading and trailing text tokens.
 *
 * A plain string is simply trim()med and any other non-array value is
 * returned untouched. For an array, the scan runs inwards from each end:
 * NlTk tokens are replaced by '' and strings lose their whitespace on
 * that side. The scan from an end stops at the first string that is
 * still non-empty after stripping, or at the first element that is
 * neither a string nor an NlTk. The array keeps its length.
 *
 * @param string|Token|(Token|string)[] $tokens
 * @return string|Token|(Token|string)[]
 */
public static function tokenTrim( $tokens ) {
	if ( is_string( $tokens ) ) {
		return trim( $tokens );
	}
	if ( !is_array( $tokens ) ) {
		return $tokens;
	}

	// strip leading space
	foreach ( $tokens as $key => $tok ) {
		if ( $tok instanceof NlTk ) {
			$tokens[$key] = '';
			continue;
		}
		if ( !is_string( $tok ) ) {
			break;
		}
		$stripped = preg_replace( '/^\s+/', '', $tok, 1 );
		$tokens[$key] = $stripped;
		if ( $stripped !== '' ) {
			break;
		}
	}

	// strip trailing space
	for ( $i = count( $tokens ) - 1; $i >= 0; $i-- ) {
		$slot = &$tokens[$i];
		if ( $slot instanceof NlTk ) {
			$slot = ''; // replace newline with empty
			continue;
		}
		if ( !is_string( $slot ) ) {
			break;
		}
		$slot = preg_replace( '/\s+$/D', '', $slot, 1 );
		if ( $slot !== '' ) {
			break;
		}
	}
	unset( $slot );

	return $tokens;
}
776 | |
777 | } |