Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 202 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
TokenizerUtils | |
0.00% |
0 / 202 |
|
0.00% |
0 / 13 |
13340 | |
0.00% |
0 / 1 |
internalFlatten | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
flattenIfArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
flattenString | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
flattenStringlist | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
42 | |||
getAttrVal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
buildTableTokens | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
380 | |||
buildXMLTag | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
inlineBreaks | |
0.00% |
0 / 74 |
|
0.00% |
0 / 1 |
2256 | |||
popComments | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
156 | |||
getAutoUrlTerminatingChars | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
enforceParserResourceLimits | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
90 | |||
protectAttrs | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
resetAnnotationIncludeRegex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * Utilities used in the tokenizer. |
4 | * @module wt2html/tokenizer_utils |
5 | */ |
6 | |
7 | declare( strict_types = 1 ); |
8 | |
9 | namespace Wikimedia\Parsoid\Wt2Html; |
10 | |
11 | use Wikimedia\Parsoid\Config\Env; |
12 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
13 | use Wikimedia\Parsoid\NodeData\TempData; |
14 | use Wikimedia\Parsoid\Tokens\CommentTk; |
15 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
16 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
17 | use Wikimedia\Parsoid\Tokens\SourceRange; |
18 | use Wikimedia\Parsoid\Tokens\TagTk; |
19 | use Wikimedia\Parsoid\Tokens\Token; |
20 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Wikitext\Consts; |
23 | |
24 | class TokenizerUtils { |
25 | private static ?string $protectAttrsRegExp = null; |
26 | private static ?string $inclAnnRegExp = null; |
27 | |
/**
 * Recursive worker behind flattenIfArray().
 *
 * A non-array value is handed back untouched. An array is walked by
 * integer index (0 .. count-1). Nothing is copied while only non-array
 * elements are seen; when the first nested array shows up, the elements
 * visited so far are copied into $res, and from then on every non-array
 * element is appended to $res while nested arrays are flattened into it.
 *
 * @param mixed $e value to flatten
 * @param ?array &$res shared accumulator; stays null until a nested array is found
 * @return mixed $e itself when the accumulator is still null, otherwise the accumulator
 * @throws \RuntimeException if a null element is encountered
 */
private static function internalFlatten( $e, ?array &$res ) {
	if ( !is_array( $e ) ) {
		// Nothing to flatten
		return $e;
	}

	// $e is never modified below, so its length can be read once
	$n = count( $e );
	for ( $i = 0; $i < $n; $i++ ) {
		$v = $e[$i];

		if ( $v === null ) {
			throw new \RuntimeException( __METHOD__ . ": found falsy element $v @ posn $i" );
		}

		if ( is_array( $v ) ) {
			// First nested array: switch from "shallow, return as-is" to
			// "accumulate", seeding the accumulator with what was already seen.
			$res ??= array_slice( $e, 0, $i );
			self::internalFlatten( $v, $res );
			continue;
		}

		if ( $res !== null ) {
			$res[] = $v;
		}
	}

	return $res ?? $e;
}
62 | |
/**
 * Recursively flatten nested arrays; any non-array value is returned as-is.
 *
 * @param mixed $a
 * @return mixed a flat array when $a is an array, otherwise $a unchanged
 */
public static function flattenIfArray( $a ) {
	// The accumulator starts out null and is only filled in by
	// internalFlatten() if a nested array is actually encountered.
	$flat = null;
	return self::internalFlatten( $a, $flat );
}
71 | |
/**
 * Flatten $c via flattenStringlist() and, when the result consists of
 * exactly one string, unwrap that string.
 *
 * @param mixed $c forwarded to flattenStringlist(), which only accepts arrays
 * @return mixed the lone string, or the flattened array in every other case
 */
public static function flattenString( $c ) {
	$flat = self::flattenStringlist( $c );
	$isLoneString = count( $flat ) === 1 && is_string( $flat[0] );
	return $isLoneString ? $flat[0] : $flat;
}
85 | |
/**
 * Flatten $c and merge every run of adjacent strings into one string.
 * Non-string elements are kept, in order, between the merged strings;
 * empty strings contribute nothing.
 *
 * @param array $c
 * @return array
 */
public static function flattenStringlist( array $c ): array {
	$items = self::flattenIfArray( $c );
	$merged = [];
	$pending = '';

	$total = count( $items );
	$idx = 0;
	while ( $idx < $total ) {
		$item = $items[$idx++];

		if ( is_string( $item ) ) {
			// Appending '' is a no-op, so empty strings need no special case
			$pending .= $item;
			continue;
		}

		// A non-string ends the current run of text
		if ( $pending !== '' ) {
			$merged[] = $pending;
			$pending = '';
		}
		$merged[] = $item;
	}

	if ( $pending !== '' ) {
		$merged[] = $pending;
	}
	return $merged;
}
114 | |
/**
 * Pair an attribute value with the source range it was read from.
 *
 * @param mixed $value
 * @param int $start start of TSR range
 * @param int $end end of TSR range
 * @return array with keys 'value' and 'srcOffsets' (a SourceRange)
 */
public static function getAttrVal( $value, int $start, int $end ): array {
	$srcOffsets = new SourceRange( $start, $end );
	return [ 'value' => $value, 'srcOffsets' => $srcOffsets ];
}
124 | |
/**
 * Build a token array representing <tag>$content</tag> along with
 * appropriate attributes and TSR info set on the tokens.
 *
 * @param string $pegSource
 * @param string $tagName
 * @param string $wtChar
 * @param mixed $attrInfo falsy when no attribute box was seen; otherwise
 *   indexed as [0] attributes, [1] and [2] source strings (per the code
 *   below, [2] is the attribute separator, "|" by default)
 * @param SourceRange $tsr
 * @param int $endPos
 * @param mixed $content
 * @param bool $addEndTag
 * @return array (of tokens)
 */
public static function buildTableTokens(
	string $pegSource, string $tagName, string $wtChar, $attrInfo,
	SourceRange $tsr, int $endPos, $content, bool $addEndTag = false
): array {
	$isCaption = ( $tagName === 'caption' );

	$startDp = new DataParsoid;
	$startDp->tsr = $tsr;

	if ( $tagName === 'td' ) {
		if ( !$attrInfo ) {
			// Flag that the tokenizer didn't encounter a "|...|" attribute
			// box. This is useful when deciding which <td>/<th> cells need
			// attribute fixups.
			$startDp->setTempFlag( TempData::NO_ATTRS );
		} elseif ( !$attrInfo[0] && $attrInfo[1] === "" ) {
			// FIXME: Skip comments between the two "|" chars
			// [ [], "", "|"] => "||" syntax for first <td> on line
			$startDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
			$startDp->setTempFlag( TempData::NO_ATTRS );
		}
	} elseif ( $tagName === 'th' && !$attrInfo ) {
		// Same "no attribute box" marker as for <td> above.
		$startDp->setTempFlag( TempData::NO_ATTRS );

		// FIXME: Skip comments between the two "!" chars
		// "!!foo" in sol context parses as <th>!foo</th>
		$leading = $content[0][0] ?? null;
		if ( is_string( $leading ) && str_starts_with( $leading, "!" ) ) {
			$startDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
		}
	}

	$attribs = [];
	if ( $attrInfo ) {
		$sep = $attrInfo[2];
		if ( !$isCaption ) {
			// Everything in the tsr range except the trailing separator
			$attrSrcLen = $tsr->end - $tsr->start - strlen( $sep );
			$startDp->getTemp()->attrSrc = substr( $pegSource, $tsr->start, $attrSrcLen );
		}
		$attribs = $attrInfo[0];
		if ( !$attribs ) {
			$startDp->startTagSrc = $wtChar . $attrInfo[1];
		}
		// Variation from default:
		// 1. Separator present with an empty attribute block
		// 2. Not "|"
		// ("|" is itself truthy, so this is the original two-part test
		// written in its reduced form.)
		if ( $sep !== '|' || !$attribs ) {
			$startDp->attrSepSrc = $sep;
		}
	} elseif ( !$isCaption ) {
		$startDp->getTemp()->attrSrc = '';
	}

	// We consider 1 the start because the table_data_tag and table_heading_tag
	// rules don't include the pipe so it isn't accounted for in the tsr passed
	// to this function. The rules making use of those rules do some extra
	// bookkeeping to adjust for that on the start token returned from this
	// function. Of course, table_caption_tag doesn't follow that same pattern
	// but that isn't a concern here.
	if ( !$isCaption && $tsr->start === 1 ) {
		$startDp->setTempFlag( TempData::AT_SRC_START );
	}

	$tokens = [ new TagTk( $tagName, $attribs, $startDp ) ];
	PHPUtils::pushArray( $tokens, $content );

	// When no end tag is requested, we rely on our tree builder to close the
	// table cell (td/th) as needed. We cannot close the cell here because
	// cell content can come from multiple parsing contexts and we cannot
	// close the tag in the same parsing context in which the td was opened:
	// Ex: {{1x|{{!}}foo}}{{1x|bar}} has to output <td>foobar</td>
	//
	// Previously a meta marker was added here for DSR computation, but
	// that's complicated now that marker meta handling has been removed
	// from ComputeDSR.
	if ( $addEndTag ) {
		$endDp = new DataParsoid;
		$endDp->tsr = new SourceRange( $endPos, $endPos );
		$tokens[] = new EndTagTk( $tagName, [], $endDp );
	}

	return $tokens;
}
230 | |
/**
 * Build a token representing <tag>, <tag />, or </tag>
 * with appropriate attributes set on the token.
 *
 * @param string $name tag name as written in the source
 * @param string $lcName name used for the token; when it differs from
 *   $name, $name is recorded as srcTagName
 * @param array $attribs
 * @param mixed $endTag any non-null value yields an end tag
 * @param bool $selfClose only consulted when $endTag is null
 * @param SourceRange $tsr
 * @return Token
 */
public static function buildXMLTag( string $name, string $lcName, array $attribs, $endTag,
	bool $selfClose, SourceRange $tsr
): Token {
	$dp = new DataParsoid;
	$dp->tsr = $tsr;
	$dp->stx = 'html';
	if ( $name !== $lcName ) {
		$dp->srcTagName = $name;
	}

	// An end tag wins over the self-closing flag
	if ( $endTag !== null ) {
		return new EndTagTk( $lcName, $attribs, $dp );
	}

	if ( !$selfClose ) {
		return new TagTk( $lcName, $attribs, $dp );
	}

	$dp->selfClose = true;
	return new SelfclosingTagTk( $lcName, $attribs, $dp );
}
266 | |
/**
 * Inline breaks, flag-enabled rule which detects end positions for
 * active higher-level rules in inline and other nested rules.
 * Those inner rules are then exited, so that the outer rule can
 * handle the end marker.
 *
 * The character at $pos selects the check; only the characters listed
 * in the switch below are supported.
 *
 * @param string $input
 * @param int $pos offset into $input of the character to test
 * @param array $stops currently active stop flags, keyed by name
 * @param Env $env used to look up the annotation tags of the site config
 * @return bool whether an inline break was detected at $pos
 * @throws \RuntimeException if the character at $pos has no case here
 */
public static function inlineBreaks( string $input, int $pos, array $stops, Env $env ): bool {
	$c = $input[$pos];
	// '' when $pos is the last offset of $input
	$c2 = $input[$pos + 1] ?? '';

	switch ( $c ) {
		case '=':
			if ( $stops['arrow'] && $c2 === '>' ) {
				return true;
			}
			if ( $stops['equal'] ) {
				return true;
			}
			if ( $stops['h'] ) {
				if ( self::$inclAnnRegExp === null ) {
					$tags = array_merge(
						[ 'noinclude', 'includeonly', 'onlyinclude' ],
						$env->getSiteConfig()->getAnnotationTags()
					);
					// Tag names come from configuration: quote them so that
					// a regex metacharacter (or the "/" delimiter) in a name
					// cannot change or break the pattern.
					$quoted = array_map(
						static function ( $tag ): string {
							return preg_quote( (string)$tag, '/' );
						},
						$tags
					);
					self::$inclAnnRegExp = '|<\/?(?:' . implode( '|', $quoted ) . ')>';
				}
				return ( $pos === strlen( $input ) - 1
					// possibly more equals followed by spaces or comments
					|| preg_match( '/^=*(?:[ \t]|<\!--(?:(?!-->).)*-->'
						. self::$inclAnnRegExp . ')*(?:[\r\n]|$)/sD',
						substr( $input, $pos + 1 ) ) );
			}
			return false;

		case '|':
			return !$stops['annOrExtTag'] && (
				$stops['templateArg']
				|| $stops['tableCellArg']
				|| $stops['linkdesc']
				// $c2 is '' at end of input, so no separate bounds check
				// is needed before comparing it.
				|| ( $stops['table']
					&& ( $c2 === '}' || $c2 === '|' ) )
			);

		case '!':
			return $stops['th']
				&& !$stops['intemplate']
				&& $c2 === '!';

		case '{':
			// {{!}} pipe templates..
			// FIXME: Presumably these should mix with and match | above.
			// phpcs:ignore Squiz.WhiteSpace.LanguageConstructSpacing.IncorrectSingle
			return ( $stops['tableCellArg']
					&& substr( $input, $pos, 5 ) === '{{!}}' )
				|| ( $stops['table']
					&& substr( $input, $pos, 10 ) === '{{!}}{{!}}' );

		case '}':
			$preproc = $stops['preproc'];
			return ( $c2 === '}' && $preproc === '}}' )
				|| ( $c2 === '-' && $preproc === '}-' );

		case ':':
			return $stops['colon']
				&& !$stops['extlink']
				&& !$stops['intemplate']
				&& !$stops['linkdesc']
				&& !( $stops['preproc'] === '}-' );

		case ';':
			return $stops['semicolon'];

		case "\r":
			return $stops['table']
				&& preg_match( '/\r\n?\s*[!|]/', substr( $input, $pos ) );

		case "\n":
			// The code below is just a manual / efficient
			// version of this check.
			//
			// stops.table && /^\n\s*[!|]/.test(input.substr(pos));
			//
			// It eliminates a substr on the string and eliminates
			// a potential perf problem since "\n" and the inline_breaks
			// test is common during tokenization.
			if ( !$stops['table'] ) {
				return false;
			}

			// Allow leading whitespace in tables

			// Since we switched on 'c' which is input[pos],
			// we know that input[pos] is "\n".
			// So, the /^\n/ part of the regexp is already satisfied.
			// Look for /\s*[!|]/ below.
			$n = strlen( $input );
			for ( $i = $pos + 1; $i < $n; $i++ ) {
				$d = $input[$i];
				if ( $d === '!' || $d === '|' ) {
					return true;
				} elseif ( !( preg_match( '/\s/', $d ) ) ) {
					return false;
				}
			}
			return false;

		case '[':
			// This is a special case in php's doTableStuff, added in
			// response to T2553. If it encounters a `[[`, it bails on
			// parsing attributes and interprets it all as content.
			return $stops['tableCellArg'] && $c2 === '[';

		case '-':
			// Same as above: a special case in doTableStuff, added
			// as part of T153140
			return $stops['tableCellArg'] && $c2 === '{';

		case ']':
			if ( $stops['extlink'] ) {
				return true;
			}
			return $stops['preproc'] === ']]'
				&& $c2 === ']';

		default:
			throw new \RuntimeException( 'Unhandled case!' );
	}
}
401 | |
/**
 * Pop off the end comments, if any.
 *
 * Scans $attrs from the end while entries have a falsy value and a key
 * that is either a whitespace-only string or an array made up solely of
 * CommentTk tokens. Every entry from the first comment-bearing one to
 * the end is removed from $attrs, and their keys (comment tokens and
 * whitespace strings, in source order) are returned. Whitespace-only
 * entries that precede the first comment are left in $attrs.
 *
 * The number of entries removed is derived from the position of the
 * first comment-bearing entry rather than from the number of collected
 * tokens, so a key holding several comments (or none) cannot cause too
 * many or too few entries to be spliced off.
 *
 * @param array &$attrs list of objects exposing ->k and ->v; trimmed in place
 * @return array|null null when no trailing comment was found; otherwise
 *   [ 'buf' => collected keys, 'commentStartPos' => tsr start of the first comment ]
 */
public static function popComments( array &$attrs ): ?array {
	// Index of the earliest trailing attr that actually holds a comment
	$firstComment = null;

	for ( $i = count( $attrs ) - 1; $i > -1; $i-- ) {
		$kv = $attrs[$i];
		if ( $kv->v ) {
			// Only value-less attrs can be comments or whitespace
			break;
		}
		if ( is_string( $kv->k ) ) {
			// permit whitespace
			if ( !preg_match( '/^\s*$/D', $kv->k ) ) {
				break;
			}
		} elseif ( is_array( $kv->k ) ) {
			// all should be comments
			foreach ( $kv->k as $k ) {
				if ( !( $k instanceof CommentTk ) ) {
					break 2;
				}
			}
			if ( $kv->k ) {
				$firstComment = $i;
			}
		} else {
			break;
		}
	}

	// ensure we found a comment
	if ( $firstComment === null ) {
		return null;
	}

	// Remove exactly the attrs from the first comment onwards and
	// collect their keys in order.
	$removed = array_splice( $attrs, $firstComment );
	$buf = [];
	foreach ( $removed as $kv ) {
		if ( is_array( $kv->k ) ) {
			foreach ( $kv->k as $tk ) {
				$buf[] = $tk;
			}
		} else {
			$buf[] = $kv->k;
		}
	}

	return [ 'buf' => $buf, 'commentStartPos' => $buf[0]->dataParsoid->tsr->start ];
}
437 | |
/**
 * Get a string containing all the autourl terminating characters (as in
 * legacy parser Parser.php::makeFreeExternalLink). The list is slightly
 * context-dependent: the right parenthesis is only included when the URL
 * does not contain a left parenthesis.
 *
 * @param bool $hasLeftParen should be true if the URL in question contains
 *   a left parenthesis.
 * @return string Consts::$strippedUrlCharacters, with ')' appended when
 *   $hasLeftParen is false
 */
public static function getAutoUrlTerminatingChars( bool $hasLeftParen ): string {
	return $hasLeftParen
		? Consts::$strippedUrlCharacters
		: Consts::$strippedUrlCharacters . ')';
}
453 | |
/**
 * Count a start or self-closing tag token against the wt2html resource
 * it represents (list items, transclusions, table cells) and log a
 * warning when that count crosses its limit. Other tokens are ignored.
 *
 * @param Env $env
 * @param mixed $token
 */
public static function enforceParserResourceLimits( Env $env, $token ) {
	$isOpeningTag = $token instanceof TagTk || $token instanceof SelfclosingTagTk;
	if ( !$isOpeningTag ) {
		return;
	}

	// Token name => resource it is counted against
	$resourceByName = [
		'listItem' => 'listItem',
		'template' => 'transclusion',
		'td' => 'tableCell',
		'th' => 'tableCell',
	];
	$resource = $resourceByName[$token->getName()] ?? null;
	if ( $resource === null ) {
		return;
	}

	// `false` indicates that this bump pushed us over the threshold
	// We don't want to log every token above that, which would be `null`
	if ( $env->bumpWt2HtmlResourceUse( $resource ) === false ) {
		$env->log( 'warn', "wt2html: $resource limit exceeded" );
	}
}
483 | |
/**
 * Protect Parsoid-inserted attributes by escaping them to prevent
 * Parsoid-HTML spoofing in wikitext.
 *
 * An attribute name matching the reserved set (case-insensitively, as a
 * whole) is returned with a "data-x-" prefix; any other name is returned
 * unchanged. The pattern is built once and cached in a static property.
 *
 * @param string $name
 * @return string
 */
public static function protectAttrs( string $name ): string {
	self::$protectAttrsRegExp ??= "/^(about|data-mw.*|data-parsoid.*|data-x.*|" .
		DOMDataUtils::DATA_OBJECT_ATTR_NAME .
		'|property|rel|typeof)$/i';

	return preg_replace( self::$protectAttrsRegExp, 'data-x-$1', $name );
}
499 | |
/**
 * Drop the cached include/annotation tag regex ($inclAnnRegExp) so that
 * inlineBreaks() rebuilds it on next use. Exists to avoid side effects
 * between test environments.
 */
public static function resetAnnotationIncludeRegex(): void {
	self::$inclAnnRegExp = null;
}
506 | |
507 | } |