Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 194 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
TokenizerUtils | |
0.00% |
0 / 194 |
|
0.00% |
0 / 13 |
12432 | |
0.00% |
0 / 1 |
internalFlatten | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
flattenIfArray | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
flattenString | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
flattenStringlist | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
42 | |||
getAttrVal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
buildTableTokens | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
240 | |||
buildXMLTag | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
inlineBreaks | |
0.00% |
0 / 74 |
|
0.00% |
0 / 1 |
2256 | |||
popComments | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
156 | |||
getAutoUrlTerminatingChars | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
enforceParserResourceLimits | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
90 | |||
protectAttrs | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
resetAnnotationIncludeRegex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * Utilities used in the tokenizer. |
4 | * @module wt2html/tokenizer_utils |
5 | */ |
6 | |
7 | declare( strict_types = 1 ); |
8 | |
9 | namespace Wikimedia\Parsoid\Wt2Html; |
10 | |
11 | use Wikimedia\Parsoid\Config\Env; |
12 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
13 | use Wikimedia\Parsoid\NodeData\TempData; |
14 | use Wikimedia\Parsoid\Tokens\CommentTk; |
15 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
16 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
17 | use Wikimedia\Parsoid\Tokens\SourceRange; |
18 | use Wikimedia\Parsoid\Tokens\TagTk; |
19 | use Wikimedia\Parsoid\Tokens\Token; |
20 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Wikitext\Consts; |
23 | |
/**
 * Static helper routines used by the wikitext tokenizer: flattening of
 * nested result arrays, construction of table / HTML tag tokens, detection
 * of inline break positions, and a few attribute-related utilities.
 */
class TokenizerUtils {
	/** @var ?string Pattern used by protectAttrs(); built on first use */
	private static ?string $protectAttrsRegExp = null;

	/**
	 * @var ?string Regexp fragment (starting with "|") matching the include
	 *   and annotation tags; built on first use by inlineBreaks() and
	 *   cleared by resetAnnotationIncludeRegex()
	 */
	private static ?string $inclAnnRegExp = null;

	/**
	 * Recursive worker for flattenIfArray().
	 *
	 * $res stays null for as long as no nested array has been seen, in which
	 * case $e is returned untouched. On the first nested array, $res is
	 * seeded with the elements that precede it and from then on every
	 * non-array element is appended to $res.
	 *
	 * @param mixed $e Value to flatten; anything but an array is returned as is
	 * @param ?array &$res Accumulator shared across the recursion
	 * @return mixed (same type as $e)
	 * @throws \RuntimeException if a null (or missing) element is found
	 */
	private static function internalFlatten( $e, ?array &$res ) {
		// Don't bother flattening if we don't have an array
		if ( !is_array( $e ) ) {
			return $e;
		}

		// $e is never modified below, so its size can be computed once.
		$n = count( $e );
		for ( $i = 0; $i < $n; $i++ ) {
			// A missing index is treated like an explicit null: both throw below.
			$v = $e[$i] ?? null;
			if ( is_array( $v ) ) {
				// Change in assumption from a shallow array to a nested array.
				if ( $res === null ) {
					$res = array_slice( $e, 0, $i );
				}
				self::internalFlatten( $v, $res );
			} elseif ( $v !== null ) {
				if ( $res !== null ) {
					$res[] = $v;
				}
			} else {
				throw new \RuntimeException( __METHOD__ . ": found null element @ posn $i" );
			}
		}

		return $res ?? $e;
	}

	/**
	 * If $a is an array, this recursively flattens all nested arrays.
	 * Any other value is returned unchanged.
	 *
	 * @param mixed $a
	 * @return mixed
	 * @throws \RuntimeException if $a (or a nested array) contains null
	 */
	public static function flattenIfArray( $a ) {
		$res = null;
		return self::internalFlatten( $a, $res );
	}

	/**
	 * Flatten $c with flattenStringlist() and, when the result collapses to
	 * exactly one string, return that string instead of a one-element array.
	 *
	 * @param mixed $c Forwarded to flattenStringlist(), which requires an array
	 * @return mixed A string, or the array produced by flattenStringlist()
	 */
	public static function flattenString( $c ) {
		$out = self::flattenStringlist( $c );
		return ( count( $out ) === 1 && is_string( $out[0] ) ) ? $out[0] : $out;
	}

	/**
	 * Flatten nested arrays in $c and concatenate every run of adjacent
	 * strings into a single string. Non-string elements are kept, in order,
	 * between the merged strings; empty strings disappear.
	 *
	 * @param array $c
	 * @return array
	 */
	public static function flattenStringlist( array $c ): array {
		$out = [];
		$text = '';
		foreach ( self::flattenIfArray( $c ) as $ci ) {
			if ( is_string( $ci ) ) {
				// Appending '' is a no-op, so empty strings need no special case.
				$text .= $ci;
				continue;
			}
			if ( $text !== '' ) {
				$out[] = $text;
				$text = '';
			}
			$out[] = $ci;
		}
		if ( $text !== '' ) {
			$out[] = $text;
		}
		return $out;
	}

	/**
	 * Pair an attribute value with the source range it came from.
	 *
	 * @param mixed $value
	 * @param int $start start of TSR range
	 * @param int $end end of TSR range
	 * @return array with keys 'value' and 'srcOffsets' (a SourceRange)
	 */
	public static function getAttrVal( $value, int $start, int $end ): array {
		return [ 'value' => $value, 'srcOffsets' => new SourceRange( $start, $end ) ];
	}

	/**
	 * Build a token array representing <tag>$content</tag> along with
	 * appropriate attributes and TSR info set on the tokens.
	 *
	 * @param string $tagName
	 * @param string $wtChar Prepended to $attrInfo[1] to form startTagSrc
	 *   when the attribute list is empty
	 * @param mixed $attrInfo Falsy when there was no attribute box; otherwise
	 *   indexed as [0] attributes, [1] attribute source, [2] separator
	 * @param SourceRange $tsr Source range recorded on the start tag
	 * @param int $endPos Offset used for the zero-width end tag range
	 * @param mixed $content Pushed after the start tag via PHPUtils::pushArray()
	 * @param bool $addEndTag Whether to append an EndTagTk
	 * @return array (of tokens)
	 */
	public static function buildTableTokens(
		string $tagName, string $wtChar, $attrInfo, SourceRange $tsr,
		int $endPos, $content, bool $addEndTag = false
	): array {
		$dp = new DataParsoid;
		$dp->tsr = $tsr;

		if ( !$attrInfo ) {
			if ( $tagName === 'td' || $tagName === 'th' ) {
				// Add a flag that indicates that the tokenizer didn't
				// encounter a "|...|" attribute box. This is useful when
				// deciding which <td>/<th> cells need attribute fixups.
				$dp->setTempFlag( TempData::NO_ATTRS );
			}
			// FIXME: Skip comments between the two "!" chars
			// "!!foo" in sol context parses as <th>!foo</th>
			if (
				$tagName === 'th' &&
				is_string( $content[0][0] ?? null ) &&
				str_starts_with( $content[0][0], "!" )
			) {
				$dp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
			}
		} elseif ( $tagName === 'td' && !$attrInfo[0] && $attrInfo[1] === "" ) {
			// FIXME: Skip comments between the two "|" chars
			// [ [], "", "|"] => "||" syntax for first <td> on line
			$dp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
			$dp->setTempFlag( TempData::NO_ATTRS );
		}

		$a = [];
		if ( $attrInfo ) {
			$a = $attrInfo[0];
			if ( !$a ) {
				$dp->startTagSrc = $wtChar . $attrInfo[1];
			}
			if ( ( !$a && $attrInfo[2] ) || $attrInfo[2] !== '|' ) {
				// Variation from default
				// 1. Separator present with an empty attribute block
				// 2. Not "|"
				$dp->attrSepSrc = $attrInfo[2];
			}
		}

		$tokens = [ new TagTk( $tagName, $a, $dp ) ];
		PHPUtils::pushArray( $tokens, $content );

		// When no end tag is requested, we rely on our tree builder to close
		// the table cell (td/th) as needed. We cannot close the cell here
		// because cell content can come from multiple parsing contexts and we
		// cannot close the tag in the same parsing context in which the td
		// was opened:
		// Ex: {{1x|{{!}}foo}}{{1x|bar}} has to output <td>foobar</td>
		//
		// Previously a meta marker was added here for DSR computation, but
		// that's complicated now that marker meta handling has been removed
		// from ComputeDSR.
		if ( $addEndTag ) {
			$endDp = new DataParsoid;
			$endDp->tsr = new SourceRange( $endPos, $endPos );
			$tokens[] = new EndTagTk( $tagName, [], $endDp );
		}

		return $tokens;
	}

	/**
	 * Build a token representing <tag>, <tag />, or </tag>
	 * with appropriate attributes set on the token.
	 *
	 * @param string $name Tag name as written; stored as srcTagName when it
	 *   differs from $lcName
	 * @param string $lcName Name given to the created token
	 * @param array $attribs
	 * @param mixed $endTag Any non-null value produces an EndTagTk
	 * @param bool $selfClose Produces a SelfclosingTagTk (only consulted when
	 *   $endTag is null)
	 * @param SourceRange $tsr
	 * @return Token
	 */
	public static function buildXMLTag( string $name, string $lcName, array $attribs, $endTag,
		bool $selfClose, SourceRange $tsr
	): Token {
		$da = new DataParsoid;
		$da->tsr = $tsr;
		$da->stx = 'html';

		if ( $name !== $lcName ) {
			$da->srcTagName = $name;
		}

		if ( $endTag !== null ) {
			return new EndTagTk( $lcName, $attribs, $da );
		}
		if ( $selfClose ) {
			$da->selfClose = true;
			return new SelfclosingTagTk( $lcName, $attribs, $da );
		}
		return new TagTk( $lcName, $attribs, $da );
	}

	/**
	 * Inline breaks, flag-enabled rule which detects end positions for
	 * active higher-level rules in inline and other nested rules.
	 * Those inner rules are then exited, so that the outer rule can
	 * handle the end marker.
	 *
	 * @param string $input
	 * @param int $pos Offset of the character to test
	 * @param array $stops Flags read here: 'arrow', 'equal', 'h',
	 *   'annOrExtTag', 'templateArg', 'tableCellArg', 'linkdesc', 'table',
	 *   'th', 'intemplate', 'preproc', 'colon', 'extlink', 'semicolon'
	 * @param Env $env Source of the annotation tag list for the 'h' check
	 * @return bool
	 * @throws \RuntimeException if $input[$pos] is not one of the handled characters
	 */
	public static function inlineBreaks( string $input, int $pos, array $stops, Env $env ): bool {
		$c = $input[$pos];
		$c2 = $input[$pos + 1] ?? '';

		switch ( $c ) {
			case '=':
				if ( $stops['arrow'] && $c2 === '>' ) {
					return true;
				}
				if ( $stops['equal'] ) {
					return true;
				}
				if ( $stops['h'] ) {
					if ( self::$inclAnnRegExp === null ) {
						$tags = array_merge(
							[ 'noinclude', 'includeonly', 'onlyinclude' ],
							$env->getSiteConfig()->getAnnotationTags()
						);
						// Tag names are matched literally, so escape any
						// regexp metacharacter (and the "/" delimiter).
						$quoted = array_map(
							static function ( $tag ): string {
								return preg_quote( (string)$tag, '/' );
							},
							$tags
						);
						self::$inclAnnRegExp = '|<\/?(?:' . implode( '|', $quoted ) . ')>';
					}
					return ( $pos === strlen( $input ) - 1
						// possibly more equals followed by spaces or comments
						|| preg_match( '/^=*(?:[ \t]|<\!--(?:(?!-->).)*-->'
							. self::$inclAnnRegExp . ')*(?:[\r\n]|$)/sD',
							substr( $input, $pos + 1 ) ) );
				}
				return false;

			case '|':
				// $c2 is '' at end of input, so no separate bounds check is needed.
				return !$stops['annOrExtTag'] && (
					$stops['templateArg']
					|| $stops['tableCellArg']
					|| $stops['linkdesc']
					|| ( $stops['table'] && ( $c2 === '}' || $c2 === '|' ) )
				);

			case '!':
				return $stops['th']
					&& !$stops['intemplate']
					&& $c2 === '!';

			case '{':
				// {{!}} pipe templates..
				// FIXME: Presumably these should mix with and match | above.
				// phpcs:ignore Squiz.WhiteSpace.LanguageConstructSpacing.IncorrectSingle
				return ( $stops['tableCellArg']
						&& substr( $input, $pos, 5 ) === '{{!}}' )
					|| ( $stops['table']
						&& substr( $input, $pos, 10 ) === '{{!}}{{!}}' );

			case '}':
				$preproc = $stops['preproc'];
				return ( $c2 === '}' && $preproc === '}}' )
					|| ( $c2 === '-' && $preproc === '}-' );

			case ':':
				return $stops['colon']
					&& !$stops['extlink']
					&& !$stops['intemplate']
					&& !$stops['linkdesc']
					&& $stops['preproc'] !== '}-';

			case ';':
				return $stops['semicolon'];

			case "\r":
				return $stops['table']
					&& preg_match( '/\r\n?\s*[!|]/', substr( $input, $pos ) );

			case "\n":
				// The code below is just a manual / efficient
				// version of this check.
				//
				// stops.table && /^\n\s*[!|]/.test(input.substr(pos));
				//
				// It eliminates a substr on the string and eliminates
				// a potential perf problem since "\n" and the inline_breaks
				// test is common during tokenization.
				if ( !$stops['table'] ) {
					return false;
				}

				// Allow leading whitespace in tables

				// Since we switched on 'c' which is input[pos],
				// we know that input[pos] is "\n".
				// So, the /^\n/ part of the regexp is already satisfied.
				// Look for /\s*[!|]/ below.
				$n = strlen( $input );
				for ( $i = $pos + 1; $i < $n; $i++ ) {
					$d = $input[$i];
					if ( $d === '!' || $d === '|' ) {
						return true;
					}
					if ( !preg_match( '/\s/', $d ) ) {
						return false;
					}
				}
				return false;

			case '[':
				// This is a special case in php's doTableStuff, added in
				// response to T2553. If it encounters a `[[`, it bails on
				// parsing attributes and interprets it all as content.
				return $stops['tableCellArg'] && $c2 === '[';

			case '-':
				// Same as above: a special case in doTableStuff, added
				// as part of T153140
				return $stops['tableCellArg'] && $c2 === '{';

			case ']':
				if ( $stops['extlink'] ) {
					return true;
				}
				return $stops['preproc'] === ']]'
					&& $c2 === ']';

			default:
				throw new \RuntimeException( 'Unhandled case!' );
		}
	}

	/**
	 * Pop off the end comments, if any.
	 *
	 * Walks $attrs from the end while each entry has a falsy value and a key
	 * that is either a whitespace-only string or an array made up solely of
	 * CommentTk. Everything from the first such comment onwards is removed
	 * from $attrs and returned; whitespace-only entries that precede the
	 * first comment are left in place.
	 *
	 * @param array &$attrs Entries exposing ->k and ->v
	 * @return array|null Null when no trailing comment was found; otherwise
	 *   'buf' (the removed whitespace strings and comment tokens, in order)
	 *   and 'commentStartPos' (tsr start of the first comment)
	 */
	public static function popComments( array &$attrs ): ?array {
		// One chunk per consumed attribute, in source order. Counting chunks
		// rather than buffered items removes exactly the attributes that
		// contributed, even when one key carries several comments or none.
		$chunks = [];
		for ( $i = count( $attrs ) - 1; $i > -1; $i-- ) {
			$kv = $attrs[$i];
			if ( $kv->v ) {
				break;
			}
			if ( is_string( $kv->k ) ) {
				if ( !preg_match( '/^\s*$/D', $kv->k ) ) {
					break;
				}
				// permit whitespace
				array_unshift( $chunks, [ $kv->k ] );
			} elseif ( is_array( $kv->k ) ) {
				// all should be comments
				foreach ( $kv->k as $k ) {
					if ( !( $k instanceof CommentTk ) ) {
						break 2;
					}
				}
				array_unshift( $chunks, array_values( $kv->k ) );
			} else {
				break;
			}
		}

		// ensure we found a comment: drop leading whitespace / empty chunks
		while ( $chunks && !( ( $chunks[0][0] ?? null ) instanceof CommentTk ) ) {
			array_shift( $chunks );
		}
		if ( !$chunks ) {
			return null;
		}

		array_splice( $attrs, -count( $chunks ) );
		$buf = array_merge( ...$chunks );
		return [ 'buf' => $buf, 'commentStartPos' => $buf[0]->dataParsoid->tsr->start ];
	}

	/**
	 * Get a string containing all the autourl terminating characters (as in legacy parser
	 * Parser.php::makeFreeExternalLink). This list is slightly context-dependent because the
	 * right parenthesis is only included when the URL has no left parenthesis.
	 *
	 * @param bool $hasLeftParen should be true if the URL in question contains
	 *   a left parenthesis.
	 * @return string Consts::$strippedUrlCharacters, with ")" appended when
	 *   $hasLeftParen is false
	 */
	public static function getAutoUrlTerminatingChars( bool $hasLeftParen ): string {
		return $hasLeftParen
			? Consts::$strippedUrlCharacters
			: Consts::$strippedUrlCharacters . ')';
	}

	/**
	 * Count a start or self-closing tag token against the matching wt2html
	 * resource limit and log a warning when the limit is first exceeded.
	 *
	 * Only tokens named 'listItem', 'template', 'td' or 'th' are counted;
	 * everything else is ignored.
	 *
	 * @param Env $env
	 * @param mixed $token
	 */
	public static function enforceParserResourceLimits( Env $env, $token ): void {
		if ( !( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) ) {
			return;
		}

		$resourceByTag = [
			'listItem' => 'listItem',
			'template' => 'transclusion',
			'td' => 'tableCell',
			'th' => 'tableCell',
		];
		$resource = $resourceByTag[$token->getName()] ?? null;

		if (
			$resource !== null &&
			$env->bumpWt2HtmlResourceUse( $resource ) === false
		) {
			// `false` indicates that this bump pushed us over the threshold
			// We don't want to log every token above that, which would be `null`
			$env->log( 'warn', "wt2html: $resource limit exceeded" );
		}
	}

	/**
	 * Protect Parsoid-inserted attributes by escaping them to prevent
	 * Parsoid-HTML spoofing in wikitext.
	 *
	 * Names matching the protected list (case-insensitively) get a
	 * "data-x-" prefix; any other name is returned unchanged.
	 *
	 * @param string $name
	 * @return string
	 */
	public static function protectAttrs( string $name ): string {
		if ( self::$protectAttrsRegExp === null ) {
			self::$protectAttrsRegExp = "/^(about|data-mw.*|data-parsoid.*|data-x.*|" .
				DOMDataUtils::DATA_OBJECT_ATTR_NAME .
				'|property|rel|typeof)$/i';
		}
		return preg_replace( self::$protectAttrsRegExp, 'data-x-$1', $name );
	}

	/**
	 * Resets $inclAnnRegExp to null to avoid test environment side effects
	 */
	public static function resetAnnotationIncludeRegex(): void {
		self::$inclAnnRegExp = null;
	}

}