MediaWiki master
BlockLevelPass.php
Go to the documentation of this file.
1<?php
2
28
29use LogicException;
30use Wikimedia\Parsoid\Utils\Utils;
32
/** @var bool True while the most recently opened definition-list item is a <dt> rather than a <dd> */
35 private $DTopen = false;
/** @var bool True while lines are being treated as the inside of a <pre> element */
37 private $inPre = false;
/** @var string Tag name of the currently open paragraph-level wrapper ('p' or 'pre'), or '' if none is open */
39 private $lastParagraph = '';
/** Falsy when the text does not begin at the start of a line; the first line is then copied through unprocessed */
41 private $lineStart;
/** The input text handed to the constructor */
43 private $text;
44
45 # State constants for the definition list colon extraction
# (states of the character-by-character scanner in findColonNoLinks())
46 private const COLON_STATE_TEXT = 0;
47 private const COLON_STATE_TAG = 1;
48 private const COLON_STATE_TAGSTART = 2;
49 private const COLON_STATE_CLOSETAG = 3;
50 private const COLON_STATE_TAGSLASH = 4;
51 private const COLON_STATE_COMMENT = 5;
52 private const COLON_STATE_COMMENTDASH = 6;
53 private const COLON_STATE_COMMENTDASHDASH = 7;
54 private const COLON_STATE_LC = 8;
55
/**
 * Make lists from lines starting with ':', '*', '#', etc., and apply
 * paragraph / preformatted wrapping to the remaining lines.
 *
 * Convenience entry point: builds a one-shot pass object and runs it.
 *
 * @param string $text Text to process
 * @param bool $lineStart Whether $text begins at the start of a line
 * @return string The processed text
 */
public static function doBlockLevels( $text, $lineStart ) {
	return ( new self( $text, $lineStart ) )->execute();
}
68
/**
 * Store the inputs for a single run of the pass.
 *
 * @param string $text Text to process
 * @param bool $lineStart Whether $text begins at the start of a line
 */
private function __construct( $text, $lineStart ) {
	$this->lineStart = $lineStart;
	$this->text = $text;
}
77
/**
 * Tell whether a paragraph-level wrapper is currently open.
 *
 * @return bool True if a tag name is recorded in $this->lastParagraph
 */
private function hasOpenParagraph() {
	return !( $this->lastParagraph === '' );
}
84
/**
 * Produce the closing tag for the open paragraph-level wrapper, if any,
 * and reset the paragraph and <pre> tracking state.
 *
 * @param bool $atTheEnd When true, no newline is appended after the closing tag
 * @return string The closing tag (possibly followed by "\n"), or '' if nothing was open
 */
private function closeParagraph( $atTheEnd = false ) {
	$closing = '';
	if ( $this->hasOpenParagraph() ) {
		$closing = '</' . $this->lastParagraph . '>' . ( $atTheEnd ? '' : "\n" );
	}

	// State is cleared whether or not anything was open
	$this->lastParagraph = '';
	$this->inPre = false;

	return $closing;
}
103
/**
 * Measure the length of the common leading part of two strings,
 * compared byte by byte.
 *
 * @param string $st1
 * @param string $st2
 * @return int Number of leading bytes the two strings share
 */
private function getCommon( $st1, $st2 ) {
	$limit = min( strlen( $st1 ), strlen( $st2 ) );
	$matched = 0;

	while ( $matched < $limit && $st1[$matched] === $st2[$matched] ) {
		$matched++;
	}

	return $matched;
}
123
/**
 * Open a new list level for the given prefix character.
 *
 * Any open paragraph is closed first, and its closing tag is always part
 * of the returned markup. Opening a ';' item records that a <dt> is open.
 *
 * @param string $char One of '*', '#', ':' or ';'
 * @return string Paragraph close (if any) followed by the list opening tags,
 *  or by an '<!-- ERR 1 -->' marker for an unrecognised character
 */
private function openList( $char ) {
	$openTags = [
		'*' => '<ul><li>',
		'#' => '<ol><li>',
		':' => '<dl><dd>',
		';' => '<dl><dt>',
	];

	$result = $this->closeParagraph();

	if ( !is_string( $char ) || !isset( $openTags[$char] ) ) {
		// closeParagraph() has already reset the paragraph state, so its
		// closing tag must be kept here or the output would be unbalanced.
		return $result . '<!-- ERR 1 -->';
	}

	if ( $char === ';' ) {
		$this->DTopen = true;
	}

	return $result . $openTags[$char];
}
149
/**
 * Close the current list item and open the next one at the same level.
 *
 * For definition lists the closing tag depends on whether a <dt> is
 * currently open, and the open-<dt> flag is updated for the new item.
 *
 * @param string $char One of '*', '#', ':' or ';'
 * @return string HTML, or an '<!-- ERR 2 -->' marker for an unrecognised character
 */
private function nextItem( $char ) {
	if ( in_array( $char, [ '*', '#' ], true ) ) {
		return "</li>\n<li>";
	}
	if ( !in_array( $char, [ ':', ';' ], true ) ) {
		return '<!-- ERR 2 -->';
	}

	// Definition list: close whichever of <dt>/<dd> is open ...
	$closeTag = $this->DTopen ? "</dt>\n" : "</dd>\n";

	// ... then open a <dt> for ';' or a <dd> for ':'
	$startsTerm = ( $char === ';' );
	$this->DTopen = $startsTerm;

	return $closeTag . ( $startsTerm ? '<dt>' : '<dd>' );
}
174
/**
 * Close the innermost list level identified by the given prefix character.
 *
 * @param string $char One of '*', '#' or ':'
 * @return string Closing tags, or an '<!-- ERR 3 -->' marker for any other character
 */
private function closeList( $char ) {
	if ( $char === '*' ) {
		return "</li></ul>";
	}
	if ( $char === '#' ) {
		return "</li></ol>";
	}
	if ( $char !== ':' ) {
		return '<!-- ERR 3 -->';
	}

	// Definition list: the item to close is either a <dd> or a still-open <dt>
	if ( !$this->DTopen ) {
		return "</dd></dl>";
	}
	$this->DTopen = false;
	return "</dt></dl>";
}
198
/**
 * Run the block-level pass over the stored text, one line at a time.
 *
 * Lines beginning with any of the characters "*#:;" are turned into
 * (possibly nested) list markup. Lines without such a prefix are wrapped
 * in <p>, or in <pre> when they begin with a space, unless the line
 * contains one of the block-level tags matched below.
 *
 * @return string The processed text
 */
203 private function execute() {
204 $text = $this->text;
205 # Parsing through the text line by line. The main thing
206 # happening here is handling of block-level elements p, pre,
207 # and making lists from lines starting with * # : etc.
208 $textLines = StringUtils::explode( "\n", $text );
209
# $lastPrefix holds the previous line's list prefix with ';' normalised to ':'
# $pendingPTag is false, or the paragraph markup to emit before the next line
210 $lastPrefix = $output = '';
211 $this->DTopen = $inBlockElem = false;
212 $prefixLength = 0;
213 $pendingPTag = false;
214 $inBlockquote = false;
215
216 for ( $textLines->rewind(); $textLines->valid(); ) {
217 $inputLine = $textLines->current();
# Advance immediately so that $notLastLine says whether another line follows
218 $textLines->next();
219 $notLastLine = $textLines->valid();
220
221 # Fix up $lineStart
# If the text does not start at the beginning of a line, the first line is
# emitted untouched and regular processing starts with the line after it.
222 if ( !$this->lineStart ) {
223 $output .= $inputLine;
224 $this->lineStart = true;
225 continue;
226 }
227 # * = ul
228 # # = ol
229 # ; = dt
230 # : = dd
231
232 $lastPrefixLength = strlen( $lastPrefix );
233 $preCloseMatch = preg_match( '/<\\/pre/i', $inputLine );
234 $preOpenMatch = preg_match( '/<pre/i', $inputLine );
235 # If not in a <pre> element, scan for and figure out what prefixes are there.
236 if ( !$this->inPre ) {
237 # Multiple prefixes may abut each other for nested lists.
238 $prefixLength = strspn( $inputLine, '*#:;' );
239 $prefix = substr( $inputLine, 0, $prefixLength );
240
241 # eh?
242 # ; and : are both from definition-lists, so they're equivalent
243 # for the purposes of determining whether or not we need to open/close
244 # elements.
245 $prefix2 = str_replace( ';', ':', $prefix );
# $t is the remainder of the line after the list prefix
246 $t = substr( $inputLine, $prefixLength );
247 $this->inPre = (bool)$preOpenMatch;
248 } else {
249 # Don't interpret any other prefixes in preformatted text
250 $prefixLength = 0;
251 $prefix = $prefix2 = '';
252 $t = $inputLine;
253 }
254
255 # List generation
256 if ( $prefixLength && $lastPrefix === $prefix2 ) {
257 # Same as the last item, so no need to deal with nesting or opening stuff
258 $output .= $this->nextItem( substr( $prefix, -1 ) );
259 $pendingPTag = false;
260
261 if ( substr( $prefix, -1 ) === ';' ) {
262 # The one nasty exception: definition lists work like this:
263 # ; title : definition text
264 # So we check for : in the remainder text to split up the
265 # title and definition, without b0rking links.
266 $term = $t2 = '';
267 if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) {
268 $t = $t2;
269 // Trim whitespace in list items
270 $output .= trim( $term ) . $this->nextItem( ':' );
271 }
272 }
273 } elseif ( $prefixLength || $lastPrefixLength ) {
274 # We need to open or close prefixes, or both.
275
276 # Either open or close a level...
277 $commonPrefixLength = $this->getCommon( $prefix, $lastPrefix );
278 $pendingPTag = false;
279
280 # Close all the prefixes which aren't shared.
281 while ( $commonPrefixLength < $lastPrefixLength ) {
282 // @phan-suppress-next-line PhanTypeInvalidDimOffset
283 $output .= $this->closeList( $lastPrefix[$lastPrefixLength - 1] );
284 --$lastPrefixLength;
285 }
286
287 # Continue the current prefix if appropriate.
288 if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
289 $output .= $this->nextItem( $prefix[$commonPrefixLength - 1] );
290 }
291
292 # Close an open <dt> if we have a <dd> (":") starting on this line
293 if ( $this->DTopen && $commonPrefixLength > 0 && $prefix[$commonPrefixLength - 1] === ':' ) {
294 $output .= $this->nextItem( ':' );
295 }
296
297 # Open prefixes where appropriate.
298 if ( $lastPrefix && $prefixLength > $commonPrefixLength ) {
299 $output .= "\n";
300 }
301 while ( $prefixLength > $commonPrefixLength ) {
302 $char = $prefix[$commonPrefixLength];
303 $output .= $this->openList( $char );
304
305 if ( $char === ';' ) {
306 # @todo FIXME: This is dupe of code above
307 if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) {
308 $t = $t2;
309 // Trim whitespace in list items
310 $output .= trim( $term ) . $this->nextItem( ':' );
311 }
312 }
313 ++$commonPrefixLength;
314 }
315 if ( !$prefixLength && $lastPrefix ) {
316 $output .= "\n";
317 }
318 $lastPrefix = $prefix2;
319 }
320
321 # If we have no prefixes, go to paragraph mode.
322 if ( $prefixLength == 0 ) {
323 # No prefix (not in list)--go to paragraph mode
324 # @todo consider using a stack for nestable elements like span, table and div
325
326 // P-wrapping and indent-pre are suppressed inside, not outside
327 $blockElems = 'table|h1|h2|h3|h4|h5|h6|pre|p|ul|ol|dl';
328 // P-wrapping and indent-pre are suppressed outside, not inside
329 $antiBlockElems = 'td|th';
330
331 $openMatch = preg_match(
332 '/<('
333 . "({$blockElems})|\\/({$antiBlockElems})|"
334 // Always suppresses
335 . '\\/?(tr|caption|dt|dd|li)'
336 . ')\\b/iS',
337 $t
338 );
339 $closeMatch = preg_match(
340 '/<('
341 . "\\/({$blockElems})|({$antiBlockElems})|"
342 // Never suppresses
343 . '\\/?(center|blockquote|div|hr|mw:|aside|figure)|'
344 // Used as Parser::TOC_PLACEHOLDER
345 . 'meta property="mw:'
346 . ')\\b/iS',
347 $t
348 );
349
350 // Any match closes the paragraph, but only when `!$closeMatch`
351 // do we enter block mode. The oddities with table rows and
352 // cells are to avoid paragraph wrapping in interstitial spaces
353 // leading to fostered content.
354
355 if ( $openMatch || $closeMatch ) {
356 $pendingPTag = false;
357 // Only close the paragraph if we're not inside a <pre> tag, or if
358 // that <pre> tag has just been opened
359 if ( !$this->inPre || $preOpenMatch ) {
360 // @todo T7718: paragraph closed
361 $output .= $this->closeParagraph();
362 }
363 if ( $preOpenMatch && !$preCloseMatch ) {
364 $this->inPre = true;
365 }
// Walk over every <blockquote> / </blockquote> on the line; the last one
// found decides whether we are inside a blockquote afterwards.
366 $bqOffset = 0;
367 while ( preg_match( '/<(\\/?)blockquote[\s>]/i', $t,
368 $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset )
369 ) {
370 $inBlockquote = !$bqMatch[1][0]; // is this a close tag?
371 $bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] );
372 }
373 $inBlockElem = !$closeMatch;
374 } elseif ( !$inBlockElem && !$this->inPre ) {
375 if ( substr( $t, 0, 1 ) == ' '
376 && ( $this->lastParagraph === 'pre' || trim( $t ) != '' )
377 && !$inBlockquote
378 ) {
379 # pre
380 if ( $this->lastParagraph !== 'pre' ) {
381 $pendingPTag = false;
382 $output .= $this->closeParagraph() . '<pre>';
383 $this->lastParagraph = 'pre';
384 }
# Drop the leading space that marked the line as preformatted
385 $t = substr( $t, 1 );
386 } elseif ( preg_match( '/^(?:<style\\b[^>]*>.*?<\\/style>\s*|<link\\b[^>]*>\s*)+$/iS', $t ) ) {
387 # T186965: <style> or <link> by itself on a line shouldn't open or close paragraphs.
388 # But it should clear $pendingPTag.
389 if ( $pendingPTag ) {
390 $output .= $this->closeParagraph();
391 $pendingPTag = false;
392 }
393 } else {
394 # paragraph
395 if ( trim( $t ) === '' ) {
# Blank line: either flush a pending tag with a <br />, or record
# which paragraph markup the next non-blank line should start with
396 if ( $pendingPTag ) {
397 $output .= $pendingPTag . '<br />';
398 $pendingPTag = false;
399 $this->lastParagraph = 'p';
400 } elseif ( $this->lastParagraph !== 'p' ) {
401 $output .= $this->closeParagraph();
402 $pendingPTag = '<p>';
403 } else {
404 $pendingPTag = '</p><p>';
405 }
406 } elseif ( $pendingPTag ) {
407 $output .= $pendingPTag;
408 $pendingPTag = false;
409 $this->lastParagraph = 'p';
410 } elseif ( $this->lastParagraph !== 'p' ) {
411 $output .= $this->closeParagraph() . '<p>';
412 $this->lastParagraph = 'p';
413 }
414 }
415 }
416 }
417 # somewhere above we forget to get out of pre block (T2785)
418 if ( $preCloseMatch && $this->inPre ) {
419 $this->inPre = false;
420 }
# The line's text is emitted only when no paragraph tag is pending
421 if ( $pendingPTag === false ) {
422 if ( $prefixLength === 0 ) {
423 $output .= $t;
424 // Add a newline if there's an open paragraph
425 // or we've yet to reach the last line.
426 if ( $notLastLine || $this->hasOpenParagraph() ) {
427 $output .= "\n";
428 }
429 } else {
430 // Trim whitespace in list items
431 $output .= trim( $t );
432 }
433 }
434 }
# After the last line: close the list levels that are still open, innermost first
435 while ( $prefixLength ) {
436 // @phan-suppress-next-line PhanTypeArraySuspicious $prefix set if $prefixLength is set
437 $output .= $this->closeList( $prefix2[$prefixLength - 1] );
438 --$prefixLength;
439 // Note that a paragraph is only ever opened when `prefixLength`
440 // is zero, but we'll choose to be overly cautious.
441 if ( !$prefixLength && $this->hasOpenParagraph() ) {
442 $output .= "\n";
443 }
444 }
445 $output .= $this->closeParagraph( true );
446 return $output;
447 }
448
/**
 * Split a definition-list line ("; term : definition") at the first ':'
 * that is found in plain text, i.e. not inside an HTML tag, inside the
 * content of an unclosed element, inside an HTML comment, or inside
 * -{ ... }- language converter markup.
 *
 * $before and $after are only written when a colon is found.
 *
 * @param string $str Text to scan
 * @param string &$before Receives the text preceding the colon
 * @param string &$after Receives the text following the colon
 * @return int|false Byte offset of the colon in $str, or false if no usable colon exists
 */
458 private function findColonNoLinks( $str, &$before, &$after ) {
# Locate the first character sequence that matters: a colon, a tag start,
# or the start of language converter markup
459 if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) {
460 # Nothing to find!
461 return false;
462 }
463
464 if ( $m[0][0] === ':' ) {
465 # Easy; no tag nesting to worry about
466 $colonPos = $m[0][1];
467 $before = substr( $str, 0, $colonPos );
468 $after = substr( $str, $colonPos + 1 );
469 return $colonPos;
470 }
471
472 # Ugly state machine to walk through avoiding tags.
# $ltLevel counts open (non-void) elements, $lcLevel counts nested -{ markers,
# $tagName accumulates the current tag's name while $captureName is true
473 $state = self::COLON_STATE_TEXT;
474 $ltLevel = 0;
475 $lcLevel = 0;
476 $captureName = false;
477 $tagName = '';
478 $len = strlen( $str );
479 for ( $i = $m[0][1]; $i < $len; $i++ ) {
480 $c = $str[$i];
481
482 switch ( $state ) {
483 case self::COLON_STATE_TEXT:
484 switch ( $c ) {
485 case "<":
486 # Could be either a <start> tag or an </end> tag
487 $state = self::COLON_STATE_TAGSTART;
488 $captureName = true;
489 $tagName = '';
490 break;
491 case ":":
492 if ( $ltLevel === 0 ) {
493 # We found it!
494 $before = substr( $str, 0, $i );
495 $after = substr( $str, $i + 1 );
496 return $i;
497 }
498 # Embedded in a tag; don't break it.
499 break;
500 default:
501 # Skip ahead looking for something interesting
502 if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
503 # Nothing else interesting
504 return false;
505 }
506 if ( $m[0][0] === '-{' ) {
507 $state = self::COLON_STATE_LC;
508 $lcLevel++;
# Land on the '{'; the loop increment then moves past the marker
509 $i = $m[0][1] + 1;
510 } else {
511 # Skip ahead to next interesting character.
# (minus one because the loop increment re-adds it)
512 $i = $m[0][1] - 1;
513 }
514 break;
515 }
516 break;
517 case self::COLON_STATE_LC:
518 # In language converter markup -{ ... }-
519 if ( !preg_match( '/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
520 # Nothing else interesting to find; abort!
521 # We're nested in language converter markup, but there
522 # are no close tags left. Abort!
523 break 2;
524 }
525 if ( $m[0][0] === '-{' ) {
526 $i = $m[0][1] + 1;
527 $lcLevel++;
528 } elseif ( $m[0][0] === '}-' ) {
529 $i = $m[0][1] + 1;
530 $lcLevel--;
531 if ( $lcLevel === 0 ) {
532 $state = self::COLON_STATE_TEXT;
533 }
534 }
535 break;
536 case self::COLON_STATE_TAG:
537 # In a <tag>
538 switch ( $c ) {
539 case " ":
# The tag name ends at the first space; the rest is attributes
540 $captureName = false;
541 break;
542 case ">":
# Void elements have no closing tag, so they do not add a nesting level
543 if ( !Utils::isVoidElement( strtolower( $tagName ) ) ) {
544 $ltLevel++;
545 }
546 $state = self::COLON_STATE_TEXT;
547 break;
548 case "/":
549 # Slash may be followed by >?
550 $state = self::COLON_STATE_TAGSLASH;
551 break;
552 default:
553 if ( $captureName ) {
554 $tagName .= $c;
555 }
556 # ignore
557 }
558 break;
559 case self::COLON_STATE_TAGSTART:
# The character right after '<' decides what kind of construct this is
560 switch ( $c ) {
561 case "/":
562 $state = self::COLON_STATE_CLOSETAG;
563 break;
564 case "!":
565 $state = self::COLON_STATE_COMMENT;
566 break;
567 case ">":
568 # Illegal early close? This shouldn't happen D:
569 $state = self::COLON_STATE_TEXT;
570 break;
571 default:
572 if ( $captureName ) {
573 $tagName .= $c;
574 }
575 $state = self::COLON_STATE_TAG;
576 }
577 break;
578 case self::COLON_STATE_CLOSETAG:
579 # In a </tag>
580 if ( $c === ">" ) {
581 if ( $ltLevel > 0 ) {
582 $ltLevel--;
583 } else {
584 # ignore the excess close tag, but keep looking for
585 # colons. (This matches Parsoid behavior.)
586 wfDebug( __METHOD__ . ": Invalid input; too many close tags" );
587 }
588 $state = self::COLON_STATE_TEXT;
589 }
590 break;
591 case self::COLON_STATE_TAGSLASH:
592 if ( $c === ">" ) {
593 # Yes, a self-closed tag <blah/>
594 $state = self::COLON_STATE_TEXT;
595 } else {
596 # Probably we're jumping the gun, and this is an attribute
597 $state = self::COLON_STATE_TAG;
598 }
599 break;
# The three comment states below look for the "-->" sequence that ends a comment
600 case self::COLON_STATE_COMMENT:
601 if ( $c === "-" ) {
602 $state = self::COLON_STATE_COMMENTDASH;
603 }
604 break;
605 case self::COLON_STATE_COMMENTDASH:
606 if ( $c === "-" ) {
607 $state = self::COLON_STATE_COMMENTDASHDASH;
608 } else {
609 $state = self::COLON_STATE_COMMENT;
610 }
611 break;
612 case self::COLON_STATE_COMMENTDASHDASH:
613 if ( $c === ">" ) {
614 $state = self::COLON_STATE_TEXT;
615 } else {
616 $state = self::COLON_STATE_COMMENT;
617 }
618 break;
619 default:
620 throw new LogicException( "State machine error in " . __METHOD__ );
621 }
622 }
# Reached the end of the string (or aborted) without finding a usable colon
623 if ( $ltLevel > 0 || $lcLevel > 0 ) {
624 wfDebug(
625 __METHOD__ . ": Invalid input; not enough close tags " .
626 "(level $ltLevel/$lcLevel, state $state)"
627 );
628 }
629 return false;
630 }
631}
632
// Also make the class available under the un-namespaced name 'BlockLevelPass'
634class_alias( BlockLevelPass::class, 'BlockLevelPass' );
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
static doBlockLevels( $text, $lineStart)
Make lists from lines starting with ':', '*', '#', etc.
A collection of static methods to play with strings.