# True while the most recently started definition-list item is a <dt>;
# set by openList()/nextItem() for ';' and cleared by nextItem()/closeList().
35 private $DTopen =
false;
# True while execute() considers itself inside a <pre> element.
37 private $inPre =
false;
# Tag name of the currently open paragraph-level wrapper ('p' or 'pre'),
# or '' when no such wrapper is open (see hasOpenParagraph()).
39 private $lastParagraph =
'';
45 # State constants for the definition list colon extraction
# Used by findColonNoLinks(), which starts in COLON_STATE_TEXT.
# TAGSTART is entered on what may be either a start tag or an end tag,
# TAGSLASH after a slash inside a tag that may be followed by '>',
# the COMMENT* states while walking through a comment, and
# LC while inside language converter markup -{ ... }-.
46 private const COLON_STATE_TEXT = 0;
47 private const COLON_STATE_TAG = 1;
48 private const COLON_STATE_TAGSTART = 2;
49 private const COLON_STATE_CLOSETAG = 3;
50 private const COLON_STATE_TAGSLASH = 4;
51 private const COLON_STATE_COMMENT = 5;
52 private const COLON_STATE_COMMENTDASH = 6;
53 private const COLON_STATE_COMMENTDASHDASH = 7;
54 private const COLON_STATE_LC = 8;
# Create a single-use pass object for $text / $lineStart and return
# whatever its execute() method produces.
65 $pass =
new self( $text, $lineStart );
66 return $pass->execute();
# Private constructor; instances are created with `new self( ... )` from
# inside the class.
# @param mixed $text Text to process. NOTE(review): how $text is stored
#   is not shown next to this assignment -- confirm.
# @param mixed $lineStart Stored in $this->lineStart; execute() tests it
#   as a boolean.
73 private function __construct( $text, $lineStart ) {
75 $this->lineStart = $lineStart;
# Tells whether a paragraph-level wrapper is currently open, i.e. whether
# $this->lastParagraph holds a non-empty tag name.
# @return bool
81 private function hasOpenParagraph() {
82 return $this->lastParagraph !==
'';
# Closes the currently open paragraph-level wrapper, if there is one.
# When a wrapper is open, the closing tag is built from the tag name held
# in $this->lastParagraph; $this->lastParagraph is reset to ''.
# @param bool $atTheEnd Defaults to false; execute() passes true on its
#   final call. NOTE(review): its effect on the result is not shown
#   here -- confirm.
# Callers append the return value to their output string.
91 private function closeParagraph( $atTheEnd =
false ) {
93 if ( $this->hasOpenParagraph() ) {
94 $result =
'</' . $this->lastParagraph .
'>';
100 $this->lastParagraph =
'';
# Compares $st1 and $st2 byte by byte, over the length of the shorter
# string, looking for the first position at which they differ.
# execute() uses the result as the length of the list prefix shared by
# the current and the previous line.
# NOTE(review): the return statements are not shown here; presumably the
# index of the first difference, or the shorter length -- confirm.
113 private function getCommon( $st1, $st2 ) {
114 $shorter = min( strlen( $st1 ), strlen( $st2 ) );
# Only the overlapping part of the two strings can be common.
116 for ( $i = 0; $i < $shorter; ++$i ) {
117 if ( $st1[$i] !== $st2[$i] ) {
# Builds the markup that opens a new list level for one prefix character.
# Any open paragraph is closed first, then the opening tags are appended:
#   '*' => <ul><li>, '#' => <ol><li>, ':' => <dl><dd>, ';' => <dl><dt>.
# Opening a ';' level also sets $this->DTopen.
# @param string $char One of '*', '#', ':' or ';'.
131 private function openList( $char ) {
132 $result = $this->closeParagraph();
134 if ( $char ===
'*' ) {
135 $result .=
"<ul><li>";
136 } elseif ( $char ===
'#' ) {
137 $result .=
"<ol><li>";
138 } elseif ( $char ===
':' ) {
139 $result .=
"<dl><dd>";
140 } elseif ( $char ===
';' ) {
141 $result .=
"<dl><dt>";
142 $this->DTopen =
true;
# Fallback marker; this one replaces $result instead of appending to it.
# NOTE(review): presumably the branch for any other character -- confirm.
144 $result =
'<!-- ERR 1 -->';
# Builds the markup that ends the current list item and starts the next
# one at the same nesting level.
# '*' and '#' yield "</li>\n<li>". For ':' and ';' the result is $close
# followed by '<dt>' (for ';') or '<dd>' (for ':'), and $this->DTopen is
# updated to match the item just opened.
# NOTE(review): $close is chosen from $this->DTopen in lines not shown
# here -- confirm its exact values.
# @param string $char One of '*', '#', ':' or ';'.
156 private function nextItem( $char ) {
157 if ( $char ===
'*' || $char ===
'#' ) {
158 return "</li>\n<li>";
159 } elseif ( $char ===
':' || $char ===
';' ) {
161 if ( $this->DTopen ) {
164 if ( $char ===
';' ) {
165 $this->DTopen =
true;
166 return $close .
'<dt>';
168 $this->DTopen =
false;
169 return $close .
'<dd>';
# Fallback marker returned when none of the branches above returned.
172 return '<!-- ERR 2 -->';
# Builds the markup that closes one list level for a prefix character:
#   '*' => </li></ul>, '#' => </li></ol>,
#   ':' => </dt></dl> when a <dt> is open (also clears $this->DTopen),
#          otherwise </dd></dl>.
# There is no ';' branch: execute() passes characters taken from prefixes
# in which ';' has already been replaced by ':'.
# @param string $char One of '*', '#' or ':'.
181 private function closeList( $char ) {
182 if ( $char ===
'*' ) {
183 $text =
"</li></ul>";
184 } elseif ( $char ===
'#' ) {
185 $text =
"</li></ol>";
186 } elseif ( $char ===
':' ) {
187 if ( $this->DTopen ) {
188 $this->DTopen =
false;
189 $text =
"</dt></dl>";
191 $text =
"</dd></dl>";
# Fallback marker for a character that matched none of the branches.
194 return '<!-- ERR 3 -->';
# Runs the block-level pass: walks the text line by line, accumulating
# markup in $output. Leading runs of '*', '#', ':' and ';' drive list
# opening/closing (via openList()/nextItem()/closeList()); lines without
# a prefix go through paragraph / <pre> handling. Any list levels and
# paragraph still open after the loop are closed at the end.
203 private function execute() {
205 # Parsing through the text line by line. The main thing
206 # happening here is handling of block-level elements p, pre,
207 # and making lists from lines starting with * # : etc.
208 $textLines = StringUtils::explode(
"\n", $text );
210 $lastPrefix = $output =
'';
211 $this->DTopen = $inBlockElem =
false;
# $pendingPTag is either false or a string of paragraph tags waiting to
# be written to $output.
213 $pendingPTag =
false;
214 $inBlockquote =
false;
216 for ( $textLines->rewind(); $textLines->valid(); ) {
217 $inputLine = $textLines->current();
219 $notLastLine = $textLines->valid();
# When the text does not begin at the start of a line, the line is
# appended to the output untouched and the lineStart flag is switched on.
222 if ( !$this->lineStart ) {
223 $output .= $inputLine;
224 $this->lineStart =
true;
232 $lastPrefixLength = strlen( $lastPrefix );
233 $preCloseMatch = preg_match(
'/<\\/pre/i', $inputLine );
234 $preOpenMatch = preg_match(
'/<pre/i', $inputLine );
235 # If not in a <pre> element, scan for and figure out what prefixes are there.
236 if ( !$this->inPre ) {
237 # Multiple prefixes may abut each other for nested lists.
238 $prefixLength = strspn( $inputLine,
'*#:;' );
239 $prefix = substr( $inputLine, 0, $prefixLength );
242 # ; and : are both from definition-lists, so they're equivalent
243 # for the purposes of determining whether or not we need to open/close
245 $prefix2 = str_replace(
';',
':', $prefix );
# $t is the line content with the list prefix removed.
246 $t = substr( $inputLine, $prefixLength );
247 $this->inPre = (bool)$preOpenMatch;
249 # Don't interpret any other prefixes in preformatted text
251 $prefix = $prefix2 =
'';
256 if ( $prefixLength && $lastPrefix === $prefix2 ) {
257 # Same as the last item, so no need to deal with nesting or opening stuff
258 $output .= $this->nextItem( substr( $prefix, -1 ) );
259 $pendingPTag =
false;
261 if ( substr( $prefix, -1 ) ===
';' ) {
262 # The one nasty exception: definition lists work like this:
263 # ; title : definition text
264 # So we check for : in the remainder text to split up the
265 # title and definition, without b0rking links.
267 if ( $this->findColonNoLinks( $t, $term, $t2 ) !==
false ) {
270 $output .= trim( $term ) . $this->nextItem(
':' );
273 } elseif ( $prefixLength || $lastPrefixLength ) {
274 # We need to open or close prefixes, or both.
276 # Either open or close a level...
277 $commonPrefixLength = $this->getCommon( $prefix, $lastPrefix );
278 $pendingPTag =
false;
280 # Close all the prefixes which aren't shared.
281 while ( $commonPrefixLength < $lastPrefixLength ) {
283 $output .= $this->closeList( $lastPrefix[$lastPrefixLength - 1] );
287 # Continue the current prefix if appropriate.
288 if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
289 $output .= $this->nextItem( $prefix[$commonPrefixLength - 1] );
292 # Close an open <dt> if we have a <dd> (":") starting on this line
293 if ( $this->DTopen && $commonPrefixLength > 0 && $prefix[$commonPrefixLength - 1] ===
':' ) {
294 $output .= $this->nextItem(
':' );
297 # Open prefixes where appropriate.
298 if ( $lastPrefix && $prefixLength > $commonPrefixLength ) {
301 while ( $prefixLength > $commonPrefixLength ) {
302 $char = $prefix[$commonPrefixLength];
303 $output .= $this->openList( $char );
305 if ( $char ===
';' ) {
306 # @todo FIXME: This is dupe of code above
307 if ( $this->findColonNoLinks( $t, $term, $t2 ) !==
false ) {
310 $output .= trim( $term ) . $this->nextItem(
':' );
313 ++$commonPrefixLength;
315 if ( !$prefixLength && $lastPrefix ) {
# Remember the normalised prefix (';' already mapped to ':') for the
# comparison on the next line.
318 $lastPrefix = $prefix2;
321 # If we have no prefixes, go to paragraph mode.
322 if ( $prefixLength == 0 ) {
323 # No prefix (not in list)--go to paragraph mode
324 # @todo consider using a stack for nestable elements like span, table and div
327 $blockElems =
'table|h1|h2|h3|h4|h5|h6|pre|p|ul|ol|dl';
329 $antiBlockElems =
'td|th';
331 $openMatch = preg_match(
333 .
"({$blockElems})|\\/({$antiBlockElems})|"
335 .
'\\/?(tr|caption|dt|dd|li)'
339 $closeMatch = preg_match(
341 .
"\\/({$blockElems})|({$antiBlockElems})|"
343 .
'\\/?(center|blockquote|div|hr|mw:|aside|figure)|'
345 .
'meta property="mw:'
355 if ( $openMatch || $closeMatch ) {
356 $pendingPTag =
false;
359 if ( !$this->inPre || $preOpenMatch ) {
361 $output .= $this->closeParagraph();
363 if ( $preOpenMatch && !$preCloseMatch ) {
# Scan every <blockquote> / </blockquote> tag on the line; after the
# loop $inBlockquote reflects the last one seen (true for an opening
# tag, false for a closing tag).
367 while ( preg_match(
'/<(\\/?)blockquote[\s>]/i', $t,
368 $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset )
370 $inBlockquote = !$bqMatch[1][0];
371 $bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] );
373 $inBlockElem = !$closeMatch;
374 } elseif ( !$inBlockElem && !$this->inPre ) {
# A line starting with a space opens (or continues) a 'pre' wrapper;
# the leading space itself is dropped from $t.
375 if ( substr( $t, 0, 1 ) ==
' '
376 && ( $this->lastParagraph ===
'pre' || trim( $t ) !=
'' )
380 if ( $this->lastParagraph !==
'pre' ) {
381 $pendingPTag =
false;
382 $output .= $this->closeParagraph() .
'<pre>';
383 $this->lastParagraph =
'pre';
385 $t = substr( $t, 1 );
386 } elseif ( preg_match(
'/^(?:<style\\b[^>]*>.*?<\\/style>\s*|<link\\b[^>]*>\s*)+$/iS', $t ) ) {
387 # T186965: <style> or <link> by itself on a line shouldn't open or close paragraphs.
388 # But it should clear $pendingPTag.
389 if ( $pendingPTag ) {
390 $output .= $this->closeParagraph();
391 $pendingPTag =
false;
# Blank line: either flush a pending paragraph tag followed by <br />,
# or queue a paragraph tag in $pendingPTag for a later line.
395 if ( trim( $t ) ===
'' ) {
396 if ( $pendingPTag ) {
397 $output .= $pendingPTag .
'<br />';
398 $pendingPTag =
false;
399 $this->lastParagraph =
'p';
400 } elseif ( $this->lastParagraph !==
'p' ) {
401 $output .= $this->closeParagraph();
402 $pendingPTag =
'<p>';
404 $pendingPTag =
'</p><p>';
406 } elseif ( $pendingPTag ) {
407 $output .= $pendingPTag;
408 $pendingPTag =
false;
409 $this->lastParagraph =
'p';
410 } elseif ( $this->lastParagraph !==
'p' ) {
411 $output .= $this->closeParagraph() .
'<p>';
412 $this->lastParagraph =
'p';
417 # somewhere above we forget to get out of pre block (T2785)
418 if ( $preCloseMatch && $this->inPre ) {
419 $this->inPre =
false;
421 if ( $pendingPTag ===
false ) {
422 if ( $prefixLength === 0 ) {
426 if ( $notLastLine || $this->hasOpenParagraph() ) {
431 $output .= trim( $t );
# After the last line: close every list level that is still open ...
435 while ( $prefixLength ) {
437 $output .= $this->closeList( $prefix2[$prefixLength - 1] );
# ... and finally any paragraph that is still open.
441 if ( !$prefixLength && $this->hasOpenParagraph() ) {
445 $output .= $this->closeParagraph(
true );
# Splits $str at a ':' that is safe to use as the separator of a
# "; term : definition" line, i.e. one that is not embedded in a tag.
# The text before and after that colon is written to $before and $after.
# The first ':', '<' or '-{' is located by regex. If it is a ':', the
# split is done directly; otherwise a state machine (COLON_STATE_*
# constants) walks the rest of the string, tracking tag nesting in
# $ltLevel and -{ ... }- nesting in $lcLevel.
# @param string $str Text to search.
# @param string &$before Receives the part before the colon.
# @param string &$after Receives the part after the colon.
# Callers compare the return value against false to detect "no split".
# NOTE(review): the exact non-false return value is not shown here.
458 private function findColonNoLinks( $str, &$before, &$after ) {
459 if ( !preg_match(
'/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) {
464 if ( $m[0][0] ===
':' ) {
465 # Easy; no tag nesting to worry about
466 $colonPos = $m[0][1];
467 $before = substr( $str, 0, $colonPos );
468 $after = substr( $str, $colonPos + 1 );
472 # Ugly state machine to walk through avoiding tags.
473 $state = self::COLON_STATE_TEXT;
476 $captureName =
false;
478 $len = strlen( $str );
# Start scanning at the offset of the first interesting match found above.
479 for ( $i = $m[0][1]; $i < $len; $i++ ) {
483 case self::COLON_STATE_TEXT:
486 # Could be either a <start> tag or an </end> tag
487 $state = self::COLON_STATE_TAGSTART;
# A colon only splits the string when no tag is currently open.
492 if ( $ltLevel === 0 ) {
494 $before = substr( $str, 0, $i );
495 $after = substr( $str, $i + 1 );
498 # Embedded in a tag; don't break it.
501 # Skip ahead looking for something interesting
502 if ( !preg_match(
'/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
503 # Nothing else interesting
506 if ( $m[0][0] ===
'-{' ) {
507 $state = self::COLON_STATE_LC;
511 # Skip ahead to next interesting character.
517 case self::COLON_STATE_LC:
518 # In language converter markup -{ ... }-
519 if ( !preg_match(
'/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
520 # Nothing else interesting to find; abort!
521 # We're nested in language converter markup, but there
522 # are no close tags left. Abort!
525 if ( $m[0][0] ===
'-{' ) {
528 } elseif ( $m[0][0] ===
'}-' ) {
# Back to plain text once every -{ has been matched by a }-.
531 if ( $lcLevel === 0 ) {
532 $state = self::COLON_STATE_TEXT;
536 case self::COLON_STATE_TAG:
540 $captureName =
false;
543 if ( !Utils::isVoidElement( strtolower( $tagName ) ) ) {
546 $state = self::COLON_STATE_TEXT;
549 # Slash may be followed by >?
550 $state = self::COLON_STATE_TAGSLASH;
553 if ( $captureName ) {
559 case self::COLON_STATE_TAGSTART:
562 $state = self::COLON_STATE_CLOSETAG;
565 $state = self::COLON_STATE_COMMENT;
568 # Illegal early close? This shouldn't happen D:
569 $state = self::COLON_STATE_TEXT;
572 if ( $captureName ) {
575 $state = self::COLON_STATE_TAG;
578 case self::COLON_STATE_CLOSETAG:
581 if ( $ltLevel > 0 ) {
584 # ignore the excess close tag, but keep looking for
585 # colons. (This matches Parsoid behavior.)
586 wfDebug( __METHOD__ .
": Invalid input; too many close tags" );
588 $state = self::COLON_STATE_TEXT;
591 case self::COLON_STATE_TAGSLASH:
593 # Yes, a self-closed tag <blah/>
594 $state = self::COLON_STATE_TEXT;
596 # Probably we're jumping the gun, and this is an attribute
597 $state = self::COLON_STATE_TAG;
600 case self::COLON_STATE_COMMENT:
602 $state = self::COLON_STATE_COMMENTDASH;
605 case self::COLON_STATE_COMMENTDASH:
607 $state = self::COLON_STATE_COMMENTDASHDASH;
609 $state = self::COLON_STATE_COMMENT;
612 case self::COLON_STATE_COMMENTDASHDASH:
614 $state = self::COLON_STATE_TEXT;
616 $state = self::COLON_STATE_COMMENT;
620 throw new LogicException(
"State machine error in " . __METHOD__ );
# Reaching the end of the string with tags or -{ markup still open means
# the input was unbalanced; this is reported with the levels and state.
623 if ( $ltLevel > 0 || $lcLevel > 0 ) {
625 __METHOD__ .
": Invalid input; not enough close tags " .
626 "(level $ltLevel/$lcLevel, state $state)"