MediaWiki  master
BlockLevelPass.php
Go to the documentation of this file.
1 <?php
2 
// True while the most recently opened definition-list item is a <dt>; set by
// openList()/nextItem() for ';' and cleared by nextItem( ':' ) and closeList( ':' ).
26  private $DTopen = false;
// True while execute() considers itself inside a <pre> element; list prefixes are
// not interpreted in that state. Also reset by closeParagraph().
27  private $inPre = false;
// Tag name of the paragraph-level element currently open ('p' or 'pre'), or '' if none.
28  private $lastParagraph = '';
// Constructor argument; when falsy, execute() copies the first line to the output
// unprocessed and then sets this to true.
29  private $lineStart;
// The input text that execute() processes line by line.
30  private $text;
31 
// Values taken by $state in findColonNoLinks().
// NOTE(review): that method also uses COLON_STATE_TAGSTART, COLON_STATE_CLOSETAG,
// COLON_STATE_TAGSLASH, COLON_STATE_COMMENT, COLON_STATE_COMMENTDASH and
// COLON_STATE_COMMENTDASHDASH, whose declarations are not among the lines shown
// here -- confirm they are declared alongside these.
32  # State constants for the definition list colon extraction
33  const COLON_STATE_TEXT = 0;
34  const COLON_STATE_TAG = 1;
41  const COLON_STATE_LC = 8;
42 
/**
 * Make lists from lines starting with ':', '*', '#', etc., and apply the
 * other block-level handling performed by execute().
 *
 * Builds a one-shot instance of this class for the given input and runs it.
 *
 * @param string $text Text to process
 * @param bool $lineStart When falsy, the first line is passed through unprocessed
 * @return string The text produced by execute()
 */
public static function doBlockLevels( $text, $lineStart ) {
	return ( new self( $text, $lineStart ) )->execute();
}
54 
/**
 * Private constructor; instances are created through doBlockLevels().
 *
 * @param string $text Text to process
 * @param bool $lineStart Stored as-is and consulted by execute() for the first line
 */
private function __construct( $text, $lineStart ) {
	$this->lineStart = $lineStart;
	$this->text = $text;
}
63 
/**
 * Tell whether a paragraph-level element (recorded in $this->lastParagraph)
 * is currently open.
 *
 * @return bool True unless $this->lastParagraph is the empty string
 */
private function hasOpenParagraph() {
	$nothingOpen = ( $this->lastParagraph === '' );
	return !$nothingOpen;
}
70 
/**
 * If a pre or p is open, return the corresponding close tag and update the state.
 *
 * The paragraph state ($this->lastParagraph, $this->inPre) is reset on every
 * call, whether or not anything was open.
 *
 * @param bool $atTheEnd When truthy, no newline is appended after the close tag
 * @return string The close tag (optionally followed by "\n"), or '' if nothing was open
 */
private function closeParagraph( $atTheEnd = false ) {
	$closing = '';
	if ( $this->hasOpenParagraph() ) {
		// Build the close tag before the state below is wiped.
		$closing = '</' . $this->lastParagraph . '>' . ( $atTheEnd ? '' : "\n" );
	}

	$this->lastParagraph = '';
	$this->inPre = false;

	return $closing;
}
89 
/**
 * Return the length of the longest run of identical characters that both
 * arguments share, counted from the beginning of each string.
 *
 * @param string $st1
 * @param string $st2
 * @return int Number of leading bytes the two strings have in common
 */
private function getCommon( $st1, $st2 ) {
	// Never look past the end of the shorter string.
	$limit = min( strlen( $st1 ), strlen( $st2 ) );

	$matched = 0;
	while ( $matched < $limit && $st1[$matched] === $st2[$matched] ) {
		$matched++;
	}

	return $matched;
}
109 
/**
 * Open the list item element identified by the prefix character.
 *
 * Any open paragraph is closed first. For an unrecognised character only an
 * error comment is returned (the paragraph state has still been reset).
 *
 * @param string $char One of '*', '#', ':' or ';'
 * @return string Markup that closes the paragraph (if any) and opens the list and its first item
 */
private function openList( $char ) {
	$closedParagraph = $this->closeParagraph();

	if ( $char === ';' ) {
		// Remember that the open definition-list item is a term, not a definition.
		$this->DTopen = true;
		return $closedParagraph . "<dl><dt>";
	}
	if ( $char === ':' ) {
		return $closedParagraph . "<dl><dd>";
	}
	if ( $char === '#' ) {
		return $closedParagraph . "<ol><li>";
	}
	if ( $char === '*' ) {
		return $closedParagraph . "<ul><li>";
	}

	return '<!-- ERR 1 -->';
}
135 
/**
 * Close the current list item and open the next one.
 *
 * For definition lists the close tag depends on whether a <dt> is currently
 * open, and $this->DTopen is updated to reflect the newly opened item.
 *
 * @param string $char One of '*', '#', ':' or ';'
 * @return string Close tag, newline and open tag; an error comment for any other character
 */
private function nextItem( $char ) {
	if ( $char === '*' || $char === '#' ) {
		return "</li>\n<li>";
	}
	if ( $char !== ':' && $char !== ';' ) {
		return '<!-- ERR 2 -->';
	}

	// Close whichever definition-list item is open right now.
	$closeTag = $this->DTopen ? "</dt>\n" : "</dd>\n";

	// ';' starts a term, ':' starts a definition.
	$this->DTopen = ( $char === ';' );

	return $closeTag . ( $this->DTopen ? '<dt>' : '<dd>' );
}
160 
/**
 * Close the current list item identified by the prefix character, together
 * with its enclosing list element.
 *
 * @param string $char One of '*', '#' or ':'
 * @return string The closing markup; an error comment for any other character
 */
private function closeList( $char ) {
	if ( $char === '*' ) {
		return "</li></ul>";
	}
	if ( $char === '#' ) {
		return "</li></ol>";
	}
	if ( $char !== ':' ) {
		return '<!-- ERR 3 -->';
	}

	if ( !$this->DTopen ) {
		return "</dd></dl>";
	}

	// A term was still open: close it as a <dt> and clear the flag.
	$this->DTopen = false;
	return "</dt></dl>";
}
184 
/**
 * Execute the pass: walk $this->text line by line and build the output string.
 *
 * For each line this method:
 *  - passes the first line through untouched when $this->lineStart is falsy;
 *  - unless inside <pre>, measures the leading run of list markers (* # : ;) and
 *    opens, continues or closes list elements relative to the previous line's prefix;
 *  - for lines without a prefix, either closes the paragraph on block-level tags,
 *    starts/continues an indent-pre (leading space), or handles <p> wrapping, with
 *    blank lines deferred through $pendingPTag;
 *  - appends the rest of the line (trimmed when it is a list item).
 * After the loop, lists that are still open and any open paragraph are closed.
 *
 * @return string The processed text
 */
189  private function execute() {
190  $text = $this->text;
191  # Parsing through the text line by line. The main thing
192  # happening here is handling of block-level elements p, pre,
193  # and making lists from lines starting with * # : etc.
194  $textLines = StringUtils::explode( "\n", $text );
195 
196  $lastPrefix = $output = '';
197  $this->DTopen = $inBlockElem = false;
198  $prefixLength = 0;
// Either false, or the paragraph markup ('<p>' or '</p><p>') deferred by a blank line.
199  $pendingPTag = false;
200  $inBlockquote = false;
201 
202  for ( $textLines->rewind(); $textLines->valid(); ) {
203  $inputLine = $textLines->current();
// Advance right away so that valid() tells us whether another line follows this one.
204  $textLines->next();
205  $notLastLine = $textLines->valid();
206 
207  # Fix up $lineStart
208  if ( !$this->lineStart ) {
209  $output .= $inputLine;
210  $this->lineStart = true;
211  continue;
212  }
213  # * = ul
214  # # = ol
215  # ; = dt
216  # : = dd
217 
218  $lastPrefixLength = strlen( $lastPrefix );
219  $preCloseMatch = preg_match( '/<\\/pre/i', $inputLine );
220  $preOpenMatch = preg_match( '/<pre/i', $inputLine );
221  # If not in a <pre> element, scan for and figure out what prefixes are there.
222  if ( !$this->inPre ) {
223  # Multiple prefixes may abut each other for nested lists.
224  $prefixLength = strspn( $inputLine, '*#:;' );
225  $prefix = substr( $inputLine, 0, $prefixLength );
226 
227  # eh?
228  # ; and : are both from definition-lists, so they're equivalent
229  # for the purposes of determining whether or not we need to open/close
230  # elements.
231  $prefix2 = str_replace( ';', ':', $prefix );
// $t is the line with its list markers removed.
232  $t = substr( $inputLine, $prefixLength );
233  $this->inPre = (bool)$preOpenMatch;
234  } else {
235  # Don't interpret any other prefixes in preformatted text
236  $prefixLength = 0;
237  $prefix = $prefix2 = '';
238  $t = $inputLine;
239  }
240 
241  # List generation
242  if ( $prefixLength && $lastPrefix === $prefix2 ) {
243  # Same as the last item, so no need to deal with nesting or opening stuff
244  $output .= $this->nextItem( substr( $prefix, -1 ) );
245  $pendingPTag = false;
246 
247  if ( substr( $prefix, -1 ) === ';' ) {
248  # The one nasty exception: definition lists work like this:
249  # ; title : definition text
250  # So we check for : in the remainder text to split up the
251  # title and definition, without b0rking links.
252  $term = $t2 = '';
253  if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) {
254  $t = $t2;
255  // Trim whitespace in list items
256  $output .= trim( $term ) . $this->nextItem( ':' );
257  }
258  }
259  } elseif ( $prefixLength || $lastPrefixLength ) {
260  # We need to open or close prefixes, or both.
261 
262  # Either open or close a level...
263  $commonPrefixLength = $this->getCommon( $prefix, $lastPrefix );
264  $pendingPTag = false;
265 
266  # Close all the prefixes which aren't shared.
267  while ( $commonPrefixLength < $lastPrefixLength ) {
268  $output .= $this->closeList( $lastPrefix[$lastPrefixLength - 1] );
269  --$lastPrefixLength;
270  }
271 
272  # Continue the current prefix if appropriate.
273  if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
274  $output .= $this->nextItem( $prefix[$commonPrefixLength - 1] );
275  }
276 
277  # Close an open <dt> if we have a <dd> (":") starting on this line
278  if ( $this->DTopen && $commonPrefixLength > 0 && $prefix[$commonPrefixLength - 1] === ':' ) {
279  $output .= $this->nextItem( ':' );
280  }
281 
282  # Open prefixes where appropriate.
283  if ( $lastPrefix && $prefixLength > $commonPrefixLength ) {
284  $output .= "\n";
285  }
286  while ( $prefixLength > $commonPrefixLength ) {
287  $char = $prefix[$commonPrefixLength];
288  $output .= $this->openList( $char );
289 
290  if ( $char === ';' ) {
291  # @todo FIXME: This is dupe of code above
292  if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) {
293  $t = $t2;
294  // Trim whitespace in list items
295  $output .= trim( $term ) . $this->nextItem( ':' );
296  }
297  }
298  ++$commonPrefixLength;
299  }
// This line has no prefix but the previous one did: end the list's line.
300  if ( !$prefixLength && $lastPrefix ) {
301  $output .= "\n";
302  }
303  $lastPrefix = $prefix2;
304  }
305 
306  # If we have no prefixes, go to paragraph mode.
307  if ( $prefixLength == 0 ) {
308  # No prefix (not in list)--go to paragraph mode
309  # @todo consider using a stack for nestable elements like span, table and div
310 
311  // P-wrapping and indent-pre are suppressed inside, not outside
312  $blockElems = 'table|h1|h2|h3|h4|h5|h6|pre|p|ul|ol|dl';
313  // P-wrapping and indent-pre are suppressed outside, not inside
314  $antiBlockElems = 'td|th';
315 
316  $openMatch = preg_match(
317  '/<('
318  . "({$blockElems})|\\/({$antiBlockElems})|"
319  // Always suppresses
320  . '\\/?(tr|dt|dd|li)'
321  . ')\\b/iS',
322  $t
323  );
324  $closeMatch = preg_match(
325  '/<('
326  . "\\/({$blockElems})|({$antiBlockElems})|"
327  // Never suppresses
328  . '\\/?(center|blockquote|div|hr|mw:)'
329  . ')\\b/iS',
330  $t
331  );
332 
333  // Any match closes the paragraph, but only when `!$closeMatch`
334  // do we enter block mode. The oddities with table rows and
335  // cells are to avoid paragraph wrapping in interstitial spaces
336  // leading to fostered content.
337 
338  if ( $openMatch || $closeMatch ) {
339  $pendingPTag = false;
340  // Only close the paragraph if we're not inside a <pre> tag, or if
341  // that <pre> tag has just been opened
342  if ( !$this->inPre || $preOpenMatch ) {
343  // @todo T7718: paragraph closed
344  $output .= $this->closeParagraph();
345  }
346  if ( $preOpenMatch && !$preCloseMatch ) {
347  $this->inPre = true;
348  }
// Scan every <blockquote> / </blockquote> tag on the line; the last one
// found decides whether we are inside a blockquote afterwards.
349  $bqOffset = 0;
350  while ( preg_match( '/<(\\/?)blockquote[\s>]/i', $t,
351  $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset )
352  ) {
353  $inBlockquote = !$bqMatch[1][0]; // is this a close tag?
354  $bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] );
355  }
356  $inBlockElem = !$closeMatch;
357  } elseif ( !$inBlockElem && !$this->inPre ) {
358  if ( substr( $t, 0, 1 ) == ' '
359  && ( $this->lastParagraph === 'pre' || trim( $t ) != '' )
360  && !$inBlockquote
361  ) {
362  # pre
363  if ( $this->lastParagraph !== 'pre' ) {
364  $pendingPTag = false;
365  $output .= $this->closeParagraph() . '<pre>';
366  $this->lastParagraph = 'pre';
367  }
// Drop the single leading space that marked the line as preformatted.
368  $t = substr( $t, 1 );
369  } elseif ( preg_match( '/^(?:<style\\b[^>]*>.*?<\\/style>\s*|<link\\b[^>]*>\s*)+$/iS', $t ) ) {
370  # T186965: <style> or <link> by itself on a line shouldn't open or close paragraphs.
371  # But it should clear $pendingPTag.
372  if ( $pendingPTag ) {
373  $output .= $this->closeParagraph();
374  $pendingPTag = false;
375  }
376  } else {
377  # paragraph
378  if ( trim( $t ) === '' ) {
379  if ( $pendingPTag ) {
380  $output .= $pendingPTag . '<br />';
381  $pendingPTag = false;
382  $this->lastParagraph = 'p';
383  } elseif ( $this->lastParagraph !== 'p' ) {
384  $output .= $this->closeParagraph();
385  $pendingPTag = '<p>';
386  } else {
387  $pendingPTag = '</p><p>';
388  }
389  } elseif ( $pendingPTag ) {
390  $output .= $pendingPTag;
391  $pendingPTag = false;
392  $this->lastParagraph = 'p';
393  } elseif ( $this->lastParagraph !== 'p' ) {
394  $output .= $this->closeParagraph() . '<p>';
395  $this->lastParagraph = 'p';
396  }
397  }
398  }
399  }
400  # somewhere above we forget to get out of pre block (T2785)
401  if ( $preCloseMatch && $this->inPre ) {
402  $this->inPre = false;
403  }
// While a paragraph tag is pending (set above only for a blank line), the
// line itself is not emitted.
404  if ( $pendingPTag === false ) {
405  if ( $prefixLength === 0 ) {
406  $output .= $t;
407  // Add a newline if there's an open paragraph
408  // or we've yet to reach the last line.
409  if ( $notLastLine || $this->hasOpenParagraph() ) {
410  $output .= "\n";
411  }
412  } else {
413  // Trim whitespace in list items
414  $output .= trim( $t );
415  }
416  }
417  }
// Close whatever lists the final line left open, innermost first.
418  while ( $prefixLength ) {
419  $output .= $this->closeList( $prefix2[$prefixLength - 1] );
420  --$prefixLength;
421  // Note that a paragraph is only ever opened when `prefixLength`
422  // is zero, but we'll choose to be overly cautious.
423  if ( !$prefixLength && $this->hasOpenParagraph() ) {
424  $output .= "\n";
425  }
426  }
427  $output .= $this->closeParagraph( true );
428  return $output;
429  }
430 
/**
 * Split up a string on the first ':' that is not inside a tag, inside the
 * content of an unclosed element, inside a comment, or inside language
 * converter markup (-{ ... }-).
 *
 * $before and $after are only written when a colon is found.
 *
 * @param string $str The string to split
 * @param string &$before Set to everything before the ':'
 * @param string &$after Set to everything after the ':'
 * @throws MWException If the state machine reaches an unknown state
 * @return int|false The position of the ':', or false if none is found
 */
441  private function findColonNoLinks( $str, &$before, &$after ) {
442  if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) {
443  # Nothing to find!
444  return false;
445  }
446 
447  if ( $m[0][0] === ':' ) {
448  # Easy; no tag nesting to worry about
449  $colonPos = $m[0][1];
450  $before = substr( $str, 0, $colonPos );
451  $after = substr( $str, $colonPos + 1 );
452  return $colonPos;
453  }
454 
455  # Ugly state machine to walk through avoiding tags.
456  $state = self::COLON_STATE_TEXT;
// $ltLevel counts opening tags seen that have not yet been matched by a close tag.
457  $ltLevel = 0;
// $lcLevel is the nesting depth of -{ ... }- markup.
458  $lcLevel = 0;
459  $len = strlen( $str );
// Start at the first interesting character found above ('<' or '-{').
460  for ( $i = $m[0][1]; $i < $len; $i++ ) {
461  $c = $str[$i];
462 
463  switch ( $state ) {
464  case self::COLON_STATE_TEXT:
465  switch ( $c ) {
466  case "<":
467  # Could be either a <start> tag or an </end> tag
468  $state = self::COLON_STATE_TAGSTART;
469  break;
470  case ":":
471  if ( $ltLevel === 0 ) {
472  # We found it!
473  $before = substr( $str, 0, $i );
474  $after = substr( $str, $i + 1 );
475  return $i;
476  }
477  # Embedded in a tag; don't break it.
478  break;
479  default:
480  # Skip ahead looking for something interesting
481  if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
482  # Nothing else interesting
483  return false;
484  }
485  if ( $m[0][0] === '-{' ) {
486  $state = self::COLON_STATE_LC;
487  $lcLevel++;
// Point at the '{'; the loop's $i++ then moves past the two-character opener.
488  $i = $m[0][1] + 1;
489  } else {
490  # Skip ahead to next interesting character.
// One short of the match, so the loop's $i++ lands exactly on it.
491  $i = $m[0][1] - 1;
492  }
493  break;
494  }
495  break;
496  case self::COLON_STATE_LC:
497  # In language converter markup -{ ... }-
498  if ( !preg_match( '/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
499  # Nothing else interesting to find; abort!
500  # We're nested in language converter markup, but there
501  # are no close tags left. Abort!
502  break 2;
503  } elseif ( $m[0][0] === '-{' ) {
504  $i = $m[0][1] + 1;
505  $lcLevel++;
506  } elseif ( $m[0][0] === '}-' ) {
507  $i = $m[0][1] + 1;
508  $lcLevel--;
509  if ( $lcLevel === 0 ) {
510  $state = self::COLON_STATE_TEXT;
511  }
512  }
513  break;
514  case self::COLON_STATE_TAG:
515  # In a <tag>
516  switch ( $c ) {
517  case ">":
518  $ltLevel++;
519  $state = self::COLON_STATE_TEXT;
520  break;
521  case "/":
522  # Slash may be followed by >?
523  $state = self::COLON_STATE_TAGSLASH;
524  break;
525  default:
526  # ignore
527  }
528  break;
529  case self::COLON_STATE_TAGSTART:
530  switch ( $c ) {
531  case "/":
532  $state = self::COLON_STATE_CLOSETAG;
533  break;
534  case "!":
535  $state = self::COLON_STATE_COMMENT;
536  break;
537  case ">":
538  # Illegal early close? This shouldn't happen D:
539  $state = self::COLON_STATE_TEXT;
540  break;
541  default:
542  $state = self::COLON_STATE_TAG;
543  }
544  break;
545  case self::COLON_STATE_CLOSETAG:
546  # In a </tag>
547  if ( $c === ">" ) {
548  if ( $ltLevel > 0 ) {
549  $ltLevel--;
550  } else {
551  # ignore the excess close tag, but keep looking for
552  # colons. (This matches Parsoid behavior.)
553  wfDebug( __METHOD__ . ": Invalid input; too many close tags\n" );
554  }
555  $state = self::COLON_STATE_TEXT;
556  }
557  break;
558  case self::COLON_STATE_TAGSLASH:
559  if ( $c === ">" ) {
560  # Yes, a self-closed tag <blah/>
561  $state = self::COLON_STATE_TEXT;
562  } else {
563  # Probably we're jumping the gun, and this is an attribute
564  $state = self::COLON_STATE_TAG;
565  }
566  break;
567  case self::COLON_STATE_COMMENT:
568  if ( $c === "-" ) {
569  $state = self::COLON_STATE_COMMENTDASH;
570  }
571  break;
572  case self::COLON_STATE_COMMENTDASH:
573  if ( $c === "-" ) {
574  $state = self::COLON_STATE_COMMENTDASHDASH;
575  } else {
576  $state = self::COLON_STATE_COMMENT;
577  }
578  break;
579  case self::COLON_STATE_COMMENTDASHDASH:
580  if ( $c === ">" ) {
581  $state = self::COLON_STATE_TEXT;
582  } else {
583  $state = self::COLON_STATE_COMMENT;
584  }
585  break;
586  default:
587  throw new MWException( "State machine error in " . __METHOD__ );
588  }
589  }
// No usable colon was found. Both paths below return false; the check only
// adds a debug log entry when tags or -{ }- markup were left unclosed.
590  if ( $ltLevel > 0 || $lcLevel > 0 ) {
591  wfDebug(
592  __METHOD__ . ": Invalid input; not enough close tags " .
593  "(level $ltLevel/$lcLevel, state $state)\n"
594  );
595  return false;
596  }
597  return false;
598  }
599 }
const COLON_STATE_TAGSTART
const COLON_STATE_COMMENTDASH
const COLON_STATE_COMMENT
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
openList( $char)
Open the list item element identified by the prefix character.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
__construct( $text, $lineStart)
static doBlockLevels( $text, $lineStart)
Make lists from lines starting with ':', '*', '#', etc.
execute()
Execute the pass.
const COLON_STATE_COMMENTDASHDASH
nextItem( $char)
Close the current list item and open the next one.
closeList( $char)
Close the current list item identified by the prefix character.
findColonNoLinks( $str, &$before, &$after)
Split up a string on ':', ignoring any occurrences inside tags to prevent illegal overlapping.
getCommon( $st1, $st2)
getCommon() returns the length of the longest common substring of both arguments, starting at the beginning of both (i.e. the length of their common prefix).
closeParagraph( $atTheEnd=false)
If a pre or p is open, return the corresponding close tag and update the state.
const COLON_STATE_TAGSLASH
const COLON_STATE_CLOSETAG