MediaWiki  master
BlockLevelPass.php
Go to the documentation of this file.
1 <?php
2 
/** Set to true when a definition term (<dt>) is opened and back to false when a <dd> follows or the list is closed. */
26  private $DTopen = false;
/** Tracks whether the current line is inside a <pre> element; closeParagraph() resets it to false. */
27  private $inPre = false;
/** Tag name of the paragraph-level element currently open ('p' or 'pre'); '' when none is open. */
28  private $lastParagraph = '';
/** Constructor argument; when false, execute() copies the first line through unprocessed and then sets this to true. */
29  private $lineStart;
/** The input text handed to the constructor; execute() splits it on "\n". */
30  private $text;
31 
# State constants for the definition list colon extraction.
# Each value names one state of the scanner in findColonNoLinks().

/** Plain text, outside any tag, comment or language-converter markup. */
const COLON_STATE_TEXT = 0;
/** Inside an opening <tag ...>. */
const COLON_STATE_TAG = 1;
/** Just saw '<'; the next character decides between tag, close tag and comment. */
const COLON_STATE_TAGSTART = 2;
/** Inside a closing </tag>. */
const COLON_STATE_CLOSETAG = 3;
/** Saw '/' inside a tag; a following '>' makes it self-closing. */
const COLON_STATE_TAGSLASH = 4;
/** Inside a comment, after '<!'. */
const COLON_STATE_COMMENT = 5;
/** Inside a comment, one '-' seen. */
const COLON_STATE_COMMENTDASH = 6;
/** Inside a comment, '--' seen; a following '>' ends the comment. */
const COLON_STATE_COMMENTDASHDASH = 7;
/** Inside language converter markup -{ ... }-. */
const COLON_STATE_LC = 8;
42 
/**
 * Make lists from lines starting with ':', '*', '#', etc.
 *
 * Convenience entry point: builds a single-use pass object for the given
 * input and runs it straight away.
 *
 * @param string $text Text to process
 * @param bool $lineStart Whether $text begins at the start of a line; when
 *   false, execute() copies the first line to the output unprocessed
 * @return string The output built by execute()
 */
public static function doBlockLevels( $text, $lineStart ) {
	return ( new self( $text, $lineStart ) )->execute();
}
54 
/**
 * Store the input for a later call to execute().
 *
 * Private: instances are only created through doBlockLevels().
 *
 * @param string $text Text to process
 * @param bool $lineStart Whether $text begins at the start of a line
 */
private function __construct( $text, $lineStart ) {
	$this->lineStart = $lineStart;
	$this->text = $text;
}
63 
/**
 * Tell whether a paragraph-level element is currently open.
 *
 * @return bool True when $this->lastParagraph holds a tag name, false
 *   when it is the empty string
 */
private function hasOpenParagraph() {
	$noneOpen = ( $this->lastParagraph === '' );
	return !$noneOpen;
}
70 
/**
 * If a pre or p is open, return the corresponding close tag and update
 * the state.
 *
 * The paragraph and <pre> state is reset on every call, whether or not
 * anything was open.
 *
 * @param bool $atTheEnd When true, the close tag is returned without a
 *   trailing newline
 * @return string The close tag (plus "\n" unless $atTheEnd), or '' when
 *   nothing was open
 */
private function closeParagraph( $atTheEnd = false ) {
	$closing = '';
	if ( $this->hasOpenParagraph() ) {
		$closing = '</' . $this->lastParagraph . '>' . ( $atTheEnd ? '' : "\n" );
	}

	$this->lastParagraph = '';
	$this->inPre = false;

	return $closing;
}
89 
/**
 * Return the length of the prefix that both arguments share, i.e. the
 * number of leading bytes that are identical in $st1 and $st2.
 *
 * @param string $st1
 * @param string $st2
 * @return int Number of matching leading bytes; 0 when either string is
 *   empty or the first bytes differ
 */
private function getCommon( $st1, $st2 ) {
	$limit = min( strlen( $st1 ), strlen( $st2 ) );

	$matched = 0;
	while ( $matched < $limit && $st1[$matched] === $st2[$matched] ) {
		++$matched;
	}

	return $matched;
}
109 
/**
 * Open the list item element identified by the prefix character.
 *
 * Any open paragraph is closed first. Opening a ';' item also records
 * that a <dt> is open.
 *
 * @param string $char One of '*', '#', ':' or ';'
 * @return string The paragraph close tag (if any) followed by the list
 *   opening tags; for an unrecognised character only the marker
 *   '<!-- ERR 1 -->' is returned
 */
private function openList( $char ) {
	static $openers = [
		'*' => "<ul><li>",
		'#' => "<ol><li>",
		':' => "<dl><dd>",
		';' => "<dl><dt>",
	];

	// The paragraph state is always reset, even on the error path below.
	$closedParagraph = $this->closeParagraph();

	if ( !is_string( $char ) || !isset( $openers[$char] ) ) {
		// The error marker replaces, rather than follows, the close tag.
		return '<!-- ERR 1 -->';
	}

	if ( $char === ';' ) {
		$this->DTopen = true;
	}

	return $closedParagraph . $openers[$char];
}
135 
/**
 * Close the current list item and open the next one.
 *
 * For definition lists the close tag depends on whether a <dt> is open,
 * and the open-<dt> flag is updated to match the item being started.
 *
 * @param string $char One of '*', '#', ':' or ';'
 * @return string The closing and opening tags, or '<!-- ERR 2 -->' for
 *   an unrecognised character
 */
private function nextItem( $char ) {
	if ( in_array( $char, [ '*', '#' ], true ) ) {
		return "</li>\n<li>";
	}

	if ( !in_array( $char, [ ':', ';' ], true ) ) {
		return '<!-- ERR 2 -->';
	}

	// Close whichever definition-list element is open right now.
	$closing = $this->DTopen ? "</dt>\n" : "</dd>\n";

	$startsTerm = ( $char === ';' );
	$this->DTopen = $startsTerm;

	return $closing . ( $startsTerm ? '<dt>' : '<dd>' );
}
160 
/**
 * Close the current list item identified by the prefix character.
 *
 * Only '*', '#' and ':' are handled; any other character, including
 * ';', produces the error marker.
 *
 * @param string $char One of '*', '#' or ':'
 * @return string The closing tags, or '<!-- ERR 3 -->' for any other
 *   character
 */
private function closeList( $char ) {
	if ( $char === '*' ) {
		return "</li></ul>";
	}
	if ( $char === '#' ) {
		return "</li></ol>";
	}
	if ( $char !== ':' ) {
		return '<!-- ERR 3 -->';
	}

	if ( !$this->DTopen ) {
		return "</dd></dl>";
	}

	// A term was still open: close it instead of a <dd> and clear the flag.
	$this->DTopen = false;
	return "</dt></dl>";
}
184 
/**
 * Execute the pass.
 *
 * Walks $this->text one line at a time and appends to a single output
 * string. Lines starting with any of '*', '#', ':' or ';' are turned into
 * list markup through openList(), nextItem() and closeList(). Lines
 * without such a prefix go through paragraph handling: depending on the
 * tags found on the line and on the state carried over from earlier
 * lines, they are left alone, wrapped in <p>, or (when starting with a
 * space) wrapped in <pre>. Lists and paragraphs still open after the last
 * line are closed before returning.
 *
 * @return string The processed text
 */
189  private function execute() {
190  $text = $this->text;
191  # Parsing through the text line by line. The main thing
192  # happening here is handling of block-level elements p, pre,
193  # and making lists from lines starting with * # : etc.
194  $textLines = StringUtils::explode( "\n", $text );
195 
196  $lastPrefix = $output = '';
197  $this->DTopen = $inBlockElem = false;
198  $prefixLength = 0;
// $pendingPTag is either false or the paragraph tag text ('<p>' or
// '</p><p>') held back after a blank line until the next line decides
// what to do with it.
199  $pendingPTag = false;
200  $inBlockquote = false;
201 
// The iterator is advanced immediately so that $notLastLine can report
// whether another line follows the one being processed.
202  for ( $textLines->rewind(); $textLines->valid(); ) {
203  $inputLine = $textLines->current();
204  $textLines->next();
205  $notLastLine = $textLines->valid();
206 
207  # Fix up $lineStart
// When the text does not begin at a line start, the first line is copied
// through untouched and normal processing begins with the next line.
208  if ( !$this->lineStart ) {
209  $output .= $inputLine;
210  $this->lineStart = true;
211  continue;
212  }
213  # * = ul
214  # # = ol
215  # ; = dt
216  # : = dd
217 
218  $lastPrefixLength = strlen( $lastPrefix );
219  $preCloseMatch = preg_match( '/<\\/pre/i', $inputLine );
220  $preOpenMatch = preg_match( '/<pre/i', $inputLine );
221  # If not in a <pre> element, scan for and figure out what prefixes are there.
222  if ( !$this->inPre ) {
223  # Multiple prefixes may abut each other for nested lists.
224  $prefixLength = strspn( $inputLine, '*#:;' );
225  $prefix = substr( $inputLine, 0, $prefixLength );
226 
227  # eh?
228  # ; and : are both from definition-lists, so they're equivalent
229  # for the purposes of determining whether or not we need to open/close
230  # elements.
231  $prefix2 = str_replace( ';', ':', $prefix );
232  $t = substr( $inputLine, $prefixLength );
233  $this->inPre = (bool)$preOpenMatch;
234  } else {
235  # Don't interpret any other prefixes in preformatted text
236  $prefixLength = 0;
237  $prefix = $prefix2 = '';
238  $t = $inputLine;
239  }
240 
241  # List generation
// $lastPrefix is only ever assigned from $prefix2, so it holds the
// previous prefix with every ';' already replaced by ':'.
242  if ( $prefixLength && $lastPrefix === $prefix2 ) {
243  # Same as the last item, so no need to deal with nesting or opening stuff
244  $output .= $this->nextItem( substr( $prefix, -1 ) );
245  $pendingPTag = false;
246 
247  if ( substr( $prefix, -1 ) === ';' ) {
248  # The one nasty exception: definition lists work like this:
249  # ; title : definition text
250  # So we check for : in the remainder text to split up the
251  # title and definition, without b0rking links.
252  $term = $t2 = '';
253  if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) {
254  $t = $t2;
255  // Trim whitespace in list items
256  $output .= trim( $term ) . $this->nextItem( ':' );
257  }
258  }
259  } elseif ( $prefixLength || $lastPrefixLength ) {
260  # We need to open or close prefixes, or both.
261 
262  # Either open or close a level...
263  $commonPrefixLength = $this->getCommon( $prefix, $lastPrefix );
264  $pendingPTag = false;
265 
266  # Close all the prefixes which aren't shared.
267  while ( $commonPrefixLength < $lastPrefixLength ) {
268  // @phan-suppress-next-line PhanTypeInvalidDimOffset
269  $output .= $this->closeList( $lastPrefix[$lastPrefixLength - 1] );
270  --$lastPrefixLength;
271  }
272 
273  # Continue the current prefix if appropriate.
274  if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
275  $output .= $this->nextItem( $prefix[$commonPrefixLength - 1] );
276  }
277 
278  # Close an open <dt> if we have a <dd> (":") starting on this line
279  if ( $this->DTopen && $commonPrefixLength > 0 && $prefix[$commonPrefixLength - 1] === ':' ) {
280  $output .= $this->nextItem( ':' );
281  }
282 
283  # Open prefixes where appropriate.
284  if ( $lastPrefix && $prefixLength > $commonPrefixLength ) {
285  $output .= "\n";
286  }
287  while ( $prefixLength > $commonPrefixLength ) {
288  $char = $prefix[$commonPrefixLength];
289  $output .= $this->openList( $char );
290 
291  if ( $char === ';' ) {
292  # @todo FIXME: This is dupe of code above
293  if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) {
294  $t = $t2;
295  // Trim whitespace in list items
296  $output .= trim( $term ) . $this->nextItem( ':' );
297  }
298  }
299  ++$commonPrefixLength;
300  }
301  if ( !$prefixLength && $lastPrefix ) {
302  $output .= "\n";
303  }
304  $lastPrefix = $prefix2;
305  }
306 
307  # If we have no prefixes, go to paragraph mode.
308  if ( $prefixLength == 0 ) {
309  # No prefix (not in list)--go to paragraph mode
310  # @todo consider using a stack for nestable elements like span, table and div
311 
312  // P-wrapping and indent-pre are suppressed inside, not outside
313  $blockElems = 'table|h1|h2|h3|h4|h5|h6|pre|p|ul|ol|dl';
314  // P-wrapping and indent-pre are suppressed outside, not inside
315  $antiBlockElems = 'td|th';
316 
317  $openMatch = preg_match(
318  '/<('
319  . "({$blockElems})|\\/({$antiBlockElems})|"
320  // Always suppresses
321  . '\\/?(tr|dt|dd|li)'
322  . ')\\b/iS',
323  $t
324  );
325  $closeMatch = preg_match(
326  '/<('
327  . "\\/({$blockElems})|({$antiBlockElems})|"
328  // Never suppresses
329  . '\\/?(center|blockquote|div|hr|mw:)'
330  . ')\\b/iS',
331  $t
332  );
333 
334  // Any match closes the paragraph, but only when `!$closeMatch`
335  // do we enter block mode. The oddities with table rows and
336  // cells are to avoid paragraph wrapping in interstitial spaces
337  // leading to fostered content.
338 
339  if ( $openMatch || $closeMatch ) {
340  $pendingPTag = false;
341  // Only close the paragraph if we're not inside a <pre> tag, or if
342  // that <pre> tag has just been opened
343  if ( !$this->inPre || $preOpenMatch ) {
344  // @todo T7718: paragraph closed
345  $output .= $this->closeParagraph();
346  }
347  if ( $preOpenMatch && !$preCloseMatch ) {
348  $this->inPre = true;
349  }
// Scan every <blockquote> / </blockquote> tag on the line; the last one
// found decides whether we are inside a blockquote afterwards.
350  $bqOffset = 0;
351  while ( preg_match( '/<(\\/?)blockquote[\s>]/i', $t,
352  $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset )
353  ) {
354  $inBlockquote = !$bqMatch[1][0]; // is this a close tag?
355  $bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] );
356  }
357  $inBlockElem = !$closeMatch;
358  } elseif ( !$inBlockElem && !$this->inPre ) {
359  if ( substr( $t, 0, 1 ) == ' '
360  && ( $this->lastParagraph === 'pre' || trim( $t ) != '' )
361  && !$inBlockquote
362  ) {
363  # pre
364  if ( $this->lastParagraph !== 'pre' ) {
365  $pendingPTag = false;
366  $output .= $this->closeParagraph() . '<pre>';
367  $this->lastParagraph = 'pre';
368  }
369  $t = substr( $t, 1 );
370  } elseif ( preg_match( '/^(?:<style\\b[^>]*>.*?<\\/style>\s*|<link\\b[^>]*>\s*)+$/iS', $t ) ) {
371  # T186965: <style> or <link> by itself on a line shouldn't open or close paragraphs.
372  # But it should clear $pendingPTag.
373  if ( $pendingPTag ) {
374  $output .= $this->closeParagraph();
375  $pendingPTag = false;
376  }
377  } else {
378  # paragraph
379  if ( trim( $t ) === '' ) {
380  if ( $pendingPTag ) {
381  $output .= $pendingPTag . '<br />';
382  $pendingPTag = false;
383  $this->lastParagraph = 'p';
384  } elseif ( $this->lastParagraph !== 'p' ) {
385  $output .= $this->closeParagraph();
386  $pendingPTag = '<p>';
387  } else {
388  $pendingPTag = '</p><p>';
389  }
390  } elseif ( $pendingPTag ) {
391  $output .= $pendingPTag;
392  $pendingPTag = false;
393  $this->lastParagraph = 'p';
394  } elseif ( $this->lastParagraph !== 'p' ) {
395  $output .= $this->closeParagraph() . '<p>';
396  $this->lastParagraph = 'p';
397  }
398  }
399  }
400  }
401  # somewhere above we forget to get out of pre block (T2785)
402  if ( $preCloseMatch && $this->inPre ) {
403  $this->inPre = false;
404  }
// The line's text is only emitted when no paragraph tag is being held
// back; a blank line that set $pendingPTag therefore outputs nothing here.
405  if ( $pendingPTag === false ) {
406  if ( $prefixLength === 0 ) {
407  $output .= $t;
408  // Add a newline if there's an open paragraph
409  // or we've yet to reach the last line.
410  if ( $notLastLine || $this->hasOpenParagraph() ) {
411  $output .= "\n";
412  }
413  } else {
414  // Trim whitespace in list items
415  $output .= trim( $t );
416  }
417  }
418  }
// Close whatever list levels the final line left open, innermost first.
419  while ( $prefixLength ) {
420  $output .= $this->closeList( $prefix2[$prefixLength - 1] );
421  --$prefixLength;
422  // Note that a paragraph is only ever opened when `prefixLength`
423  // is zero, but we'll choose to be overly cautious.
424  if ( !$prefixLength && $this->hasOpenParagraph() ) {
425  $output .= "\n";
426  }
427  }
428  $output .= $this->closeParagraph( true );
429  return $output;
430  }
431 
/**
 * Split up a string on ':', ignoring any occurrences inside tags
 * to prevent illegal overlapping.
 *
 * A colon is also skipped when it sits inside an HTML comment or inside
 * language converter markup -{ ... }-. When a splitting colon is found,
 * $before and $after receive the text on either side of it (the colon
 * itself is dropped); otherwise they are left untouched.
 *
 * @param string $str The string to split
 * @param string &$before Set to everything before the ':'
 * @param string &$after Set to everything after the ':'
 * @throws MWException If the scanner ends up in an unknown state
 * @return int|false The byte position of the ':', or false if none found
 */
private function findColonNoLinks( $str, &$before, &$after ) {
	if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) {
		# Nothing to find!
		return false;
	}

	if ( $m[0][0] === ':' ) {
		# Easy; no tag nesting to worry about
		$colonPos = $m[0][1];
		$before = substr( $str, 0, $colonPos );
		$after = substr( $str, $colonPos + 1 );
		return $colonPos;
	}

	# Ugly state machine to walk through avoiding tags.
	$state = self::COLON_STATE_TEXT;
	# Nesting depth of open tags and of -{ ... }- blocks respectively.
	$ltLevel = 0;
	$lcLevel = 0;
	$len = strlen( $str );
	for ( $i = $m[0][1]; $i < $len; $i++ ) {
		$c = $str[$i];

		switch ( $state ) {
			case self::COLON_STATE_TEXT:
				switch ( $c ) {
					case "<":
						# Could be either a <start> tag or an </end> tag
						$state = self::COLON_STATE_TAGSTART;
						break;
					case ":":
						if ( $ltLevel === 0 ) {
							# We found it!
							$before = substr( $str, 0, $i );
							$after = substr( $str, $i + 1 );
							return $i;
						}
						# Embedded in a tag; don't break it.
						break;
					default:
						# Skip ahead looking for something interesting
						if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
							# Nothing else interesting
							return false;
						}
						if ( $m[0][0] === '-{' ) {
							$state = self::COLON_STATE_LC;
							$lcLevel++;
							# Land on the '{'; the loop increment steps past it.
							$i = $m[0][1] + 1;
						} else {
							# Skip ahead to next interesting character.
							# The loop increment then lands exactly on it.
							$i = $m[0][1] - 1;
						}
						break;
				}
				break;
			case self::COLON_STATE_LC:
				# In language converter markup -{ ... }-
				if ( !preg_match( '/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
					# Nothing else interesting to find; abort!
					# We're nested in language converter markup, but there
					# are no close tags left. Abort!
					break 2;
				} elseif ( $m[0][0] === '-{' ) {
					$i = $m[0][1] + 1;
					$lcLevel++;
				} elseif ( $m[0][0] === '}-' ) {
					$i = $m[0][1] + 1;
					$lcLevel--;
					if ( $lcLevel === 0 ) {
						$state = self::COLON_STATE_TEXT;
					}
				}
				break;
			case self::COLON_STATE_TAG:
				# In a <tag>
				switch ( $c ) {
					case ">":
						$ltLevel++;
						$state = self::COLON_STATE_TEXT;
						break;
					case "/":
						# Slash may be followed by >?
						$state = self::COLON_STATE_TAGSLASH;
						break;
					default:
						# ignore
				}
				break;
			case self::COLON_STATE_TAGSTART:
				# Just after '<': decide what kind of construct this is
				switch ( $c ) {
					case "/":
						$state = self::COLON_STATE_CLOSETAG;
						break;
					case "!":
						$state = self::COLON_STATE_COMMENT;
						break;
					case ">":
						# Illegal early close? This shouldn't happen D:
						$state = self::COLON_STATE_TEXT;
						break;
					default:
						$state = self::COLON_STATE_TAG;
				}
				break;
			case self::COLON_STATE_CLOSETAG:
				# In a </tag>
				if ( $c === ">" ) {
					if ( $ltLevel > 0 ) {
						$ltLevel--;
					} else {
						# ignore the excess close tag, but keep looking for
						# colons. (This matches Parsoid behavior.)
						wfDebug( __METHOD__ . ": Invalid input; too many close tags\n" );
					}
					$state = self::COLON_STATE_TEXT;
				}
				break;
			case self::COLON_STATE_TAGSLASH:
				if ( $c === ">" ) {
					# Yes, a self-closed tag <blah/>
					$state = self::COLON_STATE_TEXT;
				} else {
					# Probably we're jumping the gun, and this is an attribute
					$state = self::COLON_STATE_TAG;
				}
				break;
			case self::COLON_STATE_COMMENT:
				if ( $c === "-" ) {
					$state = self::COLON_STATE_COMMENTDASH;
				}
				break;
			case self::COLON_STATE_COMMENTDASH:
				if ( $c === "-" ) {
					$state = self::COLON_STATE_COMMENTDASHDASH;
				} else {
					$state = self::COLON_STATE_COMMENT;
				}
				break;
			case self::COLON_STATE_COMMENTDASHDASH:
				if ( $c === ">" ) {
					$state = self::COLON_STATE_TEXT;
				} else {
					$state = self::COLON_STATE_COMMENT;
				}
				break;
			default:
				throw new MWException( "State machine error in " . __METHOD__ );
		}
	}

	# Ran off the end of the string without finding a usable colon.
	if ( $ltLevel > 0 || $lcLevel > 0 ) {
		wfDebug(
			__METHOD__ . ": Invalid input; not enough close tags " .
			"(level $ltLevel/$lcLevel, state $state)\n"
		);
	}
	return false;
}
600 }
BlockLevelPass
Definition: BlockLevelPass.php:25
BlockLevelPass\COLON_STATE_COMMENT
const COLON_STATE_COMMENT
Definition: BlockLevelPass.php:38
BlockLevelPass\openList
openList( $char)
Open the list item element identified by the prefix character.
Definition: BlockLevelPass.php:117
BlockLevelPass\COLON_STATE_CLOSETAG
const COLON_STATE_CLOSETAG
Definition: BlockLevelPass.php:36
BlockLevelPass\getCommon
getCommon( $st1, $st2)
getCommon() returns the length of the longest common substring of both arguments, starting at the beginning of both.
Definition: BlockLevelPass.php:99
BlockLevelPass\$lastParagraph
$lastParagraph
Definition: BlockLevelPass.php:28
BlockLevelPass\execute
execute()
Execute the pass.
Definition: BlockLevelPass.php:189
MWException
MediaWiki exception.
Definition: MWException.php:26
BlockLevelPass\$DTopen
$DTopen
Definition: BlockLevelPass.php:26
BlockLevelPass\$lineStart
$lineStart
Definition: BlockLevelPass.php:29
BlockLevelPass\doBlockLevels
static doBlockLevels( $text, $lineStart)
Make lists from lines starting with ':', '*', '#', etc.
Definition: BlockLevelPass.php:50
StringUtils\explode
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
Definition: StringUtils.php:356
BlockLevelPass\closeList
closeList( $char)
Close the current list item identified by the prefix character.
Definition: BlockLevelPass.php:167
BlockLevelPass\hasOpenParagraph
hasOpenParagraph()
Definition: BlockLevelPass.php:67
BlockLevelPass\$inPre
$inPre
Definition: BlockLevelPass.php:27
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:913
BlockLevelPass\$text
$text
Definition: BlockLevelPass.php:30
BlockLevelPass\COLON_STATE_TAGSLASH
const COLON_STATE_TAGSLASH
Definition: BlockLevelPass.php:37
BlockLevelPass\COLON_STATE_TAG
const COLON_STATE_TAG
Definition: BlockLevelPass.php:34
BlockLevelPass\__construct
__construct( $text, $lineStart)
Definition: BlockLevelPass.php:59
BlockLevelPass\COLON_STATE_LC
const COLON_STATE_LC
Definition: BlockLevelPass.php:41
BlockLevelPass\findColonNoLinks
findColonNoLinks( $str, &$before, &$after)
Split up a string on ':', ignoring any occurrences inside tags to prevent illegal overlapping.
Definition: BlockLevelPass.php:442
BlockLevelPass\closeParagraph
closeParagraph( $atTheEnd=false)
If a pre or p is open, return the corresponding close tag and update the state.
Definition: BlockLevelPass.php:77
BlockLevelPass\COLON_STATE_TAGSTART
const COLON_STATE_TAGSTART
Definition: BlockLevelPass.php:35
$t
$t
Definition: testCompression.php:71
BlockLevelPass\COLON_STATE_TEXT
const COLON_STATE_TEXT
Definition: BlockLevelPass.php:33
BlockLevelPass\COLON_STATE_COMMENTDASH
const COLON_STATE_COMMENTDASH
Definition: BlockLevelPass.php:39
BlockLevelPass\COLON_STATE_COMMENTDASHDASH
const COLON_STATE_COMMENTDASHDASH
Definition: BlockLevelPass.php:40
BlockLevelPass\nextItem
nextItem( $char)
Close the current list item and open the next one.
Definition: BlockLevelPass.php:142