MediaWiki  master
RemexCompatFormatter.php
Go to the documentation of this file.
1 <?php
2 
3 namespace MediaWiki\Tidy;
4 
6 use Wikimedia\RemexHtml\HTMLData;
7 use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
8 use Wikimedia\RemexHtml\Serializer\SerializerNode;
9 
/**
 * HtmlFormatter subclass that customises how RemexHtml serializes a tree:
 * it adjusts the escape tables, normalizes character references in text and
 * attribute values, unwraps or emits paragraph wrappers, marks certain empty
 * elements with a CSS class, and writes void elements in " />" form.
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Element names that are given class="mw-empty-elt" when they have no
	 * attributes and their serialized contents are empty or whitespace only.
	 * The names are the keys; the values are unused.
	 */
	private static $markedEmptyElements = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/**
	 * Optional callback applied to serialized text that is not inside an HTML
	 * raw text element. Receives the text and returns the replacement text.
	 *
	 * @var callable|null
	 */
	private $textProcessor;

	/**
	 * @param array $options Passed unchanged to the parent constructor. If the
	 *   'textProcessor' key is set, its value is used as the text callback.
	 */
	public function __construct( $options = [] ) {
		parent::__construct( $options );

		// In both escape tables: write U+00A0 as a numeric character reference
		// and stop escaping "&". Character references are cleaned up afterwards
		// by Sanitizer::normalizeCharReferences() in characters() and element().
		$this->attributeEscapes["\u{00A0}"] = '&#160;';
		unset( $this->attributeEscapes["&"] );
		$this->textEscapes["\u{00A0}"] = '&#160;';
		unset( $this->textEscapes["&"] );

		$this->textProcessor = $options['textProcessor'] ?? null;
	}

	/**
	 * Emit nothing at the start of the document.
	 *
	 * @param string|null $fragmentNamespace Unused
	 * @param string|null $fragmentName Unused
	 * @return string Always the empty string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * Serialize a run of text.
	 *
	 * The parent implementation produces the initial string. Unless the parent
	 * node is an HTML-namespace element listed in $this->rawTextElements, the
	 * optional text processor is then applied. Finally, character references
	 * are normalized.
	 *
	 * @param SerializerNode $parent Node that contains the text
	 * @param string $text Buffer holding the text
	 * @param int $start Forwarded to the parent implementation
	 * @param int $length Forwarded to the parent implementation
	 * @return string
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = parent::characters( $parent, $text, $start, $length );

		$inRawTextElement = $parent->namespace === HTMLData::NS_HTML
			&& isset( $this->rawTextElements[$parent->name] );
		if ( !$inRawTextElement && $this->textProcessor !== null ) {
			$text = ( $this->textProcessor )( $text );
		}

		// Ensure a consistent representation for all entities.
		// The class name is fully qualified because this file is namespaced
		// and Sanitizer is referenced here as a global class.
		return \Sanitizer::normalizeCharReferences( $text );
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * @param SerializerNode $parent Parent of the element (unused here)
	 * @param SerializerNode $node The element being serialized
	 * @param string|null $contents Serialized children. NOTE(review): presumably
	 *   null for void elements, as in the parent class; confirm against RemexHtml.
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		// Paragraph wrappers: emit a real <p> only if the wrapper recorded at
		// least one non-blank node, otherwise output the contents bare.
		$data = $node->snData;
		if ( $data && $data->isPWrapper ) {
			return $data->nonblankNodeCount ? "<p>$contents</p>" : $contents;
		}

		$name = $node->name;
		$attrs = $node->attrs;

		// Attribute-less li/p/tr elements whose contents consist solely of
		// tab, LF, FF, CR or space (or are empty) get a marker class.
		if ( isset( self::$markedEmptyElements[$name] ) && $attrs->count() === 0
			&& strspn( $contents, "\t\n\f\r " ) === strlen( $contents )
		) {
			return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
		}

		$s = "<$name";
		foreach ( $attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			// Fully qualified for the same reason as in characters().
			$encValue = \Sanitizer::normalizeCharReferences( $encValue );
			$s .= " $attrName=\"$encValue\"";
		}

		$isHtml = $node->namespace === HTMLData::NS_HTML;

		// Void HTML elements are written in self-closing form with no end tag.
		if ( $isHtml && isset( $this->voidElements[$name] ) ) {
			return $s . ' />';
		}

		$s .= '>';
		// For elements listed in $this->prefixLfElements, a leading newline in
		// the contents is written twice. NOTE(review): presumably because an
		// HTML parser drops the first newline after such a start tag; confirm
		// against the parent class.
		if ( $isHtml
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$name] )
		) {
			$s .= "\n";
		}

		return "$s$contents</$name>";
	}
}
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:46
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1262
startDocument( $fragmentNamespace, $fragmentName)
element(SerializerNode $parent, SerializerNode $node, $contents)
characters(SerializerNode $parent, $text, $start, $length)