MediaWiki master
RemexCompatFormatter.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Tidy;
4
6use Wikimedia\RemexHtml\HTMLData;
7use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
8use Wikimedia\RemexHtml\Serializer\SerializerNode;
9
/**
 * Serializer formatter that customises the output of the parent HtmlFormatter.
 *
 * Differences from the parent that are visible in this class:
 *  - U+00A0 is written as "&#160;" and U+0338 as "&#x338;".
 *  - "&" is never escaped, because incoming text is expected to already
 *    contain character references (see 'ignoreCharRefs').
 *  - Character references in text and attribute values are passed through
 *    Sanitizer::normalizeCharReferences().
 *  - Attribute-less, whitespace-only li/p/tr elements are tagged with
 *    class="mw-empty-elt".
 *  - Nodes flagged as paragraph wrappers are emitted as <p> or unwrapped.
 *  - Void elements are closed with " />".
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Names of elements that receive class="mw-empty-elt" when they are
	 * serialized with no attributes and nothing but whitespace inside.
	 * Stored as a name => true map so membership is a single isset().
	 */
	private const MARKED_EMPTY_ELEMENTS = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/**
	 * @var callable|null Optional callback applied to escaped text outside
	 *   of raw text elements; null when no 'textProcessor' option was given.
	 */
	private $textProcessor;

	/**
	 * @param array $options Forwarded unchanged to the parent constructor.
	 *   The optional 'textProcessor' entry is a callable that receives a
	 *   text string and returns its replacement.
	 */
	public function __construct( array $options = [] ) {
		parent::__construct( $options );
		// Escape non-breaking space
		$this->attributeEscapes["\u{00A0}"] = '&#160;';
		$this->textEscapes["\u{00A0}"] = '&#160;';
		// Escape U+0338 (T387130)
		$this->textEscapes["\u{0338}"] = '&#x338;';
		// Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs'
		unset( $this->attributeEscapes["&"] );
		unset( $this->textEscapes["&"] );
		$this->textProcessor = $options['textProcessor'] ?? null;
	}

	/**
	 * Emit nothing at the start of the document.
	 *
	 * @param string|null $fragmentNamespace Unused
	 * @param string|null $fragmentName Unused
	 * @return string Always the empty string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * Serialize a run of text.
	 *
	 * WATCH OUT! Unlike the parent formatter, this class expects $text to
	 * contain unexpanded character references (entities); "&" is therefore
	 * left alone by the escaping tables set up in the constructor.
	 *
	 * @param SerializerNode $parent Node that contains the text
	 * @param string $text Buffer holding the text
	 * @param int $start Offset of the text within the buffer
	 * @param int $length Length of the text
	 * @return string
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = parent::characters( $parent, $text, $start, $length );

		// Text directly inside an HTML raw text element is returned exactly
		// as the parent produced it; everything else is post-processed.
		if ( $parent->namespace !== HTMLData::NS_HTML
			|| !isset( $this->rawTextElements[$parent->name] )
		) {
			if ( $this->textProcessor !== null ) {
				$text = ( $this->textProcessor )( $text );
			}

			// Ensure a consistent representation for all entities
			$text = Sanitizer::normalizeCharReferences( $text );
		}

		return $text;
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * @param SerializerNode $parent Parent of the element
	 * @param SerializerNode $node The element being serialized
	 * @param string|null $contents Serialized inner HTML. NOTE(review): the
	 *   Remex formatter contract documents null here for void elements, so
	 *   null is tolerated below; confirm against the Remex version in use.
	 * @return string|null
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$data = $node->snData;
		if ( $data && $data->isPWrapper ) {
			// A paragraph wrapper only becomes a real <p> when it holds at
			// least one non-blank node; otherwise its contents are emitted bare.
			if ( $data->nonblankNodeCount ) {
				return "<p>$contents</p>";
			} else {
				return $contents;
			}
		}

		$name = $node->name;
		$attrs = $node->attrs;
		// The explicit null test keeps strspn()/strlen() from being handed
		// null (deprecated since PHP 8.1). A null $contents interpolates as
		// an empty string, so the emitted markup is the same as before.
		if ( isset( self::MARKED_EMPTY_ELEMENTS[$name] ) && $attrs->count() === 0
			&& ( $contents === null
				|| strspn( $contents, "\t\n\f\r " ) === strlen( $contents ) )
		) {
			return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
		}

		$s = "<$name";
		foreach ( $attrs->getValues() as $attrName => $attrValue ) {
			// Escape first, then normalize the character references that the
			// escaping deliberately left in place.
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			$encValue = Sanitizer::normalizeCharReferences( $encValue );
			$s .= " $attrName=\"$encValue\"";
		}
		if ( $node->namespace === HTMLData::NS_HTML && isset( $this->voidElements[$name] ) ) {
			// HTML void elements get a self-closing tag and no end tag
			$s .= ' />';
			return $s;
		}

		$s .= '>';
		if ( $node->namespace === HTMLData::NS_HTML
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$name] )
		) {
			// Contents starting with a newline get one more newline in front,
			// presumably because an HTML parser drops the first newline that
			// follows the start tag of these elements.
			$s .= "\n$contents</$name>";
		} else {
			$s .= "$contents</$name>";
		}
		return $s;
	}
}
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:32
startDocument( $fragmentNamespace, $fragmentName)
element(SerializerNode $parent, SerializerNode $node, $contents)
characters(SerializerNode $parent, $text, $start, $length)
WATCH OUT! Unlike normal HtmlFormatter, this class expects that the $text argument contains unexpanded character references (entities), due to 'ignoreCharRefs'.