MediaWiki master
RemexCompatFormatter.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Tidy;
4
6use Wikimedia\RemexHtml\HTMLData;
7use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
8use Wikimedia\RemexHtml\Serializer\SerializerNode;
9
/**
 * HtmlFormatter variant used when serializing tidied output.
 *
 * Differences from the parent formatter that are visible in this class:
 *  - nothing is emitted at the start of the document;
 *  - U+00A0 is written as "&#160;" and "&" is left unescaped, since the
 *    incoming text is expected to still contain character references;
 *  - nodes flagged as paragraph wrappers are either emitted as <p> or
 *    unwrapped, depending on whether they hold any non-blank content;
 *  - attribute-less, whitespace-only li/p/tr elements are tagged with
 *    class="mw-empty-elt";
 *  - HTML void elements are closed with " />".
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Names of elements which receive class="mw-empty-elt" when they have
	 * no attributes and contain nothing but whitespace.
	 *
	 * @var array<string,true>
	 */
	private static $markedEmptyElements = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/** @var callable|null Callback applied to text that is not inside a raw text element */
	private $textProcessor;

	/**
	 * @param array $options Forwarded to the parent constructor. The optional
	 *   'textProcessor' entry is a callable taking and returning a string.
	 */
	public function __construct( $options = [] ) {
		parent::__construct( $options );

		// Ampersands are not escaped: entities are expected in the input,
		// due to 'ignoreCharRefs', and must pass through untouched.
		unset( $this->attributeEscapes['&'], $this->textEscapes['&'] );

		// Non-breaking spaces are written out as a numeric character reference.
		$nbsp = "\u{00A0}";
		$this->attributeEscapes[$nbsp] = '&#160;';
		$this->textEscapes[$nbsp] = '&#160;';

		$this->textProcessor = $options['textProcessor'] ?? null;
	}

	/**
	 * Emit no output at the start of the document.
	 *
	 * @param string|null $fragmentNamespace
	 * @param string|null $fragmentName
	 * @return string Always the empty string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * Serialize a run of text.
	 *
	 * WATCH OUT! Unlike normal HtmlFormatter, this class expects that the
	 * $text argument contains unexpanded character references (entities).
	 *
	 * @param SerializerNode $parent Node containing the text
	 * @param string $text
	 * @param int $start
	 * @param int $length
	 * @return string
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$escaped = parent::characters( $parent, $text, $start, $length );

		// The text processor is skipped for the contents of HTML raw text elements.
		$inRawTextElement = $parent->namespace === HTMLData::NS_HTML
			&& isset( $this->rawTextElements[$parent->name] );
		if ( !$inRawTextElement && $this->textProcessor !== null ) {
			$escaped = call_user_func( $this->textProcessor, $escaped );
		}

		// Ensure a consistent representation for all entities
		return Sanitizer::normalizeCharReferences( $escaped );
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * @param SerializerNode $parent
	 * @param SerializerNode $node Element being serialized
	 * @param string $contents Serialized children of $node
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$wrapperInfo = $node->snData;
		if ( $wrapperInfo && $wrapperInfo->isPWrapper ) {
			// A paragraph wrapper becomes a real <p> only when it holds
			// non-blank nodes; otherwise its contents are emitted bare.
			return $wrapperInfo->nonblankNodeCount ? "<p>$contents</p>" : $contents;
		}

		$tagName = $node->name;
		$attributes = $node->attrs;

		$isMarkedEmpty = isset( self::$markedEmptyElements[$tagName] )
			&& $attributes->count() === 0
			&& strspn( $contents, "\t\n\f\r " ) === strlen( $contents );
		if ( $isMarkedEmpty ) {
			return "<{$tagName} class=\"mw-empty-elt\">$contents</{$tagName}>";
		}

		$openTag = "<$tagName";
		foreach ( $attributes->getValues() as $attrName => $attrValue ) {
			$encoded = Sanitizer::normalizeCharReferences(
				strtr( $attrValue, $this->attributeEscapes )
			);
			$openTag .= ' ' . $attrName . '="' . $encoded . '"';
		}

		$isHtml = $node->namespace === HTMLData::NS_HTML;
		if ( $isHtml && isset( $this->voidElements[$tagName] ) ) {
			return $openTag . ' />';
		}

		// For elements listed in prefixLfElements, contents that begin with a
		// newline get an additional newline in front of them.
		$needsExtraLf = $isHtml
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$tagName] );

		return $openTag . '>' . ( $needsExtraLf ? "\n" : '' ) . $contents . "</$tagName>";
	}
}
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
startDocument( $fragmentNamespace, $fragmentName)
element(SerializerNode $parent, SerializerNode $node, $contents)
characters(SerializerNode $parent, $text, $start, $length)
WATCH OUT! Unlike normal HtmlFormatter, this class expects that the $text argument contains unexpanded character references (entities), because the input is parsed with the 'ignoreCharRefs' option.