Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
86.00% covered (warning)
86.00%
86 / 100
42.86% covered (danger)
42.86%
3 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
WikifunctionsPFragmentSanitiserTokenHandler
86.00% covered (warning)
86.00%
86 / 100
42.86% covered (danger)
42.86%
3 / 7
31.31
0.00% covered (danger)
0.00%
0 / 1
 __construct
62.50% covered (warning)
62.50%
15 / 24
0.00% covered (danger)
0.00%
0 / 1
7.90
 newSiteMatrix
40.00% covered (danger)
40.00%
2 / 5
0.00% covered (danger)
0.00%
0 / 1
4.94
 toProtocolRelative
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 startTag
97.83% covered (success)
97.83%
45 / 46
0.00% covered (danger)
0.00%
0 / 1
12
 getMatchingDomains
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
3.07
 endTag
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 sanitiseHtmlFragment
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3/**
4 * WikiLambda extension HTML-stripping Remex token handler for our parser function
5 *
6 * @file
7 * @ingroup Extensions
8 * @copyright 2020– Abstract Wikipedia team; see AUTHORS.txt
9 * @license MIT
10 */
11
12namespace MediaWiki\Extension\WikiLambda\ParserFunction;
13
14use MediaWiki\Extension\SiteMatrix\SiteMatrix;
15use MediaWiki\Extension\WikiLambda\Tests\Integration\MockSiteMatrix;
16use MediaWiki\MediaWikiServices;
17use MediaWiki\Parser\Sanitizer;
18use MediaWiki\Registration\ExtensionRegistry;
19use MediaWiki\Tidy\RemexCompatFormatter;
20use Psr\Log\LoggerInterface;
21use Wikimedia\RemexHtml\HTMLData;
22use Wikimedia\RemexHtml\Serializer\Serializer;
23use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
24use Wikimedia\RemexHtml\Tokenizer\Attributes;
25use Wikimedia\RemexHtml\Tokenizer\PlainAttributes;
26use Wikimedia\RemexHtml\Tokenizer\RelayTokenHandler;
27use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
28use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
29use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
30
31class WikifunctionsPFragmentSanitiserTokenHandler extends RelayTokenHandler {
32
33    private string $source;
34    private array $allowedUrls = [];
35    private LoggerInterface $logger;
36
37    public function __construct( LoggerInterface $logger, Serializer $serializer, string $source ) {
38        $this->nextHandler = new Dispatcher( new TreeBuilder( $serializer, [
39            'ignoreErrors' => true,
40            'ignoreNulls' => true,
41        ] ) );
42
43        parent::__construct( $this->nextHandler );
44
45        $this->logger = $logger;
46        $this->source = $source;
47
48        // The local server URL is always allowed, so we can link to the current wiki
49        $localServer = MediaWikiServices::getInstance()->getMainConfig()->get( 'Server' );
50        $canonicalServer = MediaWikiServices::getInstance()->getMainConfig()->get( 'CanonicalServer' );
51
52        $this->allowedUrls = array_filter( [
53            $this->toProtocolRelative( $localServer ),
54            $this->toProtocolRelative( $canonicalServer )
55        ] );
56
57        // If loaded, SiteMatrix can give us a list of cluster wikis and thus their server URLs
58        $sitematrix = $this->newSiteMatrix();
59        if ( $sitematrix ) {
60            $languages = $sitematrix->getLangList();
61            $families = $sitematrix->getSites();
62            foreach ( $languages as $key => $langCode ) {
63                foreach ( $families as $family ) {
64                    if ( $sitematrix->exist( $langCode, $family ) ) {
65                        $this->allowedUrls[] = $this->toProtocolRelative( $sitematrix->getUrl( $langCode, $family ) );
66                    }
67                }
68            }
69
70            $specials = $sitematrix->getSpecials();
71            foreach ( $specials as $special ) {
72                $this->allowedUrls[] = $this->toProtocolRelative( $sitematrix->getUrl( $special[0], $special[1] ) );
73            }
74        }
75    }
76
77    /**
78     * Returns the appropriate SiteMatrixProvider depending on the environment:
79     * * If running Phpunit tests: return MockSiteMatrixProvider
80     * * If production and SiteMatrix is loaded: return WikiLambdaSiteMatrixProvider
81     * * Else return nothing
82     *
83     * @return ?SiteMatrix
84     */
85    protected function newSiteMatrix(): ?SiteMatrix {
86        if ( ExtensionRegistry::getInstance()->isLoaded( 'SiteMatrix' ) ) {
87            if ( defined( 'MW_PHPUNIT_TEST' ) ) {
88                // Phan is unhappy because, altough it's a sub-class, this is not loaded in prod code.
89                // @phan-suppress-next-line PhanTypeMismatchReturn, PhanUndeclaredClassMethod
90                return new MockSiteMatrix();
91            }
92            return new SiteMatrix();
93        }
94        return null;
95    }
96
97    // This is our list of allowed HTML elements. It should be kept extremely minimal, and any changes should
98    // be carefully considered in conjunction with the Security and MW Content Transformation team.
99    // Keep this in sync with CodeEditor.getDisallowedTagAnnotations()
100    private const ALLOWEDELEMENTS = [
101        // Headings
102        "h1",
103        "h2",
104        "h3",
105        "h4",
106        "h5",
107        "h6",
108
109        // Primary content
110        "div",
111        "span",
112        "p",
113        "a",
114
115        // Secondary content
116        "blockquote",
117        "br",
118        "hr",
119
120        // Annotations (FIXME: possibly trim these?)
121        "abbr",
122        "b",
123        "code",
124        "del",
125        "dfn",
126        "em",
127        "i",
128        "ins",
129        "kbd",
130        "q",
131        "s",
132        "strike",
133        "strong",
134        "sub",
135        "sup",
136        "u",
137
138        // Structural content (lists and tables)
139        "li",
140        "dt",
141        "dd",
142        "ol",
143        "ul",
144        "dl",
145        "tr",
146        "td",
147        "th",
148        "table",
149        "caption",
150
151        // Special Unicode bi-directionality elements
152        "bdi",
153        "bdo",
154    ];
155
156    /**
157     * Convert a URL to a protocol-relative URL
158     *
159     * @param string $url
160     * @return string
161     */
162    private function toProtocolRelative( string $url ): string {
163        return preg_match( '#^https?://#i', trim( $url ) ) ?
164            '//' . preg_replace( '#^https?://#i', '', $url ) :
165            trim( $url );
166    }
167
168    /**
169     * @inheritDoc
170     */
171    public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
172        $tagName = strtolower( $name );
173
174        // If the tag is not in the allowed list, we'll skip processing it entirely and escape as text
175        if ( in_array( $tagName, self::ALLOWEDELEMENTS ) ) {
176            // Check attributes are allowed, and drop banned ones
177
178            // First, we use MediaWiki's Sanitizer to validate the tag's attributes.
179            // This is imperfect, but a good start for dropping bad attributes.
180            $fixedAttrs = Sanitizer::validateTagAttributes( $attrs->getValues(), $tagName );
181
182            // Unlike the MediaWiki Sanitizer, for safety we do not allow any data- attributes at all
183            foreach ( $fixedAttrs as $key => $value ) {
184                if ( str_starts_with( $key, 'data-' ) ) {
185                    unset( $fixedAttrs[$key] );
186                }
187
188                if ( $key === 'style' && $value === "/* insecure input */" ) {
189                    // Don't let the placeholder cleansed value through
190                    unset( $fixedAttrs[$key] );
191                }
192            }
193
194            $tagAllowed = true;
195
196            // Finally, we do some special handling for the <a> tag. The MediaWiki Sanitizer (above) will
197            // have only allowed through supported full URLs with supported protocols (so no relative URLs
198            // or javascript: URLs), but we want to restrict further to only known local and interwiiki links.
199            if ( $tagName === 'a' ) {
200                $parsedLink = MediaWikiServices::getInstance()->getUrlUtils()->parse( $fixedAttrs['href'] ?? '' );
201
202                if ( !$parsedLink || empty( $parsedLink['host'] ) ) {
203                    // If the link is not parseable, or has no host, we will not allow it
204                    // This is already filtered out by MediaWiki's Sanitizer
205                    $tagAllowed = false;
206                    $fixedAttrs = [];
207                } else {
208                    // (T407640) Use protocol-relative urls to compare with allowed urls
209                    $targetDomain = '//' . $parsedLink['host'];
210
211                    // Mostly for local testing!
212                    if ( isset( $parsedLink['port'] ) ) {
213                        $targetDomain .= ':' . $parsedLink['port'];
214                    }
215
216                    if ( in_array( $targetDomain, $this->allowedUrls ) ) {
217                        // Allowed; over-write all other attributes
218                        $fixedAttrs = [
219                            'href' => $fixedAttrs['href']
220                        ];
221                        $this->logger->info(
222                            __METHOD__ . ': Allowing <a> tag with href "{targetDomain}"',
223                            [
224                                'rawHref' => $fixedAttrs['href'] ?? '',
225                                'targetDomain' => $targetDomain
226                            ]
227                        );
228
229                    } else {
230                        $tagAllowed = false;
231                        $this->logger->info(
232                            __METHOD__ . ': Rejecting <a> tag with href "{targetDomain}"',
233                            [
234                                'rawHref' => $fixedAttrs['href'] ?? '',
235                                'targetDomain' => $targetDomain,
236                                'allowedDomainsCount' => count( $this->allowedUrls ),
237                                'allowedDomainsMatch' => $this->getMatchingDomains(
238                                    $this->allowedUrls,
239                                    $parsedLink[ 'host' ]
240                                )
241                            ]
242                        );
243                    }
244                }
245            }
246
247            $attrs = new PlainAttributes( $fixedAttrs );
248
249            if ( $tagAllowed ) {
250                $this->nextHandler->startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
251                return;
252            }
253            // If the tag is not allowed, we will fall down to the below, and escape it as text
254        }
255
256        // If we've reached this point, the tag is not allowed, so we will escape it as text
257        $this->nextHandler->characters( $this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength );
258    }
259
260    /**
261     * Returns the allowedDomains that match the host to enable easier
262     * debugging if link is not parsed. Passing the whole allowedDomains
263     * to the logger will mostly end up in a discarded log due to the
264     * size of the whole allowedDomains list, so we will log substring
265     * matches with the host part of the url.
266     *
267     * @param array $allowedDomains
268     * @param string $targetHost
269     * @return array
270     */
271    private function getMatchingDomains( $allowedDomains, $targetHost ) {
272        $matches = [];
273        foreach ( $allowedDomains as $allowed ) {
274            if ( strpos( $targetHost, $allowed ) !== false ) {
275                $matches[] = $allowed;
276            }
277        }
278        return $matches;
279    }
280
281    /**
282     * @inheritDoc
283     */
284    public function endTag( $name, $sourceStart, $sourceLength ) {
285        $tagName = strtolower( $name );
286
287        if ( in_array( $tagName, self::ALLOWEDELEMENTS ) ) {
288            $this->nextHandler->endTag( $name, $sourceStart, $sourceLength );
289        } else {
290            $this->nextHandler->characters( $this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength );
291        }
292    }
293
294    /**
295     * Sanitise an HTML fragment string using Remex, like MediaWiki's Sanitizer but with
296     * more control (for both including and excluding things).
297     *
298     * @param LoggerInterface $logger
299     * @param string $text
300     * @return string
301     */
302    public static function sanitiseHtmlFragment( LoggerInterface $logger, string $text ): string {
303        // Use RemexHtml to tokenize $text and remove the barred tags
304
305        $serializer = new RemexSerializer( new RemexCompatFormatter );
306
307        $tokenizer = new RemexTokenizer(
308            new WikifunctionsPFragmentSanitiserTokenHandler( $logger, $serializer, $text ),
309            $text,
310                [
311                'ignoreErrors' => true,
312                // Don't ignore char refs, as we want them to be decoded
313                'ignoreCharRefs' => false,
314                'ignoreNulls' => true,
315                'skipPreprocess' => true,
316            ]
317        );
318        $tokenizer->execute( [ 'fragmentNamespace' => HTMLData::NS_HTML, 'fragmentName' => 'body', ] );
319
320        return $serializer->getResult();
321    }
322}