Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
86.00% |
86 / 100 |
|
42.86% |
3 / 7 |
CRAP | |
0.00% |
0 / 1 |
| WikifunctionsPFragmentSanitiserTokenHandler | |
86.00% |
86 / 100 |
|
42.86% |
3 / 7 |
31.31 | |
0.00% |
0 / 1 |
| __construct | |
62.50% |
15 / 24 |
|
0.00% |
0 / 1 |
7.90 | |||
| newSiteMatrix | |
40.00% |
2 / 5 |
|
0.00% |
0 / 1 |
4.94 | |||
| toProtocolRelative | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| startTag | |
97.83% |
45 / 46 |
|
0.00% |
0 / 1 |
12 | |||
| getMatchingDomains | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
| endTag | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| sanitiseHtmlFragment | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | |
| 3 | /** |
| 4 | * WikiLambda extension HTML-stripping Remex token handler for our parser function |
| 5 | * |
| 6 | * @file |
| 7 | * @ingroup Extensions |
| 8 | * @copyright 2020– Abstract Wikipedia team; see AUTHORS.txt |
| 9 | * @license MIT |
| 10 | */ |
| 11 | |
| 12 | namespace MediaWiki\Extension\WikiLambda\ParserFunction; |
| 13 | |
| 14 | use MediaWiki\Extension\SiteMatrix\SiteMatrix; |
| 15 | use MediaWiki\Extension\WikiLambda\Tests\Integration\MockSiteMatrix; |
| 16 | use MediaWiki\MediaWikiServices; |
| 17 | use MediaWiki\Parser\Sanitizer; |
| 18 | use MediaWiki\Registration\ExtensionRegistry; |
| 19 | use MediaWiki\Tidy\RemexCompatFormatter; |
| 20 | use Psr\Log\LoggerInterface; |
| 21 | use Wikimedia\RemexHtml\HTMLData; |
| 22 | use Wikimedia\RemexHtml\Serializer\Serializer; |
| 23 | use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer; |
| 24 | use Wikimedia\RemexHtml\Tokenizer\Attributes; |
| 25 | use Wikimedia\RemexHtml\Tokenizer\PlainAttributes; |
| 26 | use Wikimedia\RemexHtml\Tokenizer\RelayTokenHandler; |
| 27 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer; |
| 28 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; |
| 29 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
| 30 | |
class WikifunctionsPFragmentSanitiserTokenHandler extends RelayTokenHandler {

	/** @var string The raw input HTML; disallowed tags are re-emitted from it as plain text */
	private string $source;

	/** @var string[] Protocol-relative server URLs that <a> tags are allowed to link to */
	private array $allowedUrls = [];

	private LoggerInterface $logger;

	public function __construct( LoggerInterface $logger, Serializer $serializer, string $source ) {
		$this->nextHandler = new Dispatcher( new TreeBuilder( $serializer, [
			'ignoreErrors' => true,
			'ignoreNulls' => true,
		] ) );

		parent::__construct( $this->nextHandler );

		$this->logger = $logger;
		$this->source = $source;

		// The local server URL is always allowed, so we can link to the current wiki
		$config = MediaWikiServices::getInstance()->getMainConfig();
		$localServer = $config->get( 'Server' );
		$canonicalServer = $config->get( 'CanonicalServer' );

		// array_filter() drops any empty entries (e.g. if CanonicalServer is unset)
		$this->allowedUrls = array_filter( [
			$this->toProtocolRelative( $localServer ),
			$this->toProtocolRelative( $canonicalServer )
		] );

		// If loaded, SiteMatrix can give us a list of cluster wikis and thus their server URLs
		$sitematrix = $this->newSiteMatrix();
		if ( $sitematrix ) {
			$languages = $sitematrix->getLangList();
			$families = $sitematrix->getSites();
			foreach ( $languages as $langCode ) {
				foreach ( $families as $family ) {
					if ( $sitematrix->exist( $langCode, $family ) ) {
						$this->allowedUrls[] = $this->toProtocolRelative( $sitematrix->getUrl( $langCode, $family ) );
					}
				}
			}

			// Special wikis (e.g. Commons, Meta) are listed as [ lang, family ] pairs
			$specials = $sitematrix->getSpecials();
			foreach ( $specials as $special ) {
				$this->allowedUrls[] = $this->toProtocolRelative( $sitematrix->getUrl( $special[0], $special[1] ) );
			}
		}
	}

	/**
	 * Returns the appropriate SiteMatrixProvider depending on the environment:
	 * * If running Phpunit tests: return MockSiteMatrixProvider
	 * * If production and SiteMatrix is loaded: return WikiLambdaSiteMatrixProvider
	 * * Else return nothing
	 *
	 * @return ?SiteMatrix
	 */
	protected function newSiteMatrix(): ?SiteMatrix {
		if ( ExtensionRegistry::getInstance()->isLoaded( 'SiteMatrix' ) ) {
			if ( defined( 'MW_PHPUNIT_TEST' ) ) {
				// Phan is unhappy because, although it's a sub-class, this is not loaded in prod code.
				// @phan-suppress-next-line PhanTypeMismatchReturn, PhanUndeclaredClassMethod
				return new MockSiteMatrix();
			}
			return new SiteMatrix();
		}
		return null;
	}

	// This is our list of allowed HTML elements. It should be kept extremely minimal, and any changes should
	// be carefully considered in conjunction with the Security and MW Content Transformation team.
	// Keep this in sync with CodeEditor.getDisallowedTagAnnotations()
	private const ALLOWEDELEMENTS = [
		// Headings
		"h1",
		"h2",
		"h3",
		"h4",
		"h5",
		"h6",

		// Primary content
		"div",
		"span",
		"p",
		"a",

		// Secondary content
		"blockquote",
		"br",
		"hr",

		// Annotations (FIXME: possibly trim these?)
		"abbr",
		"b",
		"code",
		"del",
		"dfn",
		"em",
		"i",
		"ins",
		"kbd",
		"q",
		"s",
		"strike",
		"strong",
		"sub",
		"sup",
		"u",

		// Structural content (lists and tables)
		"li",
		"dt",
		"dd",
		"ol",
		"ul",
		"dl",
		"tr",
		"td",
		"th",
		"table",
		"caption",

		// Special Unicode bi-directionality elements
		"bdi",
		"bdo",
	];

	/**
	 * Convert a URL to a protocol-relative URL
	 *
	 * @param string $url
	 * @return string
	 */
	private function toProtocolRelative( string $url ): string {
		// Trim once up-front: previously only the match test used the trimmed value, so a URL
		// with leading whitespace would fail the anchored replace and come out malformed.
		$trimmed = trim( $url );
		// Replace a leading http:// or https:// with a bare '//'; other schemes pass through unchanged
		return preg_replace( '#^https?://#i', '//', $trimmed );
	}

	/**
	 * @inheritDoc
	 */
	public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
		$tagName = strtolower( $name );

		// If the tag is not in the allowed list, we'll skip processing it entirely and escape as text
		if ( in_array( $tagName, self::ALLOWEDELEMENTS, true ) ) {
			// Check attributes are allowed, and drop banned ones

			// First, we use MediaWiki's Sanitizer to validate the tag's attributes.
			// This is imperfect, but a good start for dropping bad attributes.
			$fixedAttrs = Sanitizer::validateTagAttributes( $attrs->getValues(), $tagName );

			// Unlike the MediaWiki Sanitizer, for safety we do not allow any data- attributes at all
			foreach ( $fixedAttrs as $key => $value ) {
				if ( str_starts_with( $key, 'data-' ) ) {
					unset( $fixedAttrs[$key] );
				}

				if ( $key === 'style' && $value === "/* insecure input */" ) {
					// Don't let the placeholder cleansed value through
					unset( $fixedAttrs[$key] );
				}
			}

			$tagAllowed = true;

			// Finally, we do some special handling for the <a> tag. The MediaWiki Sanitizer (above) will
			// have only allowed through supported full URLs with supported protocols (so no relative URLs
			// or javascript: URLs), but we want to restrict further to only known local and interwiki links.
			if ( $tagName === 'a' ) {
				$parsedLink = MediaWikiServices::getInstance()->getUrlUtils()->parse( $fixedAttrs['href'] ?? '' );

				if ( !$parsedLink || empty( $parsedLink['host'] ) ) {
					// If the link is not parseable, or has no host, we will not allow it
					// This is already filtered out by MediaWiki's Sanitizer
					$tagAllowed = false;
					$fixedAttrs = [];
				} else {
					// (T407640) Use protocol-relative urls to compare with allowed urls
					$targetDomain = '//' . $parsedLink['host'];

					// Mostly for local testing!
					if ( isset( $parsedLink['port'] ) ) {
						$targetDomain .= ':' . $parsedLink['port'];
					}

					if ( in_array( $targetDomain, $this->allowedUrls, true ) ) {
						// Allowed; over-write all other attributes, keeping only the href
						$fixedAttrs = [
							'href' => $fixedAttrs['href']
						];
						$this->logger->info(
							__METHOD__ . ': Allowing <a> tag with href "{targetDomain}"',
							[
								'rawHref' => $fixedAttrs['href'] ?? '',
								'targetDomain' => $targetDomain
							]
						);

					} else {
						$tagAllowed = false;
						$this->logger->info(
							__METHOD__ . ': Rejecting <a> tag with href "{targetDomain}"',
							[
								'rawHref' => $fixedAttrs['href'] ?? '',
								'targetDomain' => $targetDomain,
								'allowedDomainsCount' => count( $this->allowedUrls ),
								'allowedDomainsMatch' => $this->getMatchingDomains(
									$this->allowedUrls,
									$parsedLink[ 'host' ]
								)
							]
						);
					}
				}
			}

			$attrs = new PlainAttributes( $fixedAttrs );

			if ( $tagAllowed ) {
				$this->nextHandler->startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
				return;
			}
			// If the tag is not allowed, we will fall down to the below, and escape it as text
		}

		// If we've reached this point, the tag is not allowed, so we will escape it as text
		$this->nextHandler->characters( $this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength );
	}

	/**
	 * Returns the allowedDomains that match the host to enable easier
	 * debugging if link is not parsed. Passing the whole allowedDomains
	 * to the logger will mostly end up in a discarded log due to the
	 * size of the whole allowedDomains list, so we will log substring
	 * matches with the host part of the url.
	 *
	 * @param array $allowedDomains
	 * @param string $targetHost
	 * @return array
	 */
	private function getMatchingDomains( $allowedDomains, $targetHost ) {
		$matches = [];
		foreach ( $allowedDomains as $allowed ) {
			// Allow-list entries are protocol-relative ('//host[:port]') while $targetHost is a
			// bare host, so search for the host *within* each entry. (The previous argument order,
			// strpos( $targetHost, $allowed ), could never match because of the '//' prefix.)
			if ( str_contains( $allowed, $targetHost ) ) {
				$matches[] = $allowed;
			}
		}
		return $matches;
	}

	/**
	 * @inheritDoc
	 */
	public function endTag( $name, $sourceStart, $sourceLength ) {
		$tagName = strtolower( $name );

		if ( in_array( $tagName, self::ALLOWEDELEMENTS, true ) ) {
			$this->nextHandler->endTag( $name, $sourceStart, $sourceLength );
		} else {
			// Disallowed closing tag: re-emit its source text as escaped character data
			$this->nextHandler->characters( $this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength );
		}
	}

	/**
	 * Sanitise an HTML fragment string using Remex, like MediaWiki's Sanitizer but with
	 * more control (for both including and excluding things).
	 *
	 * @param LoggerInterface $logger
	 * @param string $text
	 * @return string
	 */
	public static function sanitiseHtmlFragment( LoggerInterface $logger, string $text ): string {
		// Use RemexHtml to tokenize $text and remove the barred tags

		$serializer = new RemexSerializer( new RemexCompatFormatter() );

		$tokenizer = new RemexTokenizer(
			new WikifunctionsPFragmentSanitiserTokenHandler( $logger, $serializer, $text ),
			$text,
			[
				'ignoreErrors' => true,
				// Don't ignore char refs, as we want them to be decoded
				'ignoreCharRefs' => false,
				'ignoreNulls' => true,
				'skipPreprocess' => true,
			]
		);
		$tokenizer->execute( [ 'fragmentNamespace' => HTMLData::NS_HTML, 'fragmentName' => 'body', ] );

		return $serializer->getResult();
	}
}