Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
2.89% |
13 / 450 |
|
2.94% |
1 / 34 |
CRAP | |
0.00% |
0 / 1 |
| Sanitizer | |
2.89% |
13 / 450 |
|
2.94% |
1 / 34 |
23311.67 | |
0.00% |
0 / 1 |
| escapeLiteralHTMLTag | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
72 | |||
| isParsoidAttr | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
42 | |||
| isReservedDataAttribute | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
| normalizeCss | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
20 | |||
| checkCss | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
4.05 | |||
| cssDecodeCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
72 | |||
| armorFrenchSpaces | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
| escapeIdForAttribute | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| escapeIdForLink | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| escapeIdForExternalInterwiki | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| escapeIdInternalUrl | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| escapeIdInternal | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
| escapeIdReferenceListInternal | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
| normalizeSectionNameWhitespace | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeCharReferences | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeCharReferencesCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
| normalizeEntity | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
| decCharReference | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| hexCharReference | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
| validateCodepoint | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
110 | |||
| decodeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| decodeCharReferencesCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
| decodeChar | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| decodeEntity | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| attributesAllowedInternal | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| setupAttributesAllowedInternal | |
0.00% |
0 / 127 |
|
0.00% |
0 / 1 |
6 | |||
| stripIDNs | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| encodeUrlForExtLink | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| cleanUrl | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
42 | |||
| sanitizeTagAttrs | |
0.00% |
0 / 77 |
|
0.00% |
0 / 1 |
2162 | |||
| applySanitizedArgs | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| sanitizeTitleURI | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
| delimiterReplaceCallback | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
182 | |||
| delimiterReplace | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | /** |
| 5 | * HTML sanitizer for %MediaWiki. |
| 6 | * |
| 7 | * Extended by Parsoid to be a general token sanitizer. Strips out (or |
| 8 | * encapsulates) unsafe and disallowed tag types and |
| 9 | * attributes. Should run last in the third, synchronous expansion |
| 10 | * stage. |
| 11 | * |
| 12 | * This code was originally ported from PHP to JS in 2012 |
| 13 | * and periodically updated before being ported back to PHP. This code should be
| 14 | * periodically resynced with core sanitizer changes. |
| 15 | * |
| 16 | * Copyright © 2002-2005 Brooke Vibber <bvibber@pobox.com> et al |
| 17 | * https://www.mediawiki.org/ |
| 18 | * |
| 19 | * @license GPL-2.0-or-later |
| 20 | * @file |
| 21 | */ |
| 22 | |
| 23 | namespace Wikimedia\Parsoid\Core; |
| 24 | |
| 25 | use InvalidArgumentException; |
| 26 | use Wikimedia\Parsoid\Config\SiteConfig; |
| 27 | use Wikimedia\Parsoid\DOM\Element; |
| 28 | use Wikimedia\Parsoid\Tokens\KV; |
| 29 | use Wikimedia\Parsoid\Tokens\XMLTagTk; |
| 30 | use Wikimedia\Parsoid\Utils\CounterType; |
| 31 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 32 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 33 | use Wikimedia\Parsoid\Utils\TokenUtils; |
| 34 | use Wikimedia\RemexHtml\HTMLData; |
| 35 | |
| 36 | /** |
| 37 | * HTML sanitizer for MediaWiki |
| 38 | */ |
| 39 | class Sanitizer { |
| 40 | /**
| 41 | * Matches character references for Sanitizer::normalizeCharReferences and
| 42 | * Sanitizer::decodeCharReferences. Groups: 1 = named entity (including ';'),
| 43 | * 2 = decimal digits, 3 = hex digits; a bare '&' matches with no group set.
| 44 | * HTML5 lets some named entities omit the ';'; wikitext entities *must* have it.
| 45 | */
| 46 | private const CHAR_REFS_REGEX =
| 47 | '/&([A-Za-z0-9\x80-\xff]+;)
| 48 | |&\#([0-9]+);
| 49 | |&\#[xX]([0-9A-Fa-f]+);
| 50 | |&/x';
| 51 | |
| 52 | private const INSECURE_RE = '! expression
| 53 | | accelerator\s*:
| 54 | | -o-link\s*:
| 55 | | -o-link-source\s*:
| 56 | | -o-replace\s*:
| 57 | | url\s*\(
| 58 | | src\s*\(
| 59 | | image\s*\(
| 60 | | image-set\s*\(
| 61 | | attr\s*\([^)]+[\s,]+url
| 62 | !ix'; // Case-insensitive, extended syntax; a match on any alternative makes checkCss() reject the value as insecure input
| 63 | |
| 64 | /**
| 65 | * Case-insensitive pattern matching evil URIs such as javascript: or vbscript:
| 66 | * WARNING: DO NOT use this in any place that actually requires denying
| 67 | * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
| 68 | * pattern-based deny lists; the only way to be secure from javascript:
| 69 | * URI-based XSS vectors is to allow only things that you know are safe
| 70 | * and deny everything else.
| 71 | * [1]: http://ha.ckers.org/xss.html
| 72 | */
| 73 | private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)(\W|$)!iD';
| 74 | /* NOTE: The A-Z range is not strictly needed in this pattern because we are comparing against a lower-case string */
| 75 | private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/D";
| 76 | |
| 77 | /**
| 78 | * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
| 79 | *
| 80 | * @since 1.30
| 81 | */
| 82 | public const ID_PRIMARY = 0;
| 83 | |
| 84 | /**
| 85 | * Tells escapeIdForAttribute() to encode the ID using the fallback encoding
| 86 | * (escapeIdForAttribute() maps this value to the 'legacy' internal mode).
| 87 | *
| 88 | * @since 1.30
| 89 | */
| 90 | public const ID_FALLBACK = 1;
| 91 | |
| 92 | /** Characters that will be ignored in IDNs.
| 93 | * https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
| 94 | * https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
| 95 | * Strip them before further processing so deny lists and such work.
| 96 | * In core this list is part of Sanitizer::cleanUrl; here it is a separate /xuD pattern constant.
| 97 | */
| 98 | private const IDN_RE_G = "/
| 99 | \\s| # general whitespace
| 100 | \u{00AD}| # SOFT HYPHEN
| 101 | \u{034F}| # COMBINING GRAPHEME JOINER
| 102 | \u{061C}| # ARABIC LETTER MARK
| 103 | [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
| 104 | # HANGUL JUNGSEONG FILLER
| 105 | [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
| 106 | # KHMER VOWEL INHERENT AA
| 107 | [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
| 108 | # MONGOLIAN FREE VARIATION SELECTOR THREE
| 109 | \u{180E}| # MONGOLIAN VOWEL SEPARATOR
| 110 | [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
| 111 | # RIGHT-TO-LEFT MARK
| 112 | [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
| 113 | # RIGHT-TO-LEFT OVERRIDE
| 114 | [\u{2060}-\u{2064}]| # WORD JOINER..
| 115 | # INVISIBLE PLUS
| 116 | \u{2065}| # <reserved-2065>
| 117 | [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
| 118 | # NOMINAL DIGIT SHAPES
| 119 | \u{3164}| # HANGUL FILLER
| 120 | [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
| 121 | # VARIATION SELECTOR-16
| 122 | \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
| 123 | \u{FFA0}| # HALFWIDTH HANGUL FILLER
| 124 | [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
| 125 | # <reserved-FFF8>
| 126 | [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
| 127 | # SHORTHAND FORMAT UP STEP
| 128 | [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
| 129 | # MUSICAL SYMBOL END PHRASE
| 130 | \u{E0000}| # <reserved-E0000>
| 131 | \u{E0001}| # LANGUAGE TAG
| 132 | [\u{E0002}-\u{E001F}]| # <reserved-E0002>..
| 133 | # <reserved-E001F>
| 134 | [\u{E0020}-\u{E007F}]| # TAG SPACE..
| 135 | # CANCEL TAG
| 136 | [\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
| 137 | # <reserved-E00FF>
| 138 | [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
| 139 | # VARIATION SELECTOR-256
| 140 | [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
| 141 | # <reserved-E0FFF>
| 142 | /xuD";
| 143 | |
| 144 | /* NOTE: \p{L} could be narrowed to \p{Ll} because we are comparing against a lower-case string */
| 145 | private const GET_ATTRIBS_RE = '/^[:_\p{L}\p{N}][:_\.\-\p{L}\p{N}]*$/uD';
| 146 | |
| 147 | /**
| 148 | * Character entity aliases accepted by MediaWiki in wikitext, mapped to the
| 149 | * entity name they stand for (both sides include the trailing ';'). Not part of the HTML standard.
| 150 | */
| 151 | private const MW_ENTITY_ALIASES = [
| 152 | 'רלמ;' => 'rlm;',
| 153 | 'رلم;' => 'rlm;',
| 154 | ];
| 155 | |
| 156 | /**
| 157 | * Token-based version of core \MediaWiki\Parser\Sanitizer::validateTag
| 158 | *
| 159 | * @param XMLTagTk $token Tag token whose name and attributes are inspected
| 160 | * @return bool True when $token is a meta/link tag lacking itemprop, or lacking content (meta) / href (link); false otherwise
| 161 | * @see \MediaWiki\Parser\Sanitizer::validateTag
| 162 | */
| 163 | public static function escapeLiteralHTMLTag( XMLTagTk $token ): bool {
| 164 | $tag = $token->getName();
| 165 | if ( $tag !== 'meta' && $tag !== 'link' ) {
| 166 | return false;
| 167 | }
| 168 | 
| 169 | // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
| 170 | if ( $token->getAttributeV( 'itemprop' ) === null ) {
| 171 | return true;
| 172 | }
| 173 | 
| 174 | // <meta> must have a content="" for the itemprop
| 175 | if ( $tag === 'meta' && $token->getAttributeV( 'content' ) === null ) {
| 176 | return true;
| 177 | }
| 178 | 
| 179 | // <link> must have an associated href=""
| 180 | if ( $tag === 'link' && $token->getAttributeV( 'href' ) === null ) {
| 181 | return true;
| 182 | }
| 183 | 
| 184 | return false;
| 185 | }
| 186 | |
| 187 | /**
| 188 | * SSS FIXME: There is a test in mediawiki.environment.js that doles out
| 189 | * and tests about ids. There are probably some tests in Util.php as well.
| 190 | * We should move all these kind of tests somewhere else.
| 191 | * @param string $k Attribute name
| 192 | * @param string $v Attribute value
| 193 | * @param KV[] $attrs All attributes of the tag; only its 'property' entry is looked up here
| 194 | * @return bool True for typeof/property/rel holding an mw: term, about matched by CounterType::TRANSCLUSION_ABOUT, or content paired with an mw: property
| 195 | */
| 196 | private static function isParsoidAttr( string $k, string $v, array $attrs ): bool {
| 197 | // NOTES:
| 198 | // 1. Currently the tokenizer unconditionally escapes typeof and about
| 199 | // attributes from wikitxt to data-x-typeof and data-x-about. So,
| 200 | // this check will only pass through Parsoid inserted attrs.
| 201 | // 2. But, if we fix the over-aggressive escaping in the tokenizer to
| 202 | // not escape non-Parsoid typeof and about, then this will return
| 203 | // true for something like typeof='mw:Foo evilScriptHere'. But, that
| 204 | // is safe since this check is only used to see if we should
| 205 | // unconditionally discard the entire attribute or process it further.
| 206 | // That further processing will catch and discard any dangerous
| 207 | // strings in the rest of the attribute
| 208 | return ( in_array( $k, [ 'typeof', 'property', 'rel' ], true )
| 209 | && preg_match( '/(?:^|\s)mw:.+?(?=$|\s)/D', $v ) )
| 210 | || ( $k === 'about' && CounterType::TRANSCLUSION_ABOUT->matches( $v ) )
| 211 | || ( $k === 'content'
| 212 | && preg_match( '/(?:^|\s)mw:.+?(?=$|\s)/D', KV::lookup( $attrs, 'property' ) ?? '' ) );
| 213 | }
| 214 | |
| 215 | /**
| 216 | * Given an attribute name, checks whether it is a reserved data attribute
| 217 | * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
| 218 | * core and extension code can safely use it to communicate with frontend code.
| 219 | * @param string $attr Attribute name.
| 220 | * @return bool False for names starting with lower-case data-mw / data-parsoid; otherwise whether the name starts with data-ooui, data-mw or data-parsoid in any letter case
| 221 | */
| 222 | public static function isReservedDataAttribute( string $attr ): bool {
| 223 | // data-ooui is reserved for ooui.
| 224 | // data-mw and data-parsoid are reserved for parsoid.
| 225 | // data-mw-<name here> is reserved for extensions (or core) if
| 226 | // they need to communicate some data to the client and want to be
| 227 | // sure that it isn't coming from an untrusted user.
| 228 | // We ignore the possibility of namespaces since user-generated HTML
| 229 | // can't use them anymore.
| 230 | if ( preg_match( '/^data-(mw|parsoid)/', $attr ) ) {
| 231 | return false; // PARSOID SPECIFIC: this case-sensitive check runs first, so these prefixes are not reported as reserved
| 232 | }
| 233 | return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
| 234 | }
| 235 | |
| 236 | /**
| 237 | * Normalize CSS into a format we can easily search for hostile input
| 238 | * - decode character references
| 239 | * - decode escape sequences
| 240 | * - NOTE(review): an IE6 character-conversion step was listed here, but no such step appears in this body
| 241 | * - remove comments, unless the entire value is one single comment
| 242 | * @param string $value the css string
| 243 | * @return string normalized css
| 244 | */
| 245 | public static function normalizeCss( string $value ): string {
| 246 | // Decode character references like &#123; (the character '{')
| 247 | $value = self::decodeCharReferences( $value );
| 248 | 
| 249 | // Decode escape sequences and line continuation
| 250 | // See the grammar in the CSS 2 spec, appendix D.
| 251 | // This has to be done AFTER decoding character references.
| 252 | // This means it isn't possible for this function to return
| 253 | // unsanitized escape sequences. It is possible to manufacture
| 254 | // input that contains character references that decode to
| 255 | // escape sequences that decode to character references, but
| 256 | // it's OK for the return value to contain character references
| 257 | // because the caller is supposed to escape those anyway.
| 258 | static $decodeRegex;
| 259 | if ( !$decodeRegex ) {
| 260 | $space = '[\\x20\\t\\r\\n\\f]';
| 261 | $nl = '(?:\\n|\\r\\n|\\r|\\f)';
| 262 | $backslash = '\\\\';
| 263 | $decodeRegex = "/ $backslash
| 264 | (?:
| 265 | ($nl) | # 1. Line continuation
| 266 | ([0-9A-Fa-f]{1,6})$space? | # 2. character number
| 267 | (.) | # 3. backslash cancelling special meaning
| 268 | () | # 4. backslash at end of string
| 269 | )/xu";
| 270 | }
| 271 | $value = preg_replace_callback( $decodeRegex,
| 272 | self::cssDecodeCallback( ... ), $value );
| 273 | 
| 274 | // Let the value through if it's nothing but a single comment, to
| 275 | // allow other functions which may reject it to pass some error
| 276 | // message through.
| 277 | if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !xD', $value ) ) {
| 278 | // Remove any comments; IE gets token splitting wrong
| 279 | // This must be done AFTER decoding character references and
| 280 | // escape sequences, because those steps can introduce comments
| 281 | // This step cannot introduce character references or escape
| 282 | // sequences, because it replaces comments with spaces rather
| 283 | // than removing them completely.
| 284 | $value = self::delimiterReplace( '/*', '*/', ' ', $value );
| 285 | 
| 286 | // Remove anything after a comment-start token, to guard against
| 287 | // incorrect client implementations.
| 288 | $commentPos = strpos( $value, '/*' );
| 289 | if ( $commentPos !== false ) {
| 290 | $value = substr( $value, 0, $commentPos );
| 291 | }
| 292 | }
| 293 | 
| 294 | return $value;
| 295 | }
| 296 | |
| 297 | /**
| 298 | * Pick apart some CSS and check it for forbidden or unsafe structures.
| 299 | * Returns a sanitized string. This sanitized string will have
| 300 | * character references and escape sequences decoded and comments
| 301 | * stripped (unless it is itself one valid comment, in which case the value
| 302 | * will be passed through). If the input is just too evil, only a comment
| 303 | * complaining about evilness will be returned.
| 304 | *
| 305 | * Forbidden constructs are control characters and anything matched by self::INSECURE_RE.
| 306 | *
| 307 | * NOTE: Despite the fact that character references are decoded, the
| 308 | * returned string may contain character references given certain
| 309 | * clever input strings. These character references must
| 310 | * be escaped before the return value is embedded in HTML.
| 311 | *
| 312 | * @warning This method is intended to sanitize style attributes on
| 313 | * html tags only. It is not safe to use on full CSS files.
| 314 | * @param string $value Style attribute value; it is passed through normalizeCss() first
| 315 | * @return string The normalized value, or a CSS comment naming the reason it was rejected
| 316 | */
| 317 | public static function checkCss( $value ) {
| 318 | $value = self::normalizeCss( $value );
| 319 | 
| 320 | // Reject problematic keywords and control characters
| 321 | if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
| 322 | str_contains( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) ) {
| 323 | return '/* invalid control char */';
| 324 | } elseif ( preg_match( self::INSECURE_RE, $value ) ) {
| 325 | return '/* insecure input */';
| 326 | }
| 327 | return $value;
| 328 | }
| 329 | |
| 330 | private static function cssDecodeCallback( array $matches ): string { // preg_replace_callback handler for normalizeCss(); $matches follows the numbered groups of its decode regex
| 331 | if ( $matches[1] !== '' ) {
| 332 | // Line continuation: the backslash and the newline are both dropped
| 333 | return '';
| 334 | } elseif ( $matches[2] !== '' ) {
| 335 | # hexdec could return a float if the match is too long, but the
| 336 | # regexp in question limits the string length to 6.
| 337 | $char = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
| 338 | } elseif ( $matches[3] !== '' ) {
| 339 | $char = $matches[3];
| 340 | } else {
| 341 | $char = '\\'; // group 4: a lone backslash at the end of the string
| 342 | }
| 343 | if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
| 344 | // These characters need to be escaped in strings
| 345 | // Clean up the escape sequence to avoid parsing errors by clients
| 346 | return '\\' . dechex( ord( $char ) ) . ' ';
| 347 | } else {
| 348 | // Decode unnecessary escape
| 349 | return $char;
| 350 | }
| 351 | }
| 352 | |
| 353 | public const FIXTAGS = [ // regex => sprintf() template; armorFrenchSpaces() substitutes its $space for %s
| 354 | # French spaces, last one Guillemet-left
| 355 | # only if it isn't followed by a word character.
| 356 | '/ (?=[?:;!%»›](?!\w))/u' => "%s",
| 357 | # French spaces, Guillemet-right
| 358 | # only if it isn't preceded by a word character.
| 359 | '/(?<!\w)([«‹]) /u' => "\\1%s",
| 360 | ];
| 361 | |
| 362 | /**
| 363 | * Armor French spaces with a replacement character
| 364 | * NOTE(review): a plain-space default would make this a no-op; core's default is the entity &#160; - TODO confirm the listing did not decode it
| 365 | * @since 1.32
| 366 | * @param string $text Text to armor
| 367 | * @param string $space Space character for the French spaces, defaults to ' '
| 368 | * @return string Armored text
| 369 | */
| 370 | public static function armorFrenchSpaces( string $text, string $space = ' ' ): string {
| 371 | // Escape $ and \ in $space so preg_replace() treats them literally in the replacement
| 372 | $space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
| 373 | return preg_replace(
| 374 | array_keys( self::FIXTAGS ),
| 375 | array_map( static function ( string $replacement ) use ( $space ) {
| 376 | // @phan-suppress-next-line PhanPluginPrintfVariableFormatString
| 377 | return sprintf( $replacement, $space );
| 378 | }, array_values( self::FIXTAGS ) ),
| 379 | $text
| 380 | );
| 381 | }
| 382 | |
| 383 | /**
| 384 | * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
| 385 | * a valid HTML id attribute.
| 386 | *
| 387 | * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
| 388 | * be sure to use proper escaping.
| 389 | *
| 390 | * In Parsoid, proper escaping is usually handled for us by the HTML
| 391 | * serialization algorithm, but be careful of corner cases (such as
| 392 | * emitting attributes in wikitext).
| 393 | *
| 394 | * @param string $id String to escape
| 395 | * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
| 396 | * should be used.
| 397 | * @return string Escaped ID
| 398 | *
| 399 | * @since 1.30
| 400 | */
| 401 | public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ): string {
| 402 | // Translate the public ID_* constant into the internal fragment-mode
| 403 | // name: ID_FALLBACK selects 'legacy', anything else selects 'html5'.
| 404 | // This (slightly) abstracts the actual details of the id encoding from
| 405 | // the Parsoid code which handles ids; we could swap primary and fallback
| 406 | // here, or adopt a new encoding, without touching all the call sites.
| 407 | $internalMode = $mode === self::ID_FALLBACK ? 'legacy' : 'html5';
| 408 | return self::escapeIdInternal( $id, $internalMode );
| 409 | }
| 410 | |
| 411 | /**
| 412 | * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
| 413 | * a valid URL fragment.
| 414 | *
| 415 | * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
| 416 | * be sure to use proper escaping.
| 417 | *
| 418 | * @param string $id String to escape
| 419 | * @return string ID escaped by escapeIdInternalUrl() in 'html5' mode
| 420 | *
| 421 | * @since 1.30
| 422 | */
| 423 | public static function escapeIdForLink( string $id ): string {
| 424 | return self::escapeIdInternalUrl( $id, 'html5' );
| 425 | }
| 426 | |
| 427 | /**
| 428 | * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
| 429 | * a valid URL fragment for external interwikis.
| 430 | *
| 431 | * @param string $id String to escape
| 432 | * @return string ID escaped by escapeIdInternalUrl() in 'legacy' mode
| 433 | *
| 434 | * @since 1.30
| 435 | */
| 436 | private static function escapeIdForExternalInterwiki( string $id ): string {
| 437 | // Assume $wgExternalInterwikiFragmentMode = 'legacy'
| 438 | return self::escapeIdInternalUrl( $id, 'legacy' );
| 439 | }
| 440 | |
| 441 | /**
| 442 | * Do percent encoding of percent signs for href (but not id) attributes
| 443 | *
| 444 | * @since 1.35
| 445 | * @see https://phabricator.wikimedia.org/T238385
| 446 | * @param string $id String to escape
| 447 | * @param string $mode One of modes from $wgFragmentMode
| 448 | * @return string Result of escapeIdInternal(); in 'html5' mode every %XX sequence additionally becomes %25XX
| 449 | */
| 450 | private static function escapeIdInternalUrl( string $id, string $mode ): string {
| 451 | $id = self::escapeIdInternal( $id, $mode );
| 452 | if ( $mode === 'html5' ) {
| 453 | $id = preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $id );
| 454 | }
| 455 | return $id;
| 456 | }
| 457 | |
| 458 | /**
| 459 | * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
| 460 | *
| 461 | * @param string $id String to escape; only its first 1024 characters are kept
| 462 | * @param string $mode One of modes from $wgFragmentMode ('html5' or 'legacy')
| 463 | * @return string Escaped id (an InvalidArgumentException is thrown for any other $mode)
| 464 | */
| 465 | private static function escapeIdInternal( string $id, string $mode ): string {
| 466 | // Truncate overly-long IDs. This isn't an HTML limit, it's just
| 467 | // griefer protection. [T251506]
| 468 | $id = mb_substr( $id, 0, 1024 );
| 469 | 
| 470 | switch ( $mode ) {
| 471 | case 'html5':
| 472 | // html5 spec says ids must not have any of the following:
| 473 | // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
| 474 | // In practice, in wikitext, only tab, LF, CR (and SPACE) are
| 475 | // possible using either Lua or html entities.
| 476 | $id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
| 477 | break;
| 478 | 
| 479 | case 'legacy':
| 480 | // This corresponds to 'noninitial' mode of the former escapeId()
| 481 | static $replace = [
| 482 | '%3A' => ':',
| 483 | '%' => '.'
| 484 | ];
| 485 | 
| 486 | $id = urlencode( str_replace( ' ', '_', $id ) );
| 487 | $id = strtr( $id, $replace );
| 488 | break;
| 489 | 
| 490 | default:
| 491 | throw new InvalidArgumentException( "Invalid mode '$mode' passed to " . __METHOD__ );
| 492 | }
| 493 | 
| 494 | return $id;
| 495 | }
| 496 | |
| 497 | /**
| 498 | * Given a string containing a space delimited list of ids, escape each id
| 499 | * to match ids escaped by the escapeIdForAttribute() function.
| 500 | *
| 501 | * @param string $referenceString Space delimited list of ids
| 502 | * @return string Escaped ids joined by single spaces ('' when the list has no ids)
| 503 | */
| 504 | private static function escapeIdReferenceListInternal( string $referenceString ): string {
| 505 | # Explode the space delimited list string into an array of tokens
| 506 | $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
| 507 | 
| 508 | # Escape each token as an id (in place, via the by-reference loop variable)
| 509 | foreach ( $references as &$ref ) {
| 510 | $ref = self::escapeIdForAttribute( $ref );
| 511 | }
| 512 | 
| 513 | # Merge the array back to a space delimited list string
| 514 | # If the array is empty, the result will be an empty string ('')
| 515 | $referenceString = implode( ' ', $references );
| 516 | 
| 517 | return $referenceString;
| 518 | }
| 519 | |
| 520 | /**
| 521 | * Collapses every run of spaces and underscores in a section name (such as
| 522 | * might be returned by Parser::stripSectionName()) into a single space and
| 523 | * trims the result, for use in the ids that are used for section links.
| 524 | */
| 525 | public static function normalizeSectionNameWhitespace( string $section ): string {
| 526 | $normalized = preg_replace( '/[ _]+/', ' ', $section );
| 527 | return trim( $normalized );
| 528 | }
| 529 | |
| 530 | /**
| 531 | * Ensure that any entities and character references are legal
| 532 | * for XML and XHTML specifically. Any stray bits will be
| 533 | * &-escaped to result in a valid text fragment.
| 534 | *
| 535 | * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
| 536 | * numericized (this way we're well-formed even without a DTD)
| 537 | * b. any numeric char refs must be legal chars, not invalid or forbidden
| 538 | * c. use lower cased "&#x", not "&#X"
| 539 | * d. fix or reject non-valid attributes
| 540 | *
| 541 | * @internal
| 542 | */
| 543 | public static function normalizeCharReferences( string $text ): string {
| 544 | return preg_replace_callback(
| 545 | self::CHAR_REFS_REGEX,
| 546 | self::normalizeCharReferencesCallback( ... ),
| 547 | $text, -1, $count, PREG_UNMATCHED_AS_NULL
| 548 | );
| 549 | }
| 550 | |
| 551 | private static function normalizeCharReferencesCallback( array $matches ): string { // Handler for normalizeCharReferences(); unmatched groups are null (PREG_UNMATCHED_AS_NULL)
| 552 | $ret = null;
| 553 | if ( isset( $matches[1] ) ) {
| 554 | $ret = self::normalizeEntity( $matches[1] );
| 555 | } elseif ( isset( $matches[2] ) ) {
| 556 | $ret = self::decCharReference( $matches[2] );
| 557 | } elseif ( isset( $matches[3] ) ) {
| 558 | $ret = self::hexCharReference( $matches[3] );
| 559 | }
| 560 | if ( $ret === null ) {
| 561 | return htmlspecialchars( $matches[0] ); // bare '&' or a rejected numeric reference: escape the matched source text
| 562 | } else {
| 563 | return $ret;
| 564 | }
| 565 | }
| 566 | |
| 567 | /**
| 568 | * If the named entity is defined in HTML5
| 569 | * return the equivalent numeric entity reference (except for the core &lt;
| 570 | * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
| 571 | * the HTML equivalent. Otherwise, returns HTML-escaped text of
| 572 | * pseudo-entity source (eg &amp;foo;)
| 573 | *
| 574 | * @param string $name Semicolon-terminated name
| 575 | * @return string Entity reference, numeric reference(s), or the escaped source text
| 576 | */
| 577 | private static function normalizeEntity( string $name ): string {
| 578 | if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
| 579 | // Non-standard MediaWiki-specific entities
| 580 | return '&' . self::MW_ENTITY_ALIASES[$name];
| 581 | } elseif ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
| 582 | // Keep these in word form
| 583 | return "&$name";
| 584 | } elseif ( isset( HTMLData::NAMED_ENTITY_TRANSLATION[$name] ) ) {
| 585 | // Beware: some entities expand to more than 1 codepoint
| 586 | return preg_replace_callback( '/./Ssu', static function ( $m ) {
| 587 | // @phan-suppress-next-line PhanDeprecatedFunction
| 588 | return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
| 589 | }, HTMLData::NAMED_ENTITY_TRANSLATION[$name] );
| 590 | } else {
| 591 | return "&amp;$name"; // unknown name: escape the ampersand so the pseudo-entity stays plain text
| 592 | }
| 593 | }
| 594 | |
| 595 | private static function decCharReference( string $codepoint ): ?string { // Returns "&#N;" for an allowed codepoint, or null so the caller escapes the source text
| 596 | # intval() will (safely) saturate at the maximum signed integer
| 597 | # value if $codepoint is too many digits
| 598 | $point = intval( $codepoint );
| 599 | if ( self::validateCodepoint( $point ) ) {
| 600 | return "&#$point;";
| 601 | } else {
| 602 | return null;
| 603 | }
| 604 | }
| 605 | |
| 606 | private static function hexCharReference( string $codepoint ): ?string { // Returns a lower-case "&#x...;" reference for an allowed codepoint, or null otherwise
| 607 | $point = hexdec( $codepoint );
| 608 | // hexdec() might return a float if the string is too long
| 609 | if ( is_int( $point ) && self::validateCodepoint( $point ) ) {
| 610 | return sprintf( '&#x%x;', $point );
| 611 | } else {
| 612 | return null;
| 613 | }
| 614 | }
| 615 | |
| 616 | /**
| 617 | * Returns true if a given Unicode codepoint is a valid character in both HTML5
| 618 | * and XML. The ranges below leave out U+D800-U+DFFF, U+FFFE and U+FFFF.
| 619 | */
| 620 | private static function validateCodepoint( int $codepoint ): bool {
| 621 | # U+000C is valid in HTML5 but not allowed in XML.
| 622 | # U+000D is valid in XML but not allowed in HTML5.
| 623 | # U+007F - U+009F are disallowed in HTML5 (control characters).
| 624 | return $codepoint == 0x09
| 625 | || $codepoint == 0x0a
| 626 | || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
| 627 | || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
| 628 | || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
| 629 | || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
| 630 | }
| 631 | |
| 632 | /**
| 633 | * Decode any character references, numeric or named entities, in the text and
| 634 | * return a UTF-8 string; each match of CHAR_REFS_REGEX goes through decodeCharReferencesCallback().
| 635 | */
| 636 | public static function decodeCharReferences( string $text ): string {
| 637 | return preg_replace_callback(
| 638 | self::CHAR_REFS_REGEX,
| 639 | self::decodeCharReferencesCallback( ... ),
| 640 | $text, -1, $count, PREG_UNMATCHED_AS_NULL
| 641 | );
| 642 | }
| 643 | |
| 644 | private static function decodeCharReferencesCallback( array $matches ): string { // Handler for decodeCharReferences(); unmatched groups are null (PREG_UNMATCHED_AS_NULL)
| 645 | if ( isset( $matches[1] ) ) {
| 646 | return self::decodeEntity( $matches[1] );
| 647 | } elseif ( isset( $matches[2] ) ) {
| 648 | // Value is user provided string and may exceed native int bounds.
| 649 | // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
| 650 | return self::decodeChar( @intval( $matches[2] ) );
| 651 | } elseif ( isset( $matches[3] ) ) {
| 652 | $point = hexdec( $matches[3] );
| 653 | // hexdec() might return a float if the string is too long
| 654 | if ( !is_int( $point ) ) {
| 655 | // Invalid character reference.
| 656 | return \UtfNormal\Constants::UTF8_REPLACEMENT;
| 657 | }
| 658 | return self::decodeChar( $point );
| 659 | }
| 660 | # Last case should be an ampersand by itself; it is returned unchanged
| 661 | return $matches[0];
| 662 | }
| 663 | |
| 664 | /**
| 665 | * Return the UTF-8 string for a codepoint accepted by validateCodepoint(),
| 666 | * otherwise U+FFFD REPLACEMENT CHARACTER.
| 667 | * @internal
| 668 | */
| 669 | private static function decodeChar( int $codepoint ): string {
| 670 | if ( self::validateCodepoint( $codepoint ) ) {
| 671 | return \UtfNormal\Utils::codepointToUtf8( $codepoint );
| 672 | } else {
| 673 | return \UtfNormal\Constants::UTF8_REPLACEMENT;
| 674 | }
| 675 | }
| 676 | |
| 677 | /**
| 678 | * If the named entity (after resolving MediaWiki aliases) is defined in HTML5,
| 679 | * return the UTF-8 encoding of that character. Otherwise, returns
| 680 | * pseudo-entity source (eg "&foo;")
| 681 | *
| 682 | * @param string $name Semicolon-terminated entity name
| 683 | * @return string
| 684 | */
| 685 | private static function decodeEntity( string $name ): string {
| 686 | // These are MediaWiki-specific entities, not in the HTML standard
| 687 | if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
| 688 | $name = self::MW_ENTITY_ALIASES[$name];
| 689 | }
| 690 | $trans = HTMLData::NAMED_ENTITY_TRANSLATION[$name] ?? null;
| 691 | return $trans ?? "&$name";
| 692 | }
| 693 | |
/**
 * Look up the attributes that are acceptable on a given element.
 *
 * @param string $element
 * @return array<string,int> Associative array keyed by acceptable
 *   attribute name; empty when the element is not in the table
 */
private static function attributesAllowedInternal( string $element ): array {
	$allowedByElement = self::setupAttributesAllowedInternal();
	return array_flip( $allowedByElement[$element] ?? [] );
}
| 706 | |
/**
 * Build the table that maps each allowed HTML element to the list of
 * attributes allowed on it.
 *
 * The table is kept in a function-local static, so it is only built on
 * the first call.
 *
 * @return array<string,string[]>
 */
private static function setupAttributesAllowedInternal(): array {
	static $table = null;

	if ( $table !== null ) {
		return $table;
	}

	$common = [
		// HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		// WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		// RDFa
		// These attributes are specified in section 9 of
		// https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		// Microdata. These are specified by
		// https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	];

	// Common attributes followed by the given extra attribute lists, in order.
	$with = static fn ( array ...$extra ): array => array_merge( $common, ...$extra );

	$block = $with( [ 'align' ] );
	$tableAlign = [ 'align', 'valign' ];
	$tableCell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		// The remaining cell attributes are deprecated
		'nowrap',
		'width',
		'height',
		'bgcolor',
	];
	$quotation = $with( [ 'cite' ] );
	$edit = $with( [ 'cite', 'datetime' ] );
	$columns = $with( [ 'span' ] );
	$cell = $with( $tableCell, $tableAlign );

	// Numbers refer to sections in HTML 4.01 standard describing the element.
	// See: https://www.w3.org/TR/html4/
	$table = [
		// 7.5.4
		'div' => $block,
		// center is deprecated
		'center' => $common,
		'span' => $common,

		// 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		// 7.5.6: address is not listed

		// 8.2.4
		'bdo' => $common,

		// 9.2.1 (acronym is not listed)
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,

		// 9.2.2
		'blockquote' => $quotation,
		'q' => $quotation,

		// 9.2.3
		'sub' => $common,
		'sup' => $common,

		// 9.3.1
		'p' => $block,

		// 9.3.2
		'br' => $with( [ 'clear' ] ),

		// https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		// 9.3.4
		'pre' => $with( [ 'width' ] ),

		// 9.4
		'ins' => $edit,
		'del' => $edit,

		// 10.2
		'ul' => $with( [ 'type' ] ),
		'ol' => $with( [ 'type', 'start', 'reversed' ] ),
		'li' => $with( [ 'type', 'value' ] ),

		// 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		// 11.2.1
		'table' => $with( [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		// 11.2.2
		'caption' => $block,

		// 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		// 11.2.4
		'colgroup' => $columns,
		'col' => $columns,

		// 11.2.5
		'tr' => $with( [ 'bgcolor' ], $tableAlign ),

		// 11.2.6
		'td' => $cell,
		'th' => $cell,

		// 12.2
		// NOTE: <a> is not allowed directly, but this list of allowed
		// attributes is used from the Parser object.
		// rel/rev are listed especially for RDFa.
		'a' => $with( [ 'href', 'rel', 'rev' ] ),

		// 13.2
		// Not usually allowed, but may be used for extension-style hooks
		// such as <math> when it is rasterized, or if $wgAllowImageTag is
		// true
		'img' => $with( [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		// Attributes for A/V tags added in T163583 / T133673
		'audio' => $with( [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $with( [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $with( [ 'type', 'src' ] ),
		'track' => $with( [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		// 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		// 15.2.2 (basefont is not listed)
		'font' => $with( [ 'size', 'color', 'face' ] ),

		// 15.3
		'hr' => $with( [ 'width' ] ),

		// HTML Ruby annotation text module, simple ruby only.
		// https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		// (rbc is not listed; rt does not get 'rbspan')
		'ruby' => $common,
		'rb' => $common,
		'rp' => $common,
		'rt' => $common,
		'rtc' => $common,

		// MathML root element, where used for extensions
		// 'title' may not be 100% valid here; it's XHTML
		// https://www.w3.org/TR/REC-MathML/
		'math' => [ 'class', 'style', 'id', 'title' ],

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		// HTML 5 section 4.6
		'bdi' => $common,

		// HTML5 elements, defined by:
		// https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $with( [ 'value' ] ),
		'time' => $with( [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => [ 'itemprop', 'content' ],
		'link' => [ 'itemprop', 'href', 'title' ],

		// HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $table;
}
| 940 | |
/**
 * Remove every match of self::IDN_RE_G from a host name.
 *
 * This code is part of Sanitizer::cleanUrl in core.
 *
 * @param string $host
 * @return string The host with the matches removed. If preg_replace()
 *   reports an error (it returns null in that case) the host is returned
 *   unchanged, mirroring what encodeUrlForExtLink() does, rather than
 *   violating the declared string return type.
 */
private static function stripIDNs( string $host ): string {
	return preg_replace( self::IDN_RE_G, '', $host ) ?? $host;
}
| 949 | |
/**
 * Urlencode chars not in the legacy parser's Parser::EXT_LINK_URL_CLASS
 *
 * The pipe char is an exception introduced in core commit 2519512.
 * Should the regex replacement fail, the input is returned unchanged.
 *
 * @param string $href
 * @return string
 */
public static function encodeUrlForExtLink( string $href ): string {
	$encoded = preg_replace_callback(
		'/([\][<>"\x00-\x20\x7F\|])/',
		static fn ( $found ) => urlencode( $found[0] ),
		$href
	);
	return $encoded ?? $href;
}
| 962 | |
/**
 * Clean up a URL for use in an attribute.
 *
 * Unless $mode is 'wikilink', the URL is first passed through
 * encodeUrlForExtLink(). It is then split into protocol, host and path;
 * the host has IDN characters stripped and a percent-encoded IPv6
 * literal restored to its bracketed form.
 *
 * @param SiteConfig $siteConfig
 * @param string $href
 * @param string $mode
 * @return string|null The cleaned URL, or null if it has a protocol that
 *   the site configuration does not accept
 */
public static function cleanUrl( SiteConfig $siteConfig, string $href, string $mode ): ?string {
	if ( $mode !== 'wikilink' ) {
		$href = self::encodeUrlForExtLink( $href );
	}

	$parts = [];
	if ( preg_match( '#^((?:[a-zA-Z][^:/]*:)?(?://)?)([^/]+)(/?.*)#', $href, $parts ) !== 1 ) {
		// Nothing recognizable as protocol/host: the whole string is the path.
		return $href;
	}

	[ , $proto, $rawHost, $path ] = $parts;
	if ( $proto && !$siteConfig->hasValidProtocol( $proto ) ) {
		// invalid proto, disallow URL
		return null;
	}

	$host = self::stripIDNs( $rawHost );
	$ipv6 = [];
	preg_match( '/^%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$/D', $host, $ipv6 );
	if ( $ipv6 ) {
		// IPv6 host names
		$host = '[' . $ipv6[1] . ']' . $ipv6[2];
	}

	return $proto . $host . $path;
}
| 995 | |
/**
 * Sanitize the attributes of a tag.
 *
 * The result is keyed by lowercased attribute name. Each entry is a
 * tuple of:
 *  - the sanitized value, or null when the attribute is rejected
 *  - the original value
 *  - the original key
 *  - whether shadowing of the value is forced
 *
 * Note that the attribute objects in $attrs are updated in place: their
 * keys and values are converted to strings.
 *
 * @param SiteConfig $siteConfig
 * @param ?string $tagName
 * @param ?XMLTagTk $token
 * @param array $attrs
 *
 * @return array<string, list{?string, mixed, mixed, bool}>
 */
public static function sanitizeTagAttrs(
	SiteConfig $siteConfig, ?string $tagName, ?XMLTagTk $token, array $attrs
): array {
	$tag = $tagName ?: $token->getName();
	$allowedForTag = self::attributesAllowedInternal( $tag );

	// Attributes holding lists of HTML ids
	$idRefListAttrs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttrs = [
		'rel', 'rev',
		// RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		// HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	// 'poster' was added for T163583
	$urlAttrs = [ 'href', 'src', 'poster' ];

	$sanitized = [];
	$total = count( $attrs );
	for ( $idx = 0; $idx < $total; $idx++ ) {
		$attr = $attrs[$idx];
		$attr->v ??= '';

		// Convert attributes to string, if necessary.
		$attr->k = TokenUtils::tokensToString( $attr->k );

		if ( is_array( $attr->v ) ) {
			// Use the expanded attr instead of trying to unpackDOMFragments
			// since the fragment will have been released when expanding to DOM.
			// See the comment in TokenUtils::tokensToString about
			// unpackDOMFragments for why we're just using the textContent.
			$expanded = $token ? $token->fetchExpandedAttrValue( $attr->k ) : null;
			$attr->v = $expanded === null
				? TokenUtils::tokensToString( $attr->v )
				: $expanded->textContent;
		}

		$value = $attr->v;
		$srcKey = $attr->ksrc ?? $attr->k;
		$srcValue = $attr->vsrc ?? $value;
		$isParsoidAttr = self::isParsoidAttr( $attr->k, $value, $attrs );

		// $attr->k can be uppercase
		$name = mb_strtolower( $attr->k );

		// Entry recorded for an attribute that is dropped
		$rejected = [ null, $srcValue, $srcKey, false ];

		// Bypass RDFa/allowed attribute checks for Parsoid-inserted attrs.
		// Safe to do since the tokenizer renames about/typeof attrs
		// unconditionally. FIXME: The escaping solution in the tokenizer
		// may be aggressive. There is no need to escape typeof strings
		// or about ids that don't resemble Parsoid tokens/about ids.
		if ( !$isParsoidAttr ) {
			if ( !preg_match( self::GET_ATTRIBS_RE, $name ) ) {
				$sanitized[$name] = $rejected;
				continue;
			}

			// Allow XML namespace declaration to allow RDFa
			if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
				$sanitized[$name] = preg_match( self::EVIL_URI_PATTERN, $value )
					? $rejected
					: [ $value, $srcValue, $srcKey, false ];
				continue;
			}

			// Allow any attribute beginning with "data-"
			// However:
			// * Disallow data attributes used by MediaWiki code
			// * Ensure that the attribute is not namespaced by banning
			//   colons.
			// * Ensure attribute name will be accepted by the HTML
			//   parser; see
			//   https://github.com/whatwg/dom/issues/849#issuecomment-1007541209
			if ( !( preg_match( '|^data-[^:= \t\r\n/>\0]*$|iD', $name ) || isset( $allowedForTag[$name] ) )
				|| self::isReservedDataAttribute( $name )
			) {
				$sanitized[$name] = $rejected;
				continue;
			}
		}

		// Strip javascript "expression" from stylesheets.
		// http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $name === 'style' ) {
			$value = self::checkCss( $value );
		}

		// Escape HTML id attributes
		if ( $name === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
			if ( $value === '' ) {
				$sanitized[$name] = $rejected;
				continue;
			}
			if ( CounterType::NODE_DATA_ID->matches( $value ) ) {
				// Force shadowing for Parsoid-like ids so we can distinguish
				// them from ones added for the pagebundle
				$sanitized[$name] = [ $value, $srcValue, $srcKey, true ];
				continue;
			}
		}

		// Escape HTML id reference lists
		if ( in_array( $name, $idRefListAttrs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Check URI-like properties for validity.
		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $uriLikeAttrs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			// Retain the Parsoid typeofs for Parsoid attrs
			$kept = $isParsoidAttr
				? trim( preg_replace( '/(?:^|\s)(?!mw:\w)\S*/', '', $srcValue ) )
				: null;
			$sanitized[$name] = [ $kept, $srcValue, $srcKey, false ];
			continue;
		}

		// NOTE: even though elements using href/src are not allowed directly, supply
		// validation code that can be used by tag hook handlers, etc
		if ( $token && in_array( $name, $urlAttrs, true ) ) {
			// The original value will always equal the current value, because
			// `vsrc` isn't set, since this attribute didn't come from source.
			// However, in the LinkHandler, we may have already shadowed this
			// value so use that instead.
			$relInfo = $token->getAttributeShadowInfo( 'rel' );
			$isWikiLink = $name === 'href'
				&& isset( $relInfo['value'] )
				&& preg_match( '#^mw:WikiLink(/Interwiki)?$#', $relInfo['value'] );
			$shadowedHref = $token->getAttributeShadowInfo( $name )['value'];
			$cleaned = self::cleanUrl(
				$siteConfig, $value, $isWikiLink ? 'wikilink' : 'external'
			);
			if ( $cleaned !== $value ) {
				$sanitized[$name] = [ $cleaned, $shadowedHref, $srcKey, false ];
				continue;
			}
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $name === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// SSS FIXME: This logic is not RT-friendly.
		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$sanitized[$name] = [ $value, $srcValue, $srcKey, false ];
	}

	// itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $sanitized ) ) {
		// SSS FIXME: This logic is not RT-friendly.
		unset( $sanitized['itemtype'], $sanitized['itemid'], $sanitized['itemref'] );
	}
	// TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $sanitized;
}
| 1170 | |
/**
 * Sanitize attributes and set the surviving ones on a wrapper element.
 *
 * Used primarily when we're applying tokenized attributes directly to
 * dom elements, which wouldn't have had a chance to be sanitized before
 * tree building. Attributes whose sanitized value is null are skipped.
 *
 * @param SiteConfig $siteConfig
 * @param Element $wrapper wrapper
 * @param array $attrs attributes
 */
public static function applySanitizedArgs(
	SiteConfig $siteConfig, Element $wrapper, array $attrs
): void {
	$sanitized = self::sanitizeTagAttrs(
		$siteConfig, DOMUtils::nodeName( $wrapper ), null, $attrs
	);
	foreach ( $sanitized as $name => $info ) {
		if ( !isset( $info[0] ) ) {
			continue;
		}
		$wrapper->setAttribute( $name, $info[0] );
	}
}
| 1192 | |
/**
 * Sanitize a title to be used in a URI.
 *
 * The title is split at the first '#'. A fixed set of characters in the
 * part before it is URI-component-encoded; the part after it (the
 * anchor) is escaped as an id and re-attached behind a '#'.
 *
 * @param string $title
 * @param bool $isInterwiki Selects the id escaping used for the anchor
 * @return string
 * @note This is a Parsoid-only method
 */
public static function sanitizeTitleURI( string $title, bool $isInterwiki = false ): string {
	// split at first '#'
	$pieces = explode( '#', $title, 2 );
	$anchor = $pieces[1] ?? null;

	$sanitized = preg_replace_callback(
		'/[%? \[\]#|<>\\\\]/',
		static fn ( $found ) => PHPUtils::encodeURIComponent( $found[0] ),
		$pieces[0]
	);

	if ( $anchor === null ) {
		return $sanitized;
	}

	$escapedAnchor = $isInterwiki
		? self::escapeIdForExternalInterwiki( $anchor )
		: self::escapeIdForLink( $anchor );
	return $sanitized . '#' . $escapedAnchor;
}
| 1218 | |
| 1219 | // PORT_FIXME - The delimiterReplace code below is from StringUtils in core |
| 1220 | |
/**
 * Perform an operation equivalent to `preg_replace_callback()`
 *
 * Matches this code:
 *
 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 *
 * If the start delimiter ends with an initial substring of the end delimiter,
 * e.g. in the case of C-style comments, the behavior differs from the model
 * regex. In this implementation, the end must share no characters with the
 * start, so e.g. `/*\/` is not considered to be both the start and end of a
 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 *
 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 * but uses far less memory. The delimiters are literal strings, not regular expressions.
 *
 * The callback receives a two-element array: the whole delimited section
 * (delimiters included) and the text between the delimiters. Its return
 * value replaces the section. Text outside delimited sections, unmatched
 * end delimiters and an unterminated start delimiter are copied through
 * unchanged.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Function to call on each match
 * @param string $subject
 * @param string $flags Regular expression flags
 * @throws InvalidArgumentException If a delimiter produces a zero-length
 *   match, i.e. an empty delimiter is reached while scanning
 * @return string
 */
private static function delimiterReplaceCallback(
	string $startDelim, string $endDelim, callable $callback, string $subject, string $flags = ''
): string {
	$inputPos = 0;
	$outputPos = 0;
	$contentPos = 0;
	$output = '';
	$foundStart = false;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	$strcmp = !str_contains( $flags, 'i' ) ? 'strcmp' : 'strcasecmp';
	$endLength = strlen( $endDelim );
	// Loop invariants: neither the subject nor the pattern changes while scanning.
	$subjectLength = strlen( $subject );
	$pattern = "!($encStart)|($encEnd)!S$flags";
	$m = [];
	while ( $inputPos < $subjectLength &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		if ( $m[0][0] === '' ) {
			// A zero-length match can only come from an empty delimiter.
			// It would never advance $inputPos, so reject it up front
			// instead of looping forever.
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
		$tokenOffset = $m[0][1];
		$tokenLength = strlen( $m[0][0] );
		// Group 1 is the start delimiter; when it is empty, the (non-empty)
		// match must have come from group 2, the end delimiter.
		$isStart = $m[1][0] !== '';
		if ( $isStart && $foundStart &&
			$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
		) {
			# An end match is present at the same location
			$isStart = false;
			$tokenLength = $endLength;
		}

		if ( $isStart ) {
			# Only move the start position if we haven't already found a start
			# This means that START START END matches outer pair
			if ( !$foundStart ) {
				# Found start
				$inputPos = $tokenOffset + $tokenLength;
				# Write out the non-matching section
				$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
				$outputPos = $tokenOffset;
				$contentPos = $inputPos;
				$foundStart = true;
			} else {
				# Move the input position past the *first character* of START,
				# to protect against missing END when it overlaps with START
				$inputPos = $tokenOffset + 1;
			}
		} else {
			if ( $foundStart ) {
				# Found match
				$output .= $callback( [
					substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
					substr( $subject, $contentPos, $tokenOffset - $contentPos )
				] );
				$foundStart = false;
			} else {
				# Non-matching end, write it out together with the text before it.
				# While no start is pending, $inputPos and $outputPos are equal,
				# so the pending text begins at $outputPos.
				$output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
			}
			$inputPos = $outputPos = $tokenOffset + $tokenLength;
		}
	}
	# Copy whatever is left, including an unterminated start section
	if ( $outputPos < $subjectLength ) {
		$output .= substr( $subject, $outputPos );
	}
	return $output;
}
| 1317 | |
/**
 * Perform an operation equivalent to `preg_replace()` with flags.
 *
 * Matches this code:
 *
 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 *
 * The work is delegated to delimiterReplaceCallback(), which treats the
 * delimiters as literal strings.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement string. May contain $0, which will be
 *   replaced by the whole delimited section, and $1, which will be
 *   replaced by the text between the delimiters
 * @param string $subject String to search
 * @param string $flags Regular expression flags
 * @return string The string with the matches replaced
 */
private static function delimiterReplace(
	string $startDelim, string $endDelim, string $replace, string $subject, string $flags = ''
): string {
	$substitute = static fn ( array $found ) => strtr(
		$replace,
		[ '$0' => $found[0], '$1' => $found[1] ]
	);
	return self::delimiterReplaceCallback( $startDelim, $endDelim, $substitute, $subject, $flags );
}
| 1344 | } |