Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
5.44% |
24 / 441 |
|
3.03% |
1 / 33 |
CRAP | |
0.00% |
0 / 1 |
Sanitizer | |
5.44% |
24 / 441 |
|
3.03% |
1 / 33 |
19172.82 | |
0.00% |
0 / 1 |
attributesAllowedInternal | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
setupAttributesAllowedInternal | |
0.00% |
0 / 127 |
|
0.00% |
0 / 1 |
6 | |||
normalizeCharReferences | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
normalizeCharReferencesCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
normalizeEntity | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
decCharReference | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
hexCharReference | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
validateCodepoint | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
110 | |||
codepointToUtf8 | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
utf8ToCodepoint | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
stripIDNs | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cleanUrl | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
decodeEntity | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
decodeChar | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
decodeCharReferences | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
normalizeCss | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
20 | |||
delimiterReplaceCallback | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
182 | |||
delimiterReplace | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
isParsoidAttr | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
42 | |||
isReservedDataAttribute | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
sanitizeTagAttrs | |
0.00% |
0 / 74 |
|
0.00% |
0 / 1 |
2070 | |||
applySanitizedArgs | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
checkCss | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
4.05 | |||
cssDecodeCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
72 | |||
sanitizeTitleURI | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
armorFrenchSpaces | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
escapeIdForAttribute | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
escapeIdForLink | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeIdForExternalInterwiki | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeIdInternalUrl | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
escapeIdInternal | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
escapeIdReferenceList | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
normalizeSectionNameWhiteSpace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | /** |
5 | * General token sanitizer. Strips out (or encapsulates) unsafe and disallowed |
6 | * tag types and attributes. Should run last in the third, synchronous |
7 | * expansion stage. |
8 | * |
9 | * FIXME: This code was originally ported from PHP to JS in 2012 |
10 | * and periodically updated before being ported back to PHP. This code should be |
11 | * (a) resynced with core sanitizer changes (b) updated to use HTML5 spec |
12 | */ |
13 | |
14 | namespace Wikimedia\Parsoid\Core; |
15 | |
16 | use InvalidArgumentException; |
17 | use Wikimedia\Assert\Assert; |
18 | use Wikimedia\Parsoid\Config\SiteConfig; |
19 | use Wikimedia\Parsoid\DOM\Element; |
20 | use Wikimedia\Parsoid\Tokens\KV; |
21 | use Wikimedia\Parsoid\Tokens\Token; |
22 | use Wikimedia\Parsoid\Utils\DOMCompat; |
23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
26 | use Wikimedia\RemexHtml\HTMLData; |
27 | |
28 | class Sanitizer { |
29 | /** |
30 | * RDFa and microdata properties allow URLs, URIs and/or CURIs. |
31 | */ |
32 | private const MICRODATA = [ |
33 | 'rel' => true, |
34 | 'rev' => true, |
35 | 'about' => true, |
36 | 'property' => true, |
37 | 'resource' => true, |
38 | 'datatype' => true, |
39 | 'typeof' => true, // RDFa |
40 | 'itemid' => true, |
41 | 'itemprop' => true, |
42 | 'itemref' => true, |
43 | 'itemscope' => true, |
44 | 'itemtype' => true, |
45 | ]; |
46 | |
47 | private const UTF8_REPLACEMENT = "\u{FFFD}"; |
48 | |
49 | /** |
50 | * Regular expression to match various types of character references in |
51 | * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences |
52 | */ |
53 | private const CHAR_REFS_REGEX = |
54 | '/&([A-Za-z0-9\x80-\xff]+;) |
55 | |&\#([0-9]+); |
56 | |&\#[xX]([0-9A-Fa-f]+); |
57 | |&/x'; |
58 | |
59 | private const INSECURE_RE = '! expression |
60 | | accelerator\s*: |
61 | | -o-link\s*: |
62 | | -o-link-source\s*: |
63 | | -o-replace\s*: |
64 | | url\s*\( |
65 | | src\s*\( |
66 | | image\s*\( |
67 | | image-set\s*\( |
68 | | attr\s*\([^)]+[\s,]+url |
69 | !ix'; |
70 | |
71 | /** |
72 | * Pattern matching evil uris like javascript: |
73 | * WARNING: DO NOT use this in any place that actually requires denying |
74 | * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass |
75 | * pattern-based deny lists; the only way to be secure from javascript: |
76 | * uri based xss vectors is to allow only things that you know are safe |
77 | * and deny everything else. |
78 | * [1]: http://ha.ckers.org/xss.html |
79 | */ |
80 | private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)(\W|$)!iD'; |
81 | private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/D"; |
82 | |
83 | /** |
84 | * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding. |
85 | * |
86 | * @since 1.30 |
87 | */ |
88 | public const ID_PRIMARY = 0; |
89 | |
90 | /** |
91 | * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false |
92 | * if no fallback is configured. |
93 | * |
94 | * @since 1.30 |
95 | */ |
96 | public const ID_FALLBACK = 1; // public because it is accessed in Headings handler |
97 | |
98 | /** Characters that will be ignored in IDNs. |
99 | * https://datatracker.ietf.org/doc/html/rfc8264#section-9.13 |
100 | * https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
101 | * Strip them before further processing so deny lists and such work. |
102 | * Part of Sanitizer::cleanUrl in core. |
103 | */ |
104 | private const IDN_RE_G = "/ |
105 | \\s| # general whitespace |
106 | \u{00AD}| # SOFT HYPHEN |
107 | \u{034F}| # COMBINING GRAPHEME JOINER |
108 | \u{061C}| # ARABIC LETTER MARK |
109 | [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER.. |
110 | # HANGUL JUNGSEONG FILLER |
111 | [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ.. |
112 | # KHMER VOWEL INHERENT AA |
113 | [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE.. |
114 | # MONGOLIAN FREE VARIATION SELECTOR THREE |
115 | \u{180E}| # MONGOLIAN VOWEL SEPARATOR |
116 | [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE.. |
117 | # RIGHT-TO-LEFT MARK |
118 | [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING.. |
119 | # RIGHT-TO-LEFT OVERRIDE |
120 | [\u{2060}-\u{2064}]| # WORD JOINER.. |
121 | # INVISIBLE PLUS |
122 | \u{2065}| # <reserved-2065> |
123 | [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE.. |
124 | # NOMINAL DIGIT SHAPES |
125 | \u{3164}| # HANGUL FILLER |
126 | [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1.. |
127 | # VARIATION SELECTOR-16 |
128 | \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE |
129 | \u{FFA0}| # HALFWIDTH HANGUL FILLER |
130 | [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>.. |
131 | # <reserved-FFF8> |
132 | [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP.. |
133 | # SHORTHAND FORMAT UP STEP |
134 | [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM.. |
135 | # MUSICAL SYMBOL END PHRASE |
136 | \u{E0000}| # <reserved-E0000> |
137 | \u{E0001}| # LANGUAGE TAG |
138 | [\u{E0002}-\u{E001F}]| # <reserved-E0002>.. |
139 | # <reserved-E001F> |
140 | [\u{E0020}-\u{E007F}]| # TAG SPACE.. |
141 | # CANCEL TAG |
142 | [\u{E0080}-\u{E00FF}]| # <reserved-E0080>.. |
143 | # <reserved-E00FF> |
144 | [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17.. |
145 | # VARIATION SELECTOR-256 |
146 | [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>.. |
147 | # <reserved-E0FFF> |
148 | /xuD"; |
149 | |
150 | private const GET_ATTRIBS_RE = '/^[:_\p{L}\p{N}][:_\.\-\p{L}\p{N}]*$/uD'; |
151 | |
152 | /** |
153 | * Character entity aliases accepted by MediaWiki in wikitext. |
154 | * These are not part of the HTML standard. |
155 | */ |
156 | private const MW_ENTITY_ALIASES = [ |
157 | 'רלמ;' => 'rlm;', |
158 | 'رلم;' => 'rlm;', |
159 | ]; |
160 | |
/**
 * Look up the attributes that are permitted on a given HTML element.
 *
 * PORT-FIXME: this method is private in core, but used by Gallery.
 *
 * @param string $element Element name, used verbatim as the lookup key
 * @return array<string,int> Allowed attribute names as keys (the values are
 *   their positions in the allow list); empty for an unlisted element
 */
public static function attributesAllowedInternal( string $element ): array {
	$allowedByElement = self::setupAttributesAllowedInternal();
	if ( !isset( $allowedByElement[$element] ) ) {
		return [];
	}
	return array_flip( $allowedByElement[$element] );
}
173 | |
/**
 * Build (once) and return the map from each allowed HTML element name to
 * the list of attribute names permitted on it.
 *
 * The map is cached in a function-static variable, so it is only
 * constructed on the first call.
 *
 * @return array<string,string[]>
 */
private static function setupAttributesAllowedInternal(): array {
	static $cache = null;

	if ( $cache !== null ) {
		return $cache;
	}

	$common = [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	];

	// Shorthand for "the common attributes followed by these extras".
	$with = static function ( string ...$extra ) use ( $common ): array {
		return array_merge( $common, $extra );
	};

	$block = $with( 'align' );
	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];
	$cell = $with( ...$tablecell, ...$tablealign );
	$cited = $with( 'cite' );
	$edit = $with( 'cite', 'datetime' );
	$spanned = $with( 'span' );

	# Section numbers refer to the HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$cache = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $cited,
		'q' => $cited,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $with( 'clear' ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $with( 'width' ),

		# 9.4
		'ins' => $edit,
		'del' => $edit,

		# 10.2
		'ul' => $with( 'type' ),
		'ol' => $with( 'type', 'start', 'reversed' ),
		'li' => $with( 'type', 'value' ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $with(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor'
		),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $spanned,
		'col' => $spanned,

		# 11.2.5
		'tr' => $with( 'bgcolor', ...$tablealign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $with( 'href', 'rel', 'rev' ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $with( 'alt', 'src', 'width', 'height', 'srcset' ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $with( 'controls', 'preload', 'width', 'height' ),
		'video' => $with( 'poster', 'controls', 'preload', 'width', 'height' ),
		'source' => $with( 'type', 'src' ),
		'track' => $with( 'type', 'src', 'srclang', 'kind', 'label' ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $with( 'size', 'color', 'face' ),
		# basefont

		# 15.3
		'hr' => $with( 'width' ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # 'rbspan' is deliberately not added here
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => [ 'class', 'style', 'id', 'title' ],

		# HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $with( 'value' ),
		'time' => $with( 'datetime' ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => [ 'itemprop', 'content' ],
		'link' => [ 'itemprop', 'href', 'title' ],

		# HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $cache;
}
407 | |
/**
 * Rewrite every entity and character reference in $text so the result is
 * legal for XML and XHTML. Anything that looks like a reference but is
 * not a valid one is &-escaped, so the output is a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @internal
 */
public static function normalizeCharReferences( string $text ): string {
	// Unmatched alternatives are reported as null so the callback can
	// tell which branch of the regex fired.
	$normalizeOne = static function ( array $matches ): string {
		return self::normalizeCharReferencesCallback( $matches );
	};
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$normalizeOne,
		$text,
		-1,
		$count,
		PREG_UNMATCHED_AS_NULL
	);
}
430 | |
/**
 * Normalize one match of CHAR_REFS_REGEX.
 *
 * Group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference. A bare ampersand, or a numeric reference that is
 * rejected, is returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		return self::normalizeEntity( $matches[1] );
	}

	$normalized = null;
	if ( isset( $matches[2] ) ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( isset( $matches[3] ) ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	return $normalized ?? htmlspecialchars( $matches[0] );
}
450 | |
/**
 * Normalize a single named entity.
 *
 * - A MediaWiki-specific alias is replaced by its HTML equivalent.
 * - The core entities &lt; &gt; &amp; &quot; are kept in word form.
 * - Any other entity defined in HTML5 is converted to the equivalent
 *   numeric character reference(s).
 * - Anything else is not an entity at all: its leading ampersand is
 *   HTML-escaped so the pseudo-entity (eg &foo;) shows up as literal text.
 *
 * @param string $name Semicolon-terminated name, without the leading "&"
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		// Beware: some entities expand to more than 1 codepoint
		return preg_replace_callback(
			'/./Ssu',
			static function ( $m ) {
				return '&#' . self::utf8ToCodepoint( $m[0] ) . ';';
			},
			HTMLData::$namedEntityTranslations[$name]
		);
	}

	// Unknown name: escape the ampersand. Emitting "&$name" unchanged
	// would leave an undefined entity in the output, which is exactly
	// what normalization is supposed to prevent.
	return "&amp;$name";
}
477 | |
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return null|string "&#N;" for a valid code point, null otherwise
 */
private static function decCharReference( string $codepoint ): ?string {
	// intval() (safely) saturates at the maximum signed integer value when
	// given too many digits, and that value then fails validation.
	$value = intval( $codepoint );
	return self::validateCodepoint( $value ) ? "&#$value;" : null;
}
492 | |
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return null|string Lower-cased "&#x…;" for a valid code point, null otherwise
 */
private static function hexCharReference( string $codepoint ): ?string {
	$value = hexdec( $codepoint );
	// hexdec() returns a float when the string is too long for an int
	if ( !is_int( $value ) || !self::validateCodepoint( $value ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $value );
}
506 | |
/**
 * Tell whether a Unicode code point is a valid character in both HTML5
 * and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
	# Tab and line feed are the only accepted characters below U+0020:
	# U+000C is valid in HTML5 but not allowed in XML, and
	# U+000D is valid in XML but not allowed in HTML5.
	if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
		return true;
	}

	# U+007F - U+009F are disallowed in HTML5 (control characters), and
	# the gaps between the remaining ranges are excluded as well.
	$validRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $validRanges as [ $first, $last ] ) {
		if ( $codepoint >= $first && $codepoint <= $last ) {
			return true;
		}
	}
	return false;
}
524 | |
/**
 * Encode a code point as a UTF-8 string.
 *
 * @param int $cp
 * @return string
 */
private static function codepointToUtf8( int $cp ): string {
	$encoded = mb_chr( $cp, 'UTF-8' );
	// mb_chr() returns false when it cannot produce a character
	Assert::invariant( $encoded !== false, "Getting char failed!" );
	return $encoded;
}
536 | |
/**
 * Get the code point of the first character of a string.
 *
 * @param string $str
 * @return int
 */
private static function utf8ToCodepoint( string $str ): int {
	$codepoint = mb_ord( $str );
	// mb_ord() returns false when it cannot read a character
	Assert::invariant( $codepoint !== false, "Getting code point failed!" );
	return $codepoint;
}
548 | |
/**
 * Remove every character matched by IDN_RE_G (whitespace and characters
 * that are ignored in IDNs) from a host name.
 *
 * This code is part of Sanitizer::cleanUrl in core.
 *
 * @param string $host
 * @return string
 */
private static function stripIDNs( string $host ): string {
	$stripped = preg_replace( self::IDN_RE_G, '', $host );
	return $stripped;
}
557 | |
/**
 * Clean up a URL: percent-encode characters that are unsafe in a link
 * target (except in 'wikilink' mode), reject protocols the site does not
 * accept, and strip ignorable characters from the host part.
 *
 * @param SiteConfig $siteConfig Used to check the protocol
 * @param string $href
 * @param string $mode 'wikilink' skips the percent-encoding step
 * @return string|null The cleaned URL, or null if its protocol is not valid
 */
public static function cleanUrl( SiteConfig $siteConfig, string $href, string $mode ): ?string {
	if ( $mode !== 'wikilink' ) {
		$href = preg_replace_callback(
			'/([\][<>"\x00-\x20\x7F\|])/',
			static function ( $m ) {
				return urlencode( $m[0] );
			},
			$href
		);
	}

	// Split into (protocol and/or "//"), host, and the remaining path.
	$bits = [];
	if ( preg_match( '#^((?:[a-zA-Z][^:/]*:)?(?://)?)([^/]+)(/?.*)#', $href, $bits ) !== 1 ) {
		// Nothing recognisable as a host: the whole thing is the path.
		return $href;
	}
	[ , $proto, $rawHost, $path ] = $bits;

	if ( $proto && !$siteConfig->hasValidProtocol( $proto ) ) {
		// invalid proto, disallow URL
		return null;
	}

	$host = self::stripIDNs( $rawHost );
	$ipv6 = [];
	if ( preg_match( '/^%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$/D', $host, $ipv6 ) ) {
		// IPv6 host names: restore the brackets that were percent-encoded above
		$host = '[' . $ipv6[1] . ']' . $ipv6[2];
	}

	return $proto . $host . $path;
}
594 | |
/**
 * Decode a named entity to the character(s) it stands for.
 *
 * MediaWiki-specific aliases are mapped to their HTML name first. If the
 * name is not defined in HTML5, the pseudo-entity source (eg "&foo;") is
 * returned.
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// These are MediaWiki-specific entities, not in the HTML standard
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;
	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
611 | |
/**
 * Decode a numeric character reference: the UTF-8 string for the code
 * point when it is a valid character reference, otherwise U+FFFD
 * REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? self::codepointToUtf8( $codepoint )
		: self::UTF8_REPLACEMENT;
}
625 | |
/**
 * Replace every character reference in the text (named entity, decimal
 * or hexadecimal) by the character it denotes, returning a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
	$decodeOne = static function ( $m ) {
		if ( ( $m[1] ?? null ) !== null ) {
			// Named entity
			return self::decodeEntity( $m[1] );
		}
		if ( ( $m[2] ?? null ) !== null ) {
			// Decimal reference
			return self::decodeChar( intval( $m[2] ) );
		}
		if ( ( $m[3] ?? null ) !== null ) {
			// Hexadecimal reference; hexdec() returns a float when the
			// string is too long, which makes the reference invalid.
			$point = hexdec( $m[3] );
			return is_int( $point )
				? self::decodeChar( $point )
				: self::UTF8_REPLACEMENT;
		}
		# Last case should be an ampersand by itself
		return $m[0];
	};

	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$decodeOne,
		$text,
		-1,
		$count,
		PREG_UNMATCHED_AS_NULL
	);
}
655 | |
/**
 * Bring CSS into a form that is easy to search for hostile input:
 * - decode character references
 * - decode escape sequences
 * - convert characters that IE6 interprets into ascii
 * - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss( string $value ): string {
	// Step 1: decode character references like &#123;
	$css = self::decodeCharReferences( $value );

	// Step 2: decode escape sequences and line continuations; see the
	// grammar in the CSS 2 spec, appendix D.
	// This must come AFTER step 1, so this function can never return
	// unsanitized escape sequences. Input can be manufactured whose
	// character references decode to escape sequences that in turn decode
	// to character references, but it is OK for the return value to
	// contain character references because the caller is supposed to
	// escape those anyway.
	static $escapeRegex;
	if ( !$escapeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$escapeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$css = preg_replace_callback(
		$escapeRegex,
		[ self::class, 'cssDecodeCallback' ],
		$css
	);

	// Step 3: a value that is nothing but a single comment is let through
	// untouched, so that other functions which may reject it can pass
	// some error message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !xD', $css ) ) {
		return $css;
	}

	// Step 4: remove any comments; IE gets token splitting wrong.
	// This must come AFTER steps 1 and 2, because those can introduce
	// comments. It cannot itself introduce character references or escape
	// sequences, because each comment is replaced by a space rather than
	// removed completely.
	$css = self::delimiterReplace( '/*', '*/', ' ', $css );

	// Step 5: drop everything after an unterminated comment-start token,
	// to guard against incorrect client implementations.
	$danglingComment = strpos( $css, '/*' );
	if ( $danglingComment === false ) {
		return $css;
	}
	return substr( $css, 0, $danglingComment );
}
716 | |
717 | // PORT_FIXME - The delimiterReplace code below is from StringUtils in core |
718 | |
/**
 * Replace each delimited span of the subject with the result of a
 * callback, in the manner of `preg_replace_callback()`.
 *
 * Modelled on this code:
 *
 *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 *
 * The callback receives a two-element array: the whole span including
 * both delimiters, and the content between them.
 *
 * If the start delimiter ends with an initial substring of the end delimiter,
 * e.g. in the case of C-style comments, the behavior differs from the model
 * regex. In this implementation, the end must share no characters with the
 * start, so e.g. `/*\/` is not considered to be both the start and end of a
 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 *
 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 * but uses far less memory. The delimiters are literal strings, not regular expressions.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Function to call on each match
 * @param string $subject
 * @param string $flags Regular expression flags
 * @throws InvalidArgumentException
 * @return string
 */
private static function delimiterReplaceCallback(
	string $startDelim, string $endDelim, callable $callback, string $subject, string $flags = ''
): string {
	$quotedStart = preg_quote( $startDelim, '!' );
	$quotedEnd = preg_quote( $endDelim, '!' );
	$tokenRegex = "!($quotedStart)|($quotedEnd)!S$flags";
	$compare = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$endLength = strlen( $endDelim );
	$subjectLength = strlen( $subject );

	$result = '';
	$scanFrom = 0; // where the next delimiter search begins
	$copiedTo = 0; // everything before this offset is already in $result
	$contentFrom = 0; // first byte after the currently open start delimiter
	$isOpen = false; // true while a start delimiter awaits its end

	while ( $scanFrom < $subjectLength ) {
		$m = [];
		if ( !preg_match( $tokenRegex, $subject, $m, PREG_OFFSET_CAPTURE, $scanFrom ) ) {
			break;
		}
		[ $token, $at ] = $m[0];

		// Classify the delimiter that was found.
		if ( $m[1][0] !== '' ) {
			# The text matched START, but while a span is open an END
			# found at the same location takes precedence.
			$isEnd = $isOpen &&
				$compare( $endDelim, substr( $subject, $at, $endLength ) ) === 0;
			$length = $isEnd ? $endLength : strlen( $token );
		} elseif ( $m[2][0] !== '' ) {
			$isEnd = true;
			$length = strlen( $token );
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}

		if ( !$isEnd ) {
			if ( $isOpen ) {
				# A span is already open, so START START END matches the
				# outer pair. Step past the *first character* of START only,
				# to protect against missing END when it overlaps with START.
				$scanFrom = $at + 1;
				continue;
			}
			# Opening delimiter: flush the non-matching text before it
			$result .= substr( $subject, $copiedTo, $at - $copiedTo );
			$copiedTo = $at;
			$scanFrom = $contentFrom = $at + $length;
			$isOpen = true;
			continue;
		}

		$spanEnd = $at + $length;
		if ( $isOpen ) {
			# Complete span: hand it to the callback
			$result .= $callback( [
				substr( $subject, $copiedTo, $spanEnd - $copiedTo ),
				substr( $subject, $contentFrom, $at - $contentFrom )
			] );
			$isOpen = false;
		} else {
			# Non-matching end, write it out
			$result .= substr( $subject, $scanFrom, $spanEnd - $copiedTo );
		}
		$scanFrom = $copiedTo = $spanEnd;
	}

	// Copy whatever trails the last delimiter, including an unclosed span.
	if ( $copiedTo < $subjectLength ) {
		$result .= substr( $subject, $copiedTo );
	}
	return $result;
}
815 | |
/**
 * Replace each delimited span of the subject with a fixed replacement
 * string, in the manner of `preg_replace()` with flags.
 *
 * Modelled on this code:
 *
 *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 *
 * The work is delegated to delimiterReplaceCallback(), which quotes both
 * delimiters, so they are matched as literal strings.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement string. May contain $0, which will be
 *   replaced by the whole span including its delimiters, and $1, which
 *   will be replaced by the text between the delimiters
 * @param string $subject String to search
 * @param string $flags Regular expression flags
 * @return string The string with the matches replaced
 */
private static function delimiterReplace(
	string $startDelim, string $endDelim, string $replace, string $subject, string $flags = ''
): string {
	$expand = static function ( array $matches ) use ( $replace ) {
		$placeholders = [
			'$0' => $matches[0],
			'$1' => $matches[1],
		];
		return strtr( $replace, $placeholders );
	};

	return self::delimiterReplaceCallback( $startDelim, $endDelim, $expand, $subject, $flags );
}
842 | |
/**
 * Decide whether an attribute looks like one inserted by Parsoid itself.
 *
 * SSS FIXME: There is a test in mediawiki.environment.js that doles out
 * and tests about ids. There are probably some tests in Util.php as well.
 * We should move all these kind of tests somewhere else.
 *
 * @param string $k Attribute name
 * @param string $v Attribute value
 * @param KV[] $attrs All attributes of the element; consulted for 'property'
 *   when $k is 'content'
 * @return bool
 */
private static function isParsoidAttr( string $k, string $v, array $attrs ): bool {
	// NOTES:
	// 1. Currently the tokenizer unconditionally escapes typeof and about
	//    attributes from wikitext to data-x-typeof and data-x-about. So,
	//    this check will only pass through Parsoid inserted attrs.
	// 2. But, if we fix the over-aggressive escaping in the tokenizer to
	//    not escape non-Parsoid typeof and about, then this will return
	//    true for something like typeof='mw:Foo evilScriptHere'. But, that
	//    is safe since this check is only used to see if we should
	//    unconditionally discard the entire attribute or process it further.
	//    That further processing will catch and discard any dangerous
	//    strings in the rest of the attribute.

	// A whitespace-delimited token starting with "mw:".
	$mwTokenRe = '/(?:^|\s)mw:.+?(?=$|\s)/D';

	if ( in_array( $k, [ 'typeof', 'property', 'rel' ], true ) && preg_match( $mwTokenRe, $v ) ) {
		return true;
	}

	if ( $k === 'about' && preg_match( '/^#mwt\d+$/D', $v ) ) {
		return true;
	}

	// 'content' is judged by the sibling 'property' attribute, not by its own value.
	return $k === 'content'
		&& preg_match( $mwTokenRe, KV::lookup( $attrs, 'property' ) ?? '' );
}
870 | |
/**
 * Tell whether an attribute name is a reserved data attribute
 * (such as data-ooui) that user-generated HTML may not use, so that
 * MediaWiki core and extension code can rely on it when talking to
 * frontend code.
 *
 * @param string $attr Attribute name.
 * @return bool
 */
public static function isReservedDataAttribute( string $attr ): bool {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.

	// PARSOID SPECIFIC: names starting with (exactly lowercase) data-mw or
	// data-parsoid are not reported as reserved here.
	$isParsoidOwned = preg_match( '/^data-(mw|parsoid)/', $attr );

	return $isParsoidOwned
		? false
		: (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
}
891 | |
/**
 * Sanitize the attributes of a tag.
 *
 * The result is keyed by lowercased attribute name. Every entry is a
 * three-element list: [ sanitized value or null when rejected, original
 * value, original key ]. Note that the KV objects in $attrs are updated in
 * place (their key and value are converted to strings).
 *
 * @param SiteConfig $siteConfig
 * @param ?string $tagName Tag name; when empty, the name is taken from $token
 * @param ?Token $token
 * @param array $attrs
 * @return array
 */
public static function sanitizeTagAttrs(
	SiteConfig $siteConfig, ?string $tagName, ?Token $token, array $attrs
): array {
	$tag = $tagName ?: $token->getName();
	$allowed = self::attributesAllowedInternal( $tag );

	// Attributes whose value is a list of id references.
	$idRefListAttrs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttrs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttrs = [ 'href', 'src', 'poster' ];

	$sanitized = [];
	$total = count( $attrs );
	for ( $idx = 0; $idx < $total; $idx++ ) {
		$kv = $attrs[$idx];
		$kv->v ??= '';

		// Convert attributes to string, if necessary.
		$kv->k = TokenUtils::tokensToString( $kv->k );

		if ( is_array( $kv->v ) ) {
			// Use the expanded attr instead of trying to unpackDOMFragments
			// since the fragment will have been released when expanding to DOM
			$expanded = $token ? $token->fetchExpandedAttrValue( $kv->k ) : null;
			if ( $expanded !== null ) {
				// See the comment in TokenUtils::tokensToString about
				// unpackDOMFragments for why we're just using the textContent
				$doc = DOMUtils::parseHTML( $expanded );
				$kv->v = DOMCompat::getBody( $doc )->textContent;
			} else {
				$kv->v = TokenUtils::tokensToString( $kv->v );
			}
		}

		// $kv->k can be uppercase
		$k = mb_strtolower( $kv->k );
		$v = $kv->v;
		$origK = $kv->ksrc ?? $kv->k;
		$origV = $kv->vsrc ?? $v;
		// Entry recorded whenever the attribute is dropped.
		$rejected = [ null, $origV, $origK ];
		$isParsoid = self::isParsoidAttr( $k, $v, $attrs );

		// Bypass RDFa/allowed attribute checks for Parsoid-inserted attrs
		// Safe to do since the tokenizer renames about/typeof attrs.
		// unconditionally. FIXME: The escaping solution in the tokenizer
		// may be aggressive. There is no need to escape typeof strings
		// that or about ids that don't resemble Parsoid tokens/about ids.
		if ( !$isParsoid ) {
			if ( !preg_match( self::GET_ATTRIBS_RE, $k ) ) {
				$sanitized[$k] = $rejected;
				continue;
			}

			# Allow XML namespace declaration to allow RDFa
			if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $k ) ) {
				$sanitized[$k] = preg_match( self::EVIL_URI_PATTERN, $v )
					? $rejected
					: [ $v, $origV, $origK ];
				continue;
			}

			# Allow any attribute beginning with "data-"
			# However:
			# * Disallow data attributes used by MediaWiki code
			# * Ensure that the attribute is not namespaced by banning
			#   colons.
			$isAcceptable = preg_match( '/^data-[^:]*$/iD', $k ) || isset( $allowed[$k] );
			if ( !$isAcceptable || self::isReservedDataAttribute( $k ) ) {
				$sanitized[$k] = $rejected;
				continue;
			}
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $k === 'style' ) {
			$v = self::checkCss( $v );
		}

		# Escape HTML id attributes
		if ( $k === 'id' ) {
			$v = self::escapeIdForAttribute( $v, self::ID_PRIMARY );
			if ( $v === '' ) {
				$sanitized[$k] = $rejected;
				continue;
			}
		}

		# Escape HTML id reference lists
		if ( in_array( $k, $idRefListAttrs, true ) ) {
			$v = self::escapeIdReferenceList( $v );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $k, $uriLikeAttrs, true ) && preg_match( self::EVIL_URI_PATTERN, $v ) ) {
			// Retain the Parsoid typeofs for Parsoid attrs
			$kept = $isParsoid
				? trim( preg_replace( '/(?:^|\s)(?!mw:\w)\S*/', '', $origV ) )
				: null;
			$sanitized[$k] = [ $kept, $origV, $origK ];
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( $token && in_array( $k, $urlAttrs, true ) ) { // T163583
			// `origV` will always be `v`, because `a.vsrc` isn't set, since
			// this attribute didn't come from source. However, in the
			// LinkHandler, we may have already shadowed this value so use
			// that instead.
			$rel = $token->getAttributeShadowInfo( 'rel' );
			$isWikiLink = $k === 'href'
				&& isset( $rel['value'] )
				&& preg_match( '#^mw:WikiLink(/Interwiki)?$#', $rel['value'] );
			$mode = $isWikiLink ? 'wikilink' : 'external';
			$origHref = $token->getAttributeShadowInfo( $k )['value'];
			$cleaned = self::cleanUrl( $siteConfig, $v, $mode );
			if ( $cleaned !== $v ) {
				$sanitized[$k] = [ $cleaned, $origHref, $origK ];
				continue;
			}
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		// Any other value is skipped without recording an entry.
		if ( $k === 'tabindex' && $v !== '0' ) {
			continue;
		}

		// SSS FIXME: This logic is not RT-friendly.
		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$sanitized[$k] = [ $v, $origV, $origK ];
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $sanitized ) ) {
		// SSS FIXME: This logic is not RT-friendly.
		unset( $sanitized['itemtype'], $sanitized['itemid'], $sanitized['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $sanitized;
}
1056 | |
/**
 * Run attributes through sanitizeTagAttrs() and set the surviving ones
 * on a wrapper element.
 *
 * Used primarily when we're applying tokenized attributes directly to
 * dom elements, which wouldn't have had a chance to be sanitized before
 * tree building.
 *
 * @param SiteConfig $siteConfig
 * @param Element $wrapper Element that receives the attributes
 * @param array $attrs Attributes to sanitize and apply
 */
public static function applySanitizedArgs(
	SiteConfig $siteConfig, Element $wrapper, array $attrs
): void {
	$sanitized = self::sanitizeTagAttrs(
		$siteConfig, DOMCompat::nodeName( $wrapper ), null, $attrs
	);
	foreach ( $sanitized as $name => $entry ) {
		// Slot 0 holds the sanitized value; a missing or null slot means
		// the attribute was rejected.
		$value = $entry[0] ?? null;
		if ( $value === null ) {
			continue;
		}
		$wrapper->setAttribute( $name, $value );
	}
}
1078 | |
/**
 * Normalize a CSS string and replace it with a placeholder comment when it
 * contains control characters or matches self::INSECURE_RE.
 *
 * @param string $text CSS text to check
 * @return string The normalized CSS, or a CSS comment naming the rejection reason
 */
public static function checkCss( string $text ): string {
	$css = self::normalizeCss( $text );

	// \000-\010\013\016-\037\177 are the octal escape sequences
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $css )
		|| strpos( $css, self::UTF8_REPLACEMENT ) !== false;
	if ( $hasControlChar ) {
		return '/* invalid control char */';
	}

	if ( preg_match( self::INSECURE_RE, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
1096 | |
/**
 * Callback that rewrites a single CSS backslash escape.
 *
 * Reads capture groups 1 to 3 of the match: group 1 is a line
 * continuation, group 2 a hexadecimal escape and group 3 an escaped
 * literal character. When all three are empty the escape is treated as a
 * lone backslash.
 *
 * @param array $matches
 * @return string
 */
public static function cssDecodeCallback( array $matches ): string {
	// Line continuation: drop it entirely.
	if ( $matches[1] !== '' ) {
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = self::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	switch ( $decoded ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $decoded ) ) . ' ';
		default:
			// Decode unnecessary escape
			return $decoded;
	}
}
1123 | |
/**
 * Percent-encode a title for use in a URI and escape its fragment, if any.
 *
 * The title is split at the first '#'. In the part before it, each of the
 * characters % ? space [ ] # | < > is passed through
 * PHPUtils::encodeURIComponent(). The part after it is escaped as an id.
 *
 * @param string $title
 * @param bool $isInterwiki Whether to escape the fragment with
 *   escapeIdForExternalInterwiki() instead of escapeIdForLink()
 * @return string
 */
public static function sanitizeTitleURI( string $title, bool $isInterwiki = false ): string {
	$fragment = null;
	$hashPos = strpos( $title, '#' );
	if ( $hashPos !== false ) { // split at first '#'
		$fragment = substr( $title, $hashPos + 1 );
		$title = substr( $title, 0, $hashPos );
	}

	$title = preg_replace_callback(
		'/[%? \[\]#|<>]/',
		static fn ( $matches ) => PHPUtils::encodeURIComponent( $matches[0] ),
		$title
	);

	if ( $fragment === null ) {
		return $title;
	}

	$escapedFragment = $isInterwiki
		? self::escapeIdForExternalInterwiki( $fragment )
		: self::escapeIdForLink( $fragment );
	return $title . '#' . $escapedFragment;
}
1148 | |
/**
 * Pattern => replacement template pairs used by armorFrenchSpaces().
 * Each "%s" in a template is filled in with the armoring space.
 */
public const FIXTAGS = [
	# French spaces, last one Guillemet-left
	# only if it isn't followed by a word character.
	'/ (?=[?:;!%»›](?!\w))/u' => "%s",
	# French spaces, Guillemet-right
	'/([«‹]) /u' => "\\1%s",
];

/**
 * Armor French spaces with a replacement character
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to ' '
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = ' ' ): string {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when it ends up inside a preg_replace() replacement string.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	$patterns = [];
	$replacements = [];
	foreach ( self::FIXTAGS as $pattern => $template ) {
		$patterns[] = $pattern;
		// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
		$replacements[] = sprintf( $template, $space );
	}

	return preg_replace( $patterns, $replacements, $text );
}
1177 | |
/**
 * Escape a section name, or any other user-generated or otherwise unsafe
 * string, so that it can be used as an HTML id attribute.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * In Parsoid, proper escaping is usually handled for us by the HTML
 * serialization algorithm, but be careful of corner cases (such as
 * emitting attributes in wikitext).
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *   should be used.
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ): string {
	// Callers only choose between "primary" and "fallback"; the mapping to
	// a concrete encoding lives here. This (slightly) abstracts the actual
	// details of the id encoding from the Parsoid code which handles ids;
	// we could swap primary and fallback here, or even transition to a new
	// HTML6 encoding (!), without touching all the call sites.
	if ( $mode === self::ID_FALLBACK ) {
		return self::escapeIdInternal( $id, 'legacy' );
	}
	return self::escapeIdInternal( $id, 'html5' );
}
1205 | |
/**
 * Escape a section name, or any other user-generated or otherwise unsafe
 * string, so that it can be used as a URL fragment.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	// Link fragments always use the 'html5' mode.
	$fragmentMode = 'html5';
	return self::escapeIdInternalUrl( $id, $fragmentMode );
}
1221 | |
/**
 * Escape a section name, or any other user-generated or otherwise unsafe
 * string, so that it can be used as a URL fragment for external interwikis.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
private static function escapeIdForExternalInterwiki( string $id ): string {
	// Assume $wgExternalInterwikiFragmentMode = 'legacy'
	$fragmentMode = 'legacy';
	return self::escapeIdInternalUrl( $id, $fragmentMode );
}
1235 | |
/**
 * Escape an id for use in an href: same as escapeIdInternal(), except that
 * in 'html5' mode the percent sign of anything that looks like a percent
 * escape ("%" followed by two hex digits) is itself percent-encoded.
 * This is not done for id attributes.
 *
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );
	return $mode === 'html5'
		? preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped )
		: $escaped;
}
1251 | |
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * - 'html5': tab, LF, FF, CR and space are each replaced by an underscore.
 * - 'legacy': spaces become underscores, the result is urlencode()d, then
 *   "%3A" is turned back into ":" and every remaining "%" becomes ".".
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode ('html5' or 'legacy')
 * @return string
 * @throws InvalidArgumentException When $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );

		case 'legacy':
			// This corresponds to 'noninitial' mode of the old escapeId
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			return strtr( urlencode( str_replace( ' ', '_', $id ) ), $replace );

		default:
			// Close the quote opened before the method name so the
			// message reads "... passed to 'Class::method'".
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}
}
1286 | |
/**
 * Escape every id in a whitespace-delimited list of ids, so that each one
 * matches what escapeIdForAttribute() produces.
 *
 * @since 1.27
 *
 * @param string $referenceString Space delimited list of ids
 * @return string The escaped ids joined by single spaces; '' when the list is empty
 */
public static function escapeIdReferenceList( string $referenceString ): string {
	# Split on runs of whitespace, dropping empty tokens
	$ids = preg_split( '/\s+/', $referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id (default mode)
	$escapedIds = array_map(
		static fn ( $ref ) => self::escapeIdForAttribute( $ref ),
		$ids
	);

	# Join back into a space delimited list
	return implode( ' ', $escapedIds );
}
1311 | |
/**
 * Normalize whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the ids that are used for
 * section links.
 *
 * Every run of spaces and/or underscores collapses to a single space, and
 * the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhiteSpace( string $section ): string {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $collapsed );
}
1322 | } |
1323 | } |