Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
5.48% |
24 / 438 |
|
3.03% |
1 / 33 |
CRAP | |
0.00% |
0 / 1 |
Sanitizer | |
5.48% |
24 / 438 |
|
3.03% |
1 / 33 |
18896.84 | |
0.00% |
0 / 1 |
attributesAllowedInternal | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
setupAttributesAllowedInternal | |
0.00% |
0 / 127 |
|
0.00% |
0 / 1 |
6 | |||
normalizeCharReferences | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
normalizeCharReferencesCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
normalizeEntity | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
decCharReference | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
hexCharReference | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
validateCodepoint | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
110 | |||
codepointToUtf8 | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
utf8ToCodepoint | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
stripIDNs | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cleanUrl | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
decodeEntity | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
decodeChar | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
decodeCharReferences | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
normalizeCss | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
20 | |||
delimiterReplaceCallback | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
182 | |||
delimiterReplace | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
isParsoidAttr | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
42 | |||
isReservedDataAttribute | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
sanitizeTagAttrs | |
0.00% |
0 / 71 |
|
0.00% |
0 / 1 |
1980 | |||
applySanitizedArgs | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
checkCss | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
4.05 | |||
cssDecodeCallback | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
72 | |||
sanitizeTitleURI | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
armorFrenchSpaces | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
escapeIdForAttribute | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
escapeIdForLink | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeIdForExternalInterwiki | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeIdInternalUrl | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
escapeIdInternal | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
escapeIdReferenceList | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
normalizeSectionNameWhiteSpace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | /** |
5 | * General token sanitizer. Strips out (or encapsulates) unsafe and disallowed |
6 | * tag types and attributes. Should run last in the third, synchronous |
7 | * expansion stage. |
8 | * |
9 | * FIXME: This code was originally ported from PHP to JS in 2012 |
10 | * and periodically updated before being ported back to PHP. This code should be |
11 | * (a) resynced with core sanitizer changes (b) updated to use HTML5 spec |
12 | */ |
13 | |
14 | namespace Wikimedia\Parsoid\Core; |
15 | |
16 | use InvalidArgumentException; |
17 | use Wikimedia\Assert\Assert; |
18 | use Wikimedia\Parsoid\Config\SiteConfig; |
19 | use Wikimedia\Parsoid\DOM\Element; |
20 | use Wikimedia\Parsoid\Tokens\KV; |
21 | use Wikimedia\Parsoid\Tokens\Token; |
22 | use Wikimedia\Parsoid\Utils\DOMCompat; |
23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
26 | use Wikimedia\RemexHtml\HTMLData; |
27 | |
28 | class Sanitizer { |
/**
 * RDFa and microdata properties allow URLs, URIs and/or CURIs.
 *
 * Lookup set: each key is an attribute name and every value is `true`,
 * so membership can be tested with isset().
 */
private const MICRODATA = [
	'rel' => true,
	'rev' => true,
	'about' => true,
	'property' => true,
	'resource' => true,
	'datatype' => true,
	'typeof' => true, // RDFa
	'itemid' => true,
	'itemprop' => true,
	'itemref' => true,
	'itemscope' => true,
	'itemtype' => true,
];
46 | |
/** U+FFFD REPLACEMENT CHARACTER, returned in place of invalid character references. */
private const UTF8_REPLACEMENT = "\u{FFFD}";

/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * Written in extended (x) mode. Capture groups:
 *  1. named entity, including its terminating semicolon
 *  2. digits of a decimal numeric reference
 *  3. digits of a hexadecimal numeric reference
 * The final alternative matches a bare ampersand and sets no group.
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|&/x';
58 | |
/**
 * Case-insensitive, extended-mode pattern listing CSS constructs:
 * "expression", the "accelerator" and "-o-link" family of properties
 * (followed by a colon), the url(), image() and image-set() functions,
 * and attr() calls that mention "url".
 *
 * NOTE(review): the consumer of this constant is outside this part of the
 * file; presumably it is the CSS check — confirm.
 */
private const INSECURE_RE = '! expression
	| accelerator\s*:
	| -o-link\s*:
	| -o-link-source\s*:
	| -o-replace\s*:
	| url\s*\(
	| image\s*\(
	| image-set\s*\(
	| attr\s*\([^)]+[\s,]+url
!ix';
69 | |
/**
 * Pattern matching evil uris like javascript:
 * WARNING: DO NOT use this in any place that actually requires denying
 * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
 * pattern-based deny lists; the only way to be secure from javascript:
 * uri based xss vectors is to allow only things that you know are safe
 * and deny everything else.
 * [1]: http://ha.ckers.org/xss.html
 *
 * Matches "javascript" or "vbscript" (case-insensitively) at the start of
 * the string, after whitespace, or after a CSS comment terminator, when
 * followed by a non-word character or the end of the string.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)(\W|$)!iD';

/**
 * Matches a whole attribute name of the form "xmlns:" followed by one or
 * more characters drawn from colon, ASCII letters, underscore, hyphen,
 * dot and digits.
 */
private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/D";
81 | |
/**
 * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
 *
 * NOTE(review): no escapeUrlForHtml() method is listed for this class; the
 * escapeId*() methods are presumably what is meant — confirm.
 *
 * @since 1.30
 */
public const ID_PRIMARY = 0;

/**
 * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false
 * if no fallback is configured.
 *
 * NOTE(review): same naming caveat as for ID_PRIMARY — confirm which
 * method consumes this mode.
 *
 * @since 1.30
 */
public const ID_FALLBACK = 1; // public because it is accessed in Headings handler
96 | |
/** Characters that will be ignored in IDNs.
 * https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
 * https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 * Strip them before further processing so deny lists and such work.
 * Part of Sanitizer::cleanUrl in core.
 *
 * Extended (x) mode with UTF-8 (u) matching; the trailing "#" text on each
 * line is a regex comment. The last range is followed by a "|", so the
 * pattern also contains an empty final alternative; that is harmless when
 * every match is replaced with the empty string, as stripIDNs() does.
 */
private const IDN_RE_G = "/
	\\s| # general whitespace
	\u{00AD}| # SOFT HYPHEN
	\u{034F}| # COMBINING GRAPHEME JOINER
	\u{061C}| # ARABIC LETTER MARK
	[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
	# HANGUL JUNGSEONG FILLER
	[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
	# KHMER VOWEL INHERENT AA
	[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
	# MONGOLIAN FREE VARIATION SELECTOR THREE
	\u{180E}| # MONGOLIAN VOWEL SEPARATOR
	[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
	# RIGHT-TO-LEFT MARK
	[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
	# RIGHT-TO-LEFT OVERRIDE
	[\u{2060}-\u{2064}]| # WORD JOINER..
	# INVISIBLE PLUS
	\u{2065}| # <reserved-2065>
	[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
	# NOMINAL DIGIT SHAPES
	\u{3164}| # HANGUL FILLER
	[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
	# VARIATION SELECTOR-16
	\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
	\u{FFA0}| # HALFWIDTH HANGUL FILLER
	[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
	# <reserved-FFF8>
	[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
	# SHORTHAND FORMAT UP STEP
	[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
	# MUSICAL SYMBOL END PHRASE
	\u{E0000}| # <reserved-E0000>
	\u{E0001}| # LANGUAGE TAG
	[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
	# <reserved-E001F>
	[\u{E0020}-\u{E007F}]| # TAG SPACE..
	# CANCEL TAG
	[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
	# <reserved-E00FF>
	[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
	# VARIATION SELECTOR-256
	[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
	# <reserved-E0FFF>
	/xuD";
148 | |
/**
 * Matches a whole name whose first character is a colon, underscore,
 * Unicode letter or Unicode number, followed by any number of those
 * characters plus dot and hyphen.
 */
private const GET_ATTRIBS_RE = '/^[:_\p{L}\p{N}][:_\.\-\p{L}\p{N}]*$/uD';
150 | |
/**
 * Character entity aliases accepted by MediaWiki in wikitext.
 * These are not part of the HTML standard.
 *
 * Keys and values are both semicolon-terminated entity names without the
 * leading ampersand; each alias maps to the standard name it stands for.
 */
private const MW_ENTITY_ALIASES = [
	'רלמ;' => 'rlm;',
	'رلم;' => 'rlm;',
];
159 | |
/**
 * Look up the attributes permitted on a single HTML element.
 *
 * @param string $element Element name used as the lookup key
 * @return array<string,int> Allowed attribute names as keys (the allowed
 *   list, flipped); empty when the element has no entry
 */
public static function attributesAllowedInternal( string $element ): array {
	// PORT-FIXME: this method is private in core, but used by Gallery
	$allowedByElement = self::setupAttributesAllowedInternal();
	if ( !isset( $allowedByElement[$element] ) ) {
		return [];
	}
	return array_flip( $allowedByElement[$element] );
}
172 | |
/**
 * Build (once) and return the table of allowed attributes per element.
 *
 * The table lives in a function-static variable: it is assembled on the
 * first call and the same array is returned on every later call.
 *
 * @return array<string,string[]> Element name => list of attribute names
 */
private static function setupAttributesAllowedInternal(): array {
	static $table;
	if ( $table !== null ) {
		return $table;
	}

	$common = [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	];

	// Shorthand: the common attributes followed by element-specific extras.
	$plus = static function ( array $extra ) use ( $common ): array {
		return array_merge( $common, $extra );
	};

	// Attribute lists shared by several elements.
	$block = $plus( [ 'align' ] );
	$quoted = $plus( [ 'cite' ] );
	$edited = $plus( [ 'cite', 'datetime' ] );
	$spanned = $plus( [ 'span' ] );
	$cell = $plus( [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
		'align',
		'valign',
	] );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$table = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $quoted,
		'q' => $quoted,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $plus( [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $plus( [ 'width' ] ),

		# 9.4
		'ins' => $edited,
		'del' => $edited,

		# 10.2
		'ul' => $plus( [ 'type' ] ),
		'ol' => $plus( [ 'type', 'start', 'reversed' ] ),
		'li' => $plus( [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $plus( [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $spanned,
		'col' => $spanned,

		# 11.2.5
		'tr' => $plus( [ 'bgcolor', 'align', 'valign' ] ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $plus( [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $plus( [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $plus( [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $plus( [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $plus( [ 'type', 'src' ] ),
		'track' => $plus( [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $plus( [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $plus( [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # array_merge( $common, array( 'rbspan' ) ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => [ 'class', 'style', 'id', 'title' ],

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $plus( [ 'value' ] ),
		'time' => $plus( [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => [ 'itemprop', 'content' ],
		'link' => [ 'itemprop', 'href', 'title' ],

		// HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $table;
}
406 | |
/**
 * Rewrite every entity and character reference in the text so that it is
 * legal for XML and XHTML specifically. Stray ampersands and invalid
 * references are escaped so the result is a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @internal
 */
public static function normalizeCharReferences( string $text ): string {
	$handler = [ self::class, 'normalizeCharReferencesCallback' ];
	// Only present because the flags argument follows it positionally.
	$replaced = 0;
	return preg_replace_callback(
		self::CHAR_REFS_REGEX, $handler, $text, -1, $replaced, PREG_UNMATCHED_AS_NULL
	);
}
429 | |
/**
 * Replacement callback for normalizeCharReferences().
 *
 * Dispatches on whichever capture group of CHAR_REFS_REGEX is set: named
 * entity (1), decimal reference (2) or hexadecimal reference (3). When no
 * group is set, or the numeric reference is rejected, the matched text is
 * returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( isset( $matches[2] ) ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( isset( $matches[3] ) ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}
	return $normalized ?? htmlspecialchars( $matches[0] );
}
449 | |
/**
 * If the named entity is defined in HTML5
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	} elseif ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	} elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		// Beware: some entities expand to more than 1 codepoint
		return preg_replace_callback( '/./Ssu', function ( $m ) {
			return '&#' . self::utf8ToCodepoint( $m[0] ) . ';';
		}, HTMLData::$namedEntityTranslations[$name] );
	} else {
		// Not a known entity: escape the ampersand so the text is emitted
		// literally instead of as live entity syntax.
		return "&amp;$name";
	}
}
476 | |
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return null|string "&#N;" for a valid codepoint, null otherwise
 */
private static function decCharReference( string $codepoint ): ?string {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$value = intval( $codepoint );
	return self::validateCodepoint( $value ) ? "&#$value;" : null;
}
491 | |
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return null|string Lower-cased "&#x…;" form for a valid codepoint,
 *   null otherwise
 */
private static function hexCharReference( string $codepoint ): ?string {
	$value = hexdec( $codepoint );
	// hexdec() might return a float if the string is too long
	if ( !is_int( $value ) || !self::validateCodepoint( $value ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $value );
}
505 | |
/**
 * Tell whether a Unicode codepoint is a valid character in both HTML5
 * and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
		return true;
	}
	// Inclusive [low, high] ranges of accepted codepoints.
	$accepted = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $accepted as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}
	return false;
}
523 | |
/**
 * Encode a code point as a UTF-8 string.
 *
 * @param int $cp Code point to encode
 * @return string The character, as produced by mb_chr() in UTF-8
 */
private static function codepointToUtf8( int $cp ): string {
	$encoded = mb_chr( $cp, 'UTF-8' );
	// mb_chr() signals failure with false; treat that as a broken invariant.
	Assert::invariant( $encoded !== false, "Getting char failed!" );
	return $encoded;
}
535 | |
/**
 * Returns the code point at the first position of the string.
 *
 * The string is always interpreted as UTF-8, mirroring codepointToUtf8(),
 * rather than relying on the process-wide mb_internal_encoding() setting.
 *
 * @param string $str
 * @return int
 */
private static function utf8ToCodepoint( string $str ): int {
	$ord = mb_ord( $str, 'UTF-8' );
	Assert::invariant( $ord !== false, "Getting code point failed!" );
	return $ord;
}
547 | |
/**
 * Remove every character matched by IDN_RE_G from a host name.
 *
 * @param string $host
 * @return string
 */
private static function stripIDNs( string $host ): string {
	// This code is part of Sanitizer::cleanUrl in core
	$stripped = preg_replace( self::IDN_RE_G, '', $host );
	return $stripped;
}
556 | |
/**
 * Clean up a URL: percent-encode unsafe characters (except in 'wikilink'
 * mode), reject protocols the site does not accept, and strip ignorable
 * characters from the host part.
 *
 * @param SiteConfig $siteConfig Consulted via hasValidProtocol()
 * @param string $href URL to clean
 * @param string $mode 'wikilink' skips the percent-encoding step
 * @return string|null Cleaned URL, or null when the protocol is rejected
 */
public static function cleanUrl( SiteConfig $siteConfig, string $href, string $mode ): ?string {
	if ( $mode !== 'wikilink' ) {
		$encodeMatch = static function ( $matches ) {
			return urlencode( $matches[0] );
		};
		$href = preg_replace_callback( '/([\][<>"\x00-\x20\x7F\|])/', $encodeMatch, $href );
	}

	// Split into protocol (with optional "//"), host and the rest.
	$parts = [];
	$split = preg_match( '#^((?:[a-zA-Z][^:/]*:)?(?://)?)([^/]+)(/?.*)#', $href, $parts );
	if ( $split !== 1 ) {
		// Nothing recognisable as a host: hand the whole string back.
		return '' . '' . $href;
	}

	$proto = $parts[1];
	if ( $proto && !$siteConfig->hasValidProtocol( $proto ) ) {
		// invalid proto, disallow URL
		return null;
	}

	$host = self::stripIDNs( $parts[2] );
	$ipv6 = [];
	if ( preg_match( '/^%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$/D', $host, $ipv6 ) ) {
		// IPv6 host names
		$host = '[' . $ipv6[1] . ']' . $ipv6[2];
	}

	return $proto . $host . $parts[3];
}
593 | |
/**
 * Decode a named entity to the UTF-8 text HTML5 defines for it.
 * MediaWiki-specific aliases are resolved to their standard name first.
 * Unknown names are returned as pseudo-entity source (eg "&foo;").
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// These are MediaWiki-specific entities, not in the HTML standard
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;
	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
610 | |
/**
 * Turn a codepoint into its UTF-8 string when it is a valid character
 * reference; otherwise yield U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? self::codepointToUtf8( $codepoint )
		: self::UTF8_REPLACEMENT;
}
624 | |
/**
 * Decode every character reference in the text — named entities as well
 * as decimal and hexadecimal numeric references — and return a UTF-8
 * string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
	$decodeOne = function ( $matches ) {
		if ( isset( $matches[1] ) ) {
			return self::decodeEntity( $matches[1] );
		}
		if ( isset( $matches[2] ) ) {
			return self::decodeChar( intval( $matches[2] ) );
		}
		if ( !isset( $matches[3] ) ) {
			# Last case should be an ampersand by itself
			return $matches[0];
		}
		$point = hexdec( $matches[3] );
		// hexdec() might return a float if the string is too long;
		// that is an invalid character reference.
		return is_int( $point ) ? self::decodeChar( $point ) : self::UTF8_REPLACEMENT;
	};
	// Only present because the flags argument follows it positionally.
	$replaced = 0;
	return preg_replace_callback(
		self::CHAR_REFS_REGEX, $decodeOne, $text, -1, $replaced, PREG_UNMATCHED_AS_NULL
	);
}
654 | |
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - convert characters that IE6 interprets into ascii
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss( string $value ): string {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$escapeDecoder = [ self::class, 'cssDecodeCallback' ];
	$value = preg_replace_callback( $decodeRegex, $escapeDecoder, $value );

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !xD', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = self::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingStart = strpos( $value, '/*' );
	return $danglingStart === false ? $value : substr( $value, 0, $danglingStart );
}
715 | |
716 | // PORT_FIXME - The delimiterReplace code below is from StringUtils in core |
717 | |
/**
 * Perform an operation equivalent to `preg_replace_callback()`
 *
 * Matches this code:
 *
 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 *
 * If the start delimiter ends with an initial substring of the end delimiter,
 * e.g. in the case of C-style comments, the behavior differs from the model
 * regex. In this implementation, the end must share no characters with the
 * start, so e.g. `/*\/` is not considered to be both the start and end of a
 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 *
 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 * but uses far less memory. The delimiters are literal strings, not regular expressions.
 *
 * The subject is scanned left to right for the next start or end delimiter.
 * Text outside a start/end pair is copied to the output unchanged; each
 * pair is replaced by the return value of $callback, which receives a
 * two-element array: [0] the whole match including both delimiters and
 * [1] the text between them. A start delimiter with no matching end is
 * copied through unchanged together with the text that follows it.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Function to call on each match
 * @param string $subject
 * @param string $flags Regular expression flags
 * @throws InvalidArgumentException
 * @return string
 */
private static function delimiterReplaceCallback(
	string $startDelim, string $endDelim, callable $callback, string $subject, string $flags = ''
): string {
	// $inputPos: where the next delimiter search starts.
	// $outputPos: start of the text not yet copied to $output.
	// $contentPos: first byte after the currently open start delimiter.
	$inputPos = 0;
	$outputPos = 0;
	$contentPos = 0;
	$output = '';
	$foundStart = false;
	// The delimiters are literals, so quote them for use inside the "!"-delimited regex.
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	// Compare case-insensitively when the "i" flag was requested.
	$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$endLength = strlen( $endDelim );
	$m = [];
	while ( $inputPos < strlen( $subject ) &&
		preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		$tokenOffset = $m[0][1];
		if ( $m[1][0] !== '' ) {
			if ( $foundStart &&
				$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
			) {
				# An end match is present at the same location
				$tokenType = 'end';
				$tokenLength = $endLength;
			} else {
				$tokenType = 'start';
				$tokenLength = strlen( $m[0][0] );
			}
		} elseif ( $m[2][0] !== '' ) {
			$tokenType = 'end';
			$tokenLength = strlen( $m[0][0] );
		} else {
			// Neither group captured any text, i.e. a delimiter is the empty string.
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
		if ( $tokenType === 'start' ) {
			# Only move the start position if we haven't already found a start
			# This means that START START END matches outer pair
			if ( !$foundStart ) {
				# Found start
				$inputPos = $tokenOffset + $tokenLength;
				# Write out the non-matching section
				$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
				$outputPos = $tokenOffset;
				$contentPos = $inputPos;
				$foundStart = true;
			} else {
				# Move the input position past the *first character* of START,
				# to protect against missing END when it overlaps with START
				$inputPos = $tokenOffset + 1;
			}
		} elseif ( $tokenType === 'end' ) {
			if ( $foundStart ) {
				# Found match
				$output .= $callback( [
					substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
					substr( $subject, $contentPos, $tokenOffset - $contentPos )
				] );
				$foundStart = false;
			} else {
				# Non-matching end, write it out
				// While no start is open, $inputPos and $outputPos hold the same value,
				// so this copies everything from the last output position through the
				// end delimiter.
				$output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
			}
			$inputPos = $outputPos = $tokenOffset + $tokenLength;
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
	}
	// Copy whatever follows the last processed token (including an unterminated start).
	if ( $outputPos < strlen( $subject ) ) {
		$output .= substr( $subject, $outputPos );
	}
	return $output;
}
814 | |
/**
 * Replace every start/end delimited section of the subject with a fixed
 * replacement string, in the manner of `preg_replace()` with flags.
 *
 * Matches this code:
 *
 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 *
 * The work is delegated to delimiterReplaceCallback(), which quotes both
 * delimiters before building its pattern.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement string. May contain $0 (the whole
 *   match including delimiters) and $1 (the text between the delimiters)
 * @param string $subject String to search
 * @param string $flags Regular expression flags
 * @return string The string with the matches replaced
 */
private static function delimiterReplace(
	string $startDelim, string $endDelim, string $replace, string $subject, string $flags = ''
): string {
	$substitute = static function ( array $matches ) use ( $replace ) {
		$placeholders = [ '$0' => $matches[0], '$1' => $matches[1] ];
		return strtr( $replace, $placeholders );
	};
	return self::delimiterReplaceCallback( $startDelim, $endDelim, $substitute, $subject, $flags );
}
841 | |
/**
 * Decide whether an attribute looks like one that Parsoid itself inserted.
 *
 * SSS FIXME: There is a test in mediawiki.environment.js that doles out
 * and tests about ids. There are probably some tests in Util.php as well.
 * We should move all these kind of tests somewhere else.
 *
 * @param string $k Attribute name
 * @param string $v Attribute value
 * @param KV[] $attrs All attributes of the tag; only consulted for the
 *   'property' value when $k is 'content'
 * @return bool
 */
private static function isParsoidAttr( string $k, string $v, array $attrs ): bool {
	// NOTES (carried over from the original implementation):
	// 1. Currently the tokenizer unconditionally escapes typeof and about
	//    attributes from wikitext to data-x-typeof and data-x-about. So,
	//    this check will only pass through Parsoid inserted attrs.
	// 2. If that over-aggressive escaping is ever relaxed, this will return
	//    true for something like typeof='mw:Foo evilScriptHere'. That is
	//    still safe, since this check only decides between discarding the
	//    whole attribute unconditionally and processing it further; the
	//    further processing catches dangerous strings in the rest of the
	//    value.

	// A whitespace-delimited "mw:..." item anywhere in the value.
	$mwItemRe = '/(?:^|\s)mw:.+?(?=$|\s)/D';

	switch ( $k ) {
		case 'typeof':
		case 'property':
		case 'rel':
			return (bool)preg_match( $mwItemRe, $v );

		case 'about':
			return (bool)preg_match( '/^#mwt\d+$/D', $v );

		case 'content':
			// 'content' qualifies when the sibling 'property' attribute does.
			return (bool)preg_match( $mwItemRe, KV::lookup( $attrs, 'property' ) ?? '' );

		default:
			return false;
	}
}
869 | |
/**
 * Given an attribute name, checks whether it is a reserved data attribute
 * which is unavailable to user-generated HTML.
 *
 * Background: data-ooui is reserved for ooui, data-mw and data-parsoid are
 * reserved for parsoid, and data-mw-<name here> is reserved for extensions
 * (or core) that need to pass data to the client and be sure it isn't coming
 * from an untrusted user. The possibility of namespaces is ignored since
 * user-generated HTML can't use them anymore.
 *
 * PARSOID SPECIFIC: names starting with lowercase "data-mw" or
 * "data-parsoid" are deliberately reported as NOT reserved by this
 * implementation. Anything else starting (case-insensitively) with
 * "data-ooui", "data-mw" or "data-parsoid" is reported as reserved.
 *
 * @param string $attr Attribute name.
 * @return bool
 */
public static function isReservedDataAttribute( string $attr ): bool {
	return !preg_match( '/^data-(mw|parsoid)/', $attr ) // PARSOID SPECIFIC
		&& preg_match( '/^data-(ooui|mw|parsoid)/i', $attr ) === 1;
}
890 | |
/**
 * Sanitize the attributes of a tag.
 *
 * Every input attribute is normalized to a string key/value, its key is
 * lowercased, and it is then checked against the allowed-attribute rules.
 * The input attribute objects are modified in place (`k` is stringified,
 * `v` is defaulted to '' and flattened to a string).
 *
 * @param SiteConfig $siteConfig Passed through to cleanUrl()
 * @param ?string $tagName Tag name; when empty, the name is taken from $token
 * @param ?Token $token Token the attributes belong to, if any. At least one
 *   of $tagName and $token must be provided.
 * @param array $attrs List (0-indexed) of attribute objects exposing `k`, `v`
 *   and optionally `ksrc` / `vsrc`
 * @return array Map from lowercased attribute name to a triple
 *   `[ sanitized value or null if rejected, original value, original key ]`.
 *   A later attribute of the same name overrides an earlier one. A `tabindex`
 *   other than '0' gets no entry at all, and `itemtype`, `itemid` and
 *   `itemref` are dropped when there is no `itemscope` entry.
 * @throws InvalidArgumentException If neither a tag name nor a token is given
 */
public static function sanitizeTagAttrs(
	SiteConfig $siteConfig, ?string $tagName, ?Token $token, array $attrs
): array {
	// Both $tagName and $token are nullable; fail with a clear message
	// instead of calling a method on null.
	if ( $tagName ) {
		$tag = $tagName;
	} elseif ( $token ) {
		$tag = $token->getName();
	} else {
		throw new InvalidArgumentException(
			'Either a tag name or a token must be given to ' . __METHOD__
		);
	}

	$list = self::attributesAllowedInternal( $tag );
	$newAttrs = [];
	$n = count( $attrs );
	for ( $i = 0; $i < $n; $i++ ) {
		$a = $attrs[$i];
		$a->v ??= '';

		// Convert attributes to string, if necessary.
		$a->k = TokenUtils::tokensToString( $a->k );

		if ( is_array( $a->v ) ) {
			// Use the expanded attr instead of trying to unpackDOMFragments
			// since the fragment will have been released when expanding to DOM
			$expandedVal = $token ? $token->fetchExpandedAttrValue( $a->k ) : null;
			if ( $expandedVal === null ) {
				$a->v = TokenUtils::tokensToString( $a->v );
			} else {
				// See the comment in TokenUtils::tokensToString about
				// unpackDOMFragments for why we're just using the textContent
				$dom = DOMUtils::parseHTML( $expandedVal );
				$a->v = DOMCompat::getBody( $dom )->textContent;
			}
		}

		$origK = $a->ksrc ?? $a->k;
		// $a->k can be uppercase
		$k = mb_strtolower( $a->k );
		$v = $a->v;
		$origV = $a->vsrc ?? $v;
		$psdAttr = self::isParsoidAttr( $k, $v, $attrs );

		// Bypass RDFa/allowed attribute checks for Parsoid-inserted attrs.
		// Safe to do since the tokenizer renames about/typeof attrs
		// unconditionally. FIXME: The escaping solution in the tokenizer
		// may be aggressive. There is no need to escape typeof strings
		// or about ids that don't resemble Parsoid tokens/about ids.
		if ( !$psdAttr ) {
			if ( !preg_match( self::GET_ATTRIBS_RE, $k ) ) {
				$newAttrs[$k] = [ null, $origV, $origK ];
				continue;
			}

			# Allow XML namespace declaration to allow RDFa
			if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $k ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $v ) ) {
					$newAttrs[$k] = [ $v, $origV, $origK ];
				} else {
					$newAttrs[$k] = [ null, $origV, $origK ];
				}
				continue;
			}

			# Allow any attribute beginning with "data-"
			# However:
			# * Disallow data attributes used by MediaWiki code
			# * Ensure that the attribute is not namespaced by banning
			#   colons.
			if ( ( !preg_match( '/^data-[^:]*$/iD', $k ) && !isset( $list[$k] ) )
				|| self::isReservedDataAttribute( $k )
			) {
				$newAttrs[$k] = [ null, $origV, $origK ];
				continue;
			}
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $k === 'style' ) {
			$v = self::checkCss( $v );
		}

		# Escape HTML id attributes
		if ( $k === 'id' ) {
			$v = self::escapeIdForAttribute( $v, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( $k === 'aria-describedby'
			|| $k === 'aria-flowto'
			|| $k === 'aria-labelledby'
			|| $k === 'aria-owns'
		) {
			$v = self::escapeIdReferenceList( $v );
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for validity.
		if ( $k === 'rel' || $k === 'rev'
			# RDFa
			|| $k === 'about' || $k === 'property'
			|| $k === 'resource' || $k === 'datatype'
			|| $k === 'typeof'
			# HTML5 microdata
			|| $k === 'itemid' || $k === 'itemprop'
			|| $k === 'itemref' || $k === 'itemscope'
			|| $k === 'itemtype'
		) {
			// Paranoia. Allow "simple" values but suppress javascript
			if ( preg_match( self::EVIL_URI_PATTERN, $v ) ) {
				// Retain the Parsoid typeofs for Parsoid attrs
				$newV = $psdAttr ? trim( preg_replace( '/(?:^|\s)(?!mw:\w)\S*/', '', $origV ) ) : null;
				$newAttrs[$k] = [ $newV, $origV, $origK ];
				continue;
			}
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( $token && ( $k === 'href' || $k === 'src' || $k === 'poster' ) ) { // T163583
			// `origV` will always be `v`, because `a.vsrc` isn't set, since
			// this attribute didn't come from source. However, in the
			// LinkHandler, we may have already shadowed this value so use
			// that instead.
			$rel = $token->getAttributeShadowInfo( 'rel' );
			$mode = ( $k === 'href' &&
				isset( $rel['value'] ) &&
				preg_match( '#^mw:WikiLink(/Interwiki)?$#', $rel['value'] )
			) ? 'wikilink' : 'external';
			$origHref = $token->getAttributeShadowInfo( $k )['value'];
			$newHref = self::cleanUrl( $siteConfig, $v, $mode );
			if ( $newHref !== $v ) {
				$newAttrs[$k] = [ $newHref, $origHref, $origK ];
				continue;
			}
		}

		if ( $k === 'tabindex' && $v !== '0' ) {
			// Only allow tabindex of 0, which is useful for accessibility.
			continue;
		}

		// SSS FIXME: This logic is not RT-friendly.
		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$newAttrs[$k] = [ $v, $origV, $origK ];
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $newAttrs ) ) {
		// SSS FIXME: This logic is not RT-friendly.
		unset( $newAttrs['itemtype'] );
		unset( $newAttrs['itemid'] );
		unset( $newAttrs['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $newAttrs;
}
1051 | |
/**
 * Run attributes through sanitizeTagAttrs() and set the surviving ones on
 * a wrapper element.
 *
 * Used primarily when we're applying tokenized attributes directly to
 * dom elements, which wouldn't have had a chance to be sanitized before
 * tree building.
 *
 * @param SiteConfig $siteConfig
 * @param Element $wrapper Element that receives the attributes
 * @param array $attrs Attributes to sanitize and apply
 */
public static function applySanitizedArgs(
	SiteConfig $siteConfig, Element $wrapper, array $attrs
): void {
	$sanitized = self::sanitizeTagAttrs(
		$siteConfig, DOMCompat::nodeName( $wrapper ), null, $attrs
	);

	foreach ( $sanitized as $name => $entry ) {
		// Slot 0 holds the sanitized value; null means "rejected".
		$value = $entry[0] ?? null;
		if ( $value === null ) {
			continue;
		}
		$wrapper->setAttribute( $name, $value );
	}
}
1073 | |
/**
 * Normalize a CSS string and replace it with a placeholder comment if it
 * contains control characters or matches the insecure-input pattern.
 *
 * @param string $text CSS text to check
 * @return string The normalized CSS, or '/* invalid control char *' . '/' /
 *   '/* insecure input *' . '/' style placeholder comments when rejected
 */
public static function checkCss( string $text ): string {
	$css = self::normalizeCss( $text );

	// \000-\010\013\016-\037\177 are the octal escape sequences
	$hasBadChar = preg_match( '/[\000-\010\013\016-\037\177]/', $css )
		|| strpos( $css, self::UTF8_REPLACEMENT ) !== false;
	if ( $hasBadChar ) {
		return '/* invalid control char */';
	}

	if ( preg_match( self::INSECURE_RE, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
1091 | |
/**
 * Regex callback that decodes a single CSS escape sequence.
 *
 * Reads capture groups 1-3 of the match: group 1 marks a line
 * continuation, group 2 holds hexadecimal digits of a code point, and
 * group 3 holds a literally escaped character. If all three are empty the
 * escape is treated as a lone backslash.
 *
 * @param array $matches Match array from the calling regex
 * @return string '' for a line continuation; a normalized hex escape
 *   (backslash, hex code, space) for newline, quotes and backslash;
 *   otherwise the decoded character itself
 */
public static function cssDecodeCallback( array $matches ): string {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = self::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	switch ( $decoded ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $decoded ) ) . ' ';

		default:
			// Decode unnecessary escape
			return $decoded;
	}
}
1118 | |
/**
 * Sanitize a title for use in a URI.
 *
 * The title is split at its first '#'. In the part before it, each of the
 * characters `% ? space [ ] # | < >` is passed through
 * PHPUtils::encodeURIComponent(). The part after it (if any) is escaped as
 * a fragment id and re-attached with '#'.
 *
 * @param string $title
 * @param bool $isInterwiki Escape the fragment with the external-interwiki
 *   rules instead of the regular link rules
 * @return string
 */
public static function sanitizeTitleURI( string $title, bool $isInterwiki = false ): string {
	// split at first '#'
	$pieces = explode( '#', $title, 2 );
	$anchor = $pieces[1] ?? null;

	$encodedTitle = preg_replace_callback(
		'/[%? \[\]#|<>]/',
		static fn ( $matches ) => PHPUtils::encodeURIComponent( $matches[0] ),
		$pieces[0]
	);

	if ( $anchor === null ) {
		return $encodedTitle;
	}

	if ( $isInterwiki ) {
		$fragment = self::escapeIdForExternalInterwiki( $anchor );
	} else {
		$fragment = self::escapeIdForLink( $anchor );
	}
	return $encodedTitle . '#' . $fragment;
}
1143 | |
# Regex pattern => replacement template pairs consumed by
# armorFrenchSpaces(): the keys are handed to preg_replace() as patterns
# and each "%s" in the values is filled in (via sprintf) with the
# replacement space.
public const FIXTAGS = [
	# French spaces: a space in front of one of ? : ; ! % » ›
	# (the lookahead keeps that character out of the match), but
	# only if that character isn't followed by a word character.
	'/ (?=[?:;!%»›](?!\w))/u' => "%s",
	# French spaces: a space right after « or ‹; the captured quote
	# mark is put back through \1.
	'/([«‹]) /u' => "\\1%s",
];
1151 | |
/**
 * Armor French spaces with a replacement character
 *
 * Applies every pattern of self::FIXTAGS to the text, substituting $space
 * into the matching replacement template.
 *
 * NOTE(review): the default for $space reads as a plain ' ' here, which
 * would replace a space with a space; presumably a non-breaking space is
 * intended -- confirm against the file's actual bytes.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to ' '
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = ' ' ): string {
	// Replace $ with \$ and \ with \\
	$escapedSpace = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	$patterns = [];
	$replacements = [];
	foreach ( self::FIXTAGS as $pattern => $template ) {
		$patterns[] = $pattern;
		// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
		$replacements[] = sprintf( $template, $escapedSpace );
	}

	return preg_replace( $patterns, $replacements, $text );
}
1172 | |
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * In Parsoid, proper escaping is usually handled for us by the HTML
 * serialization algorithm, but be careful of corner cases (such as
 * emitting attributes in wikitext).
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *     should be used. ID_FALLBACK selects the 'legacy' encoding; any other value selects
 *     'html5'.
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ): string {
	// Callers only say "primary" or "fallback"; which concrete encoding
	// each one maps to is decided here, so the mapping can be swapped (or a
	// new encoding introduced) without touching the call sites.
	if ( $mode === self::ID_FALLBACK ) {
		return self::escapeIdInternal( $id, 'legacy' );
	}
	return self::escapeIdInternal( $id, 'html5' );
}
1200 | |
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment. Always uses the 'html5' mode.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	$fragmentMode = 'html5';
	return self::escapeIdInternalUrl( $id, $fragmentMode );
}
1216 | |
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment for external interwikis. Always uses the 'legacy' mode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
private static function escapeIdForExternalInterwiki( string $id ): string {
	// Assume $wgExternalInterwikiFragmentMode = 'legacy'
	$fragmentMode = 'legacy';
	return self::escapeIdInternalUrl( $id, $fragmentMode );
}
1230 | |
/**
 * Escape an id for use in an href: same as escapeIdInternal(), plus, in
 * 'html5' mode, percent-encoding of the percent sign of anything that
 * looks like a percent escape ("%" followed by two hex digits becomes
 * "%25" followed by those digits). This is not done for id attributes.
 *
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}
	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1246 | |
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * - 'html5': tab, LF, FF, CR and space are each replaced by '_'.
 * - 'legacy': spaces become '_', the result is urlencode()d, then '%3A'
 *   is turned back into ':' and every remaining '%' into '.'.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode ('html5' or 'legacy')
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );

		case 'legacy':
			// This corresponds to 'noninitial' mode of the old escapeId
			$encoded = urlencode( str_replace( ' ', '_', $id ) );
			// strtr() tries the longest key first, so '%3A' wins over '%'.
			return strtr( $encoded, [
				'%3A' => ':',
				'%' => '.'
			] );

		default:
			// Close the quote opened in front of the method name.
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}
}
1281 | |
/**
 * Given a string containing a space delimited list of ids, escape each id
 * to match ids escaped by the escapeIdForAttribute() function.
 *
 * The list is split on runs of whitespace (empty items are dropped), and
 * the escaped ids are joined back with single spaces. An input without
 * any ids yields ''.
 *
 * @since 1.27
 *
 * @param string $referenceString Space delimited list of ids
 * @return string
 */
public static function escapeIdReferenceList( string $referenceString ): string {
	$ids = preg_split( '/\s+/', $referenceString, -1, PREG_SPLIT_NO_EMPTY );

	$escapedIds = array_map(
		static fn ( string $id ): string => self::escapeIdForAttribute( $id ),
		$ids
	);

	return implode( ' ', $escapedIds );
}
1306 | |
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the ids that are used for
 * section links.
 *
 * Every run of spaces and/or underscores collapses to a single space, and
 * the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhiteSpace( string $section ): string {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $collapsed );
}
1318 | } |