Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 218 |
|
0.00% |
0 / 1 |
CRAP | |
0.00% |
0 / 1 |
Consts | |
0.00% |
0 / 217 |
|
0.00% |
0 / 1 |
110 | |
0.00% |
0 / 1 |
init | |
0.00% |
0 / 217 |
|
0.00% |
0 / 1 |
110 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wikitext; |
5 | |
6 | use Wikimedia\Parsoid\Utils\PHPUtils; |
7 | |
8 | class Consts { |
9 | public static $Media; |
10 | public static $Sanitizer; |
11 | public static $WikitextTagsWithTrimmableWS; |
12 | public static $HTMLTagsRequiringSOLContext; |
13 | public static $WTQuoteTags; |
14 | public static $SolSpaceSensitiveTags; |
15 | public static $HTML; |
16 | public static $WTTagsWithNoClosingTags; |
17 | public static $Output; |
18 | public static $WtTagWidths; |
19 | public static $ZeroWidthWikitextTags; |
20 | public static $LCFlagMap; |
21 | public static $LCNameMap; |
22 | public static $blockElems; |
23 | public static $antiBlockElems; |
24 | public static $alwaysBlockElems; |
25 | public static $neverBlockElems; |
26 | public static $wikitextBlockElems; |
27 | public static $strippedUrlCharacters; |
28 | |
29 | public static function init() { |
30 | /* |
31 | * Valid media options: |
32 | * - Prefix options are of the form "alt=foo" |
33 | * - Simple options are of the form "center" |
34 | * |
35 | * See https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax |
36 | * for more information about how they are used. |
37 | */ |
38 | self::$Media = [ |
39 | 'PrefixOptions' => [ |
40 | 'img_link' => 'link', |
41 | 'img_alt' => 'alt', |
42 | 'img_page' => 'page', |
43 | 'img_lang' => 'lang', # see T34987 |
44 | 'img_upright' => 'upright', |
45 | 'img_width' => 'width', |
46 | 'img_class' => 'class', |
47 | 'img_manualthumb' => 'manualthumb', |
48 | |
49 | 'timedmedia_thumbtime' => 'thumbtime', |
50 | 'timedmedia_starttime' => 'start', |
51 | 'timedmedia_endtime' => 'end', |
52 | 'timedmedia_disablecontrols' => 'disablecontrols' # See T135537 |
53 | ], |
54 | 'SimpleOptions' => [ |
55 | # halign |
56 | 'img_left' => 'halign', |
57 | 'img_right' => 'halign', |
58 | 'img_center' => 'halign', |
59 | 'img_none' => 'halign', |
60 | |
61 | # valign |
62 | 'img_baseline' => 'valign', |
63 | 'img_sub' => 'valign', |
64 | 'img_super' => 'valign', |
65 | 'img_top' => 'valign', |
66 | 'img_text_top' => 'valign', |
67 | 'img_middle' => 'valign', |
68 | 'img_bottom' => 'valign', |
69 | 'img_text_bottom' => 'valign', |
70 | |
71 | # format |
72 | # 'border' can be given in addition to *one of* |
73 | # frameless, framed, or thumbnail |
74 | 'img_border' => 'border', |
75 | 'img_frameless' => 'format', |
76 | 'img_framed' => 'format', |
77 | 'img_thumbnail' => 'format', |
78 | |
79 | # 'img_upright' is listed under both PrefixOptions and SimpleOptions, so it can be parsed either way. |
80 | 'img_upright' => 'upright', |
81 | |
82 | 'timedmedia_loop' => 'loop', # T308230 |
83 | 'timedmedia_muted' => 'muted', # T308230 |
84 | ] |
85 | ]; |
86 | |
87 | self::$Sanitizer = [ |
88 | # List of allowed tags that can be used as raw HTML in wikitext. |
89 | # All other html/html-like tags will be emitted as text. |
90 | 'AllowedLiteralTags' => PHPUtils::makeSet( [ |
91 | # In case you were wondering, explicit <a .. > HTML is NOT allowed in wikitext. |
92 | # That is why the <a> tag is missing from the allowed list. |
93 | 'abbr', |
94 | 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', |
95 | 'caption', 'center', 'cite', 'code', |
96 | 'data', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', |
97 | 'em', |
98 | 'font', |
99 | 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', |
100 | 'i', 'ins', |
101 | 'kbd', |
102 | 'li', |
103 | 'mark', |
104 | 'ol', |
105 | 'p', 'pre', |
106 | 'q', |
107 | 'rb', 'rp', 'rt', 'rtc', 'ruby', |
108 | 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', |
109 | 'table', 'td', 'th', 'time', 'tr', 'tt', |
110 | 'u', 'ul', |
111 | 'var', |
112 | 'wbr', |
113 | ] ), |
114 | ]; |
115 | |
116 | /** |
117 | * These HTML tags come from native wikitext markup and |
118 | * (as long as they are not literal HTML tags in the wikitext source) |
119 | * should have whitespace trimmed from their content. |
120 | */ |
121 | self::$WikitextTagsWithTrimmableWS = PHPUtils::makeSet( [ |
122 | "h1", "h2", "h3", "h4", "h5", "h6", |
123 | "ol", "li", "ul", "dd", "dl", "dt", |
124 | "td", "th", "caption" |
125 | ] ); |
126 | |
127 | # These HTML tags will be generated only if |
128 | # the corresponding wikitext occurs in a SOL context. |
129 | self::$HTMLTagsRequiringSOLContext = PHPUtils::makeSet( [ |
130 | "pre", |
131 | "h1", "h2", "h3", "h4", "h5", "h6", |
132 | "ol", "li", "ul", "dd", "dl", "dt", |
133 | ] ); |
134 | |
135 | # These wikitext tags are composed with quote-chars. |
136 | self::$WTQuoteTags = PHPUtils::makeSet( [ 'i', 'b' ] ); |
137 | |
138 | // These are defined in the legacy parser's `BlockLevelPass` |
139 | |
140 | // Opens block scope when entering, closes when exiting |
141 | self::$blockElems = PHPUtils::makeSet( [ |
142 | 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'p', 'ul', |
143 | 'ol', 'dl' |
144 | ] ); |
145 | // Closes block scope when entering, opens when exiting |
146 | self::$antiBlockElems = PHPUtils::makeSet( [ 'td', 'th' ] ); |
147 | // Opens block scope when entering, opens when exiting too |
148 | self::$alwaysBlockElems = PHPUtils::makeSet( [ |
149 | 'tr', 'caption', 'dt', 'dd', 'li' |
150 | ] ); |
151 | // Closes block scope when entering, closes when exiting too |
152 | self::$neverBlockElems = PHPUtils::makeSet( [ |
153 | 'center', 'blockquote', 'div', 'hr', 'figure', 'aside', // T278565 |
154 | ] ); |
155 | |
156 | self::$wikitextBlockElems = PHPUtils::makeSet( array_merge( |
157 | array_keys( self::$blockElems ), |
158 | array_keys( self::$antiBlockElems ), |
159 | array_keys( self::$alwaysBlockElems ), |
160 | array_keys( self::$neverBlockElems ) |
161 | ) ); |
162 | |
163 | self::$HTML = [ |
164 | # The list of HTML5 tags, mainly used for the identification of *non*-html tags. |
165 | # Non-html tags terminate otherwise tag-eating rules in the tokenizer |
166 | # to support potential extension tags. |
167 | 'HTML5Tags' => PHPUtils::makeSet( [ |
168 | "a", "abbr", "address", "area", "article", |
169 | "aside", "audio", "b", "base", "bdi", "bdo", "blockquote", |
170 | "body", "br", "button", "canvas", "caption", "cite", "code", |
171 | "col", "colgroup", "data", "datalist", "dd", "del", |
172 | "details", "dfn", "div", "dl", "dt", "em", "embed", "fieldset", |
173 | "figcaption", "figure", "footer", "form", |
174 | "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", |
175 | "hr", "html", "i", "iframe", "img", "input", "ins", "kbd", "keygen", |
176 | "label", "legend", "li", "link", "map", "mark", "menu", "meta", |
177 | "meter", "nav", "noscript", "object", "ol", "optgroup", "option", |
178 | "output", "p", "param", "pre", "progress", "q", "rb", "rp", "rt", |
179 | "rtc", "ruby", "s", "samp", "script", "section", "select", "small", |
180 | "source", "span", "strong", "style", "sub", "summary", "sup", |
181 | "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time", |
182 | "title", "tr", "track", "u", "ul", "var", "video", "wbr", |
183 | ] ), |
184 | |
185 | /** |
186 | * https://html.spec.whatwg.org/multipage/dom.html#metadata-content-2 |
187 | * Built with PHPUtils::makeSet(). |
188 | */ |
189 | 'MetaDataTags' => PHPUtils::makeSet( [ |
190 | "base", "link", "meta", "noscript", "script", "style", "template", "title" |
191 | ] ), |
192 | |
193 | # From http://www.w3.org/TR/html5-diff/#obsolete-elements |
194 | # SSS FIXME: basefont is missing here, but looks like the PHP parser |
195 | # does not support it anyway and treats it as plain text. So, skipping |
196 | # this one in Parsoid as well. |
197 | 'OlderHTMLTags' => PHPUtils::makeSet( [ |
198 | "strike", "big", "center", "font", "tt", |
199 | ] ), |
200 | |
201 | # See http://www.w3.org/html/wg/drafts/html/master/syntax.html#formatting |
202 | 'FormattingTags' => PHPUtils::makeSet( [ |
203 | 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', |
204 | 's', 'small', 'strike', 'strong', 'tt', 'u', |
205 | ] ), |
206 | |
207 | /** |
208 | * From \MediaWiki\Tidy\RemexCompatMunger::$onlyInlineElements |
209 | */ |
210 | 'OnlyInlineElements' => PHPUtils::makeSet( [ |
211 | 'a', 'abbr', 'acronym', 'applet', 'b', 'basefont', 'bdo', |
212 | 'big', 'br', 'button', 'cite', 'code', 'del', 'dfn', 'em', |
213 | 'font', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', |
214 | 'legend', 'map', 'object', 'param', 'q', 'rb', 'rbc', 'rp', |
215 | 'rt', 'rtc', 'ruby', 's', 'samp', 'select', 'small', 'span', |
216 | 'strike', 'strong', 'sub', 'sup', 'textarea', 'tt', 'u', 'var', |
217 | // Those defined in tidy.conf |
218 | 'video', 'audio', 'bdi', 'data', 'time', 'mark', |
219 | ] ), |
220 | |
221 | 'ListTags' => PHPUtils::makeSet( [ 'ul', 'ol', 'dl' ] ), |
222 | |
223 | 'ListItemTags' => PHPUtils::makeSet( [ 'li', 'dd', 'dt' ] ), |
224 | |
225 | 'FosterablePosition' => PHPUtils::makeSet( [ 'table', 'thead', 'tbody', 'tfoot', 'tr' ] ), |
226 | |
227 | 'TableContentModels' => [ |
228 | 'table' => [ 'caption', 'colgroup', 'thead', 'tbody', 'tr', 'tfoot' ], |
229 | 'thead' => [ 'tr' ], |
230 | 'tbody' => [ 'tr' ], |
231 | 'tfoot' => [ 'tr' ], |
232 | 'tr' => [ 'td', 'th' ] |
233 | ], |
234 | |
235 | 'TableTags' => PHPUtils::makeSet( [ |
236 | 'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td', |
237 | ] ), |
238 | |
239 | # Table tags that can be children (every entry of TableTags except 'table' itself) |
240 | 'ChildTableTags' => PHPUtils::makeSet( [ |
241 | "tbody", "thead", "tfoot", "tr", "caption", "th", "td", |
242 | ] ), |
243 | |
244 | # See https://html.spec.whatwg.org/#void-elements |
245 | 'VoidTags' => PHPUtils::makeSet( [ |
246 | 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', |
247 | 'input', 'link', 'meta', 'param', 'source', |
248 | 'track', 'wbr', |
249 | ] ), |
250 | |
251 | # HTML5 elements with raw (unescaped) content |
252 | 'RawTextElements' => PHPUtils::makeSet( [ |
253 | 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', |
254 | 'plaintext', 'noscript', |
255 | ] ), |
256 | ]; |
257 | |
258 | /** |
259 | * These HTML tags have native wikitext representations. |
260 | * The wikitext equivalents do not have closing tags. |
261 | * Built with PHPUtils::makeSet(). |
262 | */ |
263 | self::$WTTagsWithNoClosingTags = PHPUtils::makeSet( [ |
264 | "pre", "li", "dt", "dd", "hr", "tr", "td", "th" |
265 | ] ); |
266 | |
267 | self::$Output = [ |
268 | 'FlaggedEmptyElts' => PHPUtils::makeSet( [ |
269 | 'li', 'tr', 'p', |
270 | ] ), |
271 | ]; |
272 | |
273 | # Known wikitext tag widths -- these are known statically |
274 | # but other widths are computed or updated based on actual wikitext usage |
275 | self::$WtTagWidths = [ |
276 | "body" => [ 0, 0 ], |
277 | "html" => [ 0, 0 ], |
278 | "head" => [ 0, 0 ], |
279 | "p" => [ 0, 0 ], |
280 | "meta" => [ 0, 0 ], |
281 | // @see PreHandler::newIndentPreWS() for why opening width is 0, not 1 |
282 | "pre" => [ 0, 0 ], |
283 | "ol" => [ 0, 0 ], |
284 | "ul" => [ 0, 0 ], |
285 | "dl" => [ 0, 0 ], |
286 | "li" => [ 1, 0 ], |
287 | "dt" => [ 1, 0 ], |
288 | "dd" => [ 1, 0 ], |
289 | "h1" => [ 1, 1 ], |
290 | "h2" => [ 2, 2 ], |
291 | "h3" => [ 3, 3 ], |
292 | "h4" => [ 4, 4 ], |
293 | "h5" => [ 5, 5 ], |
294 | "h6" => [ 6, 6 ], |
295 | "hr" => [ 4, 0 ], |
296 | "table" => [ 2, 2 ], |
297 | "tbody" => [ 0, 0 ], |
298 | "thead" => [ 0, 0 ], |
299 | "tfoot" => [ 0, 0 ], |
300 | "tr" => [ null, 0 ], |
301 | "td" => [ null, 0 ], |
302 | "th" => [ null, 0 ], |
303 | "b" => [ 3, 3 ], |
304 | "i" => [ 2, 2 ], |
305 | "br" => [ 0, 0 ], |
306 | "figure" => [ 2, 2 ], |
307 | "figcaption" => [ 0, 0 ], |
308 | ]; |
309 | |
310 | # HTML tags whose wikitext equivalents are zero-width. |
311 | # This information is derived from WtTagWidths and set below. |
312 | self::$ZeroWidthWikitextTags = PHPUtils::makeSet( [] ); |
313 | |
314 | # Map LanguageConverter wikitext flags to readable JSON field names. |
315 | self::$LCFlagMap = [ |
316 | # These first three flags are used internally during flag processing, |
317 | # but should never appear in the output wikitext, so we prefix them |
318 | # with '$'. |
319 | |
320 | # 'S': Show converted text |
321 | '$S' => 'show', |
322 | # '+': Add conversion rule |
323 | '$+' => 'add', |
324 | # 'E': Error in the given flags |
325 | '$E' => 'error', |
326 | |
327 | # The rest of these are valid flags in wikitext. |
328 | |
329 | # 'A': add conversion rule *and show converted text* (implies S) |
330 | 'A' => 'add', |
331 | # 'T': Convert and override page title |
332 | 'T' => 'title', |
333 | # 'R': Disable language conversion (exclusive flag) |
334 | 'R' => 'disabled', |
335 | # 'D': Describe conversion rule (without adding to table) |
336 | 'D' => 'describe', |
337 | # '-': Remove existing conversion rule (exclusive flag) |
338 | '-' => 'remove', |
339 | # 'H': add rule for convert code (but no display in placed code) |
340 | 'H' => '', # handled implicitly as a lack of 'show'; the empty name is skipped when LCNameMap is built |
341 | # 'N': Output current variant name (exclusive flag) |
342 | 'N' => 'name', |
343 | ]; |
344 | |
345 | # Map JSON field names to LanguageConverter wikitext flags. |
346 | # This information is derived from LCFlagMap and set below. |
347 | self::$LCNameMap = []; |
348 | |
349 | # Derived information from 'WtTagWidths' |
350 | foreach ( self::$WtTagWidths as $tag => $widths ) { |
351 | # This special case could perhaps be avoided by removing these tags from WtTagWidths. |
352 | # They may no longer be necessary -- to be investigated in another patch. |
353 | if ( $tag !== 'html' && $tag !== 'head' && $tag !== 'body' ) { |
354 | if ( $widths[0] === 0 && $widths[1] === 0 ) { |
355 | // @see explanation in PreHandler::newIndentPreWS() |
356 | // to understand this special case |
357 | if ( $tag !== 'pre' ) { |
358 | self::$ZeroWidthWikitextTags[$tag] = true; |
359 | } |
360 | } |
361 | } |
362 | } |
363 | |
364 | # Derived information from `LCFlagMap` |
365 | foreach ( self::$LCFlagMap as $k => $v ) { |
366 | if ( $v ) { |
367 | self::$LCNameMap[$v] = $k; |
368 | } |
369 | } |
370 | |
371 | # Both '$+' and 'A' map to 'add'; explicitly make 'A' the inverse for 'add'. |
372 | self::$LCNameMap['add'] = 'A'; |
373 | |
374 | /* |
375 | * These characters are not considered to be part of a URL if they are the last |
376 | * character of a raw URL when converting it to an HTML link. |
377 | * Right bracket would also be in that set, but only if there's no left bracket in the URL; |
378 | * see TokenizerUtils::getAutoUrlTerminatingChars. |
379 | */ |
380 | self::$strippedUrlCharacters = ',;\.:!?'; |
381 | } |
382 | } |
383 | |
384 | Consts::init(); |