Code Coverage

| | Lines | | Functions and Methods | | | Classes and Traits | |
|---|---|---|---|---|---|---|---|
| Total | 0.00% | 0 / 218 | 0.00% | 0 / 1 | CRAP | 0.00% | 0 / 1 |
| Consts | 0.00% | 0 / 217 | 0.00% | 0 / 1 | 110 | 0.00% | 0 / 1 |
| init | 0.00% | 0 / 217 | 0.00% | 0 / 1 | 110 | | |

| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wikitext; |
| 5 | |
| 6 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 7 | |
| 8 | class Consts { // Static lookup tables (media options, tag sets, tag widths, language-converter flags), assigned in init(). |
| 9 | public static array $Media; |
| 10 | public static array $Sanitizer; |
| 11 | public static array $WikitextTagsWithTrimmableWS; |
| 12 | public static array $HTMLTagsRequiringSOLContext; |
| 13 | public static array $WTQuoteTags; |
| 14 | public static array $SolSpaceSensitiveTags; // NOTE(review): never assigned in init(); reading an uninitialized typed property throws an Error -- confirm it is set elsewhere or unused. |
| 15 | public static array $HTML; |
| 16 | public static array $WTTagsWithNoClosingTags; |
| 17 | public static array $Output; |
| 18 | public static array $WtTagWidths; |
| 19 | public static array $ZeroWidthWikitextTags; |
| 20 | public static array $LCFlagMap; |
| 21 | public static array $LCNameMap; |
| 22 | public static array $blockElems; |
| 23 | public static array $antiBlockElems; |
| 24 | public static array $alwaysBlockElems; |
| 25 | public static array $neverBlockElems; |
| 26 | public static array $wikitextBlockElems; |
| 27 | public static string $strippedUrlCharacters; |
| 28 | // Assigns the static tables declared above, then derives $ZeroWidthWikitextTags and $LCNameMap from them. |
| 29 | public static function init() { |
| 30 | /* |
| 31 | * Valid media options: |
| 32 | * - Prefix options are of the form "alt=foo" |
| 33 | * - Simple options are of the form "center" |
| 34 | * |
| 35 | * See http://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax |
| 36 | * for more information about how they are used. |
| 37 | */ |
| 38 | self::$Media = [ |
| 39 | 'PrefixOptions' => [ |
| 40 | 'img_link' => 'link', |
| 41 | 'img_alt' => 'alt', |
| 42 | 'img_page' => 'page', |
| 43 | 'img_lang' => 'lang', # see T34987 |
| 44 | 'img_upright' => 'upright', |
| 45 | 'img_width' => 'width', |
| 46 | 'img_class' => 'class', |
| 47 | 'img_manualthumb' => 'manualthumb', |
| 48 | 
| 49 | 'timedmedia_thumbtime' => 'thumbtime', |
| 50 | 'timedmedia_starttime' => 'start', |
| 51 | 'timedmedia_endtime' => 'end', |
| 52 | 'timedmedia_disablecontrols' => 'disablecontrols' # See T135537 |
| 53 | ], |
| 54 | 'SimpleOptions' => [ |
| 55 | # halign |
| 56 | 'img_left' => 'halign', |
| 57 | 'img_right' => 'halign', |
| 58 | 'img_center' => 'halign', |
| 59 | 'img_none' => 'halign', |
| 60 | 
| 61 | # valign |
| 62 | 'img_baseline' => 'valign', |
| 63 | 'img_sub' => 'valign', |
| 64 | 'img_super' => 'valign', |
| 65 | 'img_top' => 'valign', |
| 66 | 'img_text_top' => 'valign', |
| 67 | 'img_middle' => 'valign', |
| 68 | 'img_bottom' => 'valign', |
| 69 | 'img_text_bottom' => 'valign', |
| 70 | 
| 71 | # format |
| 72 | # 'border' can be given in addition to *one of* |
| 73 | # frameless, framed, or thumbnail |
| 74 | 'img_border' => 'border', |
| 75 | 'img_frameless' => 'format', |
| 76 | 'img_framed' => 'format', |
| 77 | 'img_thumbnail' => 'format', |
| 78 | 
| 79 | # Ha! Upright can be either one (it is also listed under PrefixOptions above)! Try parsing THAT! |
| 80 | 'img_upright' => 'upright', |
| 81 | 
| 82 | 'timedmedia_loop' => 'loop', # T308230 |
| 83 | 'timedmedia_muted' => 'muted', # T308230 |
| 84 | ] |
| 85 | ]; |
| 86 | 
| 87 | self::$Sanitizer = [ |
| 88 | # List of allowed tags that can be used as raw HTML in wikitext. |
| 89 | # All other html/html-like tags will be spit out as text. |
| 90 | 'AllowedLiteralTags' => PHPUtils::makeSet( [ |
| 91 | # In case you were wondering, explicit <a .. > HTML is NOT allowed in wikitext. |
| 92 | # That is why the <a> tag is missing from the allowed list. |
| 93 | 'abbr', |
| 94 | 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', |
| 95 | 'caption', 'center', 'cite', 'code', |
| 96 | 'data', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', |
| 97 | 'em', |
| 98 | 'font', |
| 99 | 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', |
| 100 | 'i', 'ins', |
| 101 | 'kbd', |
| 102 | 'li', |
| 103 | 'mark', |
| 104 | 'ol', |
| 105 | 'p', 'pre', |
| 106 | 'q', |
| 107 | 'rb', 'rp', 'rt', 'rtc', 'ruby', |
| 108 | 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', |
| 109 | 'table', 'td', 'th', 'time', 'tr', 'tt', |
| 110 | 'u', 'ul', |
| 111 | 'var', |
| 112 | 'wbr', |
| 113 | ] ), |
| 114 | ]; |
| 115 | 
| 116 | /** |
| 117 | * These HTML tags come from native wikitext markup and |
| 118 | * (as long as they are not literal HTML tags in the wikitext source) |
| 119 | * should have whitespace trimmed from their content. |
| 120 | */ |
| 121 | self::$WikitextTagsWithTrimmableWS = PHPUtils::makeSet( [ |
| 122 | "h1", "h2", "h3", "h4", "h5", "h6", |
| 123 | "ol", "li", "ul", "dd", "dl", "dt", |
| 124 | "td", "th", "caption" |
| 125 | ] ); |
| 126 | 
| 127 | # These HTML tags will be generated only if |
| 128 | # the corresponding wikitext occurs in a SOL context. |
| 129 | self::$HTMLTagsRequiringSOLContext = PHPUtils::makeSet( [ |
| 130 | "pre", |
| 131 | "h1", "h2", "h3", "h4", "h5", "h6", |
| 132 | "ol", "li", "ul", "dd", "dl", "dt", |
| 133 | ] ); |
| 134 | 
| 135 | # These wikitext tags are composed with quote-chars. |
| 136 | self::$WTQuoteTags = PHPUtils::makeSet( [ 'i', 'b' ] ); |
| 137 | 
| 138 | // These are defined in the legacy parser's `BlockLevelPass` |
| 139 | 
| 140 | // Opens block scope when entering, closes when exiting |
| 141 | self::$blockElems = PHPUtils::makeSet( [ |
| 142 | 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'p', 'ul', |
| 143 | 'ol', 'dl' |
| 144 | ] ); |
| 145 | // Closes block scope when entering, opens when exiting |
| 146 | self::$antiBlockElems = PHPUtils::makeSet( [ 'td', 'th' ] ); |
| 147 | // Opens block scope when entering, opens when exiting too |
| 148 | self::$alwaysBlockElems = PHPUtils::makeSet( [ |
| 149 | 'tr', 'caption', 'dt', 'dd', 'li' |
| 150 | ] ); |
| 151 | // Closes block scope when entering, closes when exiting too |
| 152 | self::$neverBlockElems = PHPUtils::makeSet( [ |
| 153 | 'center', 'blockquote', 'div', 'hr', 'figure', 'aside', // T278565 |
| 154 | ] ); |
| 155 | 
| 156 | self::$wikitextBlockElems = PHPUtils::makeSet( array_merge( |
| 157 | array_keys( self::$blockElems ), |
| 158 | array_keys( self::$antiBlockElems ), |
| 159 | array_keys( self::$alwaysBlockElems ), |
| 160 | array_keys( self::$neverBlockElems ) |
| 161 | ) ); |
| 162 | 
| 163 | self::$HTML = [ |
| 164 | # The list of HTML5 tags, mainly used for the identification of *non*-html tags. |
| 165 | # Non-html tags terminate otherwise tag-eating rules in the tokenizer |
| 166 | # to support potential extension tags. |
| 167 | 'HTML5Tags' => PHPUtils::makeSet( [ |
| 168 | "a", "abbr", "address", "area", "article", |
| 169 | "aside", "audio", "b", "base", "bdi", "bdo", "blockquote", |
| 170 | "body", "br", "button", "canvas", "caption", "cite", "code", |
| 171 | "col", "colgroup", "data", "datalist", "dd", "del", |
| 172 | "details", "dfn", "div", "dl", "dt", "em", "embed", "fieldset", |
| 173 | "figcaption", "figure", "footer", "form", |
| 174 | "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", |
| 175 | "hr", "html", "i", "iframe", "img", "input", "ins", "kbd", "keygen", |
| 176 | "label", "legend", "li", "link", "map", "mark", "menu", "meta", |
| 177 | "meter", "nav", "noscript", "object", "ol", "optgroup", "option", |
| 178 | "output", "p", "param", "pre", "progress", "q", "rb", "rp", "rt", |
| 179 | "rtc", "ruby", "s", "samp", "script", "section", "select", "small", |
| 180 | "source", "span", "strong", "style", "sub", "summary", "sup", |
| 181 | "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time", |
| 182 | "title", "tr", "track", "u", "ul", "var", "video", "wbr", |
| 183 | ] ), |
| 184 | 
| 185 | /** |
| 186 | * https://html.spec.whatwg.org/multipage/dom.html#metadata-content-2 |
| 187 | * @type {Set} |
| 188 | */ |
| 189 | 'MetaDataTags' => PHPUtils::makeSet( [ |
| 190 | "base", "link", "meta", "noscript", "script", "style", "template", "title" |
| 191 | ] ), |
| 192 | 
| 193 | # From http://www.w3.org/TR/html5-diff/#obsolete-elements |
| 194 | # SSS FIXME: basefont is missing here, but looks like the PHP parser |
| 195 | # does not support it anyway and treats it as plain text. So, skipping |
| 196 | # this one in Parsoid as well. |
| 197 | 'OlderHTMLTags' => PHPUtils::makeSet( [ |
| 198 | "strike", "big", "center", "font", "tt", |
| 199 | ] ), |
| 200 | 
| 201 | # See http://www.w3.org/html/wg/drafts/html/master/syntax.html#formatting |
| 202 | 'FormattingTags' => PHPUtils::makeSet( [ |
| 203 | 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', |
| 204 | 's', 'small', 'strike', 'strong', 'tt', 'u', |
| 205 | ] ), |
| 206 | 
| 207 | /** |
| 208 | * From \MediaWiki\Tidy\RemexCompatMunger::$onlyInlineElements |
| 209 | */ |
| 210 | 'OnlyInlineElements' => PHPUtils::makeSet( [ |
| 211 | 'a', 'abbr', 'acronym', 'applet', 'audio', 'b', 'basefont', 'bdi', 'bdo', |
| 212 | 'big', 'br', 'button', 'cite', 'code', 'data', 'del', 'dfn', 'em', |
| 213 | 'font', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', |
| 214 | 'legend', 'map', 'mark', 'object', 'param', 'q', 'rb', 'rbc', 'rp', |
| 215 | 'rt', 'rtc', 'ruby', 's', 'samp', 'select', 'small', 'source', 'span', |
| 216 | 'strike', 'strong', 'sub', 'sup', 'textarea', 'time', 'track', 'tt', 'u', |
| 217 | 'var', 'video', 'wbr' |
| 218 | ] ), |
| 219 | 
| 220 | 'ListTags' => PHPUtils::makeSet( [ 'ul', 'ol', 'dl' ] ), |
| 221 | 
| 222 | 'ListItemTags' => PHPUtils::makeSet( [ 'li', 'dd', 'dt' ] ), |
| 223 | 
| 224 | 'FosterablePosition' => PHPUtils::makeSet( [ 'table', 'thead', 'tbody', 'tfoot', 'tr' ] ), |
| 225 | 
| 226 | 'TableContentModels' => [ |
| 227 | 'table' => [ 'caption', 'colgroup', 'thead', 'tbody', 'tr', 'tfoot' ], |
| 228 | 'thead' => [ 'tr' ], |
| 229 | 'tbody' => [ 'tr' ], |
| 230 | 'tfoot' => [ 'tr' ], |
| 231 | 'tr' => [ 'td', 'th' ] |
| 232 | ], |
| 233 | 
| 234 | 'TableTags' => PHPUtils::makeSet( [ |
| 235 | 'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td', |
| 236 | ] ), |
| 237 | 
| 238 | # Table tags that can be children |
| 239 | 'ChildTableTags' => PHPUtils::makeSet( [ |
| 240 | "tbody", "thead", "tfoot", "tr", "caption", "th", "td", |
| 241 | ] ), |
| 242 | 
| 243 | # See https://html.spec.whatwg.org/#void-elements |
| 244 | 'VoidTags' => PHPUtils::makeSet( [ |
| 245 | 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', |
| 246 | 'input', 'link', 'meta', 'param', 'source', |
| 247 | 'track', 'wbr', |
| 248 | ] ), |
| 249 | 
| 250 | # HTML5 elements with raw (unescaped) content |
| 251 | 'RawTextElements' => PHPUtils::makeSet( [ |
| 252 | 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', |
| 253 | 'plaintext', 'noscript', |
| 254 | ] ), |
| 255 | ]; |
| 256 | 
| 257 | /** |
| 258 | * These HTML tags have native wikitext representations. |
| 259 | * The wikitext equivalents do not have closing tags. |
| 260 | * @type {Set} |
| 261 | */ |
| 262 | self::$WTTagsWithNoClosingTags = PHPUtils::makeSet( [ |
| 263 | "pre", "li", "dt", "dd", "hr", "tr", "td", "th" |
| 264 | ] ); |
| 265 | 
| 266 | self::$Output = [ |
| 267 | 'FlaggedEmptyElts' => PHPUtils::makeSet( [ |
| 268 | 'li', 'tr', 'p', |
| 269 | ] ), |
| 270 | ]; |
| 271 | 
| 272 | # Known wikitext tag widths -- these are known statically |
| 273 | # but other widths are computed or updated based on actual wikitext usage |
| 274 | self::$WtTagWidths = [ |
| 275 | "body" => [ 0, 0 ], |
| 276 | "html" => [ 0, 0 ], |
| 277 | "head" => [ 0, 0 ], |
| 278 | "p" => [ 0, 0 ], |
| 279 | "meta" => [ 0, 0 ], |
| 280 | // @see PreHandler::newIndentPreWS() for why opening width is 0, not 1 |
| 281 | "pre" => [ 0, 0 ], |
| 282 | "ol" => [ 0, 0 ], |
| 283 | "ul" => [ 0, 0 ], |
| 284 | "dl" => [ 0, 0 ], |
| 285 | "li" => [ 1, 0 ], |
| 286 | "dt" => [ 1, 0 ], |
| 287 | "dd" => [ 1, 0 ], |
| 288 | "h1" => [ 1, 1 ], |
| 289 | "h2" => [ 2, 2 ], |
| 290 | "h3" => [ 3, 3 ], |
| 291 | "h4" => [ 4, 4 ], |
| 292 | "h5" => [ 5, 5 ], |
| 293 | "h6" => [ 6, 6 ], |
| 294 | "hr" => [ 4, 0 ], |
| 295 | "table" => [ 2, 2 ], |
| 296 | "tbody" => [ 0, 0 ], |
| 297 | "thead" => [ 0, 0 ], |
| 298 | "tfoot" => [ 0, 0 ], |
| 299 | "tr" => [ null, 0 ], |
| 300 | "td" => [ null, 0 ], |
| 301 | "th" => [ null, 0 ], |
| 302 | "b" => [ 3, 3 ], |
| 303 | "i" => [ 2, 2 ], |
| 304 | "br" => [ 0, 0 ], |
| 305 | "figure" => [ 2, 2 ], |
| 306 | "figcaption" => [ 0, 0 ], |
| 307 | ]; |
| 308 | 
| 309 | # HTML tags whose wikitext equivalents are zero-width. |
| 310 | # This information is derived from WtTagWidths and set below. |
| 311 | self::$ZeroWidthWikitextTags = PHPUtils::makeSet( [] ); |
| 312 | 
| 313 | # Map LanguageConverter wikitext flags to readable JSON field names. |
| 314 | self::$LCFlagMap = [ |
| 315 | # These first three flags are used internally during flag processing, |
| 316 | # but should never appear in the output wikitext, so we prepend them |
| 317 | # with '$'. |
| 318 | 
| 319 | # 'S': Show converted text |
| 320 | '$S' => 'show', |
| 321 | # '+': Add conversion rule |
| 322 | '$+' => 'add', |
| 323 | # 'E': Error in the given flags |
| 324 | '$E' => 'error', |
| 325 | 
| 326 | # The rest of these are valid flags in wikitext. |
| 327 | 
| 328 | # 'A': add conversion rule *and show converted text* (implies S) |
| 329 | 'A' => 'add', |
| 330 | # 'T': Convert and override page title |
| 331 | 'T' => 'title', |
| 332 | # 'R': Disable language conversion (exclusive flag) |
| 333 | 'R' => 'disabled', |
| 334 | # 'D': Describe conversion rule (without adding to table) |
| 335 | 'D' => 'describe', |
| 336 | # '-': Remove existing conversion rule (exclusive flag) |
| 337 | '-' => 'remove', |
| 338 | # 'H': add rule for convert code (but no display in placed code) |
| 339 | 'H' => '', # this is handled implicitly as a lack of 'show' |
| 340 | # 'N': Output current variant name (exclusive flag) |
| 341 | 'N' => 'name', |
| 342 | ]; |
| 343 | 
| 344 | # Map JSON field names to LanguageConverter wikitext flags. |
| 345 | # This information is derived from LCFlagMap and set below. |
| 346 | self::$LCNameMap = []; |
| 347 | 
| 348 | # Derived information from 'WtTagWidths' |
| 349 | foreach ( self::$WtTagWidths as $tag => $widths ) { |
| 350 | # This special case can be fixed by maybe removing them from WtTagWidths. |
| 351 | # They may no longer be necessary -- to be investigated in another patch. |
| 352 | if ( $tag !== 'html' && $tag !== 'head' && $tag !== 'body' ) { |
| 353 | if ( $widths[0] === 0 && $widths[1] === 0 ) { |
| 354 | // @see explanation in PreHandler::newIndentPreWS() |
| 355 | // to understand this special case |
| 356 | if ( $tag !== 'pre' ) { |
| 357 | self::$ZeroWidthWikitextTags[$tag] = true; |
| 358 | } |
| 359 | } |
| 360 | } |
| 361 | } |
| 362 | 
| 363 | # Derived information from `LCFlagMap` |
| 364 | foreach ( self::$LCFlagMap as $k => $v ) { |
| 365 | if ( $v ) { // skip flags mapped to an empty name ('H') |
| 366 | self::$LCNameMap[$v] = $k; |
| 367 | } |
| 368 | } |
| 369 | 
| 370 | # Handle ambiguity in inverse mapping: both '$+' and 'A' map to 'add'; use 'A'. |
| 371 | self::$LCNameMap['add'] = 'A'; |
| 372 | 
| 373 | /* |
| 374 | * These characters are not considered to be part of a URL if they are the last |
| 375 | * character of a raw URL when converting it to an HTML link. |
| 376 | * Right bracket would also be in that set, but only if there's no left bracket in the URL; |
| 377 | * see TokenizerUtils::getAutoUrlTerminatingChars. |
| 378 | */ |
| 379 | self::$strippedUrlCharacters = ',;\.:!?'; |
| 380 | } |
| 381 | } |
| 382 | |
| 383 | Consts::init(); // Runs when this file is loaded, so the static tables are assigned before first use. |