Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
62.60% |
82 / 131 |
|
56.25% |
9 / 16 |
CRAP | |
0.00% |
0 / 1 |
| Utils | |
62.60% |
82 / 131 |
|
56.25% |
9 / 16 |
180.83 | |
0.00% |
0 / 1 |
| convert | |
66.67% |
8 / 12 |
|
0.00% |
0 / 1 |
15.48 | |||
| wikitextToHTML | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| htmlToWikitext | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
| htmlToPlaintext | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| commentParser | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
5.27 | |||
| createDOM | |
54.84% |
17 / 31 |
|
0.00% |
0 / 1 |
7.30 | |||
| onFlowAddModules | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| saferSaveXML | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| getInnerHtml | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
| getOuterHtml | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| encodeHeadInfo | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
| decodeHeadInfo | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
| getParsoidVersion | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| createRelativeTitle | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
4 | |||
| getLanguageConverter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| getConvertedTitle | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Flow\Conversion; |
| 4 | |
| 5 | use DOMDocument; |
| 6 | use DOMElement; |
| 7 | use DOMNode; |
| 8 | use Flow\Exception\NoParserException; |
| 9 | use Flow\Exception\WikitextException; |
| 10 | use Flow\Parsoid\ContentFixer; |
| 11 | use Flow\Parsoid\Fixer\EmptyNodeFixer; |
| 12 | use MediaWiki\Content\TextContent; |
| 13 | use MediaWiki\Content\WikitextContent; |
| 14 | use MediaWiki\Html\Html; |
| 15 | use MediaWiki\Language\ILanguageConverter; |
| 16 | use MediaWiki\Language\Language; |
| 17 | use MediaWiki\MediaWikiServices; |
| 18 | use MediaWiki\Output\OutputPage; |
| 19 | use MediaWiki\Parser\ParserOptions; |
| 20 | use MediaWiki\Parser\Sanitizer; |
| 21 | use MediaWiki\Title\Title; |
| 22 | |
| 23 | abstract class Utils { |
| 24 | |
| 25 | public const PARSOID_VERSION = '2.0.0'; |
| 26 | |
/**
 * Convert from/to wikitext <=> html, or from topic-title-wikitext to
 * topic-title-html / topic-title-plaintext.
 *
 * Only these pairs are supported. html => wikitext requires Parsoid, and
 * topic-title-html => topic-title-wikitext is not supported. A $from of 'wt'
 * is treated as an alias of 'wikitext'.
 *
 * @param string $from Format of content to convert: html|wikitext|wt|topic-title-wikitext
 * @param string $to Format to convert to: html|wikitext|topic-title-html|topic-title-plaintext
 * @param string $content
 * @param Title $title
 * @return string $content unchanged when it is empty or when $from and $to
 *  name the same format, otherwise the converted content
 * @throws WikitextException When the requested conversion is unsupported
 * @throws NoParserException When the conversion fails
 * @return-taint none
 */
public static function convert( $from, $to, $content, Title $title ) {
	if ( $from === $to || $content === '' ) {
		return $content;
	}

	if ( $from === 'wt' ) {
		$from = 'wikitext';
		// Re-check once the alias is resolved: 'wt' => 'wikitext' is a no-op
		// and must not fall through to the "unsupported conversion" branch.
		if ( $from === $to ) {
			return $content;
		}
	}

	if ( $from === 'wikitext' && $to === 'html' ) {
		return self::wikitextToHTML( $content, $title );
	}

	if ( $from === 'html' && $to === 'wikitext' ) {
		return self::htmlToWikitext( $content, $title );
	}

	if ( $from === 'topic-title-wikitext' &&
		( $to === 'topic-title-html' || $to === 'topic-title-plaintext' )
	) {
		// FIXME: links need to be processed by findVariantLinks or an equivalent function
		return self::getLanguageConverter()->convert( self::commentParser( $from, $to, $content ) );
	}

	// Anything else is unsupported; commentParser() throws the WikitextException for it.
	return self::commentParser( $from, $to, $content );
}
| 62 | |
/**
 * Parse wikitext with the Parsoid parser and return the resulting <body> element.
 *
 * @param string $wikitext
 * @param Title $title
 *
 * @return string The converted wikitext as HTML, including the surrounding <body> tag
 * @throws WikitextException When the parser output contains no <body> element
 */
private static function wikitextToHTML( string $wikitext, Title $title ) {
	$parserOptions = ParserOptions::newFromAnon();
	$parserOptions->setRenderReason( __METHOD__ );

	$parser = MediaWikiServices::getInstance()->getParsoidParserFactory()->create();
	$parserOutput = $parser->parse( $wikitext, $title, $parserOptions );

	// $parserOutput->getText() will strip off the body tag, but we want to retain it here.
	// So we call ->getRawText() and extract the <body> element ourselves.
	$matched = preg_match( "#<body[^>]*>(.*?)</body>#s", $parserOutput->getRawText(), $matches );
	if ( !$matched ) {
		// Covers both "no match" (0) and a PCRE failure (false); previously this
		// read an undefined offset and silently returned null.
		throw new WikitextException( 'Conversion to HTML failed: no <body> element in parser output' );
	}

	// Index 0 is the whole match, i.e. the <body> tag together with its contents.
	return $matches[0];
}
| 82 | |
/**
 * Convert HTML to wikitext through the HtmlToContentTransform service.
 *
 * @param string $html
 * @param Title $title
 *
 * @return string The wikitext produced from $html, trimmed
 * @throws WikitextException When the transform does not yield WikitextContent
 */
private static function htmlToWikitext( string $html, Title $title ) {
	$transformFactory = MediaWikiServices::getInstance()->getHtmlTransformFactory();
	$transform = $transformFactory->getHtmlToContentTransform( $html, $title );

	$options = [
		'contentmodel' => CONTENT_MODEL_WIKITEXT,
		'offsetType' => 'byte'
	];
	$transform->setOptions( $options );

	/** @var TextContent $content */
	$content = $transform->htmlToContent();

	if ( $content instanceof WikitextContent ) {
		return trim( $content->getTextForSearchIndex() );
	}

	throw new WikitextException( 'Conversion to Wikitext failed' );
}
| 108 | |
/**
 * Strip all tags from HTML to get plaintext, for use in recent changes,
 * history, and other places where a roundtrip is undesired.
 *
 * @param string $html
 * @param int|null $truncateLength Maximum length in characters (including ellipses);
 *  null falls back to a limit of 10000.
 * @param Language|null $lang Language to use for truncation. Defaults to $wgLang
 * @return string plaintext
 */
public static function htmlToPlaintext( $html, ?int $truncateLength = null, ?Language $lang = null ) {
	/** @var Language $wgLang */
	global $wgLang;

	$language = $lang ?? $wgLang;
	// Fall back to some large-ish value when no explicit limit was given.
	$limit = $truncateLength ?? 10000;
	$stripped = trim( Sanitizer::stripAllTags( $html ) );

	return $language->truncateForVisual( $stripped, $limit );
}
| 132 | |
/**
 * Convert topic-title-wikitext to topic-title-html or topic-title-plaintext
 * using the CommentFormatter service's formatLinks().
 *
 * @param string $from Format of content to convert: topic-title-wikitext
 * @param string $to Format of content to convert to: topic-title-html|topic-title-plaintext
 * @param string $content Content to convert, in topic-title-wikitext format.
 * @return string $content as HTML, or as plaintext when $to is topic-title-plaintext
 * @throws WikitextException When any other $from/$to pair is requested
 */
protected static function commentParser( $from, $to, $content ) {
	$isSupported = $from === 'topic-title-wikitext' &&
		( $to === 'topic-title-html' || $to === 'topic-title-plaintext' );
	if ( !$isSupported ) {
		throw new WikitextException( "Conversion from '$from' to '$to' was requested, " .
			"but this is not supported." );
	}

	$escaped = Sanitizer::escapeHtmlAllowEntities( $content );
	$formatted = MediaWikiServices::getInstance()->getCommentFormatter()->formatLinks( $escaped );

	return $to === 'topic-title-plaintext'
		? self::htmlToPlaintext( $formatted )
		: $formatted;
}
| 160 | |
/**
 * Turns given $content string into a DOMDocument object.
 *
 * Note that, by default, $content will be prefixed with <?xml encoding="utf-8"?> to force
 * libxml to interpret the content as UTF-8. If for some reason you don't want this to happen,
 * or you are certain that your input already has <?xml encoding="utf-8"?> or
 * <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> , then you can disable
 * this behavior by setting $utf8Fragment=false.
 *
 * Some libxml errors are forgivable; libxml errors that aren't
 * ignored will throw a WikitextException.
 *
 * The default error codes allowed are:
 *        9 - allow illegal characters (they are removed, but this option means it
 *            doesn't trigger an error).
 *       76 - allow unexpected end tag. This is typically old wikitext using deprecated tags.
 *      513 - allow multiple tags with same id
 *      801 - allow unrecognized tags like figcaption
 *
 * The libxml error handling state (and, on libxml < 2.9.0, the entity loader
 * state) is changed for the duration of the call and restored before returning
 * or throwing.
 *
 * @param string $content
 * @param bool $utf8Fragment If true, prefix $content with <?xml encoding="utf-8"?>
 * @param array $ignoreErrorCodes
 * @return DOMDocument
 * @throws WikitextException
 * @see http://www.xmlsoft.org/html/libxml-xmlerror.html
 */
public static function createDOM(
	$content,
	$utf8Fragment = true,
	array $ignoreErrorCodes = [ 9, 76, 513, 801 ]
) {
	$dom = new DOMDocument();

	$loadEntities = false;
	if ( LIBXML_VERSION < 20900 ) {
		// Otherwise the parser may attempt to load the dtd from an external source.
		// See: https://www.mediawiki.org/wiki/XML_External_Entity_Processing
		$loadEntities = libxml_disable_entity_loader( true );
	}

	// don't output warnings
	$useErrors = libxml_use_internal_errors( true );

	try {
		// Work around DOMDocument's morbid insistence on using iso-8859-1
		// Even $dom = new DOMDocument( '1.0', 'utf-8' ); doesn't work, you have to specify
		// encoding ="utf-8" in the string fed to loadHTML()
		$html = ( $utf8Fragment ? '<?xml encoding="utf-8"?>' : '' ) . $content;
		$dom->loadHTML( $html, LIBXML_PARSEHUGE );

		// check error codes; anything not in the supplied list of ignorable
		// errors makes us throw below
		$errors = array_filter(
			libxml_get_errors(),
			static function ( $error ) use ( $ignoreErrorCodes ) {
				// Loose comparison kept on purpose so callers passing numeric
				// strings in $ignoreErrorCodes keep working.
				return !in_array( $error->code, $ignoreErrorCodes );
			}
		);
	} finally {
		// Restore the global libxml state even when loadHTML() itself throws
		// (e.g. on an empty source string), so the settings changed above
		// do not leak into unrelated code.
		libxml_clear_errors();
		libxml_use_internal_errors( $useErrors );
		if ( LIBXML_VERSION < 20900 ) {
			libxml_disable_entity_loader( $loadEntities );
		}
	}

	if ( $errors ) {
		$messages = array_map(
			static function ( $error ) {
				return $error->message;
			},
			$errors
		);
		throw new WikitextException(
			implode( "\n", $messages ) . "\n\nFrom source content:\n" . $content,
			'process-wikitext'
		);
	}

	return $dom;
}
| 244 | |
/**
 * Handler for FlowAddModules; keeps the rest of Flow from having to know
 * whether Parsoid is in use.
 *
 * @param OutputPage $out Output page the Parsoid style modules are added to
 * @return bool Always true
 */
public static function onFlowAddModules( OutputPage $out ) {
	// These modules are only necessary when we are using parsoid.
	// XXX We only need the Parsoid CSS if some content being
	// rendered has getContentFormat() === 'html'.
	$parsoidStyleModules = [
		'mediawiki.skinning.content.parsoid',
		'ext.cite.parsoid.styles',
	];
	$out->addModuleStyles( $parsoidStyleModules );

	return true;
}
| 263 | |
/**
 * Serializes a document (or one of its nodes) with saveXML(), then removes the
 * CDATA wrapper saveXML() puts around the contents of style blocks.
 * The wrapper is not needed in HTML and breaks the CSS.
 *
 * @param DOMDocument $doc
 * @param DOMNode|null $node the specific node to save
 * @return string HTML
 */
public static function saferSaveXML( DOMDocument $doc, ?DOMNode $node = null ) {
	// The opening-tag regex is only safe as long as attribute values get escaped > chars.
	// This is checked by the testcases.
	$unwrapRules = [
		'/<style([^>]*)><!\[CDATA\[/i' => '<style\1>',
		'/\]\]><\/style>/i' => '</style>',
	];

	$serialized = $doc->saveXML( $node );
	foreach ( $unwrapRules as $pattern => $replacement ) {
		$serialized = preg_replace( $pattern, $replacement, $serialized );
	}

	return $serialized;
}
| 279 | |
/**
 * Retrieves the html of the node's children.
 *
 * @param DOMNode|null $node
 * @return string html of the node's children; empty string when no node is given
 */
public static function getInnerHtml( ?DOMNode $node = null ) {
	if ( !$node ) {
		return '';
	}

	$document = $node instanceof DOMDocument ? $node : $node->ownerDocument;
	// Don't use saveHTML(), it has bugs (T217766); instead use XML serialization
	// with a workaround for empty non-void nodes
	$emptyNodeFixer = new ContentFixer( new EmptyNodeFixer );
	$emptyNodeFixer->applyToDom( $document, Title::newMainPage() );

	$serialized = '';
	foreach ( $node->childNodes as $childNode ) {
		$serialized .= self::saferSaveXML( $document, $childNode );
	}

	return $serialized;
}
| 301 | |
/**
 * Gets the HTML of a node. This is like getInnerHtml(), but includes the node's tag itself too.
 *
 * @param DOMNode $node
 * @return string HTML
 */
public static function getOuterHtml( DOMNode $node ) {
	if ( $node instanceof DOMDocument ) {
		$document = $node;
	} else {
		$document = $node->ownerDocument;
	}

	// Don't use saveHTML(), it has bugs (T217766); instead use XML serialization
	// with a workaround for empty non-void nodes
	$emptyNodeFixer = new ContentFixer( new EmptyNodeFixer );
	$emptyNodeFixer->applyToDom( $document, Title::newMainPage() );

	return self::saferSaveXML( $document, $node );
}
| 315 | |
/**
 * Encode information from the <head> tag as attributes on the <body> tag, then
 * drop the <head>.
 *
 * Specifically, add the Parsoid version number in the parsoid-version attribute;
 * put the href of the <base> tag in the base-url attribute;
 * and remove the class attribute from the <body>.
 *
 * @param string $html
 * @return string HTML with <head> information encoded as attributes on the <body>
 * @throws WikitextException When $html cannot be parsed or yields no <body> element
 * @suppress PhanUndeclaredMethod,PhanTypeMismatchArgumentNullable Apparently a phan bug / wrong built-in PHP stubs
 */
public static function encodeHeadInfo( $html ) {
	$dom = ContentFixer::createDOM( $html );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( !$body instanceof DOMElement ) {
		// item( 0 ) is null when the document has no <body>; fail with the
		// documented exception instead of a fatal "call on null".
		throw new WikitextException(
			"No <body> element found in source content:\n" . $html,
			'process-wikitext'
		);
	}

	$head = $dom->getElementsByTagName( 'head' )->item( 0 );
	$base = $head ? $head->getElementsByTagName( 'base' )->item( 0 ) : null;

	$body->setAttribute( 'parsoid-version', self::PARSOID_VERSION );
	if ( $base instanceof DOMElement && $base->getAttribute( 'href' ) ) {
		$body->setAttribute( 'base-url', $base->getAttribute( 'href' ) );
	}
	// The class attribute is not used by us and is wastefully long, remove it
	$body->removeAttribute( 'class' );

	// Serializing only the <body> is what drops the <head>.
	return self::getOuterHtml( $body );
}
| 342 | |
/**
 * Put the base URI from the <body>'s base-url attribute back in the <head> as a <base> tag.
 * This reverses (part of) the transformation done by encodeHeadInfo().
 *
 * @param string $html HTML (may be a full document, <body> tag or unwrapped <body> contents)
 * @return string HTML (<html> tag with <head> and <body>) with the <base> tag restored
 * @throws WikitextException When $html cannot be parsed or yields no <body> element
 * @suppress PhanUndeclaredMethod,PhanTypeMismatchArgumentNullable Apparently a phan bug / wrong built-in PHP stubs
 */
public static function decodeHeadInfo( $html ) {
	$dom = ContentFixer::createDOM( $html );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( !$body instanceof DOMElement ) {
		// item( 0 ) is null when the document has no <body>; fail with the
		// documented exception instead of a fatal "call on null".
		throw new WikitextException(
			"No <body> element found in source content:\n" . $html,
			'process-wikitext'
		);
	}

	$baseUrl = $body->getAttribute( 'base-url' );
	// Only set base href if there's a value to set.
	$baseTag = $baseUrl ? Html::element( 'base', [ 'href' => $baseUrl ] ) : '';

	return Html::rawElement( 'html', [],
		Html::rawElement( 'head', [], $baseTag ) .
		self::getOuterHtml( $body )
	);
}
| 364 | |
/**
 * Get the Parsoid version from HTML content stored in the database.
 * This interprets the transformation done by encodeHeadInfo().
 *
 * @param string $html
 * @return string|null Parsoid version number, or null if none found (including
 *  when the parsed content has no <body> element)
 * @suppress PhanUndeclaredMethod Apparently a phan bug / wrong built-in PHP stubs
 */
public static function getParsoidVersion( $html ) {
	$dom = ContentFixer::createDOM( $html );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( !$body instanceof DOMElement ) {
		// No <body> means there is no attribute to read: report "none found"
		// as documented rather than calling a method on null.
		return null;
	}

	$version = $body->getAttribute( 'parsoid-version' );
	return $version !== '' ? $version : null;
}
| 379 | |
/**
 * Subpage links from Parsoid don't contain any direct context; it's applied via
 * a <base href="..."> tag, so here we apply a similar rule, resolving against
 * $title.
 *
 * @param string $text
 * @param Title $title Title to resolve relative links against
 * @return Title|null
 */
public static function createRelativeTitle( $text, Title $title ) {
	// currently parsoid always uses enough ../ or ./ to go
	// back to the root, a bit of a kludge but just assume we
	// can strip and will end up with a non-relative text.
	$stripped = preg_replace( '|^(\.\.?/)+|', '', $text );

	$isRelativeToTitle = $stripped && ( $stripped[0] === '/' || $stripped[0] === '#' );
	if ( !$isRelativeToTitle ) {
		return Title::newFromText( $stripped );
	}

	// Text starting with "/" or "#" is appended to $title's DB key, in $title's namespace.
	return Title::newFromText( $title->getDBkey() . $stripped, $title->getNamespace() );
}
| 401 | |
/**
 * Get the language converter for the wiki's content language.
 *
 * @since 1.35
 * @return ILanguageConverter
 */
private static function getLanguageConverter(): ILanguageConverter {
	$services = MediaWikiServices::getInstance();
	$converterFactory = $services->getLanguageConverterFactory();

	return $converterFactory->getLanguageConverter( $services->getContentLanguage() );
}
| 412 | |
/**
 * Convert a title's namespace and text to the language converter's preferred variant.
 *
 * @since 1.35
 * @param Title $title Title to convert to language variant
 * @return string Converted title, prefixed with "<namespace>:" when the
 *  converted namespace is non-empty
 */
public static function getConvertedTitle( Title $title ) {
	$namespaceId = $title->getNamespace();
	$text = $title->getText();

	$converter = self::getLanguageConverter();
	$variant = $converter->getPreferredVariant();

	$namespaceName = $converter->convertNamespace( $namespaceId, $variant );
	$convertedText = $converter->translate( $text, $variant );

	return $namespaceName
		? $namespaceName . ':' . $convertedText
		: $convertedText;
}
| 430 | } |