Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
62.60% |
82 / 131 |
|
56.25% |
9 / 16 |
CRAP | |
0.00% |
0 / 1 |
Utils | |
62.60% |
82 / 131 |
|
56.25% |
9 / 16 |
180.83 | |
0.00% |
0 / 1 |
convert | |
66.67% |
8 / 12 |
|
0.00% |
0 / 1 |
15.48 | |||
wikitextToHTML | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
htmlToWikitext | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
htmlToPlaintext | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
commentParser | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
5.27 | |||
createDOM | |
54.84% |
17 / 31 |
|
0.00% |
0 / 1 |
7.30 | |||
onFlowAddModules | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
saferSaveXML | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getInnerHtml | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
getOuterHtml | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
encodeHeadInfo | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
decodeHeadInfo | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
getParsoidVersion | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
createRelativeTitle | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
4 | |||
getLanguageConverter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getConvertedTitle | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace Flow\Conversion; |
4 | |
5 | use DOMDocument; |
6 | use DOMElement; |
7 | use DOMNode; |
8 | use Flow\Exception\NoParserException; |
9 | use Flow\Exception\WikitextException; |
10 | use Flow\Parsoid\ContentFixer; |
11 | use Flow\Parsoid\Fixer\EmptyNodeFixer; |
12 | use MediaWiki\Content\TextContent; |
13 | use MediaWiki\Content\WikitextContent; |
14 | use MediaWiki\Html\Html; |
15 | use MediaWiki\Language\ILanguageConverter; |
16 | use MediaWiki\Language\Language; |
17 | use MediaWiki\MediaWikiServices; |
18 | use MediaWiki\Output\OutputPage; |
19 | use MediaWiki\Parser\ParserOptions; |
20 | use MediaWiki\Parser\Sanitizer; |
21 | use MediaWiki\Title\Title; |
22 | |
23 | abstract class Utils { |
24 | |
25 | public const PARSOID_VERSION = '2.0.0'; |
26 | |
/**
 * Convert content between the supported format pairs.
 *
 * Supported pairs are wikitext <=> html, and topic-title-wikitext =>
 * topic-title-html or topic-title-plaintext. The reverse topic-title
 * direction (html/plaintext => wikitext) is not supported.
 * When $from equals $to, or $content is the empty string, $content is
 * returned untouched.
 *
 * @param string $from Format of content to convert: html|wikitext|wt|topic-title-wikitext
 *  ('wt' is accepted as an alias of 'wikitext')
 * @param string $to Format to convert to: html|wikitext|topic-title-html|topic-title-plaintext
 * @param string $content
 * @param Title $title
 * @return string
 * @throws WikitextException When the requested conversion is unsupported
 * @throws NoParserException When the conversion fails
 * @return-taint none
 */
public static function convert( $from, $to, $content, Title $title ) {
	if ( $from === $to || $content === '' ) {
		return $content;
	}

	// 'wt' is a shorthand for the 'wikitext' source format.
	if ( $from === 'wt' ) {
		$from = 'wikitext';
	}

	if ( $from === 'wikitext' && $to === 'html' ) {
		return self::wikitextToHTML( $content, $title );
	}

	if ( $from === 'html' && $to === 'wikitext' ) {
		return self::htmlToWikitext( $content, $title );
	}

	// commentParser() validates the format pair itself and throws a
	// WikitextException for anything other than the topic-title conversions,
	// so every remaining unsupported pair is rejected here.
	$parsed = self::commentParser( $from, $to, $content );

	// FIXME: links need to be processed by findVariantLinks or an equivalent function
	return self::getLanguageConverter()->convert( $parsed );
}
62 | |
/**
 * Render wikitext to HTML with the Parsoid parser and return the <body>
 * element (opening tag, contents and closing tag) of the result.
 *
 * @param string $wikitext
 * @param Title $title
 *
 * @return string The <body>...</body> portion of the rendered HTML
 * @throws WikitextException When the parser output contains no <body> element
 */
private static function wikitextToHTML( string $wikitext, Title $title ) {
	$parserOptions = ParserOptions::newFromAnon();
	$parserOptions->setRenderReason( __METHOD__ );

	$parser = MediaWikiServices::getInstance()->getParsoidParserFactory()->create();
	$parserOutput = $parser->parse( $wikitext, $title, $parserOptions );

	// $parserOutput->getText() will strip off the body tag, but we want to retain it here.
	// So we call ->getRawText() and extract the <body> element ourselves.
	$matched = preg_match( '#<body[^>]*>(.*?)</body>#s', $parserOutput->getRawText(), $matches );

	// preg_match() yields 0 when nothing matched and false on a PCRE error;
	// in both cases there is no $matches[0] to return.
	if ( $matched !== 1 ) {
		throw new WikitextException( 'Conversion to HTML failed: no <body> element in parser output' );
	}

	// Index 0 is the whole match, i.e. including the <body> tag itself.
	return $matches[0];
}
82 | |
/**
 * Turn HTML back into wikitext through MediaWiki's HTML-to-content transform.
 *
 * @param string $html
 * @param Title $title
 *
 * @return string The wikitext for the given HTML, trimmed of surrounding whitespace
 * @throws WikitextException When the transform does not yield wikitext content
 */
private static function htmlToWikitext( string $html, Title $title ) {
	$transformFactory = MediaWikiServices::getInstance()->getHtmlTransformFactory();
	$transform = $transformFactory->getHtmlToContentTransform( $html, $title );

	$options = [
		'contentmodel' => CONTENT_MODEL_WIKITEXT,
		'offsetType' => 'byte'
	];
	$transform->setOptions( $options );

	/** @var TextContent $content */
	$content = $transform->htmlToContent();

	if ( $content instanceof WikitextContent ) {
		return trim( $content->getTextForSearchIndex() );
	}

	throw new WikitextException( 'Conversion to Wikitext failed' );
}
108 | |
/**
 * Strip all tags from a piece of HTML and truncate the result. Meant for
 * recent changes, history, and other places where a roundtrip is undesired.
 *
 * @param string $html
 * @param int|null $truncateLength Maximum length in characters (including ellipses);
 *  when null, a large fallback of 10000 is used.
 * @param Language|null $lang Language to use for truncation. Defaults to $wgLang
 * @return string plaintext
 */
public static function htmlToPlaintext( $html, ?int $truncateLength = null, ?Language $lang = null ) {
	/** @var Language $wgLang */
	global $wgLang;

	$stripped = trim( Sanitizer::stripAllTags( $html ) );

	// No explicit limit given: fall back to some large-ish value.
	$limit = $truncateLength ?? 10000;

	$truncator = $lang ?? $wgLang;

	return $truncator->truncateForVisual( $stripped, $limit );
}
132 | |
/**
 * Convert topic-title-wikitext to topic-title-html or topic-title-plaintext
 * using MediaWiki\CommentFormatter\CommentFormatter::formatLinks
 *
 * @param string $from Format of content to convert: topic-title-wikitext
 * @param string $to Format of content to convert to: topic-title-html|topic-title-plaintext
 * @param string $content Content to convert, in topic-title-wikitext format.
 * @return string $content as HTML, or as plaintext when $to is topic-title-plaintext
 * @throws WikitextException When any other format pair is requested
 */
protected static function commentParser( $from, $to, $content ) {
	$wantsPlaintext = $to === 'topic-title-plaintext';
	$isSupported = $from === 'topic-title-wikitext' &&
		( $to === 'topic-title-html' || $wantsPlaintext );

	if ( !$isSupported ) {
		throw new WikitextException( "Conversion from '$from' to '$to' was requested, " .
			"but this is not supported." );
	}

	// Escape first, then let the comment formatter turn wiki links into HTML links.
	$escaped = Sanitizer::escapeHtmlAllowEntities( $content );
	$formatter = MediaWikiServices::getInstance()->getCommentFormatter();
	$html = $formatter->formatLinks( $escaped );

	return $wantsPlaintext ? self::htmlToPlaintext( $html ) : $html;
}
160 | |
/**
 * Parses the given $content string into a DOMDocument object.
 *
 * By default, $content is prefixed with <?xml encoding="utf-8"?> to force
 * libxml to interpret the content as UTF-8. If you don't want this to happen,
 * or you are certain that your input already has <?xml encoding="utf-8"?> or
 * <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> , pass
 * $utf8Fragment=false.
 *
 * Some libxml errors are forgivable; any libxml error whose code is not in
 * $ignoreErrorCodes results in a WikitextException.
 *
 * The default error codes allowed are:
 *        9 - allow illegal characters (they are removed, but this option means it
 *                doesn't trigger an error).
 *       76 - allow unexpected end tag. This is typically old wikitext using deprecated tags.
 *      513 - allow multiple tags with same id
 *      801 - allow unrecognized tags like figcaption
 *
 * @param string $content
 * @param bool $utf8Fragment If true, prefix $content with <?xml encoding="utf-8"?>
 * @param array $ignoreErrorCodes
 * @return DOMDocument
 * @throws WikitextException
 * @see http://www.xmlsoft.org/html/libxml-xmlerror.html
 */
public static function createDOM(
	$content,
	$utf8Fragment = true,
	array $ignoreErrorCodes = [ 9, 76, 513, 801 ]
) {
	$document = new DOMDocument();

	// Old libxml may attempt to load the dtd from an external source unless
	// the entity loader is switched off.
	// See: https://www.mediawiki.org/wiki/XML_External_Entity_Processing
	$needsEntityLoaderGuard = LIBXML_VERSION < 20900;
	$previousEntityLoader = false;
	if ( $needsEntityLoaderGuard ) {
		$previousEntityLoader = libxml_disable_entity_loader( true );
	}

	// Collect errors internally instead of emitting warnings
	$previousUseErrors = libxml_use_internal_errors( true );

	// DOMDocument insists on iso-8859-1; even new DOMDocument( '1.0', 'utf-8' )
	// doesn't help. The encoding has to be declared in the string fed to loadHTML().
	$source = $content;
	if ( $utf8Fragment ) {
		$source = '<?xml encoding="utf-8"?>' . $content;
	}
	$document->loadHTML( $source, LIBXML_PARSEHUGE );

	if ( $needsEntityLoaderGuard ) {
		libxml_disable_entity_loader( $previousEntityLoader );
	}

	// Gather the messages of every error that is not in the ignorable list
	$messages = [];
	foreach ( libxml_get_errors() as $libxmlError ) {
		if ( !in_array( $libxmlError->code, $ignoreErrorCodes ) ) {
			$messages[] = $libxmlError->message;
		}
	}

	// Restore libxml state before anything else
	libxml_clear_errors();
	libxml_use_internal_errors( $previousUseErrors );

	if ( $messages ) {
		throw new WikitextException(
			implode( "\n", $messages ) . "\n\nFrom source content:\n" . $content,
			'process-wikitext'
		);
	}

	return $document;
}
244 | |
/**
 * Handler for FlowAddModules. Registers the Parsoid-related style modules on
 * the output page so the rest of Flow does not have to be aware of whether
 * Parsoid is in use.
 *
 * @param OutputPage $out
 * @return bool Always true
 */
public static function onFlowAddModules( OutputPage $out ) {
	// The modules are only necessary when we are using parsoid.
	// XXX We only need the Parsoid CSS if some content being
	// rendered has getContentFormat() === 'html'.
	$parsoidStyleModules = [
		'mediawiki.skinning.content.parsoid',
		'ext.cite.parsoid.styles',
	];
	$out->addModuleStyles( $parsoidStyleModules );

	return true;
}
263 | |
/**
 * Serializes a document (or a single node of it) with saveXML, then strips
 * the CDATA wrapper that serialization puts around the contents of style
 * blocks. The wrapper is not needed in HTML and breaks the CSS.
 *
 * @param DOMDocument $doc
 * @param DOMNode|null $node the specific node to save
 * @return string HTML
 */
public static function saferSaveXML( DOMDocument $doc, ?DOMNode $node = null ) {
	// These regexes are only safe as long as > chars in attribute values get escaped.
	// This is checked by the testcases.
	// preg_replace() applies the patterns one after the other, in array order.
	$patterns = [
		'/<style([^>]*)><!\[CDATA\[/i',
		'/\]\]><\/style>/i',
	];
	$replacements = [
		'<style\1>',
		'</style>',
	];

	return preg_replace( $patterns, $replacements, $doc->saveXML( $node ) );
}
279 | |
/**
 * Serializes the children of a node to HTML.
 *
 * @param DOMNode|null $node
 * @return string html of the node's children; empty string when no node is given
 */
public static function getInnerHtml( ?DOMNode $node = null ) {
	if ( !$node ) {
		return '';
	}

	// A document is its own owner; any other node points at its owner document
	$document = $node instanceof DOMDocument ? $node : $node->ownerDocument;

	// Don't use saveHTML(), it has bugs (T217766); instead use XML serialization
	// with a workaround for empty non-void nodes
	$emptyNodeFixer = new ContentFixer( new EmptyNodeFixer );
	$emptyNodeFixer->applyToDom( $document, Title::newMainPage() );

	$pieces = '';
	foreach ( $node->childNodes as $childNode ) {
		$pieces .= self::saferSaveXML( $document, $childNode );
	}

	return $pieces;
}
301 | |
/**
 * Serializes a node to HTML. This is like getInnerHtml(), but includes the node's own tag too.
 *
 * @param DOMNode $node
 * @return string HTML
 */
public static function getOuterHtml( DOMNode $node ) {
	if ( $node instanceof DOMDocument ) {
		$document = $node;
	} else {
		$document = $node->ownerDocument;
	}

	// Don't use saveHTML(), it has bugs (T217766); instead use XML serialization
	// with a workaround for empty non-void nodes
	$emptyNodeFixer = new ContentFixer( new EmptyNodeFixer );
	$emptyNodeFixer->applyToDom( $document, Title::newMainPage() );

	return self::saferSaveXML( $document, $node );
}
315 | |
/**
 * Encode information from the <head> tag as attributes on the <body> tag, then
 * drop the <head>.
 *
 * Specifically, add the Parsoid version number in the parsoid-version attribute;
 * put the href of the <base> tag in the base-url attribute;
 * and remove the class attribute from the <body>.
 *
 * @param string $html
 * @return string HTML with <head> information encoded as attributes on the <body>
 * @throws WikitextException Also thrown when the parsed document has no <body> element
 * @suppress PhanUndeclaredMethod,PhanTypeMismatchArgumentNullable Apparently a phan bug / wrong built-in PHP stubs
 */
public static function encodeHeadInfo( $html ) {
	$dom = ContentFixer::createDOM( $html );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );

	// item() returns null when the list is empty; fail with the documented
	// exception type instead of a fatal "member function on null" error.
	if ( !$body instanceof DOMElement ) {
		throw new WikitextException( 'No <body> element found in the given HTML' );
	}

	$head = $dom->getElementsByTagName( 'head' )->item( 0 );
	$base = $head ? $head->getElementsByTagName( 'base' )->item( 0 ) : null;

	$body->setAttribute( 'parsoid-version', self::PARSOID_VERSION );
	if ( $base instanceof DOMElement && $base->getAttribute( 'href' ) ) {
		$body->setAttribute( 'base-url', $base->getAttribute( 'href' ) );
	}

	// The class attribute is not used by us and is wastefully long, remove it
	$body->removeAttribute( 'class' );

	// Only the <body> is serialized, which is what drops the <head>
	return self::getOuterHtml( $body );
}
342 | |
/**
 * Put the base URI from the <body>'s base-url attribute back in the <head> as a <base> tag.
 * This reverses (part of) the transformation done by encodeHeadInfo().
 *
 * @param string $html HTML (may be a full document, <body> tag or unwrapped <body> contents)
 * @return string HTML (<html> tag with <head> and <body>) with the <base> tag restored
 * @throws WikitextException Also thrown when the parsed document has no <body> element
 * @suppress PhanUndeclaredMethod,PhanTypeMismatchArgumentNullable Apparently a phan bug / wrong built-in PHP stubs
 */
public static function decodeHeadInfo( $html ) {
	$dom = ContentFixer::createDOM( $html );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );

	// item() returns null when the list is empty; fail with the documented
	// exception type instead of a fatal "member function on null" error.
	if ( !$body instanceof DOMElement ) {
		throw new WikitextException( 'No <body> element found in the given HTML' );
	}

	$baseUrl = $body->getAttribute( 'base-url' );

	// Only set base href if there's a value to set.
	$baseTag = $baseUrl ? Html::element( 'base', [ 'href' => $baseUrl ] ) : '';

	return Html::rawElement( 'html', [],
		Html::rawElement( 'head', [], $baseTag ) .
		self::getOuterHtml( $body )
	);
}
364 | |
/**
 * Get the Parsoid version from HTML content stored in the database.
 * This interprets the transformation done by encodeHeadInfo(), which stores
 * the version in the parsoid-version attribute of the <body>.
 *
 * @param string $html
 * @return string|null Parsoid version number, or null if none found
 *  (including when the parsed document has no <body> element)
 * @suppress PhanUndeclaredMethod Apparently a phan bug / wrong built-in PHP stubs
 */
public static function getParsoidVersion( $html ) {
	$dom = ContentFixer::createDOM( $html );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );

	// item() returns null when the list is empty: no <body> means no version.
	if ( !$body instanceof DOMElement ) {
		return null;
	}

	// getAttribute() yields an empty string for a missing attribute
	$version = $body->getAttribute( 'parsoid-version' );
	return $version !== '' ? $version : null;
}
379 | |
/**
 * Subpage links from Parsoid don't contain any direct context; it is applied via
 * a <base href="..."> tag, so here we apply a similar rule, resolving against
 * $title.
 *
 * @param string $text
 * @param Title $title Title to resolve relative links against
 * @return Title|null
 */
public static function createRelativeTitle( $text, Title $title ) {
	// Currently parsoid always uses enough ../ or ./ to go back to the root.
	// It's a bit of a kludge, but just assume we can strip those and end up
	// with a non-relative text.
	$stripped = preg_replace( '|^(\.\.?/)+|', '', $text );

	// Anything not starting with / or # is treated as a complete title by itself
	if ( !$stripped || ( $stripped[0] !== '/' && $stripped[0] !== '#' ) ) {
		return Title::newFromText( $stripped );
	}

	// A leading / or # is appended to the DB key of $title, in its namespace
	return Title::newFromText( $title->getDBkey() . $stripped, $title->getNamespace() );
}
401 | |
/**
 * Fetch the language converter for the wiki's content language.
 *
 * @since 1.35
 * @return ILanguageConverter
 */
private static function getLanguageConverter(): ILanguageConverter {
	$services = MediaWikiServices::getInstance();
	$converterFactory = $services->getLanguageConverterFactory();

	return $converterFactory->getLanguageConverter( $services->getContentLanguage() );
}
412 | |
/**
 * Build the text of a title in the preferred language variant: the converted
 * namespace (when there is one) followed by a colon and the translated title text.
 *
 * @since 1.35
 * @param Title $title Title to convert to language variant
 * @return string Converted title
 */
public static function getConvertedTitle( Title $title ) {
	$namespaceId = $title->getNamespace();
	$plainText = $title->getText();

	$converter = self::getLanguageConverter();
	$preferredVariant = $converter->getPreferredVariant();

	$namespaceText = $converter->convertNamespace( $namespaceId, $preferredVariant );
	$translatedText = $converter->translate( $plainText, $preferredVariant );

	// Only prefix the namespace when the converter produced a non-empty one
	return $namespaceText
		? $namespaceText . ':' . $translatedText
		: $translatedText;
}
430 | } |