Code Coverage for /workspace/src/extensions/Flow/includes/Conversion/Utils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	62.60% covered (warning)	62.60%	82 / 131	56.25% covered (warning)	56.25%	9 / 16	CRAP	0.00% covered (danger)	0.00%	0 / 1
Utils	62.60% covered (warning)	62.60%	82 / 131	56.25% covered (warning)	56.25%	9 / 16	180.83	0.00% covered (danger)	0.00%	0 / 1
convert	66.67% covered (warning)	66.67%	8 / 12	0.00% covered (danger)	0.00%	0 / 1	15.48
wikitextToHTML	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	2
htmlToWikitext	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	6
htmlToPlaintext	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	3
commentParser	77.78% covered (warning)	77.78%	7 / 9	0.00% covered (danger)	0.00%	0 / 1	5.27
createDOM	54.84% covered (warning)	54.84%	17 / 31	0.00% covered (danger)	0.00%	0 / 1	7.30
onFlowAddModules	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
saferSaveXML	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
getInnerHtml	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	4
getOuterHtml	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
encodeHeadInfo	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	4
decodeHeadInfo	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	2
getParsoidVersion	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
createRelativeTitle	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	4
getLanguageConverter	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
getConvertedTitle	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	6

1	<?php
2
3	namespace Flow\Conversion;
4
5	use DOMDocument;
6	use DOMElement;
7	use DOMNode;
8	use Flow\Exception\NoParserException;
9	use Flow\Exception\WikitextException;
10	use Flow\Parsoid\ContentFixer;
11	use Flow\Parsoid\Fixer\EmptyNodeFixer;
12	use MediaWiki\Content\TextContent;
13	use MediaWiki\Content\WikitextContent;
14	use MediaWiki\Html\Html;
15	use MediaWiki\Language\ILanguageConverter;
16	use MediaWiki\Language\Language;
17	use MediaWiki\MediaWikiServices;
18	use MediaWiki\Output\OutputPage;
19	use MediaWiki\Parser\ParserOptions;
20	use MediaWiki\Parser\Sanitizer;
21	use MediaWiki\Title\Title;
22
23	abstract class Utils {
24
25	public const PARSOID_VERSION = '2.0.0';
26
27	/**
28	* Convert from/to wikitext <=> html or topic-title-wikitext => topic-title-html.
29	* Only these pairs are supported. html => wikitext requires Parsoid, and
30	* topic-title-html => topic-title-wikitext is not supported.
31	*
32	* @param string $from Format of content to convert: html\|wikitext\|topic-title-wikitext
33	* @param string $to Format to convert to: html\|wikitext\|topic-title-html
34	* @param string $content
35	* @param Title $title
36	* @return string
37	* @throws WikitextException When the requested conversion is unsupported
38	* @throws NoParserException When the conversion fails
39	* @return-taint none
40	*/
41	public static function convert( $from, $to, $content, Title $title ) {
42	if ( $from === $to \|\| $content === '' ) {
43	return $content;
44	}
45
46	if ( $from === 'wt' ) {
47	$from = 'wikitext';
48	}
49
50	if ( $from == 'wikitext' && $to == 'html' ) {
51	return self::wikitextToHTML( $content, $title );
52	} elseif ( $from == 'html' && $to == 'wikitext' ) {
53	return self::htmlToWikitext( $content, $title );
54	} elseif ( $from === 'topic-title-wikitext' &&
55	( $to === 'topic-title-html' \|\| $to === 'topic-title-plaintext' ) ) {
56	// FIXME: links need to be proceed by findVariantLinks or equivant function
57	return self::getLanguageConverter()->convert( self::commentParser( $from, $to, $content ) );
58	} else {
59	return self::commentParser( $from, $to, $content );
60	}
61	}
62
63	/**
64	* @param string $wikitext
65	* @param Title $title
66	*
67	* @return string The converted wikitext to HTML
68	*/
69	private static function wikitextToHTML( string $wikitext, Title $title ) {
70	$parserOptions = ParserOptions::newFromAnon();
71	$parserOptions->setRenderReason( __METHOD__ );
72
73	$parserFactory = MediaWikiServices::getInstance()->getParsoidParserFactory()->create();
74	$parserOutput = $parserFactory->parse( $wikitext, $title, $parserOptions );
75
76	// $parserOutput->getText() will strip off the body tag, but we want to retain here.
77	// So we'll call ->getRawText() here and modify the HTML by ourselves.
78	preg_match( "#<body[^>]>(.?)</body>#s", $parserOutput->getRawText(), $html );
79
80	return $html[0];
81	}
82
83	/**
84	* @param string $html
85	* @param Title $title
86	*
87	* @return string The converted HTML to Wikitext
88	* @throws WikitextException When the conversion is unsupported
89	*/
90	private static function htmlToWikitext( string $html, Title $title ) {
91	$transform = MediaWikiServices::getInstance()->getHtmlTransformFactory()
92	->getHtmlToContentTransform( $html, $title );
93
94	$transform->setOptions( [
95	'contentmodel' => CONTENT_MODEL_WIKITEXT,
96	'offsetType' => 'byte'
97	] );
98
99	/** @var TextContent $content */
100	$content = $transform->htmlToContent();
101
102	if ( !$content instanceof WikitextContent ) {
103	throw new WikitextException( 'Conversion to Wikitext failed' );
104	}
105
106	return trim( $content->getTextForSearchIndex() );
107	}
108
109	/**
110	* Basic conversion of html to plaintext for use in recent changes, history,
111	* and other places where a roundtrip is undesired.
112	*
113	* @param string $html
114	* @param int\|null $truncateLength Maximum length in characters (including ellipses) or null for whole string.
115	* @param Language\|null $lang Language to use for truncation. Defaults to $wgLang
116	* @return string plaintext
117	*/
118	public static function htmlToPlaintext( $html, ?int $truncateLength = null, ?Language $lang = null ) {
119	/** @var Language $wgLang */
120	global $wgLang;
121
122	$plain = trim( Sanitizer::stripAllTags( $html ) );
123
124	// Fallback to some large-ish value for truncation.
125	if ( $truncateLength === null ) {
126	$truncateLength = 10000;
127	}
128
129	$lang = $lang ?: $wgLang;
130	return $lang->truncateForVisual( $plain, $truncateLength );
131	}
132
133	/**
134	* Convert from/to topic-title-wikitext/topic-title-html using
135	* MediaWiki\CommentFormatter\CommentFormatter::formatLinks
136	*
137	* @param string $from Format of content to convert: topic-title-wikitext
138	* @param string $to Format of content to convert to: topic-title-html
139	* @param string $content Content to convert, in topic-title-wikitext format.
140	* @return string $content in HTML
141	* @throws WikitextException
142	*/
143	protected static function commentParser( $from, $to, $content ) {
144	if (
145	$from !== 'topic-title-wikitext' \|\|
146	( $to !== 'topic-title-html' && $to !== 'topic-title-plaintext' )
147	) {
148	throw new WikitextException( "Conversion from '$from' to '$to' was requested, " .
149	"but this is not supported." );
150	}
151
152	$html = MediaWikiServices::getInstance()->getCommentFormatter()
153	->formatLinks( Sanitizer::escapeHtmlAllowEntities( $content ) );
154	if ( $to === 'topic-title-plaintext' ) {
155	return self::htmlToPlaintext( $html );
156	} else {
157	return $html;
158	}
159	}
160
161	/**
162	* Turns given $content string into a DOMDocument object.
163	*
164	* Note that, by default, $content will be prefixed with <?xml encoding="utf-8"?> to force
165	* libxml to interpret the content as UTF-8. If for some reason you don't want this to happen,
166	* or you are certain that your input already has <?xml encoding="utf-8"?> or
167	* <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> , then you can disable
168	* this behavior by setting $utf8Fragment=false to disable this behavior.
169	*
170	* Some libxml errors are forgivable, libxml errors that aren't
171	* ignored will throw a WikitextException.
172	*
173	* The default error codes allowed are:
174	* 9 - allow illegal characters (they are removed, but this option means it
175	* doesn't trigger an error.
176	* 76 - allow unexpected end tag. This is typically old wikitext using deprecated tags.
177	* 513 - allow multiple tags with same id
178	* 801 - allow unrecognized tags like figcaption
179	*
180	* @param string $content
181	* @param bool $utf8Fragment If true, prefix $content with <?xml encoding="utf-8"?>
182	* @param array $ignoreErrorCodes
183	* @return DOMDocument
184	* @throws WikitextException
185	* @see http://www.xmlsoft.org/html/libxml-xmlerror.html
186	*/
187	public static function createDOM(
188	$content,
189	$utf8Fragment = true,
190	array $ignoreErrorCodes = [ 9, 76, 513, 801 ]
191	) {
192	$dom = new DOMDocument();
193
194	$loadEntities = false;
195	if ( LIBXML_VERSION < 20900 ) {
196	// Otherwise the parser may attempt to load the dtd from an external source.
197	// See: https://www.mediawiki.org/wiki/XML_External_Entity_Processing
198	$loadEntities = libxml_disable_entity_loader( true );
199	}
200
201	// don't output warnings
202	$useErrors = libxml_use_internal_errors( true );
203
204	// Work around DOMDocument's morbid insistence on using iso-8859-1
205	// Even $dom = new DOMDocument( '1.0', 'utf-8' ); doesn't work, you have to specify
206	// encoding ="utf-8" in the string fed to loadHTML()
207	$html = ( $utf8Fragment ? '<?xml encoding="utf-8"?>' : '' ) . $content;
208	$dom->loadHTML( $html, LIBXML_PARSEHUGE );
209
210	if ( LIBXML_VERSION < 20900 ) {
211	libxml_disable_entity_loader( $loadEntities );
212	}
213
214	// check error codes; if not in the supplied list of ignorable errors,
215	// throw an exception
216	$errors = array_filter(
217	libxml_get_errors(),
218	static function ( $error ) use( $ignoreErrorCodes ) {
219	return !in_array( $error->code, $ignoreErrorCodes );
220	}
221	);
222
223	// restore libxml state before anything else
224	libxml_clear_errors();
225	libxml_use_internal_errors( $useErrors );
226
227	if ( $errors ) {
228	throw new WikitextException(
229	implode(
230	"\n",
231	array_map(
232	static function ( $error ) {
233	return $error->message;
234	},
235	$errors
236	)
237	) . "\n\nFrom source content:\n" . $content,
238	'process-wikitext'
239	);
240	}
241
242	return $dom;
243	}
244
245	/**
246	* Handler for FlowAddModules, avoids rest of Flow having to be aware if
247	* Parsoid is in use.
248	*
249	* @param OutputPage $out
250	* @return bool
251	*/
252	public static function onFlowAddModules( OutputPage $out ) {
253	// The module is only necessary when we are using parsoid.
254	// XXX We only need the Parsoid CSS if some content being
255	// rendered has getContentFormat() === 'html'.
256	$out->addModuleStyles( [
257	'mediawiki.skinning.content.parsoid',
258	'ext.cite.parsoid.styles',
259	] );
260
261	return true;
262	}
263
264	/**
265	* Saves a document using saveXML, but avoid escaping style blocks with CDATA.
266	* This is not needed in HTML and breaks the CSS.
267	*
268	* @param DOMDocument $doc
269	* @param DOMNode\|null $node the specific node to save
270	* @return string HTML
271	*/
272	public static function saferSaveXML( DOMDocument $doc, ?DOMNode $node = null ) {
273	$html = $doc->saveXML( $node );
274	// This regex is only safe as long as attribute values get escaped > chars
275	// This is checked by the testcases
276	$html = preg_replace( '/<style([^>]*)><!\[CDATA\[/i', '<style\1>', $html );
277	return preg_replace( '/\]\]><\/style>/i', '</style>', $html );
278	}
279
280	/**
281	* Retrieves the html of the node's children.
282	*
283	* @param DOMNode\|null $node
284	* @return string html of the nodes children
285	*/
286	public static function getInnerHtml( ?DOMNode $node = null ) {
287	$html = '';
288	if ( $node ) {
289	$dom = $node instanceof DOMDocument ? $node : $node->ownerDocument;
290	// Don't use saveHTML(), it has bugs (T217766); instead use XML serialization
291	// with a workaround for empty non-void nodes
292	$fixer = new ContentFixer( new EmptyNodeFixer );
293	$fixer->applyToDom( $dom, Title::newMainPage() );
294
295	foreach ( $node->childNodes as $child ) {
296	$html .= self::saferSaveXML( $dom, $child );
297	}
298	}
299	return $html;
300	}
301
302	/**
303	* Gets the HTML of a node. This is like getInnterHtml(), but includes the node's tag itself too.
304	* @param DOMNode $node
305	* @return string HTML
306	*/
307	public static function getOuterHtml( DOMNode $node ) {
308	$dom = $node instanceof DOMDocument ? $node : $node->ownerDocument;
309	// Don't use saveHTML(), it has bugs (T217766); instead use XML serialization
310	// with a workaround for empty non-void nodes
311	$fixer = new ContentFixer( new EmptyNodeFixer );
312	$fixer->applyToDom( $dom, Title::newMainPage() );
313	return self::saferSaveXML( $dom, $node );
314	}
315
316	/**
317	* Encode information from the <head> tag as attributes on the <body> tag, then
318	* drop the <head>.
319	*
320	* Specifically, add the Parsoid version number in the parsoid-version attribute;
321	* put the href of the <base> tag in the base-url attribute;
322	* and remove the class attribute from the <body>.
323	*
324	* @param string $html
325	* @return string HTML with <head> information encoded as attributes on the <body>
326	* @throws WikitextException
327	* @suppress PhanUndeclaredMethod,PhanTypeMismatchArgumentNullable Apparently a phan bug / wrong built-in PHP stubs
328	*/
329	public static function encodeHeadInfo( $html ) {
330	$dom = ContentFixer::createDOM( $html );
331	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
332	$head = $dom->getElementsByTagName( 'head' )->item( 0 );
333	$base = $head ? $head->getElementsByTagName( 'base' )->item( 0 ) : null;
334	$body->setAttribute( 'parsoid-version', self::PARSOID_VERSION );
335	if ( $base instanceof DOMElement && $base->getAttribute( 'href' ) ) {
336	$body->setAttribute( 'base-url', $base->getAttribute( 'href' ) );
337	}
338	// The class attribute is not used by us and is wastefully long, remove it
339	$body->removeAttribute( 'class' );
340	return self::getOuterHtml( $body );
341	}
342
343	/**
344	* Put the base URI from the <body>'s base-url attribute back in the <head> as a <base> tag.
345	* This reverses (part of) the transformation done by encodeHeadInfo().
346	*
347	* @param string $html HTML (may be a full document, <body> tag or unwrapped <body> contents)
348	* @return string HTML (<html> tag with <head> and <body>) with the <base> tag restored
349	* @throws WikitextException
350	* @suppress PhanUndeclaredMethod,PhanTypeMismatchArgumentNullable Apparently a phan bug / wrong built-in PHP stubs
351	*/
352	public static function decodeHeadInfo( $html ) {
353	$dom = ContentFixer::createDOM( $html );
354	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
355	$baseUrl = $body->getAttribute( 'base-url' );
356	return Html::rawElement( 'html', [],
357	Html::rawElement( 'head', [],
358	// Only set base href if there's a value to set.
359	$baseUrl ? Html::element( 'base', [ 'href' => $baseUrl ] ) : ''
360	) .
361	self::getOuterHtml( $body )
362	);
363	}
364
365	/**
366	* Get the Parsoid version from HTML content stored in the database.
367	* This interprets the transformation done by encodeHeadInfo().
368	*
369	* @param string $html
370	* @return string\|null Parsoid version number, or null if none found
371	* @suppress PhanUndeclaredMethod Apparently a phan bug / wrong built-in PHP stubs
372	*/
373	public static function getParsoidVersion( $html ) {
374	$dom = ContentFixer::createDOM( $html );
375	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
376	$version = $body->getAttribute( 'parsoid-version' );
377	return $version !== '' ? $version : null;
378	}
379
380	/**
381	* Subpage links from Parsoid don't contain any direct context, its applied via
382	* a <base href="..."> tag, so here we apply a similar rule resolving against
383	* $title
384	*
385	* @param string $text
386	* @param Title $title Title to resolve relative links against
387	* @return Title\|null
388	*/
389	public static function createRelativeTitle( $text, Title $title ) {
390	// currently parsoid always uses enough ../ or ./ to go
391	// back to the root, a bit of a kludge but just assume we
392	// can strip and will end up with a non-relative text.
393	$text = preg_replace( '\|^(\.\.?/)+\|', '', $text );
394
395	if ( $text && ( $text[0] === '/' \|\| $text[0] === '#' ) ) {
396	return Title::newFromText( $title->getDBkey() . $text, $title->getNamespace() );
397	}
398
399	return Title::newFromText( $text );
400	}
401
402	/**
403	* @since 1.35
404	* @return ILanguageConverter
405	*/
406	private static function getLanguageConverter(): ILanguageConverter {
407	$services = MediaWikiServices::getInstance();
408	return $services
409	->getLanguageConverterFactory()
410	->getLanguageConverter( $services->getContentLanguage() );
411	}
412
413	/**
414	* @since 1.35
415	* @param Title $title Title to convert to language variant
416	* @return string Converted title
417	*/
418	public static function getConvertedTitle( Title $title ) {
419	$ns = $title->getNamespace();
420	$titleText = $title->getText();
421	$langConv = self::getLanguageConverter();
422	$variant = $langConv->getPreferredVariant();
423	$convertedNamespace = $langConv->convertNamespace( $ns, $variant );
424	if ( $convertedNamespace ) {
425	return $convertedNamespace . ':' . $langConv->translate( $titleText, $variant );
426	} else {
427	return $langConv->translate( $titleText, $variant );
428	}
429	}
430	}