Code Coverage for /workspace/src/extensions/Wikispeech/includes/Segment/Cleaner.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	69.15% covered (warning)	69.15%	65 / 94	87.50% covered (warning)	87.50%	7 / 8	CRAP	0.00% covered (danger)	0.00%	0 / 1
Cleaner	69.15% covered (warning)	69.15%	65 / 94	87.50% covered (warning)	87.50%	7 / 8	56.43	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
cleanHtml	100.00% covered (success)	100.00%	11 / 11	100.00% covered (success)	100.00%	1 / 1	4
createDomDocument	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	1
addContent	100.00% covered (success)	100.00%	22 / 22	100.00% covered (success)	100.00%	1 / 1	9
matchesRemove	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	7
nodeHasClass	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
lastElement	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
cleanHtmlDom	0.00% covered (danger)	0.00%	0 / 29	0.00% covered (danger)	0.00%	0 / 1	20

1	<?php
2
3	namespace MediaWiki\Wikispeech\Segment;
4
5	/**
6	* @file
7	* @ingroup Extensions
8	* @license GPL-2.0-or-later
9	*/
10
11	use DOMComment;
12	use DOMDocument;
13	use DOMNode;
14	use DOMXPath;
15	use MWException;
16
/**
 * Used for cleaning text with HTML markup. The cleaned text is used
 * as input for `Segmenter`.
 *
 * The markup is parsed into a DOM. Text nodes become `CleanedText`s,
 * configured tags add `SegmentBreak`s, and elements matching the
 * removal criteria are skipped together with everything inside them.
 *
 * @since 0.0.1
 */
class Cleaner {
	/**
	 * Tags that should be removed completely during cleaning.
	 *
	 * Keyed by tag name. A value of `true` always removes the tag, a
	 * class name removes it when the element has that class, and an
	 * array of class names removes it when the element has any of them.
	 *
	 * @var array
	 */
	private $removeTags;

	/**
	 * An array of tag names that should add a segment break during
	 * cleaning.
	 *
	 * @var array
	 */
	private $segmentBreakingTags;

	/**
	 * An array of `CleanedText`s and `SegmentBreak`s. Reset at the
	 * start of every call to `cleanHtml()`.
	 *
	 * @var SegmentContent[]
	 */
	private $cleanedContent;

	/**
	 * @param array $removeTags An array of tags that should be
	 *  removed completely during cleaning, keyed by tag name.
	 * @param array $segmentBreakingTags An array of tag names that
	 *  should add a segment break during cleaning.
	 */
	public function __construct( $removeTags, $segmentBreakingTags ) {
		$this->removeTags = $removeTags;
		$this->segmentBreakingTags = $segmentBreakingTags;
	}

	/**
	 * Clean HTML tags from a string.
	 *
	 * Separates any HTML tags from the text.
	 *
	 * @since 0.0.1
	 * @param string $markedUpText Input text that may contain HTML
	 *  tags.
	 * @return SegmentContent[] An array of `CleanedText`s and `SegmentBreak`s
	 *  representing text nodes.
	 */
	public function cleanHtml( $markedUpText ): array {
		$dom = self::createDomDocument( $markedUpText );
		$xpath = new DOMXPath( $dom );
		// Only add elements below the dummy element. These are the
		// elements from the original HTML.
		$top = $xpath->evaluate( '/meta/dummy' )->item( 0 );
		$this->cleanedContent = [];
		if ( $top === null ) {
			// The wrapper element wasn't found in the parsed document,
			// so there is nothing to walk.
			return $this->cleanedContent;
		}
		$this->addContent( $top );
		// Remove any segment break at the start or end of the array,
		// since they won't do anything.
		if (
			$this->cleanedContent &&
			$this->cleanedContent[0] instanceof SegmentBreak
		) {
			array_shift( $this->cleanedContent );
		}
		if ( self::lastElement( $this->cleanedContent ) instanceof SegmentBreak ) {
			array_pop( $this->cleanedContent );
		}
		return $this->cleanedContent;
	}

	/**
	 * Create a DOMDocument from an HTML string.
	 *
	 * A dummy element is added as top node.
	 *
	 * Note that this turns on libxml's internal error handling and
	 * leaves it on, so markup problems never surface as PHP warnings.
	 * The errors collected while parsing are discarded.
	 *
	 * @since 0.0.1
	 * @param string $markedUpText The string to create the
	 *  DOMDocument.
	 * @return DOMDocument The created DOMDocument.
	 */
	private static function createDomDocument( $markedUpText ): DOMDocument {
		$dom = new DOMDocument();
		// Add encoding information and wrap the input text in a dummy
		// tag to prevent p tags from being added for text nodes.
		$wrappedText = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>' .
			'<dummy>' . $markedUpText . '</dummy></head>';
		libxml_use_internal_errors( true );
		$dom->loadHTML(
			$wrappedText,
			LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED
		);
		// With internal error handling on, libxml buffers every parse
		// error. Nobody reads them, so drop them instead of letting the
		// buffer grow for each cleaned text.
		libxml_clear_errors();
		return $dom;
	}

	/**
	 * Recursively add items to the cleaned content.
	 *
	 * Goes through all the child nodes of $node and adds their
	 * content text. Adds segment breaks for appropriate tags. Comments
	 * and nodes matching the removal criteria contribute nothing.
	 *
	 * @since 0.0.1
	 * @param DOMNode $node The top node to add from.
	 */
	private function addContent( $node ): void {
		if ( $node instanceof DOMComment || $this->matchesRemove( $node ) ) {
			return;
		}
		foreach ( $node->childNodes as $child ) {
			$isBreakingTag = in_array(
				$child->nodeName,
				$this->segmentBreakingTags,
				true
			);
			if (
				$isBreakingTag &&
				!( self::lastElement( $this->cleanedContent ) instanceof SegmentBreak )
			) {
				// Add segment breaks for start tags specified in
				// the config, unless the previous item is a break.
				// A break that ends up first in the content is
				// stripped again by cleanHtml().
				$this->cleanedContent[] = new SegmentBreak();
			}
			if ( $child->nodeType === XML_TEXT_NODE ) {
				// Remove the path to the dummy node and instead
				// add "." to match when used with context.
				$path = preg_replace(
					'!^/meta/dummy!',
					'.',
					$child->getNodePath()
				);
				$this->cleanedContent[] = new CleanedText( $child->textContent, $path );
			} else {
				$this->addContent( $child );
			}
			if (
				$isBreakingTag &&
				!( self::lastElement( $this->cleanedContent ) instanceof SegmentBreak )
			) {
				// Add segment breaks for end tags specified in
				// the config.
				$this->cleanedContent[] = new SegmentBreak();
			}
		}
	}

	/**
	 * Check if a node matches criteria for removal.
	 *
	 * The node is compared to the removal criteria from the
	 * configuration, to determine if it should be removed completely.
	 *
	 * @since 0.0.1
	 * @param DOMNode $node The node to check.
	 * @return bool true if the node match removal criteria, otherwise
	 *  false.
	 */
	private function matchesRemove( $node ): bool {
		if ( !array_key_exists( $node->nodeName, $this->removeTags ) ) {
			// The node name isn't found in the removal list.
			return false;
		}
		$removeCriteria = $this->removeTags[$node->nodeName];
		if ( $removeCriteria === true ) {
			// Node name is found and there are no extra criteria.
			return true;
		}
		if ( is_array( $removeCriteria ) ) {
			// If there are multiple classes for a tag, check if any
			// of them match.
			foreach ( $removeCriteria as $class ) {
				if ( self::nodeHasClass( $node, $class ) ) {
					return true;
				}
			}
			return false;
		}
		// A single class name: remove when node name and class name
		// both match.
		return self::nodeHasClass( $node, $removeCriteria );
	}

	/**
	 * Check if a node has a class attribute, containing a string.
	 *
	 * The class attribute, if present, is treated as a list of class
	 * names separated by ASCII whitespace (space, tab, line feed, form
	 * feed or carriage return). Names are compared as exact strings.
	 *
	 * @since 0.0.1
	 * @param DOMNode $node The node to check.
	 * @param string $className The name of the class to check for.
	 * @return bool true if the node's class attribute contain
	 *  $className, otherwise false.
	 */
	private static function nodeHasClass( $node, $className ): bool {
		if ( $node->attributes === null ) {
			// Only element nodes carry attributes.
			return false;
		}
		$classNode = $node->attributes->getNamedItem( 'class' );
		if ( $classNode === null ) {
			return false;
		}
		$nodeClasses = preg_split(
			'/[ \t\n\f\r]+/',
			$classNode->nodeValue,
			-1,
			PREG_SPLIT_NO_EMPTY
		);
		if ( !$nodeClasses ) {
			return false;
		}
		// Compare strictly; a loose comparison treats numeric looking
		// class names such as "1e3" and "1000" as equal.
		return in_array( (string)$className, $nodeClasses, true );
	}

	/**
	 * Get the last element in an array.
	 *
	 * @since 0.0.1
	 * @param array $array The array to get the last element from.
	 * @return mixed|null The last element in the array, null if array is empty.
	 */
	private static function lastElement( $array ) {
		if ( !count( $array ) ) {
			return null;
		}
		// $array is a by-value copy, so moving its internal pointer
		// doesn't affect the caller. Unlike indexing with count - 1,
		// this doesn't require the keys to be 0, 1, 2, ...
		return end( $array );
	}

	/**
	 * Cleans title and content.
	 *
	 * The title parts come first, separated by segment breaks, followed
	 * by a segment break and the cleaned page content, if there is any.
	 * Title parts get paths pointing into an `<h1>` element rather than
	 * into the page content.
	 *
	 * @since 0.1.10
	 * @param string $displayTitle
	 * @param string $pageContent
	 * @return SegmentContent[] Title and content represented as `CleanedText`s and `SegmentBreak`s
	 * @throws MWException If segmented title text is not an instance of CleanedText,
	 *  or if a title part has no corresponding text node in the title DOM.
	 */
	public function cleanHtmlDom(
		string $displayTitle,
		string $pageContent
	): array {
		$cleanedText = $this->cleanHtml( $pageContent );
		// Create a DOM for the title to get the Xpath, in case there
		// are elements within the title. This happens e.g. when the
		// title is italicized.
		$dom = new DOMDocument();
		// Don't rely on an earlier call having enabled this.
		libxml_use_internal_errors( true );
		$dom->loadHTML(
			'<h1>' . $displayTitle . '</h1>',
			LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED
		);
		libxml_clear_errors();
		$xpath = new DOMXPath( $dom );
		// The title DOM doesn't change below, so query it once.
		$titleTextNodes = $xpath->evaluate( '//text()' );
		$titleSegments = [];
		$i = 0;
		// NOTE(review): title parts are paired with text nodes purely by
		// position. If the removal criteria drop text from the title,
		// the positions no longer line up - confirm whether titles can
		// contain removed tags.
		foreach ( $this->cleanHtml( $displayTitle ) as $titlePart ) {
			if ( !$titlePart instanceof CleanedText ) {
				throw new MWException(
					'Segmented title is not an instance of CleanedText!'
				);
			}

			$node = $titleTextNodes->item( $i );
			if ( $node === null ) {
				throw new MWException(
					'Segmented title has no matching text node in the title DOM!'
				);
			}
			$titlePart->setPath( '/' . $node->getNodePath() );
			$titleSegments[] = $titlePart;
			$titleSegments[] = new SegmentBreak();
			$i++;
		}
		// Drop the break that was added after the last title part.
		array_pop( $titleSegments );
		if ( !$cleanedText ) {
			return $titleSegments;
		}
		return array_merge(
			$titleSegments,
			[ new SegmentBreak() ],
			$cleanedText
		);
	}

}