Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
69.15% |
65 / 94 |
|
87.50% |
7 / 8 |
CRAP | |
0.00% |
0 / 1 |
Cleaner | |
69.15% |
65 / 94 |
|
87.50% |
7 / 8 |
56.43 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
cleanHtml | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 | |||
createDomDocument | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
addContent | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
9 | |||
matchesRemove | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
7 | |||
nodeHasClass | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
lastElement | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
cleanHtmlDom | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Wikispeech\Segment; |
4 | |
5 | /** |
6 | * @file |
7 | * @ingroup Extensions |
8 | * @license GPL-2.0-or-later |
9 | */ |
10 | |
11 | use DOMComment; |
12 | use DOMDocument; |
13 | use DOMNode; |
14 | use DOMXPath; |
15 | use MWException; |
16 | |
/**
 * Used for cleaning text with HTML markup. The cleaned text is used
 * as input for `Segmenter`.
 *
 * The markup is parsed into a DOM. Text nodes become `CleanedText`s,
 * configured tags are dropped together with their content, and
 * configured tags are surrounded by `SegmentBreak`s.
 *
 * @since 0.0.1
 */
class Cleaner {
	/**
	 * Tags that should be removed completely during cleaning.
	 *
	 * Keys are tag names. A value of `true` removes every element with
	 * that name, a string removes elements having that class and an
	 * array of strings removes elements having any of those classes.
	 * Any other value (e.g. `false`) never causes removal.
	 *
	 * @var array
	 */
	private $removeTags;

	/**
	 * Names of tags that should add a segment break during cleaning.
	 *
	 * @var array
	 */
	private $segmentBreakingTags;

	/**
	 * An array of `CleanedText`s and `SegmentBreak`s. Reset and
	 * rebuilt on every call to `cleanHtml()`.
	 *
	 * @var SegmentContent[]
	 */
	private $cleanedContent;

	/**
	 * @param array $removeTags Tags that should be removed completely
	 *  during cleaning, keyed by tag name.
	 * @param array $segmentBreakingTags Names of tags that should add
	 *  a segment break during cleaning.
	 */
	public function __construct( $removeTags, $segmentBreakingTags ) {
		$this->removeTags = $removeTags;
		$this->segmentBreakingTags = $segmentBreakingTags;
	}

	/**
	 * Clean HTML tags from a string.
	 *
	 * Separates any HTML tags from the text.
	 *
	 * @since 0.0.1
	 * @param string $markedUpText Input text that may contain HTML
	 *  tags.
	 * @return SegmentContent[] An array of `CleanedText`s and `SegmentBreak`s
	 *  representing text nodes.
	 */
	public function cleanHtml( $markedUpText ): array {
		$dom = self::createDomDocument( $markedUpText );
		$xpath = new DOMXPath( $dom );
		// Only add elements below the dummy element. These are the
		// elements from the original HTML.
		$top = $xpath->evaluate( '/meta/dummy' )->item( 0 );
		$this->cleanedContent = [];
		// If the dummy element could not be located there is nothing
		// to add; return an empty result instead of walking null.
		if ( $top instanceof DOMNode ) {
			$this->addContent( $top );
		}
		// Remove any segment break at the start or end of the array,
		// since they won't do anything.
		if (
			$this->cleanedContent &&
			$this->cleanedContent[0] instanceof SegmentBreak
		) {
			array_shift( $this->cleanedContent );
		}
		if ( self::lastElement( $this->cleanedContent ) instanceof SegmentBreak ) {
			array_pop( $this->cleanedContent );
		}
		return $this->cleanedContent;
	}

	/**
	 * Create a DOMDocument from an HTML string.
	 *
	 * A dummy element is added as top node.
	 *
	 * @since 0.0.1
	 * @param string $markedUpText The string to create the
	 *  DOMDocument.
	 * @return DOMDocument The created DOMDocument.
	 */
	private static function createDomDocument( $markedUpText ): DOMDocument {
		// Add encoding information and wrap the input text in a dummy
		// tag to prevent p tags from being added for text nodes.
		$wrappedText = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>' .
			'<dummy>' . $markedUpText . '</dummy></head>';
		return self::loadHtmlQuietly( $wrappedText );
	}

	/**
	 * Parse an HTML fragment without emitting libxml warnings.
	 *
	 * libxml's internal error handling is a process wide setting, so
	 * it is only enabled while parsing. Afterwards the collected
	 * errors are discarded and the previous setting is restored.
	 *
	 * @param string $html The HTML to parse.
	 * @return DOMDocument The parsed document.
	 */
	private static function loadHtmlQuietly( string $html ): DOMDocument {
		$dom = new DOMDocument();
		$previousSetting = libxml_use_internal_errors( true );
		try {
			$dom->loadHTML(
				$html,
				LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED
			);
		} finally {
			// The errors are never inspected; drop them so the
			// buffer does not grow with every parsed string.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousSetting );
		}
		return $dom;
	}

	/**
	 * Recursively add items to the cleaned content.
	 *
	 * Goes through all the child nodes of $node and adds their
	 * content text. Adds segment breaks for appropriate tags.
	 * Comments and nodes matching the removal criteria are skipped
	 * together with everything below them.
	 *
	 * @since 0.0.1
	 * @param DOMNode $node The top node to add from.
	 */
	private function addContent( $node ): void {
		if ( $node instanceof DOMComment || $this->matchesRemove( $node ) ) {
			return;
		}
		foreach ( $node->childNodes as $child ) {
			$isSegmentBreaking = in_array(
				$child->nodeName,
				$this->segmentBreakingTags,
				true
			);
			if ( $isSegmentBreaking ) {
				// Segment break for the start tag. A break that ends
				// up first in the content is trimmed by cleanHtml().
				$this->addSegmentBreak();
			}
			if ( $child->nodeType == XML_TEXT_NODE ) {
				// Remove the path to the dummy node and instead
				// add "." to match when used with context.
				$path = preg_replace(
					'!^/meta/dummy!',
					'.',
					$child->getNodePath()
				);
				$this->cleanedContent[] = new CleanedText( $child->textContent, $path );
			} else {
				$this->addContent( $child );
			}
			if ( $isSegmentBreaking ) {
				// Segment break for the end tag.
				$this->addSegmentBreak();
			}
		}
	}

	/**
	 * Append a segment break, unless the last item already is one.
	 */
	private function addSegmentBreak(): void {
		if ( !self::lastElement( $this->cleanedContent ) instanceof SegmentBreak ) {
			$this->cleanedContent[] = new SegmentBreak();
		}
	}

	/**
	 * Check if a node matches criteria for removal.
	 *
	 * The node is compared to the removal criteria from the
	 * configuration, to determine if it should be removed completely.
	 *
	 * @since 0.0.1
	 * @param DOMNode $node The node to check.
	 * @return bool true if the node match removal criteria, otherwise
	 *  false.
	 */
	private function matchesRemove( $node ): bool {
		if ( !array_key_exists( $node->nodeName, $this->removeTags ) ) {
			// The node name isn't found in the removal list.
			return false;
		}
		$removeCriteria = $this->removeTags[$node->nodeName];
		if ( $removeCriteria === true ) {
			// Node name is found and there are no extra criteria.
			return true;
		}
		if ( is_string( $removeCriteria ) ) {
			// Node name and class name must both match.
			return self::nodeHasClass( $node, $removeCriteria );
		}
		if ( is_array( $removeCriteria ) ) {
			// If there are multiple classes for a tag, check if any
			// of them match.
			foreach ( $removeCriteria as $class ) {
				if ( self::nodeHasClass( $node, $class ) ) {
					return true;
				}
			}
		}
		// Any other criteria, e.g. false, is not a class name and
		// must never be compared against the class attribute.
		return false;
	}

	/**
	 * Check if a node has a class attribute, containing a string.
	 *
	 * Since this is for checking HTML tag classes, the class
	 * attribute, if present, is assumed to be a string of substrings,
	 * separated by whitespace (space, tab, line feed, form feed or
	 * carriage return).
	 *
	 * @since 0.0.1
	 * @param DOMNode $node The node to check.
	 * @param string $className The name of the class to check for.
	 * @return bool true if the node's class attribute contain
	 *  $className, otherwise false.
	 */
	private static function nodeHasClass( $node, $className ): bool {
		$attributes = $node->attributes;
		if ( $attributes === null ) {
			// Only element nodes have attributes.
			return false;
		}
		$classNode = $attributes->getNamedItem( 'class' );
		if ( $classNode === null ) {
			return false;
		}
		$nodeClasses = preg_split(
			'/[ \t\n\f\r]+/',
			(string)$classNode->nodeValue,
			-1,
			PREG_SPLIT_NO_EMPTY
		);
		// Strict comparison, so that only an identical string matches.
		return in_array( $className, $nodeClasses ?: [], true );
	}

	/**
	 * Get the last element in an array.
	 *
	 * @since 0.0.1
	 * @param array $array The array to get the last element from.
	 * @return mixed|null The last element in the array, null if array is empty.
	 */
	private static function lastElement( $array ) {
		if ( !$array ) {
			return null;
		}
		// $array is a local copy, so moving its internal pointer does
		// not affect the caller.
		return end( $array );
	}

	/**
	 * Cleans title and content.
	 *
	 * The title parts come first, separated by segment breaks, and
	 * are followed by a segment break and the cleaned page content,
	 * if there is any.
	 *
	 * @since 0.1.10
	 * @param string $displayTitle
	 * @param string $pageContent
	 * @return SegmentContent[] Title and content represented as `CleanedText`s and `SegmentBreak`s
	 * @throws MWException If segmented title text is not an instance
	 *  of CleanedText or if no text node can be found for it in the
	 *  title.
	 */
	public function cleanHtmlDom(
		string $displayTitle,
		string $pageContent
	): array {
		$cleanedText = $this->cleanHtml( $pageContent );
		// Create a DOM for the title to get the Xpath, in case there
		// are elements within the title. This happens e.g. when the
		// title is italicized. Only node paths are read from this
		// DOM; the text itself comes from cleanHtml().
		$dom = self::loadHtmlQuietly( '<h1>' . $displayTitle . '</h1>' );
		$xpath = new DOMXPath( $dom );
		$titleTextNodes = $xpath->evaluate( '//text()' );
		$titleSegments = [];
		$i = 0;
		// NOTE(review): text nodes are paired by position. Text inside
		// tags that cleanHtml() removes is still present in the title
		// DOM, so the positions may not line up for such titles.
		foreach ( $this->cleanHtml( $displayTitle ) as $titlePart ) {
			if ( !$titlePart instanceof CleanedText ) {
				throw new MWException(
					'Segmented title is not an instance of CleanedText!'
				);
			}

			$node = $titleTextNodes->item( $i );
			if ( $node === null ) {
				throw new MWException(
					'No text node found in title for segmented title part!'
				);
			}
			$titlePart->setPath( '/' . $node->getNodePath() );
			$titleSegments[] = $titlePart;
			$titleSegments[] = new SegmentBreak();
			$i++;
		}
		// Drop the segment break after the last title part.
		array_pop( $titleSegments );
		if ( !$cleanedText ) {
			return $titleSegments;
		}
		return array_merge(
			$titleSegments,
			[ new SegmentBreak() ],
			$cleanedText
		);
	}

}