Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
93.60% |
161 / 172 |
|
76.92% |
20 / 26 |
CRAP | |
0.00% |
0 / 1 |
TemplateParser | |
93.60% |
161 / 172 |
|
76.92% |
20 / 26 |
80.63 | |
0.00% |
0 / 1 |
setPriorityLanguages | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setMultiLanguage | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setArtistCreditSeparator | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
parsePage | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
parseCoordinates | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
parseInformationFields | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
5 | |||
parseInformationField | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
6.01 | |||
sortInformationGroups | |
83.33% |
10 / 12 |
|
0.00% |
0 / 1 |
4.07 | |||
pruneInfoTemplateData | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
4 | |||
parseFieldArtist | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
parseFieldCredit | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
parseCreditOrArtist | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
parseFieldDateTimeOriginal | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
extractHCardProperty | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
parseLicenses | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
parseLicenseNode | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
parseNuke | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
9 | |||
parseRestrictions | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
parseContents | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
removeLanguageName | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
selectLanguage | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
toHtml | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
toText | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
innerHtml | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
cleanedInnerHtml | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
arrayTranspose | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | |
3 | namespace CommonsMetadata; |
4 | |
5 | use DOMElement; |
6 | use DOMNode; |
7 | |
/**
 * Class to parse metadata from commons formatted wiki pages.
 * Relies on the attributes set by {{Information}} and similar templates - see
 * https://commons.wikimedia.org/wiki/Commons:Machine-readable_data
 */
class TemplateParser {
	/** Keys of the array returned by parsePage(), one per kind of extracted data. */
	public const COORDINATES_KEY = 'coordinates';
	public const LICENSES_KEY = 'licenses';
	public const INFORMATION_FIELDS_KEY = 'informationFields';
	public const DELETION_KEY = 'deletion';
	public const RESTRICTIONS_KEY = 'restrictions';

	/**
	 * HTML element class name => metadata field name mapping for license data.
	 * @var array
	 */
	protected static $licenseFieldClasses = [
		'licensetpl_short' => 'LicenseShortName',
		'licensetpl_long' => 'UsageTerms',
		'licensetpl_attr_req' => 'AttributionRequired',
		'licensetpl_attr' => 'Attribution',
		// 'licensetpl_link_req',
		'licensetpl_link' => 'LicenseUrl',
		'licensetpl_nonfree' => 'NonFree',
	];

	/**
	 * HTML element class/id => metadata field name mapping for information template data.
	 * @var array
	 */
	protected static $informationFieldClasses = [
		'fileinfotpl_desc' => 'ImageDescription',
		# For date: Open question - should we parse the commons
		# date field better to deal with templates like
		# {{Taken on}} et al. along with extracting a time stamp
		# from the human readable field?
		'fileinfotpl_date' => 'DateTimeOriginal',
		'fileinfotpl_aut' => 'Artist',
		# For "source" field of {{information}} there are two closely
		# related fields we could map it to. Credit (iptc 2:110) is
		# "Identifies the provider of the media, not necessarily the
		# owner/creator." Source (iptc 2:115) "Identifies the
		# original owner of the intellectual content of the media. This
		# could be an agency, a member of an agency or an individual."
		# I think "Credit" fits much more closely to the commons notion
		# of source than "Source" does.
		'fileinfotpl_src' => 'Credit',
		'fileinfotpl_art_title' => 'ObjectName',
		'fileinfotpl_perm' => 'Permission',
		'fileinfotpl_credit' => 'Attribution',
	];

	/**
	 * Classnames identifying {{Information}}-like templates, ordered from highest to lowest
	 * priority. Higher priority means that template is more likely to be about the image
	 * (as opposed to e.g. some object visible on the image), data in higher-priority templates
	 * will be preferred. The classes should be on the <table> element (for templates using the
	 * deprecated id-based fieldname markup) or on the same element which has the "fileinfotpl"
	 * class (for templates with the class-based markup).
	 * @var array
	 */
	protected static $infoTemplateClasses = [
		'fileinfotpl-type-photograph',
		'fileinfotpl-type-information',
		'fileinfotpl-type-artwork',
	];

	/**
	 * List of templates which should not be handled like {{Information}} even if they have
	 * fields matching $informationFieldClasses. Elements of this array refer to the same kind of
	 * classnames as $infoTemplateClasses.
	 * @var array
	 */
	protected static $infoTemplateExclusion = [
		'fileinfotpl-type-book',
	];

	/**
	 * preg_replace patterns which will be used to clean up parsed HTML code.
	 * @var array
	 */
	protected static $cleanupPatterns = [
		// trim leading or trailing whitespace
		'/^\s+|\s+$/' => '',
		// clean paragraph with no styling - usually generated by MediaWiki.
		// The body must not contain a closing </p>, otherwise "<p>a</p>x<p>b</p>" would be
		// turned into the unbalanced "a</p>x<p>b".
		'/^<p>((?:(?!<\/p>).)*)<\/p>$/' => '\1',
	];

	/** @var array */
	protected $priorityLanguages = [ 'en' ];

	/** @var bool */
	protected $multiLanguage = false;

	/** @var string */
	protected $artistCreditSeparator = ' / ';

	/**
	 * When parsing multi-language text, use the first available language from this array.
	 * (Order matters - try to use the first element, if not available the second etc.)
	 * If none of them is available, the first language found in the text is used.
	 * Ignored in multi-language mode, see setMultiLanguage().
	 * @param array $priorityLanguages
	 */
	public function setPriorityLanguages( $priorityLanguages ) {
		$this->priorityLanguages = $priorityLanguages;
	}

	/**
	 * When true, the parser will ignore $priorityLanguages and return all available languages.
	 * @param bool $multiLanguage
	 */
	public function setMultiLanguage( $multiLanguage ) {
		$this->multiLanguage = $multiLanguage;
	}

	/**
	 * The separator used between multiple values of artist or credit when retrieved from a vcard
	 * @param string $separator
	 * @return void
	 */
	public function setArtistCreditSeparator( $separator ) {
		$this->artistCreditSeparator = $separator;
	}

	/**
	 * Parse an html string for metadata.
	 *
	 * This is the main entry point to the class.
	 *
	 * @param string $html The html to parse
	 * @return array The properties extracted from the page, keyed by the *_KEY constants.
	 *  Keys whose parser returned an empty array are omitted.
	 */
	public function parsePage( $html ) {
		if ( !$html ) { // DOMDocument does not like empty strings
			return [];
		}

		$domNavigator = new DomNavigator( $html );

		// array_filter() without a callback drops the entries which are empty arrays
		return array_filter( [
			self::COORDINATES_KEY => $this->parseCoordinates( $domNavigator ),
			self::INFORMATION_FIELDS_KEY => $this->parseInformationFields( $domNavigator ),
			self::LICENSES_KEY => $this->parseLicenses( $domNavigator ),
			self::DELETION_KEY => $this->parseNuke( $domNavigator ),
			self::RESTRICTIONS_KEY => $this->parseRestrictions( $domNavigator ),
		] );
	}

	/**
	 * Parses geocoded coordinates.
	 * @param DomNavigator $domNavigator
	 * @return array one entry per <span class="geo"> element; the entry is an empty array when
	 *  the text of the element is not of the form "<number>;<number>"
	 */
	protected function parseCoordinates( DomNavigator $domNavigator ) {
		$data = [];
		foreach ( $domNavigator->findElementsWithClass( 'span', 'geo' ) as $geoNode ) {
			$coordinateData = [];
			// trim before validating: is_numeric() rejects trailing whitespace before PHP 8
			$coords = array_map( 'trim', explode( ';', $geoNode->textContent ) );
			if ( count( $coords ) === 2 && is_numeric( $coords[0] ) && is_numeric( $coords[1] ) ) {
				$coordinateData['GPSLatitude'] = $coords[0];
				$coordinateData['GPSLongitude'] = $coords[1];
				$coordinateData['GPSMapDatum'] = 'WGS-84';
			}
			$data[] = $coordinateData;
		}
		return $data;
	}

	/**
	 * Parses the {{Information}} templates (and anything using the same metadata notation,
	 * like {{Artwork}})
	 * @param DomNavigator $domNavigator
	 * @return array an array of information(-like) templates:
	 *   array( 0 => array( 'ImageDescription' => ... ) ... )
	 */
	protected function parseInformationFields( DomNavigator $domNavigator ) {
		$attributePrefix = 'fileinfotpl_';
		$data = [];

		// id-based markup: the id is on the label cell, the data is in the next sibling
		$labelFields = $domNavigator->findElementsWithIdPrefix( [ 'td', 'th' ], $attributePrefix );
		foreach ( $labelFields as $labelField ) {
			$informationField = $domNavigator->nextElementSibling( $labelField );
			if ( !$informationField ) {
				continue;
			}
			$id = $labelField->getAttribute( 'id' );
			$group = $domNavigator->closest( $informationField, 'table' );
			$this->parseInformationField( $domNavigator, $informationField, $group, $id, $data );
		}

		// class-based markup: the class is on the element holding the data
		foreach ( $domNavigator->findElementsWithClass( '*', 'fileinfotpl' ) as $group ) {
			$informationFields = $domNavigator->findElementsWithClassPrefix(
				'*', $attributePrefix, $group );
			foreach ( $informationFields as $informationField ) {
				$class = $domNavigator->getFirstClassWithPrefix(
					$informationField, $attributePrefix );
				$this->parseInformationField(
					$domNavigator, $informationField, $group, $class, $data );
			}
		}

		// pruning and sorting both rely on the _type helper key; sorting removes it
		$this->pruneInfoTemplateData( $data );
		$this->sortInformationGroups( $data );
		// using node paths to identify tables is an internal detail, hide it
		return array_values( $data );
	}

	/**
	 * Helper function for the inner loop of parseInformationFields
	 * @param DomNavigator $domNavigator
	 * @param DOMElement $informationField the node holding the data
	 * @param DOMElement|null $group the top node containing all fields of this type; expected (but
	 *   not required) to have one of the $infoTemplateClasses.
	 * @param string|null $idOrClass id or class identifying the field, per
	 *   $informationFieldClasses. Node is ignored if this is not a key of
	 *   $informationFieldClasses. Also ignored if this is null.
	 * @param array[] &$data group name => [ field name => value, '_type' => group type ]
	 */
	protected function parseInformationField(
		DomNavigator $domNavigator, DOMElement $informationField, $group, $idOrClass, array &$data
	) {
		if ( $idOrClass === null || !isset( self::$informationFieldClasses[$idOrClass] ) ) {
			return;
		}
		$fieldName = self::$informationFieldClasses[$idOrClass];

		// group fields coming from the same template
		$groupName = $groupType = '-';
		if ( $group ) {
			$groupName = $group->getNodePath() ?? '-';
			$groupType =
				$domNavigator->getFirstClassWithPrefix( $group, 'fileinfotpl-type-' ) ?: '-';
		}

		if ( isset( $data[$groupName][$fieldName] ) ) {
			// don't parse the same field multiple times if it has both id and classes; also
			// ignore a second field of the same type in the same template
			return;
		}

		// use the field-specific parser (parseField<FieldName>) when there is one
		$method = 'parseField' . $fieldName;
		if ( !method_exists( $this, $method ) ) {
			$method = 'parseContents';
		}

		$data[$groupName][$fieldName] = $this->{$method}( $domNavigator, $informationField );
		$data[$groupName]['_type'] = $groupType;
	}

	/**
	 * Sorts info template data groups according to $infoTemplateClasses, highest priority first.
	 * Groups of unknown type come after the known ones and keep their relative order.
	 * Also removes the _type helper keys.
	 * @param array[] &$data info template data, as built by parseInformationField()
	 */
	protected function sortInformationGroups( array &$data ) {
		uasort( $data, static function ( $template1, $template2 ) {
			$priority1 = array_search( $template1['_type'], self::$infoTemplateClasses, true );
			$priority2 = array_search( $template2['_type'], self::$infoTemplateClasses, true );

			if ( $priority1 === false || $priority2 === false ) {
				// known precedes unknown; two unknown templates compare as equal so that the
				// comparator stays consistent and their order is preserved
				return ( $priority1 === false ) <=> ( $priority2 === false );
			}

			// smaller index -> comes first in $infoTemplateClasses -> should be sorted first
			return $priority1 <=> $priority2;
		} );

		foreach ( $data as &$group ) {
			unset( $group['_type'] );
		}
		// break the reference so that later writes to $group cannot touch the last element
		unset( $group );
	}

	/**
	 * Prunes template data
	 * Removes excluded templates (see $infoTemplateExclusion) if they are not alone
	 * @param array[] &$data info template data
	 */
	protected function pruneInfoTemplateData( array &$data ) {
		// foreach iterates over a snapshot, while count() sees the removals made so far, so the
		// last remaining group is never removed
		foreach ( $data as $key => $group ) {
			if ( in_array( $group['_type'], self::$infoTemplateExclusion, true )
				&& count( $data ) !== 1
			) {
				unset( $data[$key] );
			}
		}
	}

	/**
	 * Parses the artist, which might be an hCard
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $node
	 * @return string|array
	 */
	protected function parseFieldArtist( DomNavigator $domNavigator, DOMNode $node ) {
		return $this->parseCreditOrArtist( $domNavigator, $node );
	}

	/**
	 * Parses the credit, which might be an hCard
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $node
	 * @return string|array
	 */
	protected function parseFieldCredit( DomNavigator $domNavigator, DOMNode $node ) {
		return $this->parseCreditOrArtist( $domNavigator, $node );
	}

	/**
	 * Returns the "fn" properties of the hCards inside the node, joined by the artist/credit
	 * separator; falls back to parseContents() when there is none.
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $node
	 * @return string|array
	 */
	protected function parseCreditOrArtist( DomNavigator $domNavigator, DOMNode $node ) {
		$fields = $this->extractHCardProperty( $domNavigator, $node, 'fn' );
		if ( !$fields ) {
			return $this->parseContents( $domNavigator, $node );
		}

		$names = array_map( [ $this, 'cleanedInnerHtml' ], $fields );
		return implode( $this->artistCreditSeparator, $names );
	}

	/**
	 * Parses the DateTimeOriginal - finds <time> tag and returns the value of datetime attribute
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $node
	 * @return string|array
	 */
	protected function parseFieldDateTimeOriginal( DomNavigator $domNavigator, DOMNode $node ) {
		$nodes = $domNavigator->findElementsWithAttribute( 'time', 'datetime', $node );
		foreach ( $nodes as $time ) {
			// only the first <time> element is used
			return $time->getAttribute( 'datetime' );
		}

		return $this->parseContents( $domNavigator, $node );
	}

	/**
	 * Extracts an hCard property from a DOMNode that contains one or more hCard
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $node
	 * @param string $property hCard property to be extracted
	 * @return array the nodes carrying the property (not their text)
	 */
	protected function extractHCardProperty(
		DomNavigator $domNavigator, DOMNode $node, $property
	) {
		$values = [];
		foreach ( $domNavigator->findElementsWithClass( '*', 'vcard', $node ) as $vcard ) {
			foreach ( $domNavigator->findElementsWithClass( '*', $property, $vcard ) as $name ) {
				$values[] = $name;
			}
		}
		return $values;
	}

	/**
	 * @param DomNavigator $domNavigator
	 * @return array an array of licenses: array( 0 => array( 'LicenseShortName' => ... ) ... )
	 */
	protected function parseLicenses( DomNavigator $domNavigator ) {
		$data = [];
		foreach ( $domNavigator->findElementsWithClass( '*', 'licensetpl' ) as $licenseNode ) {
			$licenseData = $this->parseLicenseNode( $domNavigator, $licenseNode );
			if ( isset( $licenseData['UsageTerms'] ) ) {
				$licenseData['Copyrighted'] = ( $licenseData['UsageTerms'] === 'Public domain' )
					? 'False' : 'True';
			}
			$data[] = $licenseData;
		}

		return $data;
	}

	/**
	 * Collects the fields listed in $licenseFieldClasses from a single license template.
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $licenseNode
	 * @return array field name => cleaned HTML of the first matching element
	 */
	protected function parseLicenseNode( DomNavigator $domNavigator, DOMNode $licenseNode ) {
		$data = [];
		foreach ( self::$licenseFieldClasses as $class => $fieldName ) {
			foreach ( $domNavigator->findElementsWithClass( '*', $class, $licenseNode ) as $node ) {
				$data[$fieldName] = $this->cleanedInnerHtml( $node );
				break;
			}
		}
		return $data;
	}

	/**
	 * Parse and return deletion reason from the {{Nuke}} template
	 * ( https://commons.wikimedia.org/wiki/Template:Nuke )
	 * @param DomNavigator $domNavigator
	 * @return array
	 */
	protected function parseNuke( DomNavigator $domNavigator ) {
		$deletions = [];

		foreach ( $domNavigator->findElementsWithClass( '*', 'nuke' ) as $nukeNode ) {
			$nukeLink = $nukeNode->firstChild;
			if ( !( $nukeLink instanceof DOMElement ) || !$nukeLink->hasAttribute( 'href' ) ) {
				continue;
			}

			$urlBits = wfParseUrl( $nukeLink->getAttribute( 'href' ) );
			if ( !is_array( $urlBits ) || !isset( $urlBits['query'] ) ) {
				continue;
			}

			$params = wfCgiToArray( $urlBits['query'] );
			if ( isset( $params['action'] ) && $params['action'] === 'delete'
				&& isset( $params['wpReason'] )
			) {
				$deletions[] = [ 'DeletionReason' => $params['wpReason'] ];
			}
		}
		return $deletions;
	}

	/**
	 * Parses file restrictions i.e. trademark, insignia, etc.
	 * @param DomNavigator $domNavigator
	 * @return array always a single entry, whose 'Restrictions' value is the '|'-separated list
	 *  of restriction types (an empty string when there is none)
	 */
	protected function parseRestrictions( DomNavigator $domNavigator ) {
		$restrictionPrefix = 'restriction-';
		$restrictions = [];
		foreach (
			$domNavigator->findElementsWithClassPrefix( '*', $restrictionPrefix ) as $element
		) {
			// class names can be separated by any whitespace, not just a single space
			$classes = preg_split(
				'/\s+/', $element->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY
			) ?: [];
			foreach ( $classes as $class ) {
				if ( strpos( $class, $restrictionPrefix ) === 0 ) {
					$restrictions[] = substr( $class, strlen( $restrictionPrefix ) );
				}
			}
		}
		return [ [ 'Restrictions' => implode( '|', array_unique( $restrictions ) ) ] ];
	}

	/**
	 * Get the text of a node. The result might be a string, or an array of strings if the node has
	 * multiple languages (resulting from {{en}} and similar templates).
	 * @param DomNavigator $domNavigator
	 * @param DOMNode $node
	 * @return string|array in multi-language mode, an array of language code => HTML plus a
	 *  '_type' => 'lang' marker
	 */
	protected function parseContents( DomNavigator $domNavigator, DOMNode $node ) {
		$languageNodes = $domNavigator->findElementsWithClassAndLang( 'div', 'description', $node );
		if ( !$languageNodes->length ) { // no language templates at all
			return $this->cleanedInnerHtml( $node );
		}

		$languages = [];
		foreach ( $languageNodes as $languageNode ) {
			$strippedNode = $this->removeLanguageName( $domNavigator, $languageNode );
			$languages[$strippedNode->getAttribute( 'lang' )] = $strippedNode;
		}

		if ( !$this->multiLanguage ) {
			return $this->cleanedInnerHtml( $this->selectLanguage( $languages ) );
		}

		$languages = array_map( [ $this, 'cleanedInnerHtml' ], $languages );
		$languages['_type'] = 'lang';
		return $languages;
	}

	/**
	 * Language templates like {{en}} put the language name at the beginning of the text;
	 * this function removes it.
	 * @param DomNavigator $domNavigator
	 * @param DOMElement $node
	 * @return DOMElement a clone of the input node, with the language name removed
	 */
	protected function removeLanguageName( DomNavigator $domNavigator, DOMElement $node ) {
		$node = $node->cloneNode( true );
		$languageNames = $domNavigator->findElementsWithClass( 'span', 'language', $node );
		foreach ( $languageNames as $languageName ) {
			$parentNode = $languageName->parentNode;
			if ( $parentNode === null || !$node->isSameNode( $parentNode ) ) {
				// language names are direct children; removeChild() would throw for anything else
				continue;
			}
			$node->removeChild( $languageName );
		}
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType cloneNode returns `static`
		return $node;
	}

	/**
	 * Takes an array indexed with language codes, and returns the best match: the entry of the
	 * first priority language which is present, or the first entry if none of them is.
	 * @param array $languages
	 * @return mixed false if $languages is empty
	 */
	protected function selectLanguage( array $languages ) {
		// tolerate a non-array setting instead of raising a warning in foreach
		$priorityLanguages = is_array( $this->priorityLanguages ) ? $this->priorityLanguages : [];
		foreach ( $priorityLanguages as $languageCode ) {
			if ( array_key_exists( $languageCode, $languages ) ) {
				return $languages[$languageCode];
			}
		}
		return reset( $languages );
	}

	/**
	 * Turns a node into a HTML string
	 * @param DOMNode $node
	 * @return string
	 */
	protected function toHtml( DOMNode $node ) {
		return $node->ownerDocument->saveHTML( $node );
	}

	/**
	 * Turns a node into plain text
	 * @param DOMNode $node
	 * @return string
	 */
	protected function toText( DOMNode $node ) {
		return trim( $node->textContent );
	}

	/**
	 * Turns a node into HTML, except for the enclosing tag.
	 * @param DOMNode $node
	 * @return string
	 */
	protected function innerHtml( DOMNode $node ) {
		if ( !$node instanceof DOMElement ) {
			// non-element nodes have no enclosing tag to leave out
			return $this->toHtml( $node );
		}

		$html = '';
		foreach ( $node->childNodes as $child ) {
			$html .= $this->toHtml( $child );
		}
		return $html;
	}

	/**
	 * Turns a node into HTML, except for the enclosing tag.
	 * Cleans up the contents by applying $cleanupPatterns until the result stops changing.
	 * @param DOMNode $node
	 * @return string
	 */
	protected function cleanedInnerHtml( DOMNode $node ) {
		$html = $this->innerHtml( $node );
		do {
			$oldHtml = $html;
			foreach ( static::$cleanupPatterns as $pattern => $replacement ) {
				// preg_replace() returns null on a PCRE error; keep the last good value then
				$html = preg_replace( $pattern, $replacement, $html ) ?? $html;
			}
		} while ( $oldHtml !== $html );
		return $html;
	}

	/**
	 * Switch rows and columns. Usually it is easier to collect data grouped by source template,
	 * but the extmetadata API needs grouping by field name, this function turns around the grouping
	 * @param array $data group name => [ field name => value ]
	 * @return array field name => [ group name => value ]
	 */
	protected function arrayTranspose( array $data ) {
		$transposedData = [];
		foreach ( $data as $groupName => $group ) {
			foreach ( $group as $fieldName => $value ) {
				$transposedData[$fieldName][$groupName] = $value;
			}
		}
		return $transposedData;
	}
}