Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
73.01% |
403 / 552 |
|
25.00% |
7 / 28 |
CRAP | |
0.00% |
0 / 1 |
Reader | |
73.01% |
403 / 552 |
|
25.00% |
7 / 28 |
917.01 | |
0.00% |
0 / 1 |
__construct | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
2.02 | |||
destroyXMLParser | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
resetXMLParser | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
isSupported | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
getResults | |
56.67% |
17 / 30 |
|
0.00% |
0 / 1 |
40.52 | |||
parse | |
70.89% |
56 / 79 |
|
0.00% |
0 / 1 |
29.87 | |||
parseExtended | |
86.11% |
31 / 36 |
|
0.00% |
0 / 1 |
9.22 | |||
char | |
76.92% |
10 / 13 |
|
0.00% |
0 / 1 |
7.60 | |||
checkParseSafety | |
78.79% |
26 / 33 |
|
0.00% |
0 / 1 |
8.61 | |||
endElementModeIgnore | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
endElementModeSimple | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
endElementNested | |
67.57% |
25 / 37 |
|
0.00% |
0 / 1 |
11.76 | |||
endElementModeLi | |
55.00% |
11 / 20 |
|
0.00% |
0 / 1 |
9.28 | |||
endElementModeQDesc | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
endElement | |
66.00% |
33 / 50 |
|
0.00% |
0 / 1 |
41.02 | |||
startElementModeIgnore | |
33.33% |
1 / 3 |
|
0.00% |
0 / 1 |
3.19 | |||
startElementModeBag | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
startElementModeSeq | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
3.07 | |||
startElementModeLang | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
startElementModeSimple | |
33.33% |
6 / 18 |
|
0.00% |
0 / 1 |
8.74 | |||
startElementModeQDesc | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
startElementModeInitial | |
100.00% |
23 / 23 |
|
100.00% |
1 / 1 |
6 | |||
startElementModeStruct | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
9.37 | |||
startElementModeLi | |
37.50% |
6 / 16 |
|
0.00% |
0 / 1 |
11.10 | |||
startElementModeLiLang | |
70.00% |
7 / 10 |
|
0.00% |
0 / 1 |
4.43 | |||
startElement | |
78.18% |
43 / 55 |
|
0.00% |
0 / 1 |
25.58 | |||
doAttribs | |
88.00% |
22 / 25 |
|
0.00% |
0 / 1 |
11.21 | |||
saveValue | |
75.86% |
22 / 29 |
|
0.00% |
0 / 1 |
11.41 |
1 | <?php |
2 | /** |
3 | * Reader for XMP data containing properties relevant to images. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup Media |
22 | */ |
23 | |
24 | namespace Wikimedia\XMPReader; |
25 | |
26 | use Exception; |
27 | use Psr\Log\LoggerAwareInterface; |
28 | use Psr\Log\LoggerAwareTrait; |
29 | use Psr\Log\LoggerInterface; |
30 | use Psr\Log\NullLogger; |
31 | use RuntimeException; |
32 | use Wikimedia\ScopedCallback; |
33 | use XMLReader; |
34 | |
35 | /** |
36 | * Class for reading xmp data containing properties relevant to |
37 | * images, and spitting out an array that FormatMetadata accepts. |
38 | * |
39 | * Note, this is not meant to recognize every possible thing you can |
40 | * encode in XMP. It should recognize all the properties we want. |
41 | * For example, it doesn't have support for structures with multiple |
42 | * nesting levels, as none of the properties we're supporting use that |
43 | * feature. If it comes across properties it doesn't recognize, it should |
44 | * ignore them. |
45 | * |
46 | * The public methods one would call in this class are |
47 | * - parse( $content ) |
48 | * Reads in xmp content. |
49 | * Can potentially be called multiple times with partial data each time. |
50 | * - parseExtended( $content ) |
51 | * Reads XMPExtended blocks (jpeg files only). |
52 | * - getResults |
53 | * Outputs a results array. |
54 | * |
55 | * Note XMP kind of looks like rdf. They are not the same thing - XMP is |
56 | * encoded as a specific subset of rdf. This class can read XMP. It cannot |
57 | * read rdf. |
58 | */ |
59 | class Reader implements LoggerAwareInterface { |
60 | use LoggerAwareTrait; |
61 | |
62 | /** @var array XMP item configuration array */ |
63 | protected array $items; |
64 | |
65 | /** @var array Array to hold the current element (and previous element, and so on) */ |
66 | private array $curItem = []; |
67 | |
68 | /** @var bool|string The structure name when processing nested structures. */ |
69 | private $ancestorStruct = false; |
70 | |
71 | /** @var bool|string Temporary holder for character data that appears in xmp doc. */ |
72 | private $charContent = false; |
73 | |
74 | /** @var array Stores the state the xmpreader is in (see MODE_FOO constants) */ |
75 | private array $mode = []; |
76 | |
77 | /** @var array Array to hold results */ |
78 | private array $results = []; |
79 | |
80 | /** @var bool If we're doing a seq or bag. */ |
81 | private bool $processingArray = false; |
82 | |
83 | /** @var bool|string Used for lang alts only */ |
84 | private $itemLang = false; |
85 | |
86 | /** @var resource|null A resource handle for the XML parser */ |
87 | private $xmlParser; |
88 | |
89 | /** @var bool|string Character set like 'UTF-8' */ |
90 | private $charset = false; |
91 | |
92 | /** @var int */ |
93 | private int $extendedXMPOffset = 0; |
94 | |
95 | /** @var int Flag determining if the XMP is safe to parse */ |
96 | private int $parsable = 0; |
97 | |
98 | /** @var string Buffer of XML to parse */ |
99 | private string $xmlParsableBuffer = ''; |
100 | |
101 | /** |
102 | * @var string |
103 | */ |
104 | private string $filename; |
105 | |
106 | /** |
107 | * These are various mode constants. |
108 | * they are used to figure out what to do |
109 | * with an element when its encountered. |
110 | * |
111 | * For example, MODE_IGNORE is used when processing |
112 | * a property we're not interested in. So if a new |
113 | * element pops up when we're in that mode, we ignore it. |
114 | */ |
115 | private const MODE_INITIAL = 0; |
116 | private const MODE_IGNORE = 1; |
117 | private const MODE_LI = 2; |
118 | private const MODE_LI_LANG = 3; |
119 | private const MODE_QDESC = 4; |
120 | |
121 | // The following MODE constants are also used in the |
122 | // $items array to denote what type of property the item is. |
123 | public const MODE_SIMPLE = 10; |
124 | // structure (associative array) |
125 | public const MODE_STRUCT = 11; |
126 | // ordered list |
127 | public const MODE_SEQ = 12; |
128 | // unordered list |
129 | public const MODE_BAG = 13; |
130 | public const MODE_LANG = 14; |
131 | // non-language alt. Currently not implemented, and not needed atm. |
132 | public const MODE_ALT = 15; |
133 | // A BAG of Structs. |
134 | public const MODE_BAGSTRUCT = 16; |
135 | |
136 | private const NS_RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; |
137 | private const NS_XML = 'http://www.w3.org/XML/1998/namespace'; |
138 | |
139 | // States used while determining if XML is safe to parse |
140 | private const PARSABLE_UNKNOWN = 0; |
141 | private const PARSABLE_OK = 1; |
142 | private const PARSABLE_BUFFERING = 2; |
143 | private const PARSABLE_NO = 3; |
144 | |
/**
 * Install the logger, remember the file name used as log context, load
 * the XMP item configuration and create a fresh XML parser.
 *
 * @param LoggerInterface|null $logger Logger to use. When omitted (or null),
 *  a NullLogger is installed so that logging calls are always safe.
 * @param string $filename Name of the file being read; only used as
 *  context in log entries.
 */
public function __construct( ?LoggerInterface $logger = null, $filename = 'unknown' ) {
	// The nullable type is spelled out explicitly: relying on a "= null"
	// default to make a typed parameter nullable is deprecated in PHP 8.4.
	$this->setLogger( $logger ?? new NullLogger() );
	$this->filename = $filename;

	$this->items = Info::getItems();

	$this->resetXMLParser();
}
163 | |
/**
 * Release the XML parser, if one is currently held, and forget the handle.
 *
 * @note Whether this explicit free is actually required, or whether PHP
 *  would release the parser on its own once it is no longer referenced,
 *  has not been established.
 */
private function destroyXMLParser(): void {
	if ( !$this->xmlParser ) {
		// Nothing to release.
		return;
	}

	xml_parser_free( $this->xmlParser );
	$this->xmlParser = null;
}
177 | |
/**
 * Throw away the current XML parser (if any) and set up a new one.
 *
 * Needed when a single item is described by more than one XMP document,
 * for example a JPEG carrying ExtendedXMP in addition to the standard block.
 * Also resets the doctype-safety state and its buffer.
 */
private function resetXMLParser(): void {
	$this->destroyXMLParser();

	// Namespace-aware parser: a single space separates the namespace URI
	// from the local name in the element names handed to the handlers.
	$parser = xml_parser_create_ns( 'UTF-8', ' ' );
	$this->xmlParser = $parser;

	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, 0 );
	xml_parser_set_option( $parser, XML_OPTION_SKIP_WHITE, 1 );

	xml_set_element_handler( $parser, [ $this, 'startElement' ], [ $this, 'endElement' ] );
	xml_set_character_data_handler( $parser, [ $this, 'char' ] );

	// A new document has to pass the doctype check again.
	$this->xmlParsableBuffer = '';
	$this->parsable = self::PARSABLE_UNKNOWN;
}
198 | |
/**
 * Tell whether this PHP installation provides what the class relies on:
 * the xml_parser_create_ns() function and the XMLReader class.
 *
 * @return bool True when both are available
 */
public static function isSupported(): bool {
	if ( !function_exists( 'xml_parser_create_ns' ) ) {
		return false;
	}

	return class_exists( XMLReader::class );
}
207 | |
/**
 * Return the collected results after post-processing the special-cased
 * metadata.
 *
 * The 'xmp-special' group holds values that influence how other values
 * are extracted or presented (for example xmpNote:HasExtendedXMP,
 * photoshop:AuthorsPosition and the location structures). It is folded
 * into the other groups where applicable and then removed.
 *
 * @return array Array of results as an array of arrays suitable for
 *  FormatMetadata::getFormattedData().
 */
public function getResults(): array {
	$data = $this->results;

	// photoshop:AuthorsPosition is really part of another property
	// (see 2:85 in IPTC and pg 21 of the IPTC4XMP standard): prefix it to
	// the creator. With several creators only the first one is affected,
	// and only the dc:Creator prop is touched, not exif:Artist.
	$position = $data['xmp-special']['AuthorsPosition'] ?? null;
	if ( is_string( $position ) && isset( $data['xmp-general']['Artist'][0] ) ) {
		$data['xmp-general']['Artist'][0] = $position . ', ' . $data['xmp-general']['Artist'][0];
	}

	// Flatten LocationShown and LocationCreated into the non-hierarchical
	// form used by the other location fields. Each member of a location
	// structure is appended to "<field><suffix>" in xmp-general.
	$locationSuffixes = [
		'LocationShown' => 'Dest',
		'LocationCreated' => 'Created',
	];
	foreach ( $locationSuffixes as $property => $suffix ) {
		// The is_array test is just paranoia; it should always be an array.
		if ( !isset( $data['xmp-special'][$property][0] )
			// @phan-suppress-next-line PhanTypeArraySuspiciousNull, PhanTypeInvalidDimOffset
			|| !is_array( $data['xmp-special'][$property][0] )
		) {
			continue;
		}

		foreach ( $data['xmp-special'][$property] as $location ) {
			if ( !is_array( $location ) ) {
				// Skip non-array entries so the _type meta-fields are not copied.
				continue;
			}
			foreach ( $location as $field => $value ) {
				$data['xmp-general'][$field . $suffix][] = $value;
			}
		}
	}

	// The special values only steer extraction; they are not information
	// to be stored about the file.
	unset( $data['xmp-special'] );

	// Turn GPSAltitude into a real number, negative if below sea level.
	if ( isset( $data['xmp-exif']['GPSAltitudeRef'], $data['xmp-exif']['GPSAltitude'] ) ) {
		// Validation guarantees there will always be a '/' in this value.
		[ $numerator, $denominator ] = explode( '/', $data['xmp-exif']['GPSAltitude'] );
		// @phan-suppress-next-line PhanTypeInvalidLeftOperandOfNumericOp, PhanTypeInvalidRightOperandOfNumericOp
		$altitude = $numerator / $denominator;

		// @phan-suppress-next-line PhanTypeInvalidDimOffset
		if ( $data['xmp-exif']['GPSAltitudeRef'] == '1' ) {
			$altitude *= -1;
		}

		$data['xmp-exif']['GPSAltitude'] = $altitude;
		unset( $data['xmp-exif']['GPSAltitudeRef'] );
	}

	return $data;
}
300 | |
/**
 * Main entry point for parsing XMP. Use getResults() to fetch what was
 * extracted.
 *
 * Exceptions raised while processing are caught here: the problem is
 * logged, the result array is blanked and false is returned.
 *
 * @param string $content XMP data
 * @param bool $allOfIt If this is all the data (true) or if it's split up (false). Default true
 * @return bool Success.
 */
public function parse( $content, $allOfIt = true ): bool {
	if ( !$this->xmlParser ) {
		$this->resetXMLParser();
	}

	try {
		// Detect the encoding by looking for a BOM, which is supposed to be in
		// the processing instruction.
		// see page 12 of http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart3.pdf
		if ( !$this->charset ) {
			$charsetByBom = [
				"\xFE\xFF" => 'UTF-16BE',
				"\xFF\xFE" => 'UTF-16LE',
				"\x00\x00\xFE\xFF" => 'UTF-32BE',
				"\xFF\xFE\x00\x00" => 'UTF-32LE',
				"\xEF\xBB\xBF" => 'UTF-8',
			];
			$match = [];
			$hasBom = preg_match(
				'/\xEF\xBB\xBF|\xFE\xFF|\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\xFF\xFE/',
				$content,
				$match
			);
			if ( !$hasBom ) {
				// The standard specifically says: if there is no BOM, assume UTF-8.
				$this->charset = 'UTF-8';
			} elseif ( isset( $charsetByBom[$match[0]] ) ) {
				$this->charset = $charsetByBom[$match[0]];
			} else {
				// This should be impossible to get to.
				throw new RuntimeException( "Invalid BOM" );
			}
		}

		// No conversion needed when the data is already UTF-8.
		if ( $this->charset !== 'UTF-8' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
			$content = @iconv( $this->charset, 'UTF-8//IGNORE', $content );
		}

		// Replace any null bytes with the replacement character (T320282)
		$content = str_replace( "\0", "\u{FFFD}", $content );

		// Make sure the XMP block has no xml doctype declaration, which could
		// declare entities unsafe to parse with xml_parse (T85848/T71210).
		if ( $this->parsable !== self::PARSABLE_OK ) {
			if ( $this->parsable === self::PARSABLE_NO ) {
				throw new RuntimeException( 'Unsafe doctype declaration in XML.' );
			}

			// Prepend whatever earlier calls had to buffer.
			$content = $this->xmlParsableBuffer . $content;
			if ( !$this->checkParseSafety( $content ) ) {
				$unsafe = ( $this->parsable === self::PARSABLE_NO );
				if ( !$allOfIt && !$unsafe ) {
					// Nothing has failed yet and more data is expected,
					// so report success for now.
					return true;
				}
				throw new RuntimeException(
					$unsafe ? 'Unsafe doctype declaration in XML.' : 'No root element found in XML.'
				);
			}
		}

		if ( !xml_parse( $this->xmlParser, $content, $allOfIt ) ) {
			$code = xml_get_error_code( $this->xmlParser );

			$this->logger->info(
				'{method} : Error reading XMP content: {error} ' .
				'(file: {file}, line: {line} column: {column} ' .
				'byte offset: {offset})',
				[
					'method' => __METHOD__,
					'error_code' => $code,
					'error' => xml_error_string( $code ),
					'file' => $this->filename,
					'line' => xml_get_current_line_number( $this->xmlParser ),
					'column' => xml_get_current_column_number( $this->xmlParser ),
					'offset' => xml_get_current_byte_index( $this->xmlParser ),
					'content' => $content,
				]
			);

			// Blank the results on error.
			$this->results = [];
			$this->destroyXMLParser();

			return false;
		}
	} catch ( Exception $e ) {
		$this->logger->warning(
			'{method} {message}',
			[
				'method' => __METHOD__,
				'message' => $e->getMessage(),
				'exception' => $e,
				'file' => $this->filename,
				'content' => $content,
			]
		);
		$this->results = [];

		return false;
	}

	// The parser is only kept around while more chunks are expected.
	if ( $allOfIt ) {
		$this->destroyXMLParser();
	}

	return true;
}
429 | |
/**
 * Entry point for XMPExtended blocks in jpeg files.
 *
 * Each block starts with a 40 byte header: a 32 character GUID, the full
 * length of the complete ExtendedXMP serialization (32-bit big-endian) and
 * the offset of this portion within that serialization (32-bit big-endian).
 * The portion's data follows the header.
 *
 * Portions are only accepted in order. The running offset is advanced by
 * the size of each portion, and the final portion is recognised when the
 * running offset reaches the declared full length.
 *
 * @todo In serious need of testing; example files (or programs producing
 *  them) are hard to find.
 * @see http://www.adobe.ge/devnet/xmp/pdfs/XMPSpecificationPart3.pdf XMP spec part 3 page 20
 * @param string $content XMPExtended block minus the namespace signature
 * @return bool If it succeeded.
 */
public function parseExtended( $content ): bool {
	$guid = substr( $content, 0, 32 );
	if ( !isset( $this->results['xmp-special']['HasExtendedXMP'] )
		|| $this->results['xmp-special']['HasExtendedXMP'] !== $guid
	) {
		$this->logger->info( __METHOD__ .
			" Ignoring XMPExtended block due to wrong guid (guid= '{guid}')",
			[
				'guid' => $guid,
				'file' => $this->filename,
			] );

		return false;
	}

	// Only unpack a complete 8 byte length/offset pair; a truncated header
	// would otherwise make unpack() emit a warning.
	$header = (string)substr( $content, 32, 8 );
	$len = strlen( $header ) === 8 ? unpack( 'Nlength/Noffset', $header ) : false;

	if ( !$len ||
		$len['length'] < 4 ||
		$len['offset'] < 0 ||
		$len['offset'] > $len['length']
	) {
		$this->logger->info(
			__METHOD__ . ' Error reading extended XMP block, invalid length or offset.',
			[ 'file' => $this->filename ]
		);

		return false;
	}

	// we're not very robust here. we should accept it in the wrong order.
	// To quote the XMP standard:
	// "A JPEG writer should write the ExtendedXMP marker segments in order,
	// immediately following the StandardXMP. However, the JPEG standard
	// does not require preservation of marker segment order. A robust JPEG
	// reader should tolerate the marker segments in any order."
	// On the other hand, the probability that an image will have more than
	// 128k of metadata is rather low... so the probability that it will have
	// > 128k, and be in the wrong order is very low...

	if ( $len['offset'] !== $this->extendedXMPOffset ) {
		$this->logger->info( __METHOD__ . ' Ignoring XMPExtended block due to wrong order. (Offset was '
			. $len['offset'] . ' but expected ' . $this->extendedXMPOffset . ')',
			[ 'file' => $this->filename ]
		);

		return false;
	}

	if ( $len['offset'] === 0 ) {
		// if we're starting the extended block, we've probably already
		// done the XMPStandard block, so reset.
		$this->resetXMLParser();
	}

	$actualContent = (string)substr( $content, 40 );

	// 'length' is the size of the whole ExtendedXMP serialization, not of
	// this portion, so the next expected offset is the current one plus the
	// number of bytes actually carried by this portion.
	$this->extendedXMPOffset += strlen( $actualContent );

	// The serialization is complete once everything announced by the header
	// has been seen.
	$atEnd = ( $this->extendedXMPOffset >= $len['length'] );

	$this->logger->debug(
		__METHOD__ . ' Parsing a XMPExtended block',
		[ 'file' => $this->filename ]
	);

	return $this->parse( $actualContent, $atEnd );
}
506 | |
/**
 * Character data handler, invoked whenever character data is found in
 * the XMP document.
 *
 * Whitespace-only data is dropped, and so is any data seen while in
 * MODE_IGNORE. Otherwise character data is only legal in MODE_SIMPLE and
 * MODE_QDESC; in any other mode an exception is thrown.
 *
 * As an example, this happens when we encounter XMP like:
 * <exif:DigitalZoomRatio>0/10</exif:DigitalZoomRatio>
 * and are processing the 0/10 bit.
 *
 * Note that every chunk is trimmed before it is appended to the pending
 * content, so whitespace at the edges of a chunk is not kept.
 *
 * @param resource $parser XMLParser reference to the xml parser
 * @param string $data Character data
 * @throws RuntimeException On invalid data
 */
public function char( $parser, $data ): void {
	$text = trim( $data );
	if ( $text === "" ) {
		return;
	}

	if ( !isset( $this->mode[0] ) ) {
		throw new RuntimeException( 'Unexpected character data before first rdf:Description element' );
	}

	$currentMode = $this->mode[0];
	if ( $currentMode === self::MODE_IGNORE ) {
		return;
	}

	$acceptsText = $currentMode === self::MODE_SIMPLE || $currentMode === self::MODE_QDESC;
	if ( !$acceptsText ) {
		throw new RuntimeException( 'character data where not expected. (mode ' . $currentMode . ')' );
	}

	// Start a new value, or extend the one collected so far.
	$this->charContent = ( $this->charContent === false )
		? $text
		: $this->charContent . $text;
}
550 | |
/**
 * Decide whether a block of XML may be handed to xml_parse, i.e. that no
 * doctype declaration precedes its first element. A doctype could carry
 * a DoS attack if it were parsed and internal entities expanded (T85848).
 *
 * Besides the return value, $this->parsable is updated: PARSABLE_OK when
 * an element is reached first, PARSABLE_NO when a doctype is reached
 * first, and PARSABLE_BUFFERING (with $content stored in the buffer) when
 * neither was found in the readable part of the input.
 *
 * @param string $content xml string to check for parse safety
 * @return bool true if the xml is safe to parse, false otherwise
 */
private function checkParseSafety( $content ): bool {
	$dataUri = 'data://text/plain,' . urlencode( $content );

	// Pull in the arbitrary MAX_URI_LENGTH from libxml2: beyond it libxml2
	// won't parse the input as a data URI.
	if ( strlen( $dataUri ) > 1024 * 1024 ) {
		return false;
	}

	// For XMLReader to parse incomplete/invalid XML, it has to be open()'ed
	// instead of using XML().
	$reader = new XMLReader();
	$opened = $reader->open(
		$dataUri,
		null,
		LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET
	);
	if ( !$opened ) {
		return false;
	}

	if ( LIBXML_VERSION < 20900 ) {
		$previousSetting = libxml_disable_entity_loader( true );
		// Kept in a local so the previous setting is restored when this
		// method returns.
		/** @noinspection PhpUnusedLocalVariableInspection */
		$restoreEntityLoader = new ScopedCallback(
			'libxml_disable_entity_loader',
			[ $previousSetting ]
		);
	}

	$reader->setParserProperty( XMLReader::SUBST_ENTITIES, false );

	// Even with LIBXML_NOWARNING set, XMLReader::read gives a warning
	// when parsing truncated XML, which causes unit tests to fail.
	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	while ( @$reader->read() ) {
		$nodeType = $reader->nodeType;

		if ( $nodeType === XMLReader::ELEMENT ) {
			// Reached the first element without hitting a doctype declaration
			$this->parsable = self::PARSABLE_OK;

			return true;
		}

		if ( $nodeType === XMLReader::DOC_TYPE ) {
			$this->parsable = self::PARSABLE_NO;

			return false;
		}
	}

	// Reached the end of the parsable xml without finding an element
	// or doctype. Buffer and try again.
	$this->xmlParsableBuffer = $content;
	$this->parsable = self::PARSABLE_BUFFERING;

	return false;
}
619 | |
/**
 * Closing element encountered while in MODE_IGNORE.
 *
 * If it closes the element that started the ignored section, leave
 * MODE_IGNORE by popping the current item and mode; otherwise do nothing.
 *
 * @param string $elm Namespace of element followed by a space and then tag name of element.
 */
private function endElementModeIgnore( $elm ): void {
	if ( $this->curItem[0] !== $elm ) {
		// Still inside the ignored element.
		return;
	}

	array_shift( $this->curItem );
	array_shift( $this->mode );
}
632 | |
/**
 * Closing element encountered while in MODE_SIMPLE.
 *
 * Usually this means a property value is complete and has to be stored
 * in the results array, e.g. on reaching </exif:DigitalZoomRatio> in
 *   <exif:DigitalZoomRatio>0/10</exif:DigitalZoomRatio>
 *
 * It may also be the end of a member of a compound data structure (such
 * as an array item); in that case the value is stored under the name of
 * the enclosing property rather than under rdf:li.
 *
 * @param string $elm Namespace, space, and tag name.
 */
private function endElementModeSimple( $elm ): void {
	if ( $this->charContent !== false ) {
		// Array members are saved under the original element name.
		$qualifiedName = $this->processingArray ? $this->curItem[0] : $elm;
		[ $ns, $tag ] = explode( ' ', $qualifiedName, 2 );

		$this->saveValue( $ns, $tag, $this->charContent );

		// Ready for the next value.
		$this->charContent = false;
	}

	array_shift( $this->curItem );
	array_shift( $this->mode );
}
665 | |
/**
 * Closing element encountered while in MODE_STRUCT, MODE_SEQ or MODE_BAG.
 *
 * Generally this means a nested structure is complete: the structure is
 * validated (when a validator is configured for it) and the internal
 * state describing the nesting is reset.
 *
 * Note this is about the closing tag of the property, not "</rdf:Seq>".
 *
 * @par For example, when processing:
 * @code{.xml}
 * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li>
 *   </rdf:Seq> </exif:ISOSpeedRatings>
 * @endcode
 *
 * This method is called when we hit the "</exif:ISOSpeedRatings>" tag.
 *
 * @param string $elm Namespace . space . tag name.
 * @throws RuntimeException
 */
private function endElementNested( $elm ): void {
	// The current item has to be $elm itself; the only exception is an
	// rdf:Description closing while in MODE_STRUCT.
	if ( $this->curItem[0] !== $elm ) {
		$isStructDescription = $elm === self::NS_RDF . ' Description'
			&& $this->mode[0] === self::MODE_STRUCT;
		if ( !$isStructDescription ) {
			throw new RuntimeException( "nesting mismatch. got a </$elm> but expected a </" .
				$this->curItem[0] . '>' );
		}
	}

	// Validate structures.
	[ $ns, $tag ] = explode( ' ', $elm, 2 );
	if ( isset( $this->items[$ns][$tag]['validate'] ) ) {
		$info =& $this->items[$ns][$tag];
		$finalName = $info['map_name'] ?? $tag;
		$group = 'xmp-' . $info['map_group'];

		// An array is used as the callable as-is; anything else names a
		// method on a Validate instance.
		$validate = is_array( $info['validate'] )
			? $info['validate']
			: [ new Validate( $this->logger ), $info['validate'] ];

		if ( !isset( $this->results[$group][$finalName] ) ) {
			// This can happen if all the members of the struct failed validation.
			$this->logger->debug(
				__METHOD__ . " <$ns:$tag> has no valid members.",
				[ 'file' => $this->filename ]
			);
		} elseif ( is_callable( $validate ) ) {
			$val =& $this->results[$group][$finalName];
			call_user_func_array( $validate, [ $info, &$val, false ] );
			if ( $val === null ) {
				// The validation function signals an invalid value by
				// nulling the variable it was given.
				$this->logger->info(
					__METHOD__ . " <$ns:$tag> failed validation.",
					[ 'file' => $this->filename ]
				);
				unset( $this->results[$group][$finalName] );
			}
		} else {
			$this->logger->warning(
				__METHOD__ . " Validation function for $finalName (" .
				get_class( $validate[0] ) . '::' . $validate[1] . '()) is not callable.',
				[ 'file' => $this->filename ]
			);
		}
	}

	// Leave the nested structure.
	array_shift( $this->curItem );
	array_shift( $this->mode );
	$this->itemLang = false;
	$this->processingArray = false;
	$this->ancestorStruct = false;
}
741 | |
/**
 * Closing element encountered while in MODE_LI (rdf:Seq, rdf:Bag or
 * rdf:Alt). Records what kind of list the collected values form.
 *
 * Note the outer "</property>" still has to be reached afterwards.
 *
 * @par For example, when processing:
 * @code{.xml}
 * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li>
 *   </rdf:Seq> </exif:ISOSpeedRatings>
 * @endcode
 *
 * This method is called when we hit the "</rdf:Seq>".
 * (For comparison, endElementModeSimple is called when we
 * hit the "</rdf:li>")
 *
 * @param string $elm Namespace . ' ' . element name
 * @throws RuntimeException
 */
private function endElementModeLi( $elm ): void {
	[ $ns, $tag ] = explode( ' ', $this->curItem[0], 2 );
	$info = $this->items[$ns][$tag];
	$finalName = $info['map_name'] ?? $tag;

	array_shift( $this->mode );

	$group = 'xmp-' . $info['map_group'];
	if ( !isset( $this->results[$group][$finalName] ) ) {
		$this->logger->debug(
			__METHOD__ . " Empty compound element $finalName.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	$listType = null;
	if ( $elm === self::NS_RDF . ' Seq' ) {
		$listType = 'ol';
	} elseif ( $elm === self::NS_RDF . ' Bag' ) {
		$listType = 'ul';
	} elseif ( $elm === self::NS_RDF . ' Alt' ) {
		// An alt is only tagged when it is a language alt; a non-language
		// alt is theoretically possible and is left untagged.
		if ( $info['mode'] === self::MODE_LANG ) {
			$listType = 'lang';
		}
	} else {
		throw new RuntimeException(
			__METHOD__ . " expected </rdf:seq> or </rdf:bag> but instead got $elm."
		);
	}

	if ( $listType !== null ) {
		$this->results[$group][$finalName]['_type'] = $listType;
	}
}
792 | |
/**
 * Closing element encountered while in MODE_QDESC, which mostly happens
 * when ending a simple value that carries qualifiers.
 *
 * On </rdf:value> the collected character content is saved under the name
 * of the current item. Any other closing element pops the current mode
 * and item. Qualifiers are not that common and nothing is done with them.
 *
 * @param string $elm Namespace and element
 */
private function endElementModeQDesc( $elm ): void {
	if ( $elm !== self::NS_RDF . ' value' ) {
		array_shift( $this->mode );
		array_shift( $this->curItem );

		return;
	}

	[ $ns, $tag ] = explode( ' ', $this->curItem[0], 2 );
	$this->saveValue( $ns, $tag, $this->charContent );
}
814 | |
815 | /** |
816 | * Handler for hitting a closing element. |
817 | * |
818 | * generally just calls a helper function depending on what |
819 | * mode we're in. |
820 | * |
821 | * Ignores the outer wrapping elements that are optional in |
822 | * xmp and have no meaning. |
823 | * |
824 | * @param resource $parser |
825 | * @param string $elm Namespace . ' ' . element name |
826 | * @throws RuntimeException |
827 | */ |
public function endElement( $parser, $elm ): void {
	// Optional outer wrapper elements: nothing to unwind for these.
	$wrappers = [
		self::NS_RDF . ' RDF',
		'adobe:ns:meta/ xmpmeta',
		'adobe:ns:meta/ xapmeta',
	];
	if ( in_array( $elm, $wrappers, true ) ) {
		return;
	}

	if ( $elm === self::NS_RDF . ' type' ) {
		// Not really supported properly yet; only noted in the log,
		// then processed like any other element.
		$this->logger->info(
			__METHOD__ . ' encountered <rdf:type>',
			[ 'file' => $this->filename ]
		);
	}

	if ( strpos( $elm, ' ' ) === false ) {
		// No namespace separator. This probably shouldn't happen, but a bug in
		// an adobe product forgets the namespace on some (unimportant) things.
		$this->logger->info(
			__METHOD__ . " Encountered </$elm> which has no namespace. Skipping.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	if ( count( $this->mode ) < 1 ) {
		// Would indicate a pretty major bug in this class.
		throw new RuntimeException( 'Encountered end element with no mode' );
	}

	$currentMode = $this->mode[0];

	// Paranoia: outside of MODE_INITIAL there must always be a current item.
	if ( $currentMode !== self::MODE_INITIAL && count( $this->curItem ) === 0 ) {
		throw new RuntimeException( "Hit end element </$elm> but no curItem" );
	}

	// Hand off to the helper matching the mode on top of the stack.
	switch ( $currentMode ) {
		case self::MODE_IGNORE:
			$this->endElementModeIgnore( $elm );
			break;
		case self::MODE_SIMPLE:
			$this->endElementModeSimple( $elm );
			break;
		case self::MODE_STRUCT:
		case self::MODE_SEQ:
		case self::MODE_BAG:
		case self::MODE_LANG:
		case self::MODE_BAGSTRUCT:
			$this->endElementNested( $elm );
			break;
		case self::MODE_INITIAL:
			if ( $elm !== self::NS_RDF . ' Description' ) {
				throw new RuntimeException( 'Element ended unexpectedly while in MODE_INITIAL' );
			}
			array_shift( $this->mode );
			break;
		case self::MODE_LI:
		case self::MODE_LI_LANG:
			$this->endElementModeLi( $elm );
			break;
		case self::MODE_QDESC:
			$this->endElementModeQDesc( $elm );
			break;
		default:
			$this->logger->info(
				__METHOD__ . " no mode (elm = $elm)",
				[ 'file' => $this->filename ]
			);
			break;
	}
}
907 | |
908 | /** |
909 | * Hit an opening element while in MODE_IGNORE |
910 | * |
911 | * XMP is extensible, so ignore any tag we don't understand. |
912 | * |
913 | * Mostly ignores, unless we encounter the element that we are ignoring. |
914 | * in which case we add it to the item stack, so we can ignore things |
915 | * that are nested, correctly. |
916 | * |
917 | * @param string $elm Namespace . ' ' . tag name |
918 | */ |
private function startElementModeIgnore( $elm ): void {
	if ( $elm !== $this->curItem[0] ) {
		// Some other element inside the ignored one: nothing to track.
		return;
	}

	// Same element name nested inside itself: push another level so the
	// matching end tags unwind the ignore state correctly.
	array_unshift( $this->curItem, $elm );
	array_unshift( $this->mode, self::MODE_IGNORE );
}
925 | |
926 | /** |
927 | * Start element in MODE_BAG (unordered array) |
928 | * this should always be <rdf:Bag> |
929 | * |
930 | * @param string $elm Namespace . ' ' . tag |
931 | * @throws RuntimeException If we have an element that's not <rdf:Bag> |
932 | */ |
private function startElementModeBag( $elm ): void {
	if ( $elm !== self::NS_RDF . ' Bag' ) {
		throw new RuntimeException( "Expected <rdf:Bag> but got $elm." );
	}

	// Inside the bag we expect <rdf:li> items next.
	array_unshift( $this->mode, self::MODE_LI );
}
940 | |
941 | /** |
942 | * Start element in MODE_SEQ (ordered array) |
943 | * this should always be <rdf:Seq> |
944 | * |
945 | * @param string $elm Namespace . ' ' . tag |
946 | * @throws RuntimeException If we have an element that's not <rdf:Seq> |
947 | */ |
private function startElementModeSeq( $elm ): void {
	$isSeq = ( $elm === self::NS_RDF . ' Seq' );

	if ( !$isSeq && $elm !== self::NS_RDF . ' Bag' ) {
		throw new RuntimeException( "Expected <rdf:Seq> but got $elm." );
	}

	if ( !$isSeq ) {
		# T29105: a Bag where a Seq belongs is tolerated and treated as a Seq.
		$this->logger->info(
			__METHOD__ . ' Expected an rdf:Seq, but got an rdf:Bag. Pretending' .
			' it is a Seq, since some buggy software is known to screw this up.',
			[ 'file' => $this->filename ]
		);
	}

	// Either way, the list items come next.
	array_unshift( $this->mode, self::MODE_LI );
}
963 | |
964 | /** |
965 | * Start element in MODE_LANG (language alternative) |
966 | * this should always be <rdf:Alt> |
967 | * |
968 | * This tag tends to be used for metadata like describe this |
969 | * picture, which can be translated into multiple languages. |
970 | * |
971 | * XMP supports non-linguistic alternative selections, |
972 | * which are really only used for thumbnails, which |
973 | * we don't care about. |
974 | * |
975 | * @param string $elm Namespace . ' ' . tag |
976 | * @throws RuntimeException If we have an element that's not <rdf:Alt> |
977 | */ |
private function startElementModeLang( $elm ): void {
	if ( $elm !== self::NS_RDF . ' Alt' ) {
		// A language alternative must be wrapped in <rdf:Alt>; the message
		// names the element that was actually expected here.
		throw new RuntimeException( "Expected <rdf:Alt> but got $elm." );
	}

	// The <rdf:li xml:lang="..."> items of the alternative come next.
	array_unshift( $this->mode, self::MODE_LI_LANG );
}
985 | |
986 | /** |
987 | * Handle an opening element when in MODE_SIMPLE |
988 | * |
989 | * This should not happen often. This is for if a simple element |
990 | * already opened has a child element. Could happen for a |
991 | * qualified element. |
992 | * |
993 | * For example: |
994 | * <exif:DigitalZoomRatio><rdf:Description><rdf:value>0/10</rdf:value> |
995 | * <foo:someQualifier>Bar</foo:someQualifier> </rdf:Description> |
996 | * </exif:DigitalZoomRatio> |
997 | * |
998 | * This method is called when processing the <rdf:Description> element |
999 | * |
1000 | * @param string $elm Namespace and tag names separated by space. |
1001 | * @param array $attribs Attributes of the element. |
1002 | * @throws RuntimeException |
1003 | */ |
private function startElementModeSimple( $elm, $attribs ): void {
	$rdfValue = self::NS_RDF . ' value';

	if ( $elm === $rdfValue ) {
		// <rdf:value> is only meaningful inside an rdf:Description.
		throw new RuntimeException( __METHOD__ . ' Encountered <rdf:value> where it was unexpected.' );
	}

	if ( $elm === self::NS_RDF . ' Description' ) {
		// The simple value has qualifiers: switch to MODE_QDESC and keep
		// the same property as the current item for the nested level.
		array_unshift( $this->mode, self::MODE_QDESC );
		array_unshift( $this->curItem, $this->curItem[0] );

		// The value may be given directly as an rdf:value attribute.
		if ( isset( $attribs[$rdfValue] ) ) {
			[ $ns, $tag ] = explode( ' ', $this->curItem[0], 2 );
			$this->saveValue( $ns, $tag, $attribs[$rdfValue] );
		}

		return;
	}

	// Anything else is unrecognised here (a qualifier maybe): log and ignore it.
	$this->logger->info( __METHOD__ .
		" Encountered element <{element}> where only expecting character data as value of {curitem}",
		[
			'element' => $elm,
			'curitem' => $this->curItem[0],
			'file' => $this->filename,
		]
	);
	array_unshift( $this->mode, self::MODE_IGNORE );
	array_unshift( $this->curItem, $elm );
}
1031 | |
1032 | /** |
1033 | * Start an element when in MODE_QDESC. |
1034 | * This generally happens when a simple element has an inner |
1035 | * rdf:Description to hold qualifier elements. |
1036 | * |
1037 | * For example in: |
1038 | * <exif:DigitalZoomRatio><rdf:Description><rdf:value>0/10</rdf:value> |
1039 | * <foo:someQualifier>Bar</foo:someQualifier> </rdf:Description> |
1040 | * </exif:DigitalZoomRatio> |
1041 | * Called when processing the <rdf:value> or <foo:someQualifier>. |
1042 | * |
1043 | * @param string $elm Namespace and tag name separated by a space. |
1044 | */ |
private function startElementModeQDesc( $elm ): void {
	// <rdf:value> needs no set-up; every other child is a qualifier,
	// which is skipped by entering MODE_IGNORE for it.
	if ( $elm !== self::NS_RDF . ' value' ) {
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $elm );
	}
}
1055 | |
1056 | /** |
1057 | * Starting an element when in MODE_INITIAL |
1058 | * This usually happens when we hit an element inside |
1059 | * the outer rdf:Description |
1060 | * |
1061 | * This is generally where most properties start. |
1062 | * |
1063 | * @param string $ns Namespace |
1064 | * @param string $tag Tag name (without namespace prefix) |
1065 | * @param array $attribs Array of attributes |
1066 | * @throws RuntimeException |
1067 | */ |
private function startElementModeInitial( $ns, $tag, $attribs ): void {
	if ( $ns === self::NS_RDF ) {
		// RDF elements are not looked up as properties; only their
		// attributes are processed.
		$this->doAttribs( $attribs );

		return;
	}

	$qualifiedName = $ns . ' ' . $tag;

	if ( !isset( $this->items[$ns][$tag] ) ) {
		// This element is not on our list of allowed elements so ignore.
		$this->logger->debug( __METHOD__ . ' Ignoring unrecognized element <{element}>.',
			[ 'element' => "$ns:$tag", 'file' => $this->filename ] );
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $qualifiedName );

		return;
	}

	$item = $this->items[$ns][$tag];

	if ( isset( $item['structPart'] ) ) {
		// Meant to appear only as a child of a structure, yet found here:
		// ignore this element and its children.
		$this->logger->info(
			'Encountered <{element}> outside of its expected parent. Ignoring.',
			[ 'element' => "$ns:$tag", 'file' => $this->filename ]
		);

		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $qualifiedName );

		return;
	}

	// Known property: enter the mode configured for it.
	$mode = $item['mode'];
	array_unshift( $this->mode, $mode );
	array_unshift( $this->curItem, $qualifiedName );

	if ( $mode === self::MODE_STRUCT ) {
		// Record the struct's result name for values saved beneath it.
		$this->ancestorStruct = $item['map_name'] ?? $tag;
	}

	if ( $this->charContent !== false ) {
		// Something weird. Should not happen in valid XMP.
		throw new RuntimeException( 'tag nested in non-whitespace characters.' );
	}

	// process attributes
	$this->doAttribs( $attribs );
}
1111 | |
1112 | /** |
1113 | * Hit an opening element when in a Struct (MODE_STRUCT) |
1114 | * This is generally for fields of a compound property. |
1115 | * |
1116 | * Example of a struct (abbreviated; flash has more properties): |
1117 | * |
1118 | * <exif:Flash> <rdf:Description> <exif:Fired>True</exif:Fired> |
1119 | * <exif:Mode>1</exif:Mode></rdf:Description></exif:Flash> |
1120 | * |
1121 | * or: |
1122 | * |
1123 | * <exif:Flash rdf:parseType='Resource'> <exif:Fired>True</exif:Fired> |
1124 | * <exif:Mode>1</exif:Mode></exif:Flash> |
1125 | * |
1126 | * @param string $ns Namespace |
1127 | * @param string $tag Tag name (no ns) |
1128 | * @param array $attribs Array of attribs w/ values. |
1129 | * @throws RuntimeException |
1130 | */ |
private function startElementModeStruct( $ns, $tag, $attribs ): void {
	if ( $ns === self::NS_RDF ) {
		// Only an inner rdf:Description matters here: it opens another
		// struct level for the same property. Other RDF elements are no-ops.
		if ( $tag === 'Description' ) {
			$this->doAttribs( $attribs );
			array_unshift( $this->mode, self::MODE_STRUCT );
			array_unshift( $this->curItem, $this->curItem[0] );
		}

		return;
	}

	if ( !isset( $this->items[$ns][$tag] ) ) {
		// Unknown field: skip it and everything inside it.
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $ns . ' ' . $tag );

		return;
	}

	// If the enclosing struct declares its children, the field must be one of them.
	// This assumes that we don't have inter-namespace nesting
	// which we don't in all the properties we're interested in.
	$allowedChildren = $this->items[$ns][$this->ancestorStruct]['children'] ?? null;
	if ( $allowedChildren !== null && !isset( $allowedChildren[$tag] ) ) {
		throw new RuntimeException(
			" <$tag> appeared nested in <{$this->ancestorStruct}> where it is not allowed."
		);
	}

	array_unshift( $this->mode, $this->items[$ns][$tag]['mode'] );
	array_unshift( $this->curItem, $ns . ' ' . $tag );

	if ( $this->charContent !== false ) {
		// Something weird. Should not happen in valid XMP.
		throw new RuntimeException(
			"tag <$tag> nested in non-whitespace characters ({$this->charContent})."
		);
	}
}
1164 | |
1165 | /** |
1166 | * opening element in MODE_LI |
1167 | * process elements of arrays. |
1168 | * |
1169 | * Example: |
1170 | * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li> |
1171 | * </rdf:Seq> </exif:ISOSpeedRatings> |
1172 | * This method is called when we hit the <rdf:li> element. |
1173 | * |
1174 | * @param string $elm Namespace . ' ' . tagname |
1175 | * @param array $attribs Attributes. (needed for BAGSTRUCTS) |
1176 | * @throws RuntimeException If it gets a tag other than <rdf:li> |
1177 | */ |
private function startElementModeLi( $elm, $attribs ): void {
	if ( $elm !== self::NS_RDF . ' li' ) {
		throw new RuntimeException( "<rdf:li> expected but got $elm." );
	}

	if ( !isset( $this->mode[1] ) ) {
		// This should never ever ever happen. Checking for it
		// to be paranoid.
		throw new RuntimeException( 'In mode Li, but no 2xPrevious mode!' );
	}

	if ( $this->mode[1] === self::MODE_BAGSTRUCT ) {
		// This list item contains a compound (STRUCT) value.
		array_unshift( $this->mode, self::MODE_STRUCT );
		array_unshift( $this->curItem, $elm );
		$this->processingArray = true;

		// After the push above, curItem[1] is the property that owns the bag.
		if ( !isset( $this->curItem[1] ) ) {
			// be paranoid.
			throw new RuntimeException( 'Can not find parent of BAGSTRUCT.' );
		}
		// Split on the first space only, like every other "namespace tag"
		// split in this class, so the tag name derived here matches the one
		// derived when the list is closed.
		[ $curNS, $curTag ] = explode( ' ', $this->curItem[1], 2 );
		$this->ancestorStruct = $this->items[$curNS][$curTag]['map_name'] ?? $curTag;

		$this->doAttribs( $attribs );
	} else {
		// Normal BAG or SEQ containing simple values.
		array_unshift( $this->mode, self::MODE_SIMPLE );
		// need to add curItem[0] on again since one is for the specific item
		// and one is for the entire group.
		array_unshift( $this->curItem, $this->curItem[0] );
		$this->processingArray = true;
	}
}
1212 | |
1213 | /** |
1214 | * Opening element in MODE_LI_LANG. |
1215 | * process elements of language alternatives |
1216 | * |
1217 | * Example: |
1218 | * <dc:title> <rdf:Alt> <rdf:li xml:lang="x-default">My house |
1219 | * </rdf:li> </rdf:Alt> </dc:title> |
1220 | * |
1221 | * This method is called when we hit the <rdf:li> element. |
1222 | * |
1223 | * @param string $elm Namespace . ' ' . tag |
1224 | * @param array $attribs Array of elements (most importantly xml:lang) |
1225 | * @throws RuntimeException If it gets a tag other than <rdf:li> or if no xml:lang |
1226 | */ |
private function startElementModeLiLang( $elm, $attribs ): void {
	if ( $elm !== self::NS_RDF . ' li' ) {
		throw new RuntimeException( __METHOD__ . " <rdf:li> expected but got $elm." );
	}

	// xml:lang is mandatory and must be at least two characters drawn from
	// letters, digits and hyphens.
	$langKey = self::NS_XML . ' lang';
	$lang = $attribs[$langKey] ?? null;
	if ( $lang === null || !preg_match( '/^[-A-Za-z0-9]{2,}$/D', $lang ) ) {
		throw new RuntimeException( __METHOD__
			. " <rdf:li> did not contain, or has invalid xml:lang attribute in lang alternative" );
	}

	// Lang is case-insensitive, so store it lower-cased.
	$this->itemLang = strtolower( $lang );

	// The item itself is a simple value. curItem[0] is pushed again since
	// one entry is for the specific item and one is for the entire group.
	array_unshift( $this->mode, self::MODE_SIMPLE );
	array_unshift( $this->curItem, $this->curItem[0] );
	$this->processingArray = true;
}
1247 | |
1248 | /** |
1249 | * Hits an opening element. |
1250 | * Generally just calls a helper based on what MODE we're in. |
1251 | * Also does some initial set up for the wrapper element |
1252 | * |
1253 | * @param resource $parser |
1254 | * @param string $elm Namespace "<space>" element |
1255 | * @param array $attribs Attribute name => value |
1256 | * @throws RuntimeException |
1257 | */ |
public function startElement( $parser, $elm, $attribs ): void {
	// Optional outer wrapper elements are skipped entirely.
	$wrappers = [
		self::NS_RDF . ' RDF',
		'adobe:ns:meta/ xmpmeta',
		'adobe:ns:meta/ xapmeta',
	];
	if ( in_array( $elm, $wrappers, true ) ) {
		return;
	}

	if ( $elm === self::NS_RDF . ' Description' && count( $this->mode ) === 0 ) {
		// Outer rdf:Description: this is where parsing proper begins.
		array_unshift( $this->mode, self::MODE_INITIAL );
	} elseif ( $elm === self::NS_RDF . ' type' ) {
		// rdf:type isn't supported properly. It is mentioned on page 25 of
		// part 1 of the xmp standard, but appears to be rare in practice.
		$this->logger->info(
			__METHOD__ . " Encountered <rdf:type> which isn't currently supported",
			[ 'file' => $this->filename ]
		);
	}

	if ( strpos( $elm, ' ' ) === false ) {
		// No namespace separator; this probably shouldn't happen.
		$this->logger->info(
			__METHOD__ . " Encountered <$elm> which has no namespace. Skipping.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	if ( count( $this->mode ) === 0 ) {
		// This should not happen.
		throw new RuntimeException( 'Error extracting XMP, '
			. "encountered <$elm> with no mode" );
	}

	[ $ns, $tag ] = explode( ' ', $elm, 2 );
	$currentMode = $this->mode[0];

	// Hand off to the helper matching the mode on top of the stack.
	switch ( $currentMode ) {
		case self::MODE_IGNORE:
			$this->startElementModeIgnore( $elm );
			break;
		case self::MODE_SIMPLE:
			$this->startElementModeSimple( $elm, $attribs );
			break;
		case self::MODE_INITIAL:
			$this->startElementModeInitial( $ns, $tag, $attribs );
			break;
		case self::MODE_STRUCT:
			$this->startElementModeStruct( $ns, $tag, $attribs );
			break;
		case self::MODE_BAG:
		case self::MODE_BAGSTRUCT:
			$this->startElementModeBag( $elm );
			break;
		case self::MODE_SEQ:
			$this->startElementModeSeq( $elm );
			break;
		case self::MODE_LANG:
			$this->startElementModeLang( $elm );
			break;
		case self::MODE_LI_LANG:
			$this->startElementModeLiLang( $elm, $attribs );
			break;
		case self::MODE_LI:
			$this->startElementModeLi( $elm, $attribs );
			break;
		case self::MODE_QDESC:
			$this->startElementModeQDesc( $elm );
			break;
		default:
			throw new RuntimeException( 'StartElement in unknown mode: ' . $currentMode );
	}
}
1339 | |
1340 | /** |
1341 | * Process attributes. |
1342 | * Simple values can be stored as either a tag or attribute |
1343 | * |
1344 | * Often the initial "<rdf:Description>" tag just has all the simple |
1345 | * properties as attributes. |
1346 | * |
1347 | * @par Example: |
1348 | * @code |
1349 | * <rdf:Description rdf:about="" |
1350 | * xmlns:exif="http://ns.adobe.com/exif/1.0/" exif:DigitalZoomRatio="0/10"> |
1351 | * @endcode |
1352 | * |
1353 | * @param array $attribs Array attribute=>value |
1354 | * @throws RuntimeException |
1355 | */ |
private function doAttribs( $attribs ): void {
	// rdf:parseType is checked first, as it can change how the other
	// attributes are interpreted: on a simple value, 'Resource' is
	// equivalent to having an inner rdf:Description.
	$parseType = $attribs[self::NS_RDF . ' parseType'] ?? null;
	if ( $parseType === 'Resource' && $this->mode[0] === self::MODE_SIMPLE ) {
		$this->mode[0] = self::MODE_QDESC;
	}

	foreach ( $attribs as $name => $val ) {
		if ( strpos( $name, ' ' ) === false ) {
			// This shouldn't happen, but so far some old software forgets
			// the namespace on rdf:about.
			$this->logger->info(
				__METHOD__ . ' Encountered non-namespaced attribute: ' .
				" $name=\"$val\". Skipping. ",
				[ 'file' => $this->filename ]
			);
			continue;
		}

		[ $ns, $tag ] = explode( ' ', $name, 2 );

		if ( $ns === self::NS_RDF ) {
			// rdf:resource is for a url; rdf:value is a weird way of just
			// putting the contents. Both are fed through the character
			// handler. Other RDF attributes are skipped.
			if ( in_array( $tag, [ 'value', 'resource' ], true ) ) {
				$this->char( $this->xmlParser, $val );
			}
			continue;
		}

		if ( !isset( $this->items[$ns][$tag] ) ) {
			$this->logger->debug(
				__METHOD__ . " Ignoring unrecognized element <$ns:$tag>.",
				[ 'file' => $this->filename ]
			);
			continue;
		}

		// A known property given as an attribute is not allowed on a simple value.
		if ( $this->mode[0] === self::MODE_SIMPLE ) {
			throw new RuntimeException( __METHOD__
				. " $ns:$tag found as attribute where not allowed" );
		}
		$this->saveValue( $ns, $tag, $val );
	}
}
1399 | |
1400 | /** |
1401 | * Given an extracted value, save it to results array |
1402 | * |
1403 | * note also uses $this->ancestorStruct and |
1404 | * $this->processingArray to determine what name to |
1405 | * save the value under. (in addition to $tag). |
1406 | * |
1407 | * @param string $ns Namespace of tag this is for |
1408 | * @param string $tag Tag name |
1409 | * @param string $val Value to save |
1410 | */ |
private function saveValue( $ns, $tag, $val ): void {
	// Read the definition by value. A reference assignment here would
	// silently create a null entry in $this->items for an unknown
	// namespace/tag pair, and nothing below writes to $info anyway.
	$info = $this->items[$ns][$tag];
	$finalName = $info['map_name'] ?? $tag;

	if ( isset( $info['validate'] ) ) {
		// An array is used as the callable as-is; anything else names a
		// method on a fresh Validate instance.
		$validate = is_array( $info['validate'] )
			? $info['validate']
			: [ new Validate( $this->logger ), $info['validate'] ];

		if ( is_callable( $validate ) ) {
			call_user_func_array( $validate, [ $info, &$val, true ] );
			// the reasoning behind using &$val instead of using the return value
			// is to be consistent between here and validating structures.
			if ( $val === null ) {
				$this->logger->info(
					__METHOD__ . " <$ns:$tag> failed validation.",
					[ 'file' => $this->filename ]
				);

				return;
			}
		} else {
			// Describe the broken callable without assuming its shape:
			// get_class() throws a TypeError on PHP 8 for a non-object
			// (e.g. a class-name string in a static callable array).
			$describe = static function ( $part ): string {
				if ( is_object( $part ) ) {
					return get_class( $part );
				}

				return is_scalar( $part ) ? (string)$part : gettype( $part );
			};
			$this->logger->warning(
				__METHOD__ . " Validation function for $finalName (" .
				$describe( $validate[0] ?? null ) . '::' . $describe( $validate[1] ?? null ) .
				'()) is not callable.',
				[ 'file' => $this->filename ]
			);
		}
	}

	$group = 'xmp-' . $info['map_group'];

	// ancestorStruct, processingArray and itemLang decide where the value goes.
	if ( $this->ancestorStruct && $this->processingArray ) {
		// Aka both an array and a struct. ( self::MODE_BAGSTRUCT )
		$this->results[$group][$this->ancestorStruct][][$finalName] = $val;
	} elseif ( $this->ancestorStruct ) {
		// Field of a struct.
		$this->results[$group][$this->ancestorStruct][$finalName] = $val;
	} elseif ( $this->processingArray ) {
		if ( $this->itemLang === false ) {
			// normal array
			$this->results[$group][$finalName][] = $val;
		} else {
			// lang array, keyed by the language code.
			$this->results[$group][$finalName][$this->itemLang] = $val;
		}
	} else {
		// Plain simple value.
		$this->results[$group][$finalName] = $val;
	}
}
1460 | } |