Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
71.89% |
381 / 530 |
|
28.57% |
8 / 28 |
CRAP | |
0.00% |
0 / 1 |
Reader | |
71.89% |
381 / 530 |
|
28.57% |
8 / 28 |
1011.09 | |
0.00% |
0 / 1 |
__construct | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
2.02 | |||
destroyXMLParser | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
resetXMLParser | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
isSupported | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
getResults | |
56.67% |
17 / 30 |
|
0.00% |
0 / 1 |
40.52 | |||
parse | |
69.62% |
55 / 79 |
|
0.00% |
0 / 1 |
31.22 | |||
parseExtended | |
86.11% |
31 / 36 |
|
0.00% |
0 / 1 |
9.22 | |||
char | |
76.92% |
10 / 13 |
|
0.00% |
0 / 1 |
7.60 | |||
checkParseSafety | |
78.79% |
26 / 33 |
|
0.00% |
0 / 1 |
8.61 | |||
endElementModeIgnore | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
endElementModeSimple | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
endElementNested | |
67.57% |
25 / 37 |
|
0.00% |
0 / 1 |
11.76 | |||
endElementModeLi | |
55.00% |
11 / 20 |
|
0.00% |
0 / 1 |
9.28 | |||
endElementModeQDesc | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
endElement | |
56.41% |
22 / 39 |
|
0.00% |
0 / 1 |
62.09 | |||
startElementModeIgnore | |
33.33% |
1 / 3 |
|
0.00% |
0 / 1 |
3.19 | |||
startElementModeBag | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
startElementModeSeq | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
3.07 | |||
startElementModeLang | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
startElementModeSimple | |
33.33% |
6 / 18 |
|
0.00% |
0 / 1 |
8.74 | |||
startElementModeQDesc | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
startElementModeInitial | |
100.00% |
23 / 23 |
|
100.00% |
1 / 1 |
6 | |||
startElementModeStruct | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
9.37 | |||
startElementModeLi | |
37.50% |
6 / 16 |
|
0.00% |
0 / 1 |
11.10 | |||
startElementModeLiLang | |
70.00% |
7 / 10 |
|
0.00% |
0 / 1 |
4.43 | |||
startElement | |
72.73% |
32 / 44 |
|
0.00% |
0 / 1 |
29.95 | |||
doAttribs | |
88.00% |
22 / 25 |
|
0.00% |
0 / 1 |
11.21 | |||
saveValue | |
75.86% |
22 / 29 |
|
0.00% |
0 / 1 |
11.41 |
1 | <?php |
2 | /** |
3 | * Reader for XMP data containing properties relevant to images. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup Media |
22 | */ |
23 | |
24 | namespace Wikimedia\XMPReader; |
25 | |
26 | use Exception; |
27 | use Psr\Log\LoggerAwareInterface; |
28 | use Psr\Log\LoggerAwareTrait; |
29 | use Psr\Log\LoggerInterface; |
30 | use Psr\Log\NullLogger; |
31 | use RuntimeException; |
32 | use Wikimedia\ScopedCallback; |
33 | use XMLReader; |
34 | |
35 | /** |
36 | * Class for reading xmp data containing properties relevant to |
37 | * images, and spitting out an array that FormatMetadata accepts. |
38 | * |
39 | * Note, this is not meant to recognize every possible thing you can |
40 | * encode in XMP. It should recognize all the properties we want. |
41 | * For example, it doesn't have support for structures with multiple |
42 | * nesting levels, as none of the properties we're supporting use that |
43 | * feature. If it comes across properties it doesn't recognize, it should |
44 | * ignore them. |
45 | * |
46 | * The public methods one would call in this class are |
47 | * - parse( $content ) |
48 | * Reads in xmp content. |
49 | * Can potentially be called multiple times with partial data each time. |
50 | * - parseExtended( $content ) |
51 | * Reads XMPExtended blocks (jpeg files only). |
52 | * - getResults |
53 | * Outputs a results array. |
54 | * |
55 | * Note XMP kind of looks like rdf. They are not the same thing - XMP is |
56 | * encoded as a specific subset of rdf. This class can read XMP. It cannot |
57 | * read rdf. |
58 | */ |
59 | class Reader implements LoggerAwareInterface { |
60 | use LoggerAwareTrait; |
61 | |
62 | /** @var array XMP item configuration array */ |
63 | protected array $items; |
64 | |
65 | /** @var array Array to hold the current element (and previous element, and so on) */ |
66 | private array $curItem = []; |
67 | |
68 | /** @var bool|string The structure name when processing nested structures. */ |
69 | private $ancestorStruct = false; |
70 | |
71 | /** @var bool|string Temporary holder for character data that appears in xmp doc. */ |
72 | private $charContent = false; |
73 | |
74 | /** @var array Stores the state the xmpreader is in (see MODE_FOO constants) */ |
75 | private array $mode = []; |
76 | |
77 | /** @var array Array to hold results */ |
78 | private array $results = []; |
79 | |
80 | /** @var bool If we're doing a seq or bag. */ |
81 | private bool $processingArray = false; |
82 | |
83 | /** @var bool|string Used for lang alts only */ |
84 | private $itemLang = false; |
85 | |
86 | /** @var resource|null A resource handle for the XML parser */ |
87 | private $xmlParser; |
88 | |
89 | /** @var bool|string Character set like 'UTF-8' */ |
90 | private $charset = false; |
91 | |
92 | /** @var int */ |
93 | private int $extendedXMPOffset = 0; |
94 | |
95 | /** @var int Flag determining if the XMP is safe to parse */ |
96 | private int $parsable = 0; |
97 | |
98 | /** @var string Buffer of XML to parse */ |
99 | private string $xmlParsableBuffer = ''; |
100 | |
101 | /** |
102 | * @var string |
103 | */ |
104 | private string $filename; |
105 | |
106 | /** |
107 | * These are various mode constants. |
108 | * they are used to figure out what to do |
109 | * with an element when it's encountered. |
110 | * |
111 | * For example, MODE_IGNORE is used when processing |
112 | * a property we're not interested in. So if a new |
113 | * element pops up when we're in that mode, we ignore it. |
114 | */ |
115 | private const MODE_INITIAL = 0; |
116 | private const MODE_IGNORE = 1; |
117 | private const MODE_LI = 2; |
118 | private const MODE_LI_LANG = 3; |
119 | private const MODE_QDESC = 4; |
120 | |
121 | // The following MODE constants are also used in the |
122 | // $items array to denote what type of property the item is. |
123 | public const MODE_SIMPLE = 10; |
124 | // structure (associative array) |
125 | public const MODE_STRUCT = 11; |
126 | // ordered list |
127 | public const MODE_SEQ = 12; |
128 | // unordered list |
129 | public const MODE_BAG = 13; |
130 | public const MODE_LANG = 14; |
131 | // non-language alt. Currently not implemented, and not needed atm. |
132 | public const MODE_ALT = 15; |
133 | // A BAG of Structs. |
134 | public const MODE_BAGSTRUCT = 16; |
135 | |
136 | private const NS_RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; |
137 | private const NS_XML = 'http://www.w3.org/XML/1998/namespace'; |
138 | |
139 | // States used while determining if XML is safe to parse |
140 | private const PARSABLE_UNKNOWN = 0; |
141 | private const PARSABLE_OK = 1; |
142 | private const PARSABLE_BUFFERING = 2; |
143 | private const PARSABLE_NO = 3; |
144 | |
145 | /** |
146 | * Primary job is to initialize the XMLParser |
147 | * |
148 | * @param LoggerInterface|null $logger Logger instance if available |
149 | * @param string $filename |
150 | */ |
151 | public function __construct( ?LoggerInterface $logger = null, $filename = 'unknown' ) { |
152 | if ( $logger ) { |
153 | $this->setLogger( $logger ); |
154 | } else { |
155 | $this->setLogger( new NullLogger() ); |
156 | } |
157 | $this->filename = $filename; |
158 | |
159 | $this->items = Info::getItems(); |
160 | |
161 | $this->resetXMLParser(); |
162 | } |
163 | |
164 | /** |
165 | * free the XML parser. |
166 | * |
167 | * @note It is unclear to me if we really need to do this ourselves |
168 | * or if php garbage collection will automatically free the xmlParser |
169 | * when it is no longer needed. |
170 | */ |
171 | private function destroyXMLParser(): void { |
172 | if ( $this->xmlParser ) { |
173 | xml_parser_free( $this->xmlParser ); |
174 | $this->xmlParser = null; |
175 | } |
176 | } |
177 | |
178 | /** |
179 | * Main use is if a single item has multiple xmp documents describing it. |
180 | * For example in jpeg's with extendedXMP |
181 | */ |
182 | private function resetXMLParser(): void { |
183 | $this->destroyXMLParser(); |
184 | |
185 | $this->xmlParser = xml_parser_create_ns( 'UTF-8', ' ' ); |
186 | xml_parser_set_option( $this->xmlParser, XML_OPTION_CASE_FOLDING, 0 ); |
187 | xml_parser_set_option( $this->xmlParser, XML_OPTION_SKIP_WHITE, 1 ); |
188 | |
189 | xml_set_element_handler( $this->xmlParser, |
190 | [ $this, 'startElement' ], |
191 | [ $this, 'endElement' ] ); |
192 | |
193 | xml_set_character_data_handler( $this->xmlParser, [ $this, 'char' ] ); |
194 | |
195 | $this->parsable = self::PARSABLE_UNKNOWN; |
196 | $this->xmlParsableBuffer = ''; |
197 | } |
198 | |
199 | /** |
200 | * Check if this instance supports using this class |
201 | * |
202 | * @return bool |
203 | */ |
204 | public static function isSupported(): bool { |
205 | return function_exists( 'xml_parser_create_ns' ) && class_exists( XMLReader::class ); |
206 | } |
207 | |
208 | /** |
209 | * Get the result array. Do some post-processing before returning |
210 | * the array, and transform any metadata that is special-cased. |
211 | * |
212 | * @return array Array of results as an array of arrays suitable for |
213 | * FormatMetadata::getFormattedData(). |
214 | */ |
215 | public function getResults(): array { |
216 | // xmp-special is for metadata that affects how stuff |
217 | // is extracted. For example xmpNote:HasExtendedXMP. |
218 | |
219 | // It is also used to handle photoshop:AuthorsPosition |
220 | // which is weird and really part of another property, |
221 | // see 2:85 in IPTC. See also pg 21 of IPTC4XMP standard. |
222 | // The location fields also use it. |
223 | |
224 | $data = $this->results; |
225 | |
226 | if ( isset( $data['xmp-special']['AuthorsPosition'] ) |
227 | && is_string( $data['xmp-special']['AuthorsPosition'] ) |
228 | && isset( $data['xmp-general']['Artist'][0] ) |
229 | ) { |
230 | // Note, if there is more than one creator, |
231 | // this only applies to first. This also will |
232 | // only apply to the dc:Creator prop, not the |
233 | // exif:Artist prop. |
234 | |
235 | $data['xmp-general']['Artist'][0] = |
236 | $data['xmp-special']['AuthorsPosition'] . ', ' |
237 | . $data['xmp-general']['Artist'][0]; |
238 | } |
239 | |
240 | // Go through the LocationShown and LocationCreated |
241 | // changing it to the non-hierarchical form used by |
242 | // the other location fields. |
243 | |
244 | if ( isset( $data['xmp-special']['LocationShown'][0] ) |
245 | // @phan-suppress-next-line PhanTypeArraySuspiciousNull, PhanTypeInvalidDimOffset |
246 | && is_array( $data['xmp-special']['LocationShown'][0] ) |
247 | ) { |
248 | // the is_array is just paranoia. It should always |
249 | // be an array. |
250 | foreach ( $data['xmp-special']['LocationShown'] as $loc ) { |
251 | if ( !is_array( $loc ) ) { |
252 | // To avoid copying over the _type meta-fields. |
253 | continue; |
254 | } |
255 | foreach ( $loc as $field => $val ) { |
256 | $data['xmp-general'][$field . 'Dest'][] = $val; |
257 | } |
258 | } |
259 | } |
260 | if ( isset( $data['xmp-special']['LocationCreated'][0] ) |
261 | // @phan-suppress-next-line PhanTypeArraySuspiciousNull, PhanTypeInvalidDimOffset |
262 | && is_array( $data['xmp-special']['LocationCreated'][0] ) |
263 | ) { |
264 | // the is_array is just paranoia. It should always |
265 | // be an array. |
266 | foreach ( $data['xmp-special']['LocationCreated'] as $loc ) { |
267 | if ( !is_array( $loc ) ) { |
268 | // To avoid copying over the _type meta-fields. |
269 | continue; |
270 | } |
271 | foreach ( $loc as $field => $val ) { |
272 | $data['xmp-general'][$field . 'Created'][] = $val; |
273 | } |
274 | } |
275 | } |
276 | |
277 | // We don't want to return the special values, since they're |
278 | // special and not info to be stored about the file. |
279 | unset( $data['xmp-special'] ); |
280 | |
281 | // Convert GPSAltitude to negative if below sea level. |
282 | if ( isset( $data['xmp-exif']['GPSAltitudeRef'] ) |
283 | && isset( $data['xmp-exif']['GPSAltitude'] ) |
284 | ) { |
285 | // Must convert to a real before multiplying by -1 |
286 | // Validate guarantees there will always be a '/' in this value. |
287 | [ $nom, $denom ] = explode( '/', $data['xmp-exif']['GPSAltitude'] ); |
288 | // @phan-suppress-next-line PhanTypeInvalidLeftOperandOfNumericOp, PhanTypeInvalidRightOperandOfNumericOp |
289 | $data['xmp-exif']['GPSAltitude'] = $nom / $denom; |
290 | |
291 | // @phan-suppress-next-line PhanTypeInvalidDimOffset |
292 | if ( $data['xmp-exif']['GPSAltitudeRef'] == '1' ) { |
293 | $data['xmp-exif']['GPSAltitude'] *= -1; |
294 | } |
295 | unset( $data['xmp-exif']['GPSAltitudeRef'] ); |
296 | } |
297 | |
298 | return $data; |
299 | } |
300 | |
301 | /** |
302 | * Main function to parse XMP. Use getResults to get results. |
303 | * |
304 | * Also catches any errors during processing, writes them to |
305 | * debug log, blanks result array and returns false. |
306 | * |
307 | * @param string $content XMP data |
308 | * @param bool $allOfIt If this is all the data (true), or if it's split up (false). Default true |
309 | * @throws RuntimeException |
310 | * @return bool Success. |
311 | */ |
312 | public function parse( $content, $allOfIt = true ): bool { |
313 | if ( !$this->xmlParser ) { |
314 | $this->resetXMLParser(); |
315 | } |
316 | try { |
317 | |
318 | // detect encoding by looking for BOM which is supposed to be in processing instruction. |
319 | // see page 12 of http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart3.pdf |
320 | if ( !$this->charset ) { |
321 | $bom = []; |
322 | if ( preg_match( '/\xEF\xBB\xBF|\xFE\xFF|\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\xFF\xFE/', |
323 | $content, $bom ) |
324 | ) { |
325 | switch ( $bom[0] ) { |
326 | case "\xFE\xFF": |
327 | $this->charset = 'UTF-16BE'; |
328 | break; |
329 | case "\xFF\xFE": |
330 | $this->charset = 'UTF-16LE'; |
331 | break; |
332 | case "\x00\x00\xFE\xFF": |
333 | $this->charset = 'UTF-32BE'; |
334 | break; |
335 | case "\xFF\xFE\x00\x00": |
336 | $this->charset = 'UTF-32LE'; |
337 | break; |
338 | case "\xEF\xBB\xBF": |
339 | $this->charset = 'UTF-8'; |
340 | break; |
341 | default: |
342 | // this should be impossible to get to |
343 | throw new RuntimeException( "Invalid BOM" ); |
344 | } |
345 | } else { |
346 | // standard specifically says, if no bom assume utf-8 |
347 | $this->charset = 'UTF-8'; |
348 | } |
349 | } |
350 | if ( $this->charset !== 'UTF-8' ) { |
351 | // don't convert if already utf-8 |
352 | // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged |
353 | $content = @iconv( $this->charset, 'UTF-8//IGNORE', $content ); |
354 | } |
355 | |
356 | // Replace any null bytes with the replacement character (T320282) |
357 | $content = str_replace( "\0", "\u{FFFD}", $content ); |
358 | |
359 | // Ensure the XMP block does not have an xml doctype declaration, which |
360 | // could declare entities unsafe to parse with xml_parse (T85848/T71210). |
361 | if ( $this->parsable !== self::PARSABLE_OK ) { |
362 | if ( $this->parsable === self::PARSABLE_NO ) { |
363 | throw new RuntimeException( 'Unsafe doctype declaration in XML.' ); |
364 | } |
365 | |
366 | $content = $this->xmlParsableBuffer . $content; |
367 | if ( !$this->checkParseSafety( $content ) ) { |
368 | if ( !$allOfIt && $this->parsable !== self::PARSABLE_NO ) { |
369 | // parse wasn't Unsuccessful yet, so return true |
370 | // in this case. |
371 | return true; |
372 | } |
373 | $msg = ( $this->parsable === self::PARSABLE_NO ) ? |
374 | 'Unsafe doctype declaration in XML.' : |
375 | 'No root element found in XML.'; |
376 | throw new RuntimeException( $msg ); |
377 | } |
378 | } |
379 | |
380 | $ok = xml_parse( $this->xmlParser, $content, $allOfIt ); |
381 | if ( !$ok ) { |
382 | $code = xml_get_error_code( $this->xmlParser ); |
383 | $error = xml_error_string( $code ); |
384 | $line = xml_get_current_line_number( $this->xmlParser ); |
385 | $col = xml_get_current_column_number( $this->xmlParser ); |
386 | $offset = xml_get_current_byte_index( $this->xmlParser ); |
387 | |
388 | $this->logger->info( |
389 | '{method} : Error reading XMP content: {error} ' . |
390 | '(file: {file}, line: {line} column: {column} ' . |
391 | 'byte offset: {offset})', |
392 | [ |
393 | 'method' => __METHOD__, |
394 | 'error_code' => $code, |
395 | 'error' => $error, |
396 | 'file' => $this->filename, |
397 | 'line' => $line, |
398 | 'column' => $col, |
399 | 'offset' => $offset, |
400 | 'content' => $content, |
401 | ] |
402 | ); |
403 | // blank if error. |
404 | $this->results = []; |
405 | $this->destroyXMLParser(); |
406 | return false; |
407 | } |
408 | } catch ( Exception $e ) { |
409 | $this->logger->warning( |
410 | '{method} {message}', |
411 | [ |
412 | 'method' => __METHOD__, |
413 | 'message' => $e->getMessage(), |
414 | 'exception' => $e, |
415 | 'file' => $this->filename, |
416 | 'content' => $content, |
417 | ] |
418 | ); |
419 | $this->results = []; |
420 | return false; |
421 | } |
422 | if ( $allOfIt ) { |
423 | $this->destroyXMLParser(); |
424 | } |
425 | |
426 | return true; |
427 | } |
428 | |
429 | /** Entry point for XMPExtended blocks in jpeg files |
430 | * |
431 | * @todo In serious need of testing |
432 | * @see http://www.adobe.ge/devnet/xmp/pdfs/XMPSpecificationPart3.pdf XMP spec part 3 page 20 |
433 | * @param string $content XMPExtended block minus the namespace signature |
434 | * @return bool If it succeeded. |
435 | */ |
436 | public function parseExtended( $content ): bool { |
437 | // @todo FIXME: This is untested. Hard to find example files |
438 | // or programs that make such files.. |
439 | $guid = substr( $content, 0, 32 ); |
440 | if ( !isset( $this->results['xmp-special']['HasExtendedXMP'] ) |
441 | || $this->results['xmp-special']['HasExtendedXMP'] !== $guid |
442 | ) { |
443 | $this->logger->info( __METHOD__ . |
444 | " Ignoring XMPExtended block due to wrong guid (guid= '{guid}')", |
445 | [ |
446 | 'guid' => $guid, |
447 | 'file' => $this->filename, |
448 | ] ); |
449 | |
450 | return false; |
451 | } |
452 | $len = unpack( 'Nlength/Noffset', substr( $content, 32, 8 ) ); |
453 | |
454 | if ( !$len || |
455 | $len['length'] < 4 || |
456 | $len['offset'] < 0 || |
457 | $len['offset'] > $len['length'] |
458 | ) { |
459 | $this->logger->info( |
460 | __METHOD__ . 'Error reading extended XMP block, invalid length or offset.', |
461 | [ 'file' => $this->filename ] |
462 | ); |
463 | |
464 | return false; |
465 | } |
466 | |
467 | // we're not very robust here. we should accept it in the wrong order. |
468 | // To quote the XMP standard: |
469 | // "A JPEG writer should write the ExtendedXMP marker segments in order, |
470 | // immediately following the StandardXMP. However, the JPEG standard |
471 | // does not require preservation of marker segment order. A robust JPEG |
472 | // reader should tolerate the marker segments in any order." |
473 | // On the other hand, the probability that an image will have more than |
474 | // 128k of metadata is rather low... so the probability that it will have |
475 | // > 128k, and be in the wrong order is very low... |
476 | |
477 | if ( $len['offset'] !== $this->extendedXMPOffset ) { |
478 | $this->logger->info( __METHOD__ . 'Ignoring XMPExtended block due to wrong order. (Offset was ' |
479 | . $len['offset'] . ' but expected ' . $this->extendedXMPOffset . ')', |
480 | [ 'file' => $this->filename ] |
481 | ); |
482 | |
483 | return false; |
484 | } |
485 | |
486 | if ( $len['offset'] === 0 ) { |
487 | // if we're starting the extended block, we've probably already |
488 | // done the XMPStandard block, so reset. |
489 | $this->resetXMLParser(); |
490 | } |
491 | |
492 | $this->extendedXMPOffset += $len['length']; |
493 | |
494 | $actualContent = substr( $content, 40 ); |
495 | |
496 | $atEnd = ( $this->extendedXMPOffset === strlen( $actualContent ) ); |
497 | |
498 | $this->logger->debug( |
499 | __METHOD__ . 'Parsing a XMPExtended block', |
500 | [ 'file' => $this->filename ] |
501 | ); |
502 | |
503 | return $this->parse( $actualContent, $atEnd ); |
504 | } |
505 | |
506 | /** |
507 | * Character data handler |
508 | * Called whenever character data is found in the xmp document. |
509 | * |
510 | * does nothing if we're in MODE_IGNORE or if the data is whitespace |
511 | * throws an error if we're not in MODE_SIMPLE (as we're not allowed to have character |
512 | * data in the other modes). |
513 | * |
514 | * As an example, this happens when we encounter XMP like: |
515 | * <exif:DigitalZoomRatio>0/10</exif:DigitalZoomRatio> |
516 | * and are processing the 0/10 bit. |
517 | * |
518 | * @param resource $parser XMLParser reference to the xml parser |
519 | * @param string $data Character data |
520 | * @throws RuntimeException On invalid data |
521 | */ |
522 | public function char( $parser, $data ): void { |
523 | $data = trim( $data ); |
524 | if ( trim( $data ) === "" ) { |
525 | return; |
526 | } |
527 | |
528 | if ( !isset( $this->mode[0] ) ) { |
529 | throw new RuntimeException( 'Unexpected character data before first rdf:Description element' ); |
530 | } |
531 | |
532 | if ( $this->mode[0] === self::MODE_IGNORE ) { |
533 | return; |
534 | } |
535 | |
536 | if ( $this->mode[0] !== self::MODE_SIMPLE |
537 | && $this->mode[0] !== self::MODE_QDESC |
538 | ) { |
539 | throw new RuntimeException( 'character data where not expected. (mode ' . $this->mode[0] . ')' ); |
540 | } |
541 | |
542 | // to check, how does this handle w.s. |
543 | if ( $this->charContent === false ) { |
544 | $this->charContent = $data; |
545 | } else { |
546 | $this->charContent .= $data; |
547 | } |
548 | } |
549 | |
550 | /** |
551 | * Check if a block of XML is safe to pass to xml_parse, i.e. doesn't |
552 | * contain a doctype declaration which could contain a dos attack if we |
553 | * parse it and expand internal entities (T85848). |
554 | * |
555 | * @param string $content xml string to check for parse safety |
556 | * @return bool true if the xml is safe to parse, false otherwise |
557 | */ |
558 | private function checkParseSafety( $content ): bool { |
559 | $reader = new XMLReader(); |
560 | $result = null; |
561 | |
562 | // Pull in the arbitrary MAX_URI_LENGTH from libxml2... |
563 | $maxUriLength = 1024 * 1024; |
564 | $dataUri = 'data://text/plain,' . urlencode( $content ); |
565 | if ( strlen( $dataUri ) > $maxUriLength ) { |
566 | // libxml2 won't parse this file as a data URI due to the length. |
567 | return false; |
568 | } |
569 | |
570 | // For XMLReader to parse incomplete/invalid XML, it has to be open()'ed |
571 | // instead of using XML(). |
572 | if ( !$reader->open( |
573 | $dataUri, |
574 | null, |
575 | LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET |
576 | ) ) { |
577 | return false; |
578 | } |
579 | |
580 | if ( LIBXML_VERSION < 20900 ) { |
581 | $oldDisable = libxml_disable_entity_loader( true ); |
582 | /** @noinspection PhpUnusedLocalVariableInspection */ |
583 | $reset = new ScopedCallback( |
584 | 'libxml_disable_entity_loader', |
585 | [ $oldDisable ] |
586 | ); |
587 | } |
588 | |
589 | $reader->setParserProperty( XMLReader::SUBST_ENTITIES, false ); |
590 | |
591 | // Even with LIBXML_NOWARNING set, XMLReader::read gives a warning |
592 | // when parsing truncated XML, which causes unit tests to fail. |
593 | // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged |
594 | while ( @$reader->read() ) { |
595 | if ( $reader->nodeType === XMLReader::ELEMENT ) { |
596 | // Reached the first element without hitting a doctype declaration |
597 | $this->parsable = self::PARSABLE_OK; |
598 | $result = true; |
599 | break; |
600 | } |
601 | if ( $reader->nodeType === XMLReader::DOC_TYPE ) { |
602 | $this->parsable = self::PARSABLE_NO; |
603 | $result = false; |
604 | break; |
605 | } |
606 | } |
607 | |
608 | if ( $result !== null ) { |
609 | return $result; |
610 | } |
611 | |
612 | // Reached the end of the parsable xml without finding an element |
613 | // or doctype. Buffer and try again. |
614 | $this->parsable = self::PARSABLE_BUFFERING; |
615 | $this->xmlParsableBuffer = $content; |
616 | return false; |
617 | } |
618 | |
619 | /** When we hit a closing element in MODE_IGNORE |
620 | * Check to see if this is the element we started to ignore, |
621 | * in which case we get out of MODE_IGNORE |
622 | * |
623 | * @param string $elm Namespace of element followed by a space and then tag name of element. |
624 | */ |
625 | private function endElementModeIgnore( $elm ): void { |
626 | if ( $this->curItem[0] === $elm ) { |
627 | array_shift( $this->curItem ); |
628 | array_shift( $this->mode ); |
629 | } |
630 | } |
631 | |
632 | /** |
633 | * Hit a closing element when in MODE_SIMPLE. |
634 | * This generally means that we finished processing a |
635 | * property value, and now have to save the result to the |
636 | * results array |
637 | * |
638 | * For example, when processing: |
639 | * <exif:DigitalZoomRatio>0/10</exif:DigitalZoomRatio> |
640 | * this deals with when we hit </exif:DigitalZoomRatio>. |
641 | * |
642 | * Or it could be if we hit the end element of a property |
643 | * of a compound data structure (like a member of an array). |
644 | * |
645 | * @param string $elm Namespace, space, and tag name. |
646 | */ |
647 | private function endElementModeSimple( $elm ): void { |
648 | if ( $this->charContent !== false ) { |
649 | if ( $this->processingArray ) { |
650 | // if we're processing an array, use the original element |
651 | // name instead of rdf:li. |
652 | [ $ns, $tag ] = explode( ' ', $this->curItem[0], 2 ); |
653 | } else { |
654 | [ $ns, $tag ] = explode( ' ', $elm, 2 ); |
655 | } |
656 | $this->saveValue( $ns, $tag, $this->charContent ); |
657 | |
658 | // reset |
659 | $this->charContent = false; |
660 | } |
661 | array_shift( $this->curItem ); |
662 | array_shift( $this->mode ); |
663 | } |
664 | |
665 | /** |
666 | * Hit a closing element in MODE_STRUCT, MODE_SEQ, MODE_BAG |
667 | * generally means we've finished processing a nested structure. |
668 | * resets some internal variables to indicate that. |
669 | * |
670 | * Note this means we hit the closing element not the "</rdf:Seq>". |
671 | * |
672 | * @par For example, when processing: |
673 | * @code{.xml} |
674 | * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li> |
675 | * </rdf:Seq> </exif:ISOSpeedRatings> |
676 | * @endcode |
677 | * |
678 | * This method is called when we hit the "</exif:ISOSpeedRatings>" tag. |
679 | * |
680 | * @param string $elm Namespace . space . tag name. |
681 | * @throws RuntimeException |
682 | */ |
683 | private function endElementNested( $elm ): void { |
684 | /* cur item must be the same as $elm, unless if in MODE_STRUCT |
685 | * in which case it could also be rdf:Description */ |
686 | if ( $this->curItem[0] !== $elm |
687 | && !( $elm === self::NS_RDF . ' Description' |
688 | && $this->mode[0] === self::MODE_STRUCT ) |
689 | ) { |
690 | throw new RuntimeException( "nesting mismatch. got a </$elm> but expected a </" . |
691 | $this->curItem[0] . '>' ); |
692 | } |
693 | |
694 | // Validate structures. |
695 | [ $ns, $tag ] = explode( ' ', $elm, 2 ); |
696 | if ( isset( $this->items[$ns][$tag]['validate'] ) ) { |
697 | $info =& $this->items[$ns][$tag]; |
698 | $finalName = $info['map_name'] ?? $tag; |
699 | |
700 | if ( is_array( $info['validate'] ) ) { |
701 | $validate = $info['validate']; |
702 | } else { |
703 | $validator = new Validate( $this->logger ); |
704 | $validate = [ $validator, $info['validate'] ]; |
705 | } |
706 | |
707 | if ( !isset( $this->results['xmp-' . $info['map_group']][$finalName] ) ) { |
708 | // This can happen if all the members of the struct failed validation. |
709 | $this->logger->debug( |
710 | __METHOD__ . " <$ns:$tag> has no valid members.", |
711 | [ 'file' => $this->filename ] |
712 | ); |
713 | } elseif ( is_callable( $validate ) ) { |
714 | $val =& $this->results['xmp-' . $info['map_group']][$finalName]; |
715 | call_user_func_array( $validate, [ $info, &$val, false ] ); |
716 | if ( $val === null ) { |
717 | // the idea being the validation function will unset the variable if |
718 | // its invalid. |
719 | $this->logger->info( |
720 | __METHOD__ . " <$ns:$tag> failed validation.", |
721 | [ 'file' => $this->filename ] |
722 | ); |
723 | unset( $this->results['xmp-' . $info['map_group']][$finalName] ); |
724 | } |
725 | } else { |
726 | $this->logger->warning( |
727 | __METHOD__ . " Validation function for $finalName (" . |
728 | get_class( $validate[0] ) . '::' . $validate[1] . '()) is not callable.', |
729 | [ 'file' => $this->filename ] |
730 | ); |
731 | } |
732 | } |
733 | |
734 | array_shift( $this->curItem ); |
735 | array_shift( $this->mode ); |
736 | $this->ancestorStruct = false; |
737 | $this->processingArray = false; |
738 | $this->itemLang = false; |
739 | } |
740 | |
741 | /** |
742 | * Hit a closing element in MODE_LI (either rdf:Seq, or rdf:Bag ) |
743 | * Add information about what type of element this is. |
744 | * |
745 | * Note we still have to hit the outer "</property>" |
746 | * |
747 | * @par For example, when processing: |
748 | * @code{.xml} |
749 | * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li> |
750 | * </rdf:Seq> </exif:ISOSpeedRatings> |
751 | * @endcode |
752 | * |
753 | * This method is called when we hit the "</rdf:Seq>". |
754 | * (For comparison, we call endElementModeSimple when we |
755 | * hit the "</rdf:li>") |
756 | * |
757 | * @param string $elm Namespace . ' ' . element name |
758 | * @throws RuntimeException |
759 | */ |
760 | private function endElementModeLi( $elm ): void { |
761 | [ $ns, $tag ] = explode( ' ', $this->curItem[0], 2 ); |
762 | $info = $this->items[$ns][$tag]; |
763 | $finalName = $info['map_name'] ?? $tag; |
764 | |
765 | array_shift( $this->mode ); |
766 | |
767 | if ( !isset( $this->results['xmp-' . $info['map_group']][$finalName] ) ) { |
768 | $this->logger->debug( |
769 | __METHOD__ . " Empty compound element $finalName.", |
770 | [ 'file' => $this->filename ] |
771 | ); |
772 | |
773 | return; |
774 | } |
775 | |
776 | if ( $elm === self::NS_RDF . ' Seq' ) { |
777 | $this->results['xmp-' . $info['map_group']][$finalName]['_type'] = 'ol'; |
778 | } elseif ( $elm === self::NS_RDF . ' Bag' ) { |
779 | $this->results['xmp-' . $info['map_group']][$finalName]['_type'] = 'ul'; |
780 | } elseif ( $elm === self::NS_RDF . ' Alt' ) { |
781 | // extra if needed as you could theoretically have a non-language alt. |
782 | if ( $info['mode'] === self::MODE_LANG ) { |
783 | $this->results['xmp-' . $info['map_group']][$finalName]['_type'] = 'lang'; |
784 | } |
785 | } else { |
786 | throw new RuntimeException( |
787 | __METHOD__ . " expected </rdf:seq> or </rdf:bag> but instead got $elm." |
788 | ); |
789 | } |
790 | } |
791 | |
/**
 * Closing element while in MODE_QDESC.
 *
 * This mode is mostly seen when a simple value carries qualifiers.
 * Qualifiers are uncommon and nothing is done with them; only the
 * content of rdf:value is kept.
 *
 * @param string $elm Namespace and element
 */
private function endElementModeQDesc( $elm ): void {
	if ( $elm !== self::NS_RDF . ' value' ) {
		// Any other closing element pops one level off both stacks.
		array_shift( $this->mode );
		array_shift( $this->curItem );

		return;
	}

	// </rdf:value>: store the accumulated character data for the current item.
	$parts = explode( ' ', $this->curItem[0], 2 );
	[ $namespace, $name ] = $parts;
	$this->saveValue( $namespace, $name, $this->charContent );
}
813 | |
/**
 * Callback for a closing element.
 *
 * Mostly a dispatcher: after a few sanity checks it hands the element to
 * the helper that matches the mode on top of the mode stack.
 *
 * The optional outer wrapper elements (rdf:RDF, xmpmeta, xapmeta) carry
 * no meaning in XMP and are skipped.
 *
 * @param resource $parser
 * @param string $elm Namespace . ' ' . element name
 * @throws RuntimeException
 */
public function endElement( $parser, $elm ): void {
	$isWrapper = $elm === ( self::NS_RDF . ' RDF' )
		|| $elm === 'adobe:ns:meta/ xmpmeta'
		|| $elm === 'adobe:ns:meta/ xapmeta';
	if ( $isWrapper ) {
		// Meaningless outer wrappers.
		return;
	}

	if ( $elm === self::NS_RDF . ' type' ) {
		// rdf:type is not really supported yet, but it appears to be
		// almost never used. Note it and carry on.
		$this->logger->info(
			__METHOD__ . ' encountered <rdf:type>',
			[ 'file' => $this->filename ]
		);
	}

	$hasNamespace = strpos( $elm, ' ' ) !== false;
	if ( !$hasNamespace ) {
		// Probably should not happen, but a bug in an Adobe product drops
		// the namespace on some (luckily unimportant) elements.
		$this->logger->info(
			__METHOD__ . " Encountered </$elm> which has no namespace. Skipping.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	if ( count( $this->mode ) === 0 ) {
		// Should never happen; would indicate a major bug in this class.
		throw new RuntimeException( 'Encountered end element with no mode' );
	}

	// Paranoia: outside of MODE_INITIAL there must always be a current item.
	if ( count( $this->curItem ) === 0 && $this->mode[0] !== self::MODE_INITIAL ) {
		throw new RuntimeException( "Hit end element </$elm> but no curItem" );
	}

	switch ( $this->mode[0] ) {
		case self::MODE_IGNORE:
			$this->endElementModeIgnore( $elm );
			break;

		case self::MODE_SIMPLE:
			$this->endElementModeSimple( $elm );
			break;

		case self::MODE_STRUCT:
		case self::MODE_SEQ:
		case self::MODE_BAG:
		case self::MODE_LANG:
		case self::MODE_BAGSTRUCT:
			$this->endElementNested( $elm );
			break;

		case self::MODE_INITIAL:
			// Only the outer rdf:Description may close in this mode.
			if ( $elm !== self::NS_RDF . ' Description' ) {
				throw new RuntimeException( 'Element ended unexpectedly while in MODE_INITIAL' );
			}
			array_shift( $this->mode );
			break;

		case self::MODE_LI:
		case self::MODE_LI_LANG:
			$this->endElementModeLi( $elm );
			break;

		case self::MODE_QDESC:
			$this->endElementModeQDesc( $elm );
			break;

		default:
			$this->logger->info(
				__METHOD__ . " no mode (elm = $elm)",
				[ 'file' => $this->filename ]
			);
			break;
	}
}
906 | |
/**
 * Opening element while in MODE_IGNORE.
 *
 * XMP is extensible, so tags that are not understood are ignored.
 *
 * The only case acted upon is a nested element with the same name as
 * the one currently being ignored: it is pushed onto the stacks so that
 * nested occurrences are ignored correctly.
 *
 * @param string $elm Namespace . ' ' . tag name
 */
private function startElementModeIgnore( $elm ): void {
	if ( $elm !== $this->curItem[0] ) {
		// Some other element inside the ignored one: nothing to track.
		return;
	}

	array_unshift( $this->mode, self::MODE_IGNORE );
	array_unshift( $this->curItem, $elm );
}
924 | |
/**
 * Opening element while in MODE_BAG (unordered array).
 * The only element accepted here is <rdf:Bag>.
 *
 * @param string $elm Namespace . ' ' . tag
 * @throws RuntimeException If the element is anything other than <rdf:Bag>
 */
private function startElementModeBag( $elm ): void {
	if ( $elm !== self::NS_RDF . ' Bag' ) {
		throw new RuntimeException( "Expected <rdf:Bag> but got $elm." );
	}

	// Inside the container, list items are expected next.
	array_unshift( $this->mode, self::MODE_LI );
}
939 | |
/**
 * Opening element while in MODE_SEQ (ordered array).
 * This should always be <rdf:Seq>; an <rdf:Bag> is tolerated (T29105).
 *
 * @param string $elm Namespace . ' ' . tag
 * @throws RuntimeException If the element is neither <rdf:Seq> nor <rdf:Bag>
 */
private function startElementModeSeq( $elm ): void {
	$isSeq = ( $elm === self::NS_RDF . ' Seq' );

	if ( !$isSeq && $elm !== self::NS_RDF . ' Bag' ) {
		throw new RuntimeException( "Expected <rdf:Seq> but got $elm." );
	}

	if ( !$isSeq ) {
		# T29105
		$this->logger->info(
			__METHOD__ . ' Expected an rdf:Seq, but got an rdf:Bag. Pretending' .
			' it is a Seq, since some buggy software is known to screw this up.',
			[ 'file' => $this->filename ]
		);
	}

	// Either way, list items are expected next.
	array_unshift( $this->mode, self::MODE_LI );
}
962 | |
/**
 * Start element in MODE_LANG (language alternative)
 * this should always be <rdf:Alt>
 *
 * This tag tends to be used for metadata like describe this
 * picture, which can be translated into multiple languages.
 *
 * XMP supports non-linguistic alternative selections,
 * which are really only used for thumbnails, which
 * we don't care about.
 *
 * @param string $elm Namespace . ' ' . tag
 * @throws RuntimeException If we have an element that's not <rdf:Alt>
 */
private function startElementModeLang( $elm ): void {
	if ( $elm !== self::NS_RDF . ' Alt' ) {
		// The message names the element this mode actually requires.
		throw new RuntimeException( "Expected <rdf:Alt> but got $elm." );
	}

	// Inside the Alt container, language-tagged list items are expected next.
	array_unshift( $this->mode, self::MODE_LI_LANG );
}
984 | |
/**
 * Opening element while in MODE_SIMPLE.
 *
 * Not expected to happen often: it means an already opened simple
 * element has a child element, which can occur for a qualified value.
 *
 * For example:
 * <exif:DigitalZoomRatio><rdf:Description><rdf:value>0/10</rdf:value>
 * <foo:someQualifier>Bar</foo:someQualifier> </rdf:Description>
 * </exif:DigitalZoomRatio>
 *
 * In that example this method handles the <rdf:Description> element.
 *
 * @param string $elm Namespace and tag names separated by space.
 * @param array $attribs Attributes of the element.
 * @throws RuntimeException
 */
private function startElementModeSimple( $elm, $attribs ): void {
	$rdfValue = self::NS_RDF . ' value';

	if ( $elm === $rdfValue ) {
		// A bare rdf:value must not appear directly inside a simple element.
		throw new RuntimeException( __METHOD__ . ' Encountered <rdf:value> where it was unexpected.' );
	}

	if ( $elm === self::NS_RDF . ' Description' ) {
		// The value has qualifiers: switch to qualified-description mode
		// and duplicate the current item for the nested level.
		array_unshift( $this->mode, self::MODE_QDESC );
		array_unshift( $this->curItem, $this->curItem[0] );

		if ( isset( $attribs[$rdfValue] ) ) {
			// The value may be given as an rdf:value attribute instead of a child.
			$parts = explode( ' ', $this->curItem[0], 2 );
			[ $namespace, $name ] = $parts;
			$this->saveValue( $namespace, $name, $attribs[$rdfValue] );
		}

		return;
	}

	// Anything else is unrecognized here (perhaps a qualifier): ignore it.
	$this->logger->info( __METHOD__ .
		" Encountered element <{element}> where only expecting character data as value of {curitem}",
		[
			'element' => $elm,
			'curitem' => $this->curItem[0],
			'file' => $this->filename,
		]
	);
	array_unshift( $this->mode, self::MODE_IGNORE );
	array_unshift( $this->curItem, $elm );
}
1030 | |
/**
 * Opening element while in MODE_QDESC.
 * Typically reached when a simple element has an inner
 * rdf:Description holding qualifier elements.
 *
 * For example in:
 * <exif:DigitalZoomRatio><rdf:Description><rdf:value>0/10</rdf:value>
 * <foo:someQualifier>Bar</foo:someQualifier> </rdf:Description>
 * </exif:DigitalZoomRatio>
 * this is called for the <rdf:value> and for the <foo:someQualifier>.
 *
 * @param string $elm Namespace and tag name separated by a space.
 */
private function startElementModeQDesc( $elm ): void {
	if ( $elm !== self::NS_RDF . ' value' ) {
		// Not rdf:value, so it is a qualifier: push it as an ignored element.
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $elm );
	}
	// For rdf:value nothing needs to happen on open.
}
1054 | |
/**
 * Opening element while in MODE_INITIAL.
 * Usually this is an element directly inside the outer rdf:Description,
 * which is where most properties begin.
 *
 * @param string $ns Namespace
 * @param string $tag Tag name (without namespace prefix)
 * @param array $attribs Array of attributes
 * @throws RuntimeException
 */
private function startElementModeInitial( $ns, $tag, $attribs ): void {
	if ( $ns === self::NS_RDF ) {
		// RDF elements only contribute their attributes here.
		$this->doAttribs( $attribs );

		return;
	}

	$itemKey = $ns . ' ' . $tag;

	if ( !isset( $this->items[$ns][$tag] ) ) {
		// Not on the list of allowed elements, so ignore it.
		$this->logger->debug( __METHOD__ . ' Ignoring unrecognized element <{element}>.',
			[ 'element' => "$ns:$tag", 'file' => $this->filename ] );
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $itemKey );

		return;
	}

	$info = $this->items[$ns][$tag];

	if ( isset( $info['structPart'] ) ) {
		// This element is only meant to appear as a child of a structure.
		// Finding it here means something weird is going on, so ignore
		// the element together with its children.
		$this->logger->info(
			'Encountered <{element}> outside of its expected parent. Ignoring.',
			[ 'element' => "$ns:$tag", 'file' => $this->filename ]
		);

		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $itemKey );

		return;
	}

	$mode = $info['mode'];
	array_unshift( $this->mode, $mode );
	array_unshift( $this->curItem, $itemKey );

	if ( $mode === self::MODE_STRUCT ) {
		// Remember under which name the struct's fields get saved.
		$this->ancestorStruct = $info['map_name'] ?? $tag;
	}

	if ( $this->charContent !== false ) {
		// Something weird. Should not happen in valid XMP.
		throw new RuntimeException( 'tag nested in non-whitespace characters.' );
	}

	// process attributes
	$this->doAttribs( $attribs );
}
1110 | |
/**
 * Opening element while inside a struct (MODE_STRUCT).
 * Generally these are the fields of a compound property.
 *
 * Example of a struct (abbreviated; flash has more properties):
 *
 * <exif:Flash> <rdf:Description> <exif:Fired>True</exif:Fired>
 * <exif:Mode>1</exif:Mode></rdf:Description></exif:Flash>
 *
 * or:
 *
 * <exif:Flash rdf:parseType='Resource'> <exif:Fired>True</exif:Fired>
 * <exif:Mode>1</exif:Mode></exif:Flash>
 *
 * @param string $ns Namespace
 * @param string $tag Tag name (no ns)
 * @param array $attribs Array of attribs w/ values.
 * @throws RuntimeException
 */
private function startElementModeStruct( $ns, $tag, $attribs ): void {
	if ( $ns === self::NS_RDF ) {
		if ( $tag === 'Description' ) {
			// Inner rdf:Description: read its attributes, then nest one
			// more struct level for the same item.
			$this->doAttribs( $attribs );
			array_unshift( $this->mode, self::MODE_STRUCT );
			array_unshift( $this->curItem, $this->curItem[0] );
		}

		return;
	}

	$itemKey = $ns . ' ' . $tag;

	if ( !isset( $this->items[$ns][$tag] ) ) {
		// Unknown field: ignore it.
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $itemKey );

		return;
	}

	// This assumes that we don't have inter-namespace nesting,
	// which holds for all the properties we're interested in.
	$allowedChildren = $this->items[$ns][$this->ancestorStruct]['children'] ?? null;
	if ( $allowedChildren !== null && !isset( $allowedChildren[$tag] ) ) {
		throw new RuntimeException( " <$tag> appeared nested in <" . $this->ancestorStruct
			. "> where it is not allowed." );
	}

	array_unshift( $this->mode, $this->items[$ns][$tag]['mode'] );
	array_unshift( $this->curItem, $itemKey );

	if ( $this->charContent !== false ) {
		// Something weird. Should not happen in valid XMP.
		throw new RuntimeException( "tag <$tag> nested in non-whitespace characters (" .
			$this->charContent . ")." );
	}
}
1163 | |
/**
 * opening element in MODE_LI
 * process elements of arrays.
 *
 * Example:
 * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li>
 * </rdf:Seq> </exif:ISOSpeedRatings>
 * This method is called when we hit the <rdf:li> element.
 *
 * @param string $elm Namespace . ' ' . tagname
 * @param array $attribs Attributes. (needed for BAGSTRUCTS)
 * @throws RuntimeException If it gets a tag other than <rdf:li>
 */
private function startElementModeLi( $elm, $attribs ): void {
	if ( $elm !== self::NS_RDF . ' li' ) {
		throw new RuntimeException( "<rdf:li> expected but got $elm." );
	}

	if ( !isset( $this->mode[1] ) ) {
		// This should never ever ever happen. Checking for it
		// to be paranoid.
		throw new RuntimeException( 'In mode Li, but no 2xPrevious mode!' );
	}

	if ( $this->mode[1] === self::MODE_BAGSTRUCT ) {
		// This list item contains a compound (STRUCT) value.
		array_unshift( $this->mode, self::MODE_STRUCT );
		array_unshift( $this->curItem, $elm );
		$this->processingArray = true;

		if ( !isset( $this->curItem[1] ) ) {
			// be paranoid.
			throw new RuntimeException( 'Can not find parent of BAGSTRUCT.' );
		}
		// Split on the first space only, like every other namespace/tag
		// split in this class, so the same item always yields the same pair.
		[ $curNS, $curTag ] = explode( ' ', $this->curItem[1], 2 );
		$this->ancestorStruct = $this->items[$curNS][$curTag]['map_name'] ?? $curTag;

		$this->doAttribs( $attribs );
	} else {
		// Normal BAG or SEQ containing simple values.
		array_unshift( $this->mode, self::MODE_SIMPLE );
		// need to add curItem[0] on again since one is for the specific item
		// and one is for the entire group.
		array_unshift( $this->curItem, $this->curItem[0] );
		$this->processingArray = true;
	}
}
1211 | |
/**
 * Opening element while in MODE_LI_LANG.
 * Handles the entries of a language alternative.
 *
 * Example:
 * <dc:title> <rdf:Alt> <rdf:li xml:lang="x-default">My house
 * </rdf:li> </rdf:Alt> </dc:title>
 *
 * In that example this method handles the <rdf:li> element.
 *
 * @param string $elm Namespace . ' ' . tag
 * @param array $attribs Array of elements (most importantly xml:lang)
 * @throws RuntimeException If it gets a tag other than <rdf:li> or if no xml:lang
 */
private function startElementModeLiLang( $elm, $attribs ): void {
	if ( $elm !== self::NS_RDF . ' li' ) {
		throw new RuntimeException( __METHOD__ . " <rdf:li> expected but got $elm." );
	}

	$langKey = self::NS_XML . ' lang';
	$lang = $attribs[$langKey] ?? null;
	if ( $lang === null || !preg_match( '/^[-A-Za-z0-9]{2,}$/D', $lang ) ) {
		throw new RuntimeException( __METHOD__
			. " <rdf:li> did not contain, or has invalid xml:lang attribute in lang alternative" );
	}

	// Lang is case-insensitive, so normalise it to lower case.
	$this->itemLang = strtolower( $lang );

	$this->processingArray = true;
	array_unshift( $this->mode, self::MODE_SIMPLE );
	// curItem[0] is pushed a second time: one entry stands for this
	// specific item, the other for the group as a whole.
	array_unshift( $this->curItem, $this->curItem[0] );
}
1246 | |
/**
 * Callback for an opening element.
 * Mostly dispatches to the helper matching the current MODE, after
 * doing some initial set up for the wrapper element.
 *
 * @param resource $parser
 * @param string $elm Namespace "<space>" element
 * @param array $attribs Attribute name => value
 * @throws RuntimeException
 */
public function startElement( $parser, $elm, $attribs ): void {
	$isWrapper = $elm === self::NS_RDF . ' RDF'
		|| $elm === 'adobe:ns:meta/ xmpmeta'
		|| $elm === 'adobe:ns:meta/ xapmeta';
	if ( $isWrapper ) {
		/* ignore. */
		return;
	}

	if ( $elm === self::NS_RDF . ' Description' && count( $this->mode ) === 0 ) {
		// outer rdf:desc
		array_unshift( $this->mode, self::MODE_INITIAL );
	} elseif ( $elm === self::NS_RDF . ' type' ) {
		// rdf:type is not properly supported. In practice no file using
		// it has been seen, though it is mentioned on page 25 of part 1
		// of the XMP standard. exiv2 and exiftool do not seem to support
		// it either (unless the standard is being misread).
		$this->logger->info(
			__METHOD__ . ' Encountered <rdf:type> which isn\'t currently supported',
			[ 'file' => $this->filename ]
		);
	}

	$hasNamespace = strpos( $elm, ' ' ) !== false;
	if ( !$hasNamespace ) {
		// This probably shouldn't happen.
		$this->logger->info(
			__METHOD__ . " Encountered <$elm> which has no namespace. Skipping.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	[ $ns, $tag ] = explode( ' ', $elm, 2 );

	if ( count( $this->mode ) === 0 ) {
		// This should not happen.
		throw new RuntimeException( 'Error extracting XMP, '
			. "encountered <$elm> with no mode" );
	}

	switch ( $this->mode[0] ) {
		case self::MODE_IGNORE:
			$this->startElementModeIgnore( $elm );
			break;

		case self::MODE_SIMPLE:
			$this->startElementModeSimple( $elm, $attribs );
			break;

		case self::MODE_INITIAL:
			$this->startElementModeInitial( $ns, $tag, $attribs );
			break;

		case self::MODE_STRUCT:
			$this->startElementModeStruct( $ns, $tag, $attribs );
			break;

		case self::MODE_BAG:
		case self::MODE_BAGSTRUCT:
			$this->startElementModeBag( $elm );
			break;

		case self::MODE_SEQ:
			$this->startElementModeSeq( $elm );
			break;

		case self::MODE_LANG:
			$this->startElementModeLang( $elm );
			break;

		case self::MODE_LI_LANG:
			$this->startElementModeLiLang( $elm, $attribs );
			break;

		case self::MODE_LI:
			$this->startElementModeLi( $elm, $attribs );
			break;

		case self::MODE_QDESC:
			$this->startElementModeQDesc( $elm );
			break;

		default:
			throw new RuntimeException( 'StartElement in unknown mode: ' . $this->mode[0] );
	}
}
1338 | |
/**
 * Process the attributes of an element.
 * A simple value may be written either as a child tag or as an attribute.
 *
 * Often the initial "<rdf:Description>" tag simply carries all the simple
 * properties as attributes.
 *
 * @par Example:
 * @code
 * <rdf:Description rdf:about=""
 * xmlns:exif="http://ns.adobe.com/exif/1.0/" exif:DigitalZoomRatio="0/10">
 * @endcode
 *
 * @param array $attribs Array attribute=>value
 * @throws RuntimeException
 */
private function doAttribs( $attribs ): void {
	// rdf:parseType is looked at first, because it can change how the
	// remaining attributes are interpreted.
	$parseType = $attribs[self::NS_RDF . ' parseType'] ?? null;
	if ( $parseType === 'Resource' && $this->mode[0] === self::MODE_SIMPLE ) {
		// this is equivalent to having an inner rdf:Description
		$this->mode[0] = self::MODE_QDESC;
	}

	foreach ( $attribs as $name => $val ) {
		if ( strpos( $name, ' ' ) === false ) {
			// This shouldn't happen, but so far some old software forgets
			// the namespace on rdf:about.
			$this->logger->info(
				__METHOD__ . ' Encountered non-namespaced attribute: ' .
				" $name=\"$val\". Skipping. ",
				[ 'file' => $this->filename ]
			);
			continue;
		}

		[ $ns, $tag ] = explode( ' ', $name, 2 );

		if ( $ns === self::NS_RDF ) {
			if ( $tag === 'value' || $tag === 'resource' ) {
				// resource is for url.
				// value attribute is a weird way of just putting the contents.
				$this->char( $this->xmlParser, $val );
			}
			// Other rdf: attributes are silently skipped.
			continue;
		}

		if ( !isset( $this->items[$ns][$tag] ) ) {
			$this->logger->debug(
				__METHOD__ . " Ignoring unrecognized element <$ns:$tag>.",
				[ 'file' => $this->filename ]
			);
			continue;
		}

		if ( $this->mode[0] === self::MODE_SIMPLE ) {
			throw new RuntimeException( __METHOD__
				. " $ns:$tag found as attribute where not allowed" );
		}

		$this->saveValue( $ns, $tag, $val );
	}
}
1398 | |
/**
 * Given an extracted value, save it to results array
 *
 * note also uses $this->ancestorStruct and
 * $this->processingArray to determine what name to
 * save the value under. (in addition to $tag).
 *
 * If the item declares a 'validate' entry, the validator is run first;
 * a value the validator sets to null is logged and not saved.
 *
 * @param string $ns Namespace of tag this is for
 * @param string $tag Tag name
 * @param string $val Value to save
 */
private function saveValue( $ns, $tag, $val ): void {
	$info =& $this->items[$ns][$tag];
	$finalName = $info['map_name'] ?? $tag;

	if ( isset( $info['validate'] ) ) {
		if ( is_array( $info['validate'] ) ) {
			// Already in [ target, method ] form.
			$validate = $info['validate'];
		} else {
			// A bare method name refers to a method of the Validate helper.
			$validator = new Validate( $this->logger );
			$validate = [ $validator, $info['validate'] ];
		}

		if ( is_callable( $validate ) ) {
			call_user_func_array( $validate, [ $info, &$val, true ] );
			// the reasoning behind using &$val instead of using the return value
			// is to be consistent between here and validating structures.
			if ( $val === null ) {
				$this->logger->info(
					__METHOD__ . " <$ns:$tag> failed validation.",
					[ 'file' => $this->filename ]
				);

				return;
			}
		} else {
			// The target may be an object or a class-name string; get_class()
			// only accepts objects, so describe a non-object target directly
			// rather than failing while reporting the problem.
			$target = $validate[0] ?? '';
			$owner = is_object( $target ) ? get_class( $target ) : (string)$target;
			$method = $validate[1] ?? '';
			$this->logger->warning(
				__METHOD__ . " Validation function for $finalName (" .
				$owner . '::' . $method . '()) is not callable.',
				[ 'file' => $this->filename ]
			);
		}
	}

	$group = 'xmp-' . $info['map_group'];

	if ( $this->ancestorStruct && $this->processingArray ) {
		// Aka both an array and a struct. ( self::MODE_BAGSTRUCT )
		$this->results[$group][$this->ancestorStruct][][$finalName] = $val;
	} elseif ( $this->ancestorStruct ) {
		// Field of a plain struct.
		$this->results[$group][$this->ancestorStruct][$finalName] = $val;
	} elseif ( $this->processingArray ) {
		if ( $this->itemLang === false ) {
			// normal array
			$this->results[$group][$finalName][] = $val;
		} else {
			// lang array, keyed by the language of the current item.
			$this->results[$group][$finalName][$this->itemLang] = $val;
		}
	} else {
		// Plain simple value.
		$this->results[$group][$finalName] = $val;
	}
}
1459 | } |