MediaWiki REL1_31
XMP.php
Go to the documentation of this file.
1<?php
24use Psr\Log\LoggerAwareInterface;
25use Psr\Log\LoggerInterface;
26use Psr\Log\NullLogger;
27use Wikimedia\ScopedCallback;
28
53class XMPReader implements LoggerAwareInterface {
/** @var array Item definitions, filled from XMPInfo::getItems() by the constructor */
protected $items;

/** @var array Array to hold the current element (and previous element, and so on) */
private $curItem = [];

/** @var bool|string Result key of the structure currently being filled, false when not in one */
private $ancestorStruct = false;

/** @var bool|string Temporary holder for character data that appears in xmp doc */
private $charContent = false;

/** @var array Stack of parser modes; index 0 is the current one */
private $mode = [];

/** @var array Array to hold results */
private $results = [];

/** @var bool Set while the rdf:li children of an array-like property are handled */
private $processingArray = false;

/** @var bool|string Used for lang alts only */
private $itemLang = false;

/** @var resource|null Parser handle created by xml_parser_create_ns() */
private $xmlParser;

/** @var bool|string Character set like 'UTF-8' */
private $charset = false;

/** @var int Offset the next ExtendedXMP block is expected to start at */
private $extendedXMPOffset = 0;

/** @var int One of the PARSABLE_* states */
private $parsable = 0;

/** @var string Content held back while no root element or doctype has been seen yet */
private $xmlParsableBuffer = '';
92
// Parser-only modes (pushed onto the $mode stack while walking the document).
const MODE_INITIAL = 0;
const MODE_IGNORE = 1;
const MODE_LI = 2;
const MODE_LI_LANG = 3;
const MODE_QDESC = 4;

// The following MODE constants are also used in the
// $items array to denote what type of property the item is.
const MODE_SIMPLE = 10;
const MODE_STRUCT = 11; // structure (associative array)
const MODE_SEQ = 12; // ordered list
const MODE_BAG = 13; // unordered list
const MODE_LANG = 14;
const MODE_ALT = 15; // non-language alt. Currently not implemented, and not needed atm.
const MODE_BAGSTRUCT = 16; // A BAG of Structs.

const NS_RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
const NS_XML = 'http://www.w3.org/XML/1998/namespace';

// States used while determining if XML is safe to parse
// Nothing has been checked yet (matches the initial value of $parsable).
const PARSABLE_UNKNOWN = 0;
// A root element was reached without meeting a doctype declaration.
const PARSABLE_OK = 1;
// Neither an element nor a doctype seen so far; content is being buffered.
const PARSABLE_BUFFERING = 2;
// A doctype declaration was found; the content must not be parsed.
const PARSABLE_NO = 3;
126
/** @var LoggerInterface Receives the diagnostics emitted while parsing */
private $logger;

/** @var string File name given to the constructor; passed as 'file' context in log entries */
private $filename;
136
/**
 * Create a reader with a fresh XML parser and the item definitions loaded.
 *
 * @param LoggerInterface|null $logger Logger to use; a NullLogger is used when omitted
 * @param string $filename Name reported in log context
 * @throws RuntimeException When the XML parser extension is unavailable
 */
function __construct( LoggerInterface $logger = null, $filename = 'unknown' ) {
	if ( !function_exists( 'xml_parser_create_ns' ) ) {
		// this should already be checked by this point
		throw new RuntimeException( 'XMP support requires XML Parser' );
	}

	$this->setLogger( $logger ?: new NullLogger() );
	$this->filename = $filename;
	$this->items = XMPInfo::getItems();

	$this->resetXMLParser();
}
158
/**
 * Store the logger used for all diagnostics of this reader.
 *
 * @param LoggerInterface $logger
 */
public function setLogger( LoggerInterface $logger ) {
	$this->logger = $logger;
}
162
/**
 * Free the XML parser, if one is currently held, and forget the handle.
 */
private function destroyXMLParser() {
	if ( !$this->xmlParser ) {
		return;
	}

	xml_parser_free( $this->xmlParser );
	$this->xmlParser = null;
}
176
/**
 * Throw away any existing parser and set up a new namespace-aware one.
 *
 * The new parser reports elements as "<namespace> <local name>" (a space is
 * the separator), keeps element-name case, and calls back into this object.
 * The parse-safety state and its buffer are reset as well.
 */
private function resetXMLParser() {
	$this->destroyXMLParser();

	$parser = xml_parser_create_ns( 'UTF-8', ' ' );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, 0 );
	xml_parser_set_option( $parser, XML_OPTION_SKIP_WHITE, 1 );
	xml_set_element_handler( $parser, [ $this, 'startElement' ], [ $this, 'endElement' ] );
	xml_set_character_data_handler( $parser, [ $this, 'char' ] );
	$this->xmlParser = $parser;

	$this->xmlParsableBuffer = '';
	$this->parsable = self::PARSABLE_UNKNOWN;
}
197
/**
 * Check whether the PHP features this class relies on are available.
 *
 * @return bool True when both xml_parser_create_ns() and the XMLReader class exist
 */
public static function isSupported() {
	if ( !function_exists( 'xml_parser_create_ns' ) ) {
		return false;
	}

	return class_exists( 'XMLReader' );
}
205
/**
 * Get the result array, post-processed for storage.
 *
 * Works on a copy of the collected results: folds the 'xmp-special' helper
 * values into the regular groups, removes the 'xmp-special' group, and turns
 * GPSAltitude into a signed number.
 *
 * @return array Results keyed by group ('xmp-general', 'xmp-exif', ...)
 */
public function getResults() {
	// xmp-special is for metadata that affects how stuff
	// is extracted. For example xmpNote:HasExtendedXMP.
	$data = $this->results;

	// photoshop:AuthorsPosition is weird and really part of another property,
	// see 2:85 in IPTC. See also pg 21 of IPTC4XMP standard.
	if ( isset( $data['xmp-special']['AuthorsPosition'], $data['xmp-general']['Artist'][0] )
		&& is_string( $data['xmp-special']['AuthorsPosition'] )
	) {
		// Note, if there is more than one creator, this only applies to
		// the first. This also will only apply to the dc:Creator prop,
		// not the exif:Artist prop.
		$position = $data['xmp-special']['AuthorsPosition'];
		$data['xmp-general']['Artist'][0] = $position . ', ' . $data['xmp-general']['Artist'][0];
	}

	// Go through the LocationShown and LocationCreated changing them to the
	// non-hierarchal form used by the other location fields.
	$suffixByProperty = [
		'LocationShown' => 'Dest',
		'LocationCreated' => 'Created',
	];
	foreach ( $suffixByProperty as $property => $suffix ) {
		// the is_array is just paranoia. It should always be an array.
		if ( !isset( $data['xmp-special'][$property][0] )
			|| !is_array( $data['xmp-special'][$property][0] )
		) {
			continue;
		}
		foreach ( $data['xmp-special'][$property] as $loc ) {
			if ( !is_array( $loc ) ) {
				// To avoid copying over the _type meta-fields.
				continue;
			}
			foreach ( $loc as $field => $val ) {
				$data['xmp-general'][$field . $suffix][] = $val;
			}
		}
	}

	// We don't want to return the special values, since they're
	// special and not info to be stored about the file.
	unset( $data['xmp-special'] );

	// Convert GPSAltitude to negative if below sea level.
	if ( isset( $data['xmp-exif']['GPSAltitudeRef'], $data['xmp-exif']['GPSAltitude'] ) ) {
		// Must convert to a real before multiplying by -1
		// XMPValidate guarantees there will always be a '/' in this value.
		list( $nom, $denom ) = explode( '/', $data['xmp-exif']['GPSAltitude'] );
		$altitude = $nom / $denom;

		if ( $data['xmp-exif']['GPSAltitudeRef'] == '1' ) {
			$altitude *= -1;
		}
		$data['xmp-exif']['GPSAltitude'] = $altitude;
		unset( $data['xmp-exif']['GPSAltitudeRef'] );
	}

	return $data;
}
293
/**
 * Main entry point: feed a chunk of XMP content to the parser.
 *
 * On the first call the character set is detected from a byte order mark;
 * non-UTF-8 content is converted to UTF-8. Until a root element has been
 * seen the content is checked for a doctype declaration, which is refused
 * (T85848/T71210). Any failure is logged, clears the results and makes the
 * method return false.
 *
 * @param string $content XMP data (or the next chunk of it)
 * @param bool $allOfIt Whether this is the final (or only) chunk
 * @return bool False on error; true otherwise, including when a non-final
 *  chunk was only buffered because no root element has been seen yet
 */
public function parse( $content, $allOfIt = true ) {
	if ( !$this->xmlParser ) {
		$this->resetXMLParser();
	}
	try {
		// detect encoding by looking for BOM which is supposed to be in processing instruction.
		// see page 12 of http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart3.pdf
		if ( !$this->charset ) {
			$this->charset = self::detectCharset( $content );
		}
		if ( $this->charset !== 'UTF-8' ) {
			// don't convert if already utf-8
			Wikimedia\suppressWarnings();
			$converted = iconv( $this->charset, 'UTF-8//IGNORE', $content );
			Wikimedia\restoreWarnings();
			if ( $converted === false ) {
				// Do not hand `false` to the parser as if it were content.
				throw new RuntimeException(
					"Could not convert XMP content from {$this->charset} to UTF-8."
				);
			}
			$content = $converted;
		}

		// Ensure the XMP block does not have an xml doctype declaration, which
		// could declare entities unsafe to parse with xml_parse (T85848/T71210).
		if ( $this->parsable !== self::PARSABLE_OK ) {
			if ( $this->parsable === self::PARSABLE_NO ) {
				throw new RuntimeException( 'Unsafe doctype declaration in XML.' );
			}

			$content = $this->xmlParsableBuffer . $content;
			if ( !$this->checkParseSafety( $content ) ) {
				if ( !$allOfIt && $this->parsable !== self::PARSABLE_NO ) {
					// parse wasn't Unsuccessful yet, so return true
					// in this case.
					return true;
				}
				$msg = ( $this->parsable === self::PARSABLE_NO ) ?
					'Unsafe doctype declaration in XML.' :
					'No root element found in XML.';
				throw new RuntimeException( $msg );
			}
		}

		$ok = xml_parse( $this->xmlParser, $content, $allOfIt );
		if ( !$ok ) {
			$code = xml_get_error_code( $this->xmlParser );
			$error = xml_error_string( $code );
			$line = xml_get_current_line_number( $this->xmlParser );
			$col = xml_get_current_column_number( $this->xmlParser );
			$offset = xml_get_current_byte_index( $this->xmlParser );

			$this->logger->info(
				'{method} : Error reading XMP content: {error} ' .
				'(file: {file}, line: {line} column: {column} ' .
				'byte offset: {offset})',
				[
					'method' => __METHOD__,
					'error_code' => $code,
					'error' => $error,
					'file' => $this->filename,
					'line' => $line,
					'column' => $col,
					'offset' => $offset,
					'content' => $content,
				] );
			$this->results = []; // blank if error.
			$this->destroyXMLParser();
			return false;
		}
	} catch ( Exception $e ) {
		$this->logger->warning(
			'{method} {exception}',
			[
				'method' => __METHOD__,
				'exception' => $e,
				'file' => $this->filename,
				'content' => $content,
			]
		);
		$this->results = [];
		return false;
	}
	if ( $allOfIt ) {
		$this->destroyXMLParser();
	}

	return true;
}

/**
 * Work out the character set of XMP content from the first byte order mark in it.
 *
 * @param string $content
 * @return string Character set name; 'UTF-8' when no BOM is present
 * @throws RuntimeException If the matched BOM is not a known one
 */
private static function detectCharset( $content ) {
	$bom = [];
	if ( !preg_match( '/\xEF\xBB\xBF|\xFE\xFF|\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\xFF\xFE/',
		$content, $bom )
	) {
		// standard specifically says, if no bom assume utf-8
		return 'UTF-8';
	}

	$charsetByBom = [
		"\xFE\xFF" => 'UTF-16BE',
		"\xFF\xFE" => 'UTF-16LE',
		"\x00\x00\xFE\xFF" => 'UTF-32BE',
		"\xFF\xFE\x00\x00" => 'UTF-32LE',
		"\xEF\xBB\xBF" => 'UTF-8',
	];
	if ( !isset( $charsetByBom[$bom[0]] ) ) {
		// this should be impossible to get to
		throw new RuntimeException( "Invalid BOM" );
	}

	return $charsetByBom[$bom[0]];
}
417
/**
 * Entry point for XMPExtended blocks in jpeg files.
 *
 * A block is laid out as: 32 bytes of GUID, a 32-bit big-endian full length
 * of the ExtendedXMP serialization, a 32-bit big-endian offset of this
 * portion, then the portion itself. Blocks must arrive in order; a block
 * with the wrong GUID, a bad header or an unexpected offset is ignored.
 *
 * @todo FIXME: This is untested. Hard to find example files
 *   or programs that make such files..
 * @param string $content XMPExtended block minus the namespace signature
 * @return bool If it succeeded.
 */
public function parseExtended( $content ) {
	$guid = substr( $content, 0, 32 );
	if ( !isset( $this->results['xmp-special']['HasExtendedXMP'] )
		|| $this->results['xmp-special']['HasExtendedXMP'] !== $guid
	) {
		$this->logger->info( __METHOD__ .
			" Ignoring XMPExtended block due to wrong guid (guid= '{guid}')",
			[
				'guid' => $guid,
				'file' => $this->filename,
			]
		);

		return false;
	}

	// Only unpack a complete 8 byte header; a short one would make unpack() warn.
	$header = (string)substr( $content, 32, 8 );
	$len = strlen( $header ) === 8 ? unpack( 'Nlength/Noffset', $header ) : false;

	if ( !$len ||
		$len['length'] < 4 ||
		$len['offset'] < 0 ||
		$len['offset'] > $len['length']
	) {
		$this->logger->info(
			__METHOD__ . 'Error reading extended XMP block, invalid length or offset.',
			[ 'file' => $this->filename ]
		);

		return false;
	}

	// we're not very robust here. we should accept it in the wrong order.
	// To quote the XMP standard:
	// "A JPEG writer should write the ExtendedXMP marker segments in order,
	// immediately following the StandardXMP. However, the JPEG standard
	// does not require preservation of marker segment order. A robust JPEG
	// reader should tolerate the marker segments in any order."
	// On the other hand, the probability that an image will have more than
	// 128k of metadata is rather low... so the probability that it will have
	// > 128k, and be in the wrong order is very low...

	if ( $len['offset'] !== $this->extendedXMPOffset ) {
		$this->logger->info( __METHOD__ . 'Ignoring XMPExtended block due to wrong order. (Offset was '
			. $len['offset'] . ' but expected ' . $this->extendedXMPOffset . ')',
			[ 'file' => $this->filename ]
		);

		return false;
	}

	if ( $len['offset'] === 0 ) {
		// if we're starting the extended block, we've probably already
		// done the XMPStandard block, so reset.
		$this->resetXMLParser();
	}

	// substr() may return false (rather than '') for an empty portion.
	$actualContent = (string)substr( $content, 40 );

	// 'length' is the size of the whole ExtendedXMP serialization, not of
	// this portion, so the next expected offset advances by the number of
	// bytes actually received. The serialization is complete once that
	// running offset reaches the announced full length.
	$this->extendedXMPOffset += strlen( $actualContent );
	$atEnd = $this->extendedXMPOffset >= $len['length'];

	$this->logger->debug(
		__METHOD__ . 'Parsing a XMPExtended block',
		[ 'file' => $this->filename ]
	);

	return $this->parse( $actualContent, $atEnd );
}
499
/**
 * Character data handler.
 * Called whenever character data is found in the xmp document.
 *
 * Whitespace-only data is skipped; anything else is trimmed and appended to
 * $charContent, provided the current mode accepts text.
 *
 * @param resource $parser XML parser handle (unused here)
 * @param string $data Character data
 * @throws RuntimeException On text before any mode is set, or in a mode
 *  other than MODE_IGNORE, MODE_SIMPLE or MODE_QDESC
 */
function char( $parser, $data ) {
	$data = trim( $data );
	if ( $data === "" ) {
		return;
	}

	if ( !isset( $this->mode[0] ) ) {
		throw new RuntimeException( 'Unexpected character data before first rdf:Description element' );
	}

	$current = $this->mode[0];
	if ( $current === self::MODE_IGNORE ) {
		return;
	}

	$acceptsText = $current === self::MODE_SIMPLE || $current === self::MODE_QDESC;
	if ( !$acceptsText ) {
		throw new RuntimeException( 'character data where not expected. (mode ' . $current . ')' );
	}

	// to check, how does this handle w.s.
	$this->charContent = ( $this->charContent === false )
		? $data
		: $this->charContent . $data;
}
543
/**
 * Decide whether the content may be handed to xml_parse().
 *
 * Reads nodes with XMLReader until the first element (safe) or a doctype
 * declaration (unsafe) shows up and records the verdict in $parsable. If the
 * readable part ends before either is found, the content is stored in
 * $xmlParsableBuffer so that it can be checked again with more data.
 *
 * @param string $content XML seen so far
 * @return bool True only when an element was reached before any doctype
 */
private function checkParseSafety( $content ) {
	$reader = new XMLReader();

	// For XMLReader to parse incomplete/invalid XML, it has to be open()'ed
	// instead of using XML().
	$reader->open(
		'data://text/plain,' . urlencode( $content ),
		null,
		LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET
	);

	$oldDisable = libxml_disable_entity_loader( true );
	// Held only so the previous entity-loader setting is restored when it goes out of scope.
	$reset = new ScopedCallback(
		'libxml_disable_entity_loader',
		[ $oldDisable ]
	);
	$reader->setParserProperty( XMLReader::SUBST_ENTITIES, false );

	$verdict = null;

	// Even with LIBXML_NOWARNING set, XMLReader::read gives a warning
	// when parsing truncated XML, which causes unit tests to fail.
	Wikimedia\suppressWarnings();
	while ( $verdict === null && $reader->read() ) {
		if ( $reader->nodeType === XMLReader::ELEMENT ) {
			// Reached the first element without hitting a doctype declaration
			$verdict = true;
		} elseif ( $reader->nodeType === XMLReader::DOC_TYPE ) {
			$verdict = false;
		}
	}
	Wikimedia\restoreWarnings();

	if ( $verdict === true ) {
		$this->parsable = self::PARSABLE_OK;
		return true;
	}
	if ( $verdict === false ) {
		$this->parsable = self::PARSABLE_NO;
		return false;
	}

	// Reached the end of the parsable xml without finding an element
	// or doctype. Buffer and try again.
	$this->parsable = self::PARSABLE_BUFFERING;
	$this->xmlParsableBuffer = $content;
	return false;
}
600
/**
 * Closing element while in MODE_IGNORE.
 *
 * Leaves the ignored state only when the closing element is the one recorded
 * at the top of $curItem; other closing elements are skipped.
 *
 * @param string $elm Namespace and element name, separated by a space
 */
private function endElementModeIgnore( $elm ) {
	if ( $this->curItem[0] !== $elm ) {
		return;
	}

	array_shift( $this->curItem );
	array_shift( $this->mode );
}
613
/**
 * Hit a closing element when in MODE_SIMPLE.
 *
 * Saves any collected character data as the value of the property, then pops
 * the current item and mode.
 *
 * @param string $elm Namespace and element name, separated by a space
 */
private function endElementModeSimple( $elm ) {
	if ( $this->charContent !== false ) {
		// if we're processing an array, use the original element
		// name instead of rdf:li.
		$owner = $this->processingArray ? $this->curItem[0] : $elm;
		list( $ns, $tag ) = explode( ' ', $owner, 2 );

		$this->saveValue( $ns, $tag, $this->charContent );
		$this->charContent = false; // reset
	}

	array_shift( $this->curItem );
	array_shift( $this->mode );
}
645
/**
 * Hit a closing element of a nested (structure or array-like) property.
 *
 * Checks the nesting, runs the property's validator (if any) over the
 * collected value, removes the value when validation nulls it, then pops the
 * current item and mode and clears the struct/array/lang state.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @throws RuntimeException On a nesting mismatch
 */
private function endElementNested( $elm ) {
	/* cur item must be the same as $elm, unless if in MODE_STRUCT
	 * in which case it could also be rdf:Description */
	if ( $this->curItem[0] !== $elm
		&& !( $elm === self::NS_RDF . ' Description'
			&& $this->mode[0] === self::MODE_STRUCT )
	) {
		throw new RuntimeException( "nesting mismatch. got a </$elm> but expected a </" .
			$this->curItem[0] . '>' );
	}

	// Validate structures.
	list( $ns, $tag ) = explode( ' ', $elm, 2 );
	if ( isset( $this->items[$ns][$tag]['validate'] ) ) {
		$info =& $this->items[$ns][$tag];
		$finalName = isset( $info['map_name'] )
			? $info['map_name'] : $tag;

		if ( is_array( $info['validate'] ) ) {
			$validate = $info['validate'];
		} else {
			$validator = new XMPValidate( $this->logger );
			$validate = [ $validator, $info['validate'] ];
		}

		if ( !isset( $this->results['xmp-' . $info['map_group']][$finalName] ) ) {
			// This can happen if all the members of the struct failed validation.
			$this->logger->debug(
				__METHOD__ . " <$ns:$tag> has no valid members.",
				[ 'file' => $this->filename ]
			);
		} elseif ( is_callable( $validate ) ) {
			$val =& $this->results['xmp-' . $info['map_group']][$finalName];
			call_user_func_array( $validate, [ $info, &$val, false ] );
			if ( is_null( $val ) ) {
				// the idea being the validation function will unset the variable if
				// its invalid.
				$this->logger->info(
					__METHOD__ . " <$ns:$tag> failed validation.",
					[ 'file' => $this->filename ]
				);
				unset( $this->results['xmp-' . $info['map_group']][$finalName] );
			}
		} else {
			// $validate[0] may be an XMPValidate instance, which cannot be
			// concatenated into a string; report its class name instead.
			$owner = is_object( $validate[0] ) ? get_class( $validate[0] ) : $validate[0];
			$this->logger->warning(
				__METHOD__ . " Validation function for $finalName (" .
				$owner . '::' . $validate[1] . '()) is not callable.',
				[ 'file' => $this->filename ]
			);
		}
	}

	array_shift( $this->curItem );
	array_shift( $this->mode );
	$this->ancestorStruct = false;
	$this->processingArray = false;
	$this->itemLang = false;
}
722
/**
 * Hit a closing rdf:Seq, rdf:Bag or rdf:Alt while in a list-item mode.
 *
 * Pops the mode and, if the property collected any values, tags the result
 * with a '_type' of 'ol', 'ul' or 'lang'.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @throws RuntimeException If $elm is none of the three container elements
 */
private function endElementModeLi( $elm ) {
	list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
	$info = $this->items[$ns][$tag];
	$finalName = isset( $info['map_name'] ) ? $info['map_name'] : $tag;
	$group = 'xmp-' . $info['map_group'];

	array_shift( $this->mode );

	if ( !isset( $this->results[$group][$finalName] ) ) {
		$this->logger->debug(
			__METHOD__ . " Empty compund element $finalName.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	if ( $elm === self::NS_RDF . ' Seq' ) {
		$this->results[$group][$finalName]['_type'] = 'ol';
		return;
	}
	if ( $elm === self::NS_RDF . ' Bag' ) {
		$this->results[$group][$finalName]['_type'] = 'ul';
		return;
	}
	if ( $elm !== self::NS_RDF . ' Alt' ) {
		throw new RuntimeException(
			__METHOD__ . " expected </rdf:seq> or </rdf:bag> but instead got $elm."
		);
	}

	// extra if needed as you could theoretically have a non-language alt.
	if ( $info['mode'] === self::MODE_LANG ) {
		$this->results[$group][$finalName]['_type'] = 'lang';
	}
}
774
/**
 * Hit a closing element while in MODE_QDESC.
 *
 * A closing rdf:value stores the collected character data for the current
 * item; any other closing element pops the mode and the current item.
 *
 * @param string $elm Namespace and element name, separated by a space
 */
private function endElementModeQDesc( $elm ) {
	if ( $elm !== self::NS_RDF . ' value' ) {
		array_shift( $this->mode );
		array_shift( $this->curItem );

		return;
	}

	list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
	$this->saveValue( $ns, $tag, $this->charContent );
}
796
/**
 * Handler for hitting a closing element.
 *
 * Skips the wrapper elements and elements without a namespace, then
 * dispatches to the end-element method for the current mode.
 *
 * @param resource $parser XML parser handle (unused here)
 * @param string $elm Namespace and element name, separated by a space
 * @throws RuntimeException If no mode is set, or there is no current item
 *  outside MODE_INITIAL
 */
function endElement( $parser, $elm ) {
	if ( $elm === ( self::NS_RDF . ' RDF' )
		|| $elm === 'adobe:ns:meta/ xmpmeta'
		|| $elm === 'adobe:ns:meta/ xapmeta'
	) {
		// ignore these.
		return;
	}

	if ( $elm === self::NS_RDF . ' type' ) {
		// these aren't really supported properly yet.
		// However, it appears they almost never used.
		$this->logger->info(
			__METHOD__ . ' encountered <rdf:type>',
			[ 'file' => $this->filename ]
		);
	}

	if ( strpos( $elm, ' ' ) === false ) {
		// This probably shouldn't happen.
		// However, there is a bug in an adobe product
		// that forgets the namespace on some things.
		// (Luckily they are unimportant things).
		$this->logger->info(
			__METHOD__ . " Encountered </$elm> which has no namespace. Skipping.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	if ( count( $this->mode ) === 0 ) {
		// This should never ever happen and means
		// there is a pretty major bug in this class.
		throw new RuntimeException( 'Encountered end element with no mode' );
	}

	if ( count( $this->curItem ) == 0 && $this->mode[0] !== self::MODE_INITIAL ) {
		// just to be paranoid. Should always have a curItem, except for initially
		// (aka during MODE_INITAL).
		throw new RuntimeException( "Hit end element </$elm> but no curItem" );
	}

	switch ( $this->mode[0] ) {
		case self::MODE_IGNORE:
			$this->endElementModeIgnore( $elm );
			break;
		case self::MODE_SIMPLE:
			$this->endElementModeSimple( $elm );
			break;
		case self::MODE_STRUCT:
		case self::MODE_SEQ:
		case self::MODE_BAG:
		case self::MODE_LANG:
		case self::MODE_BAGSTRUCT:
			$this->endElementNested( $elm );
			break;
		case self::MODE_INITIAL:
			if ( $elm === self::NS_RDF . ' Description' ) {
				array_shift( $this->mode );
			} else {
				throw new RuntimeException( 'Element ended unexpectedly while in MODE_INITIAL' );
			}
			break;
		case self::MODE_LI:
		case self::MODE_LI_LANG:
			$this->endElementModeLi( $elm );
			break;
		case self::MODE_QDESC:
			$this->endElementModeQDesc( $elm );
			break;
		default:
			$this->logger->info(
				__METHOD__ . " no mode (elm = $elm)",
				[ 'file' => $this->filename ]
			);
			break;
	}
}
889
/**
 * Hit an opening element while in MODE_IGNORE.
 *
 * Only an element with the same name as the one being ignored is tracked
 * (pushed again), so that its matching closing element does not end the
 * ignored state early. Everything else is skipped.
 *
 * @param string $elm Namespace and element name, separated by a space
 */
private function startElementModeIgnore( $elm ) {
	if ( $elm !== $this->curItem[0] ) {
		return;
	}

	array_unshift( $this->curItem, $elm );
	array_unshift( $this->mode, self::MODE_IGNORE );
}
907
/**
 * Start element in MODE_BAG; the only element accepted is <rdf:Bag>.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @throws RuntimeException If we have an element that's not <rdf:Bag>
 */
private function startElementModeBag( $elm ) {
	if ( $elm !== self::NS_RDF . ' Bag' ) {
		throw new RuntimeException( "Expected <rdf:Bag> but got $elm." );
	}

	array_unshift( $this->mode, self::MODE_LI );
}
922
/**
 * Start element in MODE_SEQ; expects <rdf:Seq> but tolerates <rdf:Bag>.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @throws RuntimeException If we have an element that's neither of the two
 */
private function startElementModeSeq( $elm ) {
	$isSeq = ( $elm === self::NS_RDF . ' Seq' );
	$isBag = ( $elm === self::NS_RDF . ' Bag' );

	if ( !$isSeq && !$isBag ) {
		throw new RuntimeException( "Expected <rdf:Seq> but got $elm." );
	}

	if ( !$isSeq ) {
		# T29105
		$this->logger->info(
			__METHOD__ . ' Expected an rdf:Seq, but got an rdf:Bag. Pretending' .
			' it is a Seq, since some buggy software is known to screw this up.',
			[ 'file' => $this->filename ]
		);
	}

	array_unshift( $this->mode, self::MODE_LI );
}
945
/**
 * Start element in MODE_LANG (language alternative).
 * This should always be <rdf:Alt>.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @throws RuntimeException If we have an element that's not <rdf:Alt>
 */
private function startElementModeLang( $elm ) {
	if ( $elm === self::NS_RDF . ' Alt' ) {
		array_unshift( $this->mode, self::MODE_LI_LANG );
	} else {
		// The message used to name <rdf:Seq>, which is not what is checked here.
		throw new RuntimeException( "Expected <rdf:Alt> but got $elm." );
	}
}
967
/**
 * Handle an opening element when in MODE_SIMPLE.
 *
 * An rdf:Description means the value carries qualifiers, so MODE_QDESC is
 * entered (and an rdf:value attribute, if present, is saved straight away).
 * Any other element is ignored together with its children.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @param array $attribs Attributes of the element
 * @throws RuntimeException On an <rdf:value> element
 */
private function startElementModeSimple( $elm, $attribs ) {
	$valueName = self::NS_RDF . ' value';

	if ( $elm === $valueName ) {
		// This should not be here.
		throw new RuntimeException( __METHOD__ . ' Encountered <rdf:value> where it was unexpected.' );
	}

	if ( $elm === self::NS_RDF . ' Description' ) {
		// If this value has qualifiers
		array_unshift( $this->mode, self::MODE_QDESC );
		array_unshift( $this->curItem, $this->curItem[0] );

		if ( isset( $attribs[$valueName] ) ) {
			list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
			$this->saveValue( $ns, $tag, $attribs[$valueName] );
		}

		return;
	}

	// something else we don't recognize, like a qualifier maybe.
	$this->logger->info( __METHOD__ .
		" Encountered element <{element}> where only expecting character data as value of {curitem}",
		[
			'element' => $elm,
			'curitem' => $this->curItem[0],
			'file' => $this->filename,
		]
	);
	array_unshift( $this->mode, self::MODE_IGNORE );
	array_unshift( $this->curItem, $elm );
}
1013
/**
 * Start an element when in MODE_QDESC.
 *
 * rdf:value needs no action here; every other element is treated as a
 * qualifier and ignored.
 *
 * @param string $elm Namespace and element name, separated by a space
 */
private function startElementModeQDesc( $elm ) {
	if ( $elm !== self::NS_RDF . ' value' ) {
		// its a qualifier, which we ignore
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $elm );
	}
}
1036
/**
 * Starting an element when in MODE_INITIAL.
 *
 * A known property switches to the mode listed for it in $items; unknown
 * properties, and properties that are only valid inside a structure, are
 * ignored. For rdf: elements and accepted properties the attributes are
 * processed afterwards.
 *
 * @param string $ns Namespace
 * @param string $tag Tag name (without namespace prefix)
 * @param array $attribs Attributes of the element
 * @throws RuntimeException If character data was pending when the element opened
 */
private function startElementModeInitial( $ns, $tag, $attribs ) {
	if ( $ns !== self::NS_RDF ) {
		$qualified = $ns . ' ' . $tag;

		if ( !isset( $this->items[$ns][$tag] ) ) {
			// This element is not on our list of allowed elements so ignore.
			$this->logger->debug( __METHOD__ . ' Ignoring unrecognized element <{element}>.',
				[ 'element' => "$ns:$tag", 'file' => $this->filename ] );
			array_unshift( $this->mode, self::MODE_IGNORE );
			array_unshift( $this->curItem, $qualified );

			return;
		}

		$info = $this->items[$ns][$tag];
		if ( isset( $info['structPart'] ) ) {
			// If this element is supposed to appear only as
			// a child of a structure, but appears here (not as
			// a child of a struct), then something weird is
			// happening, so ignore this element and its children.
			$this->logger->info(
				'Encountered <{element}> outside of its expected parent. Ignoring.',
				[ 'element' => "$ns:$tag", 'file' => $this->filename ]
			);

			array_unshift( $this->mode, self::MODE_IGNORE );
			array_unshift( $this->curItem, $qualified );

			return;
		}

		$mode = $info['mode'];
		array_unshift( $this->mode, $mode );
		array_unshift( $this->curItem, $qualified );
		if ( $mode === self::MODE_STRUCT ) {
			$this->ancestorStruct = isset( $info['map_name'] ) ? $info['map_name'] : $tag;
		}
		if ( $this->charContent !== false ) {
			// Something weird.
			// Should not happen in valid XMP.
			throw new RuntimeException( 'tag nested in non-whitespace characters.' );
		}
	}

	// process attributes
	$this->doAttribs( $attribs );
}
1093
/**
 * Hit an opening element when in a Struct (MODE_STRUCT).
 *
 * A known child property pushes its own mode; an unknown one is ignored.
 * An rdf:Description has its attributes processed and pushes another
 * MODE_STRUCT level; other rdf: elements are left alone.
 *
 * @param string $ns Namespace
 * @param string $tag Tag name (no ns)
 * @param array $attribs Attributes of the element
 * @throws RuntimeException If the child is not allowed in the current
 *  structure, or character data was pending
 */
private function startElementModeStruct( $ns, $tag, $attribs ) {
	if ( $ns === self::NS_RDF ) {
		if ( $tag === 'Description' ) {
			$this->doAttribs( $attribs );
			array_unshift( $this->mode, self::MODE_STRUCT );
			array_unshift( $this->curItem, $this->curItem[0] );
		}

		return;
	}

	if ( !isset( $this->items[$ns][$tag] ) ) {
		array_unshift( $this->mode, self::MODE_IGNORE );
		array_unshift( $this->curItem, $ns . ' ' . $tag );

		return;
	}

	if ( isset( $this->items[$ns][$this->ancestorStruct]['children'] )
		&& !isset( $this->items[$ns][$this->ancestorStruct]['children'][$tag] )
	) {
		// This assumes that we don't have inter-namespace nesting
		// which we don't in all the properties we're interested in.
		throw new RuntimeException( " <$tag> appeared nested in <" . $this->ancestorStruct
			. "> where it is not allowed." );
	}

	array_unshift( $this->mode, $this->items[$ns][$tag]['mode'] );
	array_unshift( $this->curItem, $ns . ' ' . $tag );

	if ( $this->charContent !== false ) {
		// Something weird.
		// Should not happen in valid XMP.
		throw new RuntimeException( "tag <$tag> nested in non-whitespace characters (" .
			$this->charContent . ")." );
	}
}
1146
/**
 * Opening element in MODE_LI; must be an <rdf:li>.
 *
 * For a bag of structures the item becomes a MODE_STRUCT level and its
 * attributes are processed; otherwise the item is a simple value.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @param array $attribs Attributes of the element
 * @throws RuntimeException If the element is not rdf:li, or the mode/item
 *  stacks are shallower than expected
 */
private function startElementModeLi( $elm, $attribs ) {
	if ( ( $elm ) !== self::NS_RDF . ' li' ) {
		throw new RuntimeException( "<rdf:li> expected but got $elm." );
	}

	if ( !isset( $this->mode[1] ) ) {
		// This should never ever ever happen. Checking for it
		// to be paranoid.
		throw new RuntimeException( 'In mode Li, but no 2xPrevious mode!' );
	}

	if ( $this->mode[1] !== self::MODE_BAGSTRUCT ) {
		// Normal BAG or SEQ containing simple values.
		array_unshift( $this->mode, self::MODE_SIMPLE );
		// need to add curItem[0] on again since one is for the specific item
		// and one is for the entire group.
		array_unshift( $this->curItem, $this->curItem[0] );
		$this->processingArray = true;

		return;
	}

	// This list item contains a compound (STRUCT) value.
	array_unshift( $this->mode, self::MODE_STRUCT );
	array_unshift( $this->curItem, $elm );
	$this->processingArray = true;

	if ( !isset( $this->curItem[1] ) ) {
		// be paranoid.
		throw new RuntimeException( 'Can not find parent of BAGSTRUCT.' );
	}

	list( $curNS, $curTag ) = explode( ' ', $this->curItem[1] );
	$parentInfo = isset( $this->items[$curNS][$curTag]['map_name'] );
	$this->ancestorStruct = $parentInfo
		? $this->items[$curNS][$curTag]['map_name']
		: $curTag;

	$this->doAttribs( $attribs );
}
1195
/**
 * Opening element in MODE_LI_LANG; must be an <rdf:li> with a usable
 * xml:lang attribute.
 *
 * Remembers the lower-cased language code in $itemLang and enters
 * MODE_SIMPLE for the item's text.
 *
 * @param string $elm Namespace and element name, separated by a space
 * @param array $attribs Attributes of the element
 * @throws RuntimeException If the element is not rdf:li, or xml:lang is
 *  missing or malformed
 */
private function startElementModeLiLang( $elm, $attribs ) {
	if ( $elm !== self::NS_RDF . ' li' ) {
		throw new RuntimeException( __METHOD__ . " <rdf:li> expected but got $elm." );
	}

	$langAttr = self::NS_XML . ' lang';
	$hasValidLang = isset( $attribs[$langAttr] )
		&& preg_match( '/^[-A-Za-z0-9]{2,}$/D', $attribs[$langAttr] );
	if ( !$hasValidLang ) {
		throw new RuntimeException( __METHOD__
			. " <rdf:li> did not contain, or has invalid xml:lang attribute in lang alternative" );
	}

	// Lang is case-insensitive.
	$this->itemLang = strtolower( $attribs[$langAttr] );

	// need to add curItem[0] on again since one is for the specific item
	// and one is for the entire group.
	array_unshift( $this->curItem, $this->curItem[0] );
	array_unshift( $this->mode, self::MODE_SIMPLE );
	$this->processingArray = true;
}
1230
/**
 * Handler for hitting an opening element.
 *
 * Skips the wrapper elements and elements without a namespace, enters
 * MODE_INITIAL on the outermost rdf:Description, then dispatches to the
 * start-element method for the current mode.
 *
 * @param resource $parser XML parser handle (unused here)
 * @param string $elm Namespace and element name, separated by a space
 * @param array $attribs Attribute name => value
 * @throws RuntimeException If no mode is set or the mode is unknown
 */
function startElement( $parser, $elm, $attribs ) {
	if ( $elm === self::NS_RDF . ' RDF'
		|| $elm === 'adobe:ns:meta/ xmpmeta'
		|| $elm === 'adobe:ns:meta/ xapmeta'
	) {
		/* ignore. */
		return;
	} elseif ( $elm === self::NS_RDF . ' Description' ) {
		if ( count( $this->mode ) === 0 ) {
			// outer rdf:desc
			array_unshift( $this->mode, self::MODE_INITIAL );
		}
	} elseif ( $elm === self::NS_RDF . ' type' ) {
		// This doesn't support rdf:type properly.
		// In practise I have yet to see a file that
		// uses this element, however it is mentioned
		// on page 25 of part 1 of the xmp standard.
		// Also it seems as if exiv2 and exiftool do not support
		// this either (That or I misunderstand the standard)
		$this->logger->info(
			__METHOD__ . ' Encountered <rdf:type> which isn\'t currently supported',
			[ 'file' => $this->filename ]
		);
	}

	if ( strpos( $elm, ' ' ) === false ) {
		// This probably shouldn't happen.
		$this->logger->info(
			__METHOD__ . " Encountered <$elm> which has no namespace. Skipping.",
			[ 'file' => $this->filename ]
		);

		return;
	}

	list( $ns, $tag ) = explode( ' ', $elm, 2 );

	if ( count( $this->mode ) === 0 ) {
		// This should not happen.
		throw new RuntimeException( 'Error extracting XMP, '
			. "encountered <$elm> with no mode" );
	}

	switch ( $this->mode[0] ) {
		case self::MODE_IGNORE:
			$this->startElementModeIgnore( $elm );
			break;
		case self::MODE_SIMPLE:
			$this->startElementModeSimple( $elm, $attribs );
			break;
		case self::MODE_INITIAL:
			$this->startElementModeInitial( $ns, $tag, $attribs );
			break;
		case self::MODE_STRUCT:
			$this->startElementModeStruct( $ns, $tag, $attribs );
			break;
		case self::MODE_BAG:
		case self::MODE_BAGSTRUCT:
			// A bag of structures opens with <rdf:Bag> just like a plain bag;
			// startElementModeLi() later tells the two apart.
			$this->startElementModeBag( $elm );
			break;
		case self::MODE_SEQ:
			$this->startElementModeSeq( $elm );
			break;
		case self::MODE_LANG:
			$this->startElementModeLang( $elm );
			break;
		case self::MODE_LI_LANG:
			$this->startElementModeLiLang( $elm, $attribs );
			break;
		case self::MODE_LI:
			$this->startElementModeLi( $elm, $attribs );
			break;
		case self::MODE_QDESC:
			$this->startElementModeQDesc( $elm );
			break;
		default:
			throw new RuntimeException( 'StartElement in unknown mode: ' . $this->mode[0] );
	}
}
1320
1321 // phpcs:disable Generic.Files.LineLength
1337 // phpcs:enable
/**
 * Process attributes.
 *
 * rdf:parseType="Resource" on a simple value switches the current mode to
 * MODE_QDESC. rdf:value and rdf:resource are fed to char() as content,
 * known properties given as attributes are saved, and the rest is skipped.
 *
 * @param array $attribs Attribute name => value, names in "namespace tag" form
 * @throws RuntimeException If a known property appears as an attribute
 *  while in MODE_SIMPLE
 */
private function doAttribs( $attribs ) {
	// first check for rdf:parseType attribute, as that can change
	// how the attributes are interperted.
	$parseType = self::NS_RDF . ' parseType';
	if ( isset( $attribs[$parseType] )
		&& $attribs[$parseType] === 'Resource'
		&& $this->mode[0] === self::MODE_SIMPLE
	) {
		// this is equivalent to having an inner rdf:Description
		$this->mode[0] = self::MODE_QDESC;
	}

	foreach ( $attribs as $name => $val ) {
		if ( strpos( $name, ' ' ) === false ) {
			// This shouldn't happen, but so far some old software forgets namespace
			// on rdf:about.
			$this->logger->info(
				__METHOD__ . ' Encountered non-namespaced attribute: ' .
				" $name=\"$val\". Skipping. ",
				[ 'file' => $this->filename ]
			);
			continue;
		}

		list( $ns, $tag ) = explode( ' ', $name, 2 );

		if ( $ns === self::NS_RDF ) {
			// resource is for url.
			// value attribute is a weird way of just putting the contents.
			if ( in_array( $tag, [ 'value', 'resource' ], true ) ) {
				$this->char( $this->xmlParser, $val );
			}
			continue;
		}

		if ( !isset( $this->items[$ns][$tag] ) ) {
			$this->logger->debug(
				__METHOD__ . " Ignoring unrecognized element <$ns:$tag>.",
				[ 'file' => $this->filename ]
			);
			continue;
		}

		if ( $this->mode[0] === self::MODE_SIMPLE ) {
			throw new RuntimeException( __METHOD__
				. " $ns:$tag found as attribute where not allowed" );
		}
		$this->saveValue( $ns, $tag, $val );
	}
}
1381
/**
 * Given an extracted value, validate it (if the property has a validator)
 * and store it in the result array.
 *
 * Where the value lands depends on the parser state: inside a structure,
 * inside an array, inside a language alternative, or as a plain value.
 *
 * @param string $ns Namespace of the property this is the value of
 * @param string $tag Tag name of the property
 * @param string $val Value to save
 */
private function saveValue( $ns, $tag, $val ) {
	$info =& $this->items[$ns][$tag];
	$finalName = isset( $info['map_name'] )
		? $info['map_name'] : $tag;
	if ( isset( $info['validate'] ) ) {
		if ( is_array( $info['validate'] ) ) {
			$validate = $info['validate'];
		} else {
			$validator = new XMPValidate( $this->logger );
			$validate = [ $validator, $info['validate'] ];
		}

		if ( is_callable( $validate ) ) {
			call_user_func_array( $validate, [ $info, &$val, true ] );
			// the reasoning behind using &$val instead of using the return value
			// is to be consistent between here and validating structures.
			if ( is_null( $val ) ) {
				$this->logger->info(
					__METHOD__ . " <$ns:$tag> failed validation.",
					[ 'file' => $this->filename ]
				);

				return;
			}
		} else {
			// $validate[0] may be an XMPValidate instance, which cannot be
			// concatenated into a string; report its class name instead.
			$owner = is_object( $validate[0] ) ? get_class( $validate[0] ) : $validate[0];
			$this->logger->warning(
				__METHOD__ . " Validation function for $finalName (" .
				$owner . '::' . $validate[1] . '()) is not callable.',
				[ 'file' => $this->filename ]
			);
		}
	}

	if ( $this->ancestorStruct && $this->processingArray ) {
		// Aka both an array and a struct. ( self::MODE_BAGSTRUCT )
		$this->results['xmp-' . $info['map_group']][$this->ancestorStruct][][$finalName] = $val;
	} elseif ( $this->ancestorStruct ) {
		$this->results['xmp-' . $info['map_group']][$this->ancestorStruct][$finalName] = $val;
	} elseif ( $this->processingArray ) {
		if ( $this->itemLang === false ) {
			// normal array
			$this->results['xmp-' . $info['map_group']][$finalName][] = $val;
		} else {
			// lang array.
			$this->results['xmp-' . $info['map_group']][$finalName][$this->itemLang] = $val;
		}
	} else {
		$this->results['xmp-' . $info['map_group']][$finalName] = $val;
	}
}
1443}
$line
Definition cdb.php:59
static getItems()
Get the items array.
Definition XMPInfo.php:33
Class for reading xmp data containing properties relevant to images, and spitting out an array that F...
Definition XMP.php:53
const NS_XML
Definition XMP.php:119
const NS_RDF
Definition XMP.php:118
bool string $charContent
Temporary holder for character data that appears in xmp doc.
Definition XMP.php:64
const MODE_SEQ
Definition XMP.php:112
const MODE_LANG
Definition XMP.php:114
bool string $itemLang
Used for lang alts only.
Definition XMP.php:76
destroyXMLParser()
Free the XML parser.
Definition XMP.php:170
const MODE_STRUCT
Definition XMP.php:111
const PARSABLE_BUFFERING
Definition XMP.php:124
LoggerInterface $logger
Definition XMP.php:130
const PARSABLE_NO
Definition XMP.php:125
const PARSABLE_OK
Definition XMP.php:123
endElementModeIgnore( $elm)
When we hit a closing element in MODE_IGNORE, check to see if this is the element we started to ignore...
Definition XMP.php:607
const MODE_LI
Definition XMP.php:104
char( $parser, $data)
Character data handler. Called whenever character data is found in the XMP document.
Definition XMP.php:516
setLogger(LoggerInterface $logger)
Definition XMP.php:159
getResults()
Get the result array.
Definition XMP.php:212
startElementModeLang( $elm)
Start element in MODE_LANG (language alternative); this should always be <rdf:Alt>.
Definition XMP.php:960
array $results
Array to hold results.
Definition XMP.php:70
bool string $charset
Character set like 'UTF-8'.
Definition XMP.php:82
doAttribs( $attribs)
Process attributes.
Definition XMP.php:1338
parseExtended( $content)
Entry point for XMPExtended blocks in jpeg files.
Definition XMP.php:425
endElementModeSimple( $elm)
Hit a closing element when in MODE_SIMPLE.
Definition XMP.php:629
array $curItem
Array to hold the current element (and previous element, and so on)
Definition XMP.php:58
const PARSABLE_UNKNOWN
Definition XMP.php:122
startElementModeStruct( $ns, $tag, $attribs)
Hit an opening element when in a struct (MODE_STRUCT). This is generally for fields of a compound prop...
Definition XMP.php:1113
static isSupported()
Check if this instance supports using this class.
Definition XMP.php:202
parse( $content, $allOfIt=true)
Main function to call to parse XMP.
Definition XMP.php:306
array $mode
Stores the state the xmpreader is in (see MODE_FOO constants)
Definition XMP.php:67
int $parsable
Flag determining if the XMP is safe to parse.
Definition XMP.php:88
resetXMLParser()
Main use is if a single item has multiple xmp documents describing it.
Definition XMP.php:181
startElement( $parser, $elm, $attribs)
Hits an opening element.
Definition XMP.php:1241
startElementModeBag( $elm)
Start element in MODE_BAG (unordered array); this should always be <rdf:Bag>.
Definition XMP.php:915
const MODE_INITIAL
These are various mode constants.
Definition XMP.php:102
const MODE_ALT
Definition XMP.php:115
startElementModeQDesc( $elm)
Start an element when in MODE_QDESC.
Definition XMP.php:1027
startElementModeInitial( $ns, $tag, $attribs)
Starting an element when in MODE_INITIAL. This usually happens when we hit an element inside the outer...
Definition XMP.php:1049
const MODE_LI_LANG
Definition XMP.php:105
saveValue( $ns, $tag, $val)
Given an extracted value, save it to results array.
Definition XMP.php:1393
endElement( $parser, $elm)
Handler for hitting a closing element.
Definition XMP.php:810
startElementModeSeq( $elm)
Start element in MODE_SEQ (ordered array); this should always be <rdf:Seq>.
Definition XMP.php:930
string $xmlParsableBuffer
Buffer of XML to parse.
Definition XMP.php:91
const MODE_QDESC
Definition XMP.php:106
const MODE_SIMPLE
Definition XMP.php:110
const MODE_BAG
Definition XMP.php:113
endElementModeLi( $elm)
Hit a closing element in MODE_LI (either rdf:Seq or rdf:Bag). Add information about what type of ele...
Definition XMP.php:742
__construct(LoggerInterface $logger=null, $filename='unknown')
Primary job is to initialize the XMLParser.
Definition XMP.php:142
array $items
XMP item configuration array.
Definition XMP.php:55
startElementModeSimple( $elm, $attribs)
Handle an opening element when in MODE_SIMPLE.
Definition XMP.php:986
const MODE_BAGSTRUCT
Definition XMP.php:116
resource $xmlParser
A resource handle for the XML parser.
Definition XMP.php:79
endElementNested( $elm)
Hit a closing element in MODE_STRUCT, MODE_SEQ or MODE_BAG; this generally means we've finished processing a ...
Definition XMP.php:664
startElementModeIgnore( $elm)
Hit an opening element while in MODE_IGNORE.
Definition XMP.php:901
string $filename
Definition XMP.php:135
startElementModeLi( $elm, $attribs)
Opening element in MODE_LI; process elements of arrays.
Definition XMP.php:1160
bool $processingArray
If we're doing a seq or bag.
Definition XMP.php:73
const MODE_IGNORE
Definition XMP.php:103
bool string $ancestorStruct
The structure name when processing nested structures.
Definition XMP.php:61
int $extendedXMPOffset
Definition XMP.php:85
endElementModeQDesc( $elm)
End element while in MODE_QDESC; mostly when ending an element when we have a simple value that has qu...
Definition XMP.php:785
checkParseSafety( $content)
Check if a block of XML is safe to pass to xml_parse, i.e.
Definition XMP.php:552
startElementModeLiLang( $elm, $attribs)
Opening element in MODE_LI_LANG.
Definition XMP.php:1210
This contains some static methods for validating XMP properties.
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user. For example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred update object and putting those objects on a global list.
Definition deferred.txt:11
namespace being checked & $result
Definition hooks.txt:2323
do that in ParserLimitReportFormat instead $parser
Definition hooks.txt:2603
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that probably a stub it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output modifiable & $code
Definition hooks.txt:865
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition hooks.txt:2014
returning false will NOT prevent logging $e
Definition hooks.txt:2176