MediaWiki master
IPTC.php
Go to the documentation of this file.
1<?php
24use Wikimedia\AtEase\AtEase;
25
31class IPTC {
/**
 * Parse a raw IPTC-IIM block into an array of metadata properties.
 *
 * The data is decoded with iptcparse() and every recognised tag is stored
 * under a property name (several of which mirror the matching Exif property).
 * Text values are converted to UTF-8 using the charset announced in tag 1:90,
 * or a best guess when that tag is absent.
 *
 * @param string $rawData Raw IPTC data, passed straight to iptcparse()
 * @return array Map of property name => value. Text properties hold whatever
 *  convIPTC() returns for the tag's value list; the DateTime* properties,
 *  iimVersion, IntellectualGenre and SubjectNewsCode hold a single scalar.
 *  Empty when iptcparse() fails or when tag 1:90 names a charset that
 *  getCharset() does not recognise.
 */
public static function parse( $rawData ) {
	$parsed = iptcparse( $rawData );
	if ( !is_array( $parsed ) ) {
		return [];
	}

	// Charset info is contained in tag 1:90. '' means "no charset specified".
	$c = '';
	if ( isset( $parsed['1#090'][0] ) ) {
		$c = self::getCharset( $parsed['1#090'][0] );
		if ( $c === false ) {
			// Unknown charset: refuse to parse. Note that there is a
			// difference between "unknown" and "no charset specified".
			return [];
		}
		unset( $parsed['1#090'] );
	}

	// Tags that are stored unchanged apart from charset conversion.
	// Purposely does not do 2:125, 2:130, 2:131, 2:47, 2:50, 2:45, 2:42,
	// 2:8, 2:3, 2:200, 2:201, 2:202 or the audio stuff (2:150 to 2:154).
	$simpleTags = [
		// caption; mapped with exif ImageDescription
		'2#120' => 'ImageDescription',
		// mapped with exif Copyright
		'2#116' => 'Copyright',
		'2#025' => 'Keywords',
		// country (shown)
		'2#101' => 'CountryDest',
		// state/province (shown)
		'2#095' => 'ProvinceOrStateDest',
		// city (shown)
		'2#090' => 'CityDest',
		// sublocation (shown)
		'2#092' => 'SublocationDest',
		// object name/title
		'2#005' => 'ObjectName',
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata, not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the objectdata.
		// This could be an agency, a member of an agency or an individual."
		'2#115' => 'Source',
		// edit status (lead, correction, etc)
		'2#007' => 'EditStatus',
		// category. deprecated. max 3 letters in theory, often more
		'2#015' => 'iimCategory',
		// supplemental category. deprecated.
		'2#020' => 'iimSupplementalCategory',
		// urgency (1-8. 1 most, 5 normal, 8 low priority)
		'2#010' => 'Urgency',
		// "Identifies objectdata that recurs often and predictably... Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things),
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space).
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle. a for morning (am), p for evening, b for both
		'2#075' => 'ObjectCycle',
		// Country/Primary location code. "Indicates the code of the country/primary
		// location where the intellectual property of the objectdata was created".
		// Unclear how this differs from 2#026.
		'2#100' => 'CountryCodeDest',
		// "A code representing the location of original transmission according
		// to practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode',
	];

	// Date tag => [ matching time tag, property name ].
	// Incomplete dates are not accepted even though they are valid according
	// to the spec. Should potentially store the timezone as well.
	$dateTags = [
		// Date created (not date digitized). Maps to exif DateTimeOriginal.
		'2#055' => [ '2#060', 'DateTimeOriginal' ],
		// Date converted to digital representation. Maps to exif DateTimeDigitized.
		'2#062' => [ '2#063', 'DateTimeDigitized' ],
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	$data = [];
	foreach ( $parsed as $tag => $val ) {
		if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
			wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
			continue;
		}

		if ( isset( $simpleTags[$tag] ) ) {
			$data[$simpleTags[$tag]] = self::convIPTC( $val, $c );
			continue;
		}

		if ( isset( $dateTags[$tag] ) ) {
			[ $timeTag, $property ] = $dateTags[$tag];
			$timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
			if ( $timestamp ) {
				$data[$property] = $timestamp;
			}
			continue;
		}

		switch ( $tag ) {
			case '2#080':
				// Byline. Mapped with exif Artist.
				// Merged with the byline title (2:85), like how exif does it
				// with "Title, person". Not sure if this is the best approach
				// since the two fields are no longer separate. Each byline
				// title entry corresponds to a specific byline.
				$bylines = self::convIPTC( $val, $c );
				$titles = isset( $parsed['2#085'] )
					? self::convIPTC( $parsed['2#085'], $c )
					: [];

				foreach ( $titles as $i => $title ) {
					// Theoretically the byline should always be set,
					// but it doesn't hurt to be careful.
					if ( isset( $bylines[$i] ) ) {
						$bylines[$i] = $title . ', ' . $bylines[$i];
					}
				}
				$data['Artist'] = $bylines;
				break;

			case '2#065':
				// Originating program.
				// Combined with the program version (2:70) if present.
				$software = self::convIPTC( $val, $c );

				if ( count( $software ) !== 1 ) {
					// According to the iim standard this cannot have multiple
					// values, so if there is more than one, something weird
					// is happening, and we skip it.
					wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
					break;
				}

				if ( isset( $parsed['2#070'] ) ) {
					// A version is set for the software. 2:70 itself is skipped
					// by the "handled elsewhere" cases below, so nothing needs
					// to be removed from $parsed here.
					$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
					$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
				} else {
					$data['Software'] = $software;
				}
				break;

			case '2#000':
				// iim version.
				// Unlike other tags, this is a 2-byte binary number.
				// Technically this is required if there is iptc data,
				// but in practise it isn't always there.
				if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
					// Big-endian: first byte is the high byte.
					$data['iimVersion'] = ( ord( $val[0][0] ) << 8 ) | ord( $val[0][1] );
				}
				break;

			case '2#004':
				// IntellectualGenre.
				// The first 4 characters are an id code that we're not really
				// interested in.
				// This prop is weird, since it's allowed to have multiple values
				// in iim 4.1, but not in the XMP stuff. We just extract the
				// first value.
				$con = self::convIPTC( $val, $c );
				if ( strlen( $con[0] ) < 5 ) {
					wfDebugLog( 'iptc', 'IPTC: '
						. '2:04 too short. '
						. 'Ignoring.' );
					break;
				}
				$data['IntellectualGenre'] = substr( $con[0], 4 );
				break;

			case '2#012':
				// Subject News code - this is a compound field.
				// At the moment we only extract the subject news code, which is
				// an 8 digit (ascii) number describing the subject matter of
				// the content.
				$codes = self::convIPTC( $val, $c );
				foreach ( $codes as $ic ) {
					$fields = explode( ':', $ic, 3 );

					if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
						// Logged to the same 'iptc' group as every other
						// message in this class.
						wfDebugLog( 'iptc', 'IPTC: '
							. 'Invalid 2:12 - ' . $ic );
						// Stops at the first invalid entry; later entries
						// are not looked at.
						break;
					}
					// Each valid entry overwrites the previous one, so the
					// last valid code before any invalid entry wins.
					$data['SubjectNewsCode'] = $fields[1];
				}
				break;

			case '2#070':
			case '2#060':
			case '2#063':
			case '2#085':
			case '2#038':
			case '2#035':
				// Ignore. Handled together with their companion tag above.
				break;

			default:
				wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
				break;
		}
	}

	return $data;
}
335
/**
 * Combine an IPTC date tag and its (optional) time tag into one timestamp.
 *
 * @param array $date Values of the date tag (CCYYMMDD); must hold exactly one value
 * @param array $time Values of the matching time tag (HHMMSS followed by a
 *  signed HHMM offset); anything other than exactly one value is treated
 *  as "no time given"
 * @param string $charset Charset handed to convIPTC() for both values
 * @return string|null Result of wfTimestamp( TS_EXIF, ... ), cut down to its
 *  first 10 characters when no time was given; null if the input is
 *  malformed or cannot be converted
 */
private static function timeHelper( $date, $time, $charset ) {
	if ( count( $date ) !== 1 ) {
		// The standard says this should always be 1; just double checking.
		return null;
	}
	[ $date ] = self::convIPTC( $date, $charset );
	$date = trim( $date );

	$dateOnly = count( $time ) !== 1;
	if ( $dateOnly ) {
		$time = '000000+0000'; // placeholder
	} else {
		[ $time ] = self::convIPTC( $time, $charset );
		$time = trim( $time );
	}

	// The patterns are anchored because everything below slices the strings
	// at fixed offsets; an unanchored match somewhere in the middle of a
	// longer string would otherwise be cut up at the wrong positions.
	$valid = preg_match( '/^\d{6}[-+]\d{4}$/D', $time )
		&& preg_match( '/^\d{8}$/D', $date )
		&& substr( $date, 0, 4 ) !== '0000'
		&& substr( $date, 4, 2 ) !== '00'
		&& substr( $date, 6, 2 ) !== '00';

	if ( !$valid ) {
		// Something is wrong.
		// Note, this rejects some dates that are valid according to the iptc
		// spec. For example the date 00000400 means the photo was taken in
		// April, but the year and day are unknown. We don't process these
		// types of incomplete dates atm.
		wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

		return null;
	}

	$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
	if ( $unixTS === false ) {
		wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

		return null;
	}

	// Offset in seconds: characters 7-8 are hours, 9-10 are minutes,
	// character 6 is the sign.
	$tz = ( (int)substr( $time, 7, 2 ) * 60 * 60 )
		+ ( (int)substr( $time, 9, 2 ) * 60 );

	if ( substr( $time, 6, 1 ) === '-' ) {
		$tz = -$tz;
	}

	// NOTE(review): the signed offset is *added* to the parsed time. If the
	// intent is to convert a local time to UTC, the offset would have to be
	// subtracted instead - confirm the intended direction against the IIM
	// spec and existing test expectations before changing this.
	$finalTimestamp = wfTimestamp( TS_EXIF, (int)$unixTS + $tz );
	if ( $finalTimestamp === false ) {
		wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( (int)$unixTS + $tz ) );

		return null;
	}

	// Without a time tag only the date portion is meaningful.
	return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
}
404
/**
 * Convert IPTC text - a single value or an array of values - via
 * convIPTCHelper().
 *
 * @param string|array $data Value, or array of values, to convert
 * @param string $charset Charset passed through to convIPTCHelper()
 * @return string|array Converted data in the same shape (and with the same
 *  keys) as the input
 */
private static function convIPTC( $data, $charset ) {
	if ( !is_array( $data ) ) {
		return self::convIPTCHelper( $data, $charset );
	}

	$converted = [];
	foreach ( $data as $key => $item ) {
		$converted[$key] = self::convIPTCHelper( $item, $charset );
	}

	return $converted;
}
423
/**
 * Convert one IPTC string to UTF-8 and strip surrounding whitespace.
 *
 * @param string $data Raw value
 * @param string $charset Charset name handed to iconv(). A falsy value means
 *  "not specified": the value is then kept if the validator leaves it
 *  unchanged, and otherwise converted as Windows-1252.
 * @return string Converted, trimmed value; "" if iconv() fails
 */
private static function convIPTCHelper( $data, $charset ) {
	if ( !$charset ) {
		// Treat as utf-8 if it is valid utf-8, otherwise pretend it's
		// windows-1252. Most of the time if there is no 1:90 tag, the data
		// is either ascii, latin1, or utf-8.
		$oldData = $data;
		UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
		if ( $data !== $oldData ) {
			// Validation had to change the data, so it was not valid utf-8.
			return self::convIPTCHelper( $oldData, 'Windows-1252' );
		}

		// Trim here as well, so that all paths through this method return
		// whitespace-stripped text.
		return trim( $data );
	}

	// iconv() emits warnings on bad input; failure is handled below instead.
	AtEase::suppressWarnings();
	$data = iconv( $charset, "UTF-8//IGNORE", $data );
	AtEase::restoreWarnings();
	if ( $data === false ) {
		$data = "";
		wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
	}

	return trim( $data );
}
453
/**
 * Map the value of the 1:90 tag to a charset name.
 *
 * According to the iim standard, the charset is defined by tag 1:90, which
 * holds iso 2022 escape sequences selecting the character set. The standard
 * seems to encourage putting all necessary escape sequences into 1:90, but
 * says it doesn't have to be that way.
 *
 * This is probably in need of more testing and is definitely not complete.
 * However, going by the docs of some other iptc software, most of it only
 * recognizes utf-8. If 1:90 is not present the content is usually ascii or
 * iso-8859-1 (and sometimes utf-8), but there is no guarantee.
 *
 * This also won't work if there is more than one escape sequence in the
 * 1:90 tag (apart from the few combinations listed), or if something is put
 * in the G2 or G3 charsets, etc. It will only reliably recognize utf-8.
 *
 * The list just goes through the charsets mentioned in appendix C of the
 * iim standard.
 *
 * @param string $tag Value of the 1:90 tag
 * @return string|false Charset name, or false if the value is not recognised
 */
public static function getCharset( $tag ) {
	// \x1b = ESC.
	$charsets = [
		"\x1b%G" => 'UTF-8',
		// Things that are compatible with utf-8 are also called utf-8.
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british
		"\x1b(C" => 'ISO-IR-8-1', // some obscure Swedish/Finnish encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure Danish/Norwegian encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => "ISO646-IT",
		"\x1b(L" => "ISO646-PT",
		"\x1b(Z" => "ISO646-ES",
		"\x1b([" => "GREEK7-OLD",
		"\x1b(K" => "ISO646-DE",
		"\x1b(N" => "ISO_5427", // Cyrillic
		"\x1b(`" => "NS_4551-1", // iso646-NO
		"\x1b(f" => "NF_Z_62-010", // iso646-FR
		"\x1b(g" => "PT2", // iso646-PT2
		"\x1b(h" => "ES2",
		"\x1b(i" => "MSZ_7795.3", // iso646-HU
		"\x1b(w" => "CSA_Z243.4-1985-1",
		"\x1b(x" => "CSA_Z243.4-1985-2",
		"\x1b\$(B" => "JIS_C6226-1983",
		"\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$(B" => "JIS_C6226-1983",
		// The iso-8859-x entries apply at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103',
	];

	if ( is_string( $tag ) && isset( $charsets[$tag] ) ) {
		return $charsets[$tag];
	}

	wfDebugLog( 'iptc', __METHOD__ . ': Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

	// At this point just give up and refuse to parse iptc?
	return false;
}
585}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Class for some IPTC functions.
Definition IPTC.php:31
static parse( $rawData)
This takes the results of iptcparse() and puts it into a form that can be handled by mediawiki.
Definition IPTC.php:42
static getCharset( $tag)
Takes the value of the 1:90 tag and returns the corresponding charset name.
Definition IPTC.php:462