MediaWiki master
IPTC.php
Go to the documentation of this file.
1<?php
10namespace MediaWiki\Media;
11
12use Wikimedia\Timestamp\TimestampFormat as TS;
13
19class IPTC {
/**
 * Parse raw IPTC data into an array of metadata keyed by property name.
 *
 * The data is decoded with iptcparse(). Recognised tags are converted to
 * UTF-8 and stored under the property names listed below (several are
 * deliberately mapped onto the equivalent Exif property). Unsupported tags
 * are logged to the 'iptc' debug log group and skipped.
 *
 * @param string $rawData Raw data handed to iptcparse()
 * @return array Parsed metadata. Empty when iptcparse() does not return an
 *  array, or when tag 1:90 declares a charset that getCharset() does not know.
 */
public static function parse( $rawData ) {
	$parsed = iptcparse( $rawData );
	if ( !is_array( $parsed ) ) {
		return [];
	}

	// Tags whose values are only charset-converted and stored: tag => property name.
	$simpleTags = [
		// IPTC caption. Mapped with exif ImageDescription
		'2#120' => 'ImageDescription',
		// copyright. Mapped with exif copyright
		'2#116' => 'Copyright',
		// keywords
		'2#025' => 'Keywords',
		// Country (shown)
		'2#101' => 'CountryDest',
		// state/province (shown)
		'2#095' => 'ProvinceOrStateDest',
		// city (shown)
		'2#090' => 'CityDest',
		// sublocation (shown)
		'2#092' => 'SublocationDest',
		// object name/title
		'2#005' => 'ObjectName',
		// special instructions
		'2#040' => 'SpecialInstructions',
		// headline
		'2#105' => 'Headline',
		// credit: "Identifies the provider of the objectdata,
		// not necessarily the owner/creator".
		'2#110' => 'Credit',
		// source: "Identifies the original owner of the intellectual content of the
		// objectdata. This could be an agency, a member of an agency or an individual."
		'2#115' => 'Source',
		// edit status (lead, correction, etc)
		'2#007' => 'EditStatus',
		// category. deprecated. max 3 letters in theory, often more
		'2#015' => 'iimCategory',
		// supplemental category. deprecated.
		'2#020' => 'iimSupplementalCategory',
		// urgency (1-8. 1 most, 5 normal, 8 low priority)
		'2#010' => 'Urgency',
		// "Identifies objectdata that recurs often and predictably...
		// Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things)
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle. a for morning (am), p for evening, b for both
		'2#075' => 'ObjectCycle',
		// Country/Primary location code.
		// "Indicates the code of the country/primary location where the
		// intellectual property of the objectdata was created"
		// unclear how this differs from 2#026
		'2#100' => 'CountryCodeDest',
		// original transmission ref.
		// "A code representing the location of original transmission
		// according to practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		// contact
		'2#118' => 'Contact',
		// Writer/Editor
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		// lang code
		'2#135' => 'LanguageCode',
	];

	// Date tags: date tag => [ companion time tag, property name ].
	// Incomplete dates are not accepted even though they are valid
	// according to spec. Should potentially store timezone as well.
	$dateTags = [
		// Date created (not date digitized). Maps to exif DateTimeOriginal
		'2#055' => [ '2#060', 'DateTimeOriginal' ],
		// Date converted to digital representation. Maps to exif DateTimeDigitized
		'2#062' => [ '2#063', 'DateTimeDigitized' ],
		// Date released.
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		// Date expires.
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	$c = '';
	// charset info contained in tag 1:90.
	if ( isset( $parsed['1#090'][0] ) ) {
		$c = self::getCharset( $parsed['1#090'][0] );
		if ( $c === false ) {
			// Unknown charset. Refuse to parse.
			// Note: there is a difference between
			// unknown and no charset specified.
			return [];
		}
		unset( $parsed['1#090'] );
	}

	$data = [];
	foreach ( $parsed as $tag => $val ) {
		if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
			wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
			continue;
		}

		if ( isset( $simpleTags[$tag] ) ) {
			$data[$simpleTags[$tag]] = self::convIPTC( $val, $c );
			continue;
		}

		if ( isset( $dateTags[$tag] ) ) {
			[ $timeTag, $property ] = $dateTags[$tag];
			$timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
			if ( $timestamp ) {
				$data[$property] = $timestamp;
			}
			continue;
		}

		switch ( $tag ) {
			case '2#080': /* byline. Mapped with exif Artist */
				/* merge with byline title (2:85)
				 * like how exif does it with
				 * Title, person. Not sure if this is best
				 * approach since we no longer have the two fields
				 * separate. each byline title entry corresponds to a
				 * specific byline. */
				$bylines = self::convIPTC( $val, $c );
				$titles = isset( $parsed['2#085'] )
					? self::convIPTC( $parsed['2#085'], $c )
					: [];

				foreach ( array_values( $titles ) as $i => $title ) {
					if ( isset( $bylines[$i] ) ) {
						// theoretically this should always be set
						// but doesn't hurt to be careful.
						$bylines[$i] = $title . ', ' . $bylines[$i];
					}
				}
				$data['Artist'] = $bylines;
				break;

			case '2#065':
				/* Originating Program.
				 * Combine with Program version (2:70) if present.
				 */
				$software = self::convIPTC( $val, $c );

				if ( count( $software ) !== 1 ) {
					// according to iim standard this cannot have multiple values
					// so if there is more than one, something weird is happening,
					// and we skip it.
					wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
					break;
				}

				if ( isset( $parsed['2#070'] ) ) {
					// if a version is set for the software.
					$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
					unset( $parsed['2#070'] );
					$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
				} else {
					$data['Software'] = $software;
				}
				break;

			case '2#000': /* iim version */
				// unlike other tags, this is a 2-byte binary number.
				// technically this is required if there is iptc data
				// but in practise it isn't always there.
				if ( strlen( $val[0] ) === 2 ) {
					// the length check is just to be paranoid.
					$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
				}
				break;

			case '2#004':
				// IntellectualGenre.
				// first 4 characters are an id code
				// that we're not really interested in.

				// This prop is weird, since it's
				// allowed to have multiple values
				// in iim 4.1, but not in the XMP
				// stuff. We're going to just
				// extract the first value.
				$con = self::convIPTC( $val, $c );
				if ( strlen( $con[0] ) < 5 ) {
					wfDebugLog( 'iptc', 'IPTC: '
						. '2:04 too short. '
						. 'Ignoring.' );
					break;
				}
				$data['IntellectualGenre'] = substr( $con[0], 4 );
				break;

			case '2#012':
				// Subject News code - this is a compound field
				// at the moment we only extract the subject news
				// code, which is an 8 digit (ascii) number
				// describing the subject matter of the content.
				$codes = self::convIPTC( $val, $c );
				foreach ( $codes as $ic ) {
					$fields = explode( ':', $ic, 3 );

					if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
						// Logged to the same 'iptc' group as every other message here.
						wfDebugLog( 'iptc', 'IPTC: '
							. 'Invalid 2:12 - ' . $ic );
						// Leaves only the inner loop: remaining codes are not examined.
						break;
					}
					// Each valid code overwrites the previous one, so the last one wins.
					$data['SubjectNewsCode'] = $fields[1];
				}
				break;

			// purposely does not do 2:125, 2:130, 2:131,
			// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
			// 2:200, 2:201, 2:202
			// or the audio stuff (2:150 to 2:154)

			case '2#070':
			case '2#060':
			case '2#063':
			case '2#085':
			case '2#038':
			case '2#035':
				// ignore. Handled together with their primary tag.
				break;

			default:
				wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
				break;
		}
	}

	return $data;
}
323
/**
 * Combine an IPTC date value and its optional companion time value into one timestamp.
 *
 * @param array $date Values of the date tag; must hold exactly one CCYYMMDD entry
 * @param array $time Values of the companion time tag (HHMMSS±HHMM). If it does not hold
 *  exactly one entry, a placeholder time is used and only the date part is returned.
 * @param string $charset Charset passed through to convIPTC()
 * @return string|null Timestamp in TS::EXIF format (only its first 10 characters when no
 *  time was supplied), or null if the input is missing, malformed or not convertible
 */
private static function timeHelper( $date, $time, $charset ) {
	if ( count( $date ) !== 1 ) {
		// the standard says there should always be exactly one value;
		// just double checking.
		return null;
	}
	[ $date ] = self::convIPTC( $date, $charset );
	// convIPTC() does not trim on every path; strip padding so the anchored
	// patterns below do not reject an otherwise well-formed value.
	$date = trim( $date );

	if ( count( $time ) === 1 ) {
		[ $time ] = self::convIPTC( $time, $charset );
		$time = trim( $time );
		$dateOnly = false;
	} else {
		$time = '000000+0000'; // placeholder
		$dateOnly = true;
	}

	// The patterns are anchored because everything below slices $date and $time
	// by fixed offsets; an unanchored match would let values with leading or
	// trailing junk through and the offsets would then read the wrong characters.
	if ( !( preg_match( '/^\d{6}[-+]\d{4}$/D', $time )
		&& preg_match( '/^\d{8}$/D', $date )
		&& substr( $date, 0, 4 ) !== '0000'
		&& substr( $date, 4, 2 ) !== '00'
		&& substr( $date, 6, 2 ) !== '00'
	) ) {
		// something wrong.
		// Note, this rejects some valid dates according to iptc spec
		// for example: the date 00000400 means the photo was taken in
		// April, but the year and day is unknown. We don't process these
		// types of incomplete dates atm.
		wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

		return null;
	}

	$unixTS = wfTimestamp( TS::UNIX, $date . substr( $time, 0, 6 ) );
	if ( $unixTS === false ) {
		wfDebugLog( 'iptc', "IPTC: can't convert date to TS::UNIX: $date $time." );

		return null;
	}

	// Offset in seconds from the ±HHMM suffix.
	$tz = ( (int)substr( $time, 7, 2 ) * 60 * 60 )
		+ ( (int)substr( $time, 9, 2 ) * 60 );

	if ( substr( $time, 6, 1 ) === '-' ) {
		$tz = -$tz;
	}

	// NOTE(review): the offset is added to the parsed time. If the value is local
	// time plus its offset from UTC, converting to UTC would subtract it instead —
	// confirm the intended semantics before relying on the sign.
	$finalTimestamp = wfTimestamp( TS::EXIF, (int)$unixTS + $tz );
	if ( $finalTimestamp === false ) {
		wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( (int)$unixTS + $tz ) );

		return null;
	}
	if ( $dateOnly ) {
		// return the date only
		return substr( $finalTimestamp, 0, 10 );
	}
	return $finalTimestamp;
}
392
/**
 * Convert IPTC data to UTF-8, accepting either a single value or an array of values.
 *
 * @param string|array $data A single value, or an array whose elements are each converted
 * @param string $charset Charset of $data, passed on to convIPTCHelper()
 * @return string|array The converted value, in the same shape (and with the same keys) as $data
 */
private static function convIPTC( $data, $charset ) {
	if ( !is_array( $data ) ) {
		return self::convIPTCHelper( $data, $charset );
	}

	foreach ( $data as $key => $item ) {
		$data[$key] = self::convIPTCHelper( $item, $charset );
	}

	return $data;
}
411
/**
 * Convert a single IPTC value to UTF-8.
 *
 * @param string $data The value to convert
 * @param string $charset Charset of $data. When empty, the value is kept as-is if it is
 *  already valid UTF-8 and is otherwise converted as Windows-1252.
 * @return string The converted value; an empty string if iconv() fails. Note that the
 *  "already valid UTF-8" path returns the value without trimming, unlike the iconv path.
 */
private static function convIPTCHelper( $data, $charset ) {
	if ( !$charset ) {
		// No charset given: treat as utf-8 if it is valid utf-8, otherwise pretend
		// it is windows-1252. Most of the time when there is no 1:90 tag, the
		// content is either ascii, latin1, or utf-8.
		$validated = $data;
		// quickIsNFCVerify() rewrites its argument into valid utf-8 in place.
		\UtfNormal\Validator::quickIsNFCVerify( $validated );
		if ( $validated === $data ) {
			// validation left the value untouched, so it was fine already
			return $data;
		}
		return self::convIPTCHelper( $data, 'Windows-1252' );
	}

	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	$converted = @iconv( $charset, "UTF-8//IGNORE", $data );
	if ( $converted === false ) {
		wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
		$converted = "";
	}

	return trim( $converted );
}
440
/**
 * Map the value of the 1:90 tag to a charset name.
 *
 * According to the iim standard the charset is declared in tag 1:90 using
 * iso 2022 escape sequences. The standard seems to encourage putting all
 * necessary escape sequences in that tag, but does not require it.
 *
 * This needs more testing and is definitely not complete. Judging from the docs
 * of other iptc software, most of it only recognizes utf-8; without a 1:90 tag
 * the content is usually ascii or iso-8859-1 (and sometimes utf-8), but there is
 * no guarantee.
 *
 * It will not work if the tag holds more than one escape sequence (apart from
 * the few combinations listed explicitly) or if something is put in the G2 or
 * G3 charsets, etc. Only utf-8 is recognized reliably. The list simply follows
 * the charsets mentioned in appendix C of the iim standard.
 *
 * @param string $tag Value of the 1:90 tag
 * @return string|false Charset name, or false if the sequence is not recognized
 */
public static function getCharset( $tag ) {
	// \x1b = ESC.
	switch ( $tag ) {
		// utf-8, plus things that are compatible with it (e.g. ascii)
		case "\x1b%G": // utf-8
		case "\x1b(B": // ascii
		case "\x1b(@": // iso-646-IRV (ascii in latest version, $ different in older version)
			return 'UTF-8';

		// 7-bit national variants
		case "\x1b(A": // like ascii, but british.
			return 'ISO646-GB';
		case "\x1b(C": // some obscure Swedish/Finnish encoding
			return 'ISO-IR-8-1';
		case "\x1b(D":
			return 'ISO-IR-8-2';
		case "\x1b(E": // some obscure Danish/Norwegian encoding
			return 'ISO-IR-9-1';
		case "\x1b(F":
			return 'ISO-IR-9-2';
		case "\x1b(G": // aka iso 646-SE; ascii-like
			return 'SEN_850200_B';
		case "\x1b(I":
			return "ISO646-IT";
		case "\x1b(L":
			return "ISO646-PT";
		case "\x1b(Z":
			return "ISO646-ES";
		case "\x1b([":
			return "GREEK7-OLD";
		case "\x1b(K":
			return "ISO646-DE";
		case "\x1b(N": // Cyrillic
			return "ISO_5427";
		case "\x1b(`": // iso646-NO
			return "NS_4551-1";
		case "\x1b(f": // iso646-FR
			return "NF_Z_62-010";
		case "\x1b(g": // iso646-PT2
			return "PT2";
		case "\x1b(h":
			return "ES2";
		case "\x1b(i": // iso646-HU
			return "MSZ_7795.3";
		case "\x1b(w":
			return "CSA_Z243.4-1985-1";
		case "\x1b(x":
			return "CSA_Z243.4-1985-2";

		// multi-byte
		case "\x1b\$(B":
		case "\x1b\$B":
		case "\x1b&@\x1b\$B":
		case "\x1b&@\x1b\$(B":
			return "JIS_C6226-1983";

		// iso-8859 family; at least for the high code characters.
		case "\x1b-A":
		case "\x1b(@\x1b-A":
		case "\x1b(B\x1b-A":
			return 'ISO-8859-1';
		case "\x1b-B":
			return 'ISO-8859-2';
		case "\x1b-C":
			return 'ISO-8859-3';
		case "\x1b-D":
			return 'ISO-8859-4';
		case "\x1b-E":
			return 'ISO-8859-5';
		case "\x1b-F":
			return 'ISO-8859-6';
		case "\x1b-G":
			return 'ISO-8859-7';
		case "\x1b-H":
			return 'ISO-8859-8';
		case "\x1b-I": // CSN_369103. at least for the high code characters.
			return 'CSN_369103';
	}

	wfDebugLog( 'iptc', __METHOD__ . ': Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
	// at this point just give up and refuse to parse iptc?
	return false;
}
572}
573
575class_alias( IPTC::class, 'IPTC' );
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
Class for some IPTC functions.
Definition IPTC.php:19
static getCharset( $tag)
Takes the value of the 1:90 tag and returns a charset.
Definition IPTC.php:449
static parse( $rawData)
This takes the results of iptcparse() and puts it into a form that can be handled by MediaWiki.
Definition IPTC.php:30