MediaWiki  master
IPTC.php
Go to the documentation of this file.
1 <?php
29 class IPTC {
/**
 * Parse a raw IPTC block and translate the tags MediaWiki understands
 * into an array keyed by Exif-style property names.
 *
 * The raw data is decoded with iptcparse(). If tag 1:90 names a charset
 * that getCharset() does not recognise, nothing is parsed at all; a
 * missing 1:90 tag is not an error (convIPTC() then guesses the charset).
 *
 * Most entries in the result are arrays of converted strings. The
 * exceptions are 'iimVersion' (int), 'IntellectualGenre' and
 * 'SubjectNewsCode' (string), the 'DateTime*' entries (string as produced
 * by timeHelper()) and 'Software', which is [ [ name, version ] ] when a
 * program version (2:70) is present.
 *
 * @param string $rawData Raw IPTC data, handed unchanged to iptcparse()
 * @return array Map of property name to value; empty when the data
 *  cannot be parsed or declares an unknown charset
 */
public static function parse( $rawData ) {
	// Tags whose converted value list is stored as-is under the given key.
	static $simpleTags = [
		'2#120' => 'ImageDescription', // caption; mapped with exif ImageDescription
		'2#116' => 'Copyright', // mapped with exif Copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata, not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the objectdata.
		// This could be an agency, a member of an agency or an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // lead, correction, etc.
		'2#015' => 'iimCategory', // deprecated; max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // deprecated
		'2#010' => 'Urgency', // 1-8: 1 most, 5 normal, 8 low priority
		// "Identifies objectdata that recurs often and predictably... Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (ISO 3166 plus some custom codes), e.g. TUR, XUN, XSP.
		// See appendix D of the IIM standard.
		'2#026' => 'LocationDestCode',
		'2#027' => 'LocationDest', // full printable name of the content location
		'2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
		// Country/primary location code: "Indicates the code of the country/primary
		// location where the intellectual property of the objectdata was created".
		// Unclear how this differs from 2:26.
		'2#100' => 'CountryCodeDest',
		// "A code representing the location of original transmission according to
		// practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// "Identification of the name of the person involved in the writing, editing
		// or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode',
	];

	// Date tag => [ companion time tag, output key ].
	// Incomplete dates are not accepted even though the spec allows them,
	// and the timezone is not stored separately.
	static $dateTags = [
		'2#055' => [ '2#060', 'DateTimeOriginal' ], // date created (not digitized)
		'2#062' => [ '2#063', 'DateTimeDigitized' ], // date converted to digital form
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	$parsed = iptcparse( $rawData );
	if ( !is_array( $parsed ) ) {
		return [];
	}

	// Charset information is carried in tag 1:90.
	$c = '';
	if ( isset( $parsed['1#090'][0] ) ) {
		$c = self::getCharset( $parsed['1#090'][0] );
		if ( $c === false ) {
			// Unknown charset: refuse to parse. Note that "unknown charset"
			// is different from "no charset specified".
			return [];
		}
		unset( $parsed['1#090'] );
	}

	$data = [];
	foreach ( $parsed as $tag => $val ) {
		if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
			wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
			continue;
		}

		if ( isset( $simpleTags[$tag] ) ) {
			$data[$simpleTags[$tag]] = self::convIPTC( $val, $c );
			continue;
		}

		if ( isset( $dateTags[$tag] ) ) {
			list( $timeTag, $field ) = $dateTags[$tag];
			$timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
			if ( $timestamp ) {
				$data[$field] = $timestamp;
			}
			continue;
		}

		switch ( $tag ) {
			case '2#080':
				// Byline; mapped with exif Artist. Each byline title (2:85)
				// entry corresponds to a specific byline, so prefix it the
				// way exif does ("Title, person"). The two fields are no
				// longer separate afterwards.
				$bylines = self::convIPTC( $val, $c );
				$titles = isset( $parsed['2#085'] )
					? self::convIPTC( $parsed['2#085'], $c )
					: [];

				$titleCount = count( $titles );
				for ( $i = 0; $i < $titleCount; $i++ ) {
					// Theoretically always set, but be careful.
					if ( isset( $bylines[$i] ) ) {
						$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
					}
				}
				$data['Artist'] = $bylines;
				break;

			case '2#065':
				// Originating program; combined with program version (2:70) if present.
				$software = self::convIPTC( $val, $c );

				if ( count( $software ) !== 1 ) {
					// The IIM standard does not allow multiple values here, so
					// anything else means something weird is going on: skip it.
					wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
					break;
				}

				if ( isset( $parsed['2#070'] ) ) {
					$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
					$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
				} else {
					$data['Software'] = $software;
				}
				break;

			case '2#000':
				// IIM version. Unlike other tags this is a 2-byte big-endian
				// binary number. Technically required whenever there is IPTC
				// data, but in practice it is not always there.
				if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
					$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
				}
				break;

			case '2#004':
				// Intellectual genre. The first 4 characters are an id code
				// we are not interested in. IIM 4.1 allows multiple values
				// but XMP does not, so only the first value is extracted.
				$con = self::convIPTC( $val, $c );
				if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
					wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
					break;
				}
				$data['IntellectualGenre'] = substr( $con[0], 4 );
				break;

			case '2#012':
				// Subject news code. This is a compound field; only the
				// code itself (second colon-separated part) is extracted.
				$codes = self::convIPTC( $val, $c );
				foreach ( $codes as $ic ) {
					$fields = explode( ':', $ic, 3 );

					if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
						wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
						// NOTE(review): this stops at the first invalid entry, so
						// later valid codes are dropped — confirm that is intended.
						break;
					}
					// Each valid entry overwrites the previous one.
					$data['SubjectNewsCode'] = $fields[1];
				}
				break;

			// Purposely does not do 2:125, 2:130, 2:131,
			// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3,
			// 2:200, 2:201, 2:202
			// or the audio stuff (2:150 to 2:154).

			case '2#070':
			case '2#060':
			case '2#063':
			case '2#085':
			case '2#038':
			case '2#035':
				// Companion tags; consumed together with their primary tag above.
				break;

			default:
				wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
				break;
		}
	}

	return $data;
}
333 
/**
 * Combine an IPTC date tag and its optional time tag into one timestamp.
 *
 * Both values are charset-converted, trimmed and strictly validated
 * (date: exactly 8 digits, time: HHMMSS followed by a signed 4-digit
 * offset). Dates with a zero year, month or day are rejected, although
 * the IPTC spec allows such partial dates (e.g. 00000400 for "some April").
 *
 * @param array $date Values of the date tag; must contain exactly one entry
 * @param array $time Values of the time tag; when it does not contain
 *  exactly one entry the result is date-only
 * @param string $charset Charset passed through to convIPTC()
 * @return string|null Timestamp in TS_EXIF format (only its first 10
 *  characters when no time was given), or null on invalid input
 */
private static function timeHelper( $date, $time, $charset ) {
	// The standard says there is always exactly one date value.
	if ( count( $date ) !== 1 ) {
		return null;
	}
	$date = trim( self::convIPTC( $date, $charset )[0] );

	$dateOnly = count( $time ) !== 1;
	$time = $dateOnly
		? '000000+0000' // placeholder
		: trim( self::convIPTC( $time, $charset )[0] );

	// Anchored on both ends: the substr() offsets below are only
	// meaningful when the whole string has the expected layout.
	$valid = preg_match( '/^\d{6}[-+]\d{4}$/D', $time )
		&& preg_match( '/^\d{8}$/D', $date )
		&& substr( $date, 0, 4 ) !== '0000'
		&& substr( $date, 4, 2 ) !== '00'
		&& substr( $date, 6, 2 ) !== '00';
	if ( !$valid ) {
		wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

		return null;
	}

	$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
	if ( $unixTS === false ) {
		wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

		return null;
	}

	// Offset in seconds, built from the HH and MM digits after the sign.
	$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
		+ ( intval( substr( $time, 9, 2 ) ) * 60 );
	if ( substr( $time, 6, 1 ) === '-' ) {
		$tz = -$tz;
	}

	// NOTE(review): the offset is added to the parsed time rather than
	// subtracted from it — confirm this is the intended direction.
	$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
	if ( $finalTimestamp === false ) {
		wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

		return null;
	}

	return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
}
403 
/**
 * Charset-convert a single IPTC value or every value in an array.
 *
 * @param string|array $data One value, or an array of values (keys are kept)
 * @param string $charset Charset handed to convIPTCHelper()
 * @return string|array The converted value(s), same shape as the input
 */
private static function convIPTC( $data, $charset ) {
	if ( !is_array( $data ) ) {
		return self::convIPTCHelper( $data, $charset );
	}

	return array_map(
		function ( $item ) use ( $charset ) {
			return self::convIPTCHelper( $item, $charset );
		},
		$data
	);
}
422 
/**
 * Convert one IPTC value to UTF-8.
 *
 * With a charset, the value is run through iconv() and trimmed; a failed
 * conversion yields an empty string. Without a charset, the value is
 * returned untouched if UTF-8 validation leaves it unchanged, and is
 * otherwise converted as Windows-1252.
 *
 * @param string $data Value to convert
 * @param string $charset Source charset, or a falsy value if unknown
 * @return string
 */
private static function convIPTCHelper( $data, $charset ) {
	if ( !$charset ) {
		// Without a 1:90 tag the data is most often ascii, latin1 or utf-8:
		// keep it when it is already valid utf-8, else assume windows-1252.
		$untouched = $data;
		UtfNormal\Validator::quickIsNFCVerify( $data ); // rewrites $data into valid utf-8
		if ( $data !== $untouched ) {
			return self::convIPTCHelper( $untouched, 'Windows-1252' );
		}

		return $data;
	}

	Wikimedia\suppressWarnings();
	$converted = iconv( $charset, "UTF-8//IGNORE", $data );
	Wikimedia\restoreWarnings();

	if ( $converted === false ) {
		wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );

		return "";
	}

	return trim( $converted );
}
453 
/**
 * Map the value of IPTC tag 1:90 to a charset name.
 *
 * According to the IIM standard, tag 1:90 holds ISO 2022 escape sequences
 * that select the character set. Only an exact match of the whole tag
 * value against the sequences listed below (taken from appendix C of the
 * IIM standard) is recognised. This is not complete: it will not work
 * when the tag contains other combinations of escape sequences or uses
 * the G2/G3 sets, and it will only reliably recognise utf-8.
 *
 * The lookup is strict, so only a string can match.
 *
 * @param string $tag Raw 1:90 value
 * @return string|false Charset name, or false when the value is not recognised
 */
public static function getCharset( $tag ) {
	// \x1b = ESC.
	static $charsets = [
		"\x1b%G" => 'UTF-8',
		// Things that are compatible with utf-8 are also called utf-8.
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but British
		"\x1b(C" => 'ISO-IR-8-1', // some obscure Swedish/Finnish encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure Danish/Norwegian encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => "ISO646-IT",
		"\x1b(L" => "ISO646-PT",
		"\x1b(Z" => "ISO646-ES",
		"\x1b([" => "GREEK7-OLD",
		"\x1b(K" => "ISO646-DE",
		"\x1b(N" => "ISO_5427", // Cyrillic
		"\x1b(`" => "NS_4551-1", // iso646-NO
		"\x1b(f" => "NF_Z_62-010", // iso646-FR
		"\x1b(g" => "PT2", // iso646-PT2
		"\x1b(h" => "ES2",
		"\x1b(i" => "MSZ_7795.3", // iso646-HU
		"\x1b(w" => "CSA_Z243.4-1985-1",
		"\x1b(x" => "CSA_Z243.4-1985-2",
		"\x1b\$(B" => "JIS_C6226-1983",
		"\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$(B" => "JIS_C6226-1983",
		// The ISO-8859 entries hold at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103',
	];

	if ( is_string( $tag ) && isset( $charsets[$tag] ) ) {
		return $charsets[$tag];
	}

	// At this point just give up; the caller refuses to parse the IPTC data.
	wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

	return false;
}
585 }
static parse( $rawData)
This takes the results of iptcparse() and puts it into a form that can be handled by mediawiki...
Definition: IPTC.php:40
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static timeHelper( $date, $time, $charset)
Convert IPTC date and time tags into the Exif format.
Definition: IPTC.php:343
static convIPTC( $data, $charset)
Helper function to convert charset for iptc values.
Definition: IPTC.php:411
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not...
static convIPTCHelper( $data, $charset)
Helper function of a helper function to convert charset for iptc values.
Definition: IPTC.php:430
Class for some IPTC functions.
Definition: IPTC.php:29
static getCharset( $tag)
Takes the value of the 1:90 tag and returns a charset.
Definition: IPTC.php:462