MediaWiki  1.34.0
IPTC.php
Go to the documentation of this file.
1 <?php
/**
 * Class for some IPTC functions.
 *
 * Turns the raw output of iptcparse() into an array keyed by the (mostly
 * Exif-style) metadata field names used below, converting text values to
 * UTF-8 on the way.
 */
class IPTC {
	/**
	 * Tags whose values are copied straight into one output field after
	 * charset conversion. Maps iptcparse() key => output field name.
	 *
	 * Purposely does not cover 2:125, 2:130, 2:131, 2:47, 2:50, 2:45, 2:42,
	 * 2:8, 2:3, 2:200, 2:201, 2:202 or the audio tags (2:150 to 2:154).
	 *
	 * @var string[]
	 */
	private static $simpleTags = [
		// IPTC caption. Mapped with exif ImageDescription.
		'2#120' => 'ImageDescription',
		// Copyright. Mapped with exif Copyright.
		'2#116' => 'Copyright',
		// Keywords.
		'2#025' => 'Keywords',
		// Country (shown).
		'2#101' => 'CountryDest',
		// State/province (shown).
		'2#095' => 'ProvinceOrStateDest',
		// City (shown).
		'2#090' => 'CityDest',
		// Sublocation (shown).
		'2#092' => 'SublocationDest',
		// Object name/title.
		'2#005' => 'ObjectName',
		// Special instructions.
		'2#040' => 'SpecialInstructions',
		// Headline.
		'2#105' => 'Headline',
		// Credit: "Identifies the provider of the objectdata,
		// not necessarily the owner/creator".
		'2#110' => 'Credit',
		// Source: "Identifies the original owner of the intellectual content
		// of the objectdata. This could be an agency, a member of an agency
		// or an individual."
		'2#115' => 'Source',
		// Edit status (lead, correction, etc).
		'2#007' => 'EditStatus',
		// Category. Deprecated. Max 3 letters in theory, often more.
		'2#015' => 'iimCategory',
		// Supplemental category. Deprecated.
		'2#020' => 'iimSupplementalCategory',
		// Urgency (1-8. 1 most, 5 normal, 8 low priority).
		'2#010' => 'Urgency',
		// Fixture identifier: "Identifies objectdata that recurs often and
		// predictably... Example: Euroweather".
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things),
		// e.g. TUR (for Turkey), XUN (for UN), XSP (outer space).
		// See the Wikipedia article on iso 3166 and appendix D of the iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle: a for morning (am), p for evening, b for both.
		'2#075' => 'ObjectCycle',
		// Country/primary location code: "Indicates the code of the
		// country/primary location where the intellectual property of the
		// objectdata was created". Unclear how this differs from 2#026.
		'2#100' => 'CountryCodeDest',
		// Original transmission ref: "A code representing the location of
		// original transmission according to practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		// Contact.
		'2#118' => 'Contact',
		// Writer/Editor: "Identification of the name of the person involved
		// in the writing, editing or correcting the objectdata or
		// caption/abstract."
		'2#122' => 'Writer',
		// Language code.
		'2#135' => 'LanguageCode',
	];

	/**
	 * Date tags. Maps the date tag => [ matching time tag, output field ].
	 *
	 * Incomplete dates are not accepted even though they are valid according
	 * to the spec. Should potentially store the timezone as well.
	 *
	 * @var array[]
	 */
	private static $dateTags = [
		// Date created (not date digitized). Maps to exif DateTimeOriginal.
		'2#055' => [ '2#060', 'DateTimeOriginal' ],
		// Date converted to digital representation. Maps to exif DateTimeDigitized.
		'2#062' => [ '2#063', 'DateTimeDigitized' ],
		// Date released.
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		// Date expires.
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	/**
	 * ISO 2022 escape sequences (as found in tag 1:90) => charset name handed
	 * to iconv(). This just goes through the charsets mentioned in appendix C
	 * of the iim standard. \x1b = ESC.
	 *
	 * @var string[]
	 */
	private static $charsets = [
		// utf-8, plus things that are compatible with utf-8 (e.g. ascii).
		"\x1b%G" => 'UTF-8',
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british.
		"\x1b(C" => 'ISO-IR-8-1', // some obscure Swedish/Finnish encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure Danish/Norwegian encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => 'ISO646-IT',
		"\x1b(L" => 'ISO646-PT',
		"\x1b(Z" => 'ISO646-ES',
		"\x1b([" => 'GREEK7-OLD',
		"\x1b(K" => 'ISO646-DE',
		"\x1b(N" => 'ISO_5427', // Cyrillic
		"\x1b(`" => 'NS_4551-1', // iso646-NO
		"\x1b(f" => 'NF_Z_62-010', // iso646-FR
		"\x1b(g" => 'PT2', // iso646-PT2
		"\x1b(h" => 'ES2',
		"\x1b(i" => 'MSZ_7795.3', // iso646-HU
		"\x1b(w" => 'CSA_Z243.4-1985-1',
		"\x1b(x" => 'CSA_Z243.4-1985-2',
		"\x1b\$(B" => 'JIS_C6226-1983',
		"\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
		// iso-8859-N, at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103', // at least for the high code characters.
	];

	/**
	 * Parse a raw IPTC block into an array of metadata fields.
	 *
	 * The block is decoded with iptcparse(). If tag 1:90 names a charset that
	 * getCharset() does not recognise, nothing is extracted (note: an unknown
	 * charset is different from no charset being specified).
	 *
	 * @param string $rawData Raw IPTC data, as accepted by iptcparse()
	 * @return array Field name => value. Most values are arrays of strings;
	 *  iimVersion is an int, IntellectualGenre, SubjectNewsCode and the
	 *  DateTime* fields are strings. Empty if nothing could be parsed.
	 */
	public static function parse( $rawData ) {
		$parsed = iptcparse( $rawData );
		if ( !is_array( $parsed ) ) {
			return [];
		}

		$c = '';
		// Charset info is contained in tag 1:90.
		if ( isset( $parsed['1#090'][0] ) ) {
			$c = self::getCharset( $parsed['1#090'][0] );
			if ( $c === false ) {
				// Unknown charset. Refuse to parse.
				return [];
			}
			unset( $parsed['1#090'] );
		}

		$data = [];
		foreach ( $parsed as $tag => $val ) {
			if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
				wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
				continue;
			}

			if ( isset( self::$simpleTags[$tag] ) ) {
				$data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
				continue;
			}

			if ( isset( self::$dateTags[$tag] ) ) {
				list( $timeTag, $field ) = self::$dateTags[$tag];
				$timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
				if ( $timestamp ) {
					$data[$field] = $timestamp;
				}
				continue;
			}

			switch ( $tag ) {
				case '2#080':
					// Byline. Mapped with exif Artist.
					// Merged with byline title (2:85), like how exif does it
					// with "Title, person". Not sure if this is the best
					// approach since we no longer have the two fields
					// separate. Each byline title entry corresponds to a
					// specific byline.
					$bylines = self::convIPTC( $val, $c );
					$titles = isset( $parsed['2#085'] )
						? self::convIPTC( $parsed['2#085'], $c )
						: [];

					foreach ( $titles as $i => $title ) {
						// Theoretically the byline should always be set,
						// but it doesn't hurt to be careful.
						if ( isset( $bylines[$i] ) ) {
							$bylines[$i] = $title . ', ' . $bylines[$i];
						}
					}
					$data['Artist'] = $bylines;
					break;

				case '2#065':
					// Originating program.
					// Combined with program version (2:70) if present.
					$software = self::convIPTC( $val, $c );

					if ( count( $software ) !== 1 ) {
						// According to the iim standard this cannot have
						// multiple values, so if there is more than one,
						// something weird is happening, and we skip it.
						wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
						break;
					}

					if ( isset( $parsed['2#070'] ) ) {
						// A version is set for the software.
						$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
						$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
					} else {
						$data['Software'] = $software;
					}
					break;

				case '2#000':
					// iim version. Unlike other tags, this is a 2-byte
					// big-endian binary number. Technically this is required
					// if there is iptc data but in practise it isn't always
					// there. The length check is just to be paranoid.
					if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
						$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
					}
					break;

				case '2#004':
					// Intellectual genre. The first 4 characters are an id
					// code that we're not really interested in.
					// This prop is weird, since it's allowed to have multiple
					// values in iim 4.1, but not in the XMP stuff. We just
					// extract the first value.
					$con = self::convIPTC( $val, $c );
					if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
						wfDebugLog( 'iptc', 'IPTC: '
							. '2:04 too short. '
							. 'Ignoring.' );
						break;
					}
					$data['IntellectualGenre'] = substr( $con[0], 4 );
					break;

				case '2#012':
					// Subject news code - this is a compound field. At the
					// moment we only extract the subject news code, which is
					// an 8 digit (ascii) number describing the subject matter
					// of the content.
					$codes = self::convIPTC( $val, $c );
					foreach ( $codes as $ic ) {
						$fields = explode( ':', $ic, 3 );

						if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
							wfDebugLog( 'iptc', 'IPTC: '
								. 'Invalid 2:12 - ' . $ic );
							// Stops looking at the remaining codes; this only
							// leaves the inner foreach, not the tag loop.
							break;
						}
						// Each valid code overwrites the previous one, so the
						// last valid code before any invalid one wins.
						$data['SubjectNewsCode'] = $fields[1];
					}
					break;

				case '2#070':
				case '2#060':
				case '2#063':
				case '2#085':
				case '2#038':
				case '2#035':
					// Ignore. Handled together with their companion tags above.
					break;

				default:
					wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
					break;
			}
		}

		return $data;
	}

	/**
	 * Convert an IPTC date tag and its (optional) time tag into the Exif format.
	 *
	 * @param array $date Values of the date tag; must hold exactly one
	 *  CCYYMMDD entry
	 * @param array $time Values of the time tag; when it does not hold exactly
	 *  one HHMMSS±HHMM entry, only the date part is returned
	 * @param string $charset Charset to convert the values from (may be empty)
	 * @return string|null Result of wfTimestamp( TS_EXIF, ... ), cut down to
	 *  its first 10 characters when no time was given; null if the input is
	 *  invalid or cannot be converted
	 */
	private static function timeHelper( $date, $time, $charset ) {
		// The standard says this should always be 1; just double checking.
		if ( count( $date ) !== 1 ) {
			return null;
		}
		list( $date ) = self::convIPTC( $date, $charset );

		$dateOnly = count( $time ) !== 1;
		if ( $dateOnly ) {
			$time = '000000+0000'; // placeholder
		} else {
			list( $time ) = self::convIPTC( $time, $charset );
		}

		// Both patterns are anchored at the start because every field below
		// is read by fixed offset; an unanchored match would let a value with
		// leading junk through and then be sliced at the wrong positions.
		$valid = preg_match( '/^\d{6}[-+]\d{4}/', $time )
			&& preg_match( '/^\d{8}/', $date )
			&& substr( $date, 0, 4 ) !== '0000'
			&& substr( $date, 4, 2 ) !== '00'
			&& substr( $date, 6, 2 ) !== '00';
		if ( !$valid ) {
			// Something wrong.
			// Note, this rejects some valid dates according to the iptc spec,
			// for example: the date 00000400 means the photo was taken in
			// April, but the year and day are unknown. We don't process these
			// types of incomplete dates atm.
			wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

			return null;
		}

		$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
		if ( $unixTS === false ) {
			wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

			return null;
		}

		// Offset in seconds, from the ±HHMM suffix.
		$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
			+ ( intval( substr( $time, 9, 2 ) ) * 60 );
		if ( substr( $time, 6, 1 ) === '-' ) {
			$tz = -$tz;
		}

		// NOTE(review): the offset is added to the timestamp rather than
		// subtracted from it - confirm this is the intended direction.
		$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
		if ( $finalTimestamp === false ) {
			wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

			return null;
		}

		// With no time tag, return the date only.
		return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
	}

	/**
	 * Helper function to convert charset for iptc values.
	 *
	 * @param string|array $data The iptc string, or an array of them
	 * @param string $charset The charset to convert from (may be empty)
	 * @return string|array The converted value(s), in the same shape as $data
	 */
	private static function convIPTC( $data, $charset ) {
		if ( !is_array( $data ) ) {
			return self::convIPTCHelper( $data, $charset );
		}

		// Assign by key rather than by reference so that no reference is left
		// dangling on the last element of the returned array.
		foreach ( $data as $key => $val ) {
			$data[$key] = self::convIPTCHelper( $val, $charset );
		}

		return $data;
	}

	/**
	 * Helper function of a helper function to convert charset for iptc values.
	 *
	 * @param string $data The iptc string
	 * @param string $charset The charset to convert from; when empty the
	 *  charset is guessed
	 * @return string The converted string ("" if iconv() fails)
	 */
	private static function convIPTCHelper( $data, $charset ) {
		if ( $charset ) {
			Wikimedia\suppressWarnings();
			$data = iconv( $charset, "UTF-8//IGNORE", $data );
			Wikimedia\restoreWarnings();
			if ( $data === false ) {
				$data = "";
				wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
			}

			return trim( $data );
		}

		// No charset given: treat as utf-8 if it is valid utf-8, otherwise
		// pretend it's windows-1252. Most of the time if there is no 1:90 tag,
		// it is either ascii, latin1, or utf-8.
		$oldData = $data;
		// Called for its effect on $data (expected to make it valid utf-8);
		// an unchanged $data is taken to mean the input already was valid.
		UtfNormal\Validator::quickIsNFCVerify( $data );
		if ( $data === $oldData ) {
			// NOTE(review): unlike the other paths, this one returns the
			// value untrimmed - confirm whether that is intended.
			return $data;
		}

		return self::convIPTCHelper( $oldData, 'Windows-1252' );
	}

	/**
	 * Take the value of the 1:90 tag and return a charset.
	 *
	 * According to the iim standard, the charset is defined by tag 1:90, in
	 * which there are iso 2022 escape sequences to specify the character set.
	 * The iim standard seems to encourage that all necessary escape sequences
	 * are in the 1:90 tag, but says it doesn't have to be.
	 *
	 * This is in need of more testing probably, and is definitely not
	 * complete. However, reading the docs of some other iptc software, it
	 * appears that most iptc software only recognizes utf-8. If the 1:90 tag
	 * is not present the content is usually ascii or iso-8859-1 (and
	 * sometimes utf-8), but there is no guarantee.
	 *
	 * This also won't work if there is more than one escape sequence in the
	 * 1:90 tag (beyond the few combinations listed), or if something is put
	 * in the G2 or G3 charsets, etc. It will only reliably recognize utf-8.
	 *
	 * @param string $tag 1:90 tag value
	 * @return string|bool Charset name, or false if the sequence is unknown
	 */
	public static function getCharset( $tag ) {
		if ( is_string( $tag ) && isset( self::$charsets[$tag] ) ) {
			return self::$charsets[$tag];
		}

		wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

		// At this point just give up and refuse to parse iptc.
		return false;
	}
}
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1869
IPTC\convIPTCHelper
static convIPTCHelper( $data, $charset)
Helper function of a helper function to convert charset for iptc values.
Definition: IPTC.php:431
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1007
IPTC
Class for some IPTC functions.
Definition: IPTC.php:29
IPTC\parse
static parse( $rawData)
This takes the results of iptcparse() and puts them into a form that can be handled by MediaWiki.
Definition: IPTC.php:41
IPTC\getCharset
static getCharset( $tag)
Takes the value of the 1:90 tag and returns a charset.
Definition: IPTC.php:463
IPTC\convIPTC
static convIPTC( $data, $charset)
Helper function to convert charset for iptc values.
Definition: IPTC.php:412
IPTC\timeHelper
static timeHelper( $date, $time, $charset)
Convert an iptc date and time tags into the exif format.
Definition: IPTC.php:344