MediaWiki  1.28.0
IPTC.php
Go to the documentation of this file.
1 <?php
29 class IPTC {
/**
 * Run iptcparse() on a raw IPTC block and map the recognised tags onto
 * named metadata properties (many of them shared with Exif).
 *
 * The charset declared in tag 1:90, if any, is resolved through
 * getCharset() and applied to every textual value. A 1:90 value that
 * getCharset() does not recognise aborts parsing entirely.
 *
 * @param string $rawData Raw data handed unchanged to iptcparse()
 * @return array Property name => value. Most values are arrays of strings;
 *  dates, iimVersion, IntellectualGenre and SubjectNewsCode are scalars.
 *  Empty when iptcparse() yields no array or the charset is unknown.
 */
static function parse( $rawData ) {
    // Tags whose charset-converted values are stored as-is under one property.
    static $simpleTags = [
        '2#120' => 'ImageDescription', // caption; mapped with exif ImageDescription
        '2#116' => 'Copyright', // mapped with exif Copyright
        '2#025' => 'Keywords',
        '2#101' => 'CountryDest', // country (shown)
        '2#095' => 'ProvinceOrStateDest', // state/province (shown)
        '2#090' => 'CityDest', // city (shown)
        '2#092' => 'SublocationDest', // sublocation (shown)
        '2#005' => 'ObjectName', // object name/title
        '2#040' => 'SpecialInstructions',
        '2#105' => 'Headline',
        // "Identifies the provider of the objectdata, not necessarily the owner/creator".
        '2#110' => 'Credit',
        // "Identifies the original owner of the intellectual content of the objectdata.
        // This could be an agency, a member of an agency or an individual."
        '2#115' => 'Source',
        '2#007' => 'EditStatus', // lead, correction, etc.
        '2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
        '2#020' => 'iimSupplementalCategory', // deprecated
        '2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
        // "Identifies objectdata that recurs often and predictably... Example: Euroweather"
        '2#022' => 'FixtureIdentifier',
        // Content location code (iso 3166 + some custom things),
        // e.g. TUR (Turkey), XUN (UN), XSP (outer space). See appendix D of the iim std.
        '2#026' => 'LocationDestCode',
        '2#027' => 'LocationDest', // full printable name of the photo's location
        '2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
        // "Indicates the code of the country/primary location where the intellectual
        // property of the objectdata was created". Unclear how this differs from 2:26.
        '2#100' => 'CountryCodeDest',
        // "A code representing the location of original transmission according to
        // practises of the provider."
        '2#103' => 'OriginalTransmissionRef',
        '2#118' => 'Contact',
        // "Identification of the name of the person involved in the writing, editing
        // or correcting the objectdata or caption/abstract."
        '2#122' => 'Writer',
        '2#135' => 'LanguageCode',
    ];

    // Date tag => [ companion time tag, output property ].
    // Incomplete dates are not accepted even though the spec allows them.
    static $dateTags = [
        '2#055' => [ '2#060', 'DateTimeOriginal' ], // date created (not digitized)
        '2#062' => [ '2#063', 'DateTimeDigitized' ], // converted to digital representation
        '2#030' => [ '2#035', 'DateTimeReleased' ],
        '2#037' => [ '2#038', 'DateTimeExpires' ],
    ];

    $parsed = iptcparse( $rawData );
    if ( !is_array( $parsed ) ) {
        return [];
    }

    // Charset info is contained in tag 1:90.
    $c = '';
    if ( isset( $parsed['1#090'][0] ) ) {
        $c = self::getCharset( $parsed['1#090'][0] );
        if ( $c === false ) {
            // Unknown charset: refuse to parse. Note that there is a
            // difference between "unknown" and "no charset specified".
            return [];
        }
        unset( $parsed['1#090'] );
    }

    $data = [];
    foreach ( $parsed as $tag => $val ) {
        if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
            wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
            continue;
        }

        if ( isset( $simpleTags[$tag] ) ) {
            $data[$simpleTags[$tag]] = self::convIPTC( $val, $c );
            continue;
        }

        if ( isset( $dateTags[$tag] ) ) {
            list( $timeTag, $property ) = $dateTags[$tag];
            $time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : [];
            $timestamp = self::timeHelper( $val, $time, $c );
            if ( $timestamp ) {
                $data[$property] = $timestamp;
            }
            continue;
        }

        switch ( $tag ) {
            case '2#080':
                // Byline, mapped with exif Artist. Each byline title (2:85)
                // corresponds to the byline at the same index and is merged
                // into it as "Title, person", so the two fields are no
                // longer kept separate.
                $bylines = self::convIPTC( $val, $c );
                $titles = isset( $parsed['2#085'] )
                    ? self::convIPTC( $parsed['2#085'], $c )
                    : [];

                $titleCount = count( $titles );
                for ( $i = 0; $i < $titleCount; $i++ ) {
                    // Theoretically always set, but doesn't hurt to be careful.
                    if ( isset( $bylines[$i] ) ) {
                        $bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
                    }
                }
                $data['Artist'] = $bylines;
                break;

            case '2#065':
                // Originating program, combined with program version (2:70) if present.
                $software = self::convIPTC( $val, $c );

                if ( count( $software ) !== 1 ) {
                    // According to the iim standard this cannot have multiple
                    // values, so something weird is happening and we skip it.
                    wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
                    break;
                }

                if ( isset( $parsed['2#070'] ) ) {
                    $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
                    $data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
                } else {
                    $data['Software'] = $software;
                }
                break;

            case '2#000':
                // iim version. Unlike other tags this is a 2-byte big-endian
                // binary number. Technically required whenever there is iptc
                // data, but in practice it isn't always there.
                if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
                    $data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
                }
                break;

            case '2#004':
                // Intellectual genre. The first 4 characters are an id code we
                // are not interested in. iim 4.1 allows multiple values but XMP
                // does not, so only the first value is extracted.
                $con = self::convIPTC( $val, $c );
                if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
                    wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
                    break;
                }
                $data['IntellectualGenre'] = substr( $con[0], 4 );
                break;

            case '2#012':
                // Subject news code. This is a compound field; only the code
                // itself (second ':'-separated field) is extracted.
                foreach ( self::convIPTC( $val, $c ) as $ic ) {
                    $fields = explode( ':', $ic, 3 );

                    if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
                        wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
                        // Leaves the inner foreach only: remaining codes are skipped.
                        break;
                    }
                    $data['SubjectNewsCode'] = $fields[1];
                }
                break;

            // Purposely does not do 2:125, 2:130, 2:131,
            // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3,
            // 2:200, 2:201, 2:202
            // or the audio stuff (2:150 to 2:154).

            case '2#070':
            case '2#060':
            case '2#063':
            case '2#085':
            case '2#038':
            case '2#035':
                // Ignore: consumed while handling the tag they accompany.
                break;

            default:
                wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
                break;
        }
    }

    return $data;
}
349 
/**
 * Combine an IPTC date tag and its optional time tag into an Exif-format
 * timestamp string.
 *
 * @param array $date Values of the date tag; exactly one is expected (CCYYMMDD)
 * @param array $time Values of the time tag (HHMMSS±HHMM); when it does not
 *  hold exactly one value, the result is a date without a time part
 * @param string $c Charset passed through to convIPTC()
 * @return string|null Timestamp in TS_EXIF format, only its first 10
 *  characters when no time was given, or null if the input is unusable
 */
private static function timeHelper( $date, $time, $c ) {
    // The standard says there is always exactly one date; double-check.
    if ( count( $date ) !== 1 ) {
        return null;
    }
    list( $date ) = self::convIPTC( $date, $c );

    $dateOnly = count( $time ) !== 1;
    if ( $dateOnly ) {
        $time = '000000+0000'; // placeholder
    } else {
        list( $time ) = self::convIPTC( $time, $c );
    }

    // Both patterns are anchored at the start because every substr() below
    // reads fixed offsets from position 0; an unanchored match elsewhere in
    // the string would pass validation and then be sliced incorrectly.
    $isValid = preg_match( '/^\d{6}[-+]\d{4}/', $time )
        && preg_match( '/^\d{8}/', $date )
        && substr( $date, 0, 4 ) !== '0000'
        && substr( $date, 4, 2 ) !== '00'
        && substr( $date, 6, 2 ) !== '00';

    if ( !$isValid ) {
        // Note, this rejects some dates that are valid according to the
        // iptc spec. For example 00000400 means the photo was taken in
        // April with year and day unknown. Such incomplete dates are not
        // processed at the moment.
        wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

        return null;
    }

    $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
    if ( $unixTS === false ) {
        wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

        return null;
    }

    // Offset in seconds from the HHMM part, signed by the character at index 6.
    $tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
        + ( intval( substr( $time, 9, 2 ) ) * 60 );
    if ( substr( $time, 6, 1 ) === '-' ) {
        $tz = -$tz;
    }

    $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
    if ( $finalTimestamp === false ) {
        wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

        return null;
    }

    // "YYYY:MM:DD" is the first 10 characters of an Exif timestamp.
    return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
}
419 
/**
 * Convert the charset of an IPTC value, or of every value in an array.
 *
 * @param string|array $data A single value or an array of values
 * @param string $charset Source charset; passed through to convIPTCHelper()
 * @return string|array Converted data in the same shape as the input,
 *  with array keys preserved
 */
private static function convIPTC( $data, $charset ) {
    if ( !is_array( $data ) ) {
        return self::convIPTCHelper( $data, $charset );
    }

    // Assign by key instead of iterating by reference, so no dangling
    // reference into the returned array is left behind.
    foreach ( $data as $key => $value ) {
        $data[$key] = self::convIPTCHelper( $value, $charset );
    }

    return $data;
}
438 
/**
 * Convert one IPTC string to UTF-8.
 *
 * @param string $data Raw value
 * @param string $charset Source charset; when empty the value is kept if
 *  it is already valid UTF-8 and otherwise treated as Windows-1252
 * @return string Converted value. Values that went through iconv() are
 *  trimmed; an already-valid UTF-8 value is returned untouched.
 */
private static function convIPTCHelper( $data, $charset ) {
    if ( !$charset ) {
        // Without a 1:90 tag the content is most of the time ascii,
        // latin1 or utf-8. Keep it if it is valid utf-8, otherwise
        // pretend it is windows-1252.
        $original = $data;
        UtfNormal\Validator::quickIsNFCVerify( $data ); // makes $data valid utf-8
        if ( $data !== $original ) {
            // Validation had to alter the string, so it was not utf-8.
            return self::convIPTCHelper( $original, 'Windows-1252' );
        }

        return $data;
    }

    MediaWiki\suppressWarnings();
    $converted = iconv( $charset, "UTF-8//IGNORE", $data );
    MediaWiki\restoreWarnings();

    if ( $converted === false ) {
        wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
        $converted = "";
    }

    return trim( $converted );
}
469 
/**
 * Map the value of the 1:90 tag onto a charset name.
 *
 * According to the iim standard the charset is declared in tag 1:90 as
 * iso 2022 escape sequences. The standard encourages putting all needed
 * escape sequences there but does not require it.
 *
 * This is definitely not complete and needs more testing. It goes through
 * the charsets mentioned in appendix C of the iim standard, matches the
 * tag value as a whole, and so will not work when the tag holds a
 * combination of escape sequences that is not listed, or when something is
 * put in the G2 or G3 charsets. It will only reliably recognize utf-8.
 *
 * @param string $tag Value of the 1:90 tag
 * @return string|bool Charset name, or false if the value is not recognised
 */
static function getCharset( $tag ) {
    // \x1b = ESC.
    static $charsets = [
        "\x1b%G" => 'UTF-8',
        // Things that are compatible with utf-8 are also called utf-8.
        "\x1b(B" => 'UTF-8', // ascii
        "\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
        "\x1b(A" => 'ISO646-GB', // like ascii, but british
        "\x1b(C" => 'ISO-IR-8-1', // some obscure Swedish/Finnish encoding
        "\x1b(D" => 'ISO-IR-8-2',
        "\x1b(E" => 'ISO-IR-9-1', // some obscure Danish/Norwegian encoding
        "\x1b(F" => 'ISO-IR-9-2',
        "\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
        "\x1b(I" => 'ISO646-IT',
        "\x1b(L" => 'ISO646-PT',
        "\x1b(Z" => 'ISO646-ES',
        "\x1b([" => 'GREEK7-OLD',
        "\x1b(K" => 'ISO646-DE',
        "\x1b(N" => 'ISO_5427', // Cyrillic
        "\x1b(`" => 'NS_4551-1', // iso646-NO
        "\x1b(f" => 'NF_Z_62-010', // iso646-FR
        "\x1b(g" => 'PT2', // iso646-PT2
        "\x1b(h" => 'ES2',
        "\x1b(i" => 'MSZ_7795.3', // iso646-HU
        "\x1b(w" => 'CSA_Z243.4-1985-1',
        "\x1b(x" => 'CSA_Z243.4-1985-2',
        "\x1b\$(B" => 'JIS_C6226-1983',
        "\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
        // The iso-8859-x entries hold at least for the high code characters.
        "\x1b-A" => 'ISO-8859-1',
        "\x1b(@\x1b-A" => 'ISO-8859-1',
        "\x1b(B\x1b-A" => 'ISO-8859-1',
        "\x1b-B" => 'ISO-8859-2',
        "\x1b-C" => 'ISO-8859-3',
        "\x1b-D" => 'ISO-8859-4',
        "\x1b-E" => 'ISO-8859-5',
        "\x1b-F" => 'ISO-8859-6',
        "\x1b-G" => 'ISO-8859-7',
        "\x1b-H" => 'ISO-8859-8',
        "\x1b-I" => 'CSN_369103',
    ];

    // Strict lookup: only an exact string match selects a charset.
    if ( is_string( $tag ) && isset( $charsets[$tag] ) ) {
        return $charsets[$tag];
    }

    wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

    // At this point just give up; the caller refuses to parse the iptc data.
    return false;
}
602 }
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
static convIPTCHelper($data, $charset)
Helper function of a helper function to convert charset for iptc values.
Definition: IPTC.php:446
static getCharset($tag)
Takes the value of the 1:90 tag and returns a charset.
Definition: IPTC.php:478
const TS_UNIX
Unix time - the number of seconds since 1970-01-01 00:00:00 UTC.
Definition: defines.php:6
wfTimestamp($outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
wfDebugLog($logGroup, $text, $dest= 'all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not...
if($limit) $timestamp
const TS_EXIF
An Exif timestamp (YYYY:MM:DD HH:MM:SS)
Definition: defines.php:37
static timeHelper($date, $time, $c)
Convert IPTC date and time tags into the Exif format.
Definition: IPTC.php:359
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books $tag
Definition: hooks.txt:1007
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
static convIPTC($data, $charset)
Helper function to convert charset for iptc values.
Definition: IPTC.php:427
linkcache txt The LinkCache class maintains a list of article titles and the information about whether or not the article exists in the database This is used to mark up links when displaying a page If the same link appears more than once on any page then it only has to be looked up once In most cases link lookups are done in batches with the LinkBatch class or the equivalent in so the link cache is mostly useful for short snippets of parsed and for links in the navigation areas of the skin The link cache was formerly used to track links used in a document for the purposes of updating the link tables This application is now deprecated To create a you can use the following $titles
Definition: linkcache.txt:17
static parse($rawData)
This takes the results of iptcparse() and puts it into a form that can be handled by mediawiki...
Definition: IPTC.php:40
Class for some IPTC functions.
Definition: IPTC.php:29
see documentation in includes Linker php for Linker::makeImageLink & $time
Definition: hooks.txt:1749