MediaWiki REL1_31
IPTC.php
Go to the documentation of this file.
1<?php
29class IPTC {
/**
 * IPTC tags whose values are copied, after charset conversion, straight
 * into a single metadata field. Keys are tag names as produced by
 * iptcparse() ("record#dataset"), values are the output field names.
 *
 * @var string[]
 */
private static $simpleTags = [
	'2#120' => 'ImageDescription', // caption. mapped with exif ImageDescription
	'2#116' => 'Copyright', // mapped with exif Copyright
	'2#025' => 'Keywords',
	'2#101' => 'CountryDest', // country (shown)
	'2#095' => 'ProvinceOrStateDest', // state/province (shown)
	'2#090' => 'CityDest', // city (shown)
	'2#092' => 'SublocationDest', // sublocation (shown)
	'2#005' => 'ObjectName', // object name/title
	'2#040' => 'SpecialInstructions',
	'2#105' => 'Headline',
	// "Identifies the provider of the objectdata, not necessarily the owner/creator".
	'2#110' => 'Credit',
	// "Identifies the original owner of the intellectual content of the objectdata.
	// This could be an agency, a member of an agency or an individual."
	'2#115' => 'Source',
	'2#007' => 'EditStatus', // lead, correction, etc
	'2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
	'2#020' => 'iimSupplementalCategory', // deprecated
	'2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
	// "Identifies objectdata that recurs often and predictably... Example: Euroweather"
	'2#022' => 'FixtureIdentifier',
	// Content location code (iso 3166 + some custom things)
	// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
	'2#026' => 'LocationDestCode',
	'2#027' => 'LocationDest', // full printable name of location of photo
	'2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
	// "Indicates the code of the country/primary location where the intellectual
	// property of the objectdata was created". Unclear how this differs from 2#026.
	'2#100' => 'CountryCodeDest',
	// "A code representing the location of original transmission according to
	// practises of the provider."
	'2#103' => 'OriginalTransmissionRef',
	'2#118' => 'Contact',
	// "Identification of the name of the person involved in the writing, editing or
	// correcting the objectdata or caption/abstract."
	'2#122' => 'Writer',
	'2#135' => 'LanguageCode',
];

/**
 * Date tags, each mapped to [ companion time tag, output field name ].
 *
 * @var array[]
 */
private static $dateTags = [
	// Date created (not date digitized). Maps to exif DateTimeOriginal
	'2#055' => [ '2#060', 'DateTimeOriginal' ],
	// Date converted to digital representation. Maps to exif DateTimeDigitized
	'2#062' => [ '2#063', 'DateTimeDigitized' ],
	// Date released
	'2#030' => [ '2#035', 'DateTimeReleased' ],
	// Date expires
	'2#037' => [ '2#038', 'DateTimeExpires' ],
];

/**
 * This takes the results of iptcparse() and puts it into a
 * form that can be handled by mediawiki.
 *
 * If tag 1:90 names a charset that getCharset() does not recognise, nothing
 * is parsed and an empty array is returned. A missing 1:90 tag is not an
 * error; values are then converted with an empty charset.
 *
 * Purposely does not handle 2:125, 2:130, 2:131, 2:47, 2:50, 2:45, 2:42,
 * 2:8, 2:3, 2:200, 2:201, 2:202 or the audio tags (2:150 to 2:154); those
 * are only written to the 'iptc' debug log as unsupported.
 *
 * @param string $rawData Raw data, passed unchanged to iptcparse()
 * @return array Metadata keyed by field name; empty if nothing could be parsed
 */
static function parse( $rawData ) {
	$parsed = iptcparse( $rawData );
	$data = [];
	if ( !is_array( $parsed ) ) {
		return $data;
	}

	$c = '';
	// charset info contained in tag 1:90.
	if ( isset( $parsed['1#090'][0] ) ) {
		$c = self::getCharset( $parsed['1#090'][0] );
		if ( $c === false ) {
			// Unknown charset. refuse to parse.
			// note: There is a difference between
			// unknown and no charset specified.
			return [];
		}
		unset( $parsed['1#090'] );
	}

	foreach ( $parsed as $tag => $val ) {
		// Only the first value of a tag is inspected here. 2:00 is exempt: it is
		// a 2-byte binary number and trim() also strips NUL and tab bytes, so a
		// legitimate value such as "\x00\x09" would look like "only whitespace".
		if ( $tag !== '2#000' && isset( $val[0] ) && trim( $val[0] ) === '' ) {
			wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
			continue;
		}

		if ( isset( self::$simpleTags[$tag] ) ) {
			$data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
			continue;
		}

		// Date stuff.
		// It doesn't accept incomplete dates even though they are valid
		// according to spec.
		// Should potentially store timezone as well.
		if ( isset( self::$dateTags[$tag] ) ) {
			list( $timeTag, $field ) = self::$dateTags[$tag];
			$time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : [];
			$timestamp = self::timeHelper( $val, $time, $c );
			if ( $timestamp ) {
				$data[$field] = $timestamp;
			}
			continue;
		}

		switch ( $tag ) {
			case '2#080': /* byline. Mapped with exif Artist */
				/* merge with byline title (2:85)
				 * like how exif does it with
				 * Title, person. Not sure if this is best
				 * approach since we no longer have the two fields
				 * separate. each byline title entry corresponds to a
				 * specific byline. */
				$bylines = self::convIPTC( $val, $c );
				$titles = isset( $parsed['2#085'] )
					? self::convIPTC( $parsed['2#085'], $c )
					: [];

				foreach ( $titles as $i => $title ) {
					// theoretically this should always be set
					// but doesn't hurt to be careful.
					if ( isset( $bylines[$i] ) ) {
						$bylines[$i] = $title . ', ' . $bylines[$i];
					}
				}
				$data['Artist'] = $bylines;
				break;

			case '2#065':
				/* Originating Program.
				 * Combine with Program version (2:70) if present.
				 */
				$software = self::convIPTC( $val, $c );

				if ( count( $software ) !== 1 ) {
					// according to iim standard this cannot have multiple values
					// so if there is more than one, something weird is happening,
					// and we skip it.
					wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
					break;
				}

				$softwareVersion = isset( $parsed['2#070'] )
					? self::convIPTC( $parsed['2#070'], $c )
					: [];
				if ( isset( $software[0] ) && isset( $softwareVersion[0] ) ) {
					// a version is set for the software. 2:70 itself is skipped
					// by the "handled elsewhere" case below.
					$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
				} else {
					$data['Software'] = $software;
				}
				break;

			case '2#000': /* iim version */
				// unlike other tags, this is a 2-byte binary number.
				// technically this is required if there is iptc data
				// but in practise it isn't always there.
				if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
					// high byte first
					$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
				}
				break;

			case '2#004':
				// IntellectualGenre.
				// first 4 characters are an id code
				// That we're not really interested in.

				// This prop is weird, since it's
				// allowed to have multiple values
				// in iim 4.1, but not in the XMP
				// stuff. We're going to just
				// extract the first value.
				$con = self::convIPTC( $val, $c );
				if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
					wfDebugLog( 'iptc', 'IPTC: '
						. '2:04 too short. '
						. 'Ignoring.' );
					break;
				}
				$data['IntellectualGenre'] = substr( $con[0], 4 );
				break;

			case '2#012':
				// Subject News code - this is a compound field
				// at the moment we only extract the subject news
				// code, which is an 8 digit (ascii) number
				// describing the subject matter of the content.
				$codes = self::convIPTC( $val, $c );
				foreach ( $codes as $ic ) {
					$fields = explode( ':', $ic, 3 );

					if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
						wfDebugLog( 'iptc', 'IPTC: '
							. 'Invalid 2:12 - ' . $ic );
						// Stops at the first malformed entry; any entries
						// after it are not examined.
						break;
					}
					// Later entries overwrite earlier ones.
					$data['SubjectNewsCode'] = $fields[1];
				}
				break;

			case '2#070':
			case '2#060':
			case '2#063':
			case '2#085':
			case '2#038':
			case '2#035':
				// ignore. Handled elsewhere (together with their primary tag).
				break;

			default:
				wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
				break;
		}
	}

	return $data;
}
349
/**
 * Convert an iptc date and time tags into the exif format.
 *
 * Incomplete dates (year, month or day given as zeros) are rejected even
 * though the iptc spec allows them; for example 00000400 means "April,
 * year and day unknown".
 *
 * @param array $date The date tag values; must contain exactly one value
 *   of the form CCYYMMDD
 * @param array $time The time tag values; used only if it contains exactly
 *   one value, of the form HHMMSS+HHMM or HHMMSS-HHMM
 * @param string $charset Charset passed on to convIPTC()
 * @return string|null Timestamp in the TS_EXIF format of wfTimestamp(), cut
 *   down to its first 10 characters (the date) when no usable time tag was
 *   given; null if the input is invalid or cannot be converted
 */
private static function timeHelper( $date, $time, $charset ) {
	if ( count( $date ) !== 1 ) {
		// the standard says this should always be 1
		// just double checking.
		return null;
	}
	$convertedDate = self::convIPTC( $date, $charset );
	// Trim explicitly so the anchored checks below do not depend on
	// whether the charset conversion happened to trim the value.
	$dateString = isset( $convertedDate[0] ) ? trim( $convertedDate[0] ) : '';

	if ( count( $time ) === 1 ) {
		$convertedTime = self::convIPTC( $time, $charset );
		$timeString = isset( $convertedTime[0] ) ? trim( $convertedTime[0] ) : '';
		$dateOnly = false;
	} else {
		$timeString = '000000+0000'; // placeholder
		$dateOnly = true;
	}

	// The patterns are anchored because everything below slices the strings
	// at fixed offsets, which is only meaningful if the whole value matches.
	$isValid = preg_match( '/^\d{6}[-+]\d{4}$/D', $timeString )
		&& preg_match( '/^\d{8}$/D', $dateString )
		&& substr( $dateString, 0, 4 ) !== '0000'
		&& substr( $dateString, 4, 2 ) !== '00'
		&& substr( $dateString, 6, 2 ) !== '00';

	if ( !$isValid ) {
		// something wrong.
		// Note, this rejects some valid dates according to iptc spec
		// for example: the date 00000400 means the photo was taken in
		// April, but the year and day is unknown. We don't process these
		// types of incomplete dates atm.
		wfDebugLog( 'iptc', "IPTC: invalid time ( $timeString ) or date ( $dateString )" );

		return null;
	}

	$unixTS = wfTimestamp( TS_UNIX, $dateString . substr( $timeString, 0, 6 ) );
	if ( $unixTS === false ) {
		wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $dateString $timeString." );

		return null;
	}

	// Offset in seconds, built from the HH and MM digits after the sign.
	$tz = ( intval( substr( $timeString, 7, 2 ) ) * 60 * 60 )
		+ ( intval( substr( $timeString, 9, 2 ) ) * 60 );

	if ( substr( $timeString, 6, 1 ) === '-' ) {
		$tz = -$tz;
	}

	// NOTE(review): the signed offset is *added* to the parsed time. Confirm
	// this is the intended direction for the value stored in the exif field.
	$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
	if ( $finalTimestamp === false ) {
		wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

		return null;
	}

	if ( $dateOnly ) {
		// return the date only
		return substr( $finalTimestamp, 0, 10 );
	}

	return $finalTimestamp;
}
419
/**
 * Helper function to convert charset for iptc values.
 *
 * A single value is converted directly; for an array every element is
 * converted and the array keys are kept.
 *
 * @param string|array $data The iptc value, or an array of such values
 * @param string $charset Charset handed to convIPTCHelper() for each value
 * @return string|array The converted value(s), same shape as $data
 */
private static function convIPTC( $data, $charset ) {
	if ( !is_array( $data ) ) {
		return self::convIPTCHelper( $data, $charset );
	}

	return array_map(
		function ( $item ) use ( $charset ) {
			return self::convIPTCHelper( $item, $charset );
		},
		$data
	);
}
438
/**
 * Helper function of a helper function to convert charset for iptc values.
 *
 * With a charset, the value is converted from that charset to UTF-8 using
 * iconv(). Without one, the value is kept if it is already valid UTF-8 and
 * is otherwise converted as if it were Windows-1252. The result is trimmed
 * on every path, so the output does not depend on which path was taken.
 *
 * @param string $data The iptc value
 * @param string $charset Source charset; empty if none was specified
 * @return string The value as UTF-8, trimmed; empty string if iconv() fails
 */
private static function convIPTCHelper( $data, $charset ) {
	if ( !$charset ) {
		// treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
		// most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
		$oldData = $data;
		UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
		if ( $data !== $oldData ) {
			// validation had to change $data, so it was not valid utf-8
			return self::convIPTCHelper( $oldData, 'Windows-1252' );
		}

		return trim( $data );
	}

	// iconv() may raise warnings/notices on bad input; failure is detected
	// through the false return value instead.
	Wikimedia\suppressWarnings();
	$converted = iconv( $charset, "UTF-8//IGNORE", $data );
	Wikimedia\restoreWarnings();

	if ( $converted === false ) {
		wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );

		return '';
	}

	return trim( $converted );
}
469
/**
 * Escape sequences that may appear in tag 1:90, mapped to charset names.
 * This is just going through the charsets mentioned in appendix C of the
 * iim standard. \x1b = ESC.
 *
 * @var string[]
 */
private static $charsetMap = [
	"\x1b%G" => 'UTF-8', // utf-8
	// Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
	"\x1b(B" => 'UTF-8', // ascii
	"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
	"\x1b(A" => 'ISO646-GB', // like ascii, but british.
	"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finland encoding
	"\x1b(D" => 'ISO-IR-8-2',
	"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norway encoding
	"\x1b(F" => 'ISO-IR-9-2',
	"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
	"\x1b(I" => 'ISO646-IT',
	"\x1b(L" => 'ISO646-PT',
	"\x1b(Z" => 'ISO646-ES',
	"\x1b([" => 'GREEK7-OLD',
	"\x1b(K" => 'ISO646-DE',
	"\x1b(N" => 'ISO_5427', // cyrillic
	"\x1b(`" => 'NS_4551-1', // iso646-NO
	"\x1b(f" => 'NF_Z_62-010', // iso646-FR
	"\x1b(g" => 'PT2', // iso646-PT2
	"\x1b(h" => 'ES2',
	"\x1b(i" => 'MSZ_7795.3', // iso646-HU
	"\x1b(w" => 'CSA_Z243.4-1985-1',
	"\x1b(x" => 'CSA_Z243.4-1985-2',
	"\x1b\$(B" => 'JIS_C6226-1983',
	"\x1b\$B" => 'JIS_C6226-1983',
	"\x1b&@\x1b\$B" => 'JIS_C6226-1983',
	"\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
	// The following are for the high code characters at least.
	"\x1b-A" => 'ISO-8859-1',
	"\x1b(@\x1b-A" => 'ISO-8859-1',
	"\x1b(B\x1b-A" => 'ISO-8859-1',
	"\x1b-B" => 'ISO-8859-2',
	"\x1b-C" => 'ISO-8859-3',
	"\x1b-D" => 'ISO-8859-4',
	"\x1b-E" => 'ISO-8859-5',
	"\x1b-F" => 'ISO-8859-6',
	"\x1b-G" => 'ISO-8859-7',
	"\x1b-H" => 'ISO-8859-8',
	"\x1b-I" => 'CSN_369103',
];

/**
 * Take the value of the 1:90 tag and return a charset name.
 *
 * According to iim standard, charset is defined by the tag 1:90,
 * in which there are iso 2022 escape sequences to specify the character set.
 * The iim standard seems to encourage that all necessary escape sequences are
 * in the 1:90 tag, but says it doesn't have to be.
 *
 * This is in need of more testing probably. This is definitely not complete.
 * However reading the docs of some other iptc software, it appears that most
 * iptc software only recognizes utf-8. If 1:90 tag is not present content is
 * usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.
 *
 * The tag value has to match one of the known sequences exactly, so this
 * won't work if there are escape sequences in the 1:90 tag beyond the few
 * combinations listed, or if something is put in the G2, or G3 charsets, etc.
 * It will only reliably recognize utf-8.
 *
 * @param string $tag Raw value of the 1:90 tag
 * @return string|bool Charset name, or false if the value is not recognised
 */
static function getCharset( $tag ) {
	// Exact string match only; anything else is treated as unknown.
	if ( is_string( $tag ) && isset( self::$charsetMap[$tag] ) ) {
		return self::$charsetMap[$tag];
	}

	wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

	// at this point just give up and refuse to parse iptc?
	return false;
}
601}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Class for some IPTC functions.
Definition IPTC.php:29
static parse( $rawData)
This takes the results of iptcparse() and puts it into a form that can be handled by mediawiki.
Definition IPTC.php:40
static getCharset( $tag)
take the value of 1:90 tag and returns a charset
Definition IPTC.php:478
static timeHelper( $date, $time, $charset)
Converts the IPTC date and time tags into the Exif format.
Definition IPTC.php:359
static convIPTCHelper( $data, $charset)
Helper function of a helper function to convert charset for iptc values.
Definition IPTC.php:446
static convIPTC( $data, $charset)
Helper function to convert charset for iptc values.
Definition IPTC.php:427
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition deferred.txt:11
see documentation in includes Linker php for Linker::makeImageLink & $time
Definition hooks.txt:1795
linkcache txt The LinkCache class maintains a list of article titles and the information about whether or not the article exists in the database This is used to mark up links when displaying a page If the same link appears more than once on any page then it only has to be looked up once In most cases link lookups are done in batches with the LinkBatch class or the equivalent in so the link cache is mostly useful for short snippets of parsed and for links in the navigation areas of the skin The link cache was formerly used to track links used in a document for the purposes of updating the link tables This application is now deprecated To create a you can use the following $titles
Definition linkcache.txt:17