MediaWiki REL1_33
IPTC.php
Go to the documentation of this file.
1<?php
/**
 * Class for some IPTC functions.
 *
 * Takes the output of iptcparse() and turns it into an array keyed by the
 * property names used for exif-style metadata (ImageDescription, Artist, ...).
 */
class IPTC {
	/**
	 * IPTC datasets whose (charset-converted) values are copied unchanged
	 * into a single output property.
	 *
	 * @var string[] Map of iptcparse() tag => output property name
	 */
	private static $simpleTags = [
		'2#120' => 'ImageDescription', // caption. mapped with exif ImageDescription
		'2#116' => 'Copyright', // copyright. mapped with exif Copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata, not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the objectdata.
		// This could be an agency, a member of an agency or an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // edit status (lead, correction, etc)
		'2#015' => 'iimCategory', // category. deprecated. max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // category. deprecated.
		'2#010' => 'Urgency', // urgency (1-8. 1 most, 5 normal, 8 low priority)
		// "Identifies objectdata that recurs often and predictably... Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things)
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle. a for morning (am), p for evening, b for both
		'2#075' => 'ObjectCycle',
		// Country/Primary location code.
		// "Indicates the code of the country/primary location where the
		// intellectual property of the objectdata was created"
		// unclear how this differs from 2#026
		'2#100' => 'CountryCodeDest',
		// Original transmission ref.
		// "A code representing the location of original transmission
		// according to practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// Writer/Editor
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode', // lang code
	];

	/**
	 * IPTC date datasets, each paired with the dataset that carries the
	 * matching time of day.
	 *
	 * Incomplete dates are not accepted even though they are valid according
	 * to the spec. Should potentially store timezone as well.
	 *
	 * @var array[] Map of date tag => [ time tag, output property name ]
	 */
	private static $dateTags = [
		// Date created (not date digitized). Maps to exif DateTimeOriginal
		'2#055' => [ '2#060', 'DateTimeOriginal' ],
		// Date converted to digital representation. Maps to exif DateTimeDigitized
		'2#062' => [ '2#063', 'DateTimeDigitized' ],
		// Date released.
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		// Date expires.
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	/**
	 * Escape sequences that may appear in the 1:90 tag, mapped to the
	 * charset name handed to iconv().
	 *
	 * This is just going through the charsets mentioned in appendix C of the
	 * iim standard. \x1b = ESC.
	 *
	 * @var string[]
	 */
	private static $charsetEscapes = [
		"\x1b%G" => 'UTF-8', // utf-8
		// Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british.
		"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finland encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norway encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => "ISO646-IT",
		"\x1b(L" => "ISO646-PT",
		"\x1b(Z" => "ISO646-ES",
		"\x1b([" => "GREEK7-OLD",
		"\x1b(K" => "ISO646-DE",
		"\x1b(N" => "ISO_5427", // cyrillic
		"\x1b(`" => "NS_4551-1", // iso646-NO
		"\x1b(f" => "NF_Z_62-010", // iso646-FR
		"\x1b(g" => "PT2", // iso646-PT2
		"\x1b(h" => "ES2",
		"\x1b(i" => "MSZ_7795.3", // iso646-HU
		"\x1b(w" => "CSA_Z243.4-1985-1",
		"\x1b(x" => "CSA_Z243.4-1985-2",
		"\x1b\$(B" => "JIS_C6226-1983",
		"\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$(B" => "JIS_C6226-1983",
		// iso-8859-N: at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103', // at least for the high code characters.
	];

	/**
	 * This takes the results of iptcparse() and puts it into a
	 * form that can be handled by mediawiki.
	 *
	 * @param string $rawData The raw block handed to iptcparse()
	 * @return array Extracted properties; empty if nothing could be parsed
	 *  or if the 1:90 tag names a charset that is not recognised.
	 */
	public static function parse( $rawData ) {
		$parsed = iptcparse( $rawData );
		if ( !is_array( $parsed ) ) {
			return [];
		}

		$c = '';
		// charset info contained in tag 1:90.
		if ( isset( $parsed['1#090'][0] ) ) {
			$c = self::getCharset( $parsed['1#090'][0] );
			if ( $c === false ) {
				// Unknown charset. Refuse to parse.
				// Note: there is a difference between
				// unknown and no charset specified.
				return [];
			}
			unset( $parsed['1#090'] );
		}

		$data = [];
		foreach ( $parsed as $tag => $val ) {
			if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
				wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
				continue;
			}

			if ( isset( self::$simpleTags[$tag] ) ) {
				$data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
				continue;
			}

			if ( isset( self::$dateTags[$tag] ) ) {
				list( $timeTag, $property ) = self::$dateTags[$tag];
				$timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
				if ( $timestamp ) {
					$data[$property] = $timestamp;
				}
				continue;
			}

			switch ( $tag ) {
				case '2#080': /* byline. Mapped with exif Artist */
					/* merge with byline title (2:85)
					 * like how exif does it with
					 * Title, person. Not sure if this is best
					 * approach since we no longer have the two fields
					 * separate. each byline title entry corresponds to a
					 * specific byline. */
					$bylines = self::convIPTC( $val, $c );
					$titles = isset( $parsed['2#085'] )
						? self::convIPTC( $parsed['2#085'], $c )
						: [];

					foreach ( $titles as $i => $title ) {
						// theoretically a matching byline should always be set
						// but doesn't hurt to be careful.
						if ( isset( $bylines[$i] ) ) {
							$bylines[$i] = $title . ', ' . $bylines[$i];
						}
					}
					$data['Artist'] = $bylines;
					break;

				case '2#065':
					/* Originating Program.
					 * Combine with Program version (2:70) if present.
					 */
					$software = self::convIPTC( $val, $c );

					if ( count( $software ) !== 1 ) {
						// according to iim standard this cannot have multiple values
						// so if there is more than one, something weird is happening,
						// and we skip it.
						wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
						break;
					}

					if ( isset( $parsed['2#070'] ) ) {
						// if a version is set for the software.
						$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
						$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
					} else {
						$data['Software'] = $software;
					}
					break;

				case '2#000': /* iim version */
					// unlike other tags, this is a 2-byte binary number.
					// technically this is required if there is iptc data
					// but in practise it isn't always there.
					if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
						// big-endian: high byte first.
						$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
					}
					break;

				case '2#004':
					// IntellectualGenre.
					// first 4 characters are an id code
					// that we're not really interested in.

					// This prop is weird, since it's
					// allowed to have multiple values
					// in iim 4.1, but not in the XMP
					// stuff. We're going to just
					// extract the first value.
					$con = self::convIPTC( $val, $c );
					if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
						wfDebugLog( 'iptc', 'IPTC: '
							. '2:04 too short. '
							. 'Ignoring.' );
						break;
					}
					$data['IntellectualGenre'] = substr( $con[0], 4 );
					break;

				case '2#012':
					// Subject News code - this is a compound field
					// at the moment we only extract the subject news
					// code, which is an 8 digit (ascii) number
					// describing the subject matter of the content.
					// Each valid entry overwrites the previous one, and the
					// first invalid entry stops the scan.
					$codes = self::convIPTC( $val, $c );
					foreach ( $codes as $ic ) {
						$fields = explode( ':', $ic, 3 );

						if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
							wfDebugLog( 'iptc', 'IPTC: '
								. 'Invalid 2:12 - ' . $ic );
							break;
						}
						$data['SubjectNewsCode'] = $fields[1];
					}
					break;

				// purposely does not do 2:125, 2:130, 2:131,
				// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
				// 2:200, 2:201, 2:202
				// or the audio stuff (2:150 to 2:154)

				case '2#070':
				case '2#060':
				case '2#063':
				case '2#085':
				case '2#038':
				case '2#035':
					// ignore. Handled together with their companion tag.
					break;

				default:
					wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
					break;
			}
		}

		return $data;
	}

	/**
	 * Convert an iptc date and time tags into the exif format.
	 *
	 * @param array $date The date tag values (exactly one is expected)
	 * @param array $time The time tag values; anything other than exactly
	 *  one value means "date only"
	 * @param string $charset Charset passed through to convIPTC()
	 * @return string|null Timestamp in TS_EXIF form (first 10 characters only
	 *  when there was no time tag), or null if the input is unusable.
	 */
	private static function timeHelper( $date, $time, $charset ) {
		if ( count( $date ) !== 1 ) {
			// the standard says this should always be 1;
			// just double checking.
			return null;
		}
		list( $date ) = self::convIPTC( $date, $charset );

		$dateOnly = count( $time ) !== 1;
		if ( $dateOnly ) {
			$time = '000000+0000'; // placeholder
		} else {
			list( $time ) = self::convIPTC( $time, $charset );
		}

		// Both patterns are anchored at the start because the values are
		// sliced at fixed offsets below; a match further into the string
		// would be cut up at the wrong positions.
		$valid = preg_match( '/^\d\d\d\d\d\d[-+]\d\d\d\d/', $time )
			&& preg_match( '/^\d\d\d\d\d\d\d\d/', $date )
			&& substr( $date, 0, 4 ) !== '0000'
			&& substr( $date, 4, 2 ) !== '00'
			&& substr( $date, 6, 2 ) !== '00';
		if ( !$valid ) {
			// something wrong.
			// Note, this rejects some valid dates according to iptc spec
			// for example: the date 00000400 means the photo was taken in
			// April, but the year and day is unknown. We don't process these
			// types of incomplete dates atm.
			wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

			return null;
		}

		$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
		if ( $unixTS === false ) {
			wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

			return null;
		}

		// Offset in seconds from the trailing [-+]HHMM.
		$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
			+ ( intval( substr( $time, 9, 2 ) ) * 60 );
		if ( substr( $time, 6, 1 ) === '-' ) {
			$tz = -$tz;
		}

		// NOTE(review): the offset is added to the parsed value rather than
		// subtracted from it - confirm this is the intended direction.
		$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
		if ( $finalTimestamp === false ) {
			wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

			return null;
		}

		// With no time tag, return the date portion only.
		return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
	}

	/**
	 * Helper function to convert charset for iptc values.
	 *
	 * @param string|array $data The iptc string, or an array of them
	 * @param string $charset The charset, or an empty value to auto-detect
	 * @return string|array Converted value(s); array keys are preserved
	 */
	private static function convIPTC( $data, $charset ) {
		if ( !is_array( $data ) ) {
			return self::convIPTCHelper( $data, $charset );
		}

		foreach ( $data as $key => $val ) {
			$data[$key] = self::convIPTCHelper( $val, $charset );
		}

		return $data;
	}

	/**
	 * Helper function of a helper function to convert charset for iptc values.
	 *
	 * @param string $data The iptc string
	 * @param string $charset The charset, or an empty value to auto-detect
	 * @return string The value as utf-8
	 */
	private static function convIPTCHelper( $data, $charset ) {
		if ( !$charset ) {
			// treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
			// most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
			$oldData = $data;
			UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
			if ( $data === $oldData ) {
				// validation didn't change $data. Note that this path
				// returns the value as-is, without the trim() applied below.
				return $data;
			}

			return self::convIPTCHelper( $oldData, 'Windows-1252' );
		}

		Wikimedia\suppressWarnings();
		$data = iconv( $charset, "UTF-8//IGNORE", $data );
		Wikimedia\restoreWarnings();
		if ( $data === false ) {
			$data = "";
			wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
		}

		return trim( $data );
	}

	/**
	 * take the value of 1:90 tag and returns a charset
	 *
	 * According to iim standard, charset is defined by the tag 1:90,
	 * in which there are iso 2022 escape sequences to specify the character set.
	 * The iim standard seems to encourage that all necessary escape sequences are
	 * in the 1:90 tag, but says it doesn't have to be.
	 *
	 * This is in need of more testing probably. This is definitely not complete.
	 * However reading the docs of some other iptc software, it appears that most
	 * iptc software only recognizes utf-8. If 1:90 tag is not present content is
	 * usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.
	 *
	 * This also won't work if there are more than one escape sequence in the
	 * 1:90 tag or if something is put in the G2, or G3 charsets, etc. It will
	 * only reliably recognize utf-8.
	 *
	 * @param string $tag 1:90 tag.
	 * @return string|bool Charset name, or false if the sequence is not recognised
	 */
	public static function getCharset( $tag ) {
		if ( isset( self::$charsetEscapes[$tag] ) ) {
			return self::$charsetEscapes[$tag];
		}

		wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

		// at this point just give up and refuse to parse iptc?
		return false;
	}
}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Class for some IPTC functions.
Definition IPTC.php:29
static parse( $rawData)
This takes the results of iptcparse() and puts it into a form that can be handled by mediawiki.
Definition IPTC.php:40
static getCharset( $tag)
Takes the value of the 1:90 tag and returns a charset.
Definition IPTC.php:462
static timeHelper( $date, $time, $charset)
Convert IPTC date and time tags into the exif format.
Definition IPTC.php:343
static convIPTCHelper( $data, $charset)
Helper function of a helper function to convert charset for iptc values.
Definition IPTC.php:430
static convIPTC( $data, $charset)
Helper function to convert charset for iptc values.
Definition IPTC.php:411
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition deferred.txt:11
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
$data
Utility to generate mapping file used in mw.Title (phpCharToUpper.json)
see documentation in includes Linker php for Linker::makeImageLink & $time
Definition hooks.txt:1802
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition injection.txt:37
linkcache txt The LinkCache class maintains a list of article titles and the information about whether or not the article exists in the database This is used to mark up links when displaying a page If the same link appears more than once on any page then it only has to be looked up once In most cases link lookups are done in batches with the LinkBatch class or the equivalent in so the link cache is mostly useful for short snippets of parsed and for links in the navigation areas of the skin The link cache was formerly used to track links used in a document for the purposes of updating the link tables This application is now deprecated To create a you can use the following $titles
Definition linkcache.txt:17