Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
10.69% covered (danger)
10.69%
34 / 318
0.00% covered (danger)
0.00%
0 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
IPTC
10.73% covered (danger)
10.73%
34 / 317
0.00% covered (danger)
0.00%
0 / 5
10538.23
0.00% covered (danger)
0.00%
0 / 1
 parse
9.94% covered (danger)
9.94%
17 / 171
0.00% covered (danger)
0.00%
0 / 1
2962.04
 timeHelper
0.00% covered (danger)
0.00%
0 / 30
0.00% covered (danger)
0.00%
0 / 1
156
 convIPTC
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
3.07
 convIPTCHelper
81.82% covered (warning)
81.82%
9 / 11
0.00% covered (danger)
0.00%
0 / 1
4.10
 getCharset
4.00% covered (danger)
4.00%
4 / 100
0.00% covered (danger)
0.00%
0 / 1
1384.68
1<?php
2/**
3 * Class for some IPTC functions.
4 *
5 * @license GPL-2.0-or-later
6 * @file
7 * @ingroup Media
8 */
9
10namespace MediaWiki\Media;
11
12use Wikimedia\Timestamp\TimestampFormat as TS;
13
/**
 * Static helpers that turn raw IPTC/IIM metadata (as returned by PHP's
 * iptcparse()) into the metadata array layout used by MediaWiki.
 *
 * @ingroup Media
 */
class IPTC {
    /**
     * IIM tags whose values only need charset conversion before being stored
     * under a single metadata key (IIM tag => metadata key).
     */
    private const SIMPLE_TAGS = [
        // IPTC caption. Mapped with exif ImageDescription
        '2#120' => 'ImageDescription',
        // Copyright. Mapped with exif copyright
        '2#116' => 'Copyright',
        // Keywords
        '2#025' => 'Keywords',
        // Country (shown)
        '2#101' => 'CountryDest',
        // State/province (shown)
        '2#095' => 'ProvinceOrStateDest',
        // City (shown)
        '2#090' => 'CityDest',
        // Sublocation (shown)
        '2#092' => 'SublocationDest',
        // Object name/title
        '2#005' => 'ObjectName',
        // Special instructions
        '2#040' => 'SpecialInstructions',
        // Headline
        '2#105' => 'Headline',
        // Credit: "Identifies the provider of the objectdata,
        // not necessarily the owner/creator".
        '2#110' => 'Credit',
        // Source: "Identifies the original owner of the intellectual content
        // of the objectdata. This could be an agency, a member of an agency
        // or an individual."
        '2#115' => 'Source',
        // Edit status (lead, correction, etc)
        '2#007' => 'EditStatus',
        // Category. Deprecated. Max 3 letters in theory, often more
        '2#015' => 'iimCategory',
        // Supplemental category. Deprecated.
        '2#020' => 'iimSupplementalCategory',
        // Urgency (1-8. 1 most, 5 normal, 8 low priority)
        '2#010' => 'Urgency',
        // "Identifies objectdata that recurs often and predictably...
        // Example: Euroweather"
        '2#022' => 'FixtureIdentifier',
        // Content location code (iso 3166 + some custom things)
        // ex: TUR (for turkey), XUN (for UN), XSP (outer space)
        // See wikipedia article on iso 3166 and appendix D of iim std.
        '2#026' => 'LocationDestCode',
        // Content location name. Full printable name of location of photo.
        '2#027' => 'LocationDest',
        // Object cycle. a for morning (am), p for evening, b for both
        '2#075' => 'ObjectCycle',
        // Country/Primary location code.
        // "Indicates the code of the country/primary location where the
        // intellectual property of the objectdata was created"
        // unclear how this differs from 2#026
        '2#100' => 'CountryCodeDest',
        // Original transmission ref.
        // "A code representing the location of original transmission
        // according to practises of the provider."
        '2#103' => 'OriginalTransmissionRef',
        // Contact
        '2#118' => 'Contact',
        // Writer/Editor
        // "Identification of the name of the person involved in the writing,
        // editing or correcting the objectdata or caption/abstract."
        '2#122' => 'Writer',
        // Language code
        '2#135' => 'LanguageCode',
    ];

    /**
     * Date tags, each paired with the tag holding the matching time and the
     * metadata key the combined timestamp is stored under
     * (date tag => [ time tag, metadata key ]).
     *
     * Incomplete dates are not accepted even though they are valid
     * according to the spec.
     */
    private const DATE_TAGS = [
        // Date created (not date digitized). Maps to exif DateTimeOriginal
        '2#055' => [ '2#060', 'DateTimeOriginal' ],
        // Date converted to digital representation. Maps to exif DateTimeDigitized
        '2#062' => [ '2#063', 'DateTimeDigitized' ],
        // Date released.
        '2#030' => [ '2#035', 'DateTimeReleased' ],
        // Date expires.
        '2#037' => [ '2#038', 'DateTimeExpires' ],
    ];

    /**
     * Value of the 1:90 tag (ISO 2022 escape sequence, \x1b = ESC) => charset
     * name handed to iconv(). This just goes through the charsets mentioned
     * in appendix C of the iim standard.
     */
    private const CHARSETS = [
        // utf-8
        "\x1b%G" => 'UTF-8',
        // Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
        "\x1b(B" => 'UTF-8',
        // iso-646-IRV (ascii in latest version, $ different in older version)
        "\x1b(@" => 'UTF-8',
        // like ascii, but british.
        "\x1b(A" => 'ISO646-GB',
        // some obscure swedish/finnish encoding
        "\x1b(C" => 'ISO-IR-8-1',
        "\x1b(D" => 'ISO-IR-8-2',
        // some obscure danish/norwegian encoding
        "\x1b(E" => 'ISO-IR-9-1',
        "\x1b(F" => 'ISO-IR-9-2',
        // aka iso 646-SE; ascii-like
        "\x1b(G" => 'SEN_850200_B',
        "\x1b(I" => 'ISO646-IT',
        "\x1b(L" => 'ISO646-PT',
        "\x1b(Z" => 'ISO646-ES',
        "\x1b([" => 'GREEK7-OLD',
        "\x1b(K" => 'ISO646-DE',
        // cyrillic
        "\x1b(N" => 'ISO_5427',
        // iso646-NO
        "\x1b(`" => 'NS_4551-1',
        // iso646-FR
        "\x1b(f" => 'NF_Z_62-010',
        // iso646-PT2
        "\x1b(g" => 'PT2',
        "\x1b(h" => 'ES2',
        // iso646-HU
        "\x1b(i" => 'MSZ_7795.3',
        "\x1b(w" => 'CSA_Z243.4-1985-1',
        "\x1b(x" => 'CSA_Z243.4-1985-2',
        "\x1b\$(B" => 'JIS_C6226-1983',
        "\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
        // iso-8859-N, at least for the high code characters.
        "\x1b-A" => 'ISO-8859-1',
        "\x1b(@\x1b-A" => 'ISO-8859-1',
        "\x1b(B\x1b-A" => 'ISO-8859-1',
        "\x1b-B" => 'ISO-8859-2',
        "\x1b-C" => 'ISO-8859-3',
        "\x1b-D" => 'ISO-8859-4',
        "\x1b-E" => 'ISO-8859-5',
        "\x1b-F" => 'ISO-8859-6',
        "\x1b-G" => 'ISO-8859-7',
        "\x1b-H" => 'ISO-8859-8',
        // CSN_369103, at least for the high code characters.
        "\x1b-I" => 'CSN_369103',
    ];

    /**
     * This takes the results of iptcparse() and puts it into a
     * form that can be handled by mediawiki. Generally called from
     * BitmapMetadataHandler::doApp13.
     *
     * Returns an empty array when iptcparse() finds nothing, or when the
     * 1:90 tag names a charset that getCharset() does not recognise.
     * Tags whose first value is only whitespace are skipped.
     *
     * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
     *
     * @param string $rawData The app13 block from jpeg containing iptc/iim data
     * @return array IPTC metadata array
     */
    public static function parse( $rawData ) {
        $parsed = iptcparse( $rawData );
        $data = [];
        if ( !is_array( $parsed ) ) {
            return $data;
        }

        $c = '';
        // charset info contained in tag 1:90.
        if ( isset( $parsed['1#090'][0] ) ) {
            $c = self::getCharset( $parsed['1#090'][0] );
            if ( $c === false ) {
                // Unknown charset. Refuse to parse.
                // Note: there is a difference between
                // unknown and no charset specified.
                return [];
            }
            unset( $parsed['1#090'] );
        }

        foreach ( $parsed as $tag => $val ) {
            if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
                wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
                continue;
            }

            // Plain "convert and store" tags.
            if ( isset( self::SIMPLE_TAGS[$tag] ) ) {
                $data[self::SIMPLE_TAGS[$tag]] = self::convIPTC( $val, $c );
                continue;
            }

            // Date tags, combined with their companion time tag if present.
            // Should potentially store timezone as well.
            if ( isset( self::DATE_TAGS[$tag] ) ) {
                [ $timeTag, $dataKey ] = self::DATE_TAGS[$tag];
                $timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
                if ( $timestamp ) {
                    $data[$dataKey] = $timestamp;
                }
                continue;
            }

            switch ( $tag ) {
                case '2#080': /* byline. Mapped with exif Artist */
                    /* merge with byline title (2:85)
                     * like how exif does it with
                     * Title, person. Not sure if this is best
                     * approach since we no longer have the two fields
                     * separate. each byline title entry corresponds to a
                     * specific byline. */
                    $bylines = self::convIPTC( $val, $c );
                    $titles = isset( $parsed['2#085'] )
                        ? self::convIPTC( $parsed['2#085'], $c )
                        : [];

                    foreach ( $titles as $i => $title ) {
                        if ( isset( $bylines[$i] ) ) {
                            // theoretically this should always be set
                            // but doesn't hurt to be careful.
                            $bylines[$i] = $title . ', ' . $bylines[$i];
                        }
                    }
                    $data['Artist'] = $bylines;
                    break;

                case '2#065':
                    /* Originating Program.
                     * Combine with Program version (2:70) if present.
                     */
                    $software = self::convIPTC( $val, $c );

                    if ( count( $software ) !== 1 ) {
                        // according to iim standard this cannot have multiple values
                        // so if there is more than one, something weird is happening,
                        // and we skip it.
                        wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
                        break;
                    }

                    if ( isset( $parsed['2#070'] ) ) {
                        // if a version is set for the software.
                        $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
                        $data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
                    } else {
                        $data['Software'] = $software;
                    }
                    break;

                case '2#000': /* iim version */
                    // unlike other tags, this is a 2-byte big-endian binary number.
                    // technically this is required if there is iptc data
                    // but in practise it isn't always there.
                    if ( strlen( $val[0] ) === 2 ) {
                        // the length check is just to be paranoid.
                        $data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
                    }
                    break;

                case '2#004':
                    // IntellectualGenre.
                    // The first 4 characters are an id code
                    // that we're not really interested in.

                    // This prop is weird, since it's allowed to have
                    // multiple values in iim 4.1, but not in the XMP
                    // stuff. We're going to just extract the first value.
                    $con = self::convIPTC( $val, $c );
                    if ( strlen( $con[0] ) < 5 ) {
                        wfDebugLog( 'iptc', 'IPTC: '
                            . '2:04 too short. '
                            . 'Ignoring.' );
                        break;
                    }
                    $data['IntellectualGenre'] = substr( $con[0], 4 );
                    break;

                case '2#012':
                    // Subject News code - this is a compound field
                    // at the moment we only extract the subject news
                    // code, which is an 8 digit (ascii) number
                    // describing the subject matter of the content.
                    $codes = self::convIPTC( $val, $c );
                    foreach ( $codes as $ic ) {
                        $fields = explode( ':', $ic, 3 );

                        if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
                            // Same log channel as the rest of this class.
                            wfDebugLog( 'iptc', 'IPTC: '
                                . 'Invalid 2:12 - ' . $ic );
                            // Stops looking at the remaining codes of this tag.
                            break;
                        }
                        // Each valid code overwrites the previous one, so
                        // only the last one seen is kept.
                        $data['SubjectNewsCode'] = $fields[1];
                    }
                    break;

                // purposely does not do 2:125, 2:130, 2:131,
                // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
                // 2:200, 2:201, 2:202
                // or the audio stuff (2:150 to 2:154)

                case '2#070':
                case '2#060':
                case '2#063':
                case '2#085':
                case '2#038':
                case '2#035':
                    // ignore. Handled together with their companion tag.
                    break;

                default:
                    wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
                    break;
            }
        }

        return $data;
    }

    /**
     * Convert an iptc date and time tags into the exif format
     *
     * Returns null when the date tag does not hold exactly one value, when
     * the date or time does not start with the expected digits, when any of
     * the year/month/day parts is all zeroes, or when wfTimestamp() cannot
     * convert the value. If the time tag does not hold exactly one value,
     * only the date part of the result is returned.
     *
     * @todo Potentially this should also capture the timezone offset.
     * @param array $date The date tag
     * @param array $time The time tag
     * @param string $charset
     * @return string|null Date in EXIF format.
     */
    private static function timeHelper( $date, $time, $charset ) {
        if ( count( $date ) !== 1 ) {
            // the standard says this should always be 1
            // just double checking.
            return null;
        }
        [ $date ] = self::convIPTC( $date, $charset );

        if ( count( $time ) === 1 ) {
            [ $time ] = self::convIPTC( $time, $charset );
            $dateOnly = false;
        } else {
            $time = '000000+0000'; // placeholder
            $dateOnly = true;
        }

        // Both patterns are anchored at the start because the code below
        // reads the fields with substr() at fixed offsets; an unanchored
        // match somewhere later in the string would validate a value whose
        // offsets do not line up.
        if ( !( preg_match( '/^\d{6}[-+]\d{4}/', $time )
            && preg_match( '/^\d{8}/', $date )
            && substr( $date, 0, 4 ) !== '0000'
            && substr( $date, 4, 2 ) !== '00'
            && substr( $date, 6, 2 ) !== '00'
        ) ) {
            // something wrong.
            // Note, this rejects some valid dates according to iptc spec
            // for example: the date 00000400 means the photo was taken in
            // April, but the year and day is unknown. We don't process these
            // types of incomplete dates atm.
            wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

            return null;
        }

        $unixTS = wfTimestamp( TS::UNIX, $date . substr( $time, 0, 6 ) );
        if ( $unixTS === false ) {
            wfDebugLog( 'iptc', "IPTC: can't convert date to TS::UNIX: $date $time." );

            return null;
        }

        // Offset in seconds, built from the HH and MM digits after the sign.
        $tz = ( (int)substr( $time, 7, 2 ) * 60 * 60 )
            + ( (int)substr( $time, 9, 2 ) * 60 );

        if ( substr( $time, 6, 1 ) === '-' ) {
            $tz = -$tz;
        }

        // NOTE(review): the offset is *added* to the parsed time. If the tag
        // holds local time plus its offset from UTC, converting to UTC would
        // subtract it instead - confirm which result is intended.
        $finalTimestamp = wfTimestamp( TS::EXIF, (int)$unixTS + $tz );
        if ( $finalTimestamp === false ) {
            wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( (int)$unixTS + $tz ) );

            return null;
        }
        if ( $dateOnly ) {
            // return the date only (first 10 characters of the result)
            return substr( $finalTimestamp, 0, 10 );
        }
        return $finalTimestamp;
    }

    /**
     * Helper function to convert charset for iptc values.
     *
     * Arrays are converted element by element, keeping their keys.
     *
     * @param string|array $data The iptc string, or an array of them
     * @param string $charset
     *
     * @return string|array
     */
    private static function convIPTC( $data, $charset ) {
        if ( !is_array( $data ) ) {
            return self::convIPTCHelper( $data, $charset );
        }

        // Written back by key rather than iterating by reference, so no
        // reference to the last element is left behind.
        foreach ( $data as $key => $val ) {
            $data[$key] = self::convIPTCHelper( $val, $charset );
        }

        return $data;
    }

    /**
     * Helper function of a helper function to convert charset for iptc values.
     *
     * With a charset, the value is converted to UTF-8 with iconv() and
     * trimmed; a failed conversion yields an empty string. Without a charset,
     * a value that the UTF-8 validator leaves unchanged is returned as is
     * (not trimmed); anything else is converted as Windows-1252.
     *
     * @param string $data The IPTC string
     * @param string $charset
     *
     * @return string
     */
    private static function convIPTCHelper( $data, $charset ) {
        if ( $charset ) {
            // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
            $data = @iconv( $charset, "UTF-8//IGNORE", $data );
            if ( $data === false ) {
                $data = "";
                wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
            }
        } else {
            // treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
            // most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
            $oldData = $data;
            \UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
            if ( $data === $oldData ) {
                return $data; // if validation didn't change $data
            }
            return self::convIPTCHelper( $oldData, 'Windows-1252' );
        }

        return trim( $data );
    }

    /**
     * Take the value of the 1:90 tag and return a charset.
     *
     * Warning, this function does not (and is not intended to) detect
     * all iso 2022 escape codes. In practise, the code for utf-8 is the
     * only code that seems to have wide use. It does detect that code.
     *
     * @param string $tag 1:90 tag.
     * @return string|false Charset name, or false if the escape sequence
     *  is not recognised (parse() then refuses to parse the data).
     */
    public static function getCharset( $tag ) {
        // According to iim standard, charset is defined by the tag 1:90.
        // in which there are iso 2022 escape sequences to specify the character set.
        // the iim standard seems to encourage that all necessary escape sequences are
        // in the 1:90 tag, but says it doesn't have to be.

        // This is in need of more testing probably. This is definitely not complete.
        // however reading the docs of some other iptc software, it appears that most iptc software
        // only recognizes utf-8. If 1:90 tag is not present content is
        // usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.

        // This also won't work if there are more than one escape sequence in the 1:90 tag
        // or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.
        if ( is_string( $tag ) && isset( self::CHARSETS[$tag] ) ) {
            return self::CHARSETS[$tag];
        }

        wfDebugLog( 'iptc', __METHOD__ . ': Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
        // at this point just give up and refuse to parse iptc?
        return false;
    }
}
573
574/** @deprecated class alias since 1.46 */
575class_alias( IPTC::class, 'IPTC' );