Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
6.58% covered (danger)
6.58%
21 / 319
0.00% covered (danger)
0.00%
0 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
IPTC
6.58% covered (danger)
6.58%
21 / 319
0.00% covered (danger)
0.00%
0 / 5
12056.69
0.00% covered (danger)
0.00%
0 / 1
 parse
9.94% covered (danger)
9.94%
17 / 171
0.00% covered (danger)
0.00%
0 / 1
2962.04
 timeHelper
0.00% covered (danger)
0.00%
0 / 30
0.00% covered (danger)
0.00%
0 / 1
156
 convIPTC
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 convIPTCHelper
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
20
 getCharset
4.00% covered (danger)
4.00%
4 / 100
0.00% covered (danger)
0.00%
0 / 1
1384.68
1<?php
2/**
3 * Class for some IPTC functions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Media
22 */
23
24use Wikimedia\AtEase\AtEase;
25
26/**
27 * Class for some IPTC functions.
28 *
29 * @ingroup Media
30 */
class IPTC {
    /**
     * IIM record 2 tags whose value is stored as-is (after charset conversion)
     * under a single metadata property.
     *
     * Keys are tags in the form reported by iptcparse(); values are the keys
     * used in the array returned by parse().
     */
    private const SIMPLE_TAGS = [
        // IPTC caption. Mapped with exif ImageDescription.
        '2#120' => 'ImageDescription',
        // Copyright. Mapped with exif Copyright.
        '2#116' => 'Copyright',
        // Keywords.
        '2#025' => 'Keywords',
        // Country (shown).
        '2#101' => 'CountryDest',
        // State/province (shown).
        '2#095' => 'ProvinceOrStateDest',
        // City (shown).
        '2#090' => 'CityDest',
        // Sublocation (shown).
        '2#092' => 'SublocationDest',
        // Object name/title.
        '2#005' => 'ObjectName',
        // Special instructions.
        '2#040' => 'SpecialInstructions',
        // Headline.
        '2#105' => 'Headline',
        // Credit. "Identifies the provider of the objectdata,
        // not necessarily the owner/creator".
        '2#110' => 'Credit',
        // Source. "Identifies the original owner of the intellectual content
        // of the objectdata. This could be an agency, a member of an agency
        // or an individual."
        '2#115' => 'Source',
        // Edit status (lead, correction, etc).
        '2#007' => 'EditStatus',
        // Category. Deprecated. Max 3 letters in theory, often more.
        '2#015' => 'iimCategory',
        // Supplemental category. Deprecated.
        '2#020' => 'iimSupplementalCategory',
        // Urgency (1-8. 1 most, 5 normal, 8 low priority).
        '2#010' => 'Urgency',
        // "Identifies objectdata that recurs often and predictably...
        // Example: Euroweather"
        '2#022' => 'FixtureIdentifier',
        // Content location code (iso 3166 + some custom things).
        // ex: TUR (for turkey), XUN (for UN), XSP (outer space).
        // See wikipedia article on iso 3166 and appendix D of iim std.
        '2#026' => 'LocationDestCode',
        // Content location name. Full printable name of location of photo.
        '2#027' => 'LocationDest',
        // Object cycle. a for morning (am), p for evening, b for both.
        '2#075' => 'ObjectCycle',
        // Country/Primary location code.
        // "Indicates the code of the country/primary location where the
        // intellectual property of the objectdata was created".
        // Unclear how this differs from 2#026.
        '2#100' => 'CountryCodeDest',
        // Original transmission ref.
        // "A code representing the location of original transmission
        // according to practises of the provider."
        '2#103' => 'OriginalTransmissionRef',
        // Contact.
        '2#118' => 'Contact',
        // Writer/Editor.
        // "Identification of the name of the person involved in the writing,
        // editing or correcting the objectdata or caption/abstract."
        '2#122' => 'Writer',
        // Language code.
        '2#135' => 'LanguageCode',
    ];

    /**
     * Date tags, each mapped to [ companion time tag, metadata property ].
     *
     * Incomplete dates are not accepted even though they are valid according
     * to the spec. Should potentially store the timezone as well.
     */
    private const DATE_TAGS = [
        // Date created (not date digitized). Maps to exif DateTimeOriginal.
        '2#055' => [ '2#060', 'DateTimeOriginal' ],
        // Date converted to digital representation. Maps to exif DateTimeDigitized.
        '2#062' => [ '2#063', 'DateTimeDigitized' ],
        // Date released.
        '2#030' => [ '2#035', 'DateTimeReleased' ],
        // Date expires.
        '2#037' => [ '2#038', 'DateTimeExpires' ],
    ];

    /**
     * Known values of the 1:90 tag (iso 2022 escape sequences, \x1b = ESC)
     * mapped to the charset name handed to iconv().
     *
     * This just goes through the charsets mentioned in appendix C of the
     * iim standard.
     */
    private const CHARSETS = [
        // utf-8
        "\x1b%G" => 'UTF-8',
        // Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
        "\x1b(B" => 'UTF-8',
        // iso-646-IRV (ascii in latest version, $ different in older version)
        "\x1b(@" => 'UTF-8',
        // like ascii, but british.
        "\x1b(A" => 'ISO646-GB',
        // some obscure swedish/finnish encoding
        "\x1b(C" => 'ISO-IR-8-1',
        "\x1b(D" => 'ISO-IR-8-2',
        // some obscure danish/norwegian encoding
        "\x1b(E" => 'ISO-IR-9-1',
        "\x1b(F" => 'ISO-IR-9-2',
        // aka iso 646-SE; ascii-like
        "\x1b(G" => 'SEN_850200_B',
        "\x1b(I" => 'ISO646-IT',
        "\x1b(L" => 'ISO646-PT',
        "\x1b(Z" => 'ISO646-ES',
        "\x1b([" => 'GREEK7-OLD',
        "\x1b(K" => 'ISO646-DE',
        // cyrillic
        "\x1b(N" => 'ISO_5427',
        // iso646-NO
        "\x1b(`" => 'NS_4551-1',
        // iso646-FR
        "\x1b(f" => 'NF_Z_62-010',
        // iso646-PT2
        "\x1b(g" => 'PT2',
        "\x1b(h" => 'ES2',
        // iso646-HU
        "\x1b(i" => 'MSZ_7795.3',
        "\x1b(w" => 'CSA_Z243.4-1985-1',
        "\x1b(x" => 'CSA_Z243.4-1985-2',
        "\x1b\$(B" => 'JIS_C6226-1983',
        "\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
        // iso-8859-N, at least for the high code characters.
        "\x1b-A" => 'ISO-8859-1',
        "\x1b(@\x1b-A" => 'ISO-8859-1',
        "\x1b(B\x1b-A" => 'ISO-8859-1',
        "\x1b-B" => 'ISO-8859-2',
        "\x1b-C" => 'ISO-8859-3',
        "\x1b-D" => 'ISO-8859-4',
        "\x1b-E" => 'ISO-8859-5',
        "\x1b-F" => 'ISO-8859-6',
        "\x1b-G" => 'ISO-8859-7',
        "\x1b-H" => 'ISO-8859-8',
        // CSN_369103, at least for the high code characters.
        "\x1b-I" => 'CSN_369103',
    ];

    /**
     * This takes the results of iptcparse() and puts it into a
     * form that can be handled by mediawiki. Generally called from
     * BitmapMetadataHandler::doApp13.
     *
     * Tags whose first value is only whitespace are skipped. If the 1:90
     * (charset) tag is present but names an unknown charset, nothing is
     * parsed and an empty array is returned.
     *
     * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
     *
     * @param string $rawData The app13 block from jpeg containing iptc/iim data
     * @return array IPTC metadata array (empty if $rawData holds no iptc data)
     */
    public static function parse( $rawData ) {
        $parsed = iptcparse( $rawData );
        if ( !is_array( $parsed ) ) {
            return [];
        }

        $data = [];
        $c = '';
        // charset info contained in tag 1:90.
        if ( isset( $parsed['1#090'][0] ) ) {
            $c = self::getCharset( $parsed['1#090'][0] );
            if ( $c === false ) {
                // Unknown charset. refuse to parse.
                // note: There is a difference between
                // unknown and no charset specified.
                return [];
            }
            unset( $parsed['1#090'] );
        }

        foreach ( $parsed as $tag => $val ) {
            if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
                wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
                continue;
            }

            // Tags that map one-to-one onto a metadata property.
            $property = self::SIMPLE_TAGS[$tag] ?? null;
            if ( $property !== null ) {
                $data[$property] = self::convIPTC( $val, $c );
                continue;
            }

            // Date tags, combined with their companion time tag if present.
            $dateInfo = self::DATE_TAGS[$tag] ?? null;
            if ( $dateInfo !== null ) {
                [ $timeTag, $property ] = $dateInfo;
                $timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
                if ( $timestamp ) {
                    $data[$property] = $timestamp;
                }
                continue;
            }

            switch ( $tag ) {
                case '2#080':
                    /* byline. Mapped with exif Artist.
                     * merge with byline title (2:85)
                     * like how exif does it with
                     * Title, person. Not sure if this is best
                     * approach since we no longer have the two fields
                     * separate. each byline title entry corresponds to a
                     * specific byline. */
                    $bylines = self::convIPTC( $val, $c );
                    $titles = isset( $parsed['2#085'] )
                        ? self::convIPTC( $parsed['2#085'], $c )
                        : [];

                    foreach ( $titles as $i => $title ) {
                        if ( isset( $bylines[$i] ) ) {
                            // theoretically this should always be set
                            // but doesn't hurt to be careful.
                            $bylines[$i] = $title . ', ' . $bylines[$i];
                        }
                    }
                    $data['Artist'] = $bylines;
                    break;

                case '2#065':
                    /* Originating Program.
                     * Combine with Program version (2:70) if present.
                     */
                    $software = self::convIPTC( $val, $c );

                    if ( count( $software ) !== 1 ) {
                        // according to iim standard this cannot have multiple values
                        // so if there is more than one, something weird is happening,
                        // and we skip it.
                        wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
                        break;
                    }

                    if ( isset( $parsed['2#070'] ) ) {
                        // if a version is set for the software.
                        $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
                        $data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
                    } else {
                        $data['Software'] = $software;
                    }
                    break;

                case '2#000':
                    // iim version.
                    // unlike other tags, this is a 2-byte binary number.
                    // technically this is required if there is iptc data
                    // but in practise it isn't always there.
                    if ( strlen( $val[0] ) === 2 ) {
                        // if is just to be paranoid.
                        $data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
                    }
                    break;

                case '2#004':
                    // IntellectualGenre.
                    // first 4 characters are an id code
                    // That we're not really interested in.

                    // This prop is weird, since it's
                    // allowed to have multiple values
                    // in iim 4.1, but not in the XMP
                    // stuff. We're going to just
                    // extract the first value.
                    $con = self::convIPTC( $val, $c );
                    if ( strlen( $con[0] ) < 5 ) {
                        wfDebugLog( 'iptc', 'IPTC: '
                            . '2:04 too short. '
                            . 'Ignoring.' );
                        break;
                    }
                    $data['IntellectualGenre'] = substr( $con[0], 4 );
                    break;

                case '2#012':
                    // Subject News code - this is a compound field
                    // at the moment we only extract the subject news
                    // code, which is an 8 digit (ascii) number
                    // describing the subject matter of the content.
                    $codes = self::convIPTC( $val, $c );
                    foreach ( $codes as $ic ) {
                        $fields = explode( ':', $ic, 3 );

                        if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
                            wfDebugLog( 'iptc', 'IPTC: '
                                . 'Invalid 2:12 - ' . $ic );
                            // Stops at the first invalid entry; any code
                            // stored by an earlier entry is kept.
                            break;
                        }
                        // Later entries overwrite earlier ones.
                        $data['SubjectNewsCode'] = $fields[1];
                    }
                    break;

                // purposely does not do 2:125, 2:130, 2:131,
                // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
                // 2:200, 2:201, 2:202
                // or the audio stuff (2:150 to 2:154)

                case '2#070':
                case '2#060':
                case '2#063':
                case '2#085':
                case '2#038':
                case '2#035':
                    // ignore. Handled together with their primary tag above.
                    break;

                default:
                    wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
                    break;
            }
        }

        return $data;
    }

    /**
     * Convert an iptc date and time tags into the exif format
     *
     * Returns null when the date tag does not hold exactly one value, when
     * the date or time is malformed or incomplete, or when the timestamp
     * conversion fails. When the time tag does not hold exactly one value,
     * only the date part is returned.
     *
     * @todo Potentially this should also capture the timezone offset.
     * @param array $date The date tag
     * @param array $time The time tag
     * @param string $charset
     * @return string|null Date in EXIF format.
     */
    private static function timeHelper( $date, $time, $charset ) {
        if ( count( $date ) !== 1 ) {
            // the standard says this should always be 1
            // just double checking.
            return null;
        }
        [ $date ] = self::convIPTC( $date, $charset );

        $dateOnly = count( $time ) !== 1;
        if ( $dateOnly ) {
            $time = '000000+0000'; // placeholder
        } else {
            [ $time ] = self::convIPTC( $time, $charset );
        }

        // Both patterns are anchored at the start because the substr() calls
        // below assume the digits begin at offset 0.
        $valid = preg_match( '/^\d\d\d\d\d\d[-+]\d\d\d\d/', $time )
            && preg_match( '/^\d\d\d\d\d\d\d\d/', $date )
            && substr( $date, 0, 4 ) !== '0000'
            && substr( $date, 4, 2 ) !== '00'
            && substr( $date, 6, 2 ) !== '00';
        if ( !$valid ) {
            // something wrong.
            // Note, this rejects some valid dates according to iptc spec
            // for example: the date 00000400 means the photo was taken in
            // April, but the year and day is unknown. We don't process these
            // types of incomplete dates atm.
            wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

            return null;
        }

        $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
        if ( $unixTS === false ) {
            wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

            return null;
        }

        // Offset in seconds, built from the HH and MM digits after the sign.
        $tz = ( (int)substr( $time, 7, 2 ) * 60 * 60 )
            + ( (int)substr( $time, 9, 2 ) * 60 );

        if ( substr( $time, 6, 1 ) === '-' ) {
            $tz = -$tz;
        }

        // NOTE(review): the offset is added to the timestamp. If the intent
        // is to convert a local time to UTC the sign looks inverted - confirm
        // before relying on the result for non-zero offsets.
        $adjustedTS = (int)$unixTS + $tz;
        $finalTimestamp = wfTimestamp( TS_EXIF, $adjustedTS );
        if ( $finalTimestamp === false ) {
            wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . $adjustedTS );

            return null;
        }
        if ( $dateOnly ) {
            // return the date only
            return substr( $finalTimestamp, 0, 10 );
        }
        return $finalTimestamp;
    }

    /**
     * Helper function to convert charset for iptc values.
     *
     * Arrays are converted element by element (one level deep), keeping
     * their keys; anything else is converted as a single value.
     *
     * @param string|array $data The iptc string
     * @param string $charset
     *
     * @return string|array
     */
    private static function convIPTC( $data, $charset ) {
        if ( !is_array( $data ) ) {
            return self::convIPTCHelper( $data, $charset );
        }

        // Assign by key rather than iterating by reference, so no dangling
        // reference is left inside the returned array.
        foreach ( $data as $key => $val ) {
            $data[$key] = self::convIPTCHelper( $val, $charset );
        }

        return $data;
    }

    /**
     * Helper function of a helper function to convert charset for iptc values.
     *
     * With a charset, the value is converted to UTF-8 with iconv() and
     * trimmed; a failed conversion yields an empty string. Without a charset,
     * the value is returned unchanged if it is already valid UTF-8, and
     * otherwise converted as Windows-1252.
     *
     * @param string $data The IPTC string
     * @param string $charset
     *
     * @return string
     */
    private static function convIPTCHelper( $data, $charset ) {
        if ( !$charset ) {
            // treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
            // most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
            $oldData = $data;
            UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
            if ( $data === $oldData ) {
                // validation didn't change $data.
                // Note that this path returns the value without trim().
                return $data;
            }
            return self::convIPTCHelper( $oldData, 'Windows-1252' );
        }

        AtEase::suppressWarnings();
        $data = iconv( $charset, "UTF-8//IGNORE", $data );
        AtEase::restoreWarnings();
        if ( $data === false ) {
            $data = "";
            wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
        }

        return trim( $data );
    }

    /**
     * take the value of 1:90 tag and returns a charset
     *
     * Warning, this function does not (and is not intended to) detect
     * all iso 2022 escape codes. In practise, the code for utf-8 is the
     * only code that seems to have wide use. It does detect that code.
     *
     * @param string $tag 1:90 tag.
     * @return string|false Charset name, or false if the tag is not recognized
     */
    public static function getCharset( $tag ) {
        // According to iim standard, charset is defined by the tag 1:90.
        // in which there are iso 2022 escape sequences to specify the character set.
        // the iim standard seems to encourage that all necessary escape sequences are
        // in the 1:90 tag, but says it doesn't have to be.

        // This is in need of more testing probably. This is definitely not complete.
        // however reading the docs of some other iptc software, it appears that most iptc software
        // only recognizes utf-8. If 1:90 tag is not present content is
        // usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.

        // This also won't work if there are more than one escape sequence in the 1:90 tag
        // or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.
        $charset = self::CHARSETS[$tag] ?? null;
        if ( $charset !== null ) {
            return $charset;
        }

        wfDebugLog( 'iptc', __METHOD__ . ': Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
        // at this point just give up and refuse to parse iptc?
        return false;
    }
}