MediaWiki REL1_39
JpegMetadataExtractor.php
Go to the documentation of this file.
1<?php
24use Wikimedia\AtEase\AtEase;
25use Wikimedia\XMPReader\Reader as XMPReader;
26
// Upper bound on the number of markers segmentSplitter() will step through
// before giving up with an exception; guards against pathological files.
40 private const MAX_JPEG_SEGMENTS = 200;
41
/**
 * Extract the metadata segments of interest from a JPEG file.
 *
 * The file is scanned marker by marker, starting right after the SOI marker
 * and stopping at the first SOS (start of scan) or EOI (end of image) marker.
 * The file handle is always closed before this method returns or throws.
 *
 * @param string $filename Path of the JPEG file to read
 * @return array Always contains the lists 'XMP_ext', 'COM' and 'PSIR'.
 *  Depending on what the file contains it may also have 'XMP' (string),
 *  'byteOrder' ('BE' or 'LE', taken from the Exif header) and 'SOF' (array
 *  with the keys 'bits', 'height', 'width' and 'components').
 * @throws MWException If no usable file is given, the file does not start
 *  with SOI, a segment is malformed or truncated, more than
 *  MAX_JPEG_SEGMENTS markers are encountered, or the file ends before
 *  SOS/EOI is seen.
 */
public static function segmentSplitter( $filename ) {
	$showXMP = XMPReader::isSupported();

	if ( !$filename ) {
		throw new MWException( "No filename specified for " . __METHOD__ );
	}
	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
	}

	$fh = fopen( $filename, "rb" );

	if ( !$fh ) {
		throw new MWException( "Could not open file $filename" );
	}

	try {
		if ( fread( $fh, 2 ) !== "\xFF\xD8" ) {
			throw new MWException( "Not a jpeg, no SOI" );
		}

		return self::collectJpegSegments( $fh, $showXMP );
	} finally {
		// Release the handle on every exit path, including the many
		// exceptions thrown for malformed input.
		fclose( $fh );
	}
}

/**
 * Walk the marker segments of an open JPEG stream positioned just after SOI.
 *
 * @param resource $fh Readable file handle; left open for the caller to close
 * @param bool $showXMP Whether XMP and extended XMP packets should be collected
 * @return array Same structure as segmentSplitter() returns
 * @throws MWException On malformed, truncated or excessively segmented input
 */
private static function collectJpegSegments( $fh, $showXMP ) {
	$segmentCount = 0;

	$segments = [
		'XMP_ext' => [],
		'COM' => [],
		'PSIR' => [],
	];

	// SOF0, SOF1, SOF2, ... (same list as getimagesize)
	$sofMarkers = [
		"\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
		"\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF",
	];

	while ( !feof( $fh ) ) {
		$buffer = fread( $fh, 1 );
		$segmentCount++;
		if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
			throw new MWException( 'Too many jpeg segments. Aborting' );
		}
		while ( $buffer !== "\xFF" && !feof( $fh ) ) {
			// In theory JPEG files are not allowed to contain anything between the sections,
			// but in practice they sometimes do. It's customary to ignore the garbage data.
			$buffer = fread( $fh, 1 );
		}

		$buffer = fread( $fh, 1 );
		while ( $buffer === "\xFF" && !feof( $fh ) ) {
			// Skip through any 0xFF padding bytes.
			$buffer = fread( $fh, 1 );
		}

		if ( $buffer === "\xFE" ) {
			// COM section -- file comment
			// First see if valid utf-8,
			// if not try to convert it to windows-1252.
			$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
			UtfNormal\Validator::quickIsNFCVerify( $com );
			// turns $com to valid utf-8.
			// thus if no change, it's utf-8, otherwise it's something else.
			if ( $com !== $oldCom ) {
				AtEase::suppressWarnings();
				$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
				AtEase::restoreWarnings();
			}
			// Try it again, if it's still not a valid string, then probably
			// binary junk or some really weird encoding, so don't extract.
			UtfNormal\Validator::quickIsNFCVerify( $com );
			if ( $com === $oldCom ) {
				$segments["COM"][] = $oldCom;
			} else {
				wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage." );
			}
		} elseif ( $buffer === "\xE1" ) {
			// APP1 section (Exif, XMP, and XMP extended)
			// XMP variants are only extracted if XMP is enabled.
			$temp = self::jpegExtractMarker( $fh );
			// check what type of app segment this is.
			if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
				// use trim to remove trailing \0 chars
				$segments["XMP"] = trim( substr( $temp, 29 ) );
			} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
				// use trim to remove trailing \0 chars
				$segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
			} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
				// Some images (especially flickr images) carry this malformed
				// identifier; accept it anyway.
				// use trim to remove trailing \0 chars
				$segments["XMP"] = trim( substr( $temp, 29 ) );
				wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
					. "Using anyways." );
			} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
				// Only the byte order is needed from the Exif header.
				// This is a II for little Endian, MM for big. Not a unicode BOM.
				$byteOrderMarker = substr( $temp, 6, 2 );
				if ( $byteOrderMarker === 'MM' ) {
					$segments['byteOrder'] = 'BE';
				} elseif ( $byteOrderMarker === 'II' ) {
					$segments['byteOrder'] = 'LE';
				} else {
					wfDebug( __METHOD__ . " Invalid byte ordering?!" );
				}
			}
		} elseif ( $buffer === "\xED" ) {
			// APP13 - PSIR. IPTC and some photoshop stuff
			$temp = self::jpegExtractMarker( $fh );
			if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
				$segments["PSIR"][] = $temp;
			}
		} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
			// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
			return $segments;
		} elseif ( in_array( $buffer, $sofMarkers, true ) ) {
			// Strict comparison: $buffer is a raw byte and must never be
			// matched through type juggling.
			$temp = self::jpegExtractMarker( $fh );
			$segments["SOF"] = wfUnpack( 'Cbits/nheight/nwidth/Ccomponents', $temp );
		} else {
			// segment we don't care about, so skip
			$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
			if ( $size['int'] < 2 ) {
				throw new MWException( "invalid marker size in jpeg" );
			}
			// Note it's possible to seek beyond end of file if truncated.
			// fseek doesn't report a failure in this case.
			fseek( $fh, $size['int'] - 2, SEEK_CUR );
		}
	}

	// Ran out of file without ever seeing SOS or EOI.
	throw new MWException( "Reached end of jpeg file unexpectedly" );
}
181
/**
 * Read the payload of the segment whose 2-byte length field starts at the
 * current position of the file handle.
 *
 * The length field is a big-endian 16-bit integer that counts itself, so the
 * payload is two bytes shorter than the stored value.
 *
 * @param resource &$fh File handle positioned at the segment's length field
 * @return string Segment payload without the length field; '' if the segment is empty
 * @throws MWException If the stored length is below 2 or the file ends
 *  before the whole payload could be read
 */
private static function jpegExtractMarker( &$fh ) {
	$header = wfUnpack( "nint", fread( $fh, 2 ), 2 );
	$payloadLength = $header['int'] - 2;

	if ( $payloadLength < 0 ) {
		throw new MWException( "invalid marker size in jpeg" );
	}
	if ( $payloadLength === 0 ) {
		// Nothing to read, and fread( ..., 0 ) generates a warning
		return '';
	}

	$payload = fread( $fh, $payloadLength );
	if ( strlen( $payload ) !== $payloadLength ) {
		throw new MWException( "Segment shorter than expected" );
	}

	return $payload;
}
204
/**
 * Check the IPTC hash stored in a Photoshop image resource (APP13) segment.
 *
 * Resource 0x0404 holds the IPTC data and resource 0x0425 holds an MD5 hash
 * of it. Comparing the two is used to tell whether the IPTC data is newer
 * than the XMP data, since XMP-aware programs update the hash while other
 * programs do not.
 *
 * @param string $app13 APP13 segment payload, starting with the 14-byte
 *  "Photoshop 3.0\0" identifier (not re-checked here)
 * @return string 'iptc-no-hash' if either the IPTC block or the recorded
 *  hash is missing, 'iptc-good-hash' if they match, 'iptc-bad-hash' otherwise
 * @throws MWException If the segment is empty or a resource block is malformed
 */
public static function doPSIR( $app13 ) {
	if ( !$app13 ) {
		throw new MWException( "No App13 segment given" );
	}

	$total = strlen( $app13 );
	// Begin right after the Photoshop 3.0 identifier.
	$pos = 14;
	$computedHash = "";
	$storedHash = "";

	// A resource with an empty name and no data takes 12 bytes:
	// 4 (signature) + 2 (id) + 2 (padded name) + 4 (data length).
	while ( $pos + 12 <= $total ) {
		// The signature is supposed to be 8BIM, but apparently sometimes
		// isn't, especially in really old files. Such blocks are stepped
		// over without being interpreted.
		$hasSignature = substr( $app13, $pos, 4 ) === '8BIM';

		// 2-byte id that identifies what this resource contains.
		$resourceId = substr( $app13, $pos + 4, 2 );
		$pos += 6;

		// The name is a pascal string (length byte + characters), padded
		// with a zero byte to an even size. It is never used, only skipped.
		$nameBytes = ord( substr( $app13, $pos, 1 ) ) + 1;
		$pos += $nameBytes + ( $nameBytes % 2 );

		// Data length: unsigned 32-bit big-endian.
		$dataLength = wfUnpack( 'Nlen', substr( $app13, $pos, 4 ), 4 )['len'];
		// PHP can turn very large unsigned ints negative. That should never
		// happen here, as the whole segment is limited by a 16-bit length.
		if ( $dataLength < 0 ) {
			throw new MWException( "Too big PSIR (" . $dataLength . ')' );
		}
		$pos += 4;

		// Should not happen, but make sure the data fits in the segment.
		if ( $dataLength + $pos > $total ) {
			throw new MWException( "PSIR data too long. (item length=" . $dataLength
				. "; offset=$pos; total length=$total)" );
		}

		if ( $hasSignature ) {
			if ( $resourceId === "\x04\x04" ) {
				// IPTC block: hash it ourselves (raw binary digest).
				$computedHash = md5( substr( $app13, $pos, $dataLength ), true );
			} elseif ( $resourceId === "\x04\x25" ) {
				// Hash recorded by whichever program wrote the file.
				$storedHash = substr( $app13, $pos, $dataLength );
			}
		}

		// Odd-sized data is followed by one null pad byte.
		$pos += $dataLength + ( $dataLength % 2 );
	}

	if ( $computedHash && $storedHash ) {
		return $computedHash === $storedHash ? 'iptc-good-hash' : 'iptc-bad-hash';
	}

	return 'iptc-no-hash';
}
307}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfUnpack( $format, $data, $length=false)
Wrapper around php's unpack.
Class for reading jpegs and extracting metadata.
static doPSIR( $app13)
This reads the photoshop image resource.
static segmentSplitter( $filename)
Function to extract metadata segments of interest from jpeg files based on GIFMetadataExtractor.
MediaWiki exception.