Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.31% |
102 / 127 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
JpegMetadataExtractor | |
80.31% |
102 / 127 |
|
0.00% |
0 / 3 |
74.43 | |
0.00% |
0 / 1 |
segmentSplitter | |
84.00% |
63 / 75 |
|
0.00% |
0 / 1 |
37.46 | |||
jpegExtractMarker | |
54.55% |
6 / 11 |
|
0.00% |
0 / 1 |
7.35 | |||
doPSIR | |
80.49% |
33 / 41 |
|
0.00% |
0 / 1 |
16.67 |
1 | <?php |
2 | /** |
3 | * Extraction of JPEG image metadata. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup Media |
22 | */ |
23 | |
24 | use MediaWiki\Libs\UnpackFailedException; |
25 | use Wikimedia\AtEase\AtEase; |
26 | use Wikimedia\XMPReader\Reader as XMPReader; |
27 | |
/**
 * Class for reading jpegs and extracting metadata.
 * see also BitmapMetadataHandler.
 *
 * Based somewhat on GIFMetadataExtractor.
 *
 * All methods are static; the class holds no state.
 *
 * @ingroup Media
 */
class JpegMetadataExtractor {
	/**
	 * The max segment is a safety check. A JPEG file should never even remotely have
	 * that many segments. Your average file has about 10.
	 */
	private const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Function to extract metadata segments of interest from jpeg files
	 * based on GIFMetadataExtractor.
	 *
	 * we can almost use getimagesize to do this
	 * but gis doesn't support having multiple app1 segments
	 * and those can't extract xmp on files containing both exif and xmp data
	 *
	 * The file is scanned marker by marker until SOS or EOI is reached. The
	 * returned array always has the list keys 'XMP_ext', 'COM' and 'PSIR', and
	 * may additionally have 'XMP' (string), 'byteOrder' ('BE' or 'LE') and
	 * 'SOF' (unpacked bits/height/width/components).
	 *
	 * @param string $filename Name of jpeg file
	 * @return array Array of interesting segments.
	 * @throws InvalidJpegException
	 */
	public static function segmentSplitter( $filename ) {
		$showXMP = XMPReader::isSupported();

		$segmentCount = 0;

		$segments = [
			'XMP_ext' => [],
			'COM' => [],
			'PSIR' => [],
		];

		if ( !$filename ) {
			throw new InvalidJpegException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new InvalidJpegException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new InvalidJpegException( "Could not open file $filename" );
		}

		// From here on the handle is open: make sure it is released on every
		// exit path (normal return as well as any exception).
		try {
			$buffer = fread( $fh, 2 );
			if ( $buffer !== "\xFF\xD8" ) {
				throw new InvalidJpegException( "Not a jpeg, no SOI" );
			}
			while ( !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					throw new InvalidJpegException( 'Too many jpeg segments. Aborting' );
				}
				while ( $buffer !== "\xFF" && !feof( $fh ) ) {
					// In theory JPEG files are not allowed to contain anything between the sections,
					// but in practice they sometimes do. It's customary to ignore the garbage data.
					$buffer = fread( $fh, 1 );
				}

				$buffer = fread( $fh, 1 );
				while ( $buffer === "\xFF" && !feof( $fh ) ) {
					// Skip through any 0xFF padding bytes.
					$buffer = fread( $fh, 1 );
				}
				if ( $buffer === "\xFE" ) {
					// COM section -- file comment
					$comment = self::normalizeComment( self::jpegExtractMarker( $fh ) );
					if ( $comment !== null ) {
						$segments["COM"][] = $comment;
					} else {
						wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage." );
					}
				} elseif ( $buffer === "\xE1" ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// only extract if XMP is enabled.
					$temp = self::jpegExtractMarker( $fh );
					// check what type of app segment this is.
					if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// use trim to remove trailing \0 chars
						$segments["XMP"] = trim( substr( $temp, 29 ) );
					} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
						// use trim to remove trailing \0 chars
						$segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
					} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// Some images (especially flickr images) seem to have this.
						// I really have no idea what the deal is with them, but
						// whatever...
						// use trim to remove trailing \0 chars
						$segments["XMP"] = trim( substr( $temp, 29 ) );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways." );
					} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
						// Just need to find out what the byte order is.
						// because php's exif plugin sucks...
						// This is a II for little Endian, MM for big. Not a unicode BOM.
						$byteOrderMarker = substr( $temp, 6, 2 );
						if ( $byteOrderMarker === 'MM' ) {
							$segments['byteOrder'] = 'BE';
						} elseif ( $byteOrderMarker === 'II' ) {
							$segments['byteOrder'] = 'LE';
						} else {
							wfDebug( __METHOD__ . " Invalid byte ordering?!" );
						}
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
						$segments["PSIR"][] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
					return $segments;
				} elseif ( in_array( $buffer, [
					"\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
					"\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF" ], true )
				) {
					// SOF0, SOF1, SOF2, ... (same list as getimagesize)
					$temp = self::jpegExtractMarker( $fh );
					try {
						$segments["SOF"] = StringUtils::unpack( 'Cbits/nheight/nwidth/Ccomponents', $temp );
					} catch ( UnpackFailedException $e ) {
						throw new InvalidJpegException( $e->getMessage() );
					}
				} else {
					// segment we don't care about, so skip
					$size = self::readSegmentLength( $fh );
					// Note it's possible to seek beyond end of file if truncated.
					// fseek doesn't report a failure in this case.
					fseek( $fh, $size - 2, SEEK_CUR );
				}
			}
			// shouldn't get here.
			throw new InvalidJpegException( "Reached end of jpeg file unexpectedly" );
		} finally {
			fclose( $fh );
		}
	}

	/**
	 * Turn the raw payload of a COM segment into a UTF-8 comment string.
	 *
	 * First see if the trimmed payload is valid UTF-8; if not, try to convert
	 * it from windows-1252. If it is still not a valid string after that, it
	 * is probably binary junk or some really weird encoding, so it is rejected.
	 *
	 * @param string $rawComment Payload of the COM segment
	 * @return string|null The comment, or null if it should not be extracted
	 */
	private static function normalizeComment( $rawComment ) {
		$com = $oldCom = trim( $rawComment );
		// quickIsNFCVerify turns $com to valid utf-8.
		// thus if no change, it's utf-8, otherwise it's something else.
		UtfNormal\Validator::quickIsNFCVerify( $com );
		if ( $com !== $oldCom ) {
			AtEase::suppressWarnings();
			$converted = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
			AtEase::restoreWarnings();
			if ( $converted === false ) {
				// iconv() gave up entirely; never store a non-string as a comment.
				return null;
			}
			$com = $oldCom = $converted;
		}
		// Try it again on whatever we have now.
		UtfNormal\Validator::quickIsNFCVerify( $com );

		return $com === $oldCom ? $oldCom : null;
	}

	/**
	 * Read and validate the two byte big-endian length field of a segment.
	 *
	 * The length counts the two length bytes themselves, so any value below 2
	 * is invalid.
	 *
	 * @param resource $fh File handle positioned right after a segment marker
	 * @throws InvalidJpegException If the field cannot be read or is below 2
	 * @return int Segment length, including the two length bytes
	 */
	private static function readSegmentLength( $fh ) {
		try {
			$size = StringUtils::unpack( "nint", fread( $fh, 2 ), 2 );
		} catch ( UnpackFailedException $e ) {
			throw new InvalidJpegException( $e->getMessage() );
		}
		if ( $size['int'] < 2 ) {
			throw new InvalidJpegException( "invalid marker size in jpeg" );
		}

		return $size['int'];
	}

	/**
	 * Helper function for segmentSplitter
	 *
	 * Reads the length field at the current position and then the segment
	 * payload that follows it.
	 *
	 * @param resource &$fh File handle for JPEG file
	 * @throws InvalidJpegException
	 * @return string Data content of segment.
	 */
	private static function jpegExtractMarker( &$fh ) {
		$payloadLength = self::readSegmentLength( $fh ) - 2;
		if ( $payloadLength === 0 ) {
			// fread( ..., 0 ) generates a warning
			return '';
		}
		$segment = fread( $fh, $payloadLength );
		// fread() yields false on failure and a short string on a truncated file.
		if ( $segment === false || strlen( $segment ) !== $payloadLength ) {
			throw new InvalidJpegException( "Segment shorter than expected" );
		}

		return $segment;
	}

	/**
	 * This reads the photoshop image resource.
	 * Currently it only compares the iptc/iim hash
	 * with the stored hash, which is used to determine the precedence
	 * of the iptc data. In future it may extract some other info, like
	 * url of copyright license.
	 *
	 * This should generally be called by BitmapMetadataHandler::doApp13()
	 *
	 * @param string $app13 Photoshop psir app13 block from jpg.
	 * @throws InvalidPSIRException
	 * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
	 *    'iptc-good-hash', 'iptc-bad-hash'.
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			throw new InvalidPSIRException( "No App13 segment given" );
		}
		// First compare hash with real thing
		// 0x404 contains IPTC, 0x425 has hash
		// This is used to determine if the iptc is newer than
		// the xmp data, as xmp programs update the hash,
		// where non-xmp programs don't.

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		while ( $offset + 12 <= $appLen ) {
			$valid = true;
			if ( substr( $app13, $offset, 4 ) !== '8BIM' ) {
				// it's supposed to be 8BIM
				// but apparently sometimes isn't esp. in
				// really old jpg's
				$valid = false;
			}
			$offset += 4;
			$id = substr( $app13, $offset, 2 );
			// id is a 2 byte id number which identifies
			// the piece of info this record contains.

			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.

			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			// we never use the name so skip it. +1 for length byte
			if ( $lenName % 2 === 1 ) {
				$lenName++;
			} // pad to even.
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			try {
				$lenData = StringUtils::unpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			} catch ( UnpackFailedException $e ) {
				throw new InvalidPSIRException( $e->getMessage() );
			}
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16 bit number.
			if ( $lenData['len'] < 0 ) {
				throw new InvalidPSIRException( "Too big PSIR (" . $lenData['len'] . ')' );
			}

			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new InvalidPSIRException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			// Records without the 8BIM signature are still skipped over using
			// their length fields, but their content is ignored.
			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// Stored hash of the IPTC block
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 === 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		}
		if ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		}
		/* if $realHash !== $recordedHash */
		return 'iptc-bad-hash';
	}
}