MediaWiki master
JpegMetadataExtractor.php
Go to the documentation of this file.
1<?php
10namespace MediaWiki\Media;
11
12use Wikimedia\AtEase\AtEase;
15use Wikimedia\XMPReader\Reader as XMPReader;
16
// Upper bound on the number of segments segmentSplitter() will walk through in a
// single file; once exceeded, parsing is aborted with an InvalidJpegException.
30 private const MAX_JPEG_SEGMENTS = 200;
31
/**
 * Walk the segments of a JPEG file and collect the metadata-bearing ones.
 *
 * Segments are read from the start of the file until an EOI (0xD9) or SOS (0xDA)
 * marker is reached, at which point everything collected so far is returned.
 * The file handle is closed on every exit path, including exceptions.
 *
 * @param string $filename Path of the JPEG file to read
 * @return array Always contains the list-valued keys 'XMP_ext', 'COM' and 'PSIR'.
 *  The keys 'XMP' (string), 'byteOrder' ('BE' or 'LE') and 'SOF' (the result of
 *  StringUtils::unpack() with format 'Cbits/nheight/nwidth/Ccomponents') are only
 *  set when a matching segment was found.
 * @throws InvalidJpegException If no usable filename is given, the file cannot be
 *  opened, it does not start with an SOI marker, it has more than
 *  MAX_JPEG_SEGMENTS segments, a segment header cannot be decoded, or the end of
 *  the file is reached before EOI/SOS.
 */
public static function segmentSplitter( $filename ) {
	if ( !$filename ) {
		throw new InvalidJpegException( "No filename specified for " . __METHOD__ );
	}
	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new InvalidJpegException( "Invalid file $filename passed to " . __METHOD__ );
	}

	$fh = fopen( $filename, "rb" );

	if ( !$fh ) {
		throw new InvalidJpegException( "Could not open file $filename" );
	}

	// Every way out of the parsing code below is a return or a throw; the finally
	// block guarantees the handle is released on all of them.
	try {
		$buffer = fread( $fh, 2 );
		if ( $buffer !== "\xFF\xD8" ) {
			throw new InvalidJpegException( "Not a jpeg, no SOI" );
		}

		$showXMP = XMPReader::isSupported();

		$segmentCount = 0;

		$segments = [
			'XMP_ext' => [],
			'COM' => [],
			'PSIR' => [],
		];

		// SOF0, SOF1, SOF2, ... (same list as getimagesize)
		$sofMarkers = [
			"\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
			"\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF",
		];

		while ( !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
			$segmentCount++;
			if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
				throw new InvalidJpegException( 'Too many jpeg segments. Aborting' );
			}
			while ( $buffer !== "\xFF" && !feof( $fh ) ) {
				// In theory JPEG files are not allowed to contain anything between the sections,
				// but in practice they sometimes do. It's customary to ignore the garbage data.
				$buffer = fread( $fh, 1 );
			}

			$buffer = fread( $fh, 1 );
			while ( $buffer === "\xFF" && !feof( $fh ) ) {
				// Skip through any 0xFF padding bytes.
				$buffer = fread( $fh, 1 );
			}

			// $buffer now holds the marker type byte of the current segment.
			if ( $buffer === "\xFE" ) {
				// COM section -- file comment
				// First see if valid utf-8,
				// if not try to convert it to windows-1252.
				$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
				\UtfNormal\Validator::quickIsNFCVerify( $com );
				// turns $com to valid utf-8.
				// thus if no change, it's utf-8, otherwise it's something else.
				if ( $com !== $oldCom ) {
					AtEase::suppressWarnings();
					$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
					AtEase::restoreWarnings();
				}
				// Try it again, if it's still not a valid string, then probably
				// binary junk or some really weird encoding, so don't extract.
				\UtfNormal\Validator::quickIsNFCVerify( $com );
				if ( $com === $oldCom ) {
					$segments["COM"][] = $oldCom;
				} else {
					wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage." );
				}
			} elseif ( $buffer === "\xE1" ) {
				// APP1 section (Exif, XMP, and XMP extended)
				// XMP variants are only extracted if XMP is enabled.
				$temp = self::jpegExtractMarker( $fh );
				// check what type of app segment this is.
				if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// use trim to remove trailing \0 chars
					$segments["XMP"] = trim( substr( $temp, 29 ) );
				} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
					// use trim to remove trailing \0 chars
					$segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
				} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// Some images (especially flickr images) seem to have this
					// malformed identifier. Accept it anyway.
					// use trim to remove trailing \0 chars
					$segments["XMP"] = trim( substr( $temp, 29 ) );
					wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
						. "Using anyways." );
				} elseif ( str_starts_with( $temp, "Exif\0\0" ) ) {
					// Only the byte order is needed from the Exif segment here.
					// This is a II for little Endian, MM for big. Not a unicode BOM.
					$byteOrderMarker = substr( $temp, 6, 2 );
					if ( $byteOrderMarker === 'MM' ) {
						$segments['byteOrder'] = 'BE';
					} elseif ( $byteOrderMarker === 'II' ) {
						$segments['byteOrder'] = 'LE';
					} else {
						wfDebug( __METHOD__ . " Invalid byte ordering?!" );
					}
				}
			} elseif ( $buffer === "\xED" ) {
				// APP13 - PSIR. IPTC and some Photoshop stuff
				$temp = self::jpegExtractMarker( $fh );
				if ( str_starts_with( $temp, "Photoshop 3.0\x00" ) ) {
					$segments["PSIR"][] = $temp;
				}
			} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
				// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
				return $segments;
			} elseif ( in_array( $buffer, $sofMarkers, true ) ) {
				// Start-of-frame segment: carries sample precision, dimensions and component count.
				$temp = self::jpegExtractMarker( $fh );
				try {
					$segments["SOF"] = StringUtils::unpack( 'Cbits/nheight/nwidth/Ccomponents', $temp );
				} catch ( UnpackFailedException $e ) {
					throw new InvalidJpegException( $e->getMessage() );
				}
			} else {
				// segment we don't care about, so skip
				try {
					$size = StringUtils::unpack( "nint", fread( $fh, 2 ), 2 );
				} catch ( UnpackFailedException $e ) {
					throw new InvalidJpegException( $e->getMessage() );
				}
				if ( $size['int'] < 2 ) {
					throw new InvalidJpegException( "invalid marker size in jpeg" );
				}
				// Note it's possible to seek beyond the end of the file if truncated.
				// fseek doesn't report a failure in this case.
				fseek( $fh, $size['int'] - 2, SEEK_CUR );
			}
		}
		// Ran out of file without ever seeing EOI or SOS.
		throw new InvalidJpegException( "Reached end of jpeg file unexpectedly" );
	} finally {
		fclose( $fh );
	}
}
181
/**
 * Read the payload of the segment whose length field starts at the current
 * position of the file handle.
 *
 * The first two bytes are decoded with the 'n' unpack format and give the segment
 * length including those two bytes themselves; the remaining length - 2 bytes
 * are returned.
 *
 * @param resource &$fh Open file handle positioned at the segment's length field
 * @return string Segment payload without the length field; '' for an empty segment
 * @throws InvalidJpegException If the length field cannot be decoded, is smaller
 *  than 2, or fewer bytes than announced could be read
 */
private static function jpegExtractMarker( &$fh ) {
	try {
		$header = StringUtils::unpack( "nint", fread( $fh, 2 ), 2 );
	} catch ( UnpackFailedException $e ) {
		throw new InvalidJpegException( $e->getMessage() );
	}

	// The announced length counts its own two bytes, so subtract them.
	$payloadLength = $header['int'] - 2;
	if ( $payloadLength < 0 ) {
		throw new InvalidJpegException( "invalid marker size in jpeg" );
	}
	if ( $payloadLength === 0 ) {
		// Nothing to read; also avoids fread( ..., 0 ), which generates a warning.
		return '';
	}

	$payload = fread( $fh, $payloadLength );
	if ( strlen( $payload ) !== $payloadLength ) {
		throw new InvalidJpegException( "Segment shorter than expected" );
	}

	return $payload;
}
208
/**
 * Walk the Photoshop image resources in an APP13 segment and compare the MD5 of
 * the IPTC block (resource 0x0404) with the hash stored in resource 0x0425.
 *
 * The comparison tells whether the IPTC data was changed after the hash was
 * last written: programs that are XMP-aware update the stored hash, others
 * don't.
 *
 * @param string $app13 Contents of the APP13 segment, including the 14-byte
 *  "Photoshop 3.0\0" identifier (assumed to have been checked by the caller)
 * @return string 'iptc-no-hash' if either the IPTC block or the stored hash is
 *  missing, 'iptc-good-hash' if they match, 'iptc-bad-hash' otherwise
 * @throws InvalidPSIRException If the segment is empty or a resource's length
 *  field is undecodable, negative, or points past the end of the segment
 */
public static function doPSIR( $app13 ) {
	if ( !$app13 ) {
		throw new InvalidPSIRException( "No App13 segment given" );
	}

	$appLen = strlen( $app13 );
	$computedDigest = "";
	$storedDigest = "";

	// Start right behind the PHOTOSHOP 3.0 identifier.
	$offset = 14;

	// 12 bytes is the size of a resource with an empty name and no data.
	while ( $appLen - $offset >= 12 ) {
		// Each resource should start with '8BIM'. Apparently some (especially really
		// old) files don't; such entries are stepped over but not interpreted.
		$isResourceBlock = substr( $app13, $offset, 4 ) === '8BIM';
		// 2-byte id saying what kind of information this resource holds.
		$resourceId = substr( $app13, $offset + 4, 2 );
		$offset += 6;

		// Resource name: a pascal string (length byte + text) zero-padded to an even
		// number of bytes. It is never used, only skipped.
		$nameBytes = ord( substr( $app13, $offset, 1 ) ) + 1;
		$nameBytes += $nameBytes % 2;
		$offset += $nameBytes;

		// Length of the data (unsigned long, big endian).
		try {
			$unpacked = StringUtils::unpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
		} catch ( UnpackFailedException $e ) {
			throw new InvalidPSIRException( $e->getMessage() );
		}
		$dataLen = $unpacked['len'];
		// A very large unsigned value can come out negative in PHP. That should never
		// be legitimate, since the whole segment is limited to a 16-bit length.
		if ( $dataLen < 0 ) {
			throw new InvalidPSIRException( "Too big PSIR (" . $dataLen . ')' );
		}

		// Step over the 4-byte length field; $offset now points at the data.
		$offset += 4;

		// Should not happen, but refuse data that runs past the segment.
		if ( $dataLen + $offset > $appLen ) {
			throw new InvalidPSIRException( "PSIR data too long. (item length=" . $dataLen
				. "; offset=$offset; total length=$appLen)" );
		}

		if ( $isResourceBlock ) {
			if ( $resourceId === "\x04\x04" ) {
				// IPTC block: hash it ourselves.
				$computedDigest = md5( substr( $app13, $offset, $dataLen ), true );
			} elseif ( $resourceId === "\x04\x25" ) {
				// Hash recorded in the file.
				$storedDigest = substr( $app13, $offset, $dataLen );
			}
		}

		// Odd-sized data is followed by one null pad byte.
		$offset += $dataLen + ( $dataLen % 2 );
	}

	if ( $computedDigest && $storedDigest ) {
		return $computedDigest === $storedDigest ? 'iptc-good-hash' : 'iptc-bad-hash';
	}

	return 'iptc-no-hash';
}
317}
318
// Also make the class available under the un-namespaced name 'JpegMetadataExtractor'.
320class_alias( JpegMetadataExtractor::class, 'JpegMetadataExtractor' );
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Class for reading jpegs and extracting metadata.
static segmentSplitter( $filename)
Extracts the metadata segments of interest from a JPEG file. The implementation is modelled on GIFMetadataExtractor.
static doPSIR( $app13)
This reads the Photoshop image resource.
A collection of static methods to play with strings.