MediaWiki  master
JpegMetadataExtractor.php
Go to the documentation of this file.
1 <?php
24 use Wikimedia\XMPReader\Reader as XMPReader;
25 
// Safety cap: segmentSplitter() throws 'Too many jpeg segments. Aborting' once it has
// looked at more than this many segments in a single file.
39  private const MAX_JPEG_SEGMENTS = 200;
40 
/**
 * Split a JPEG file into the metadata segments of interest.
 *
 * Walks the markers from the start of the file until the first SOS or EOI marker
 * and collects, under these keys of the returned array:
 *  - 'COM': list of file comments that are (or could be converted to) valid UTF-8
 *  - 'XMP': the main XMP packet (only when XMPReader::isSupported())
 *  - 'XMP_ext': list of extended XMP payloads (only when XMPReader::isSupported())
 *  - 'PSIR': list of raw APP13 segments that start with "Photoshop 3.0\0"
 *  - 'byteOrder': 'BE' or 'LE', read from the Exif header
 *  - 'SOF': bits/height/width/components unpacked from the frame header
 *
 * 'COM', 'XMP_ext' and 'PSIR' are always present (possibly empty); the other keys
 * only appear when the corresponding segment was found.
 *
 * @param string $filename Path of the JPEG file to read
 * @return array The segments found, keyed as described above
 * @throws MWException If no usable file is given, it cannot be opened, it does not start
 *  with an SOI marker, it has too many segments, a segment is malformed, or the end of
 *  the file is reached before an SOS/EOI marker
 */
public static function segmentSplitter( $filename ) {
	$showXMP = XMPReader::isSupported();

	$segmentCount = 0;

	$segments = [
		'XMP_ext' => [],
		'COM' => [],
		'PSIR' => [],
	];

	// SOF0, SOF1, SOF2, ... (same list as getimagesize)
	$sofMarkers = [
		"\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
		"\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF",
	];

	if ( !$filename ) {
		throw new MWException( "No filename specified for " . __METHOD__ );
	}
	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
	}

	$fh = fopen( $filename, "rb" );

	if ( !$fh ) {
		throw new MWException( "Could not open file $filename" );
	}

	// Everything that touches the handle lives inside the try block, so the finally
	// clause closes it on the normal return as well as on every exception.
	try {
		$buffer = fread( $fh, 2 );
		if ( $buffer !== "\xFF\xD8" ) {
			throw new MWException( "Not a jpeg, no SOI" );
		}
		while ( !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
			$segmentCount++;
			if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
				throw new MWException( 'Too many jpeg segments. Aborting' );
			}
			while ( $buffer !== "\xFF" && !feof( $fh ) ) {
				// In theory JPEG files are not allowed to contain anything between the sections,
				// but in practice they sometimes do. It's customary to ignore the garbage data.
				$buffer = fread( $fh, 1 );
			}

			$buffer = fread( $fh, 1 );
			while ( $buffer === "\xFF" && !feof( $fh ) ) {
				// Skip through any 0xFF padding bytes.
				$buffer = fread( $fh, 1 );
			}
			if ( $buffer === "\xFE" ) {
				// COM section -- file comment
				// First see if valid utf-8,
				// if not try to convert it to windows-1252.
				$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
				UtfNormal\Validator::quickIsNFCVerify( $com );
				// turns $com to valid utf-8.
				// thus if no change, it's utf-8, otherwise it's something else.
				if ( $com !== $oldCom ) {
					Wikimedia\suppressWarnings();
					$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
					Wikimedia\restoreWarnings();
				}
				// Try it again, if it's still not a valid string, then probably
				// binary junk or some really weird encoding, so don't extract.
				UtfNormal\Validator::quickIsNFCVerify( $com );
				if ( $com === $oldCom ) {
					$segments["COM"][] = $oldCom;
				} else {
					wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage." );
				}
			} elseif ( $buffer === "\xE1" ) {
				// APP1 section (Exif, XMP, and XMP extended)
				// XMP payloads are only extracted if XMP is enabled.
				$temp = self::jpegExtractMarker( $fh );
				// check what type of app segment this is.
				if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// use trim to remove trailing \0 chars
					$segments["XMP"] = trim( substr( $temp, 29 ) );
				} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
					// use trim to remove trailing \0 chars
					$segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
				} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// Some images (especially flickr images) seem to have this
					// malformed identifier. Accept it anyway.
					// use trim to remove trailing \0 chars
					$segments["XMP"] = trim( substr( $temp, 29 ) );
					wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
						. "Using anyways." );
				} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
					// Only the byte order is needed from the Exif header.
					// This is II for little endian, MM for big endian. Not a unicode BOM.
					$byteOrderMarker = substr( $temp, 6, 2 );
					if ( $byteOrderMarker === 'MM' ) {
						$segments['byteOrder'] = 'BE';
					} elseif ( $byteOrderMarker === 'II' ) {
						$segments['byteOrder'] = 'LE';
					} else {
						wfDebug( __METHOD__ . " Invalid byte ordering?!" );
					}
				}
			} elseif ( $buffer === "\xED" ) {
				// APP13 - PSIR. IPTC and some photoshop stuff
				$temp = self::jpegExtractMarker( $fh );
				if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
					$segments["PSIR"][] = $temp;
				}
			} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
				// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
				return $segments;
			} elseif ( in_array( $buffer, $sofMarkers, true ) ) {
				// Start-of-frame segment: carries sample precision, dimensions and component count.
				$temp = self::jpegExtractMarker( $fh );
				$segments["SOF"] = wfUnpack( 'Cbits/nheight/nwidth/Ccomponents', $temp );
			} else {
				// segment we don't care about, so skip
				$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
				if ( $size['int'] < 2 ) {
					throw new MWException( "invalid marker size in jpeg" );
				}
				// Note it's possible to seek beyond end of file if truncated.
				// fseek doesn't report a failure in this case.
				fseek( $fh, $size['int'] - 2, SEEK_CUR );
			}
		}
		// shouldn't get here.
		throw new MWException( "Reached end of jpeg file unexpectedly" );
	} finally {
		fclose( $fh );
	}
}
180 
/**
 * Read one length-prefixed segment from the current position of an open file.
 *
 * The first two bytes are unpacked as a 16-bit big-endian length which counts
 * those two bytes as well, so the payload is that length minus two.
 *
 * @param resource &$fh Open file handle positioned at the segment's length field
 * @return string The segment payload, without the length field ('' if there is none)
 * @throws MWException If the length is below 2 or fewer payload bytes could be read
 */
private static function jpegExtractMarker( &$fh ) {
	$header = wfUnpack( "nint", fread( $fh, 2 ), 2 );
	$declaredLength = $header['int'];
	if ( $declaredLength < 2 ) {
		throw new MWException( "invalid marker size in jpeg" );
	}

	$payloadLength = $declaredLength - 2;
	if ( $payloadLength === 0 ) {
		// Nothing follows the length field; fread( ..., 0 ) would generate a warning.
		return '';
	}

	$payload = fread( $fh, $payloadLength );
	if ( strlen( $payload ) !== $payloadLength ) {
		throw new MWException( "Segment shorter than expected" );
	}

	return $payload;
}
203 
/**
 * Walk the records of a Photoshop image resource (APP13) segment and compare the
 * MD5 of the IPTC record (id 0x0404) with the hash stored in record 0x0425.
 *
 * The hash tells whether the IPTC data is newer than the XMP data: XMP-aware
 * programs update the stored hash, others don't.
 *
 * @param string $app13 APP13 segment, including the 14-byte "Photoshop 3.0\0" identifier
 *  (the identifier itself is not re-checked here)
 * @return string 'iptc-no-hash' if either record is missing or empty, 'iptc-good-hash'
 *  if the hashes match, 'iptc-bad-hash' otherwise
 * @throws MWException If no segment is given or a record length is invalid
 */
public static function doPSIR( $app13 ) {
	if ( !$app13 ) {
		throw new MWException( "No App13 segment given" );
	}

	$appLen = strlen( $app13 );
	// Start right after the identifier.
	$offset = 14;
	$iptcDigest = "";
	$storedDigest = "";

	// 12 bytes is the size of an empty record:
	// 4 (signature) + 2 (id) + 2 (empty padded name) + 4 (data length).
	while ( $offset + 12 <= $appLen ) {
		// Records are supposed to start with 8BIM, but in really old jpegs
		// they sometimes don't; such records are walked over but not used.
		$hasSignature = ( substr( $app13, $offset, 4 ) === '8BIM' );
		// 2-byte id saying what kind of information the record holds.
		$resourceId = substr( $app13, $offset + 4, 2 );
		$offset += 6;

		// The name is a pascal string (length byte + text) padded to an even
		// number of bytes. It is never used, only skipped.
		$nameBytes = ord( substr( $app13, $offset, 1 ) ) + 1;
		$nameBytes += $nameBytes % 2;
		$offset += $nameBytes;

		// Data length: unsigned 32-bit big endian.
		$unpacked = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
		$dataLen = $unpacked['len'];
		// PHP can turn very large unsigned ints negative. That should never be
		// a legitimate value, since the enclosing segment length is only 16 bit.
		if ( $dataLen < 0 ) {
			throw new MWException( "Too big PSIR (" . $dataLen . ')' );
		}
		$offset += 4;

		// The record must not claim more data than the segment holds.
		if ( $dataLen + $offset > $appLen ) {
			throw new MWException( "PSIR data too long. (item length=" . $dataLen
				. "; offset=$offset; total length=$appLen)" );
		}

		if ( $hasSignature ) {
			if ( $resourceId === "\x04\x04" ) {
				// IPTC block: hash what is actually there.
				$iptcDigest = md5( substr( $app13, $offset, $dataLen ), true );
			} elseif ( $resourceId === "\x04\x25" ) {
				// Hash recorded by whichever program wrote the file.
				$storedDigest = substr( $app13, $offset, $dataLen );
			}
		}

		// Odd-length data is followed by one null pad byte.
		$offset += $dataLen + ( $dataLen % 2 );
	}

	if ( !$iptcDigest || !$storedDigest ) {
		return 'iptc-no-hash';
	}

	return $iptcDigest === $storedDigest ? 'iptc-good-hash' : 'iptc-bad-hash';
}
306 }
JpegMetadataExtractor
Class for reading jpegs and extracting metadata.
Definition: JpegMetadataExtractor.php:34
JpegMetadataExtractor\jpegExtractMarker
static jpegExtractMarker(&$fh)
Helper function for segmentSplitter().
Definition: JpegMetadataExtractor.php:187
wfUnpack
wfUnpack( $format, $data, $length=false)
Wrapper around php's unpack.
Definition: GlobalFunctions.php:2479
MWException
MediaWiki exception.
Definition: MWException.php:29
JpegMetadataExtractor\doPSIR
static doPSIR( $app13)
This reads the photoshop image resource.
Definition: JpegMetadataExtractor.php:218
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:894
JpegMetadataExtractor\segmentSplitter
static segmentSplitter( $filename)
Function to extract metadata segments of interest from jpeg files. Based on GIFMetadataExtractor.
Definition: JpegMetadataExtractor.php:52
JpegMetadataExtractor\MAX_JPEG_SEGMENTS
const MAX_JPEG_SEGMENTS
The max segment is a safety check.
Definition: JpegMetadataExtractor.php:39