MediaWiki  1.23.1
JpegMetadataExtractor.php
Go to the documentation of this file.
1 <?php
33  const MAX_JPEG_SEGMENTS = 200;
34 
35  // MAX_JPEG_SEGMENTS is only a sanity check on the number of segments read.
36  // A JPEG file should never come even remotely close to
37  // that many segments; an average file has about 10.
38 
/**
 * Extract the metadata segments of interest from a JPEG file.
 *
 * Walks the file marker by marker, collecting COM (comment), APP1
 * (XMP, extended XMP, Exif byte order) and APP13 (Photoshop image
 * resource) segments, and stops at the first SOS or EOI marker.
 * XMP segments are only collected when xml_parser_create_ns() exists.
 *
 * @param string $filename Path of the JPEG file to read
 * @return array Array with the following keys:
 *   - 'COM': list of comment strings, converted to UTF-8 when needed
 *   - 'PSIR': list of raw APP13 segments that start with "Photoshop 3.0\0"
 *   - 'XMP_ext': list of extended XMP payloads (identifier stripped)
 *   - 'XMP': main XMP payload (only set if such a segment was found)
 *   - 'byteOrder': 'BE' or 'LE' (only set if an Exif segment with a
 *     recognised byte order mark was found)
 * @throws MWException If no usable file is given, the file is not a JPEG,
 *   a marker or segment size is malformed, there are more than
 *   MAX_JPEG_SEGMENTS segments, or the file ends before SOS/EOI.
 */
public static function segmentSplitter( $filename ) {
    $showXMP = function_exists( 'xml_parser_create_ns' );

    $segmentCount = 0;

    $segments = array(
        'XMP_ext' => array(),
        'COM' => array(),
        'PSIR' => array(),
    );

    if ( !$filename ) {
        throw new MWException( "No filename specified for " . __METHOD__ );
    }
    if ( !file_exists( $filename ) || is_dir( $filename ) ) {
        throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
    }

    $fh = fopen( $filename, "rb" );

    if ( !$fh ) {
        throw new MWException( "Could not open file $filename" );
    }

    // Every JPEG starts with the SOI (start of image) marker.
    $buffer = fread( $fh, 2 );
    if ( $buffer !== "\xFF\xD8" ) {
        throw new MWException( "Not a jpeg, no SOI" );
    }
    while ( !feof( $fh ) ) {
        $buffer = fread( $fh, 1 );
        $segmentCount++;
        if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
            // this is just a sanity check
            throw new MWException( 'Too many jpeg segments. Aborting' );
        }
        if ( $buffer !== "\xFF" ) {
            throw new MWException( "Error reading jpeg file marker. " .
                "Expected 0xFF but got " . bin2hex( $buffer ) );
        }

        $buffer = fread( $fh, 1 );
        while ( $buffer === "\xFF" && !feof( $fh ) ) {
            // Skip through any 0xFF padding bytes.
            $buffer = fread( $fh, 1 );
        }
        if ( $buffer === "\xFE" ) {
            // COM section -- file comment
            // First see if valid utf-8,
            // if not try to convert it to windows-1252.
            $com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
            // quickIsNFCVerify() takes $com by reference and turns it into
            // valid utf-8. Thus if there is no change, it is utf-8,
            // otherwise it is something else.
            UtfNormal::quickIsNFCVerify( $com );
            if ( $com !== $oldCom ) {
                // iconv() can complain about bytes it cannot convert;
                // //IGNORE drops them, so silence the notice.
                wfSuppressWarnings();
                $com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
                wfRestoreWarnings();
            }
            // Try it again, if it is still not a valid string, then it is
            // probably binary junk or some really weird encoding, so
            // don't extract.
            UtfNormal::quickIsNFCVerify( $com );
            if ( $com === $oldCom ) {
                $segments["COM"][] = $oldCom;
            } else {
                wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
            }
        } elseif ( $buffer === "\xE1" ) {
            // APP1 section (Exif, XMP, and XMP extended)
            // only extract if XMP is enabled.
            $temp = self::jpegExtractMarker( $fh );
            // check what type of app segment this is.
            if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                $segments["XMP"] = substr( $temp, 29 );
            } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
                $segments["XMP_ext"][] = substr( $temp, 35 );
            } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                // Some images (especially flickr images) seem to have this.
                // I really have no idea what the deal is with them, but
                // whatever...
                $segments["XMP"] = substr( $temp, 29 );
                wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
                    . "Using anyways.\n" );
            } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
                // Just need to find out what the byte order is.
                // because php's exif plugin sucks...
                // This is a II for little Endian, MM for big. Not a unicode BOM.
                $byteOrderMarker = substr( $temp, 6, 2 );
                if ( $byteOrderMarker === 'MM' ) {
                    $segments['byteOrder'] = 'BE';
                } elseif ( $byteOrderMarker === 'II' ) {
                    $segments['byteOrder'] = 'LE';
                } else {
                    wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
                }
            }
        } elseif ( $buffer === "\xED" ) {
            // APP13 - PSIR. IPTC and some photoshop stuff
            $temp = self::jpegExtractMarker( $fh );
            if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
                $segments["PSIR"][] = $temp;
            }
        } elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
            // EOI - end of image or SOS - start of scan. either way we're past any interesting segments
            return $segments;
        } else {
            // segment we don't care about, so skip
            $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
            // The length field counts its own two bytes, so 2 is the
            // smallest legal value (a segment with an empty payload).
            if ( $size['int'] < 2 ) {
                throw new MWException( "invalid marker size in jpeg" );
            }
            fseek( $fh, $size['int'] - 2, SEEK_CUR );
        }
    }
    // shouldn't get here.
    throw new MWException( "Reached end of jpeg file unexpectedly" );
}
167 
/**
 * Helper for segmentSplitter(): read one length-prefixed segment.
 *
 * Expects the file pointer to sit directly after a marker, on the
 * two-byte big-endian length field. The length includes those two
 * bytes, so the payload is (length - 2) bytes long.
 *
 * @param resource &$fh Open file handle positioned at the length field
 * @return string The segment payload, without the length field; an empty
 *   string for a segment that declares no payload
 * @throws MWException If the declared length is smaller than 2 or the
 *   file ends before the whole payload could be read
 */
private static function jpegExtractMarker( &$fh ) {
    $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
    $payloadLength = $size['int'] - 2;
    if ( $payloadLength < 0 ) {
        throw new MWException( "invalid marker size in jpeg" );
    }
    if ( $payloadLength === 0 ) {
        // A length of exactly 2 is a legal, empty segment. Return early
        // because fread() does not accept a zero length.
        return '';
    }
    $segment = fread( $fh, $payloadLength );
    if ( strlen( $segment ) !== $payloadLength ) {
        throw new MWException( "Segment shorter than expected" );
    }

    return $segment;
}
186 
/**
 * Check the IPTC data of a Photoshop image resource (APP13) segment
 * against the hash stored next to it.
 *
 * Resource 0x0404 holds the IPTC block and resource 0x0425 holds an MD5
 * digest of it. XMP-aware programs update the digest while other
 * programs do not, so a mismatch is used to tell whether the IPTC data
 * is newer than the XMP data.
 *
 * @param string $app13 APP13 segment, including the 14-byte
 *   "Photoshop 3.0\0" identifier (not re-checked here)
 * @return string 'iptc-no-hash' if either the IPTC block or the stored
 *   digest is missing or empty, 'iptc-good-hash' if they match,
 *   'iptc-bad-hash' otherwise
 * @throws MWException If no segment is given or a resource block claims
 *   more data than the segment contains
 */
public static function doPSIR( $app13 ) {
    if ( !$app13 ) {
        throw new MWException( "No App13 segment given" );
    }

    $total = strlen( $app13 );
    $computedDigest = "";
    $storedDigest = "";

    // Begin right behind the "Photoshop 3.0\0" identifier.
    $pos = 14;

    // 12 bytes is the size of an empty resource block: 4 (signature)
    // + 2 (id) + 2 (empty padded name) + 4 (data length).
    while ( $pos + 12 <= $total ) {
        // The signature is supposed to be 8BIM, but apparently sometimes
        // is not, especially in really old files. Such blocks are still
        // stepped over, just never inspected.
        $hasSignature = ( substr( $app13, $pos, 4 ) === '8BIM' );

        // Two-byte number identifying what this block contains.
        $resourceId = substr( $app13, $pos + 4, 2 );
        $pos += 6;

        // The name is a pascal string padded to an even size. It is never
        // used, so only its size matters: length byte + characters,
        // rounded up to the next even number.
        $nameBytes = ord( substr( $app13, $pos, 1 ) ) + 1;
        $nameBytes += $nameBytes % 2;
        $pos += $nameBytes;

        // Data length: unsigned 32-bit big-endian integer.
        $unpacked = wfUnpack( 'Nlen', substr( $app13, $pos, 4 ), 4 );
        $dataBytes = $unpacked['len'];
        // PHP may turn very large unsigned values into negative ones.
        // That cannot be legitimate, as the enclosing segment length is
        // limited to a 16 bit number.
        if ( $dataBytes < 0 ) {
            throw new MWException( "Too big PSIR (" . $dataBytes . ')' );
        }

        // Step over the length field itself.
        $pos += 4;

        // Should not happen, but make sure the data fits in the segment.
        if ( $dataBytes + $pos > $total ) {
            throw new MWException( "PSIR data too long. (item length=" . $dataBytes
                . "; offset=$pos; total length=$total)" );
        }

        if ( $hasSignature ) {
            if ( $resourceId === "\x04\x04" ) {
                // IPTC block: compute its real digest.
                $computedDigest = md5( substr( $app13, $pos, $dataBytes ), true );
            } elseif ( $resourceId === "\x04\x25" ) {
                // Digest recorded by whichever program wrote the file.
                $storedDigest = substr( $app13, $pos, $dataBytes );
            }
        }

        // Odd-sized data is followed by one null pad byte.
        $pos += $dataBytes + ( $dataBytes % 2 );
    }

    if ( $computedDigest && $storedDigest ) {
        return $computedDigest === $storedDigest
            ? 'iptc-good-hash'
            : 'iptc-bad-hash';
    }

    return 'iptc-no-hash';
}
289 }
JpegMetadataExtractor
Class for reading jpegs and extracting metadata.
Definition: JpegMetadataExtractor.php:32
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
JpegMetadataExtractor\jpegExtractMarker
static jpegExtractMarker(&$fh)
Helper function for jpegSegmentSplitter.
Definition: JpegMetadataExtractor.php:174
wfUnpack
wfUnpack( $format, $data, $length=false)
Wrapper around php's unpack.
Definition: GlobalFunctions.php:4019
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:2387
MWException
MediaWiki exception.
Definition: MWException.php:26
wfRestoreWarnings
wfRestoreWarnings()
Restore error level to previous value.
Definition: GlobalFunctions.php:2417
JpegMetadataExtractor\doPSIR
static doPSIR( $app13)
This reads the photoshop image resource.
Definition: JpegMetadataExtractor.php:201
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
wfDebug
wfDebug( $text, $dest='all')
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:933
$size
$size
Definition: RandomTest.php:75
JpegMetadataExtractor\segmentSplitter
static segmentSplitter( $filename)
Function to extract metadata segments of interest from jpeg files based on GIFMetadataExtractor.
Definition: JpegMetadataExtractor.php:50
JpegMetadataExtractor\MAX_JPEG_SEGMENTS
const MAX_JPEG_SEGMENTS
Definition: JpegMetadataExtractor.php:33
UtfNormal\quickIsNFCVerify
static quickIsNFCVerify(&$string)
Returns true if the string is definitely in NFC.
Definition: UtfNormal.php:243