MediaWiki  1.33.1
JpegMetadataExtractor.php
Go to the documentation of this file.
1 <?php
24 use Wikimedia\XMPReader\Reader as XMPReader;
25 
35  const MAX_JPEG_SEGMENTS = 200;
36 
37  // The maximum segment count is a sanity check: segmentSplitter() throws
38  // once it has walked more than this many segments. A jpeg file should never
39  // even remotely have that many segments. Your average file has about 10.
40 
/**
 * Extract the metadata segments of interest from a jpeg file.
 *
 * Reads the file marker by marker, collecting comment (COM), APP1
 * (XMP, extended XMP, Exif byte order) and APP13 (Photoshop image
 * resource) segments, and stops at the first EOI or SOS marker.
 *
 * @param string $filename Path of the jpeg file to read
 * @return array Always has the list-valued keys 'XMP_ext', 'COM' and 'PSIR'.
 *  May additionally have 'XMP' (string) and 'byteOrder' ('BE' or 'LE').
 *  'XMP' and 'XMP_ext' are only filled when XMPReader::isSupported() is true.
 * @throws MWException If no usable file is given, the file cannot be opened,
 *  it does not start with an SOI marker, a segment has an invalid size, more
 *  than self::MAX_JPEG_SEGMENTS segments are seen, or the file ends before
 *  an EOI/SOS marker.
 */
public static function segmentSplitter( $filename ) {
	$showXMP = XMPReader::isSupported();

	$segmentCount = 0;

	$segments = [
		'XMP_ext' => [],
		'COM' => [],
		'PSIR' => [],
	];

	if ( !$filename ) {
		throw new MWException( "No filename specified for " . __METHOD__ );
	}
	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
	}

	$fh = fopen( $filename, "rb" );

	if ( !$fh ) {
		throw new MWException( "Could not open file $filename" );
	}

	// Everything that touches the handle lives inside the try block so the
	// handle is released on the normal return and on every exception alike.
	try {
		$buffer = fread( $fh, 2 );
		if ( $buffer !== "\xFF\xD8" ) {
			throw new MWException( "Not a jpeg, no SOI" );
		}
		while ( !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
			$segmentCount++;
			if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
				// this is just a sanity check
				throw new MWException( 'Too many jpeg segments. Aborting' );
			}
			while ( $buffer !== "\xFF" && !feof( $fh ) ) {
				// In theory JPEG files are not allowed to contain anything between the sections,
				// but in practice they sometimes do. It's customary to ignore the garbage data.
				$buffer = fread( $fh, 1 );
			}

			$buffer = fread( $fh, 1 );
			while ( $buffer === "\xFF" && !feof( $fh ) ) {
				// Skip through any 0xFF padding bytes.
				$buffer = fread( $fh, 1 );
			}
			if ( $buffer === "\xFE" ) {
				// COM section -- file comment
				// First see if valid utf-8,
				// if not try to convert it to windows-1252.
				$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
				UtfNormal\Validator::quickIsNFCVerify( $com );
				// turns $com to valid utf-8.
				// thus if no change, its utf-8, otherwise its something else.
				if ( $com !== $oldCom ) {
					Wikimedia\suppressWarnings();
					$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
					Wikimedia\restoreWarnings();
				}
				// Try it again, if its still not a valid string, then probably
				// binary junk or some really weird encoding, so don't extract.
				UtfNormal\Validator::quickIsNFCVerify( $com );
				if ( $com === $oldCom ) {
					$segments["COM"][] = $oldCom;
				} else {
					wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
				}
			} elseif ( $buffer === "\xE1" ) {
				// APP1 section (Exif, XMP, and XMP extended)
				// only extract if XMP is enabled.
				$temp = self::jpegExtractMarker( $fh );
				// check what type of app segment this is.
				if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// use trim to remove trailing \0 chars
					$segments["XMP"] = trim( substr( $temp, 29 ) );
				} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
					// use trim to remove trailing \0 chars
					$segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
				} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// Some images (especially flickr images) seem to have this
					// malformed identifier. Accept it anyway.
					// use trim to remove trailing \0 chars
					$segments["XMP"] = trim( substr( $temp, 29 ) );
					wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
						. "Using anyways.\n" );
				} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
					// Only the byte order is needed from the Exif segment.
					// This is a II for little Endian, MM for big. Not a unicode BOM.
					$byteOrderMarker = substr( $temp, 6, 2 );
					if ( $byteOrderMarker === 'MM' ) {
						$segments['byteOrder'] = 'BE';
					} elseif ( $byteOrderMarker === 'II' ) {
						$segments['byteOrder'] = 'LE';
					} else {
						wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
					}
				}
			} elseif ( $buffer === "\xED" ) {
				// APP13 - PSIR. IPTC and some photoshop stuff
				$temp = self::jpegExtractMarker( $fh );
				if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
					$segments["PSIR"][] = $temp;
				}
			} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
				// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
				return $segments;
			} else {
				// segment we don't care about, so skip
				$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
				if ( $size['int'] < 2 ) {
					throw new MWException( "invalid marker size in jpeg" );
				}
				// Note it's possible to seek beyond end of file if truncated.
				// fseek doesn't report a failure in this case.
				fseek( $fh, $size['int'] - 2, SEEK_CUR );
			}
		}
		// shouldn't get here.
		throw new MWException( "Reached end of jpeg file unexpectedly" );
	} finally {
		fclose( $fh );
	}
}
174 
/**
 * Read one length-prefixed segment from the current file position.
 *
 * The first two bytes are unpacked as a big-endian 16-bit size. That size
 * is treated as including its own two bytes, so a size of 2 means an
 * empty segment.
 *
 * @param resource &$fh Open file handle; segmentSplitter() calls this
 *  right after it has consumed the segment's marker byte
 * @return string Segment contents without the two size bytes
 * @throws MWException If the size is below 2 or fewer bytes than
 *  announced could be read
 */
private static function jpegExtractMarker( &$fh ) {
	$header = wfUnpack( "nint", fread( $fh, 2 ), 2 );
	$payloadLength = $header['int'] - 2;

	if ( $payloadLength < 0 ) {
		throw new MWException( "invalid marker size in jpeg" );
	}
	if ( $payloadLength === 0 ) {
		// Nothing to read; fread( ..., 0 ) would generate a warning.
		return '';
	}

	$payload = fread( $fh, $payloadLength );
	if ( strlen( $payload ) !== $payloadLength ) {
		throw new MWException( "Segment shorter than expected" );
	}

	return $payload;
}
197 
/**
 * Read a Photoshop image resource (APP13) segment and report whether the
 * IPTC hash recorded in it matches the IPTC data it carries.
 *
 * Resource 0x0404 holds the IPTC block and 0x0425 the recorded hash. The
 * result is used to tell whether the IPTC data is newer than the XMP data:
 * XMP-aware programs update the hash, others do not.
 *
 * @param string $app13 APP13 segment contents. The first 14 bytes (the
 *  "Photoshop 3.0" identifier) are skipped without being re-checked.
 * @return string 'iptc-no-hash' if either the IPTC block or the recorded
 *  hash is missing, 'iptc-good-hash' if they agree, else 'iptc-bad-hash'
 * @throws MWException If the segment is empty or a record's length field
 *  is negative or points past the end of the segment
 */
public static function doPSIR( $app13 ) {
	if ( !$app13 ) {
		throw new MWException( "No App13 segment given" );
	}

	$appLen = strlen( $app13 );
	// Begin right after the identifier string.
	$offset = 14;
	$computedHash = "";
	$storedHash = "";

	// 12 bytes is the size of a record with an empty name and no data.
	while ( $offset + 12 <= $appLen ) {
		// Records are supposed to start with 8BIM, but really old
		// jpegs sometimes use something else; those records are
		// stepped over without being interpreted.
		$hasSignature = ( substr( $app13, $offset, 4 ) === '8BIM' );

		// Two-byte resource id naming the kind of record.
		$resourceId = substr( $app13, $offset + 4, 2 );
		$offset += 6;

		// Pascal-string name: one length byte plus the name itself,
		// padded to an even total. The name is never used.
		$nameBytes = ord( substr( $app13, $offset, 1 ) ) + 1;
		$nameBytes += $nameBytes % 2;
		$offset += $nameBytes;

		// Data length: unsigned 32-bit big-endian.
		$header = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
		$dataLen = $header['len'];
		// PHP may turn very large unsigned values negative. A real record
		// can never be that big since the enclosing segment length is 16 bit.
		if ( $dataLen < 0 ) {
			throw new MWException( "Too big PSIR (" . $dataLen . ')' );
		}

		// Step over the four length bytes.
		$offset += 4;

		// Should not happen, but make sure the data fits in the segment.
		if ( $dataLen + $offset > $appLen ) {
			throw new MWException( "PSIR data too long. (item length=" . $dataLen
				. "; offset=$offset; total length=$appLen)" );
		}

		if ( $hasSignature ) {
			if ( $resourceId === "\x04\x04" ) {
				// IPTC block
				$computedHash = md5( substr( $app13, $offset, $dataLen ), true );
			} elseif ( $resourceId === "\x04\x25" ) {
				// Hash recorded by the writing program
				$storedHash = substr( $app13, $offset, $dataLen );
			}
		}

		// Odd-length data is followed by one null pad byte.
		$offset += $dataLen + ( $dataLen % 2 );
	}

	if ( !$computedHash || !$storedHash ) {
		return 'iptc-no-hash';
	}

	return $computedHash === $storedHash ? 'iptc-good-hash' : 'iptc-bad-hash';
}
300 }
JpegMetadataExtractor
Class for reading jpegs and extracting metadata.
Definition: JpegMetadataExtractor.php:34
JpegMetadataExtractor\jpegExtractMarker
static jpegExtractMarker(&$fh)
Helper function for segmentSplitter.
Definition: JpegMetadataExtractor.php:181
wfUnpack
wfUnpack( $format, $data, $length=false)
Wrapper around php's unpack.
Definition: GlobalFunctions.php:3000
php
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
MWException
MediaWiki exception.
Definition: MWException.php:26
JpegMetadataExtractor\doPSIR
static doPSIR( $app13)
This reads the photoshop image resource.
Definition: JpegMetadataExtractor.php:212
use
as see the revision history and available at free of to any person obtaining a copy of this software and associated documentation to deal in the Software without including without limitation the rights to use
Definition: MIT-LICENSE.txt:10
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:949
XMPReader
Definition: XMPReader.php:33
JpegMetadataExtractor\segmentSplitter
static segmentSplitter( $filename)
Function to extract metadata segments of interest from jpeg files based on GIFMetadataExtractor.
Definition: JpegMetadataExtractor.php:52
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
JpegMetadataExtractor\MAX_JPEG_SEGMENTS
const MAX_JPEG_SEGMENTS
Definition: JpegMetadataExtractor.php:35
$buffer
$buffer
Definition: mwdoc-filter.php:49