MediaWiki REL1_28
JpegMetadataExtractor.php
Go to the documentation of this file.
1<?php
33 const MAX_JPEG_SEGMENTS = 200;
34
35 // The maximum segment count is a sanity check:
36 // a JPEG file should never even remotely have
37 // that many segments. An average file has about 10.
38
/**
 * Split a JPEG file into the metadata segments this class cares about.
 *
 * Reads markers from the start of the file up to the first SOS (start of
 * scan) or EOI (end of image) marker and collects COM, APP1 (XMP, extended
 * XMP, Exif byte order) and APP13 (Photoshop image resource) payloads.
 *
 * @param string $filename Path of the JPEG file to read
 * @return array Always contains the list-valued keys 'XMP_ext', 'COM' and
 *  'PSIR'. May additionally contain 'XMP' (string) and 'byteOrder'
 *  ('BE' or 'LE') when the corresponding APP1 segments are found.
 * @throws MWException If the file name is empty, the file is missing, is a
 *  directory or cannot be opened, does not start with an SOI marker, has
 *  more than MAX_JPEG_SEGMENTS segments, has an invalid segment size, or
 *  ends before an SOS/EOI marker is reached.
 */
public static function segmentSplitter( $filename ) {
	$showXMP = XMPReader::isSupported();

	$segmentCount = 0;

	$segments = [
		'XMP_ext' => [],
		'COM' => [],
		'PSIR' => [],
	];

	if ( !$filename ) {
		throw new MWException( "No filename specified for " . __METHOD__ );
	}
	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
	}

	$fh = fopen( $filename, "rb" );

	if ( !$fh ) {
		throw new MWException( "Could not open file $filename" );
	}

	// Everything that touches the handle lives inside the try block so that
	// the file is closed on every exit path (normal return or exception).
	try {
		$buffer = fread( $fh, 2 );
		if ( $buffer !== "\xFF\xD8" ) {
			throw new MWException( "Not a jpeg, no SOI" );
		}
		while ( !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
			$segmentCount++;
			if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
				// this is just a sanity check
				throw new MWException( 'Too many jpeg segments. Aborting' );
			}
			while ( $buffer !== "\xFF" && !feof( $fh ) ) {
				// In theory JPEG files are not allowed to contain anything between the sections,
				// but in practice they sometimes do. It's customary to ignore the garbage data.
				// The feof() guard matters: at end of file fread() never yields 0xFF, so without
				// it a truncated file would make this loop spin forever.
				$buffer = fread( $fh, 1 );
			}
			if ( $buffer !== "\xFF" ) {
				// Ran out of data while looking for the next marker;
				// fall through to the "unexpected end" error below.
				break;
			}

			$buffer = fread( $fh, 1 );
			while ( $buffer === "\xFF" && !feof( $fh ) ) {
				// Skip through any 0xFF padding bytes.
				$buffer = fread( $fh, 1 );
			}
			if ( $buffer === "\xFE" ) {

				// COM section -- file comment
				// First see if valid utf-8,
				// if not try to convert it to windows-1252.
				$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
				UtfNormal\Validator::quickIsNFCVerify( $com );
				// turns $com to valid utf-8.
				// thus if no change, its utf-8, otherwise its something else.
				if ( $com !== $oldCom ) {
					MediaWiki\suppressWarnings();
					$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
					MediaWiki\restoreWarnings();
				}
				// Try it again, if its still not a valid string, then probably
				// binary junk or some really weird encoding, so don't extract.
				UtfNormal\Validator::quickIsNFCVerify( $com );
				if ( $com === $oldCom ) {
					$segments["COM"][] = $oldCom;
				} else {
					wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
				}
			} elseif ( $buffer === "\xE1" ) {
				// APP1 section (Exif, XMP, and XMP extended)
				// only extract if XMP is enabled.
				$temp = self::jpegExtractMarker( $fh );
				// check what type of app segment this is.
				if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					$segments["XMP"] = substr( $temp, 29 );
				} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
					$segments["XMP_ext"][] = substr( $temp, 35 );
				} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// Some images (especially flickr images) seem to have this.
					// I really have no idea what the deal is with them, but
					// whatever...
					$segments["XMP"] = substr( $temp, 29 );
					wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
						. "Using anyways.\n" );
				} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
					// Just need to find out what the byte order is.
					// because php's exif plugin sucks...
					// This is a II for little Endian, MM for big. Not a unicode BOM.
					$byteOrderMarker = substr( $temp, 6, 2 );
					if ( $byteOrderMarker === 'MM' ) {
						$segments['byteOrder'] = 'BE';
					} elseif ( $byteOrderMarker === 'II' ) {
						$segments['byteOrder'] = 'LE';
					} else {
						wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
					}
				}
			} elseif ( $buffer === "\xED" ) {
				// APP13 - PSIR. IPTC and some photoshop stuff
				$temp = self::jpegExtractMarker( $fh );
				if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
					$segments["PSIR"][] = $temp;
				}
			} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
				// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
				return $segments;
			} else {
				// segment we don't care about, so skip
				$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
				if ( $size['int'] < 2 ) {
					throw new MWException( "invalid marker size in jpeg" );
				}
				// The 2-byte size field counts itself, hence the -2.
				fseek( $fh, $size['int'] - 2, SEEK_CUR );
			}
		}
		// shouldn't get here.
		throw new MWException( "Reached end of jpeg file unexpectedly" );
	} finally {
		fclose( $fh );
	}
}
168
/**
 * Read one length-prefixed JPEG segment payload from an open stream.
 *
 * The stream must be positioned on the segment's 2-byte big-endian size
 * field. That field counts its own two bytes, so the payload that follows
 * is two bytes shorter than the stored value.
 *
 * @param resource &$fh Open file handle positioned at the size field
 * @return string Segment payload, or '' when the segment carries no data
 * @throws MWException If the size field is below 2 or the stream ends
 *  before the whole payload could be read
 */
private static function jpegExtractMarker( &$fh ) {
	$sizeField = wfUnpack( "nint", fread( $fh, 2 ), 2 );
	$declaredLen = $sizeField['int'];

	if ( $declaredLen < 2 ) {
		throw new MWException( "invalid marker size in jpeg" );
	}

	// An empty payload is returned directly:
	// fread( ..., 0 ) generates a warning.
	if ( $declaredLen === 2 ) {
		return '';
	}

	$payloadLen = $declaredLen - 2;
	$payload = fread( $fh, $payloadLen );
	if ( strlen( $payload ) !== $payloadLen ) {
		throw new MWException( "Segment shorter than expected" );
	}

	return $payload;
}
191
/**
 * Inspect a Photoshop image resource (APP13) segment and report whether
 * the IPTC hash recorded in it matches the IPTC data it contains.
 *
 * Resource 0x0404 holds the IPTC block and resource 0x0425 holds a stored
 * MD5 of it. XMP-aware programs update the stored hash while others do
 * not, so a mismatch indicates the IPTC data is newer than the XMP data.
 *
 * @param string $app13 Raw APP13 payload, including the 14-byte
 *  "Photoshop 3.0\0" identifier (not re-checked here)
 * @return string 'iptc-no-hash' if either the IPTC block or the stored
 *  hash is absent, 'iptc-good-hash' if they match, 'iptc-bad-hash' otherwise
 * @throws MWException If $app13 is empty or a resource declares a length
 *  that is negative or runs past the end of the segment
 */
public static function doPSIR( $app13 ) {
	if ( !$app13 ) {
		throw new MWException( "No App13 segment given" );
	}

	$total = strlen( $app13 );
	// Start right after the PHOTOSHOP 3.0 identifier.
	$pos = 14;
	$computedHash = "";
	$storedHash = "";

	// 12 bytes is the size of an empty resource item.
	while ( $pos + 12 <= $total ) {
		// Items are supposed to start with '8BIM', but very old files
		// sometimes do not; such items are walked over but not used.
		$isResource = ( substr( $app13, $pos, 4 ) === '8BIM' );
		// 2-byte id telling which piece of information the item holds.
		$resourceId = substr( $app13, $pos + 4, 2 );
		$pos += 6;

		// Pascal-string name: one length byte plus that many characters,
		// zero-padded to an even byte count. The name itself is unused.
		$nameBytes = ord( substr( $app13, $pos, 1 ) ) + 1;
		$nameBytes += $nameBytes % 2;
		$pos += $nameBytes;

		// Data length: unsigned 32-bit big endian.
		$sizeField = wfUnpack( 'Nlen', substr( $app13, $pos, 4 ), 4 );
		$dataLen = $sizeField['len'];
		// PHP may turn very large unsigned ints negative. That must never
		// happen here since the enclosing segment has a 16-bit length.
		if ( $dataLen < 0 ) {
			throw new MWException( "Too big PSIR (" . $dataLen . ')' );
		}

		// Step over the 4-byte length field.
		$pos += 4;

		// Should not happen, but make sure the data fits in the segment.
		if ( $dataLen + $pos > $total ) {
			throw new MWException( "PSIR data too long. (item length=" . $dataLen
				. "; offset=$pos; total length=$total)" );
		}

		if ( $isResource ) {
			if ( $resourceId === "\x04\x04" ) {
				// IPTC block: hash what is really there.
				$computedHash = md5( substr( $app13, $pos, $dataLen ), true );
			} elseif ( $resourceId === "\x04\x25" ) {
				// Hash recorded by the writing program.
				$storedHash = substr( $app13, $pos, $dataLen );
			}
		}

		// Odd-length data is followed by one null pad byte.
		$pos += $dataLen + ( $dataLen % 2 );
	}

	if ( !$computedHash || !$storedHash ) {
		return 'iptc-no-hash';
	}

	return $computedHash === $storedHash ? 'iptc-good-hash' : 'iptc-bad-hash';
}
294}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfUnpack( $format, $data, $length=false)
Wrapper around php's unpack.
Class for reading jpegs and extracting metadata.
static doPSIR( $app13)
This reads the photoshop image resource.
static segmentSplitter( $filename)
Function to extract metadata segments of interest from jpeg files based on GIFMetadataExtractor.
static jpegExtractMarker(&$fh)
Helper function for segmentSplitter.
MediaWiki exception.
static isSupported()
Check if this instance supports using this class.
Definition XMP.php:198
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition injection.txt:37
$buffer