Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
77.78% |
98 / 126 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
| JpegMetadataExtractor | |
78.40% |
98 / 125 |
|
0.00% |
0 / 3 |
81.31 | |
0.00% |
0 / 1 |
| segmentSplitter | |
80.82% |
59 / 73 |
|
0.00% |
0 / 1 |
40.68 | |||
| jpegExtractMarker | |
54.55% |
6 / 11 |
|
0.00% |
0 / 1 |
7.35 | |||
| doPSIR | |
80.49% |
33 / 41 |
|
0.00% |
0 / 1 |
16.67 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Extraction of JPEG image metadata. |
| 4 | * |
| 5 | * @license GPL-2.0-or-later |
| 6 | * @file |
| 7 | * @ingroup Media |
| 8 | */ |
| 9 | |
| 10 | namespace MediaWiki\Media; |
| 11 | |
| 12 | use Wikimedia\StringUtils\StringUtils; |
| 13 | use Wikimedia\UnpackFailedException; |
| 14 | use Wikimedia\XMPReader\Reader as XMPReader; |
| 15 | |
| 16 | /** |
| 17 | * Class for reading jpegs and extracting metadata. |
| 18 | * See also BitmapMetadataHandler. |
| 19 | * |
| 20 | * Based somewhat on GIFMetadataExtractor. |
| 21 | * |
| 22 | * @ingroup Media |
| 23 | */ |
class JpegMetadataExtractor {
	/**
	 * The max segment is a safety check. A JPEG file should never even remotely have
	 * that many segments. Your average file has about 10.
	 */
	private const MAX_JPEG_SEGMENTS = 200;

	/** APP1 identifier that precedes a standard XMP packet. */
	private const XMP_HEADER = "http://ns.adobe.com/xap/1.0/\x00";

	/** APP1 identifier that precedes an extended XMP chunk. */
	private const XMP_EXT_HEADER = "http://ns.adobe.com/xmp/extension/\x00";

	/**
	 * Malformed XMP identifier seen in the wild (especially on flickr images).
	 * It has the same byte length as XMP_HEADER.
	 */
	private const XMP_BROKEN_HEADER = "XMP\x00://ns.adobe.com/xap/1.0/\x00";

	/** SOF0, SOF1, SOF2, ... marker bytes (same list as getimagesize). */
	private const SOF_MARKERS = [
		"\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
		"\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF",
	];

	/**
	 * Function to extract metadata segments of interest from jpeg files
	 * based on GIFMetadataExtractor.
	 *
	 * We can almost use getimagesize to do this,
	 * but gis doesn't support having multiple app1 segments,
	 * and those can't extract xmp on files containing both exif and xmp data
	 *
	 * The returned array always has the list-valued keys 'XMP_ext', 'COM' and 'PSIR'.
	 * Depending on what the file contains it may also have 'XMP' (string),
	 * 'byteOrder' ('BE' or 'LE') and 'SOF' (array with the keys bits, height,
	 * width and components).
	 *
	 * The file handle opened here is closed again on every exit path, including
	 * when an exception is thrown.
	 *
	 * @param string $filename Name of the jpeg file
	 * @return array Array of interesting segments.
	 * @throws InvalidJpegException
	 */
	public static function segmentSplitter( $filename ) {
		if ( !$filename ) {
			throw new InvalidJpegException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new InvalidJpegException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );
		if ( !$fh ) {
			throw new InvalidJpegException( "Could not open file $filename" );
		}

		try {
			if ( fread( $fh, 2 ) !== "\xFF\xD8" ) {
				throw new InvalidJpegException( "Not a jpeg, no SOI" );
			}

			$showXMP = XMPReader::isSupported();
			$segmentCount = 0;
			$segments = [
				'XMP_ext' => [],
				'COM' => [],
				'PSIR' => [],
			];

			while ( !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					throw new InvalidJpegException( 'Too many jpeg segments. Aborting' );
				}
				while ( $buffer !== "\xFF" && !feof( $fh ) ) {
					// In theory JPEG files are not allowed to contain anything between the sections,
					// but in practice they sometimes do. It's customary to ignore the garbage data.
					$buffer = fread( $fh, 1 );
				}

				$buffer = fread( $fh, 1 );
				while ( $buffer === "\xFF" && !feof( $fh ) ) {
					// Skip through any 0xFF padding bytes.
					$buffer = fread( $fh, 1 );
				}

				if ( $buffer === "\xFE" ) {
					// COM section -- file comment
					// First see if valid utf-8,
					// if not try to convert it to windows-1252.
					$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
					// quickIsNFCVerify() takes $com by reference and turns it into valid utf-8,
					// thus if no change, it's utf-8, otherwise it's something else.
					\UtfNormal\Validator::quickIsNFCVerify( $com );
					if ( $com !== $oldCom ) {
						// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
						$com = $oldCom = @iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
					}
					// Try it again, if it's still not a valid string, then probably
					// binary junk or some really weird encoding, so don't extract.
					\UtfNormal\Validator::quickIsNFCVerify( $com );
					if ( $com === $oldCom ) {
						$segments["COM"][] = $oldCom;
					} else {
						wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage." );
					}
				} elseif ( $buffer === "\xE1" ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// The XMP variants are only extracted if XMP is enabled.
					$temp = self::jpegExtractMarker( $fh );
					// check what type of app segment this is.
					if ( $showXMP && str_starts_with( $temp, self::XMP_HEADER ) ) {
						// use trim to remove trailing \0 chars
						$segments["XMP"] = trim( substr( $temp, strlen( self::XMP_HEADER ) ) );
					} elseif ( $showXMP && str_starts_with( $temp, self::XMP_EXT_HEADER ) ) {
						// use trim to remove trailing \0 chars
						$segments["XMP_ext"][] = trim( substr( $temp, strlen( self::XMP_EXT_HEADER ) ) );
					} elseif ( $showXMP && str_starts_with( $temp, self::XMP_BROKEN_HEADER ) ) {
						// Some images (especially flickr images) seem to have this.
						// I really have no idea what the deal is with them, but
						// whatever...
						// use trim to remove trailing \0 chars
						$segments["XMP"] = trim( substr( $temp, strlen( self::XMP_BROKEN_HEADER ) ) );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways." );
					} elseif ( str_starts_with( $temp, "Exif\0\0" ) ) {
						// Just need to find out what the byte order is.
						// because php's exif plugin sucks...
						// This is a II for little Endian, MM for big. Not a unicode BOM.
						$byteOrderMarker = substr( $temp, 6, 2 );
						if ( $byteOrderMarker === 'MM' ) {
							$segments['byteOrder'] = 'BE';
						} elseif ( $byteOrderMarker === 'II' ) {
							$segments['byteOrder'] = 'LE';
						} else {
							wfDebug( __METHOD__ . " Invalid byte ordering?!" );
						}
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some Photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( str_starts_with( $temp, "Photoshop 3.0\x00" ) ) {
						$segments["PSIR"][] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
					return $segments;
				} elseif ( in_array( $buffer, self::SOF_MARKERS, true ) ) {
					// Start-of-frame segment: bit depth, dimensions and component count.
					$temp = self::jpegExtractMarker( $fh );
					try {
						$segments["SOF"] = StringUtils::unpack( 'Cbits/nheight/nwidth/Ccomponents', $temp );
					} catch ( UnpackFailedException $e ) {
						throw new InvalidJpegException( $e->getMessage() );
					}
				} else {
					// segment we don't care about, so skip
					$size = self::readSegmentSize( $fh );
					// Note it's possible to seek beyond the end of the file if truncated.
					// fseek doesn't report a failure in this case.
					fseek( $fh, $size - 2, SEEK_CUR );
				}
			}

			// Only reached when the file ends before an EOI or SOS marker was seen.
			throw new InvalidJpegException( "Reached end of jpeg file unexpectedly" );
		} finally {
			fclose( $fh );
		}
	}

	/**
	 * Read and validate the 2-byte big-endian length field at the start of a segment.
	 *
	 * The length counts its own two bytes, so any value below 2 is rejected.
	 *
	 * @param resource $fh File handle positioned at the segment's length field
	 * @throws InvalidJpegException If the length cannot be unpacked or is below 2
	 * @return int Segment size, including the two length bytes
	 */
	private static function readSegmentSize( $fh ): int {
		try {
			$size = StringUtils::unpack( "nint", fread( $fh, 2 ), 2 );
		} catch ( UnpackFailedException $e ) {
			throw new InvalidJpegException( $e->getMessage() );
		}
		if ( $size['int'] < 2 ) {
			throw new InvalidJpegException( "invalid marker size in jpeg" );
		}

		return $size['int'];
	}

	/**
	 * Helper function for segmentSplitter: reads the payload of the segment
	 * whose length field starts at the current file position.
	 *
	 * @param resource &$fh File handle for JPEG file
	 * @throws InvalidJpegException
	 * @return string Data content of segment.
	 */
	private static function jpegExtractMarker( &$fh ) {
		$payloadLength = self::readSegmentSize( $fh ) - 2;
		if ( $payloadLength === 0 ) {
			// fread( ..., 0 ) generates a warning
			return '';
		}

		$segment = fread( $fh, $payloadLength );
		// A failed read (false) is treated the same as a truncated segment.
		if ( $segment === false || strlen( $segment ) !== $payloadLength ) {
			throw new InvalidJpegException( "Segment shorter than expected" );
		}

		return $segment;
	}

	/**
	 * This reads the Photoshop image resource.
	 * Currently, it only compares the iptc/iim hash
	 * with the stored hash, which is used to determine the precedence
	 * of the iptc data. In the future it may extract some other info, like
	 * url of copyright license.
	 *
	 * This should generally be called by BitmapMetadataHandler::doApp13()
	 *
	 * Each resource item is walked as: 4-byte signature, 2-byte id, a name stored
	 * as a pascal string padded to an even size, a 4-byte big-endian data length,
	 * then the data padded to an even size.
	 *
	 * @param string $app13 Photoshop psir app13 block from jpg.
	 * @throws InvalidPSIRException
	 * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
	 *   'iptc-good-hash', 'iptc-bad-hash'.
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			throw new InvalidPSIRException( "No App13 segment given" );
		}
		// First, compare hash with the real thing.
		// 0x404 contains IPTC, 0x425 has hash
		// This is used to determine if the iptc is newer than
		// the xmp data, as xmp programs update the hash,
		// where non-xmp programs don't.

		// skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$offset = 14;
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		while ( $offset + 12 <= $appLen ) {
			// it's supposed to be 8BIM, but apparently sometimes isn't, especially in really old jpg's.
			// Such items are still stepped over, but their content is ignored.
			$valid = substr( $app13, $offset, 4 ) === '8BIM';
			$offset += 4;

			// id is a 2-byte id number which identifies
			// the piece of info this record contains.
			$id = substr( $app13, $offset, 2 );
			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.
			// we never use the name, so skip it. +1 for length byte
			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			if ( $lenName % 2 === 1 ) {
				// pad to even.
				$lenName++;
			}
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			try {
				$lenData = StringUtils::unpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			} catch ( UnpackFailedException $e ) {
				throw new InvalidPSIRException( $e->getMessage() );
			}
			$dataLength = $lenData['len'];
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16-bit number.
			if ( $dataLength < 0 ) {
				throw new InvalidPSIRException( "Too big PSIR (" . $dataLength . ')' );
			}

			// 4-byte length field;
			$offset += 4;

			// this should not happen, but check.
			if ( $dataLength + $offset > $appLen ) {
				throw new InvalidPSIRException( "PSIR data too long. (item length=" . $dataLength
					. "; offset=$offset; total length=$appLen)" );
			}

			if ( $valid ) {
				if ( $id === "\x04\x04" ) {
					// IPTC block: hash the actual data so it can be compared below.
					$realHash = md5( substr( $app13, $offset, $dataLength ), true );
				} elseif ( $id === "\x04\x25" ) {
					// Hash of the IPTC data as recorded in the file.
					$recordedHash = substr( $app13, $offset, $dataLength );
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			$offset += $dataLength + ( $dataLength % 2 );
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		}

		return $realHash === $recordedHash ? 'iptc-good-hash' : 'iptc-bad-hash';
	}
}
| 316 | |
| 317 | /** @deprecated class alias since 1.46; registers the un-namespaced name 'JpegMetadataExtractor' for MediaWiki\Media\JpegMetadataExtractor */ |
| 318 | class_alias( JpegMetadataExtractor::class, 'JpegMetadataExtractor' ); |