MediaWiki master
PNGMetadataExtractor.php
Go to the documentation of this file.
1<?php
14namespace MediaWiki\Media;
15
16use InvalidArgumentException;
17use Wikimedia\AtEase\AtEase;
18use Wikimedia\Timestamp\TimestampFormat as TS;
19
// Number of bytes occupied by the CRC that trails every chunk. getMetadata()
// sets this to 4 on each call and uses it both when reading the CRC and when
// seeking past a chunk that is being skipped.
27 private static $crcSize;
28
// Lookup table from lower-cased PNG text keyword to the metadata field name it
// is stored under. Rebuilt at the start of every getMetadata() call.
30 private static $textChunks;
31
// NOTE(review): not referenced in the visible code; presumably a version marker
// for the extracted metadata format — confirm against callers.
32 public const VERSION = 1;
// Upper bound on a chunk's declared data length; getMetadata() seeks past any
// chunk that claims to be larger instead of reading it into memory.
33 private const MAX_CHUNK_SIZE = 3_145_728; // 3 mebibytes
34
/**
 * Read a PNG file chunk by chunk and collect its basic metadata.
 *
 * After verifying the 8-byte PNG signature, every chunk up to IEND is read.
 * Chunks whose declared size exceeds MAX_CHUNK_SIZE are seeked past, and chunks
 * whose CRC does not match are ignored. The following chunk types are
 * interpreted: IHDR, acTL, fcTL, iTXt, tEXt, zTXt, tIME, pHYs and eXIf.
 *
 * @param string $filename Path of the PNG file to inspect
 * @return array Array with the keys 'width', 'height', 'frameCount',
 *  'loopCount', 'duration', 'text', 'bitDepth', 'colorType' and 'exif'
 *  ('exif' is the raw eXIf payload, or null when none was found)
 * @throws InvalidArgumentException If no file name is given, the file does not
 *  exist or cannot be opened, the PNG signature is missing, the file ends in
 *  the middle of a chunk, or seeking past an oversized chunk fails
 */
public static function getMetadata( $filename ) {
	self::$crcSize = 4;
	/* based on list at http://owl.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
	 * and https://www.w3.org/TR/PNG/#11keywords
	 */
	self::$textChunks = [
		'xml:com.adobe.xmp' => 'xmp',
		# Artist is unofficial. Author is the recommended
		# keyword in the PNG spec. However some people output
		# Artist so support both.
		'artist' => 'Artist',
		'model' => 'Model',
		'make' => 'Make',
		'author' => 'Artist',
		'comment' => 'PNGFileComment',
		'description' => 'ImageDescription',
		'title' => 'ObjectName',
		'copyright' => 'Copyright',
		# Source as in original device used to make image
		# not as in who gave you the image
		'source' => 'Model',
		'software' => 'Software',
		'disclaimer' => 'Disclaimer',
		'warning' => 'ContentWarning',
		'url' => 'Identifier', # Not sure if this is best mapping. Maybe WebStatement.
		'label' => 'Label',
		'creation time' => 'DateTimeDigitized',
		/* Other potentially useful things - Document */
	];

	$frameCount = 0;
	$loopCount = 1;
	$text = [];
	$duration = 0.0;
	$width = 0;
	$height = 0;
	$bitDepth = 0;
	$colorType = 'unknown';
	$exif = null;

	if ( !$filename ) {
		throw new InvalidArgumentException( __METHOD__ . ": No file name specified" );
	}

	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new InvalidArgumentException( __METHOD__ . ": File $filename does not exist" );
	}

	$fh = fopen( $filename, 'rb' );

	if ( !$fh ) {
		throw new InvalidArgumentException( __METHOD__ . ": Unable to open file $filename" );
	}

	// Everything that touches the handle lives inside try/finally so that the
	// file is closed even when a malformed file makes us throw part-way through.
	try {
		// Check for the PNG header
		$buf = self::read( $fh, 8 );
		if ( $buf !== "\x89PNG\x0d\x0a\x1a\x0a" ) {
			throw new InvalidArgumentException( __METHOD__ . ": Not a valid PNG file; header: $buf" );
		}

		// Read chunks
		while ( !feof( $fh ) ) {
			$buf = self::read( $fh, 4 );
			$chunk_size = unpack( "N", $buf )[1];

			if ( $chunk_size < 0 || $chunk_size > self::MAX_CHUNK_SIZE ) {
				wfDebug( __METHOD__ . ': Chunk size of ' . $chunk_size .
					' too big, skipping. Max size is: ' . self::MAX_CHUNK_SIZE );
				// Skip the 4-byte chunk type, the data and the trailing CRC.
				if ( fseek( $fh, 4 + $chunk_size + self::$crcSize, SEEK_CUR ) !== 0 ) {
					throw new InvalidArgumentException( __METHOD__ . ': seek error' );
				}
				continue;
			}

			$chunk_type = self::read( $fh, 4 );
			$buf = self::read( $fh, $chunk_size );
			$crc = self::read( $fh, self::$crcSize );
			// The CRC covers the chunk type and the chunk data, not the length.
			$computed = crc32( $chunk_type . $buf );
			if ( pack( 'N', $computed ) !== $crc ) {
				wfDebug( __METHOD__ . ': chunk has invalid CRC, skipping' );
				continue;
			}

			if ( $chunk_type === "IHDR" ) {
				// Width (4), height (4), bit depth (1) and colour type (1) are
				// read below, so at least 10 bytes must be present.
				if ( $chunk_size < 10 ) {
					wfDebug( __METHOD__ . ": IHDR chunk too small" );
					continue;
				}

				$width = unpack( 'N', substr( $buf, 0, 4 ) )[1];
				$height = unpack( 'N', substr( $buf, 4, 4 ) )[1];
				$bitDepth = ord( substr( $buf, 8, 1 ) );
				// Detect the color type in British English as per the spec
				// https://www.w3.org/TR/PNG/#11IHDR
				$colorType = match ( ord( substr( $buf, 9, 1 ) ) ) {
					0 => 'greyscale',
					2 => 'truecolour',
					3 => 'index-coloured',
					4 => 'greyscale-alpha',
					6 => 'truecolour-alpha',
					default => 'unknown'
				};
			} elseif ( $chunk_type === "acTL" ) {
				// Two 32-bit integers (frame count and play count) are unpacked
				// below, so anything shorter than 8 bytes cannot be decoded.
				if ( $chunk_size < 8 ) {
					wfDebug( __METHOD__ . ": acTL chunk too small" );
					continue;
				}

				$actl = unpack( "Nframes/Nplays", $buf );
				$frameCount = $actl['frames'];
				$loopCount = $actl['plays'];
			} elseif ( $chunk_type === "fcTL" ) {
				// The delay numerator/denominator start at byte offset 20.
				$buf = substr( $buf, 20 );
				if ( strlen( $buf ) < 4 ) {
					wfDebug( __METHOD__ . ": fcTL chunk too small" );
					continue;
				}

				$fctldur = unpack( "ndelay_num/ndelay_den", $buf );
				// A zero denominator is treated as 100 (hundredths of a second).
				if ( $fctldur['delay_den'] === 0 ) {
					$fctldur['delay_den'] = 100;
				}
				if ( $fctldur['delay_num'] ) {
					$duration += $fctldur['delay_num'] / $fctldur['delay_den'];
				}
			} elseif ( $chunk_type === "iTXt" ) {
				// Extracts iTXt chunks, uncompressing if necessary.
				$items = [];
				if ( preg_match(
					'/^([^\x00]{1,79})\x00(\x00|\x01)\x00([^\x00]*)(.)[^\x00]*\x00(.*)$/Ds',
					$buf, $items )
				) {
					/* $items[1] = text chunk name, $items[2] = compressed flag,
					 * $items[3] = lang code (or ""), $items[4]= compression type.
					 * $items[5] = content
					 *
					 * NOTE(review): the literal \x00 right after the flag sits where
					 * the PNG spec puts the compression method byte, so $items[4]
					 * looks like it captures the separator after the language tag
					 * rather than the method — confirm before relying on it.
					 */

					// Theoretically should be case-sensitive, but in practise...
					$items[1] = strtolower( $items[1] );
					if ( !isset( self::$textChunks[$items[1]] ) ) {
						// Only extract textual chunks on our list.
						continue;
					}

					$items[3] = strtolower( $items[3] );
					if ( $items[3] === '' ) {
						// if no lang specified use x-default like in xmp.
						$items[3] = 'x-default';
					}

					// if compressed
					if ( $items[2] === "\x01" ) {
						if ( function_exists( 'gzuncompress' ) && $items[4] === "\x00" ) {
							AtEase::suppressWarnings();
							$items[5] = gzuncompress( $items[5] );
							AtEase::restoreWarnings();

							if ( $items[5] === false ) {
								// decompression failed
								wfDebug( __METHOD__ . ' Error decompressing iTxt chunk - ' . $items[1] );
								continue;
							}
						} else {
							wfDebug( __METHOD__ . ' Skipping compressed png iTXt chunk due to lack of zlib,'
								. " or potentially invalid compression method" );
							continue;
						}
					}
					$finalKeyword = self::$textChunks[$items[1]];
					$text[$finalKeyword][$items[3]] = $items[5];
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					// Error reading iTXt chunk
					wfDebug( __METHOD__ . ": Invalid iTXt chunk" );
				}
			} elseif ( $chunk_type === 'tEXt' ) {
				// In case there is no \x00 which will make explode fail.
				if ( !str_contains( $buf, "\x00" ) ) {
					wfDebug( __METHOD__ . ": Invalid tEXt chunk: no null byte" );
					continue;
				}

				[ $keyword, $content ] = explode( "\x00", $buf, 2 );
				if ( $keyword === '' ) {
					wfDebug( __METHOD__ . ": Empty tEXt keyword" );
					continue;
				}

				// Theoretically should be case-sensitive, but in practise...
				$keyword = strtolower( $keyword );
				if ( !isset( self::$textChunks[$keyword] ) ) {
					// Don't recognize chunk, so skip.
					continue;
				}
				// tEXt content is Latin-1; the collected text is stored as UTF-8.
				AtEase::suppressWarnings();
				$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
				AtEase::restoreWarnings();

				if ( $content === false ) {
					wfDebug( __METHOD__ . ": Read error (error with iconv)" );
					continue;
				}

				$finalKeyword = self::$textChunks[$keyword];
				$text[$finalKeyword]['x-default'] = $content;
				$text[$finalKeyword]['_type'] = 'lang';
			} elseif ( $chunk_type === 'zTXt' ) {
				if ( function_exists( 'gzuncompress' ) ) {
					// In case there is no \x00 which will make explode fail.
					if ( !str_contains( $buf, "\x00" ) ) {
						wfDebug( __METHOD__ . ": No null byte in zTXt chunk" );
						continue;
					}

					[ $keyword, $postKeyword ] = explode( "\x00", $buf, 2 );
					if ( $keyword === '' || $postKeyword === '' ) {
						wfDebug( __METHOD__ . ": Empty zTXt chunk" );
						continue;
					}
					// Theoretically should be case-sensitive, but in practise...
					$keyword = strtolower( $keyword );

					if ( !isset( self::$textChunks[$keyword] ) ) {
						// Don't recognize chunk, so skip.
						continue;
					}
					// First byte after the keyword is the compression method,
					// the remainder is the compressed text.
					$compression = substr( $postKeyword, 0, 1 );
					$content = substr( $postKeyword, 1 );
					if ( $compression !== "\x00" ) {
						wfDebug( __METHOD__ . " Unrecognized compression method in zTXt ($keyword). Skipping." );
						continue;
					}

					AtEase::suppressWarnings();
					$content = gzuncompress( $content );
					AtEase::restoreWarnings();

					if ( $content === false ) {
						// decompression failed
						wfDebug( __METHOD__ . ' Error decompressing zTXt chunk - ' . $keyword );
						continue;
					}

					AtEase::suppressWarnings();
					$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
					AtEase::restoreWarnings();

					if ( $content === false ) {
						wfDebug( __METHOD__ . ": iconv error in zTXt chunk" );
						continue;
					}

					$finalKeyword = self::$textChunks[$keyword];
					$text[$finalKeyword]['x-default'] = $content;
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					wfDebug( __METHOD__ . " Cannot decompress zTXt chunk due to lack of zlib. Skipping." );
				}
			} elseif ( $chunk_type === 'tIME' ) {
				// last mod timestamp.
				if ( $chunk_size !== 7 ) {
					wfDebug( __METHOD__ . ": tIME wrong size" );
					continue;
				}

				// Note: spec says this should be UTC.
				$t = unpack( "ny/Cm/Cd/Ch/Cmin/Cs", $buf );
				$strTime = sprintf( "%04d%02d%02d%02d%02d%02d",
					$t['y'], $t['m'], $t['d'], $t['h'],
					$t['min'], $t['s'] );

				$exifTime = wfTimestamp( TS::EXIF, $strTime );

				if ( $exifTime ) {
					$text['DateTime'] = $exifTime;
				}
			} elseif ( $chunk_type === 'pHYs' ) {
				// how big pixels are (dots per meter).
				if ( $chunk_size !== 9 ) {
					wfDebug( __METHOD__ . ": pHYs wrong size" );
					continue;
				}

				$dim = unpack( "Nwidth/Nheight/Cunit", $buf );
				if ( $dim['unit'] === 1 ) {
					// Need to check for negative because php
					// doesn't deal with super-large unsigned 32-bit ints well
					if ( $dim['width'] > 0 && $dim['height'] > 0 ) {
						// unit is meters
						// (as opposed to 0 = undefined )
						$text['XResolution'] = $dim['width']
							. '/100';
						$text['YResolution'] = $dim['height']
							. '/100';
						$text['ResolutionUnit'] = 3;
						// 3 = dots per cm (from Exif).
					}
				}
			} elseif ( $chunk_type === "eXIf" ) {
				// There are 4 competing ways to store Exif
				// in a PNG file. This is the official one.
				if (
					$chunk_size < 4 || (
						substr( $buf, 0, 4 ) !== "II\x2A\x00" &&
						substr( $buf, 0, 4 ) !== "MM\x00\x2A"
					)
				) {
					wfDebug( __METHOD__ . ": Invalid eXIf tag" );
					// Like every other malformed chunk, do not hand a payload
					// without a TIFF byte-order header on to the caller.
					continue;
				}
				$exif = $buf;
			} elseif ( $chunk_type === "IEND" ) {
				break;
			}
		}
	} finally {
		fclose( $fh );
	}

	if ( $loopCount > 1 ) {
		$duration *= $loopCount;
	}

	if ( isset( $text['DateTimeDigitized'] ) ) {
		// Convert date format from rfc2822 to exif.
		foreach ( $text['DateTimeDigitized'] as $name => &$value ) {
			if ( $name === '_type' ) {
				continue;
			}

			// @todo FIXME: Currently timezones are ignored.
			// possibly should be wfTimestamp's
			// responsibility. (at least for numeric TZ)
			$formatted = wfTimestamp( TS::EXIF, $value );
			if ( $formatted ) {
				// Only change if we could convert the
				// date.
				// The png standard says it should be
				// in rfc2822 format, but not required.
				// In general for the exif stuff we
				// prettify the date if we can, but we
				// display as-is if we cannot or if
				// it is invalid.
				// So do the same here.

				$value = $formatted;
			}
		}
		// Break the reference so the last element is not left aliased.
		unset( $value );
	}

	return [
		'width' => $width,
		'height' => $height,
		'frameCount' => $frameCount,
		'loopCount' => $loopCount,
		'duration' => $duration,
		'text' => $text,
		'bitDepth' => $bitDepth,
		'colorType' => $colorType,
		'exif' => $exif,
	];
}
393
/**
 * Fetch exactly $size bytes from an open file handle.
 *
 * A request for zero bytes yields an empty string without touching the
 * handle (see the PHP manual for how fread() treats a zero length).
 *
 * @param resource $fh Handle to read from
 * @param int $size Number of bytes wanted
 * @return string The bytes read; always $size bytes long
 * @throws InvalidArgumentException If fread() fails or returns fewer bytes
 *  than requested
 */
private static function read( $fh, $size ) {
	$bytes = ( $size === 0 ) ? '' : fread( $fh, $size );

	if ( $bytes === false ) {
		throw new InvalidArgumentException( __METHOD__ . ': read error' );
	}

	// A short read means the file ended before the chunk did.
	if ( strlen( $bytes ) < $size ) {
		throw new InvalidArgumentException( __METHOD__ . ': unexpected end of file' );
	}

	return $bytes;
}
416}
417
// Register the un-namespaced name 'PNGMetadataExtractor' as an alias of the
// namespaced class, so code referring to the global name still resolves.
419class_alias( PNGMetadataExtractor::class, 'PNGMetadataExtractor' );
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.