MediaWiki master
PNGMetadataExtractor.php
Go to the documentation of this file.
1<?php
14namespace MediaWiki\Media;
15
16use InvalidArgumentException;
17use Wikimedia\Timestamp\TimestampFormat as TS;
18
26 private static $crcSize;
27
29 private static $textChunks;
30
31 public const VERSION = 1;
32 private const MAX_CHUNK_SIZE = 3_145_728; // 3 mebibytes
33
/**
 * Read a PNG file chunk by chunk and collect its dimensions, colour
 * information, animation (APNG) data, textual metadata and raw Exif payload.
 *
 * Chunks larger than self::MAX_CHUNK_SIZE and chunks whose CRC does not
 * match are skipped. Parsing stops at the IEND chunk.
 *
 * @param string $filename Path of the file to inspect
 * @return array Array with the keys:
 *  - 'width', 'height', 'bitDepth', 'colorType': taken from IHDR
 *  - 'frameCount', 'loopCount': taken from acTL (defaults 0 and 1)
 *  - 'duration': sum of the fcTL delays (delay_num / delay_den),
 *    multiplied by the loop count when that is greater than 1
 *  - 'text': metadata from iTXt/tEXt/zTXt/tIME/pHYs chunks
 *  - 'exif': raw data of a valid eXIf chunk, or null
 * @throws InvalidArgumentException If no usable file is given, the PNG
 *  signature is missing, or reading/seeking in the file fails
 */
public static function getMetadata( $filename ) {
	self::$crcSize = 4;
	/* based on list at http://owl.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
	 * and https://www.w3.org/TR/PNG/#11keywords
	 */
	self::$textChunks = [
		'xml:com.adobe.xmp' => 'xmp',
		# Artist is unofficial. Author is the recommended
		# keyword in the PNG spec. However some people output
		# Artist so support both.
		'artist' => 'Artist',
		'model' => 'Model',
		'make' => 'Make',
		'author' => 'Artist',
		'comment' => 'PNGFileComment',
		'description' => 'ImageDescription',
		'title' => 'ObjectName',
		'copyright' => 'Copyright',
		# Source as in original device used to make image
		# not as in who gave you the image
		'source' => 'Model',
		'software' => 'Software',
		'disclaimer' => 'Disclaimer',
		'warning' => 'ContentWarning',
		'url' => 'Identifier', # Not sure if this is best mapping. Maybe WebStatement.
		'label' => 'Label',
		'creation time' => 'DateTimeDigitized',
		/* Other potentially useful things - Document */
	];

	$frameCount = 0;
	$loopCount = 1;
	$text = [];
	$duration = 0.0;
	$width = 0;
	$height = 0;
	$bitDepth = 0;
	$colorType = 'unknown';
	$exif = null;

	if ( !$filename ) {
		throw new InvalidArgumentException( __METHOD__ . ": No file name specified" );
	}

	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new InvalidArgumentException( __METHOD__ . ": File $filename does not exist" );
	}

	$fh = fopen( $filename, 'rb' );

	if ( !$fh ) {
		throw new InvalidArgumentException( __METHOD__ . ": Unable to open file $filename" );
	}

	// Everything below may throw (bad signature, truncated file, seek
	// failure); the finally block makes sure the handle is released on
	// those paths as well as on normal completion.
	try {
		// Check for the PNG header
		$buf = self::read( $fh, 8 );
		if ( $buf !== "\x89PNG\x0d\x0a\x1a\x0a" ) {
			throw new InvalidArgumentException( __METHOD__ . ": Not a valid PNG file; header: $buf" );
		}

		// Read chunks. Each chunk is: 4-byte big-endian length, 4-byte type,
		// <length> bytes of data, then a CRC of self::$crcSize bytes.
		while ( !feof( $fh ) ) {
			$buf = self::read( $fh, 4 );
			$chunk_size = unpack( "N", $buf )[1];

			if ( $chunk_size < 0 || $chunk_size > self::MAX_CHUNK_SIZE ) {
				wfDebug( __METHOD__ . ': Chunk size of ' . $chunk_size .
					' too big, skipping. Max size is: ' . self::MAX_CHUNK_SIZE );
				// Jump over type (4 bytes), data and CRC without loading them.
				if ( fseek( $fh, 4 + $chunk_size + self::$crcSize, SEEK_CUR ) !== 0 ) {
					throw new InvalidArgumentException( __METHOD__ . ': seek error' );
				}
				continue;
			}

			$chunk_type = self::read( $fh, 4 );
			$buf = self::read( $fh, $chunk_size );
			$crc = self::read( $fh, self::$crcSize );
			// The CRC covers the chunk type and the chunk data.
			$computed = crc32( $chunk_type . $buf );
			if ( pack( 'N', $computed ) !== $crc ) {
				wfDebug( __METHOD__ . ': chunk has invalid CRC, skipping' );
				continue;
			}

			if ( $chunk_type === "IHDR" ) {
				// Width (4) + height (4) + bit depth (1) + colour type (1) are
				// read below; the PNG spec defines IHDR as 13 bytes long.
				// Without this guard a short chunk makes unpack() fail.
				if ( $chunk_size < 13 ) {
					wfDebug( __METHOD__ . ": IHDR chunk too small" );
					continue;
				}

				$ihdr = unpack( 'Nwidth/Nheight/Cdepth/Ccolour', $buf );
				$width = $ihdr['width'];
				$height = $ihdr['height'];
				$bitDepth = $ihdr['depth'];
				// Detect the color type in British English as per the spec
				// https://www.w3.org/TR/PNG/#11IHDR
				$colorType = match ( $ihdr['colour'] ) {
					0 => 'greyscale',
					2 => 'truecolour',
					3 => 'index-coloured',
					4 => 'greyscale-alpha',
					6 => 'truecolour-alpha',
					default => 'unknown'
				};
			} elseif ( $chunk_type === "acTL" ) {
				// Two 32-bit integers (frame count, play count) are unpacked,
				// so at least 8 bytes are required.
				if ( $chunk_size < 8 ) {
					wfDebug( __METHOD__ . ": acTL chunk too small" );
					continue;
				}

				$actl = unpack( "Nframes/Nplays", $buf );
				$frameCount = $actl['frames'];
				$loopCount = $actl['plays'];
			} elseif ( $chunk_type === "fcTL" ) {
				// The delay numerator/denominator start at byte offset 20.
				$buf = substr( $buf, 20 );
				if ( strlen( $buf ) < 4 ) {
					wfDebug( __METHOD__ . ": fcTL chunk too small" );
					continue;
				}

				$fctldur = unpack( "ndelay_num/ndelay_den", $buf );
				if ( $fctldur['delay_den'] == 0 ) {
					// A zero denominator is treated as 100.
					$fctldur['delay_den'] = 100;
				}
				if ( $fctldur['delay_num'] ) {
					$duration += $fctldur['delay_num'] / $fctldur['delay_den'];
				}
			} elseif ( $chunk_type === "iTXt" ) {
				// Extracts iTXt chunks, uncompressing if necessary.
				$items = [];
				if ( preg_match(
					'/^([^\x00]{1,79})\x00(\x00|\x01)\x00([^\x00]*)(.)[^\x00]*\x00(.*)$/Ds',
					$buf, $items )
				) {
					/* $items[1] = text chunk name, $items[2] = compressed flag,
					 * $items[3] = lang code (or ""), $items[4]= compression type.
					 * $items[5] = content
					 *
					 * NOTE(review): the pattern already requires the byte after
					 * the compressed flag to be \x00, and group 4 captures the
					 * byte following the language code; confirm whether group 4
					 * is really meant to be the compression type.
					 */

					// Theoretically should be case-sensitive, but in practice...
					$items[1] = strtolower( $items[1] );
					if ( !isset( self::$textChunks[$items[1]] ) ) {
						// Only extract textual chunks on our list.
						continue;
					}

					$items[3] = strtolower( $items[3] );
					if ( $items[3] == '' ) {
						// if no lang specified use x-default like in xmp.
						$items[3] = 'x-default';
					}

					// if compressed
					if ( $items[2] === "\x01" ) {
						if ( function_exists( 'gzuncompress' ) && $items[4] === "\x00" ) {
							// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
							$items[5] = @gzuncompress( $items[5] );

							if ( $items[5] === false ) {
								// decompression failed
								wfDebug( __METHOD__ . ' Error decompressing iTxt chunk - ' . $items[1] );
								continue;
							}
						} else {
							wfDebug( __METHOD__ . ' Skipping compressed png iTXt chunk due to lack of zlib,'
								. " or potentially invalid compression method" );
							continue;
						}
					}
					$finalKeyword = self::$textChunks[$items[1]];
					$text[$finalKeyword][$items[3]] = $items[5];
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					// Error reading iTXt chunk
					wfDebug( __METHOD__ . ": Invalid iTXt chunk" );
				}
			} elseif ( $chunk_type === 'tEXt' ) {
				// In case there is no \x00 which will make explode fail.
				if ( !str_contains( $buf, "\x00" ) ) {
					wfDebug( __METHOD__ . ": Invalid tEXt chunk: no null byte" );
					continue;
				}

				[ $keyword, $content ] = explode( "\x00", $buf, 2 );
				if ( $keyword === '' ) {
					wfDebug( __METHOD__ . ": Empty tEXt keyword" );
					continue;
				}

				// Theoretically should be case-sensitive, but in practice...
				$keyword = strtolower( $keyword );
				if ( !isset( self::$textChunks[$keyword] ) ) {
					// Don't recognize chunk, so skip.
					continue;
				}
				// The payload is converted from ISO-8859-1 to UTF-8.
				// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
				$content = @iconv( 'ISO-8859-1', 'UTF-8', $content );

				if ( $content === false ) {
					wfDebug( __METHOD__ . ": Read error (error with iconv)" );
					continue;
				}

				$finalKeyword = self::$textChunks[$keyword];
				$text[$finalKeyword]['x-default'] = $content;
				$text[$finalKeyword]['_type'] = 'lang';
			} elseif ( $chunk_type === 'zTXt' ) {
				if ( function_exists( 'gzuncompress' ) ) {
					// In case there is no \x00 which will make explode fail.
					if ( !str_contains( $buf, "\x00" ) ) {
						wfDebug( __METHOD__ . ": No null byte in zTXt chunk" );
						continue;
					}

					[ $keyword, $postKeyword ] = explode( "\x00", $buf, 2 );
					if ( $keyword === '' || $postKeyword === '' ) {
						wfDebug( __METHOD__ . ": Empty zTXt chunk" );
						continue;
					}
					// Theoretically should be case-sensitive, but in practice...
					$keyword = strtolower( $keyword );

					if ( !isset( self::$textChunks[$keyword] ) ) {
						// Don't recognize chunk, so skip.
						continue;
					}
					// First byte after the keyword is the compression method,
					// the remainder is the compressed text.
					$compression = substr( $postKeyword, 0, 1 );
					$content = substr( $postKeyword, 1 );
					if ( $compression !== "\x00" ) {
						wfDebug( __METHOD__ . " Unrecognized compression method in zTXt ($keyword). Skipping." );
						continue;
					}

					// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
					$content = @gzuncompress( $content );

					if ( $content === false ) {
						// decompression failed
						wfDebug( __METHOD__ . ' Error decompressing zTXt chunk - ' . $keyword );
						continue;
					}

					// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
					$content = @iconv( 'ISO-8859-1', 'UTF-8', $content );

					if ( $content === false ) {
						wfDebug( __METHOD__ . ": iconv error in zTXt chunk" );
						continue;
					}

					$finalKeyword = self::$textChunks[$keyword];
					$text[$finalKeyword]['x-default'] = $content;
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					wfDebug( __METHOD__ . " Cannot decompress zTXt chunk due to lack of zlib. Skipping." );
				}
			} elseif ( $chunk_type === 'tIME' ) {
				// last mod timestamp.
				if ( $chunk_size !== 7 ) {
					wfDebug( __METHOD__ . ": tIME wrong size" );
					continue;
				}

				// Note: spec says this should be UTC.
				$t = unpack( "ny/Cm/Cd/Ch/Cmin/Cs", $buf );
				$strTime = sprintf( "%04d%02d%02d%02d%02d%02d",
					$t['y'], $t['m'], $t['d'], $t['h'],
					$t['min'], $t['s'] );

				$exifTime = wfTimestamp( TS::EXIF, $strTime );

				if ( $exifTime ) {
					$text['DateTime'] = $exifTime;
				}
			} elseif ( $chunk_type === 'pHYs' ) {
				// how big pixels are (dots per meter).
				if ( $chunk_size !== 9 ) {
					wfDebug( __METHOD__ . ": pHYs wrong size" );
					continue;
				}

				$dim = unpack( "Nwidth/Nheight/Cunit", $buf );
				if ( $dim['unit'] === 1 ) {
					// Need to check for negative because php
					// doesn't deal with super-large unsigned 32-bit ints well
					if ( $dim['width'] > 0 && $dim['height'] > 0 ) {
						// unit is meters
						// (as opposed to 0 = undefined )
						$text['XResolution'] = $dim['width']
							. '/100';
						$text['YResolution'] = $dim['height']
							. '/100';
						$text['ResolutionUnit'] = 3;
						// 3 = dots per cm (from Exif).
					}
				}
			} elseif ( $chunk_type === "eXIf" ) {
				// There are 4 competing ways to store Exif
				// in a PNG file. This is the official one.
				if (
					$chunk_size < 4 || (
						substr( $buf, 0, 4 ) !== "II\x2A\x00" &&
						substr( $buf, 0, 4 ) !== "MM\x00\x2A"
					)
				) {
					wfDebug( __METHOD__ . ": Invalid eXIf tag" );
					// Do not hand data without a TIFF byte-order header
					// to the caller as Exif.
					continue;
				}
				$exif = $buf;
			} elseif ( $chunk_type === "IEND" ) {
				break;
			}
		}
	} finally {
		fclose( $fh );
	}

	if ( $loopCount > 1 ) {
		$duration *= $loopCount;
	}

	if ( isset( $text['DateTimeDigitized'] ) ) {
		// Convert date format from rfc2822 to exif.
		foreach ( $text['DateTimeDigitized'] as $name => &$value ) {
			if ( $name === '_type' ) {
				continue;
			}

			// @todo FIXME: Currently timezones are ignored.
			// possibly should be wfTimestamp's
			// responsibility. (at least for numeric TZ)
			$formatted = wfTimestamp( TS::EXIF, $value );
			if ( $formatted ) {
				// Only change if we could convert the
				// date.
				// The png standard says it should be
				// in rfc2822 format, but not required.
				// In general for the exif stuff we
				// prettify the date if we can, but we
				// display as-is if we cannot or if
				// it is invalid.
				// So do the same here.

				$value = $formatted;
			}
		}
		// Break the reference so later writes to $value cannot alter $text.
		unset( $value );
	}

	return [
		'width' => $width,
		'height' => $height,
		'frameCount' => $frameCount,
		'loopCount' => $loopCount,
		'duration' => $duration,
		'text' => $text,
		'bitDepth' => $bitDepth,
		'colorType' => $colorType,
		'exif' => $exif,
	];
}
388
/**
 * Read exactly $size bytes from an open stream.
 *
 * fread() may return fewer bytes than requested on streams that are not
 * plain files, so reading is repeated until the requested length has been
 * collected or the stream yields nothing more.
 *
 * @param resource $fh Stream handle to read from
 * @param int $size Number of bytes wanted; 0 returns an empty string
 * @return string Exactly $size bytes
 * @throws InvalidArgumentException If $size is negative, fread() fails,
 *  or the stream ends before $size bytes were read
 */
private static function read( $fh, $size ) {
	if ( $size === 0 ) {
		return '';
	}
	if ( $size < 0 ) {
		throw new InvalidArgumentException( __METHOD__ . ': read error' );
	}

	$result = '';
	$remaining = $size;
	while ( $remaining > 0 ) {
		$piece = fread( $fh, $remaining );
		if ( $piece === false ) {
			throw new InvalidArgumentException( __METHOD__ . ': read error' );
		}
		if ( $piece === '' ) {
			// No further data: the stream ended before $size bytes arrived.
			throw new InvalidArgumentException( __METHOD__ . ': unexpected end of file' );
		}
		$result .= $piece;
		$remaining -= strlen( $piece );
	}
	return $result;
}
411}
412
414class_alias( PNGMetadataExtractor::class, 'PNGMetadataExtractor' );
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.