MediaWiki REL1_39
PNGMetadataExtractor.php
Go to the documentation of this file.
1<?php
28use Wikimedia\AtEase\AtEase;
29
/** @var string Eight-byte PNG file signature; (re)assigned at the start of getMetadata() */
37 private static $pngSig;
38
/** @var int Length in bytes of the CRC that trails each chunk; set to 4 in getMetadata() */
40 private static $crcSize;
41
/**
 * @var array Map from lower-cased PNG text keyword to the metadata field name it is
 *  stored under; populated in getMetadata()
 */
43 private static $textChunks;
44
// NOTE(review): VERSION is not referenced anywhere in the visible code; presumably a
// version stamp for the extracted metadata format - confirm against callers.
45 public const VERSION = 1;
// Chunks whose declared length exceeds this are skipped by getMetadata() without being read.
46 private const MAX_CHUNK_SIZE = 3145728; // 3 mebibytes
47
/**
 * Read a PNG file chunk by chunk and collect its basic metadata.
 *
 * Handles IHDR (dimensions, bit depth, colour type), the APNG chunks acTL/fcTL
 * (frame count, loop count, total duration), the textual chunks iTXt/tEXt/zTXt
 * (only keywords listed in self::$textChunks), tIME and pHYs. Chunks that are
 * oversized, fail their CRC or are malformed are logged via wfDebug() and skipped.
 *
 * @param string $filename Path of the PNG file to inspect
 * @return array Associative array with the keys 'width', 'height', 'frameCount',
 *  'loopCount', 'duration', 'text', 'bitDepth' and 'colorType'
 * @throws Exception If no file name is given, the file does not exist or cannot be
 *  opened, the PNG signature is wrong, a seek fails, or the file ends prematurely
 */
public static function getMetadata( $filename ) {
	self::$pngSig = pack( "C8", 137, 80, 78, 71, 13, 10, 26, 10 );
	self::$crcSize = 4;
	/* based on list at http://owl.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
	 * and https://www.w3.org/TR/PNG/#11keywords
	 */
	self::$textChunks = [
		'xml:com.adobe.xmp' => 'xmp',
		# Artist is unofficial. Author is the recommended
		# keyword in the PNG spec. However some people output
		# Artist so support both.
		'artist' => 'Artist',
		'model' => 'Model',
		'make' => 'Make',
		'author' => 'Artist',
		'comment' => 'PNGFileComment',
		'description' => 'ImageDescription',
		'title' => 'ObjectName',
		'copyright' => 'Copyright',
		# Source as in original device used to make image
		# not as in who gave you the image
		'source' => 'Model',
		'software' => 'Software',
		'disclaimer' => 'Disclaimer',
		'warning' => 'ContentWarning',
		'url' => 'Identifier', # Not sure if this is best mapping. Maybe WebStatement.
		'label' => 'Label',
		'creation time' => 'DateTimeDigitized',
		/* Other potentially useful things - Document */
	];

	$frameCount = 0;
	$loopCount = 1;
	$text = [];
	$duration = 0.0;
	$width = 0;
	$height = 0;
	$bitDepth = 0;
	$colorType = 'unknown';

	if ( !$filename ) {
		throw new Exception( __METHOD__ . ": No file name specified" );
	} elseif ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new Exception( __METHOD__ . ": File $filename does not exist" );
	}

	$fh = fopen( $filename, 'rb' );

	if ( !$fh ) {
		throw new Exception( __METHOD__ . ": Unable to open file $filename" );
	}

	// Everything below may throw (bad signature, truncated file, seek error);
	// the finally block guarantees the handle is released on every exit path.
	try {
		// Check for the PNG header
		$buf = self::read( $fh, 8 );
		if ( $buf != self::$pngSig ) {
			throw new Exception( __METHOD__ . ": Not a valid PNG file; header: $buf" );
		}

		// Read chunks. Each chunk is: 4-byte big-endian length, 4-byte type,
		// <length> bytes of data, then a CRC of self::$crcSize bytes.
		while ( !feof( $fh ) ) {
			$buf = self::read( $fh, 4 );
			$chunk_size = unpack( "N", $buf )[1];

			if ( $chunk_size < 0 || $chunk_size > self::MAX_CHUNK_SIZE ) {
				wfDebug( __METHOD__ . ': Chunk size of ' . $chunk_size .
					' too big, skipping. Max size is: ' . self::MAX_CHUNK_SIZE );
				// Skip the type field (4 bytes), the data and the CRC without reading them.
				if ( fseek( $fh, 4 + $chunk_size + self::$crcSize, SEEK_CUR ) !== 0 ) {
					throw new Exception( __METHOD__ . ': seek error' );
				}
				continue;
			}

			$chunk_type = self::read( $fh, 4 );
			$buf = self::read( $fh, $chunk_size );
			$crc = self::read( $fh, self::$crcSize );
			// The CRC covers the chunk type and the chunk data, not the length field.
			$computed = crc32( $chunk_type . $buf );
			if ( pack( 'N', $computed ) !== $crc ) {
				wfDebug( __METHOD__ . ': chunk has invalid CRC, skipping' );
				continue;
			}

			if ( $chunk_type == "IHDR" ) {
				// Width (4) + height (4) + bit depth (1) + colour type (1) are read
				// below; a full IHDR is 13 bytes, so anything shorter is malformed.
				if ( $chunk_size < 13 ) {
					wfDebug( __METHOD__ . ": IHDR chunk too small" );
					continue;
				}

				$width = unpack( 'N', substr( $buf, 0, 4 ) )[1];
				$height = unpack( 'N', substr( $buf, 4, 4 ) )[1];
				$bitDepth = ord( substr( $buf, 8, 1 ) );
				// Detect the color type in British English as per the spec
				// https://www.w3.org/TR/PNG/#11IHDR
				switch ( ord( substr( $buf, 9, 1 ) ) ) {
					case 0:
						$colorType = 'greyscale';
						break;
					case 2:
						$colorType = 'truecolour';
						break;
					case 3:
						$colorType = 'index-coloured';
						break;
					case 4:
						$colorType = 'greyscale-alpha';
						break;
					case 6:
						$colorType = 'truecolour-alpha';
						break;
					default:
						$colorType = 'unknown';
						break;
				}
			} elseif ( $chunk_type == "acTL" ) {
				// Two 32-bit integers (frames, plays) are unpacked below, so at
				// least 8 bytes are needed; fewer would make unpack() fail.
				if ( $chunk_size < 8 ) {
					wfDebug( __METHOD__ . ": acTL chunk too small" );
					continue;
				}

				$actl = unpack( "Nframes/Nplays", $buf );
				$frameCount = $actl['frames'];
				$loopCount = $actl['plays'];
			} elseif ( $chunk_type == "fcTL" ) {
				// The two 16-bit delay fields start at byte offset 20 of the chunk data.
				$buf = substr( $buf, 20 );
				if ( strlen( $buf ) < 4 ) {
					wfDebug( __METHOD__ . ": fcTL chunk too small" );
					continue;
				}

				$fctldur = unpack( "ndelay_num/ndelay_den", $buf );
				// A zero denominator is treated as 100, i.e. the delay is in 1/100 s.
				if ( $fctldur['delay_den'] == 0 ) {
					$fctldur['delay_den'] = 100;
				}
				if ( $fctldur['delay_num'] ) {
					$duration += $fctldur['delay_num'] / $fctldur['delay_den'];
				}
			} elseif ( $chunk_type == "iTXt" ) {
				// Extracts iTXt chunks, uncompressing if necessary.
				$items = [];
				if ( preg_match(
					'/^([^\x00]{1,79})\x00(\x00|\x01)\x00([^\x00]*)(.)[^\x00]*\x00(.*)$/Ds',
					$buf, $items )
				) {
					/* $items[1] = text chunk name, $items[2] = compressed flag,
					 * $items[3] = lang code (or ""), $items[5] = content.
					 * NOTE(review): the literal \x00 after the flag is what matches the
					 * compression-method byte; $items[4] is the single byte following
					 * the language tag (normally its NUL terminator), although it is
					 * tested below as if it were the compression method.
					 */

					// Theoretically should be case-sensitive, but in practise...
					$items[1] = strtolower( $items[1] );
					if ( !isset( self::$textChunks[$items[1]] ) ) {
						// Only extract textual chunks on our list.
						continue;
					}

					$items[3] = strtolower( $items[3] );
					if ( $items[3] == '' ) {
						// if no lang specified use x-default like in xmp.
						$items[3] = 'x-default';
					}

					// if compressed
					if ( $items[2] == "\x01" ) {
						if ( function_exists( 'gzuncompress' ) && $items[4] === "\x00" ) {
							AtEase::suppressWarnings();
							$items[5] = gzuncompress( $items[5] );
							AtEase::restoreWarnings();

							if ( $items[5] === false ) {
								// decompression failed
								wfDebug( __METHOD__ . ' Error decompressing iTxt chunk - ' . $items[1] );
								continue;
							}
						} else {
							wfDebug( __METHOD__ . ' Skipping compressed png iTXt chunk due to lack of zlib,'
								. " or potentially invalid compression method" );
							continue;
						}
					}
					$finalKeyword = self::$textChunks[$items[1]];
					$text[$finalKeyword][$items[3]] = $items[5];
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					// Error reading iTXt chunk
					wfDebug( __METHOD__ . ": Invalid iTXt chunk" );
				}
			} elseif ( $chunk_type == 'tEXt' ) {
				// In case there is no \x00 which will make explode fail.
				if ( strpos( $buf, "\x00" ) === false ) {
					wfDebug( __METHOD__ . ": Invalid tEXt chunk: no null byte" );
					continue;
				}

				list( $keyword, $content ) = explode( "\x00", $buf, 2 );
				if ( $keyword === '' ) {
					wfDebug( __METHOD__ . ": Empty tEXt keyword" );
					continue;
				}

				// Theoretically should be case-sensitive, but in practise...
				$keyword = strtolower( $keyword );
				if ( !isset( self::$textChunks[$keyword] ) ) {
					// Don't recognize chunk, so skip.
					continue;
				}
				// tEXt content is converted from ISO-8859-1 to UTF-8 before being stored.
				AtEase::suppressWarnings();
				$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
				AtEase::restoreWarnings();

				if ( $content === false ) {
					wfDebug( __METHOD__ . ": Read error (error with iconv)" );
					continue;
				}

				$finalKeyword = self::$textChunks[$keyword];
				$text[$finalKeyword]['x-default'] = $content;
				$text[$finalKeyword]['_type'] = 'lang';
			} elseif ( $chunk_type == 'zTXt' ) {
				if ( function_exists( 'gzuncompress' ) ) {
					// In case there is no \x00 which will make explode fail.
					if ( strpos( $buf, "\x00" ) === false ) {
						wfDebug( __METHOD__ . ": No null byte in zTXt chunk" );
						continue;
					}

					list( $keyword, $postKeyword ) = explode( "\x00", $buf, 2 );
					if ( $keyword === '' || $postKeyword === '' ) {
						wfDebug( __METHOD__ . ": Empty zTXt chunk" );
						continue;
					}
					// Theoretically should be case-sensitive, but in practise...
					$keyword = strtolower( $keyword );

					if ( !isset( self::$textChunks[$keyword] ) ) {
						// Don't recognize chunk, so skip.
						continue;
					}
					// First byte after the keyword is the compression method,
					// the remainder is the compressed text.
					$compression = substr( $postKeyword, 0, 1 );
					$content = substr( $postKeyword, 1 );
					if ( $compression !== "\x00" ) {
						wfDebug( __METHOD__ . " Unrecognized compression method in zTXt ($keyword). Skipping." );
						continue;
					}

					AtEase::suppressWarnings();
					$content = gzuncompress( $content );
					AtEase::restoreWarnings();

					if ( $content === false ) {
						// decompression failed
						wfDebug( __METHOD__ . ' Error decompressing zTXt chunk - ' . $keyword );
						continue;
					}

					AtEase::suppressWarnings();
					$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
					AtEase::restoreWarnings();

					if ( $content === false ) {
						wfDebug( __METHOD__ . ": iconv error in zTXt chunk" );
						continue;
					}

					$finalKeyword = self::$textChunks[$keyword];
					$text[$finalKeyword]['x-default'] = $content;
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					wfDebug( __METHOD__ . " Cannot decompress zTXt chunk due to lack of zlib. Skipping." );
				}
			} elseif ( $chunk_type == 'tIME' ) {
				// last mod timestamp.
				if ( $chunk_size !== 7 ) {
					wfDebug( __METHOD__ . ": tIME wrong size" );
					continue;
				}

				// Note: spec says this should be UTC.
				$t = unpack( "ny/Cm/Cd/Ch/Cmin/Cs", $buf );
				$strTime = sprintf( "%04d%02d%02d%02d%02d%02d",
					$t['y'], $t['m'], $t['d'], $t['h'],
					$t['min'], $t['s'] );

				$exifTime = wfTimestamp( TS_EXIF, $strTime );

				if ( $exifTime ) {
					$text['DateTime'] = $exifTime;
				}
			} elseif ( $chunk_type == 'pHYs' ) {
				// how big pixels are (dots per meter).
				if ( $chunk_size !== 9 ) {
					wfDebug( __METHOD__ . ": pHYs wrong size" );
					continue;
				}

				$dim = unpack( "Nwidth/Nheight/Cunit", $buf );
				if ( $dim['unit'] == 1 ) {
					// Need to check for negative because php
					// doesn't deal with super-large unsigned 32-bit ints well
					if ( $dim['width'] > 0 && $dim['height'] > 0 ) {
						// unit is meters
						// (as opposed to 0 = undefined )
						// Stored as a "<n>/100" rational: dots per metre over 100
						// gives dots per centimetre.
						$text['XResolution'] = $dim['width']
							. '/100';
						$text['YResolution'] = $dim['height']
							. '/100';
						$text['ResolutionUnit'] = 3;
						// 3 = dots per cm (from Exif).
					}
				}
			} elseif ( $chunk_type == "IEND" ) {
				break;
			}
		}
	} finally {
		fclose( $fh );
	}

	// The summed frame delays cover one pass; scale by the number of plays.
	if ( $loopCount > 1 ) {
		$duration *= $loopCount;
	}

	if ( isset( $text['DateTimeDigitized'] ) ) {
		// Convert date format from rfc2822 to exif.
		foreach ( $text['DateTimeDigitized'] as $name => &$value ) {
			if ( $name === '_type' ) {
				continue;
			}

			// @todo FIXME: Currently timezones are ignored.
			// possibly should be wfTimestamp's
			// responsibility. (at least for numeric TZ)
			$formatted = wfTimestamp( TS_EXIF, $value );
			if ( $formatted ) {
				// Only change if we could convert the
				// date.
				// The png standard says it should be
				// in rfc2822 format, but not required.
				// In general for the exif stuff we
				// prettify the date if we can, but we
				// display as-is if we cannot or if
				// it is invalid.
				// So do the same here.

				$value = $formatted;
			}
		}
		// Break the reference so later use of $value cannot alter the last entry.
		unset( $value );
	}

	return [
		'width' => $width,
		'height' => $height,
		'frameCount' => $frameCount,
		'loopCount' => $loopCount,
		'duration' => $duration,
		'text' => $text,
		'bitDepth' => $bitDepth,
		'colorType' => $colorType,
	];
}
399
/**
 * Read exactly $size bytes from an open file handle.
 *
 * A single fread() call may legitimately return fewer bytes than requested on
 * streams that are not plain files, so reading continues until the requested
 * amount has been collected or the stream stops yielding data.
 *
 * @param resource $fh Handle opened for reading
 * @param int $size Number of bytes to read; 0 yields an empty string
 * @return string Exactly $size bytes
 * @throws Exception If $size is negative, fread() fails, or the stream ends
 *  before $size bytes could be read
 */
private static function read( $fh, $size ) {
	if ( $size === 0 ) {
		return '';
	}
	if ( $size < 0 ) {
		// fread() cannot honour a negative length; report it as a read failure.
		throw new Exception( __METHOD__ . ': read error' );
	}

	$result = '';
	while ( strlen( $result ) < $size ) {
		$piece = fread( $fh, $size - strlen( $result ) );
		if ( $piece === false ) {
			throw new Exception( __METHOD__ . ': read error' );
		}
		if ( $piece === '' ) {
			// No further data is available; stop instead of spinning forever.
			break;
		}
		$result .= $piece;
	}

	if ( strlen( $result ) < $size ) {
		throw new Exception( __METHOD__ . ': unexpected end of file' );
	}
	return $result;
}
422}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static getMetadata( $filename)
$content
Definition router.php:76