MediaWiki  master
PNGMetadataExtractor.php
Go to the documentation of this file.
1 <?php
28 use Wikimedia\AtEase\AtEase;
29 
37  private static $pngSig;
38 
40  private static $crcSize;
41 
43  private static $textChunks;
44 
45  public const VERSION = 1;
46  private const MAX_CHUNK_SIZE = 3145728; // 3 mebibytes
47 
/**
 * Walk the chunks of a PNG file and collect its dimensions, colour
 * information, APNG animation timing and selected textual metadata.
 *
 * Chunks larger than self::MAX_CHUNK_SIZE are seeked over, and chunks whose
 * CRC does not match their content are ignored. Parsing stops at the IEND
 * chunk.
 *
 * @param string $filename Path of the PNG file to inspect
 * @return array Array with the keys 'width', 'height', 'frameCount',
 *  'loopCount', 'duration', 'text', 'bitDepth' and 'colorType'
 * @throws Exception If no file name is given, the file does not exist or
 *  cannot be opened, the PNG signature is missing, a seek fails, or the
 *  file ends in the middle of a chunk
 */
public static function getMetadata( $filename ) {
	self::$pngSig = pack( "C8", 137, 80, 78, 71, 13, 10, 26, 10 );
	self::$crcSize = 4;
	/* based on list at http://owl.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
	 * and https://www.w3.org/TR/PNG/#11keywords
	 */
	self::$textChunks = [
		'xml:com.adobe.xmp' => 'xmp',
		# Artist is unofficial. Author is the recommended
		# keyword in the PNG spec. However some people output
		# Artist so support both.
		'artist' => 'Artist',
		'model' => 'Model',
		'make' => 'Make',
		'author' => 'Artist',
		'comment' => 'PNGFileComment',
		'description' => 'ImageDescription',
		'title' => 'ObjectName',
		'copyright' => 'Copyright',
		# Source as in original device used to make image
		# not as in who gave you the image
		'source' => 'Model',
		'software' => 'Software',
		'disclaimer' => 'Disclaimer',
		'warning' => 'ContentWarning',
		'url' => 'Identifier', # Not sure if this is best mapping. Maybe WebStatement.
		'label' => 'Label',
		'creation time' => 'DateTimeDigitized',
		/* Other potentially useful things - Document */
	];

	$frameCount = 0;
	$loopCount = 1;
	$text = [];
	$duration = 0.0;
	$width = 0;
	$height = 0;
	$bitDepth = 0;
	$colorType = 'unknown';

	if ( !$filename ) {
		throw new Exception( __METHOD__ . ": No file name specified" );
	} elseif ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new Exception( __METHOD__ . ": File $filename does not exist" );
	}

	$fh = fopen( $filename, 'rb' );

	if ( !$fh ) {
		throw new Exception( __METHOD__ . ": Unable to open file $filename" );
	}

	// Everything that can throw while the handle is open lives inside the
	// try block so that the handle is always closed again.
	try {
		// Check for the PNG header
		$buf = self::read( $fh, 8 );
		if ( $buf !== self::$pngSig ) {
			throw new Exception( __METHOD__ . ": Not a valid PNG file; header: $buf" );
		}

		// Read chunks
		while ( !feof( $fh ) ) {
			$buf = self::read( $fh, 4 );
			$chunk_size = unpack( "N", $buf )[1];

			if ( $chunk_size < 0 || $chunk_size > self::MAX_CHUNK_SIZE ) {
				wfDebug( __METHOD__ . ': Chunk size of ' . $chunk_size .
					' too big, skipping. Max size is: ' . self::MAX_CHUNK_SIZE );
				// Skip the 4-byte chunk type, the chunk data and the CRC.
				if ( fseek( $fh, 4 + $chunk_size + self::$crcSize, SEEK_CUR ) !== 0 ) {
					throw new Exception( __METHOD__ . ': seek error' );
				}
				continue;
			}

			$chunk_type = self::read( $fh, 4 );
			$buf = self::read( $fh, $chunk_size );
			$crc = self::read( $fh, self::$crcSize );
			// The CRC covers the chunk type and the chunk data.
			$computed = crc32( $chunk_type . $buf );
			if ( pack( 'N', $computed ) !== $crc ) {
				wfDebug( __METHOD__ . ': chunk has invalid CRC, skipping' );
				continue;
			}

			if ( $chunk_type == "IHDR" ) {
				// The fields read below occupy the first 10 bytes; a complete
				// IHDR is 13 bytes. Shorter chunks cannot be unpacked safely.
				if ( $chunk_size < 13 ) {
					wfDebug( __METHOD__ . ": IHDR chunk too small" );
					continue;
				}

				$width = unpack( 'N', substr( $buf, 0, 4 ) )[1];
				$height = unpack( 'N', substr( $buf, 4, 4 ) )[1];
				$bitDepth = ord( substr( $buf, 8, 1 ) );
				// Detect the color type in British English as per the spec
				// https://www.w3.org/TR/PNG/#11IHDR
				switch ( ord( substr( $buf, 9, 1 ) ) ) {
					case 0:
						$colorType = 'greyscale';
						break;
					case 2:
						$colorType = 'truecolour';
						break;
					case 3:
						$colorType = 'index-coloured';
						break;
					case 4:
						$colorType = 'greyscale-alpha';
						break;
					case 6:
						$colorType = 'truecolour-alpha';
						break;
					default:
						$colorType = 'unknown';
						break;
				}
			} elseif ( $chunk_type == "acTL" ) {
				// Two 32-bit integers (frame count, play count) are unpacked
				// below, so at least 8 bytes of data are required.
				if ( $chunk_size < 8 ) {
					wfDebug( __METHOD__ . ": acTL chunk too small" );
					continue;
				}

				$actl = unpack( "Nframes/Nplays", $buf );
				$frameCount = $actl['frames'];
				$loopCount = $actl['plays'];
			} elseif ( $chunk_type == "fcTL" ) {
				// The delay numerator/denominator start at byte offset 20.
				$buf = substr( $buf, 20 );
				if ( strlen( $buf ) < 4 ) {
					wfDebug( __METHOD__ . ": fcTL chunk too small" );
					continue;
				}

				$fctldur = unpack( "ndelay_num/ndelay_den", $buf );
				// A zero denominator is treated as 100 to avoid dividing by zero.
				if ( $fctldur['delay_den'] == 0 ) {
					$fctldur['delay_den'] = 100;
				}
				if ( $fctldur['delay_num'] ) {
					$duration += $fctldur['delay_num'] / $fctldur['delay_den'];
				}
			} elseif ( $chunk_type == "iTXt" ) {
				// Extracts iTXt chunks, uncompressing if necessary.
				$items = [];
				if ( preg_match(
					'/^([^\x00]{1,79})\x00(\x00|\x01)\x00([^\x00]*)(.)[^\x00]*\x00(.*)$/Ds',
					$buf, $items )
				) {
					/* $items[1] = text chunk name, $items[2] = compressed flag,
					 * $items[3] = lang code (or ""), $items[5] = content.
					 * NOTE(review): the pattern requires the byte after the
					 * compressed flag to be \x00, and $items[4] captures the
					 * single byte that follows the lang code; the code below
					 * treats $items[4] as the compression type - confirm that
					 * this is intended.
					 */

					// Theoretically should be case-sensitive, but in practise...
					$items[1] = strtolower( $items[1] );
					if ( !isset( self::$textChunks[$items[1]] ) ) {
						// Only extract textual chunks on our list.
						continue;
					}

					$items[3] = strtolower( $items[3] );
					if ( $items[3] == '' ) {
						// if no lang specified use x-default like in xmp.
						$items[3] = 'x-default';
					}

					// if compressed
					if ( $items[2] == "\x01" ) {
						if ( function_exists( 'gzuncompress' ) && $items[4] === "\x00" ) {
							AtEase::suppressWarnings();
							$items[5] = gzuncompress( $items[5] );
							AtEase::restoreWarnings();

							if ( $items[5] === false ) {
								// decompression failed
								wfDebug( __METHOD__ . ' Error decompressing iTxt chunk - ' . $items[1] );
								continue;
							}
						} else {
							wfDebug( __METHOD__ . ' Skipping compressed png iTXt chunk due to lack of zlib,'
								. " or potentially invalid compression method" );
							continue;
						}
					}
					$finalKeyword = self::$textChunks[$items[1]];
					$text[$finalKeyword][$items[3]] = $items[5];
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					// Error reading iTXt chunk
					wfDebug( __METHOD__ . ": Invalid iTXt chunk" );
				}
			} elseif ( $chunk_type == 'tEXt' ) {
				// In case there is no \x00 which will make explode fail.
				if ( strpos( $buf, "\x00" ) === false ) {
					wfDebug( __METHOD__ . ": Invalid tEXt chunk: no null byte" );
					continue;
				}

				list( $keyword, $content ) = explode( "\x00", $buf, 2 );
				if ( $keyword === '' ) {
					wfDebug( __METHOD__ . ": Empty tEXt keyword" );
					continue;
				}

				// Theoretically should be case-sensitive, but in practise...
				$keyword = strtolower( $keyword );
				if ( !isset( self::$textChunks[$keyword] ) ) {
					// Don't recognize chunk, so skip.
					continue;
				}
				// The content is converted from ISO-8859-1 to UTF-8.
				AtEase::suppressWarnings();
				$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
				AtEase::restoreWarnings();

				if ( $content === false ) {
					wfDebug( __METHOD__ . ": Read error (error with iconv)" );
					continue;
				}

				$finalKeyword = self::$textChunks[$keyword];
				$text[$finalKeyword]['x-default'] = $content;
				$text[$finalKeyword]['_type'] = 'lang';
			} elseif ( $chunk_type == 'zTXt' ) {
				if ( function_exists( 'gzuncompress' ) ) {
					// In case there is no \x00 which will make explode fail.
					if ( strpos( $buf, "\x00" ) === false ) {
						wfDebug( __METHOD__ . ": No null byte in zTXt chunk" );
						continue;
					}

					list( $keyword, $postKeyword ) = explode( "\x00", $buf, 2 );
					if ( $keyword === '' || $postKeyword === '' ) {
						wfDebug( __METHOD__ . ": Empty zTXt chunk" );
						continue;
					}
					// Theoretically should be case-sensitive, but in practise...
					$keyword = strtolower( $keyword );

					if ( !isset( self::$textChunks[$keyword] ) ) {
						// Don't recognize chunk, so skip.
						continue;
					}
					// First byte after the keyword is the compression method,
					// the remainder is the compressed text.
					$compression = substr( $postKeyword, 0, 1 );
					$content = substr( $postKeyword, 1 );
					if ( $compression !== "\x00" ) {
						wfDebug( __METHOD__ . " Unrecognized compression method in zTXt ($keyword). Skipping." );
						continue;
					}

					AtEase::suppressWarnings();
					$content = gzuncompress( $content );
					AtEase::restoreWarnings();

					if ( $content === false ) {
						// decompression failed
						wfDebug( __METHOD__ . ' Error decompressing zTXt chunk - ' . $keyword );
						continue;
					}

					AtEase::suppressWarnings();
					$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
					AtEase::restoreWarnings();

					if ( $content === false ) {
						wfDebug( __METHOD__ . ": iconv error in zTXt chunk" );
						continue;
					}

					$finalKeyword = self::$textChunks[$keyword];
					$text[$finalKeyword]['x-default'] = $content;
					$text[$finalKeyword]['_type'] = 'lang';
				} else {
					wfDebug( __METHOD__ . " Cannot decompress zTXt chunk due to lack of zlib. Skipping." );
				}
			} elseif ( $chunk_type == 'tIME' ) {
				// last mod timestamp.
				if ( $chunk_size !== 7 ) {
					wfDebug( __METHOD__ . ": tIME wrong size" );
					continue;
				}

				// Note: spec says this should be UTC.
				$t = unpack( "ny/Cm/Cd/Ch/Cmin/Cs", $buf );
				$strTime = sprintf( "%04d%02d%02d%02d%02d%02d",
					$t['y'], $t['m'], $t['d'], $t['h'],
					$t['min'], $t['s'] );

				$exifTime = wfTimestamp( TS_EXIF, $strTime );

				if ( $exifTime ) {
					$text['DateTime'] = $exifTime;
				}
			} elseif ( $chunk_type == 'pHYs' ) {
				// how big pixels are (dots per meter).
				if ( $chunk_size !== 9 ) {
					wfDebug( __METHOD__ . ": pHYs wrong size" );
					continue;
				}

				$dim = unpack( "Nwidth/Nheight/Cunit", $buf );
				if ( $dim['unit'] == 1 ) {
					// Need to check for negative because php
					// doesn't deal with super-large unsigned 32-bit ints well
					if ( $dim['width'] > 0 && $dim['height'] > 0 ) {
						// unit is meters
						// (as opposed to 0 = undefined )
						$text['XResolution'] = $dim['width']
							. '/100';
						$text['YResolution'] = $dim['height']
							. '/100';
						$text['ResolutionUnit'] = 3;
						// 3 = dots per cm (from Exif).
					}
				}
			} elseif ( $chunk_type == "IEND" ) {
				break;
			}
		}
	} finally {
		fclose( $fh );
	}

	// The summed frame delays cover one pass; scale by the play count.
	if ( $loopCount > 1 ) {
		$duration *= $loopCount;
	}

	if ( isset( $text['DateTimeDigitized'] ) ) {
		// Convert date format from rfc2822 to exif.
		foreach ( $text['DateTimeDigitized'] as $name => &$value ) {
			if ( $name === '_type' ) {
				continue;
			}

			// @todo FIXME: Currently timezones are ignored.
			// possibly should be wfTimestamp's
			// responsibility. (at least for numeric TZ)
			$formatted = wfTimestamp( TS_EXIF, $value );
			if ( $formatted ) {
				// Only change if we could convert the
				// date.
				// The png standard says it should be
				// in rfc2822 format, but not required.
				// In general for the exif stuff we
				// prettify the date if we can, but we
				// display as-is if we cannot or if
				// it is invalid.
				// So do the same here.

				$value = $formatted;
			}
		}
		// Break the reference so later writes to $value cannot alter $text.
		unset( $value );
	}

	return [
		'width' => $width,
		'height' => $height,
		'frameCount' => $frameCount,
		'loopCount' => $loopCount,
		'duration' => $duration,
		'text' => $text,
		'bitDepth' => $bitDepth,
		'colorType' => $colorType,
	];
}
399 
/**
 * Read exactly $size bytes from an open file handle.
 *
 * @param resource $fh File handle to read from
 * @param int $size Number of bytes wanted; for 0 an empty string is
 *  returned without reading from the handle
 * @return string The bytes that were read
 * @throws Exception If fread() reports failure, or if fewer than $size
 *  bytes could be read
 */
private static function read( $fh, $size ) {
	if ( $size !== 0 ) {
		$bytes = fread( $fh, $size );

		if ( $bytes === false ) {
			throw new Exception( __METHOD__ . ': read error' );
		}

		// A short read means the file ended before the requested data did.
		$received = strlen( $bytes );
		if ( $received < $size ) {
			throw new Exception( __METHOD__ . ': unexpected end of file' );
		}

		return $bytes;
	}

	return '';
}
422 }
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static read( $fh, $size)
Read exactly $size bytes from the file handle, throwing if the read fails or the file ends early.
static getMetadata( $filename)
$content
Definition: router.php:76