MediaWiki  master
MimeAnalyzer.php
Go to the documentation of this file.
1 <?php
24 
30 class MimeAnalyzer implements LoggerAwareInterface {
32  protected $typeFile;
34  protected $infoFile;
36  protected $xmlTypes;
38  protected $initCallback;
40  protected $detectCallback;
42  protected $guessCallback;
44  protected $extCallback;
46  protected $mediaTypes = null;
48  protected $mimeTypeAliases = null;
50  protected $mimetoExt = null;
51 
53  public $mExtToMime = null; // legacy name; field accessed by hooks
54 
56  protected $IEAnalyzer;
57 
59  private $extraTypes = '';
61  private $extraInfo = '';
62 
64  private $logger;
65 
85  protected static $wellKnownTypes = <<<EOT
86 application/ogg ogx ogg ogm ogv oga spx opus
87 application/pdf pdf
88 application/vnd.oasis.opendocument.chart odc
89 application/vnd.oasis.opendocument.chart-template otc
90 application/vnd.oasis.opendocument.database odb
91 application/vnd.oasis.opendocument.formula odf
92 application/vnd.oasis.opendocument.formula-template otf
93 application/vnd.oasis.opendocument.graphics odg
94 application/vnd.oasis.opendocument.graphics-template otg
95 application/vnd.oasis.opendocument.image odi
96 application/vnd.oasis.opendocument.image-template oti
97 application/vnd.oasis.opendocument.presentation odp
98 application/vnd.oasis.opendocument.presentation-template otp
99 application/vnd.oasis.opendocument.spreadsheet ods
100 application/vnd.oasis.opendocument.spreadsheet-template ots
101 application/vnd.oasis.opendocument.text odt
102 application/vnd.oasis.opendocument.text-master otm
103 application/vnd.oasis.opendocument.text-template ott
104 application/vnd.oasis.opendocument.text-web oth
105 application/javascript js
106 application/x-shockwave-flash swf
107 audio/midi mid midi kar
108 audio/mpeg mpga mpa mp2 mp3
109 audio/x-aiff aif aiff aifc
110 audio/x-wav wav
111 audio/ogg oga spx ogg opus
112 audio/opus opus ogg oga ogg spx
113 image/x-bmp bmp
114 image/gif gif
115 image/jpeg jpeg jpg jpe
116 image/png png
117 image/svg+xml svg
118 image/svg svg
119 image/tiff tiff tif
120 image/vnd.djvu djvu
121 image/x.djvu djvu
122 image/x-djvu djvu
123 image/x-portable-pixmap ppm
124 image/x-xcf xcf
125 text/plain txt
126 text/html html htm
127 video/ogg ogv ogm ogg
128 video/mpeg mpg mpeg
129 EOT;
130 
137  protected static $wellKnownInfo = <<<EOT
138 application/pdf [OFFICE]
139 application/vnd.oasis.opendocument.chart [OFFICE]
140 application/vnd.oasis.opendocument.chart-template [OFFICE]
141 application/vnd.oasis.opendocument.database [OFFICE]
142 application/vnd.oasis.opendocument.formula [OFFICE]
143 application/vnd.oasis.opendocument.formula-template [OFFICE]
144 application/vnd.oasis.opendocument.graphics [OFFICE]
145 application/vnd.oasis.opendocument.graphics-template [OFFICE]
146 application/vnd.oasis.opendocument.image [OFFICE]
147 application/vnd.oasis.opendocument.image-template [OFFICE]
148 application/vnd.oasis.opendocument.presentation [OFFICE]
149 application/vnd.oasis.opendocument.presentation-template [OFFICE]
150 application/vnd.oasis.opendocument.spreadsheet [OFFICE]
151 application/vnd.oasis.opendocument.spreadsheet-template [OFFICE]
152 application/vnd.oasis.opendocument.text [OFFICE]
153 application/vnd.oasis.opendocument.text-template [OFFICE]
154 application/vnd.oasis.opendocument.text-master [OFFICE]
155 application/vnd.oasis.opendocument.text-web [OFFICE]
156 application/javascript text/javascript application/x-javascript [EXECUTABLE]
157 application/x-shockwave-flash [MULTIMEDIA]
158 audio/midi [AUDIO]
159 audio/x-aiff [AUDIO]
160 audio/x-wav [AUDIO]
161 audio/mp3 audio/mpeg [AUDIO]
162 application/ogg audio/ogg video/ogg [MULTIMEDIA]
163 image/x-bmp image/x-ms-bmp image/bmp [BITMAP]
164 image/gif [BITMAP]
165 image/jpeg [BITMAP]
166 image/png [BITMAP]
167 image/svg+xml [DRAWING]
168 image/tiff [BITMAP]
169 image/vnd.djvu [BITMAP]
170 image/x-xcf [BITMAP]
171 image/x-portable-pixmap [BITMAP]
172 text/plain [TEXT]
173 text/html [TEXT]
174 video/ogg [VIDEO]
175 video/mpeg [VIDEO]
176 unknown/unknown application/octet-stream application/x-empty [UNKNOWN]
177 EOT;
178 
/**
 * Store the configuration and immediately load the MIME tables.
 *
 * @param array $params Configuration map. The keys 'typeFile', 'infoFile' and
 *   'xmlTypes' are read unconditionally; 'initCallback', 'detectCallback',
 *   'guessCallback', 'extCallback' and 'logger' are optional. When no logger
 *   is supplied a \Psr\Log\NullLogger is used.
 */
public function __construct( array $params ) {
	// Settings that are read without a fallback.
	foreach ( [ 'typeFile', 'infoFile', 'xmlTypes' ] as $required ) {
		$this->$required = $params[$required];
	}

	// Hooks default to null, meaning "not installed".
	foreach ( [ 'initCallback', 'detectCallback', 'guessCallback', 'extCallback' ] as $hook ) {
		$this->$hook = $params[$hook] ?? null;
	}

	$this->logger = $params['logger'] ?? new \Psr\Log\NullLogger();

	$this->loadFiles();
}
206 
/**
 * Build the lookup tables used by the rest of the class.
 *
 * Two text sources are parsed, each made of the built-in defaults, an
 * optional file and any extra text registered through addExtraTypes() /
 * addExtraInfo():
 *  - the "types" text fills $mimetoExt (MIME => space separated extensions)
 *    and $mExtToMime (extension => space separated MIME types);
 *  - the "info" text fills $mediaTypes (media type => list of MIME types)
 *    and $mimeTypeAliases (alias => first MIME type on the line).
 *
 * Lines are lower-cased, blank lines and lines starting with '#' are
 * ignored, and lines without any space are skipped.
 */
protected function loadFiles() {
	# Allow media handling extensions adding MIME-types and MIME-info
	if ( $this->initCallback ) {
		call_user_func( $this->initCallback, $this );
	}

	// ----- Part 1: MIME type <-> file extension -----
	$typeText = self::$wellKnownTypes;
	$typePath = $this->typeFile;

	if ( !$typePath ) {
		$this->logger->info( __METHOD__ .
			": no mime types file defined, using built-ins only.\n" );
	} elseif ( is_file( $typePath ) && is_readable( $typePath ) ) {
		$this->logger->info( __METHOD__ . ": loading mime types from $typePath\n" );
		$typeText .= "\n" . file_get_contents( $typePath );
	} else {
		$this->logger->info( __METHOD__ . ": can't load mime types from $typePath\n" );
	}

	$typeText .= "\n" . $this->extraTypes;

	// Normalise line endings, then tabs.
	$typeText = str_replace(
		"\t",
		" ",
		str_replace( [ "\r\n", "\n\r", "\n\n", "\r\r", "\r" ], "\n", $typeText )
	);

	$this->mimetoExt = [];
	$this->mExtToMime = [];

	foreach ( explode( "\n", $typeText ) as $line ) {
		$line = trim( $line );
		if ( empty( $line ) || strpos( $line, '#' ) === 0 ) {
			continue;
		}

		$line = strtolower( $line );
		$split = strpos( $line, ' ' );
		if ( $split === false ) {
			continue;
		}

		$mime = substr( $line, 0, $split );
		$extList = trim( substr( $line, $split + 1 ) );
		if ( empty( $extList ) ) {
			continue;
		}

		// Append to whatever an earlier line already registered for this type.
		$this->mimetoExt[$mime] = empty( $this->mimetoExt[$mime] )
			? $extList
			: $this->mimetoExt[$mime] . ' ' . $extList;

		foreach ( explode( ' ', $extList ) as $oneExt ) {
			$oneExt = trim( $oneExt );
			if ( empty( $oneExt ) ) {
				continue;
			}

			$this->mExtToMime[$oneExt] = empty( $this->mExtToMime[$oneExt] )
				? $mime
				: $this->mExtToMime[$oneExt] . ' ' . $mime;
		}
	}

	// ----- Part 2: media types and MIME aliases -----
	$infoPath = $this->infoFile;
	$infoText = self::$wellKnownInfo;

	if ( !$infoPath ) {
		$this->logger->info( __METHOD__ .
			": no mime info file defined, using built-ins only.\n" );
	} elseif ( is_file( $infoPath ) && is_readable( $infoPath ) ) {
		$this->logger->info( __METHOD__ . ": loading mime info from $infoPath\n" );
		$infoText .= "\n" . file_get_contents( $infoPath );
	} else {
		$this->logger->info( __METHOD__ . ": can't load mime info from $infoPath\n" );
	}

	$infoText .= "\n" . $this->extraInfo;

	$infoText = str_replace(
		"\t",
		" ",
		str_replace( [ "\r\n", "\n\r", "\n\n", "\r\r", "\r" ], "\n", $infoText )
	);

	$this->mimeTypeAliases = [];
	$this->mediaTypes = [];

	// A "[NAME]" tag anywhere on the line names the media type.
	$tagPattern = '!\[\s*(\w+)\s*\]!';

	foreach ( explode( "\n", $infoText ) as $line ) {
		$line = trim( $line );
		if ( empty( $line ) || strpos( $line, '#' ) === 0 ) {
			continue;
		}

		$line = strtolower( $line );
		if ( strpos( $line, ' ' ) === false ) {
			continue;
		}

		$tag = [];
		if ( preg_match( $tagPattern, $line, $tag ) ) {
			$line = preg_replace( $tagPattern, '', $line );
			$mediaType = trim( strtoupper( $tag[1] ) );
		} else {
			$mediaType = MEDIATYPE_UNKNOWN;
		}

		// Note: the pieces are NOT trimmed or filtered before the alias loop
		// below, so empty pieces produced by the split are registered too.
		$pieces = explode( ' ', $line );

		if ( !isset( $this->mediaTypes[$mediaType] ) ) {
			$this->mediaTypes[$mediaType] = [];
		}

		foreach ( $pieces as $piece ) {
			$piece = trim( $piece );
			if ( !empty( $piece ) ) {
				$this->mediaTypes[$mediaType][] = $piece;
			}
		}

		// Everything after the first piece is an alias of the first piece.
		$canonical = $pieces[0];
		foreach ( array_slice( $pieces, 1 ) as $alias ) {
			$this->mimeTypeAliases[$alias] = $canonical;
		}
	}
}
368 
/**
 * Replace the logger used for diagnostic messages.
 *
 * @param LoggerInterface $logger
 */
public function setLogger( LoggerInterface $logger ) {
	$this->logger = $logger;
}
372 
/**
 * Adds to the list mapping MIME to file extensions.
 *
 * The text is only accumulated here; it is parsed by loadFiles().
 *
 * @param string $types Lines in the same format as the MIME types table.
 */
public function addExtraTypes( $types ) {
	$this->extraTypes = $this->extraTypes . "\n" . $types;
}
382 
/**
 * Adds to the MIME info text (media types and aliases).
 *
 * The text is only accumulated here; it is parsed by loadFiles().
 *
 * @param string $info Lines in the same format as the MIME info table.
 */
public function addExtraInfo( $info ) {
	$this->extraInfo = $this->extraInfo . "\n" . $info;
}
392 
/**
 * Returns a list of file extensions for a given MIME type as a space
 * separated string, or null if the MIME type is not known.
 *
 * The type is looked up directly first; if that fails, it is resolved
 * through the alias table and looked up once more.
 *
 * @param string $mime
 * @return string|null
 */
public function getExtensionsForType( $mime ) {
	$key = strtolower( $mime );

	if ( !isset( $this->mimetoExt[$key] ) ) {
		// No direct entry: fall back to the canonical type, if there is one.
		$key = $this->mimeTypeAliases[$key] ?? null;
	}

	return $key === null ? null : ( $this->mimetoExt[$key] ?? null );
}
419 
/**
 * Returns the MIME types registered for a file extension.
 *
 * @param string $ext Extension without the leading dot; matched case-insensitively.
 * @return string|null Space separated MIME types, or null if the extension is unknown.
 */
public function getTypesForExtension( $ext ) {
	return $this->mExtToMime[strtolower( $ext )] ?? null;
}
433 
/**
 * Returns a single MIME type for a file extension: the first entry of the
 * list returned by getTypesForExtension().
 *
 * @param string $ext
 * @return string|null Null if the extension is unknown.
 */
public function guessTypesForExtension( $ext ) {
	$all = $this->getTypesForExtension( $ext );

	// TODO: Check if this is needed; strtok( $all, ' ' ) should be sufficient
	return $all === null
		? null
		: preg_replace( '/\s.*$/', '', trim( $all ) );
}
453 
/**
 * Tests if the extension matches the given MIME type.
 *
 * @param string $extension File extension without the leading dot;
 *   compared case-insensitively.
 * @param string $mime
 * @return bool|null True or false when the MIME type is known, null when no
 *   extensions are registered for it.
 */
public function isMatchingExtension( $extension, $mime ) {
	$known = $this->getExtensionsForType( $mime );

	if ( !$known ) {
		return null; // Unknown MIME type
	}

	// Strict comparison: both sides are strings, and a loose comparison would
	// treat numeric-looking extensions such as "10" and "1e1" as equal.
	return in_array( strtolower( $extension ), explode( ' ', $known ), true );
}
475 
/**
 * Returns true if the MIME type is one of the image types listed below
 * (the types reported by getimagesize / image_type_to_mime).
 *
 * @param string $mime
 * @return bool
 */
public function isPHPImageType( $mime ) {
	// As defined by imagegetsize and image_type_to_mime
	static $supported = [
		'image/gif',
		'image/jpeg',
		'image/png',
		'image/x-bmp',
		'image/xbm',
		'image/tiff',
		'image/jp2',
		'image/jpeg2000',
		'image/iff',
		'image/x-xbitmap',
		'image/vnd.wap.wbmp',
		'image/vnd.xiff',
		'image/x-photoshop',
		'application/x-shockwave-flash',
	];

	return in_array( $mime, $supported );
}
498 
/**
 * Tells whether an extension belongs to a format this class expects to
 * identify from file content alone. Callers use this to refuse guessing a
 * type from the extension when content detection failed.
 *
 * @param string $extension Extension without the leading dot; matched
 *   case-insensitively.
 * @return bool
 */
function isRecognizableExtension( $extension ) {
	static $recognizable = null;

	if ( $recognizable === null ) {
		$recognizable = array_merge(
			// Types recognized by getimagesize()
			[
				'gif', 'jpeg', 'jpg', 'png', 'swf', 'psd',
				'bmp', 'tiff', 'tif', 'jpc', 'jp2',
				'jpx', 'jb2', 'swc', 'iff', 'wbmp',
				'xbm',
			],
			// Formats we recognize magic numbers for
			[
				'djvu', 'ogx', 'ogg', 'ogv', 'oga', 'spx', 'opus',
				'mid', 'pdf', 'wmf', 'xcf', 'webm', 'mkv', 'mka',
				'webp', 'mp3',
			],
			// XML formats we sure hope we recognize reliably
			[ 'svg' ],
			// 3D formats
			[ 'stl' ]
		);
	}

	$needle = strtolower( $extension );
	return in_array( $needle, $recognizable );
}
532 
/**
 * Improves a MIME type using the file extension.
 *
 * Three detected types are refined:
 *  - 'unknown/unknown': the extension is trusted, unless it is one that
 *    content detection should have recognised;
 *  - 'application/x-opc+zip': replaced by the extension's type when the
 *    extension is registered for OPC, otherwise downgraded to
 *    'application/zip';
 *  - 'text/plain': replaced by the extension's type when the extension maps
 *    to a textual media type.
 * Afterwards the optional extension callback may change the type, and the
 * result is resolved through the alias table.
 *
 * @param string $mime The MIME type detected so far.
 * @param string $ext File extension without the leading dot.
 * @return string|null The improved MIME type.
 */
public function improveTypeFromExtension( $mime, $ext ) {
	if ( $mime === 'unknown/unknown' ) {
		if ( !$this->isRecognizableExtension( $ext ) ) {
			// Not something we can detect, so simply
			// trust the file extension
			$mime = $this->guessTypesForExtension( $ext );
		} else {
			$this->logger->info( __METHOD__ . ': refusing to guess mime type for .' .
				"$ext file, we should have recognized it\n" );
		}
	} elseif ( $mime === 'application/x-opc+zip' ) {
		if ( !$this->isMatchingExtension( $ext, $mime ) ) {
			$this->logger->info( __METHOD__ .
				": refusing to guess better type for $mime file, " .
				".$ext is not a known OPC extension.\n" );
			$mime = 'application/zip';
		} else {
			// A known file extension for an OPC file,
			// find the proper MIME type for that file extension
			$mime = $this->guessTypesForExtension( $ext );
		}
	} elseif ( $mime === 'text/plain' && $this->findMediaType( ".$ext" ) === MEDIATYPE_TEXT ) {
		// Textual types are sometimes not recognized properly.
		// If detected as text/plain, and has an extension which is textual
		// improve to the extension's type. For example, csv and json are often
		// misdetected as text/plain.
		$mime = $this->guessTypesForExtension( $ext );
	}

	# Media handling extensions can improve the MIME detected
	$hook = $this->extCallback;
	if ( $hook ) {
		$hook( $this, $ext, $mime /* by reference */ );
	}

	// Map an alias onto its canonical type.
	$mime = $this->mimeTypeAliases[$mime] ?? $mime;

	$this->logger->info( __METHOD__ . ": improved mime type for .$ext: $mime\n" );
	return $mime;
}
586 
/**
 * MIME type detection.
 *
 * Content-based detection (doGuessMimeType) is tried first; when it yields
 * nothing, detectMimeType() is used as a fallback. The result is resolved
 * through the alias table before it is returned.
 *
 * @param string $file Path of the file to inspect.
 * @param mixed $ext Deprecated; a truthy value only triggers a logged
 *   warning here and is passed on to the two detectors.
 * @return string The detected MIME type.
 */
public function guessMimeType( $file, $ext = true ) {
	if ( $ext ) { // TODO: make $ext default to false. Or better, remove it.
		$this->logger->info( __METHOD__ .
			": WARNING: use of the \$ext parameter is deprecated. " .
			"Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
	}

	$guess = $this->doGuessMimeType( $file, $ext );

	if ( !$guess ) {
		$this->logger->info( __METHOD__ .
			": internal type detection failed for $file (.$ext)...\n" );
		$guess = $this->detectMimeType( $file, $ext );
	}

	// Map an alias onto its canonical type.
	$guess = $this->mimeTypeAliases[$guess] ?? $guess;

	$this->logger->info( __METHOD__ . ": guessed mime type of $file: $guess\n" );
	return $guess;
}
623 
/**
 * Guess the MIME type of a file by looking at its content.
 *
 * The first 1024 bytes ("head") and up to the last 65558 bytes ("tail") of
 * the file are read, then a series of checks is applied in order: hard-coded
 * magic numbers, EBML (WebM/Matroska), WebP, MS compound files, PHP opening
 * tags, well-formed XML, shebang scripts, ZIP, STL, getimagesize() and
 * finally the optional guess callback.
 *
 * @param string $file Path of the file to inspect.
 * @param mixed $ext Not used for detection in this method; only forwarded
 *   to detectZipType().
 * @return string|false The detected MIME type, 'unknown/unknown' when the
 *   file cannot be read or is an unidentified container, or false when no
 *   check matched.
 * @throws UnexpectedValueException If seeking to the tail of the file fails.
 */
private function doGuessMimeType( $file, $ext ) {
	// Read a chunk of the file
	Wikimedia\suppressWarnings();
	$f = fopen( $file, 'rb' );
	Wikimedia\restoreWarnings();

	if ( !$f ) {
		return 'unknown/unknown';
	}

	try {
		$fsize = filesize( $file );
		if ( $fsize === false ) {
			return 'unknown/unknown';
		}

		$head = fread( $f, 1024 );
		$tailLength = min( 65558, $fsize ); // 65558 = maximum size of a zip EOCDR
		if ( fseek( $f, -1 * $tailLength, SEEK_END ) === -1 ) {
			throw new UnexpectedValueException(
				"Seeking $tailLength bytes from EOF failed in " . __METHOD__ );
		}
		$tail = $tailLength ? fread( $f, $tailLength ) : '';

		$this->logger->info( __METHOD__ .
			": analyzing head and tail of $file for magic numbers.\n" );

		/* Look for MS Compound Binary (OLE) files.
		 * This is the only check that needs the open handle, so it runs while
		 * the handle is still held. Its 8-byte signature cannot coincide with
		 * any of the magic numbers tested further down, so doing it first
		 * does not change which type is reported.
		 */
		if ( strncmp( $head, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8 ) === 0 ) {
			$this->logger->info( __METHOD__ . ': recognized MS CFB (OLE) file' );
			return $this->detectMicrosoftBinaryType( $f );
		}
	} finally {
		// Release the handle on every path, including early returns and the
		// exception above. The guard covers a reader that closed it already.
		if ( is_resource( $f ) ) {
			fclose( $f );
		}
	}

	// Hardcode a few magic number checks...
	$headers = [
		// Multimedia...
		'MThd' => 'audio/midi',
		'OggS' => 'application/ogg',
		'ID3' => 'audio/mpeg',
		"\xff\xfb" => 'audio/mpeg', // MPEG-1 layer 3
		"\xff\xf3" => 'audio/mpeg', // MPEG-2 layer 3 (lower sample rates)
		"\xff\xe3" => 'audio/mpeg', // MPEG-2.5 layer 3 (very low sample rates)

		// Image formats...
		// Note that WMF may have a bare header, no magic number.
		"\x01\x00\x09\x00" => 'application/x-msmetafile', // Possibly prone to false positives?
		"\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
		'%PDF' => 'application/pdf',
		'gimp xcf' => 'image/x-xcf',

		// Some forbidden fruit...
		'MZ' => 'application/octet-stream', // DOS/Windows executable
		"\xca\xfe\xba\xbe" => 'application/octet-stream', // Mach-O binary
		"\x7fELF" => 'application/octet-stream', // ELF binary
	];

	foreach ( $headers as $magic => $candidate ) {
		if ( strncmp( $head, $magic, strlen( $magic ) ) === 0 ) {
			$this->logger->info( __METHOD__ .
				": magic header in $file recognized as $candidate\n" );
			return $candidate;
		}
	}

	/* Look for WebM and Matroska files */
	if ( strncmp( $head, pack( "C4", 0x1a, 0x45, 0xdf, 0xa3 ), 4 ) === 0 ) {
		$doctype = strpos( $head, "\x42\x82" );
		if ( $doctype ) {
			// Next byte is datasize, then data (sizes larger than 1 byte are stupid muxers)
			$data = substr( $head, $doctype + 3, 8 );
			if ( strncmp( $data, "matroska", 8 ) === 0 ) {
				$this->logger->info( __METHOD__ . ": recognized file as video/x-matroska\n" );
				return "video/x-matroska";
			} elseif ( strncmp( $data, "webm", 4 ) === 0 ) {
				// XXX HACK look for a video track, if we don't find it, this is an audio file
				$videotrack = strpos( $head, "\x86\x85V_VP" );

				if ( $videotrack ) {
					// There is a video track, so this is a video file.
					$this->logger->info( __METHOD__ . ": recognized file as video/webm\n" );
					return "video/webm";
				}

				$this->logger->info( __METHOD__ . ": recognized file as audio/webm\n" );
				return "audio/webm";
			}
		}
		$this->logger->info( __METHOD__ . ": unknown EBML file\n" );
		return "unknown/unknown";
	}

	/* Look for WebP */
	if ( strncmp( $head, "RIFF", 4 ) === 0 &&
		strncmp( substr( $head, 8, 7 ), "WEBPVP8", 7 ) === 0
	) {
		$this->logger->info( __METHOD__ . ": recognized file as image/webp\n" );
		return "image/webp";
	}

	/* Look for PHP opening tags: the literal "<?php", and "<?" followed by
	 * "php", whitespace or "=" with a NUL byte after each character (as such
	 * text would appear in a 16-bit little-endian encoding).
	 */
	if ( ( strpos( $head, '<?php' ) !== false ) ||
		( strpos( $head, "<\x00?\x00p\x00h\x00p" ) !== false ) ||
		( strpos( $head, "<\x00?\x00 " ) !== false ) ||
		( strpos( $head, "<\x00?\x00\n" ) !== false ) ||
		( strpos( $head, "<\x00?\x00\t" ) !== false ) ||
		( strpos( $head, "<\x00?\x00=" ) !== false )
	) {
		$this->logger->info( __METHOD__ . ": recognized $file as application/x-php\n" );
		return 'application/x-php';
	}

	/* Look for XML formats: map the root element through the configured
	 * table, falling back to generic XML.
	 */
	$xml = new XmlTypeCheck( $file );
	if ( $xml->wellFormed ) {
		// The table lives on the instance; there is no local of that name.
		return $this->xmlTypes[$xml->getRootElement()] ?? 'application/xml';
	}

	/* Look for shell scripts */
	$script_type = null;

	# detect by shebang
	// Each candidate is compared as a prefix of exactly its own length;
	// comparing a longer substring against a shorter literal can never match.
	if ( strncmp( $head, "#!", 2 ) === 0 ) {
		$script_type = "ASCII";
	} elseif ( strncmp( $head, "\xef\xbb\xbf#!", 5 ) === 0 ) {
		$script_type = "UTF-8";
	} elseif ( strncmp( $head, "\xfe\xff\x00#\x00!", 6 ) === 0 ) {
		$script_type = "UTF-16BE";
	} elseif ( strncmp( $head, "\xff\xfe#\x00!\x00", 6 ) === 0 ) {
		$script_type = "UTF-16LE";
	}

	if ( $script_type ) {
		// Work on a copy: $head is still needed unmodified by later checks.
		$scriptHead = $head;

		if ( $script_type !== "UTF-8" && $script_type !== "ASCII" ) {
			// Quick and dirty fold down to ASCII!
			$pack = [ 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' ];
			// substr( …, 2 ) drops the two-byte byte-order mark.
			$chars = unpack( $pack[$script_type], substr( $head, 2 ) );
			$scriptHead = '';
			foreach ( $chars as $codepoint ) {
				$scriptHead .= $codepoint < 128 ? chr( $codepoint ) : '?';
			}
		}

		$match = [];

		if ( preg_match( '%/?([^\s]+/)(\w+)%', $scriptHead, $match ) ) {
			$mime = "application/x-{$match[2]}";
			$this->logger->info( __METHOD__ . ": shell script recognized as $mime\n" );
			return $mime;
		}
	}

	// Check for ZIP variants (before getimagesize)
	$eocdrPos = strpos( $tail, "PK\x05\x06" );
	if ( $eocdrPos !== false ) {
		$this->logger->info( __METHOD__ . ": ZIP signature present in $file\n" );
		// Check if it really is a ZIP file, make sure the EOCDR is at the end (T40432)
		$eocdrAtEnd = false;
		$fixedEnd = $eocdrPos + 22; // signature + fixed-size fields
		if ( strlen( $tail ) >= $fixedEnd ) {
			// The comment length is the 16-bit little-endian field at offset 20.
			// unpack() numbers unnamed results from 1, not 0.
			$commentLength = unpack( 'v', substr( $tail, $eocdrPos + 20, 2 ) )[1];
			$eocdrAtEnd = ( $fixedEnd + $commentLength === strlen( $tail ) );
		}

		if ( !$eocdrAtEnd ) {
			$this->logger->info( __METHOD__ . ": ZIP EOCDR not at end. Not a ZIP file." );
		} else {
			return $this->detectZipType( $head, $tail, $ext );
		}
	}

	// Check for STL (3D) files
	// @see https://en.wikipedia.org/wiki/STL_(file_format)
	if ( $fsize >= 15 &&
		stripos( $head, 'SOLID ' ) === 0 &&
		preg_match( '/\RENDSOLID .*$/i', $tail ) ) {
		// ASCII STL file
		return 'application/sla';
	} elseif ( $fsize > 84 ) {
		// binary STL file: a 32-bit triangle count at offset 80, then
		// 50 bytes per triangle
		$triangles = substr( $head, 80, 4 );
		$triangles = unpack( 'V', $triangles );
		$triangles = reset( $triangles );
		if ( $triangles !== false && $fsize === 84 + ( $triangles * 50 ) ) {
			return 'application/sla';
		}
	}

	Wikimedia\suppressWarnings();
	$gis = getimagesize( $file );
	Wikimedia\restoreWarnings();

	if ( $gis && isset( $gis['mime'] ) ) {
		$mime = $gis['mime'];
		$this->logger->info( __METHOD__ . ": getimagesize detected $file as $mime\n" );
		return $mime;
	}

	# Media handling extensions can guess the MIME by content
	# It's intentionally here so that if core is wrong about a type (false positive),
	# people will hopefully nag and submit patches :)
	$mime = false;
	# Some strings by reference for performance - assuming well-behaved hooks
	$callback = $this->guessCallback;
	if ( $callback ) {
		$callback( $this, $head, $tail, $file, $mime /* by reference */ );
	}

	return $mime;
}
856 
/**
 * Detect application-specific file type of a given ZIP file from its
 * header data.
 *
 * Recognised are OpenDocument files (via their "mimetype" entry), Open
 * Packaging Conventions archives (via "[Content_Types].xml") and MS compound
 * documents that carry an OPC archive in their tail. Anything else is
 * reported as plain 'application/zip'.
 *
 * @param string $header Leading bytes of the file.
 * @param string|null $tail Trailing bytes of the file; only consulted for
 *   the compound-document case.
 * @param string|bool $ext Deprecated. When it is an extension string, an OPC
 *   archive is resolved to the extension's type right here.
 * @return string The detected MIME type.
 */
function detectZipType( $header, $tail = null, $ext = false ) {
	if ( $ext ) { # TODO: remove $ext param
		$this->logger->info( __METHOD__ .
			": WARNING: use of the \$ext parameter is deprecated. " .
			"Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
	}

	$mime = 'application/zip';
	// Longer names come before their prefixes so the alternation below
	// prefers e.g. "chart-template" over "chart".
	$opendocTypes = [
		'chart-template',
		'chart',
		'formula-template',
		'formula',
		'graphics-template',
		'graphics',
		'image-template',
		'image',
		'presentation-template',
		'presentation',
		'spreadsheet-template',
		'spreadsheet',
		'text-template',
		'text-master',
		'text-web',
		'text' ];

	// https://lists.oasis-open.org/archives/office/200505/msg00006.html
	$types = '(?:' . implode( '|', $opendocTypes ) . ')';
	$opendocRegex = "/^mimetype(application\/vnd\.oasis\.opendocument\.$types)/";

	// The dot is escaped so that only the literal file name matches.
	$openxmlRegex = "/^\[Content_Types\]\.xml/";

	// Both regexes are applied to the data starting at offset 30 of a local
	// file header (presumably where the entry's file name begins).
	$firstEntry = substr( $header, 30 );

	// $tail is optional; normalise it so the string functions below always
	// receive a string.
	$tail = (string)$tail;

	if ( preg_match( $opendocRegex, $firstEntry, $matches ) ) {
		$mime = $matches[1];
		$this->logger->info( __METHOD__ . ": detected $mime from ZIP archive\n" );
	} elseif ( preg_match( $openxmlRegex, $firstEntry ) ) {
		$mime = "application/x-opc+zip";
		# TODO: remove the block below, as soon as improveTypeFromExtension is used everywhere
		if ( $ext !== true && $ext !== false ) {
			if ( $this->isMatchingExtension( $ext, $mime ) ) {
				/* A known file extension for an OPC file,
				 * find the proper mime type for that file extension
				 */
				$mime = $this->guessTypesForExtension( $ext );
			} else {
				$mime = "application/zip";
			}
		}
		$this->logger->info( __METHOD__ .
			": detected an Open Packaging Conventions archive: $mime\n" );
	} elseif ( substr( $header, 0, 8 ) == "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" &&
		( $headerpos = strpos( $tail, "PK\x03\x04" ) ) !== false &&
		preg_match( $openxmlRegex, substr( $tail, $headerpos + 30 ) ) ) {
		// Compound document followed by an OPC archive: classify by the
		// bytes found at offset 512 of the header.
		if ( substr( $header, 512, 4 ) == "\xEC\xA5\xC1\x00" ) {
			$mime = "application/msword";
		}
		switch ( substr( $header, 512, 6 ) ) {
			case "\xEC\xA5\xC1\x00\x0E\x00":
			case "\xEC\xA5\xC1\x00\x1C\x00":
			case "\xEC\xA5\xC1\x00\x43\x00":
				$mime = "application/vnd.ms-powerpoint";
				break;
			case "\xFD\xFF\xFF\xFF\x10\x00":
			case "\xFD\xFF\xFF\xFF\x1F\x00":
			case "\xFD\xFF\xFF\xFF\x22\x00":
			case "\xFD\xFF\xFF\xFF\x23\x00":
			case "\xFD\xFF\xFF\xFF\x28\x00":
			case "\xFD\xFF\xFF\xFF\x29\x00":
			case "\xFD\xFF\xFF\xFF\x10\x02":
			case "\xFD\xFF\xFF\xFF\x1F\x02":
			case "\xFD\xFF\xFF\xFF\x22\x02":
			case "\xFD\xFF\xFF\xFF\x23\x02":
			case "\xFD\xFF\xFF\xFF\x28\x02":
			case "\xFD\xFF\xFF\xFF\x29\x02":
				$mime = "application/vnd.msexcel";
				break;
		}

		$this->logger->info( __METHOD__ .
			": detected a MS Office document with OPC trailer\n" );
	} else {
		$this->logger->info( __METHOD__ . ": unable to identify type of ZIP archive\n" );
	}
	return $mime;
}
959 
/**
 * Detect the MIME type of an MS compound binary file by handing the open
 * handle to MSCompoundFileReader.
 *
 * @param resource $handle An open seekable file handle.
 * @return string The MIME type reported by the reader, or 'unknown/unknown'
 *   when the file is invalid or its subtype is not recognised.
 */
function detectMicrosoftBinaryType( $handle ) {
	$info = MSCompoundFileReader::readHandle( $handle );

	// 'mime' is only consulted once the file is known to be valid.
	if ( $info['valid'] && $info['mime'] ) {
		return $info['mime'];
	}

	if ( !$info['valid'] ) {
		$this->logger->info( __METHOD__ . ': invalid file format' );
	} else {
		$this->logger->info( __METHOD__ . ": unrecognised document subtype" );
	}

	return 'unknown/unknown';
}
979 
/**
 * Fallback MIME detection: ask the configured detect callback (or
 * mime_content_type() when none is set), and optionally fall back to the
 * file extension.
 *
 * @param string $file Path of the file.
 * @param string|bool $ext Deprecated. True means "derive the extension from
 *   $file"; a string is used as the extension; a falsy value disables the
 *   extension fallback.
 * @return string The detected MIME type, or 'unknown/unknown'.
 */
private function detectMimeType( $file, $ext = true ) {
	if ( $ext ) {
		$this->logger->info( __METHOD__ .
			": WARNING: use of the \$ext parameter is deprecated. "
			. "Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
	}

	$detector = $this->detectCallback;
	$detected = $detector ? $detector( $file ) : mime_content_type( $file );

	if ( $detected ) {
		# normalize: strip charset etc., surrounding whitespace, and case
		$detected = strtolower( trim( preg_replace( '![;, ].*$!', '', $detected ) ) );

		// Anything mentioning "unknown" counts as a failed detection.
		if ( strpos( $detected, 'unknown' ) === false ) {
			$this->logger->info( __METHOD__ . ": magic mime type of $file: $detected\n" );
			return $detected;
		}
	}

	// If desired, look at extension as a fallback.
	if ( $ext === true ) {
		$dot = strrpos( $file, '.' );
		$ext = strtolower( $dot ? substr( $file, $dot + 1 ) : '' );
	}

	if ( $ext ) {
		if ( $this->isRecognizableExtension( $ext ) ) {
			$this->logger->info( __METHOD__ . ": refusing to guess mime type for .$ext file, "
				. "we should have recognized it\n" );
		} else {
			$fromExt = $this->guessTypesForExtension( $ext );
			if ( $fromExt ) {
				$this->logger->info( __METHOD__ . ": extension mime type of $file: $fromExt\n" );
				return $fromExt;
			}
		}
	}

	// Unknown type
	$this->logger->info( __METHOD__ . ": failed to guess mime type for $file!\n" );
	return 'unknown/unknown';
}
1049 
/**
 * Determine the media type code for a file, using its MIME type, name and
 * possibly its contents.
 *
 * Lookup order: special handling for Ogg containers, then the full MIME
 * type, then the file extension, then the major part of the MIME type.
 *
 * @param string|null $path Path of the file; used to guess the MIME type
 *   when none is given, to inspect Ogg files, and for its extension.
 * @param string|null $mime MIME type, if already known.
 * @return string One of the MEDIATYPE_xxx constants; MEDIATYPE_UNKNOWN when
 *   nothing matches.
 */
function getMediaType( $path = null, $mime = null ) {
	if ( !$mime && !$path ) {
		return MEDIATYPE_UNKNOWN;
	}

	// If MIME type is unknown, guess it
	if ( !$mime ) {
		$mime = $this->guessMimeType( $path, false );
	}

	// Special code for ogg - detect if it's video (theora),
	// else label it as sound.
	// The path may legitimately be null when only a MIME type was given.
	if ( $mime == 'application/ogg' && $path !== null && file_exists( $path ) ) {
		// Read a chunk of the file. Binary mode: this is not text, and no
		// platform-specific line-ending translation must be applied.
		$f = fopen( $path, 'rb' );
		if ( !$f ) {
			return MEDIATYPE_UNKNOWN;
		}
		$head = fread( $f, 256 );
		fclose( $f );

		// Drop the encoder name first so that it cannot be mistaken for the
		// "theora" codec marker below.
		$head = str_replace( 'ffmpeg2theora', '', strtolower( $head ) );

		// This is an UGLY HACK, file should be parsed correctly
		if ( strpos( $head, 'theora' ) !== false ) {
			return MEDIATYPE_VIDEO;
		}
		foreach ( [ 'vorbis', 'flac', 'speex', 'opus' ] as $audioCodec ) {
			if ( strpos( $head, $audioCodec ) !== false ) {
				return MEDIATYPE_AUDIO;
			}
		}
		return MEDIATYPE_MULTIMEDIA;
	}

	// Check for entry for full MIME type
	if ( $mime ) {
		$type = $this->findMediaType( $mime );
		if ( $type !== MEDIATYPE_UNKNOWN ) {
			return $type;
		}
	}

	// Check for entry for file extension
	if ( $path ) {
		$i = strrpos( $path, '.' );
		$e = strtolower( $i ? substr( $path, $i + 1 ) : '' );

		// TODO: look at multi-extension if this fails, parse from full path
		$type = $this->findMediaType( '.' . $e );
		if ( $type !== MEDIATYPE_UNKNOWN ) {
			return $type;
		}
	}

	// Check major MIME type
	if ( $mime ) {
		$i = strpos( $mime, '/' );
		if ( $i !== false ) {
			$major = substr( $mime, 0, $i );
			$type = $this->findMediaType( $major );
			if ( $type !== MEDIATYPE_UNKNOWN ) {
				return $type;
			}
		}
	}

	// Every successful lookup has returned above, so nothing matched.
	// Always report that with the constant rather than with null.
	return MEDIATYPE_UNKNOWN;
}
1144 
/**
 * Look up the media type of a MIME type or of a file extension.
 *
 * @param string $extMime A MIME type (or just its major part), or a file
 *   extension written with a leading dot.
 * @return string The first media type whose list contains one of the
 *   candidate MIME types, or MEDIATYPE_UNKNOWN.
 */
function findMediaType( $extMime ) {
	if ( strpos( $extMime, '.' ) === 0 ) {
		// If it's an extension, look up the MIME types
		$registered = $this->getTypesForExtension( substr( $extMime, 1 ) );
		if ( !$registered ) {
			return MEDIATYPE_UNKNOWN;
		}

		$candidates = explode( ' ', $registered );
	} else {
		// Normalize MIME type: an alias is replaced by its canonical type
		$candidates = [ $this->mimeTypeAliases[$extMime] ?? $extMime ];
	}

	foreach ( $candidates as $candidate ) {
		foreach ( $this->mediaTypes as $mediaType => $members ) {
			if ( in_array( $candidate, $members, true ) ) {
				return $mediaType;
			}
		}
	}

	return MEDIATYPE_UNKNOWN;
}
1183 
/**
 * Returns an array of media types (MEDIATYPE_xxx constants), i.e. the keys
 * of the media type table built by loadFiles().
 *
 * @return array
 */
public function getMediaTypes() {
	$table = $this->mediaTypes;
	return array_keys( $table );
}
1192 
/**
 * Get the MIME types that various versions of Internet Explorer would
 * detect from a chunk of the content. The work is delegated to the
 * IEContentAnalyzer instance.
 *
 * @param string $fileName
 * @param string $chunk
 * @param string $proposed
 * @return mixed Whatever IEContentAnalyzer::getRealMimesFromData() returns.
 */
public function getIEMimeTypes( $fileName, $chunk, $proposed ) {
	return $this->getIEContentAnalyzer()
		->getRealMimesFromData( $fileName, $chunk, $proposed );
}
1206 
/**
 * Get the IEContentAnalyzer instance, creating it on first use.
 *
 * @return IEContentAnalyzer
 */
protected function getIEContentAnalyzer() {
	if ( $this->IEAnalyzer === null ) {
		$this->IEAnalyzer = new IEContentAnalyzer();
	}

	return $this->IEAnalyzer;
}
1218 }
guessMimeType( $file, $ext=true)
MIME type detection.
setLogger(LoggerInterface $logger)
string $xmlTypes
either a plain
Definition: hooks.txt:2043
const MEDIATYPE_TEXT
Definition: defines.php:41
callable $initCallback
detectZipType( $header, $tail=null, $ext=false)
Detect application-specific file type of a given ZIP file from its header data.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Definition: router.php:42
div flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException' returning false will NOT prevent logging $e
Definition: hooks.txt:2159
getIEMimeTypes( $fileName, $chunk, $proposed)
Get the MIME types that various versions of Internet Explorer would detect from a chunk of the conten...
const MEDIATYPE_MULTIMEDIA
Definition: defines.php:37
addExtraTypes( $types)
Adds to the list mapping MIME to file extensions.
isPHPImageType( $mime)
Returns true if the MIME type is known to represent an image format supported by the PHP GD library...
IEContentAnalyzer $IEAnalyzer
__construct(array $params)
const MEDIATYPE_UNKNOWN
Definition: defines.php:26
This list may contain false positives That usually means there is additional text with links below the first Each row contains links to the first and second as well as the first line of the second redirect text
getMediaTypes()
Returns an array of media types (MEDIATYPE_xxx constants)
array $mediaTypes
Mapping of media types to arrays of MIME types.
static readHandle( $fileHandle)
Read from an open seekable handle.
improveTypeFromExtension( $mime, $ext)
Improves a MIME type using the file extension.
const MEDIATYPE_VIDEO
Definition: defines.php:35
callable $detectCallback
isMatchingExtension( $extension, $mime)
Tests if the extension matches the given MIME type.
callable $extCallback
This class simulates Microsoft Internet Explorer&#39;s terribly broken and insecure MIME type detection a...
Using a hook running we can avoid having all this option specific stuff in our mainline code Using the function We ve cleaned up the code here by removing clumps of infrequently used code and moving them off somewhere else It s much easier for someone working with this code to see what s _really_ going and make changes or fix bugs In we can take all the code that deals with the little used title reversing etc
Definition: hooks.txt:91
static $wellKnownInfo
Defines a set of well-known MIME info entries. This is used as a fallback to mime.info files...
string $extraTypes
Extra MIME types, set for example by media handling extensions.
getMediaType( $path=null, $mime=null)
Determine the media type code for a file, using its MIME type, name and possibly its contents...
$params
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
getExtensionsForType( $mime)
Returns a list of file extensions for a given MIME type as a space separated string or null if the MI...
$header
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
LoggerInterface $logger
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
string $infoFile
$lines
Definition: router.php:61
addExtraInfo( $info)
Adds to the list mapping MIME to media type.
const MEDIATYPE_AUDIO
Definition: defines.php:32
array $mExtToMime
Map of file extensions to MIME types (as a space separated list)
array $mimeTypeAliases
Map of MIME type aliases.
array $mimetoExt
Map of MIME types to file extensions (as a space separated list)
string $typeFile
if(!is_readable( $file)) $ext
Definition: router.php:48
doGuessMimeType( $file, $ext)
Guess the MIME type from the file contents.
$f
Definition: router.php:79
getIEContentAnalyzer()
Get a cached instance of IEContentAnalyzer.
callable $guessCallback
guessTypesForExtension( $ext)
Returns a single MIME type for a given file extension or null if unknown.
static $wellKnownTypes
Defines a set of well-known MIME types. This is used as a fallback to mime.types files.
isRecognizableExtension( $extension)
Returns true if the extension represents a type which can be reliably detected from its content...
detectMimeType( $file, $ext=true)
Internal MIME type detection.
findMediaType( $extMime)
Returns a media code matching the given MIME type or file extension.
string $extraInfo
Extra MIME info, set for example by media handling extensions.
$matches
getTypesForExtension( $ext)
Returns a list of MIME types for a given file extension as a space separated string or null if the ex...
detectMicrosoftBinaryType( $handle)
Detect the type of a Microsoft Compound Binary a.k.a.