22use Psr\Log\LoggerAwareInterface;
23use Psr\Log\LoggerInterface;
24use Psr\Log\NullLogger;
/**
 * MIME type analyzer: maps between file extensions, MIME types and media
 * types, and guesses a file's MIME type from its content.
 *
 * NOTE(review): the constructor also assigns $typeFile, $infoFile, $xmlTypes
 * and $logger; their declarations are not visible in this listing.
 */
33class MimeAnalyzer
implements LoggerAwareInterface {
// Optional callable; loadFiles() invokes it with this object before parsing.
41 protected $initCallback;
// Optional callable; detectMimeType() invokes it with the file path.
43 protected $detectCallback;
// Optional callable; doGuessMimeType() invokes it with
// ( $this, $head, $tail, $file, $mime ).
45 protected $guessCallback;
// Optional callable read by improveTypeFromExtension().
47 protected $extCallback;
// Map of media type name => list of MIME types; filled by loadFiles()
// and extended by parseMimeInfo().
49 protected $mediaTypes =
null;
// Map of alias MIME type => main MIME type; filled by loadFiles()
// and extended by parseMimeInfo().
51 protected $mimeTypeAliases =
null;
// Map of MIME type => list of file extensions.
53 protected $mimeToExts = [];
// Map of file extension => list of MIME types.
55 protected $extToMimes = [];
// Public map of extension => space-separated MIME types; loadFiles()
// iterates over it and splits each value on spaces.
58 public $mExtToMime = [];
// Cached content analyzer; getIEContentAnalyzer() checks it against null.
61 protected $IEAnalyzer;
// Raw type-definition text accumulated by addExtraTypes().
64 private $extraTypes =
'';
// Raw info-definition text accumulated by addExtraInfo().
66 private $extraInfo =
'';
// Sentinel for 'typeFile' / 'infoFile': loadFiles() then uses the MimeMap
// tables instead of reading a file.
72 public const USE_INTERNAL =
'internal';
/**
 * Store the configuration passed in $params.
 *
 * The keys 'typeFile', 'infoFile' and 'xmlTypes' are read without a
 * fallback, so they must be present. 'initCallback', 'detectCallback',
 * 'guessCallback' and 'extCallback' default to null, and 'logger' defaults
 * to a NullLogger.
 *
 * NOTE(review): any statements after the logger assignment are not visible
 * in this listing.
 *
 * @param array $params Configuration map, see above
 */
89 public function __construct( array $params ) {
90 $this->typeFile = $params[
'typeFile'];
91 $this->infoFile = $params[
'infoFile'];
92 $this->xmlTypes = $params[
'xmlTypes'];
93 $this->initCallback = $params[
'initCallback'] ??
null;
94 $this->detectCallback = $params[
'detectCallback'] ??
null;
95 $this->guessCallback = $params[
'guessCallback'] ??
null;
96 $this->extCallback = $params[
'extCallback'] ??
null;
97 $this->logger = $params[
'logger'] ??
new NullLogger();
/**
 * Populate the extension/MIME and media-type tables.
 *
 * Visible steps: invoke initCallback (if set) with this object; seed
 * mimeToExts from MimeMap or MimeMapMinimal and parse any extra raw type
 * definitions; walk mimeToExts and mExtToMime; then seed mimeTypeAliases and
 * mediaTypes the same way and parse any extra raw info definitions.
 *
 * NOTE(review): the else arms and loop bodies are missing from this listing.
 * Presumably the MimeMap tables are used when the file setting equals
 * USE_INTERNAL and the MimeMapMinimal tables otherwise - confirm against the
 * full source.
 */
102 protected function loadFiles() {
103 # Allow media handling extensions adding MIME-types and MIME-info
104 if ( $this->initCallback ) {
105 call_user_func( $this->initCallback, $this );
108 $rawTypes = $this->extraTypes;
109 if ( $this->typeFile === self::USE_INTERNAL ) {
110 $this->mimeToExts = MimeMap::MIME_EXTENSIONS;
112 $this->mimeToExts = MimeMapMinimal::MIME_EXTENSIONS;
113 if ( $this->typeFile ) {
114 $rawTypes = file_get_contents( $this->typeFile ) .
"\n" . $this->extraTypes;
118 $this->parseMimeTypes( $rawTypes );
122 foreach ( $this->mimeToExts as
$mime => $exts ) {
123 foreach ( $exts as
$ext ) {
130 foreach ( $this->mExtToMime as
$ext => $mimes ) {
131 foreach ( explode(
' ', $mimes ) as
$mime ) {
136 $rawInfo = $this->extraInfo;
137 if ( $this->infoFile === self::USE_INTERNAL ) {
138 $this->mimeTypeAliases = MimeMap::MIME_TYPE_ALIASES;
139 $this->mediaTypes = MimeMap::MEDIA_TYPES;
141 $this->mimeTypeAliases = MimeMapMinimal::MIME_TYPE_ALIASES;
142 $this->mediaTypes = MimeMapMinimal::MEDIA_TYPES;
143 if ( $this->infoFile ) {
144 $rawInfo = file_get_contents( $this->infoFile ) .
"\n" . $this->extraInfo;
148 $this->parseMimeInfo( $rawInfo );
/**
 * Parse raw "mime ext ext ..." definitions into mimeToExts.
 *
 * The text has its line endings normalised to "\n" and tabs turned into
 * spaces, then is split into lines. The visible checks test for a leading
 * '#', a missing space and an empty extension part; the actions taken on
 * those checks are not visible in this listing. Each remaining line is
 * lower-cased and split on whitespace: the first token is the MIME type and
 * the rest are extensions, merged without duplicates into any extensions
 * already recorded for that type.
 *
 * @param string $rawMimeTypes Raw definition text
 */
152 protected function parseMimeTypes( $rawMimeTypes ) {
153 $rawMimeTypes = str_replace( [
"\r\n",
"\n\r",
"\n\n",
"\r\r",
"\r" ],
"\n", $rawMimeTypes );
154 $rawMimeTypes = str_replace(
"\t",
" ", $rawMimeTypes );
156 $lines = explode(
"\n", $rawMimeTypes );
162 if ( strpos(
$s,
'#' ) === 0 ) {
166 $s = strtolower(
$s );
167 $i = strpos(
$s,
' ' );
169 if ( $i ===
false ) {
174 $ext = trim( substr(
$s, $i + 1 ) );
176 if ( empty(
$ext ) ) {
180 $tokens = preg_split(
'/\s+/',
$s, -1, PREG_SPLIT_NO_EMPTY );
181 if ( count( $tokens ) > 1 ) {
182 $mime = array_shift( $tokens );
183 $this->mimeToExts[
$mime] = array_values( array_unique(
184 array_merge( $this->mimeToExts[
$mime] ?? [], $tokens ) ) );
/**
 * Parse raw MIME info definitions into mediaTypes and mimeTypeAliases.
 *
 * The text is normalised the same way as in parseMimeTypes() and split into
 * lines. A "[NAME]" tag on a line is removed and its upper-cased name becomes
 * the media type. The remaining whitespace-separated tokens are MIME types
 * appended to mediaTypes for that media type. When a line has more than one
 * token, the tokens from index 1 onwards are recorded in mimeTypeAliases.
 *
 * NOTE(review): the assignments of $main and of $mime inside the final loop
 * are not visible in this listing; presumably $main is the first token.
 *
 * @param string $rawMimeInfo Raw definition text
 */
189 protected function parseMimeInfo( $rawMimeInfo ) {
190 $rawMimeInfo = str_replace( [
"\r\n",
"\n\r",
"\n\n",
"\r\r",
"\r" ],
"\n", $rawMimeInfo );
191 $rawMimeInfo = str_replace(
"\t",
" ", $rawMimeInfo );
193 $lines = explode(
"\n", $rawMimeInfo );
199 if ( strpos(
$s,
'#' ) === 0 ) {
203 $s = strtolower(
$s );
204 $i = strpos(
$s,
' ' );
206 if ( $i ===
false ) {
210 # print "processing MIME INFO line $s<br>";
213 if ( preg_match(
'!\[\s*(\w+)\s*\]!',
$s, $match ) ) {
214 $s = preg_replace(
'!\[\s*(\w+)\s*\]!',
'',
$s );
215 $mtype = trim( strtoupper( $match[1] ) );
220 $m = preg_split(
'/\s+/',
$s, -1, PREG_SPLIT_NO_EMPTY );
222 if ( !isset( $this->mediaTypes[$mtype] ) ) {
223 $this->mediaTypes[$mtype] = [];
226 foreach ( $m as
$mime ) {
228 if ( empty(
$mime ) ) {
232 $this->mediaTypes[$mtype][] =
$mime;
235 if ( count( $m ) > 1 ) {
237 $mCount = count( $m );
238 for ( $i = 1; $i < $mCount; $i += 1 ) {
240 $this->mimeTypeAliases[
$mime] = $main;
/**
 * Replace the logger used for diagnostic messages.
 *
 * @param LoggerInterface $logger Logger that receives all later messages
 */
public function setLogger( LoggerInterface $logger ) {
	$this->logger = $logger;
}
/**
 * Queue additional raw type definitions.
 *
 * The text is appended to the accumulated extra types, preceded by a newline
 * so it always starts on its own line.
 *
 * @param string $types Raw definition text to append
 */
public function addExtraTypes( $types ) {
	$this->extraTypes = $this->extraTypes . "\n" . $types;
}
/**
 * Queue additional raw MIME info definitions.
 *
 * The text is appended to the accumulated extra info, preceded by a newline
 * so it always starts on its own line.
 *
 * @param string $info Raw definition text to append
 */
public function addExtraInfo( $info ) {
	$this->extraInfo = $this->extraInfo . "\n" . $info;
}
/**
 * Get the extensions known for a MIME type as one space-separated string.
 *
 * @param string $mime MIME type to look up
 * @return string|null Space-separated extensions, or null when
 *  getExtensionsFromMimeType() returns nothing
 */
public function getExtensionsForType( $mime ) {
	$extensions = $this->getExtensionsFromMimeType( $mime );
	if ( !$extensions ) {
		return null;
	}

	return implode( ' ', $extensions );
}
/**
 * Get the list of extensions recorded for a MIME type.
 *
 * When the type has no entry of its own in mimeToExts but is a known alias,
 * the alias branch is taken. Its body is not visible in this listing;
 * presumably it substitutes the main type - confirm.
 *
 * @param string $mime MIME type to look up
 * @return array Extensions for the type, or an empty array when none are known
 */
293 public function getExtensionsFromMimeType(
$mime ) {
295 if ( !isset( $this->mimeToExts[
$mime] ) && isset( $this->mimeTypeAliases[
$mime] ) ) {
298 return $this->mimeToExts[
$mime] ?? [];
/**
 * Get the list of MIME types recorded for a file extension.
 *
 * NOTE(review): one line of this method is missing from the listing, so any
 * normalisation applied to $ext before the lookup cannot be confirmed here.
 *
 * @param string $ext File extension
 * @return array MIME types for the extension, or an empty array when none are known
 */
310 public function getMimeTypesFromExtension(
$ext ) {
312 return $this->extToMimes[
$ext] ?? [];
/**
 * Get the first MIME type recorded for a file extension.
 *
 * @param string $ext File extension
 * @return string|null First entry returned by getMimeTypesFromExtension(),
 *  or null when there is none
 */
public function getMimeTypeFromExtensionOrNull( $ext ) {
	$candidates = $this->getMimeTypesFromExtension( $ext );
	if ( isset( $candidates[0] ) ) {
		return $candidates[0];
	}

	return null;
}
/**
 * Thin wrapper that forwards to getMimeTypeFromExtensionOrNull().
 *
 * @param string $ext File extension
 * @return string|null Whatever getMimeTypeFromExtensionOrNull() returns
 */
public function guessTypesForExtension( $ext ) {
	$firstType = $this->getMimeTypeFromExtensionOrNull( $ext );

	return $firstType;
}
/**
 * Get the MIME types known for an extension as one space-separated string.
 *
 * @param string $ext File extension
 * @return string|null Space-separated MIME types, or null when
 *  getMimeTypesFromExtension() returns nothing
 */
public function getTypesForExtension( $ext ) {
	$mimeTypes = $this->getMimeTypesFromExtension( $ext );
	if ( !$mimeTypes ) {
		return null;
	}

	return implode( ' ', $mimeTypes );
}
/**
 * Get the first extension recorded for a MIME type.
 *
 * @param string $mime MIME type to look up
 * @return string|null First entry returned by getExtensionsFromMimeType(),
 *  or null when there is none
 */
public function getExtensionFromMimeTypeOrNull( $mime ) {
	$extensions = $this->getExtensionsFromMimeType( $mime );
	if ( isset( $extensions[0] ) ) {
		return $extensions[0];
	}

	return null;
}
/**
 * Check whether an extension is one of those recorded for a MIME type.
 *
 * The extension is lower-cased before the (non-strict) in_array() lookup.
 *
 * NOTE(review): several lines between the lookup and the return are missing
 * from this listing, so how an empty extension list is handled cannot be
 * confirmed here.
 *
 * @param string $extension File extension
 * @param string $mime MIME type to test against
 * @return bool True when the lower-cased extension is in the list for $mime
 */
375 public function isMatchingExtension( $extension,
$mime ) {
376 $exts = $this->getExtensionsFromMimeType(
$mime );
382 return in_array( strtolower( $extension ), $exts );
/**
 * Check whether a MIME type appears in a fixed list of image types.
 *
 * NOTE(review): the opening of the $types list and at least one entry are
 * missing from this listing. 'image/xbm' appears twice among the visible
 * entries.
 *
 * @param string $mime MIME type to test
 * @return bool True when $mime is in the list
 */
393 public function isPHPImageType(
$mime ) {
396 'image/gif',
'image/jpeg',
'image/png',
397 'image/x-bmp',
'image/xbm',
'image/tiff',
398 'image/jp2',
'image/jpeg2000',
'image/iff',
399 'image/xbm',
'image/x-xbitmap',
400 'image/vnd.wap.wbmp',
'image/vnd.xiff',
402 'application/x-shockwave-flash',
405 return in_array(
$mime, $types );
/**
 * Check whether an extension appears in a fixed list of extensions.
 *
 * improveTypeFromExtension() and detectMimeType() use this to refuse guessing
 * a MIME type from the extension for these files. The extension is
 * lower-cased before the lookup.
 *
 * NOTE(review): the opening of the $types list and some of its lines are
 * missing from this listing.
 *
 * @param string $extension File extension
 * @return bool True when the lower-cased extension is in the list
 */
420 public function isRecognizableExtension( $extension ) {
423 'gif',
'jpeg',
'jpg',
'png',
'swf',
'psd',
424 'bmp',
'tiff',
'tif',
'jpc',
'jp2',
425 'jpx',
'jb2',
'swc',
'iff',
'wbmp',
429 'djvu',
'ogx',
'ogg',
'ogv',
'oga',
'spx',
'opus',
430 'mid',
'pdf',
'wmf',
'xcf',
'webm',
'mkv',
'mka',
439 return in_array( strtolower( $extension ), $types );
/**
 * Refine a detected MIME type using the file extension.
 *
 * Visible cases:
 * - 'unknown/unknown': a log entry is written for extensions accepted by
 *   isRecognizableExtension(); the type is looked up from the extension.
 * - 'application/x-opc+zip': the type is looked up from the extension when it
 *   matches that type, otherwise it falls back to 'application/zip'.
 * - 'text/plain' whose extension maps to MEDIATYPE_TEXT: the type is looked
 *   up from the extension.
 * The extCallback is then read and the alias table consulted.
 *
 * NOTE(review): else arms, the callback invocation and the return statement
 * are missing from this listing.
 *
 * @param string $mime MIME type detected so far
 * @param string $ext File extension, without the leading dot
 */
456 public function improveTypeFromExtension(
$mime,
$ext ) {
457 if (
$mime ===
'unknown/unknown' ) {
458 if ( $this->isRecognizableExtension(
$ext ) ) {
459 $this->logger->info( __METHOD__ .
': refusing to guess mime type for .' .
460 "$ext file, we should have recognized it\n" );
464 $mime = $this->getMimeTypeFromExtensionOrNull(
$ext );
466 } elseif (
$mime ===
'application/x-opc+zip' ) {
467 if ( $this->isMatchingExtension(
$ext,
$mime ) ) {
470 $mime = $this->getMimeTypeFromExtensionOrNull(
$ext );
472 $this->logger->info( __METHOD__ .
473 ": refusing to guess better type for $mime file, " .
474 ".$ext is not a known OPC extension.\n" );
475 $mime =
'application/zip';
477 } elseif (
$mime ===
'text/plain' && $this->findMediaType(
".$ext" ) ===
MEDIATYPE_TEXT ) {
482 $mime = $this->getMimeTypeFromExtensionOrNull(
$ext );
485 # Media handling extensions can improve the MIME detected
486 $callback = $this->extCallback;
491 if ( isset( $this->mimeTypeAliases[
$mime] ) ) {
495 $this->logger->info( __METHOD__ .
": improved mime type for .$ext: $mime\n" );
/**
 * Guess the MIME type of a file.
 *
 * Visible behaviour: logs a deprecation warning about the $ext parameter,
 * logs when internal detection failed, consults the alias table and logs the
 * guessed type.
 *
 * NOTE(review): the conditions guarding the log calls, the detection calls
 * and the return statement are missing from this listing.
 *
 * @param string $file Path to the file
 * @param mixed $ext Deprecated; defaults to true
 */
513 public function guessMimeType(
$file,
$ext =
true ) {
515 $this->logger->info( __METHOD__ .
516 ": WARNING: use of the \$ext parameter is deprecated. " .
517 "Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
523 $this->logger->info( __METHOD__ .
524 ": internal type detection failed for $file (.$ext)...\n" );
528 if ( isset( $this->mimeTypeAliases[
$mime] ) ) {
532 $this->logger->info( __METHOD__ .
": guessed mime type of $file: $mime\n" );
/**
 * Guess a file's MIME type by inspecting its content.
 *
 * The file is opened in binary mode; the first 1024 bytes ($head) and up to
 * the last 65558 bytes ($tail) are read. Visible checks, in order:
 * - fixed magic prefixes from the $headers table;
 * - an EBML header, distinguishing matroska and webm doctypes;
 * - RIFF/WEBPVP8 (logged as image/webp);
 * - the MS CFB (OLE) signature, delegated to detectMicrosoftBinaryType();
 * - PHP open tags, including UTF-16 style variants;
 * - well-formed XML, mapped through xmlTypes with 'application/xml' as fallback;
 * - "#!" script headers in several encodings;
 * - a ZIP end-of-central-directory record in $tail, delegated to detectZipType();
 * - ASCII and binary STL ('application/sla');
 * - getimagesize();
 * - the guessCallback, called with ( $this, $head, $tail, $file, $mime ).
 *
 * NOTE(review): many lines (closing braces, some returns, the XML parser
 * construction, any fclose() call) are missing from this listing, so the
 * exact control flow cannot be confirmed from here.
 *
 * @param string $file Path to the file
 * @param mixed $ext Passed through to detectZipType()
 * @throws UnexpectedValueException When seeking to the tail of the file fails
 */
546 private function doGuessMimeType(
$file,
$ext ) {
548 Wikimedia\suppressWarnings();
549 $f = fopen(
$file,
'rb' );
550 Wikimedia\restoreWarnings();
553 return 'unknown/unknown';
556 $fsize = filesize(
$file );
557 if ( $fsize ===
false ) {
558 return 'unknown/unknown';
561 $head = fread( $f, 1024 );
562 $tailLength = min( 65558, $fsize );
563 if ( fseek( $f, -1 * $tailLength, SEEK_END ) === -1 ) {
564 throw new UnexpectedValueException(
565 "Seeking $tailLength bytes from EOF failed in " . __METHOD__ );
567 $tail = $tailLength ? fread( $f, $tailLength ) :
'';
569 $this->logger->info( __METHOD__ .
570 ": analyzing head and tail of $file for magic numbers.\n" );
575 'MThd' =>
'audio/midi',
576 'OggS' =>
'application/ogg',
577 'ID3' =>
'audio/mpeg',
578 "\xff\xfb" =>
'audio/mpeg',
579 "\xff\xf3" =>
'audio/mpeg',
580 "\xff\xe3" =>
'audio/mpeg',
584 "\x01\x00\x09\x00" =>
'application/x-msmetafile',
585 "\xd7\xcd\xc6\x9a" =>
'application/x-msmetafile',
586 '%PDF' =>
'application/pdf',
587 'gimp xcf' =>
'image/x-xcf',
590 'MZ' =>
'application/octet-stream',
591 "\xca\xfe\xba\xbe" =>
'application/octet-stream',
592 "\x7fELF" =>
'application/octet-stream',
595 foreach ( $headers as $magic => $candidate ) {
596 if ( strncmp( $head, $magic, strlen( $magic ) ) == 0 ) {
597 $this->logger->info( __METHOD__ .
598 ": magic header in $file recognized as $candidate\n" );
604 if ( strncmp( $head, pack(
"C4", 0x1a, 0x45, 0xdf, 0xa3 ), 4 ) == 0 ) {
605 $doctype = strpos( $head,
"\x42\x82" );
608 $data = substr( $head, $doctype + 3, 8 );
609 if ( strncmp( $data,
"matroska", 8 ) == 0 ) {
610 $this->logger->info( __METHOD__ .
": recognized file as video/x-matroska\n" );
611 return "video/x-matroska";
612 } elseif ( strncmp( $data,
"webm", 4 ) == 0 ) {
614 $videotrack = strpos( $head,
"\x86\x85V_VP" );
618 $this->logger->info( __METHOD__ .
": recognized file as video/webm\n" );
622 $this->logger->info( __METHOD__ .
": recognized file as audio/webm\n" );
626 $this->logger->info( __METHOD__ .
": unknown EBML file\n" );
627 return "unknown/unknown";
631 if ( strncmp( $head,
"RIFF", 4 ) == 0 &&
632 strncmp( substr( $head, 8, 7 ),
"WEBPVP8", 7 ) == 0
634 $this->logger->info( __METHOD__ .
": recognized file as image/webp\n" );
639 if ( strncmp( $head,
"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8 ) == 0 ) {
640 $this->logger->info( __METHOD__ .
': recognized MS CFB (OLE) file' );
641 return $this->detectMicrosoftBinaryType( $f );
656 if ( ( strpos( $head,
'<?php' ) !==
false ) ||
657 ( strpos( $head,
"<\x00?\x00p\x00h\x00p" ) !==
false ) ||
658 ( strpos( $head,
"<\x00?\x00 " ) !==
false ) ||
659 ( strpos( $head,
"<\x00?\x00\n" ) !==
false ) ||
660 ( strpos( $head,
"<\x00?\x00\t" ) !==
false ) ||
661 ( strpos( $head,
"<\x00?\x00=" ) !==
false )
663 $this->logger->info( __METHOD__ .
": recognized $file as application/x-php\n" );
664 return 'application/x-php';
670 Wikimedia\suppressWarnings();
672 Wikimedia\restoreWarnings();
673 if ( $xml->wellFormed ) {
674 $xmlTypes = $this->xmlTypes;
675 return $xmlTypes[$xml->getRootElement()] ??
'application/xml';
684 if ( substr( $head, 0, 2 ) ==
"#!" ) {
685 $script_type =
"ASCII";
686 } elseif ( substr( $head, 0, 5 ) ==
"\xef\xbb\xbf#!" ) {
687 $script_type =
"UTF-8";
// NOTE(review): the two UTF-16 checks below compare a 7-byte prefix of $head
// against literals that are 6 bytes ("\xfe\xff\x00#\x00!") and 5 bytes
// ("\xff\xfe#\x00!") long, so they look unable to match once $head holds 7 or
// more bytes - confirm the intended lengths and literals.
688 } elseif ( substr( $head, 0, 7 ) ==
"\xfe\xff\x00#\x00!" ) {
689 $script_type =
"UTF-16BE";
690 } elseif ( substr( $head, 0, 7 ) ==
"\xff\xfe#\x00!" ) {
691 $script_type =
"UTF-16LE";
694 if ( $script_type ) {
695 if ( $script_type !==
"UTF-8" && $script_type !==
"ASCII" ) {
697 $pack = [
'UTF-16BE' =>
'n*',
'UTF-16LE' =>
'v*' ];
698 $chars = unpack( $pack[$script_type], substr( $head, 2 ) );
700 foreach ( $chars as $codepoint ) {
701 if ( $codepoint < 128 ) {
702 $head .= chr( $codepoint );
711 if ( preg_match(
'%/?([^\s]+/)(\w+)%', $head, $match ) ) {
712 $mime =
"application/x-{$match[2]}";
713 $this->logger->info( __METHOD__ .
": shell script recognized as $mime\n" );
719 $eocdrPos = strpos( $tail,
"PK\x05\x06" );
720 if ( $eocdrPos !==
false && $eocdrPos <= strlen( $tail ) - 22 ) {
721 $this->logger->info( __METHOD__ .
": ZIP signature present in $file\n" );
723 $commentLength = unpack(
"n", substr( $tail, $eocdrPos + 20 ) )[1];
724 if ( $eocdrPos + 22 + $commentLength !== strlen( $tail ) ) {
725 $this->logger->info( __METHOD__ .
": ZIP EOCDR not at end. Not a ZIP file." );
727 return $this->detectZipType( $head, $tail,
$ext );
734 stripos( $head,
'SOLID ' ) === 0 &&
735 preg_match(
'/\RENDSOLID .*$/i', $tail ) ) {
737 return 'application/sla';
738 } elseif ( $fsize > 84 ) {
740 $triangles = substr( $head, 80, 4 );
741 $triangles = unpack(
'V', $triangles );
742 $triangles = reset( $triangles );
743 if ( $triangles !==
false && $fsize === 84 + ( $triangles * 50 ) ) {
744 return 'application/sla';
748 Wikimedia\suppressWarnings();
749 $gis = getimagesize(
$file );
750 Wikimedia\restoreWarnings();
752 if ( $gis && isset( $gis[
'mime'] ) ) {
753 $mime = $gis[
'mime'];
754 $this->logger->info( __METHOD__ .
": getimagesize detected $file as $mime\n" );
758 # Media handling extensions can guess the MIME by content
759 # It's intentionally here so that if core is wrong about a type (false positive),
760 # people will hopefully nag and submit patches :)
762 # Some strings by reference for performance - assuming well-behaved hooks
763 $callback = $this->guessCallback;
765 $callback( $this, $head, $tail,
$file,
$mime );
/**
 * Work out the specific MIME type of a ZIP-based file.
 *
 * The result starts as 'application/zip'. Visible refinements:
 * - an OpenDocument "mimetype" entry, matched against a list of OpenDocument
 *   subtypes;
 * - a "[Content_Types].xml" entry 30 bytes into $header, giving
 *   'application/x-opc+zip', optionally narrowed by the deprecated $ext;
 * - an MS CFB header with an OPC trailer in $tail, mapped to msword,
 *   ms-powerpoint or msexcel from the bytes at offset 512.
 *
 * NOTE(review): the OpenDocument type list, several branches and the return
 * statement are only partly visible in this listing.
 *
 * @param string $header Leading bytes of the file
 * @param string|null $tail Trailing bytes of the file
 * @param mixed $ext Deprecated; a warning is logged when it is truthy
 */
784 public function detectZipType(
$header, $tail =
null,
$ext =
false ) {
785 if (
$ext ) { # TODO:
remove $ext param
786 $this->logger->info( __METHOD__ .
787 ": WARNING: use of the \$ext parameter is deprecated. " .
788 "Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
791 $mime =
'application/zip';
793 # In OASIS Open Document Format v1.2, Database front end document
794 # has a recommended MIME type of:
795 # application/vnd.oasis.opendocument.base
796 # Despite the type registered at the IANA being 'database' which is
797 # supposed to be normative.
809 'presentation-template',
811 'spreadsheet-template',
822 $types =
'(?:' . implode(
'|', $opendocTypes ) .
')';
823 $opendocRegex =
"/^mimetype(application\/vnd\.oasis\.opendocument\.$types)/";
825 $openxmlRegex =
"/^\[Content_Types\].xml/";
829 $this->logger->info( __METHOD__ .
": detected $mime from ZIP archive\n" );
830 } elseif ( preg_match( $openxmlRegex, substr(
$header, 30 ) ) ) {
831 $mime =
"application/x-opc+zip";
832 # TODO: remove the block below, as soon as improveTypeFromExtension is used everywhere
833 if (
$ext !==
true &&
$ext !==
false ) {
838 if ( $this->isMatchingExtension(
$ext,
$mime ) ) {
842 $mime = $this->getMimeTypeFromExtensionOrNull(
$ext );
844 $mime =
"application/zip";
847 $this->logger->info( __METHOD__ .
848 ": detected an Open Packaging Conventions archive: $mime\n" );
849 } elseif ( substr(
$header, 0, 8 ) ==
"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" &&
850 ( $headerpos = strpos( $tail,
"PK\x03\x04" ) ) !==
false &&
851 preg_match( $openxmlRegex, substr( $tail, $headerpos + 30 ) ) ) {
852 if ( substr(
$header, 512, 4 ) ==
"\xEC\xA5\xC1\x00" ) {
853 $mime =
"application/msword";
855 switch ( substr(
$header, 512, 6 ) ) {
856 case "\xEC\xA5\xC1\x00\x0E\x00":
857 case "\xEC\xA5\xC1\x00\x1C\x00":
858 case "\xEC\xA5\xC1\x00\x43\x00":
859 $mime =
"application/vnd.ms-powerpoint";
861 case "\xFD\xFF\xFF\xFF\x10\x00":
862 case "\xFD\xFF\xFF\xFF\x1F\x00":
863 case "\xFD\xFF\xFF\xFF\x22\x00":
864 case "\xFD\xFF\xFF\xFF\x23\x00":
865 case "\xFD\xFF\xFF\xFF\x28\x00":
866 case "\xFD\xFF\xFF\xFF\x29\x00":
867 case "\xFD\xFF\xFF\xFF\x10\x02":
868 case "\xFD\xFF\xFF\xFF\x1F\x02":
869 case "\xFD\xFF\xFF\xFF\x22\x02":
870 case "\xFD\xFF\xFF\xFF\x23\x02":
871 case "\xFD\xFF\xFF\xFF\x28\x02":
872 case "\xFD\xFF\xFF\xFF\x29\x02":
873 $mime =
"application/vnd.msexcel";
877 $this->logger->info( __METHOD__ .
878 ": detected a MS Office document with OPC trailer\n" );
880 $this->logger->info( __METHOD__ .
": unable to identify type of ZIP archive\n" );
/**
 * Determine the MIME type of an MS CFB (OLE) file from an open handle.
 *
 * Returns 'unknown/unknown' (after logging) when the parsed $info is not
 * marked valid or carries no MIME type; otherwise returns $info['mime'].
 *
 * NOTE(review): the statement that produces $info is missing from this
 * listing.
 *
 * @param resource $handle Open file handle, as passed by doGuessMimeType()
 * @return string Detected MIME type or 'unknown/unknown'
 */
892 private function detectMicrosoftBinaryType( $handle ) {
894 if ( !$info[
'valid'] ) {
895 $this->logger->info( __METHOD__ .
': invalid file format' );
896 return 'unknown/unknown';
898 if ( !$info[
'mime'] ) {
899 $this->logger->info( __METHOD__ .
": unrecognised document subtype" );
900 return 'unknown/unknown';
902 return $info[
'mime'];
/**
 * Detect a file's MIME type using external means.
 *
 * Visible steps: log a deprecation warning about $ext; obtain a type from the
 * detectCallback or from mime_content_type(); strip everything from the first
 * ';', ',' or space and lower-case the result; test it for 'unknown'. When
 * $ext is exactly true, the extension is taken from the text after the last
 * dot in $file. A type is then looked up from the extension, with a log entry
 * for extensions accepted by isRecognizableExtension(). The last visible
 * return is 'unknown/unknown'.
 *
 * NOTE(review): the conditions choosing between these steps and the other
 * return statements are missing from this listing.
 *
 * @param string $file Path to the file
 * @param mixed $ext Deprecated; true means derive the extension from $file
 */
922 private function detectMimeType(
$file,
$ext =
true ) {
925 $this->logger->info( __METHOD__ .
926 ": WARNING: use of the \$ext parameter is deprecated. "
927 .
"Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
930 $callback = $this->detectCallback;
933 $m = $callback(
$file );
935 $m = mime_content_type(
$file );
940 $m = preg_replace(
'![;, ].*$!',
'', $m ); # strip charset, etc
942 $m = strtolower( $m );
944 if ( strpos( $m,
'unknown' ) !==
false ) {
947 $this->logger->info( __METHOD__ .
": magic mime type of $file: $m\n" );
953 if (
$ext ===
true ) {
954 $i = strrpos(
$file,
'.' );
955 $ext = strtolower( $i ? substr(
$file, $i + 1 ) :
'' );
958 if ( $this->isRecognizableExtension(
$ext ) ) {
959 $this->logger->info( __METHOD__ .
": refusing to guess mime type for .$ext file, "
960 .
"we should have recognized it\n" );
962 $m = $this->getMimeTypeFromExtensionOrNull(
$ext );
964 $this->logger->info( __METHOD__ .
": extension mime type of $file: $m\n" );
971 $this->logger->info( __METHOD__ .
": failed to guess mime type for $file!\n" );
972 return 'unknown/unknown';
/**
 * Determine the media type for a file path and/or MIME type.
 *
 * Visible steps:
 * - For 'application/ogg' with an existing file, read the first 256 bytes,
 *   drop the string 'ffmpeg2theora', and look for the codec names theora,
 *   vorbis, flac, speex and opus.
 * - Look up the media type from the file extension (text after the last dot).
 * - Look up the media type from the major part of the MIME type (text before
 *   the '/').
 *
 * NOTE(review): the values chosen in each branch, the fallbacks and the
 * return statements are missing from this listing, as is any fclose() call.
 *
 * @param string|null $path Path to the file, if any
 * @param string|null $mime MIME type, if known
 */
991 public function getMediaType(
$path =
null,
$mime =
null ) {
1003 if (
$mime ==
'application/ogg' && is_string(
$path ) && file_exists(
$path ) ) {
1005 $f = fopen(
$path,
"rt" );
1009 $head = fread( $f, 256 );
1012 $head = str_replace(
'ffmpeg2theora',
'', strtolower( $head ) );
1015 if ( strpos( $head,
'theora' ) !==
false ) {
1017 } elseif ( strpos( $head,
'vorbis' ) !==
false ) {
1019 } elseif ( strpos( $head,
'flac' ) !==
false ) {
1021 } elseif ( strpos( $head,
'speex' ) !==
false ) {
1023 } elseif ( strpos( $head,
'opus' ) !==
false ) {
1041 $i = strrpos(
$path,
'.' );
1042 $e = strtolower( $i ? substr(
$path, $i + 1 ) :
'' );
1045 $type = $this->findMediaType(
'.' . $e );
1053 $i = strpos(
$mime,
'/' );
1054 if ( $i !==
false ) {
1055 $major = substr(
$mime, 0, $i );
1056 $type = $this->findMediaType( $major );
/**
 * Find the media type for an extension or a MIME type.
 *
 * When $extMime starts with a dot it is treated as a file extension: the
 * types returned by getTypesForExtension() are split on spaces. Otherwise it
 * is treated as a MIME type and resolved through the alias table. Each
 * candidate is then searched for (strict comparison) in the lists held in
 * mediaTypes.
 *
 * NOTE(review): the return statements and the handling of an unknown
 * extension are missing from this listing.
 *
 * @param string $extMime Either ".ext" or a MIME type
 */
1080 public function findMediaType( $extMime ) {
1081 if ( strpos( $extMime,
'.' ) === 0 ) {
1083 $m = $this->getTypesForExtension( substr( $extMime, 1 ) );
1088 $m = explode(
' ', $m );
1091 if ( isset( $this->mimeTypeAliases[$extMime] ) ) {
1092 $extMime = $this->mimeTypeAliases[$extMime];
1098 foreach ( $m as
$mime ) {
1099 foreach ( $this->mediaTypes as
$type => $codes ) {
1100 if ( in_array(
$mime, $codes,
true ) ) {
/**
 * List the names of all known media types.
 *
 * @return array Keys of the media type table
 */
public function getMediaTypes() {
	$names = array_keys( $this->mediaTypes );

	return $names;
}
/**
 * Ask the content analyzer which MIME types it derives for a file.
 *
 * All three arguments are passed unchanged to getRealMimesFromData() on the
 * object returned by getIEContentAnalyzer().
 *
 * @param string $fileName Name of the file
 * @param string $chunk Leading chunk of the file content
 * @param string $proposed Proposed MIME type
 * @return mixed Whatever getRealMimesFromData() returns
 */
public function getIEMimeTypes( $fileName, $chunk, $proposed ) {
	return $this->getIEContentAnalyzer()->getRealMimesFromData( $fileName, $chunk, $proposed );
}
/**
 * Get the cached content analyzer stored in $this->IEAnalyzer.
 *
 * NOTE(review): the body of the null check is missing from this listing;
 * presumably it creates the analyzer on first use - confirm.
 *
 * @return mixed The value of $this->IEAnalyzer
 */
1137 protected function getIEContentAnalyzer() {
1138 if ( $this->IEAnalyzer ===
null ) {
1141 return $this->IEAnalyzer;
This class simulates Microsoft Internet Explorer's terribly broken and insecure MIME type detection a...
static readHandle( $fileHandle)
Read from an open seekable handle.
const MEDIATYPE_MULTIMEDIA
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
if(!is_readable( $file)) $ext
if(!file_exists( $CREDITS)) $lines