MediaWiki  master
MimeAnalyzer.php
Go to the documentation of this file.
1 <?php
25 
31 class MimeAnalyzer implements LoggerAwareInterface {
/** Path of an optional MIME types file; loadFiles() appends its contents to the built-in type table. */
33  protected $typeFile;
/** Path of an optional MIME info file; loadFiles() appends its contents to the built-in info table. */
35  protected $infoFile;
/** Taken from $params['xmlTypes']; presumably a map of XML root element => MIME type (TODO confirm against callers). */
37  protected $xmlTypes;
/** Optional callable invoked as ( $this ) at the start of loadFiles(). */
39  protected $initCallback;
/** Optional callable invoked as ( $file ) by detectMimeType(); replaces mime_content_type() when set. */
41  protected $detectCallback;
/** Optional callable invoked as ( $this, $head, $tail, $file, $mime ) at the end of doGuessMimeType(). */
43  protected $guessCallback;
/** Optional callable invoked as ( $this, $ext, $mime ) by improveTypeFromExtension(). */
45  protected $extCallback;
/** Map of media type => list of MIME types, filled by loadFiles(). */
47  protected $mediaTypes = null;
/** Map of alias MIME type => main MIME type, filled by loadFiles(). */
49  protected $mimeTypeAliases = null;
/** Map of MIME type => space-separated file extensions, filled by loadFiles(). */
51  protected $mimetoExt = null;
52 
/** Map of file extension => space-separated MIME types, filled by loadFiles(). */
54  public $mExtToMime = null; // legacy name; field accessed by hooks
55 
/** IEContentAnalyzer instance, created on first use by getIEContentAnalyzer(). */
57  protected $IEAnalyzer;
58 
/** Extra type definitions accumulated by addExtraTypes(); read by loadFiles(). */
60  private $extraTypes = '';
/** Extra info definitions accumulated by addExtraInfo(); read by loadFiles(). */
62  private $extraInfo = '';
63 
/** Logger used for diagnostics; set by the constructor or setLogger(). */
65  private $logger;
66 
/**
 * Built-in MIME type table, always loaded by loadFiles() before any
 * configured types file.
 *
 * Format: one entry per line, a MIME type followed by one or more
 * space-separated file extensions. Lines are parsed in order, so for an
 * extension listed under several types the earliest line supplies the type
 * returned by guessTypesForExtension().
 */
86  protected static $wellKnownTypes = <<<EOT
87 application/ogg ogx ogg ogm ogv oga spx opus
88 application/pdf pdf
89 application/vnd.oasis.opendocument.chart odc
90 application/vnd.oasis.opendocument.chart-template otc
91 application/vnd.oasis.opendocument.database odb
92 application/vnd.oasis.opendocument.formula odf
93 application/vnd.oasis.opendocument.formula-template otf
94 application/vnd.oasis.opendocument.graphics odg
95 application/vnd.oasis.opendocument.graphics-template otg
96 application/vnd.oasis.opendocument.image odi
97 application/vnd.oasis.opendocument.image-template oti
98 application/vnd.oasis.opendocument.presentation odp
99 application/vnd.oasis.opendocument.presentation-template otp
100 application/vnd.oasis.opendocument.spreadsheet ods
101 application/vnd.oasis.opendocument.spreadsheet-template ots
102 application/vnd.oasis.opendocument.text odt
103 application/vnd.oasis.opendocument.text-master otm
104 application/vnd.oasis.opendocument.text-template ott
105 application/vnd.oasis.opendocument.text-web oth
106 application/javascript js
107 application/x-shockwave-flash swf
108 audio/midi mid midi kar
109 audio/mpeg mpga mpa mp2 mp3
110 audio/x-aiff aif aiff aifc
111 audio/x-wav wav
112 audio/ogg oga spx ogg opus
113 audio/opus opus ogg oga ogg spx
114 image/x-bmp bmp
115 image/gif gif
116 image/jpeg jpeg jpg jpe
117 image/png png
118 image/svg+xml svg
119 image/svg svg
120 image/tiff tiff tif
121 image/vnd.djvu djvu
122 image/x.djvu djvu
123 image/x-djvu djvu
124 image/x-portable-pixmap ppm
125 image/x-xcf xcf
126 text/plain txt
127 text/html html htm
128 video/ogg ogv ogm ogg
129 video/mpeg mpg mpeg
130 EOT;
131 
/**
 * Built-in MIME info table, always loaded by loadFiles() before any
 * configured info file.
 *
 * Format: one entry per line, a main MIME type, optionally followed by
 * space-separated aliases of that type, and a media type name in square
 * brackets. loadFiles() upper-cases the bracketed name and uses it as the
 * key in $mediaTypes; every type after the first is registered in
 * $mimeTypeAliases as an alias of the first.
 */
138  protected static $wellKnownInfo = <<<EOT
139 application/pdf [OFFICE]
140 application/vnd.oasis.opendocument.chart [OFFICE]
141 application/vnd.oasis.opendocument.chart-template [OFFICE]
142 application/vnd.oasis.opendocument.database [OFFICE]
143 application/vnd.oasis.opendocument.formula [OFFICE]
144 application/vnd.oasis.opendocument.formula-template [OFFICE]
145 application/vnd.oasis.opendocument.graphics [OFFICE]
146 application/vnd.oasis.opendocument.graphics-template [OFFICE]
147 application/vnd.oasis.opendocument.image [OFFICE]
148 application/vnd.oasis.opendocument.image-template [OFFICE]
149 application/vnd.oasis.opendocument.presentation [OFFICE]
150 application/vnd.oasis.opendocument.presentation-template [OFFICE]
151 application/vnd.oasis.opendocument.spreadsheet [OFFICE]
152 application/vnd.oasis.opendocument.spreadsheet-template [OFFICE]
153 application/vnd.oasis.opendocument.text [OFFICE]
154 application/vnd.oasis.opendocument.text-template [OFFICE]
155 application/vnd.oasis.opendocument.text-master [OFFICE]
156 application/vnd.oasis.opendocument.text-web [OFFICE]
157 application/javascript text/javascript application/x-javascript [EXECUTABLE]
158 application/x-shockwave-flash [MULTIMEDIA]
159 audio/midi [AUDIO]
160 audio/x-aiff [AUDIO]
161 audio/x-wav [AUDIO]
162 audio/mp3 audio/mpeg [AUDIO]
163 application/ogg audio/ogg video/ogg [MULTIMEDIA]
164 image/x-bmp image/x-ms-bmp image/bmp [BITMAP]
165 image/gif [BITMAP]
166 image/jpeg [BITMAP]
167 image/png [BITMAP]
168 image/svg+xml [DRAWING]
169 image/tiff [BITMAP]
170 image/vnd.djvu [BITMAP]
171 image/x-xcf [BITMAP]
172 image/x-portable-pixmap [BITMAP]
173 text/plain [TEXT]
174 text/html [TEXT]
175 video/ogg [VIDEO]
176 video/mpeg [VIDEO]
177 unknown/unknown application/octet-stream application/x-empty [UNKNOWN]
178 EOT;
179 
/**
 * Configure the analyzer and immediately load the MIME tables.
 *
 * @param array $params Configuration array:
 *  - typeFile, infoFile, xmlTypes: read unconditionally, so callers are
 *    expected to supply these keys.
 *  - initCallback, detectCallback, guessCallback, extCallback: optional
 *    callables, null when absent.
 *  - logger: optional logger; a NullLogger is used when absent.
 */
public function __construct( array $params ) {
	// Settings that are read without a fallback
	foreach ( [ 'typeFile', 'infoFile', 'xmlTypes' ] as $setting ) {
		$this->$setting = $params[$setting];
	}

	// Optional hooks default to null
	foreach ( [ 'initCallback', 'detectCallback', 'guessCallback', 'extCallback' ] as $hook ) {
		$this->$hook = $params[$hook] ?? null;
	}

	$this->logger = $params['logger'] ?? new NullLogger();

	$this->loadFiles();
}
207 
/**
 * Build the lookup tables ($mimetoExt, $mExtToMime, $mediaTypes,
 * $mimeTypeAliases) from the built-in definitions, the optional
 * type/info files and any extra definitions registered through
 * addExtraTypes() / addExtraInfo().
 *
 * The init callback, if any, runs first so that it can still register
 * extra definitions before they are read.
 */
protected function loadFiles() {
	# Allow media handling extensions adding MIME-types and MIME-info
	if ( $this->initCallback ) {
		call_user_func( $this->initCallback, $this );
	}

	$this->parseMimeTypes( $this->readDefinitionLines(
		self::$wellKnownTypes, $this->typeFile, $this->extraTypes, 'types', __METHOD__
	) );

	$this->parseMimeInfo( $this->readDefinitionLines(
		self::$wellKnownInfo, $this->infoFile, $this->extraInfo, 'info', __METHOD__
	) );
}

/**
 * Assemble one definition table and return its usable lines.
 *
 * Concatenates the built-in text, the contents of $file (when configured
 * and readable) and the extra text, normalises line endings and tabs, and
 * then drops blank lines, '#' comments and lines without a space.
 *
 * @param string $builtIn Built-in definitions
 * @param string|null $file Path of the optional definitions file
 * @param string $extra Extra definitions registered at runtime
 * @param string $what Word used in log messages: 'types' or 'info'
 * @param string $caller Method name used as log message prefix
 * @return string[] Trimmed, lower-cased lines, each containing a space
 */
private function readDefinitionLines( $builtIn, $file, $extra, $what, $caller ) {
	$text = $builtIn;

	if ( $file ) {
		if ( is_file( $file ) && is_readable( $file ) ) {
			$this->logger->info( "$caller: loading mime $what from $file\n" );
			$text .= "\n";
			$text .= file_get_contents( $file );
		} else {
			$this->logger->info( "$caller: can't load mime $what from $file\n" );
		}
	} else {
		$this->logger->info( "$caller: no mime $what file defined, using built-ins only.\n" );
	}

	$text .= "\n" . $extra;

	$text = str_replace( [ "\r\n", "\n\r", "\n\n", "\r\r", "\r" ], "\n", $text );
	$text = str_replace( "\t", " ", $text );

	$lines = [];
	foreach ( explode( "\n", $text ) as $line ) {
		$line = trim( $line );
		if ( empty( $line ) || strpos( $line, '#' ) === 0 ) {
			continue;
		}

		$line = strtolower( $line );
		if ( strpos( $line, ' ' ) === false ) {
			// A lone token carries no mapping
			continue;
		}

		$lines[] = $line;
	}

	return $lines;
}

/**
 * Fill $mimetoExt and $mExtToMime from "mime ext ext ..." lines.
 *
 * Repeated entries are appended, so both maps hold space-separated
 * lists in definition order.
 *
 * @param string[] $lines Lines as returned by readDefinitionLines()
 */
private function parseMimeTypes( array $lines ) {
	$this->mimetoExt = [];
	$this->mExtToMime = [];

	foreach ( $lines as $line ) {
		$split = strpos( $line, ' ' );
		$mime = substr( $line, 0, $split );
		$ext = trim( substr( $line, $split + 1 ) );

		if ( empty( $ext ) ) {
			continue;
		}

		if ( !empty( $this->mimetoExt[$mime] ) ) {
			$this->mimetoExt[$mime] .= ' ' . $ext;
		} else {
			$this->mimetoExt[$mime] = $ext;
		}

		foreach ( explode( ' ', $ext ) as $e ) {
			$e = trim( $e );
			if ( empty( $e ) ) {
				continue;
			}

			if ( !empty( $this->mExtToMime[$e] ) ) {
				$this->mExtToMime[$e] .= ' ' . $mime;
			} else {
				$this->mExtToMime[$e] = $mime;
			}
		}
	}
}

/**
 * Fill $mediaTypes and $mimeTypeAliases from "mime alias ... [TYPE]" lines.
 *
 * NOTE: removing the "[TYPE]" marker usually leaves a trailing space, so
 * the token list ends with an empty string which is then registered as
 * the alias ''. A null MIME type used as an array key maps to that same
 * '' entry, so the alias lookups elsewhere in this class can hit it.
 * The behaviour is kept as-is deliberately.
 *
 * @param string[] $lines Lines as returned by readDefinitionLines()
 */
private function parseMimeInfo( array $lines ) {
	$this->mimeTypeAliases = [];
	$this->mediaTypes = [];

	foreach ( $lines as $line ) {
		$match = [];
		if ( preg_match( '!\[\s*(\w+)\s*\]!', $line, $match ) ) {
			$line = preg_replace( '!\[\s*(\w+)\s*\]!', '', $line );
			$mtype = trim( strtoupper( $match[1] ) );
		} else {
			$mtype = MEDIATYPE_UNKNOWN;
		}

		$tokens = explode( ' ', $line );

		if ( !isset( $this->mediaTypes[$mtype] ) ) {
			$this->mediaTypes[$mtype] = [];
		}

		foreach ( $tokens as $token ) {
			$token = trim( $token );
			if ( empty( $token ) ) {
				continue;
			}

			$this->mediaTypes[$mtype][] = $token;
		}

		// Everything after the first token is an alias of the first
		$main = $tokens[0];
		foreach ( array_slice( $tokens, 1 ) as $alias ) {
			$this->mimeTypeAliases[$alias] = $main;
		}
	}
}
369 
/**
 * Replace the logger used for diagnostic messages.
 *
 * @param LoggerInterface $logger
 */
public function setLogger( LoggerInterface $logger ) {
	$this->logger = $logger;
}
373 
/**
 * Queue additional MIME type definitions.
 *
 * The text is only consumed by loadFiles(); in the code visible here that
 * runs from the constructor, right after the init callback, so the init
 * callback is the place to call this from.
 *
 * @param string $types Lines in the same "mime ext ext ..." format as the
 *  built-in type table
 */
public function addExtraTypes( $types ) {
	$this->extraTypes = $this->extraTypes . "\n" . $types;
}
383 
/**
 * Queue additional MIME info definitions.
 *
 * The text is only consumed by loadFiles(); in the code visible here that
 * runs from the constructor, right after the init callback, so the init
 * callback is the place to call this from.
 *
 * @param string $info Lines in the same "mime alias ... [TYPE]" format as
 *  the built-in info table
 */
public function addExtraInfo( $info ) {
	$this->extraInfo = $this->extraInfo . "\n" . $info;
}
393 
/**
 * Look up the file extensions registered for a MIME type.
 *
 * The lower-cased type is tried first; if it has no entry of its own but
 * is a known alias, the main type it resolves to is tried next.
 *
 * @param string $mime MIME type to look up
 * @return string|null Space-separated extensions, or null when the type
 *  (and its main type, if it is an alias) has no registered extensions
 */
public function getExtensionsForType( $mime ) {
	$wanted = strtolower( $mime );

	$candidates = [ $wanted ];
	if ( isset( $this->mimeTypeAliases[$wanted] ) ) {
		$candidates[] = $this->mimeTypeAliases[$wanted];
	}

	foreach ( $candidates as $candidate ) {
		if ( isset( $this->mimetoExt[$candidate] ) ) {
			return $this->mimetoExt[$candidate];
		}
	}

	return null;
}
420 
/**
 * Look up all MIME types registered for a file extension.
 *
 * @param string $ext File extension, without the dot; matched
 *  case-insensitively
 * @return string|null Space-separated MIME types, or null if the
 *  extension is not registered
 */
public function getTypesForExtension( $ext ) {
	return $this->mExtToMime[strtolower( $ext )] ?? null;
}
434 
/**
 * Return the first MIME type registered for a file extension.
 *
 * @param string $ext File extension, without the dot
 * @return string|null The first entry of getTypesForExtension(), or null
 *  if the extension is not registered
 */
public function guessTypesForExtension( $ext ) {
	$registered = $this->getTypesForExtension( $ext );
	if ( $registered === null ) {
		return null;
	}

	// Keep only the part before the first whitespace.
	// TODO: Check if this is needed; strtok( $m, ' ' ) should be sufficient
	return preg_replace( '/\s.*$/', '', trim( $registered ) );
}
454 
/**
 * Test whether a file extension is registered for a MIME type.
 *
 * The comparison is strict, so numeric-looking extensions such as "001"
 * and "1" are not treated as equal.
 *
 * @param string $extension File extension, without the dot; compared
 *  case-insensitively
 * @param string $mime MIME type
 * @return bool|null True or false for a known MIME type, null when no
 *  extensions are registered for the type
 */
public function isMatchingExtension( $extension, $mime ) {
	$registered = $this->getExtensionsForType( $mime );

	if ( !$registered ) {
		return null; // Unknown MIME type
	}

	return in_array( strtolower( $extension ), explode( ' ', $registered ), true );
}
476 
/**
 * Test whether a MIME type is on the fixed list of types below.
 *
 * The check is an exact, case-sensitive string match; non-string input
 * never matches.
 *
 * @param string $mime MIME type
 * @return bool
 */
public function isPHPImageType( $mime ) {
	// As defined by imagegetsize and image_type_to_mime
	static $types = [
		'image/gif', 'image/jpeg', 'image/png',
		'image/x-bmp', 'image/xbm', 'image/tiff',
		'image/jp2', 'image/jpeg2000', 'image/iff',
		'image/x-xbitmap',
		'image/vnd.wap.wbmp', 'image/vnd.xiff',
		'image/x-photoshop',
		'application/x-shockwave-flash',
	];

	// Strict: with loose comparison 0 or true would match on PHP 7
	return in_array( $mime, $types, true );
}
499 
/**
 * Test whether an extension belongs to a format this class expects to
 * recognize from file content.
 *
 * Callers in this class use it to refuse extension-based guessing for
 * such files: if content sniffing failed for one of these extensions,
 * the extension is not trusted.
 *
 * @param string $extension File extension, without the dot; compared
 *  case-insensitively
 * @return bool
 */
function isRecognizableExtension( $extension ) {
	static $recognizable = [
		// Types recognized by getimagesize()
		'gif', 'jpeg', 'jpg', 'png', 'swf', 'psd', 'bmp', 'tiff', 'tif',
		'jpc', 'jp2', 'jpx', 'jb2', 'swc', 'iff', 'wbmp', 'xbm',

		// Formats we recognize magic numbers for
		'djvu', 'ogx', 'ogg', 'ogv', 'oga', 'spx', 'opus', 'mid', 'pdf',
		'wmf', 'xcf', 'webm', 'mkv', 'mka', 'webp', 'mp3',

		// XML formats we sure hope we recognize reliably
		'svg',

		// 3D formats
		'stl',
	];

	$normalized = strtolower( $extension );
	return in_array( $normalized, $recognizable );
}
533 
/**
 * Refine a detected MIME type using the file extension.
 *
 * Three detected types are refined:
 *  - unknown/unknown: trust the extension, unless it belongs to a format
 *    that content detection should have recognized.
 *  - application/x-opc+zip: use the extension's type when the extension
 *    is registered for OPC files, otherwise fall back to application/zip.
 *  - text/plain: use the extension's type when the extension maps to the
 *    TEXT media type.
 * Afterwards the ext callback, if set, is invoked, and the result is
 * resolved through the alias table.
 *
 * @param string $mime Detected MIME type
 * @param string $ext File extension, without the dot
 * @return string|null Improved MIME type; the result of
 *  guessTypesForExtension() may be null for an unregistered extension
 */
public function improveTypeFromExtension( $mime, $ext ) {
	if ( $mime === 'unknown/unknown' ) {
		if ( !$this->isRecognizableExtension( $ext ) ) {
			// Not something we can detect, so simply
			// trust the file extension
			$mime = $this->guessTypesForExtension( $ext );
		} else {
			$this->logger->info( __METHOD__ . ': refusing to guess mime type for .' .
				"$ext file, we should have recognized it\n" );
		}
	} elseif ( $mime === 'application/x-opc+zip' ) {
		if ( !$this->isMatchingExtension( $ext, $mime ) ) {
			$this->logger->info( __METHOD__ .
				": refusing to guess better type for $mime file, " .
				".$ext is not a known OPC extension.\n" );
			$mime = 'application/zip';
		} else {
			// A known file extension for an OPC file,
			// find the proper MIME type for that file extension
			$mime = $this->guessTypesForExtension( $ext );
		}
	} elseif ( $mime === 'text/plain' && $this->findMediaType( ".$ext" ) === MEDIATYPE_TEXT ) {
		// Textual types are sometimes not recognized properly.
		// If detected as text/plain, and has an extension which is textual
		// improve to the extension's type. For example, csv and json are often
		// misdetected as text/plain.
		$mime = $this->guessTypesForExtension( $ext );
	}

	# Media handling extensions can improve the MIME detected
	$hook = $this->extCallback;
	if ( $hook ) {
		$hook( $this, $ext, $mime /* by reference */ );
	}

	// Map aliases to their main type
	$mime = $this->mimeTypeAliases[$mime] ?? $mime;

	$this->logger->info( __METHOD__ . ": improved mime type for .$ext: $mime\n" );
	return $mime;
}
587 
/**
 * Guess the MIME type of a file.
 *
 * Runs the built-in content checks first and falls back to
 * detectMimeType() when they yield nothing. The result is resolved
 * through the alias table.
 *
 * @param string $file Path of the file to examine
 * @param string|bool $ext Deprecated; a truthy value only triggers a log
 *  message here and is passed on to the detection helpers. Use
 *  improveTypeFromExtension() instead.
 * @return string MIME type
 */
public function guessMimeType( $file, $ext = true ) {
	if ( $ext ) { // TODO: make $ext default to false. Or better, remove it.
		$this->logger->info( __METHOD__ .
			": WARNING: use of the \$ext parameter is deprecated. " .
			"Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
	}

	$guessed = $this->doGuessMimeType( $file, $ext );

	if ( !$guessed ) {
		$this->logger->info( __METHOD__ .
			": internal type detection failed for $file (.$ext)...\n" );
		$guessed = $this->detectMimeType( $file, $ext );
	}

	// Map aliases to their main type
	$guessed = $this->mimeTypeAliases[$guessed] ?? $guessed;

	$this->logger->info( __METHOD__ . ": guessed mime type of $file: $guessed\n" );
	return $guessed;
}
624 
/**
 * Guess the MIME type from the file's content.
 *
 * Opens the file, delegates the actual checks to
 * guessMimeTypeFromHandle() and makes sure the handle is closed again
 * on every path, including exceptions.
 *
 * @param string $file Path of the file to examine
 * @param string|bool $ext Deprecated extension hint, passed through to
 *  detectZipType()
 * @return string|bool MIME type, 'unknown/unknown' if the file cannot be
 *  read, or false if no check matched
 * @throws UnexpectedValueException If seeking to the tail of the file fails
 */
private function doGuessMimeType( $file, $ext ) {
	// Read a chunk of the file
	Wikimedia\suppressWarnings();
	$f = fopen( $file, 'rb' );
	Wikimedia\restoreWarnings();

	if ( !$f ) {
		return 'unknown/unknown';
	}

	try {
		return $this->guessMimeTypeFromHandle( $f, $file, $ext );
	} finally {
		// A callee may already have closed the handle
		if ( is_resource( $f ) ) {
			fclose( $f );
		}
	}
}

/**
 * Run the content checks on an already opened file.
 *
 * Checks run in this order: hardcoded magic numbers, EBML
 * (WebM/Matroska), WebP, MS compound files, PHP code, well-formed XML,
 * shebang scripts, ZIP variants, STL, getimagesize(), and finally the
 * guess callback.
 *
 * @param resource $f Handle of the file, opened for binary reading
 * @param string $file Path of the same file
 * @param string|bool $ext Deprecated extension hint, passed through to
 *  detectZipType()
 * @return string|bool MIME type, or false if no check matched
 * @throws UnexpectedValueException If seeking to the tail of the file fails
 */
private function guessMimeTypeFromHandle( $f, $file, $ext ) {
	$fsize = filesize( $file );
	if ( $fsize === false ) {
		return 'unknown/unknown';
	}

	$head = fread( $f, 1024 );
	$tailLength = min( 65558, $fsize ); // 65558 = maximum size of a zip EOCDR
	if ( fseek( $f, -1 * $tailLength, SEEK_END ) === -1 ) {
		throw new UnexpectedValueException(
			"Seeking $tailLength bytes from EOF failed in " . __METHOD__ );
	}
	$tail = $tailLength ? fread( $f, $tailLength ) : '';

	$this->logger->info( __METHOD__ .
		": analyzing head and tail of $file for magic numbers.\n" );

	// Hardcode a few magic number checks...
	$headers = [
		// Multimedia...
		'MThd' => 'audio/midi',
		'OggS' => 'application/ogg',
		'ID3' => 'audio/mpeg',
		"\xff\xfb" => 'audio/mpeg', // MPEG-1 layer 3
		"\xff\xf3" => 'audio/mpeg', // MPEG-2 layer 3 (lower sample rates)
		"\xff\xe3" => 'audio/mpeg', // MPEG-2.5 layer 3 (very low sample rates)

		// Image formats...
		// Note that WMF may have a bare header, no magic number.
		"\x01\x00\x09\x00" => 'application/x-msmetafile', // Possibly prone to false positives?
		"\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
		'%PDF' => 'application/pdf',
		'gimp xcf' => 'image/x-xcf',

		// Some forbidden fruit...
		'MZ' => 'application/octet-stream', // DOS/Windows executable
		"\xca\xfe\xba\xbe" => 'application/octet-stream', // Mach-O binary
		"\x7fELF" => 'application/octet-stream', // ELF binary
	];

	foreach ( $headers as $magic => $candidate ) {
		if ( strncmp( $head, $magic, strlen( $magic ) ) == 0 ) {
			$this->logger->info( __METHOD__ .
				": magic header in $file recognized as $candidate\n" );
			return $candidate;
		}
	}

	/* Look for WebM and Matroska files */
	if ( strncmp( $head, pack( "C4", 0x1a, 0x45, 0xdf, 0xa3 ), 4 ) == 0 ) {
		$doctype = strpos( $head, "\x42\x82" );
		if ( $doctype ) {
			// Next byte is datasize, then data (sizes larger than 1 byte are stupid muxers)
			$data = substr( $head, $doctype + 3, 8 );
			if ( strncmp( $data, "matroska", 8 ) == 0 ) {
				$this->logger->info( __METHOD__ . ": recognized file as video/x-matroska\n" );
				return "video/x-matroska";
			} elseif ( strncmp( $data, "webm", 4 ) == 0 ) {
				// XXX HACK look for a video track, if we don't find it, this is an audio file
				$videotrack = strpos( $head, "\x86\x85V_VP" );

				if ( $videotrack ) {
					// There is a video track, so this is a video file.
					$this->logger->info( __METHOD__ . ": recognized file as video/webm\n" );
					return "video/webm";
				}

				$this->logger->info( __METHOD__ . ": recognized file as audio/webm\n" );
				return "audio/webm";
			}
		}
		$this->logger->info( __METHOD__ . ": unknown EBML file\n" );
		return "unknown/unknown";
	}

	/* Look for WebP */
	if ( strncmp( $head, "RIFF", 4 ) == 0 &&
		strncmp( substr( $head, 8, 7 ), "WEBPVP8", 7 ) == 0
	) {
		$this->logger->info( __METHOD__ . ": recognized file as image/webp\n" );
		return "image/webp";
	}

	/* Look for MS Compound Binary (OLE) files */
	if ( strncmp( $head, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8 ) == 0 ) {
		$this->logger->info( __METHOD__ . ': recognized MS CFB (OLE) file' );
		return $this->detectMicrosoftBinaryType( $f );
	}

	// Look for PHP open tags, both as plain bytes and with a NUL after
	// each character (as in UTF-16LE text)
	if ( ( strpos( $head, '<?php' ) !== false ) ||
		( strpos( $head, "<\x00?\x00p\x00h\x00p" ) !== false ) ||
		( strpos( $head, "<\x00?\x00 " ) !== false ) ||
		( strpos( $head, "<\x00?\x00\n" ) !== false ) ||
		( strpos( $head, "<\x00?\x00\t" ) !== false ) ||
		( strpos( $head, "<\x00?\x00=" ) !== false )
	) {
		$this->logger->info( __METHOD__ . ": recognized $file as application/x-php\n" );
		return 'application/x-php';
	}

	// Look for XML formats, keyed by root element in the configured map
	Wikimedia\suppressWarnings();
	$xml = new XmlTypeCheck( $file );
	Wikimedia\restoreWarnings();
	if ( $xml->wellFormed ) {
		return $this->xmlTypes[$xml->getRootElement()] ?? 'application/xml';
	}

	// Look for shell scripts
	$script_type = null;

	# detect by shebang; signatures are full byte sequences, BOM included
	$shebangs = [
		'ASCII' => "#!",
		'UTF-8' => "\xef\xbb\xbf#!",
		'UTF-16BE' => "\xfe\xff\x00#\x00!",
		'UTF-16LE' => "\xff\xfe#\x00!\x00",
	];
	foreach ( $shebangs as $encoding => $signature ) {
		if ( strncmp( $head, $signature, strlen( $signature ) ) === 0 ) {
			$script_type = $encoding;
			break;
		}
	}

	if ( $script_type ) {
		if ( $script_type !== "UTF-8" && $script_type !== "ASCII" ) {
			// Quick and dirty fold down to ASCII!
			$pack = [ 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' ];
			// Skip the two-byte BOM
			$chars = unpack( $pack[$script_type], substr( $head, 2 ) );
			$head = '';
			foreach ( $chars as $codepoint ) {
				if ( $codepoint < 128 ) {
					$head .= chr( $codepoint );
				} else {
					$head .= '?';
				}
			}
		}

		$match = [];

		if ( preg_match( '%/?([^\s]+/)(\w+)%', $head, $match ) ) {
			$mime = "application/x-{$match[2]}";
			$this->logger->info( __METHOD__ . ": shell script recognized as $mime\n" );
			return $mime;
		}
	}

	// Check for ZIP variants (before getimagesize)
	$eocdrPos = strpos( $tail, "PK\x05\x06" );
	if ( $eocdrPos !== false && $eocdrPos <= strlen( $tail ) - 22 ) {
		$this->logger->info( __METHOD__ . ": ZIP signature present in $file\n" );
		// Check if it really is a ZIP file, make sure the EOCDR is at the end (T40432)
		// ZIP fields are little-endian, hence "v"
		$commentLength = unpack( "v", substr( $tail, $eocdrPos + 20 ) )[1];
		if ( $eocdrPos + 22 + $commentLength !== strlen( $tail ) ) {
			$this->logger->info( __METHOD__ . ": ZIP EOCDR not at end. Not a ZIP file." );
		} else {
			return $this->detectZipType( $head, $tail, $ext );
		}
	}

	// Check for STL (3D) files
	// @see https://en.wikipedia.org/wiki/STL_(file_format)
	if ( $fsize >= 15 &&
		stripos( $head, 'SOLID ' ) === 0 &&
		preg_match( '/\RENDSOLID .*$/i', $tail ) ) {
		// ASCII STL file
		return 'application/sla';
	} elseif ( $fsize > 84 ) {
		// binary STL file
		$triangles = substr( $head, 80, 4 );
		$triangles = unpack( 'V', $triangles );
		$triangles = reset( $triangles );
		if ( $triangles !== false && $fsize === 84 + ( $triangles * 50 ) ) {
			return 'application/sla';
		}
	}

	Wikimedia\suppressWarnings();
	$gis = getimagesize( $file );
	Wikimedia\restoreWarnings();

	if ( $gis && isset( $gis['mime'] ) ) {
		$mime = $gis['mime'];
		$this->logger->info( __METHOD__ . ": getimagesize detected $file as $mime\n" );
		return $mime;
	}

	# Media handling extensions can guess the MIME by content
	# It's intentionally here so that if core is wrong about a type (false positive),
	# people will hopefully nag and submit patches :)
	$mime = false;
	# Some strings by reference for performance - assuming well-behaved hooks
	$callback = $this->guessCallback;
	if ( $callback ) {
		$callback( $this, $head, $tail, $file, $mime /* by reference */ );
	}

	return $mime;
}
859 
/**
 * Detect the application-specific type of a ZIP file from its raw data.
 *
 * Recognizes OpenDocument files (by their "mimetype" entry), Open
 * Packaging Conventions archives (by a leading "[Content_Types].xml"
 * entry) and MS compound files that carry an OPC archive in their tail.
 * Anything else is reported as application/zip.
 *
 * Entry names are matched at byte offset 30, which is where the file name
 * follows the fixed part of a ZIP local file header.
 *
 * @param string $header Leading bytes of the file
 * @param string|null $tail Trailing bytes of the file; only used for the
 *  compound-file check
 * @param string|bool $ext Deprecated file extension hint. Use
 *  improveTypeFromExtension() instead.
 * @return string MIME type
 */
public function detectZipType( $header, $tail = null, $ext = false ) {
	if ( $ext ) { # TODO: remove $ext param
		$this->logger->info( __METHOD__ .
			": WARNING: use of the \$ext parameter is deprecated. " .
			"Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
	}

	$mime = 'application/zip';
	$opendocTypes = [
		# In OASIS Open Document Format v1.2, Database front end document
		# has a recommended MIME type of:
		# application/vnd.oasis.opendocument.base
		# Despite the type registered at the IANA being 'database' which is
		# supposed to be normative.
		# T35515
		'base',

		'chart-template',
		'chart',
		'formula-template',
		'formula',
		'graphics-template',
		'graphics',
		'image-template',
		'image',
		'presentation-template',
		'presentation',
		'spreadsheet-template',
		'spreadsheet',
		'text-template',
		'text-master',
		'text-web',
		'text' ];

	// The list of document types is available in OASIS Open Document
	// Format version 1.2 under Appendix C. It is not normative though,
	// supposedly types registered at the IANA should be.
	// http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html
	$types = '(?:' . implode( '|', $opendocTypes ) . ')';
	$opendocRegex = "/^mimetype(application\/vnd\.oasis\.opendocument\.$types)/";

	// The dot is escaped so that only the literal entry name matches
	$openxmlRegex = '/^\[Content_Types\]\.xml/';

	if ( preg_match( $opendocRegex, substr( $header, 30 ), $matches ) ) {
		$mime = $matches[1];
		$this->logger->info( __METHOD__ . ": detected $mime from ZIP archive\n" );
	} elseif ( preg_match( $openxmlRegex, substr( $header, 30 ) ) ) {
		$mime = "application/x-opc+zip";
		# TODO: remove the block below, as soon as improveTypeFromExtension is used everywhere
		if ( $ext !== true && $ext !== false ) {
			// An explicit extension was passed: resolve the generic OPC
			// type to a concrete one here rather than returning
			// application/x-opc+zip to the caller.
			if ( $this->isMatchingExtension( $ext, $mime ) ) {
				/* A known file extension for an OPC file,
				 * find the proper mime type for that file extension
				 */
				$mime = $this->guessTypesForExtension( $ext );
			} else {
				$mime = "application/zip";
			}
		}
		$this->logger->info( __METHOD__ .
			": detected an Open Packaging Conventions archive: $mime\n" );
	} elseif ( $tail !== null &&
		substr( $header, 0, 8 ) == "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" &&
		( $headerpos = strpos( $tail, "PK\x03\x04" ) ) !== false &&
		preg_match( $openxmlRegex, substr( $tail, $headerpos + 30 ) ) ) {
		if ( substr( $header, 512, 4 ) == "\xEC\xA5\xC1\x00" ) {
			$mime = "application/msword";
		}
		switch ( substr( $header, 512, 6 ) ) {
			case "\xEC\xA5\xC1\x00\x0E\x00":
			case "\xEC\xA5\xC1\x00\x1C\x00":
			case "\xEC\xA5\xC1\x00\x43\x00":
				$mime = "application/vnd.ms-powerpoint";
				break;
			case "\xFD\xFF\xFF\xFF\x10\x00":
			case "\xFD\xFF\xFF\xFF\x1F\x00":
			case "\xFD\xFF\xFF\xFF\x22\x00":
			case "\xFD\xFF\xFF\xFF\x23\x00":
			case "\xFD\xFF\xFF\xFF\x28\x00":
			case "\xFD\xFF\xFF\xFF\x29\x00":
			case "\xFD\xFF\xFF\xFF\x10\x02":
			case "\xFD\xFF\xFF\xFF\x1F\x02":
			case "\xFD\xFF\xFF\xFF\x22\x02":
			case "\xFD\xFF\xFF\xFF\x23\x02":
			case "\xFD\xFF\xFF\xFF\x28\x02":
			case "\xFD\xFF\xFF\xFF\x29\x02":
				$mime = "application/vnd.msexcel";
				break;
		}

		$this->logger->info( __METHOD__ .
			": detected a MS Office document with OPC trailer\n" );
	} else {
		$this->logger->info( __METHOD__ . ": unable to identify type of ZIP archive\n" );
	}
	return $mime;
}
973 
/**
 * Detect the type of an MS compound (OLE) file.
 *
 * Delegates to MSCompoundFileReader::readHandle() and reports
 * 'unknown/unknown' when the reader flags the file as invalid or does
 * not supply a MIME type.
 *
 * @param resource $handle Open handle of the file
 * @return string MIME type
 */
function detectMicrosoftBinaryType( $handle ) {
	$result = MSCompoundFileReader::readHandle( $handle );

	if ( $result['valid'] && $result['mime'] ) {
		return $result['mime'];
	}

	if ( !$result['valid'] ) {
		$this->logger->info( __METHOD__ . ': invalid file format' );
	} else {
		$this->logger->info( __METHOD__ . ": unrecognised document subtype" );
	}
	return 'unknown/unknown';
}
993 
/**
 * Fallback MIME detection, used when the built-in content checks fail.
 *
 * Asks the detect callback (or mime_content_type() when none is set) and
 * normalizes the answer. If that yields nothing usable, the file
 * extension is consulted, unless it belongs to a format that content
 * detection should have recognized.
 *
 * @param string $file Path of the file to examine
 * @param string|bool $ext Deprecated. True: take the extension from
 *  $file; false: ignore the extension; string: the extension to use.
 * @return string MIME type, 'unknown/unknown' if detection fails
 */
private function detectMimeType( $file, $ext = true ) {
	if ( $ext ) {
		$this->logger->info( __METHOD__ .
			": WARNING: use of the \$ext parameter is deprecated. "
			. "Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
	}

	$detector = $this->detectCallback;
	$found = $detector ? $detector( $file ) : mime_content_type( $file );

	if ( $found ) {
		# normalize: strip charset, etc
		$found = strtolower( trim( preg_replace( '![;, ].*$!', '', $found ) ) );

		if ( strpos( $found, 'unknown' ) === false ) {
			$this->logger->info( __METHOD__ . ": magic mime type of $file: $found\n" );
			return $found;
		}
	}

	// If desired, look at extension as a fallback.
	if ( $ext === true ) {
		$dot = strrpos( $file, '.' );
		$ext = strtolower( $dot ? substr( $file, $dot + 1 ) : '' );
	}

	if ( $ext ) {
		if ( !$this->isRecognizableExtension( $ext ) ) {
			$byExtension = $this->guessTypesForExtension( $ext );
			if ( $byExtension ) {
				$this->logger->info( __METHOD__ . ": extension mime type of $file: $byExtension\n" );
				return $byExtension;
			}
		} else {
			$this->logger->info( __METHOD__ . ": refusing to guess mime type for .$ext file, "
				. "we should have recognized it\n" );
		}
	}

	// Unknown type
	$this->logger->info( __METHOD__ . ": failed to guess mime type for $file!\n" );
	return 'unknown/unknown';
}
1063 
/**
 * Determine the media type (a MEDIATYPE_xxx constant) of a file.
 *
 * At least one of $path and $mime should be given. Lookup order: Ogg
 * content sniffing, the full MIME type, the file extension, the major
 * part of the MIME type.
 *
 * @param string|null $path Path of the file; used to guess the MIME type
 *  when $mime is not given, for Ogg sniffing, and for its extension
 * @param string|null $mime MIME type, if already known
 * @return string A MEDIATYPE_xxx constant; MEDIATYPE_UNKNOWN when nothing
 *  matches
 */
public function getMediaType( $path = null, $mime = null ) {
	if ( !$mime && !$path ) {
		return MEDIATYPE_UNKNOWN;
	}

	// If MIME type is unknown, guess it
	if ( !$mime ) {
		$mime = $this->guessMimeType( $path, false );
	}

	// Special code for ogg - detect if it's video (theora),
	// else label it as sound.
	if ( $mime == 'application/ogg' && $path !== null && file_exists( $path ) ) {
		// Read a chunk of the file; binary mode, the content is not text
		$f = fopen( $path, "rb" );
		if ( !$f ) {
			return MEDIATYPE_UNKNOWN;
		}
		$head = fread( $f, 256 );
		fclose( $f );

		// Drop the encoder name so that it does not count as a theora marker
		$head = str_replace( 'ffmpeg2theora', '', strtolower( $head ) );

		// This is an UGLY HACK, file should be parsed correctly
		if ( strpos( $head, 'theora' ) !== false ) {
			return MEDIATYPE_VIDEO;
		} elseif ( strpos( $head, 'vorbis' ) !== false ) {
			return MEDIATYPE_AUDIO;
		} elseif ( strpos( $head, 'flac' ) !== false ) {
			return MEDIATYPE_AUDIO;
		} elseif ( strpos( $head, 'speex' ) !== false ) {
			return MEDIATYPE_AUDIO;
		} elseif ( strpos( $head, 'opus' ) !== false ) {
			return MEDIATYPE_AUDIO;
		} else {
			return MEDIATYPE_MULTIMEDIA;
		}
	}

	$type = null;
	// Check for entry for full MIME type
	if ( $mime ) {
		$type = $this->findMediaType( $mime );
		if ( $type !== MEDIATYPE_UNKNOWN ) {
			return $type;
		}
	}

	// Check for entry for file extension
	if ( $path ) {
		$i = strrpos( $path, '.' );
		$e = strtolower( $i ? substr( $path, $i + 1 ) : '' );

		// TODO: look at multi-extension if this fails, parse from full path
		$type = $this->findMediaType( '.' . $e );
		if ( $type !== MEDIATYPE_UNKNOWN ) {
			return $type;
		}
	}

	// Check major MIME type
	if ( $mime ) {
		$i = strpos( $mime, '/' );
		if ( $i !== false ) {
			$major = substr( $mime, 0, $i );
			$type = $this->findMediaType( $major );
			if ( $type !== MEDIATYPE_UNKNOWN ) {
				return $type;
			}
		}
	}

	// Never hand back an empty value
	if ( !$type ) {
		$type = MEDIATYPE_UNKNOWN;
	}

	return $type;
}
1158 
/**
 * Find the media type for a MIME type or a file extension.
 *
 * @param string $extMime A MIME type, or a file extension prefixed with
 *  a dot. For an extension, each MIME type registered for it is tried in
 *  order; a MIME type is first resolved through the alias table.
 * @return string The first matching key of $mediaTypes, or
 *  MEDIATYPE_UNKNOWN if nothing matches
 */
function findMediaType( $extMime ) {
	if ( strpos( $extMime, '.' ) === 0 ) {
		// If it's an extension, look up the MIME types
		$registered = $this->getTypesForExtension( substr( $extMime, 1 ) );
		if ( !$registered ) {
			return MEDIATYPE_UNKNOWN;
		}

		$candidates = explode( ' ', $registered );
	} else {
		// Normalize MIME type
		$candidates = [ $this->mimeTypeAliases[$extMime] ?? $extMime ];
	}

	foreach ( $candidates as $candidate ) {
		foreach ( $this->mediaTypes as $mediaType => $members ) {
			if ( in_array( $candidate, $members, true ) ) {
				return $mediaType;
			}
		}
	}

	return MEDIATYPE_UNKNOWN;
}
1197 
/**
 * List the media types known to this analyzer.
 *
 * @return array The keys of the media type table built by loadFiles()
 */
public function getMediaTypes() {
	$known = array_keys( $this->mediaTypes );
	return $known;
}
1206 
/**
 * Ask the IE content analyzer which MIME types it derives from a chunk
 * of file content.
 *
 * All arguments are passed through unchanged to
 * IEContentAnalyzer::getRealMimesFromData().
 *
 * @param string $fileName
 * @param string $chunk
 * @param string $proposed
 * @return mixed Whatever getRealMimesFromData() returns
 */
public function getIEMimeTypes( $fileName, $chunk, $proposed ) {
	return $this->getIEContentAnalyzer()
		->getRealMimesFromData( $fileName, $chunk, $proposed );
}
1220 
/**
 * Get the IEContentAnalyzer, creating it on first use.
 *
 * @return IEContentAnalyzer
 */
protected function getIEContentAnalyzer() {
	if ( $this->IEAnalyzer === null ) {
		$this->IEAnalyzer = new IEContentAnalyzer;
	}

	return $this->IEAnalyzer;
}
1232 }
guessMimeType( $file, $ext=true)
MIME type detection.
setLogger(LoggerInterface $logger)
string $xmlTypes
either a plain
Definition: hooks.txt:2044
const MEDIATYPE_TEXT
Definition: defines.php:41
callable $initCallback
detectZipType( $header, $tail=null, $ext=false)
Detect application-specific file type of a given ZIP file from its header data.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Definition: router.php:42
div flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException' returning false will NOT prevent logging $e
Definition: hooks.txt:2158
getIEMimeTypes( $fileName, $chunk, $proposed)
Get the MIME types that various versions of Internet Explorer would detect from a chunk of the conten...
const MEDIATYPE_MULTIMEDIA
Definition: defines.php:37
addExtraTypes( $types)
Adds to the list mapping MIME to file extensions.
isPHPImageType( $mime)
Returns true if the MIME type is known to represent an image format supported by the PHP GD library...
IEContentAnalyzer $IEAnalyzer
__construct(array $params)
const MEDIATYPE_UNKNOWN
Definition: defines.php:26
This list may contain false positives That usually means there is additional text with links below the first Each row contains links to the first and second as well as the first line of the second redirect text
getMediaTypes()
Returns an array of media types (MEDIATYPE_xxx constants)
array $mediaTypes
Mapping of media types to arrays of MIME types.
static readHandle( $fileHandle)
Read from an open seekable handle.
improveTypeFromExtension( $mime, $ext)
Improves a MIME type using the file extension.
const MEDIATYPE_VIDEO
Definition: defines.php:35
callable $detectCallback
isMatchingExtension( $extension, $mime)
Tests if the extension matches the given MIME type.
callable $extCallback
This class simulates Microsoft Internet Explorer's terribly broken and insecure MIME type detection a...
Using a hook running we can avoid having all this option specific stuff in our mainline code Using the function We ve cleaned up the code here by removing clumps of infrequently used code and moving them off somewhere else It s much easier for someone working with this code to see what s _really_ going and make changes or fix bugs In we can take all the code that deals with the little used title reversing etc
Definition: hooks.txt:91
static $wellKnownInfo
Defines a set of well known MIME info entries. This is used as a fallback to mime.info files...
string $extraTypes
Extra MIME types, set for example by media handling extensions.
getMediaType( $path=null, $mime=null)
Determine the media type code for a file, using its MIME type, name and possibly its contents...
$params
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:773
getExtensionsForType( $mime)
Returns a list of file extensions for a given MIME type as a space separated string or null if the MI...
$header
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
LoggerInterface $logger
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
string $infoFile
$lines
Definition: router.php:61
addExtraInfo( $info)
Adds to the list mapping MIME to media type.
const MEDIATYPE_AUDIO
Definition: defines.php:32
array $mExtToMime
Map of file extensions to MIME types (as a space separated list)
array $mimeTypeAliases
Map of MIME type aliases.
array $mimetoExt
Map of MIME types to file extensions (as a space separated list)
string $typeFile
if(!is_readable( $file)) $ext
Definition: router.php:48
doGuessMimeType( $file, $ext)
Guess the MIME type from the file contents.
$f
Definition: router.php:79
getIEContentAnalyzer()
Get a cached instance of IEContentAnalyzer.
callable $guessCallback
guessTypesForExtension( $ext)
Returns a single MIME type for a given file extension or null if unknown.
static $wellKnownTypes
Defines a set of well known MIME types. This is used as a fallback to mime.types files.
isRecognizableExtension( $extension)
Returns true if the extension represents a type which can be reliably detected from its content...
detectMimeType( $file, $ext=true)
Internal MIME type detection.
findMediaType( $extMime)
Returns a media code matching the given MIME type or file extension.
string $extraInfo
Extra MIME info, set for example by media handling extensions.
$matches
getTypesForExtension( $ext)
Returns a list of MIME types for a given file extension as a space separated string or null if the ex...
detectMicrosoftBinaryType( $handle)
Detect the type of a Microsoft Compound Binary a.k.a.