MediaWiki  1.34.0
IEContentAnalyzer.php
Go to the documentation of this file.
1 <?php
/**
 * Relevant data taken from the type table in IE 5.
 *
 * Maps a format name ('ambiguous', 'text', 'binary', 'html') to the list of
 * MIME types belonging to that format. getDataFormat() searches these lists
 * (after the constructor has expanded them per version) to classify a type.
 * NOTE(review): the numeric comments beside each key are presumably IE's
 * internal format codes - confirm before relying on them.
 */
31  protected $baseTypeTable = [
32  'ambiguous' /*1*/ => [
33  'text/plain',
34  'application/octet-stream',
35  'application/x-netcdf', // [sic]
36  ],
37  'text' /*3*/ => [
38  'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
39  'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
40  ],
41  'binary' /*4*/ => [
42  'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
43  'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
44  'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
45  'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
46  'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
47  'application/x-msdownload'
48  ],
49  'html' /*5*/ => [ 'text/html' ],
50  ];
51 
/**
 * Changes to the type table in later versions of IE.
 *
 * Keyed by version name, then by format name; each leaf is a list of MIME
 * types that the constructor appends to that format from that version on.
 */
55  protected $addedTypes = [
56  'ie07' => [
57  'text' => [ 'text/xml', 'application/xml' ]
58  ],
59  ];
60 
/**
 * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
 * typical Windows installation.
 *
 * Maps a file extension (including the leading dot) to a MIME type.
 * getMimeTypeForVersion() consults it as a last resort, when content
 * sniffing only produced an ambiguous type.
 */
67  protected $registry = [
68  '.323' => 'text/h323',
69  '.3g2' => 'video/3gpp2',
70  '.3gp' => 'video/3gpp',
71  '.3gp2' => 'video/3gpp2',
72  '.3gpp' => 'video/3gpp',
73  '.aac' => 'audio/aac',
74  '.ac3' => 'audio/ac3',
75  '.accda' => 'application/msaccess',
76  '.accdb' => 'application/msaccess',
77  '.accdc' => 'application/msaccess',
78  '.accde' => 'application/msaccess',
79  '.accdr' => 'application/msaccess',
80  '.accdt' => 'application/msaccess',
81  '.ade' => 'application/msaccess',
82  '.adp' => 'application/msaccess',
83  '.adts' => 'audio/aac',
84  '.ai' => 'application/postscript',
85  '.aif' => 'audio/aiff',
86  '.aifc' => 'audio/aiff',
87  '.aiff' => 'audio/aiff',
88  '.amc' => 'application/x-mpeg',
89  '.application' => 'application/x-ms-application',
90  '.asf' => 'video/x-ms-asf',
91  '.asx' => 'video/x-ms-asf',
92  '.au' => 'audio/basic',
93  '.avi' => 'video/avi',
94  '.bmp' => 'image/bmp',
95  '.caf' => 'audio/x-caf',
96  '.cat' => 'application/vnd.ms-pki.seccat',
97  '.cbo' => 'application/sha',
98  '.cdda' => 'audio/aiff',
99  '.cer' => 'application/x-x509-ca-cert',
100  '.conf' => 'text/plain',
101  '.crl' => 'application/pkix-crl',
102  '.crt' => 'application/x-x509-ca-cert',
103  '.css' => 'text/css',
104  '.csv' => 'application/vnd.ms-excel',
105  '.der' => 'application/x-x509-ca-cert',
106  '.dib' => 'image/bmp',
107  '.dif' => 'video/x-dv',
108  '.dll' => 'application/x-msdownload',
109  '.doc' => 'application/msword',
110  '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
111  '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
112  '.dot' => 'application/msword',
113  '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
114  '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
115  '.dv' => 'video/x-dv',
116  '.dwfx' => 'model/vnd.dwfx+xps',
117  '.edn' => 'application/vnd.adobe.edn',
118  '.eml' => 'message/rfc822',
119  '.eps' => 'application/postscript',
120  '.etd' => 'application/x-ebx',
121  '.exe' => 'application/x-msdownload',
122  '.fdf' => 'application/vnd.fdf',
123  '.fif' => 'application/fractals',
124  '.gif' => 'image/gif',
125  '.gsm' => 'audio/x-gsm',
126  '.hqx' => 'application/mac-binhex40',
127  '.hta' => 'application/hta',
128  '.htc' => 'text/x-component',
129  '.htm' => 'text/html',
130  '.html' => 'text/html',
131  '.htt' => 'text/webviewhtml',
132  '.hxa' => 'application/xml',
133  '.hxc' => 'application/xml',
134  '.hxd' => 'application/octet-stream',
135  '.hxe' => 'application/xml',
136  '.hxf' => 'application/xml',
137  '.hxh' => 'application/octet-stream',
138  '.hxi' => 'application/octet-stream',
139  '.hxk' => 'application/xml',
140  '.hxq' => 'application/octet-stream',
141  '.hxr' => 'application/octet-stream',
142  '.hxs' => 'application/octet-stream',
143  '.hxt' => 'application/xml',
144  '.hxv' => 'application/xml',
145  '.hxw' => 'application/octet-stream',
146  '.ico' => 'image/x-icon',
147  '.iii' => 'application/x-iphone',
148  '.ins' => 'application/x-internet-signup',
149  '.iqy' => 'text/x-ms-iqy',
150  '.isp' => 'application/x-internet-signup',
151  '.jfif' => 'image/jpeg',
152  '.jnlp' => 'application/x-java-jnlp-file',
153  '.jpe' => 'image/jpeg',
154  '.jpeg' => 'image/jpeg',
155  '.jpg' => 'image/jpeg',
156  '.jtx' => 'application/x-jtx+xps',
157  '.latex' => 'application/x-latex',
158  '.log' => 'text/plain',
159  '.m1v' => 'video/mpeg',
160  '.m2v' => 'video/mpeg',
161  '.m3u' => 'audio/x-mpegurl',
162  '.mac' => 'image/x-macpaint',
163  '.man' => 'application/x-troff-man',
164  '.mda' => 'application/msaccess',
165  '.mdb' => 'application/msaccess',
166  '.mde' => 'application/msaccess',
167  '.mfp' => 'application/x-shockwave-flash',
168  '.mht' => 'message/rfc822',
169  '.mhtml' => 'message/rfc822',
170  '.mid' => 'audio/mid',
171  '.midi' => 'audio/mid',
172  '.mod' => 'video/mpeg',
173  '.mov' => 'video/quicktime',
174  '.mp2' => 'video/mpeg',
175  '.mp2v' => 'video/mpeg',
176  '.mp3' => 'audio/mpeg',
177  '.mp4' => 'video/mp4',
178  '.mpa' => 'video/mpeg',
179  '.mpe' => 'video/mpeg',
180  '.mpeg' => 'video/mpeg',
181  '.mpf' => 'application/vnd.ms-mediapackage',
182  '.mpg' => 'video/mpeg',
183  '.mpv2' => 'video/mpeg',
184  '.mqv' => 'video/quicktime',
185  '.NMW' => 'application/nmwb',
186  '.nws' => 'message/rfc822',
187  '.odc' => 'text/x-ms-odc',
188  '.ols' => 'application/vnd.ms-publisher',
189  '.p10' => 'application/pkcs10',
190  '.p12' => 'application/x-pkcs12',
191  '.p7b' => 'application/x-pkcs7-certificates',
192  '.p7c' => 'application/pkcs7-mime',
193  '.p7m' => 'application/pkcs7-mime',
194  '.p7r' => 'application/x-pkcs7-certreqresp',
195  '.p7s' => 'application/pkcs7-signature',
196  '.pct' => 'image/pict',
197  '.pdf' => 'application/pdf',
198  '.pdx' => 'application/vnd.adobe.pdx',
199  '.pfx' => 'application/x-pkcs12',
200  '.pic' => 'image/pict',
201  '.pict' => 'image/pict',
202  '.pinstall' => 'application/x-picasa-detect',
203  '.pko' => 'application/vnd.ms-pki.pko',
204  '.png' => 'image/png',
205  '.pnt' => 'image/x-macpaint',
206  '.pntg' => 'image/x-macpaint',
207  '.pot' => 'application/vnd.ms-powerpoint',
208  '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
209  '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
210  '.ppa' => 'application/vnd.ms-powerpoint',
211  '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
212  '.pps' => 'application/vnd.ms-powerpoint',
213  '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
214  '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
215  '.ppt' => 'application/vnd.ms-powerpoint',
216  '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
217  '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
218  '.prf' => 'application/pics-rules',
219  '.ps' => 'application/postscript',
220  '.pub' => 'application/vnd.ms-publisher',
221  '.pwz' => 'application/vnd.ms-powerpoint',
222  '.py' => 'text/plain',
223  '.pyw' => 'text/plain',
224  '.qht' => 'text/x-html-insertion',
225  '.qhtm' => 'text/x-html-insertion',
226  '.qt' => 'video/quicktime',
227  '.qti' => 'image/x-quicktime',
228  '.qtif' => 'image/x-quicktime',
229  '.qtl' => 'application/x-quicktimeplayer',
230  '.rat' => 'application/rat-file',
231  '.rmf' => 'application/vnd.adobe.rmf',
232  '.rmi' => 'audio/mid',
233  '.rqy' => 'text/x-ms-rqy',
234  '.rtf' => 'application/msword',
235  '.sct' => 'text/scriptlet',
236  '.sd2' => 'audio/x-sd2',
237  '.sdp' => 'application/sdp',
238  '.shtml' => 'text/html',
239  '.sit' => 'application/x-stuffit',
240  '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
241  '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
242  '.slk' => 'application/vnd.ms-excel',
243  '.snd' => 'audio/basic',
244  '.so' => 'application/x-apachemodule',
245  '.sol' => 'text/plain',
246  '.sor' => 'text/plain',
247  '.spc' => 'application/x-pkcs7-certificates',
248  '.spl' => 'application/futuresplash',
249  '.sst' => 'application/vnd.ms-pki.certstore',
250  '.stl' => 'application/vnd.ms-pki.stl',
251  '.swf' => 'application/x-shockwave-flash',
252  '.thmx' => 'application/vnd.ms-officetheme',
253  '.tif' => 'image/tiff',
254  '.tiff' => 'image/tiff',
255  '.txt' => 'text/plain',
256  '.uls' => 'text/iuls',
257  '.vcf' => 'text/x-vcard',
258  '.vdx' => 'application/vnd.ms-visio.viewer',
259  '.vsd' => 'application/vnd.ms-visio.viewer',
260  '.vss' => 'application/vnd.ms-visio.viewer',
261  '.vst' => 'application/vnd.ms-visio.viewer',
262  '.vsx' => 'application/vnd.ms-visio.viewer',
263  '.vtx' => 'application/vnd.ms-visio.viewer',
264  '.wav' => 'audio/wav',
265  '.wax' => 'audio/x-ms-wax',
266  '.wbk' => 'application/msword',
267  '.wdp' => 'image/vnd.ms-photo',
268  '.wiz' => 'application/msword',
269  '.wm' => 'video/x-ms-wm',
270  '.wma' => 'audio/x-ms-wma',
271  '.wmd' => 'application/x-ms-wmd',
272  '.wmv' => 'video/x-ms-wmv',
273  '.wmx' => 'video/x-ms-wmx',
274  '.wmz' => 'application/x-ms-wmz',
275  '.wpl' => 'application/vnd.ms-wpl',
276  '.wsc' => 'text/scriptlet',
277  '.wvx' => 'video/x-ms-wvx',
278  '.xaml' => 'application/xaml+xml',
279  '.xbap' => 'application/x-ms-xbap',
280  '.xdp' => 'application/vnd.adobe.xdp+xml',
281  '.xfdf' => 'application/vnd.adobe.xfdf',
282  '.xht' => 'application/xhtml+xml',
283  '.xhtml' => 'application/xhtml+xml',
284  '.xla' => 'application/vnd.ms-excel',
285  '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
286  '.xlk' => 'application/vnd.ms-excel',
287  '.xll' => 'application/vnd.ms-excel',
288  '.xlm' => 'application/vnd.ms-excel',
289  '.xls' => 'application/vnd.ms-excel',
290  '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
291  '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
292  '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
293  '.xlt' => 'application/vnd.ms-excel',
294  '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
295  '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
296  '.xlw' => 'application/vnd.ms-excel',
297  '.xml' => 'text/xml',
298  '.xps' => 'application/vnd.ms-xpsdocument',
299  '.xsl' => 'text/xml',
300  ];
301 
/**
 * Names of the IE versions (and version variants) this class simulates.
 *
 * The constructor builds one type table per entry, in this order, and
 * getMimesFromData() returns one result per entry. Order matters: version
 * names are compared as strings against 'ie07', and type additions made for
 * one entry carry over to every entry after it.
 */
307  protected $versions = [ 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' ];
308 
/**
 * Type table with versions expanded.
 *
 * Filled in by the constructor: version name => format name => list of
 * MIME types. Read by getDataFormat().
 */
312  protected $typeTable = [];
313 
/**
 * Construct versioned type arrays from the base type array plus additions.
 *
 * Walks $versions in order, starting from $baseTypeTable. Additions listed
 * in $addedTypes for a version are appended to the running table, so they
 * also apply to every later version. Each snapshot is stored in
 * $typeTable under the version name.
 */
public function __construct() {
	$types = $this->baseTypeTable;
	foreach ( $this->versions as $version ) {
		$additions = isset( $this->addedTypes[$version] )
			? $this->addedTypes[$version]
			: [];
		foreach ( $additions as $format => $addedTypes ) {
			// A format introduced only by $addedTypes has no base list yet;
			// start from an empty one instead of merging into an undefined index.
			$existing = isset( $types[$format] ) ? $types[$format] : [];
			$types[$format] = array_merge( $existing, $addedTypes );
		}
		$this->typeTable[$version] = $types;
	}
}
326 
/**
 * Get the MIME types from getMimesFromData(), but with each result run
 * through translateMimeType() so that IE's private type names are replaced
 * by their more common equivalents.
 *
 * @param string $fileName Forwarded unchanged to getMimesFromData()
 * @param string $chunk Forwarded unchanged to getMimesFromData()
 * @param string $proposed Forwarded unchanged to getMimesFromData()
 * @return array Translated MIME type for each version, keyed as returned by getMimesFromData()
 */
public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
	$translated = [];
	foreach ( $this->getMimesFromData( $fileName, $chunk, $proposed ) as $version => $mime ) {
		$translated[$version] = $this->translateMimeType( $mime );
	}
	return $translated;
}
342 
/**
 * Translate a MIME type from IE's idiosyncratic private types into a more
 * commonly understood type string.
 *
 * @param string $type MIME type as reported by the detection code
 * @return string The mapped type, or $type unchanged when there is no mapping for it
 */
public function translateMimeType( $type ) {
	$ieToCommon = [
		'image/pjpeg' => 'image/jpeg',
		'image/x-png' => 'image/png',
		'image/x-wmf' => 'application/x-msmetafile',
		'image/bmp' => 'image/x-bmp',
		'application/x-zip-compressed' => 'application/zip',
		'application/x-compressed' => 'application/x-compress',
		'application/x-gzip-compressed' => 'application/x-gzip',
		'audio/mid' => 'audio/midi',
	];
	return isset( $ieToCommon[$type] ) ? $ieToCommon[$type] : $type;
}
365 
/**
 * Get the untranslated MIME types for all known versions.
 *
 * @param string $fileName Forwarded to getMimeTypeForVersion()
 * @param string $chunk Forwarded to getMimeTypeForVersion()
 * @param string $proposed Forwarded to getMimeTypeForVersion()
 * @return array Detected MIME type for each entry of $versions, keyed by version name
 */
public function getMimesFromData( $fileName, $chunk, $proposed ) {
	$detect = function ( $version ) use ( $fileName, $chunk, $proposed ) {
		return $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
	};
	// Pair every version name with the result computed for it
	return array_combine( $this->versions, array_map( $detect, $this->versions ) );
}
382 
/**
 * Get the MIME type for a given named version.
 *
 * The steps, in order:
 *  - drop any parameters after a semicolon in $proposed;
 *  - return $proposed as-is if its format is unknown (except for the two
 *    multipart types) or if there is no data to inspect;
 *  - inspect at most the first 255 bytes of $chunk with sampleData(),
 *    checkBinaryHeaders() and checkTextHeaders();
 *  - apply the version-dependent rules below to pick a type;
 *  - if the result is still an "ambiguous" type, fall back on the proposed
 *    type when its format agrees with the data, then on the file extension.
 *
 * @param string $version One of the names listed in $versions
 * @param string $fileName File name; only the part from the last dot on is used
 * @param string $chunk Data from the start of the file
 * @param string $proposed Proposed MIME type, optionally followed by ";parameters"
 * @return string Detected MIME type, in IE's own (untranslated) naming
 */
protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
	// Strip text after a semicolon
	$semiPos = strpos( $proposed, ';' );
	if ( $semiPos !== false ) {
		$proposed = substr( $proposed, 0, $semiPos );
	}

	$proposedFormat = $this->getDataFormat( $version, $proposed );
	if ( $proposedFormat == 'unknown'
		&& $proposed != 'multipart/mixed'
		&& $proposed != 'multipart/x-mixed-replace'
	) {
		return $proposed;
	}
	if ( strval( $chunk ) === '' ) {
		return $proposed;
	}

	// Truncate chunk at 255 bytes
	$chunk = substr( $chunk, 0, 255 );

	// IE does the Check*Headers() calls last, and instead does the following image
	// type checks by directly looking for the magic numbers. What I do here should
	// have the same effect since the magic number checks are identical in both cases.
	$result = $this->sampleData( $version, $chunk );
	$sampleFound = $result['found'];
	$counters = $result['counters'];
	$binaryType = $this->checkBinaryHeaders( $version, $chunk );
	$textType = $this->checkTextHeaders( $version, $chunk );

	if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
		return 'text/html';
	}
	if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
		return 'image/gif';
	}
	if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
		&& $binaryType == 'image/pjpeg'
	) {
		return $proposed;
	}
	// PNG check added in IE 7
	// (version names are compared as strings; every 'ie07*' name sorts >= 'ie07')
	if ( $version >= 'ie07'
		&& ( $proposed == 'image/x-png' || $proposed == 'image/png' )
		&& $binaryType == 'image/x-png'
	) {
		return $proposed;
	}

	// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
	if ( isset( $sampleFound['cdf'] ) ) {
		return 'application/x-cdf';
	}

	// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
	// previous versions
	if ( isset( $sampleFound['rss'] ) ) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['rdf-tag'] )
		&& isset( $sampleFound['rdf-url'] )
		&& isset( $sampleFound['rdf-purl'] )
	) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['atom'] ) ) {
		return 'application/atom+xml';
	}

	if ( isset( $sampleFound['xml'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
				return 'text/xml';
			}
		} else {
			return 'text/xml';
		}
	}
	if ( isset( $sampleFound['html'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'nohtml' ) !== false ) {
			if ( $proposed == 'text/plain' ) {
				return 'text/html';
			}
		} else {
			return 'text/html';
		}
	}
	if ( isset( $sampleFound['xbm'] ) ) {
		return 'image/x-bitmap';
	}
	if ( isset( $sampleFound['binhex'] ) ) {
		return 'application/macbinhex40';
	}
	if ( isset( $sampleFound['scriptlet'] ) ) {
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
				return 'text/scriptlet';
			}
		} else {
			return 'text/scriptlet';
		}
	}

	// Freaky heuristics to determine if the data is text or binary
	// The heuristic is of course broken for non-ASCII text
	if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
		< ( $counters['ctrl'] + $counters['high'] ) * 16
	) {
		$kindOfBinary = true;
		$type = $binaryType ?: $textType;
		if ( $type === false ) {
			$type = 'application/octet-stream';
		}
	} else {
		$kindOfBinary = false;
		$type = $textType ?: $binaryType;
		if ( $type === false ) {
			$type = 'text/plain';
		}
	}

	// Check if the output format is ambiguous
	// This generally means that detection failed, real types aren't ambiguous
	$detectedFormat = $this->getDataFormat( $version, $type );
	if ( $detectedFormat != 'ambiguous' ) {
		return $type;
	}

	if ( $proposedFormat != 'ambiguous' ) {
		// FormatAgreesWithData()
		if ( $proposedFormat == 'text' && !$kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'binary' && $kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'html' ) {
			return $proposed;
		}
	}

	// Find a MIME type by searching the registry for the file extension.
	$dotPos = strrpos( $fileName, '.' );
	if ( $dotPos === false ) {
		return $type;
	}
	// Windows registry key names are not case sensitive, so "FOO.HTML" has to
	// resolve exactly like "foo.html". Fold both the extension and the table
	// keys to lower case before looking the extension up.
	$ext = strtolower( substr( $fileName, $dotPos ) );
	$registry = array_change_key_case( $this->registry, CASE_LOWER );
	if ( isset( $registry[$ext] ) ) {
		return $registry[$ext];
	}

	// TODO: If the extension has an application registered to it, IE will return
	// application/octet-stream. We'll skip that, so we could erroneously
	// return text/plain or application/x-netcdf where application/octet-stream
	// would be correct.

	return $type;
}
551 
/**
 * Check for text headers at the start of the chunk.
 * Confirmed same in 5 and 7.
 *
 * @param string $version Not used by this check
 * @param string $chunk Data sample
 * @return string|bool MIME type for the first matching signature, or false if none matches
 */
private function checkTextHeaders( $version, $chunk ) {
	// Leading byte sequence => MIME type reported for it
	$signatures = [
		'%PDF' => 'application/pdf',
		'%!' => 'application/postscript',
		'{\\rtf' => 'text/richtext',
		'begin' => 'application/base64',
	];
	foreach ( $signatures as $magic => $mime ) {
		if ( strncmp( $chunk, $magic, strlen( $magic ) ) === 0 ) {
			return $mime;
		}
	}
	return false;
}
577 
/**
 * Check for binary headers at the start of the chunk.
 * Confirmed same in 5 and 7.
 *
 * The checks run in a fixed order and the first one that matches wins.
 *
 * @param string $version Not used by this check
 * @param string $chunk Data sample
 * @return string|bool MIME type (IE's naming) for the first matching signature, or false
 */
private function checkBinaryHeaders( $version, $chunk ) {
	$head2 = substr( $chunk, 0, 2 );
	$head4 = substr( $chunk, 0, 4 );
	// Four bytes at offset 8, the form-type field of RIFF/FORM containers
	$formType = substr( $chunk, 8, 4 );

	// GIF signature is matched without regard to case
	if ( in_array( strtoupper( substr( $chunk, 0, 5 ) ), [ 'GIF87', 'GIF89' ], true ) ) {
		return 'image/gif';
	}
	if ( $head2 === "\xff\xd8" ) {
		// actually plain JPEG but this is what IE returns
		return 'image/pjpeg';
	}
	// 'BM' followed, at offsets 6..9, by four zero bytes
	if ( $head2 === 'BM' && substr( $chunk, 6, 4 ) === "\000\000\000\000" ) {
		// another non-standard MIME
		return 'image/bmp';
	}
	if ( $head4 === 'RIFF' && $formType === 'WAVE' ) {
		return 'audio/wav';
	}
	// These were integer literals in IE
	// Perhaps the author was not sure what the target endianness was
	if ( in_array( $head4, [ ".sd\000", ".snd", "\000ds.", "dns." ], true ) ) {
		return 'audio/basic';
	}
	if ( substr( $chunk, 0, 3 ) === "MM\000" ) {
		return 'image/tiff';
	}
	if ( $head2 === 'MZ' ) {
		return 'application/x-msdownload';
	}
	if ( substr( $chunk, 0, 8 ) === "\x89PNG\x0d\x0a\x1a\x0a" ) {
		// [sic]
		return 'image/x-png';
	}
	if ( $head2 === 'JG' && strlen( $chunk ) >= 5 ) {
		$third = ord( $chunk[2] );
		if ( $third >= 3 && $third <= 31 && ord( $chunk[4] ) == 0 ) {
			return 'image/x-jg';
		}
	}
	// More endian confusion?
	if ( $head4 === 'MROF' ) {
		return 'audio/x-aiff';
	}
	if ( $head4 === 'FORM' && ( $formType === 'AIFF' || $formType === 'AIFC' ) ) {
		return 'audio/x-aiff';
	}
	if ( $head4 === 'RIFF' && $formType === 'AVI ' ) {
		return 'video/avi';
	}
	if ( $head4 === "\x00\x00\x01\xb3" || $head4 === "\x00\x00\x01\xba" ) {
		return 'video/mpeg';
	}
	if ( $head4 === "\001\000\000\000" && substr( $chunk, 40, 4 ) === ' EMF' ) {
		return 'image/x-emf';
	}

	// Remaining checks are plain leading-byte signatures, tried in this order.
	// (A second, redundant check for ZIP is skipped.)
	$signatures = [
		[ "\xd7\xcd\xc6\x9a", 'image/x-wmf' ],
		[ "\xca\xfe\xba\xbe", 'application/java' ],
		[ 'PK', 'application/x-zip-compressed' ],
		[ "\x1f\x9d", 'application/x-compressed' ],
		[ "\x1f\x8b", 'application/x-gzip-compressed' ],
		[ "MThd\000", 'audio/mid' ],
		[ '%PDF', 'application/pdf' ],
	];
	foreach ( $signatures as $entry ) {
		if ( strncmp( $chunk, $entry[0], strlen( $entry[0] ) ) === 0 ) {
			return $entry[1];
		}
	}
	return false;
}
678 
/**
 * Do heuristic checks on the bulk of the data sample.
 *
 * Scans $chunk one byte at a time. Every byte is tallied into exactly one
 * counter:
 *  - 'lf', 'cr', 'ff': line feed, carriage return, form feed;
 *  - 'ctrl': any other byte below 32, except tab;
 *  - 'high': bytes of 128 and above;
 *  - 'low': tab and everything else.
 * Bytes counted as 'low' (other than tab) are also checked as the possible
 * start of a marker: XML/HTML/scriptlet/feed tags after a "<", the two RDF
 * namespace URLs, the XBM "#define ... _width ... _bits" sequence, and the
 * BinHex banner. Some markers end the scan immediately.
 *
 * @param string $version Version name; decides between the pre-IE 7 CHANNEL
 *   check and the IE 7 RSS/RDF/FEED checks
 * @param string $chunk Data sample
 * @return array Two keys: 'found' (marker name => true for each marker seen)
 *   and 'counters' (counter name => number of bytes)
 */
protected function sampleData( $version, $chunk ) {
	$found = [];
	$counters = [
		'ctrl' => 0,
		'high' => 0,
		'low' => 0,
		'lf' => 0,
		'cr' => 0,
		'ff' => 0
	];
	$htmlTags = [
		'html',
		'head',
		'title',
		'body',
		'script',
		'a href',
		'pre',
		'img',
		'plaintext',
		'table'
	];
	$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
	$rdfPurl = 'http://purl.org/rss/1.0/';
	$xbmMagic1 = '#define';
	$xbmMagic2 = '_width';
	$xbmMagic3 = '_bits';
	$binhexMagic = 'converted with BinHex';
	$chunkLength = strlen( $chunk );

	for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
		$curChar = $chunk[$offset];
		if ( $curChar == "\x0a" ) {
			$counters['lf']++;
			continue;
		} elseif ( $curChar == "\x0d" ) {
			$counters['cr']++;
			continue;
		} elseif ( $curChar == "\x0c" ) {
			$counters['ff']++;
			continue;
		} elseif ( $curChar == "\t" ) {
			$counters['low']++;
			continue;
		} elseif ( ord( $curChar ) < 32 ) {
			$counters['ctrl']++;
			continue;
		} elseif ( ord( $curChar ) >= 128 ) {
			$counters['high']++;
			continue;
		}

		$counters['low']++;
		if ( $curChar == '<' ) {
			// Everything after the "<" is matched against known tag names
			// XML
			$remainder = substr( $chunk, $offset + 1 );
			if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
				$nextChar = substr( $chunk, $offset + 5, 1 );
				if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
					$found['xml'] = true;
				}
			}
			// Scriptlet (JSP)
			if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
				$found['scriptlet'] = true;
				break;
			}
			// HTML
			foreach ( $htmlTags as $tag ) {
				if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
					$found['html'] = true;
				}
			}
			// Skip broken check for additional tags (HR etc.)

			// CHANNEL replaced by RSS, RDF and FEED in IE 7
			if ( $version < 'ie07' ) {
				if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
					$found['cdf'] = true;
				}
			} else {
				// RSS
				if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
					$found['rss'] = true;
					break; // return from SampleData
				}
				if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
					$found['rdf-tag'] = true;
					// no break
				}
				if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
					$found['atom'] = true;
					break;
				}
			}
			continue;
		}
		// Skip broken check for -->

		// RSS URL checks
		// For some reason both URLs must appear before it is recognised
		$remainder = substr( $chunk, $offset );
		if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
			$found['rdf-url'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-purl'] ) // [sic]
			) {
				break;
			}
			continue;
		}

		if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
			// Record the second namespace URL. Without this flag the caller's
			// test for rdf-tag + rdf-url + rdf-purl can never succeed.
			$found['rdf-purl'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-url'] ) // [sic]
			) {
				break;
			}
			continue;
		}

		// XBM checks: "#define", then "_width", then "_bits", in that order
		if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
			$found['xbm1'] = true;
			continue;
		}
		if ( $curChar == '_' ) {
			if ( isset( $found['xbm2'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
					$found['xbm'] = true;
					break;
				}
			} elseif ( isset( $found['xbm1'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
					$found['xbm2'] = true;
				}
			}
		}

		// BinHex (this one is matched case-sensitively)
		if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
			$found['binhex'] = true;
		}
	}
	return [ 'found' => $found, 'counters' => $counters ];
}
832 
/**
 * Classify a MIME type according to the type table of the given version.
 *
 * @param string $version Version name, used as a key into $typeTable
 * @param string $type MIME type to classify
 * @return string 'ambiguous' for an empty type or the literal '(null)';
 *   otherwise the name of the first format whose list contains $type, or
 *   'unknown' when no list contains it
 */
protected function getDataFormat( $version, $type ) {
	$formatLists = $this->typeTable[$version];
	if ( $type == '(null)' || strval( $type ) === '' ) {
		return 'ambiguous';
	}

	$format = 'unknown';
	foreach ( $formatLists as $candidate => $members ) {
		if ( in_array( $type, $members ) ) {
			$format = $candidate;
			break;
		}
	}
	return $format;
}
850 }
IEContentAnalyzer\checkBinaryHeaders
checkBinaryHeaders( $version, $chunk)
Check for binary headers at the start of the chunk. Confirmed same in 5 and 7.
Definition: IEContentAnalyzer.php:585
IEContentAnalyzer\sampleData
sampleData( $version, $chunk)
Do heuristic checks on the bulk of the data sample.
Definition: IEContentAnalyzer.php:686
IEContentAnalyzer\$addedTypes
$addedTypes
Changes to the type table in later versions of IE.
Definition: IEContentAnalyzer.php:55
IEContentAnalyzer\getRealMimesFromData
getRealMimesFromData( $fileName, $chunk, $proposed)
Get the MIME types from getMimesFromData(), but convert the result from IE's idiosyncratic private ty...
Definition: IEContentAnalyzer.php:337
IEContentAnalyzer\$typeTable
$typeTable
Type table with versions expanded.
Definition: IEContentAnalyzer.php:312
IEContentAnalyzer\getMimesFromData
getMimesFromData( $fileName, $chunk, $proposed)
Get the untranslated MIME types for all known versions.
Definition: IEContentAnalyzer.php:375
IEContentAnalyzer\$baseTypeTable
$baseTypeTable
Relevant data taken from the type table in IE 5.
Definition: IEContentAnalyzer.php:31
IEContentAnalyzer\$registry
$registry
An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a typical Windows installation.
Definition: IEContentAnalyzer.php:67
IEContentAnalyzer\checkTextHeaders
checkTextHeaders( $version, $chunk)
Check for text headers at the start of the chunk. Confirmed same in 5 and 7.
Definition: IEContentAnalyzer.php:559
IEContentAnalyzer
This class simulates Microsoft Internet Explorer's terribly broken and insecure MIME type detection a...
Definition: IEContentAnalyzer.php:27
IEContentAnalyzer\getMimeTypeForVersion
getMimeTypeForVersion( $version, $fileName, $chunk, $proposed)
Get the MIME type for a given named version.
Definition: IEContentAnalyzer.php:391
IEContentAnalyzer\translateMimeType
translateMimeType( $type)
Translate a MIME type from IE's idiosyncratic private types into more commonly understood type string...
Definition: IEContentAnalyzer.php:349
IEContentAnalyzer\getDataFormat
getDataFormat( $version, $type)
Definition: IEContentAnalyzer.php:838
$ext
if(!is_readable( $file)) $ext
Definition: router.php:48
IEContentAnalyzer\__construct
__construct()
Definition: IEContentAnalyzer.php:314
IEContentAnalyzer\$versions
$versions
IE versions which have been analysed to bring you this class, and for which some substantive differen...
Definition: IEContentAnalyzer.php:307
$type
$type
Definition: testCompression.php:48