MediaWiki REL1_34
IEContentAnalyzer.php
Go to the documentation of this file.
1<?php
/**
 * Relevant data taken from the type table in IE 5.
 *
 * Maps a format class name to the MIME types that belong to it. The
 * number in the comment beside each class is the annotation carried
 * over from the original table.
 */
protected $baseTypeTable = [
	'ambiguous' /*1*/ => [
		'text/plain',
		'application/octet-stream',
		'application/x-netcdf', // [sic]
	],
	'text' /*3*/ => [
		'text/richtext',
		'image/x-bitmap',
		'application/postscript',
		'application/base64',
		'application/macbinhex40',
		'application/x-cdf',
		'text/scriptlet',
	],
	'binary' /*4*/ => [
		'application/pdf',
		'audio/x-aiff',
		'audio/basic',
		'audio/wav',
		'image/gif',
		'image/pjpeg',
		'image/jpeg',
		'image/tiff',
		'image/x-png',
		'image/png',
		'image/bmp',
		'image/x-jg',
		'image/x-art',
		'image/x-emf',
		'image/x-wmf',
		'video/avi',
		'video/x-msvideo',
		'video/mpeg',
		'application/x-compressed',
		'application/x-zip-compressed',
		'application/x-gzip-compressed',
		'application/java',
		'application/x-msdownload',
	],
	'html' /*5*/ => [
		'text/html',
	],
];
51
/**
 * Changes to the type table in later versions of IE.
 *
 * Keyed by version name, then by format class. The constructor appends
 * the listed MIME types to that class from the named version onwards.
 */
protected $addedTypes = [
	'ie07' => [
		'text' => [
			'text/xml',
			'application/xml',
		],
	],
];
60
/**
 * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
 * typical Windows installation.
 *
 * Keys are file extensions including the leading dot; values are MIME type
 * strings. getMimeTypeForVersion() consults this table as a last resort,
 * when content sniffing is inconclusive and the proposed type does not
 * settle the matter.
 */
67 protected $registry = [
68 '.323' => 'text/h323',
69 '.3g2' => 'video/3gpp2',
70 '.3gp' => 'video/3gpp',
71 '.3gp2' => 'video/3gpp2',
72 '.3gpp' => 'video/3gpp',
73 '.aac' => 'audio/aac',
74 '.ac3' => 'audio/ac3',
75 '.accda' => 'application/msaccess',
76 '.accdb' => 'application/msaccess',
77 '.accdc' => 'application/msaccess',
78 '.accde' => 'application/msaccess',
79 '.accdr' => 'application/msaccess',
80 '.accdt' => 'application/msaccess',
81 '.ade' => 'application/msaccess',
82 '.adp' => 'application/msaccess',
83 '.adts' => 'audio/aac',
84 '.ai' => 'application/postscript',
85 '.aif' => 'audio/aiff',
86 '.aifc' => 'audio/aiff',
87 '.aiff' => 'audio/aiff',
88 '.amc' => 'application/x-mpeg',
89 '.application' => 'application/x-ms-application',
90 '.asf' => 'video/x-ms-asf',
91 '.asx' => 'video/x-ms-asf',
92 '.au' => 'audio/basic',
93 '.avi' => 'video/avi',
94 '.bmp' => 'image/bmp',
95 '.caf' => 'audio/x-caf',
96 '.cat' => 'application/vnd.ms-pki.seccat',
97 '.cbo' => 'application/sha',
98 '.cdda' => 'audio/aiff',
99 '.cer' => 'application/x-x509-ca-cert',
100 '.conf' => 'text/plain',
101 '.crl' => 'application/pkix-crl',
102 '.crt' => 'application/x-x509-ca-cert',
103 '.css' => 'text/css',
104 '.csv' => 'application/vnd.ms-excel',
105 '.der' => 'application/x-x509-ca-cert',
106 '.dib' => 'image/bmp',
107 '.dif' => 'video/x-dv',
108 '.dll' => 'application/x-msdownload',
109 '.doc' => 'application/msword',
110 '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
111 '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
112 '.dot' => 'application/msword',
113 '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
114 '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
115 '.dv' => 'video/x-dv',
116 '.dwfx' => 'model/vnd.dwfx+xps',
117 '.edn' => 'application/vnd.adobe.edn',
118 '.eml' => 'message/rfc822',
119 '.eps' => 'application/postscript',
120 '.etd' => 'application/x-ebx',
121 '.exe' => 'application/x-msdownload',
122 '.fdf' => 'application/vnd.fdf',
123 '.fif' => 'application/fractals',
124 '.gif' => 'image/gif',
125 '.gsm' => 'audio/x-gsm',
126 '.hqx' => 'application/mac-binhex40',
127 '.hta' => 'application/hta',
128 '.htc' => 'text/x-component',
129 '.htm' => 'text/html',
130 '.html' => 'text/html',
131 '.htt' => 'text/webviewhtml',
132 '.hxa' => 'application/xml',
133 '.hxc' => 'application/xml',
134 '.hxd' => 'application/octet-stream',
135 '.hxe' => 'application/xml',
136 '.hxf' => 'application/xml',
137 '.hxh' => 'application/octet-stream',
138 '.hxi' => 'application/octet-stream',
139 '.hxk' => 'application/xml',
140 '.hxq' => 'application/octet-stream',
141 '.hxr' => 'application/octet-stream',
142 '.hxs' => 'application/octet-stream',
143 '.hxt' => 'application/xml',
144 '.hxv' => 'application/xml',
145 '.hxw' => 'application/octet-stream',
146 '.ico' => 'image/x-icon',
147 '.iii' => 'application/x-iphone',
148 '.ins' => 'application/x-internet-signup',
149 '.iqy' => 'text/x-ms-iqy',
150 '.isp' => 'application/x-internet-signup',
151 '.jfif' => 'image/jpeg',
152 '.jnlp' => 'application/x-java-jnlp-file',
153 '.jpe' => 'image/jpeg',
154 '.jpeg' => 'image/jpeg',
155 '.jpg' => 'image/jpeg',
156 '.jtx' => 'application/x-jtx+xps',
157 '.latex' => 'application/x-latex',
158 '.log' => 'text/plain',
159 '.m1v' => 'video/mpeg',
160 '.m2v' => 'video/mpeg',
161 '.m3u' => 'audio/x-mpegurl',
162 '.mac' => 'image/x-macpaint',
163 '.man' => 'application/x-troff-man',
164 '.mda' => 'application/msaccess',
165 '.mdb' => 'application/msaccess',
166 '.mde' => 'application/msaccess',
167 '.mfp' => 'application/x-shockwave-flash',
168 '.mht' => 'message/rfc822',
169 '.mhtml' => 'message/rfc822',
170 '.mid' => 'audio/mid',
171 '.midi' => 'audio/mid',
172 '.mod' => 'video/mpeg',
173 '.mov' => 'video/quicktime',
174 '.mp2' => 'video/mpeg',
175 '.mp2v' => 'video/mpeg',
176 '.mp3' => 'audio/mpeg',
177 '.mp4' => 'video/mp4',
178 '.mpa' => 'video/mpeg',
179 '.mpe' => 'video/mpeg',
180 '.mpeg' => 'video/mpeg',
181 '.mpf' => 'application/vnd.ms-mediapackage',
182 '.mpg' => 'video/mpeg',
183 '.mpv2' => 'video/mpeg',
184 '.mqv' => 'video/quicktime',
185 '.NMW' => 'application/nmwb',
186 '.nws' => 'message/rfc822',
187 '.odc' => 'text/x-ms-odc',
188 '.ols' => 'application/vnd.ms-publisher',
189 '.p10' => 'application/pkcs10',
190 '.p12' => 'application/x-pkcs12',
191 '.p7b' => 'application/x-pkcs7-certificates',
192 '.p7c' => 'application/pkcs7-mime',
193 '.p7m' => 'application/pkcs7-mime',
194 '.p7r' => 'application/x-pkcs7-certreqresp',
195 '.p7s' => 'application/pkcs7-signature',
196 '.pct' => 'image/pict',
197 '.pdf' => 'application/pdf',
198 '.pdx' => 'application/vnd.adobe.pdx',
199 '.pfx' => 'application/x-pkcs12',
200 '.pic' => 'image/pict',
201 '.pict' => 'image/pict',
202 '.pinstall' => 'application/x-picasa-detect',
203 '.pko' => 'application/vnd.ms-pki.pko',
204 '.png' => 'image/png',
205 '.pnt' => 'image/x-macpaint',
206 '.pntg' => 'image/x-macpaint',
207 '.pot' => 'application/vnd.ms-powerpoint',
208 '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
209 '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
210 '.ppa' => 'application/vnd.ms-powerpoint',
211 '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
212 '.pps' => 'application/vnd.ms-powerpoint',
213 '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
214 '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
215 '.ppt' => 'application/vnd.ms-powerpoint',
216 '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
217 '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
218 '.prf' => 'application/pics-rules',
219 '.ps' => 'application/postscript',
220 '.pub' => 'application/vnd.ms-publisher',
221 '.pwz' => 'application/vnd.ms-powerpoint',
222 '.py' => 'text/plain',
223 '.pyw' => 'text/plain',
224 '.qht' => 'text/x-html-insertion',
225 '.qhtm' => 'text/x-html-insertion',
226 '.qt' => 'video/quicktime',
227 '.qti' => 'image/x-quicktime',
228 '.qtif' => 'image/x-quicktime',
229 '.qtl' => 'application/x-quicktimeplayer',
230 '.rat' => 'application/rat-file',
231 '.rmf' => 'application/vnd.adobe.rmf',
232 '.rmi' => 'audio/mid',
233 '.rqy' => 'text/x-ms-rqy',
234 '.rtf' => 'application/msword',
235 '.sct' => 'text/scriptlet',
236 '.sd2' => 'audio/x-sd2',
237 '.sdp' => 'application/sdp',
238 '.shtml' => 'text/html',
239 '.sit' => 'application/x-stuffit',
240 '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
241 '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
242 '.slk' => 'application/vnd.ms-excel',
243 '.snd' => 'audio/basic',
244 '.so' => 'application/x-apachemodule',
245 '.sol' => 'text/plain',
246 '.sor' => 'text/plain',
247 '.spc' => 'application/x-pkcs7-certificates',
248 '.spl' => 'application/futuresplash',
249 '.sst' => 'application/vnd.ms-pki.certstore',
250 '.stl' => 'application/vnd.ms-pki.stl',
251 '.swf' => 'application/x-shockwave-flash',
252 '.thmx' => 'application/vnd.ms-officetheme',
253 '.tif' => 'image/tiff',
254 '.tiff' => 'image/tiff',
255 '.txt' => 'text/plain',
256 '.uls' => 'text/iuls',
257 '.vcf' => 'text/x-vcard',
258 '.vdx' => 'application/vnd.ms-visio.viewer',
259 '.vsd' => 'application/vnd.ms-visio.viewer',
260 '.vss' => 'application/vnd.ms-visio.viewer',
261 '.vst' => 'application/vnd.ms-visio.viewer',
262 '.vsx' => 'application/vnd.ms-visio.viewer',
263 '.vtx' => 'application/vnd.ms-visio.viewer',
264 '.wav' => 'audio/wav',
265 '.wax' => 'audio/x-ms-wax',
266 '.wbk' => 'application/msword',
267 '.wdp' => 'image/vnd.ms-photo',
268 '.wiz' => 'application/msword',
269 '.wm' => 'video/x-ms-wm',
270 '.wma' => 'audio/x-ms-wma',
271 '.wmd' => 'application/x-ms-wmd',
272 '.wmv' => 'video/x-ms-wmv',
273 '.wmx' => 'video/x-ms-wmx',
274 '.wmz' => 'application/x-ms-wmz',
275 '.wpl' => 'application/vnd.ms-wpl',
276 '.wsc' => 'text/scriptlet',
277 '.wvx' => 'video/x-ms-wvx',
278 '.xaml' => 'application/xaml+xml',
279 '.xbap' => 'application/x-ms-xbap',
280 '.xdp' => 'application/vnd.adobe.xdp+xml',
281 '.xfdf' => 'application/vnd.adobe.xfdf',
282 '.xht' => 'application/xhtml+xml',
283 '.xhtml' => 'application/xhtml+xml',
284 '.xla' => 'application/vnd.ms-excel',
285 '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
286 '.xlk' => 'application/vnd.ms-excel',
287 '.xll' => 'application/vnd.ms-excel',
288 '.xlm' => 'application/vnd.ms-excel',
289 '.xls' => 'application/vnd.ms-excel',
290 '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
291 '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
292 '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
293 '.xlt' => 'application/vnd.ms-excel',
294 '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
295 '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
296 '.xlw' => 'application/vnd.ms-excel',
297 '.xml' => 'text/xml',
298 '.xps' => 'application/vnd.ms-xpsdocument',
299 '.xsl' => 'text/xml',
300 ];
301
/**
 * Names of the IE versions (and mode variants) that this class simulates.
 *
 * One type table is built and one detection result is produced per entry.
 * Order matters: the constructor carries $addedTypes additions forward to
 * every later entry.
 */
protected $versions = [
	'ie05',
	'ie06',
	'ie07',
	'ie07.strict',
	'ie07.nohtml',
];

/**
 * Type table with versions expanded.
 *
 * Filled in by the constructor as
 * version name => format class => list of MIME types.
 */
protected $typeTable = [];
313
/**
 * Build the per-version type tables.
 *
 * Starts from $baseTypeTable and walks $versions in order. Any entries
 * that $addedTypes lists for a version are appended to the running table
 * before it is stored, so additions also apply to all later versions.
 */
public function __construct() {
	// Running table; deliberately not reset between versions
	$types = $this->baseTypeTable;
	foreach ( $this->versions as $version ) {
		if ( isset( $this->addedTypes[$version] ) ) {
			foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
				// A format class that the base table does not know yet starts
				// out empty instead of handing a non-array to array_merge()
				$existing = isset( $types[$format] ) ? $types[$format] : [];
				$types[$format] = array_merge( $existing, $addedTypes );
			}
		}
		$this->typeTable[$version] = $types;
	}
}
326
/**
 * Get the MIME types from getMimesFromData(), with every result passed
 * through translateMimeType().
 *
 * @param string $fileName The file name (unused by the sniffing itself,
 *   only by the registry lookup)
 * @param string $chunk The first bytes of the file
 * @param string $proposed The MIME type proposed by the server
 * @return array Map of IE version name to translated MIME type
 */
public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
	$translated = [];
	$ieTypes = $this->getMimesFromData( $fileName, $chunk, $proposed );
	foreach ( $ieTypes as $ieVersion => $ieType ) {
		$translated[$ieVersion] = $this->translateMimeType( $ieType );
	}
	return $translated;
}
342
/**
 * Translate a MIME type from IE's private vocabulary into the more
 * commonly understood type string. Types without an entry in the
 * translation table are returned unchanged.
 *
 * @param string $type
 * @return string
 */
public function translateMimeType( $type ) {
	static $ieToCommon = [
		'image/pjpeg' => 'image/jpeg',
		'image/x-png' => 'image/png',
		'image/x-wmf' => 'application/x-msmetafile',
		'image/bmp' => 'image/x-bmp',
		'application/x-zip-compressed' => 'application/zip',
		'application/x-compressed' => 'application/x-compress',
		'application/x-gzip-compressed' => 'application/x-gzip',
		'audio/mid' => 'audio/midi',
	];
	return isset( $ieToCommon[$type] ) ? $ieToCommon[$type] : $type;
}
365
/**
 * Get the untranslated MIME types for all known versions.
 *
 * @param string $fileName The file name
 * @param string $chunk The first bytes of the file
 * @param string $proposed The MIME type proposed by the server
 * @return array Map of IE version name to the type that version would pick,
 *   in the order of $this->versions
 */
public function getMimesFromData( $fileName, $chunk, $proposed ) {
	$perVersion = [];
	foreach ( $this->versions as $ieVersion ) {
		$detected = $this->getMimeTypeForVersion( $ieVersion, $fileName, $chunk, $proposed );
		$perVersion[$ieVersion] = $detected;
	}
	return $perVersion;
}
382
/**
 * Get the MIME type for a given named version.
 *
 * The decision runs in stages: trust a proposed type of unknown format
 * class, honour a few proposed types whose magic number matches, apply the
 * markup/feed sniffing results, fall back to the text-versus-binary
 * heuristic, and finally consult the extension registry.
 *
 * @param string $version One of the entries of $this->versions
 * @param string $fileName The file name; only the part from the last "."
 *   onwards is used, for the registry lookup
 * @param string $chunk The first bytes of the file; only the first 255
 *   bytes are examined
 * @param string $proposed The MIME type proposed by the server; anything
 *   from the first ";" onwards is ignored
 * @return string The detected type, still in IE's own vocabulary
 */
protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
	// Strip text after a semicolon
	$semiPos = strpos( $proposed, ';' );
	if ( $semiPos !== false ) {
		$proposed = substr( $proposed, 0, $semiPos );
	}

	// A proposed type outside the type table is taken at face value, except
	// for the two multipart types, which are still sniffed
	$proposedFormat = $this->getDataFormat( $version, $proposed );
	if ( $proposedFormat == 'unknown'
		&& $proposed != 'multipart/mixed'
		&& $proposed != 'multipart/x-mixed-replace'
	) {
		return $proposed;
	}
	// Nothing to sniff
	if ( strval( $chunk ) === '' ) {
		return $proposed;
	}

	// Truncate chunk at 255 bytes
	$chunk = substr( $chunk, 0, 255 );

	// IE does the Check*Headers() calls last, and instead does the following image
	// type checks by directly looking for the magic numbers. What I do here should
	// have the same effect since the magic number checks are identical in both cases.
	$result = $this->sampleData( $version, $chunk );
	$sampleFound = $result['found'];
	$counters = $result['counters'];
	$binaryType = $this->checkBinaryHeaders( $version, $chunk );
	$textType = $this->checkTextHeaders( $version, $chunk );

	if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
		return 'text/html';
	}
	if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
		return 'image/gif';
	}
	if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
		&& $binaryType == 'image/pjpeg'
	) {
		return $proposed;
	}
	// PNG check added in IE 7
	if ( $version >= 'ie07'
		&& ( $proposed == 'image/x-png' || $proposed == 'image/png' )
		&& $binaryType == 'image/x-png'
	) {
		return $proposed;
	}

	// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
	if ( isset( $sampleFound['cdf'] ) ) {
		return 'application/x-cdf';
	}

	// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
	// previous versions
	if ( isset( $sampleFound['rss'] ) ) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['rdf-tag'] )
		&& isset( $sampleFound['rdf-url'] )
		&& isset( $sampleFound['rdf-purl'] )
	) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['atom'] ) ) {
		return 'application/atom+xml';
	}

	if ( isset( $sampleFound['xml'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
				return 'text/xml';
			}
		} else {
			return 'text/xml';
		}
	}
	if ( isset( $sampleFound['html'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'nohtml' ) !== false ) {
			if ( $proposed == 'text/plain' ) {
				return 'text/html';
			}
		} else {
			return 'text/html';
		}
	}
	if ( isset( $sampleFound['xbm'] ) ) {
		return 'image/x-bitmap';
	}
	if ( isset( $sampleFound['binhex'] ) ) {
		return 'application/macbinhex40';
	}
	if ( isset( $sampleFound['scriptlet'] ) ) {
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
				return 'text/scriptlet';
			}
		} else {
			return 'text/scriptlet';
		}
	}

	// Freaky heuristics to determine if the data is text or binary
	// The heuristic is of course broken for non-ASCII text
	if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
		< ( $counters['ctrl'] + $counters['high'] ) * 16
	) {
		$kindOfBinary = true;
		$type = $binaryType ?: $textType;
		if ( $type === false ) {
			$type = 'application/octet-stream';
		}
	} else {
		$kindOfBinary = false;
		$type = $textType ?: $binaryType;
		if ( $type === false ) {
			$type = 'text/plain';
		}
	}

	// Check if the output format is ambiguous
	// This generally means that detection failed, real types aren't ambiguous
	$detectedFormat = $this->getDataFormat( $version, $type );
	if ( $detectedFormat != 'ambiguous' ) {
		return $type;
	}

	if ( $proposedFormat != 'ambiguous' ) {
		// FormatAgreesWithData()
		if ( $proposedFormat == 'text' && !$kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'binary' && $kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'html' ) {
			return $proposed;
		}
	}

	// Find a MIME type by searching the registry for the file extension.
	$dotPos = strrpos( $fileName, '.' );
	if ( $dotPos === false ) {
		return $type;
	}
	$ext = substr( $fileName, $dotPos );
	if ( isset( $this->registry[$ext] ) ) {
		return $this->registry[$ext];
	}
	// Windows registry key names are not case-sensitive, so "FOO.HTM" has to
	// resolve exactly like "foo.htm". Retry ignoring case; this also makes
	// table keys written in upper case reachable from lower-case file names.
	$foldedExt = strtolower( $ext );
	foreach ( $this->registry as $registeredExt => $registeredType ) {
		if ( strtolower( $registeredExt ) === $foldedExt ) {
			return $registeredType;
		}
	}

	// TODO: If the extension has an application registered to it, IE will return
	// application/octet-stream. We'll skip that, so we could erroneously
	// return text/plain or application/x-netcdf where application/octet-stream
	// would be correct.

	return $type;
}
551
/**
 * Check for text headers at the start of the chunk.
 *
 * @param string $version Not consulted by this check
 * @param string $chunk The data to inspect
 * @return string|bool The MIME type of the first matching signature, or
 *   false when none matches
 */
private function checkTextHeaders( $version, $chunk ) {
	// Leading signature => type, tested in this order
	$signatures = [
		'%PDF' => 'application/pdf',
		'%!' => 'application/postscript',
		'{\\rtf' => 'text/richtext',
		'begin' => 'application/base64',
	];
	foreach ( $signatures as $magic => $mime ) {
		if ( substr( $chunk, 0, strlen( $magic ) ) === $magic ) {
			return $mime;
		}
	}
	return false;
}
577
/**
 * Check for binary headers at the start of the chunk.
 *
 * The checks run in a fixed order and the first match wins.
 *
 * @param string $version Not consulted by this check
 * @param string $chunk The data to inspect
 * @return string|bool The MIME type in IE's own vocabulary, or false when
 *   no signature matches
 */
private function checkBinaryHeaders( $version, $chunk ) {
	// True when the given bytes sit at the given offset of the chunk
	$hasAt = function ( $offset, $bytes ) use ( $chunk ) {
		return substr( $chunk, $offset, strlen( $bytes ) ) === $bytes;
	};
	// True when the chunk begins with the given bytes
	$startsWith = function ( $bytes ) use ( $hasAt ) {
		return $hasAt( 0, $bytes );
	};

	// The GIF signature is matched without regard to case
	$gifHead = strtoupper( substr( $chunk, 0, 5 ) );
	if ( in_array( $gifHead, [ 'GIF87', 'GIF89' ], true ) ) {
		return 'image/gif';
	}
	if ( $startsWith( "\xff\xd8" ) ) {
		return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
	}

	if ( $startsWith( 'BM' ) && $hasAt( 6, "\000\000" ) && $hasAt( 8, "\000\000" ) ) {
		return 'image/bmp'; // another non-standard MIME
	}
	if ( $startsWith( 'RIFF' ) && $hasAt( 8, 'WAVE' ) ) {
		return 'audio/wav';
	}
	// These were integer literals in IE
	// Perhaps the author was not sure what the target endianness was
	foreach ( [ ".sd\000", ".snd", "\000ds.", "dns." ] as $sndMagic ) {
		if ( $startsWith( $sndMagic ) ) {
			return 'audio/basic';
		}
	}
	if ( $startsWith( "MM\000" ) ) {
		return 'image/tiff';
	}
	if ( $startsWith( 'MZ' ) ) {
		return 'application/x-msdownload';
	}
	if ( $startsWith( "\x89PNG\x0d\x0a\x1a\x0a" ) ) {
		return 'image/x-png'; // [sic]
	}
	// "JG", then a byte in the range 3..31, any byte, then a zero byte
	if ( strlen( $chunk ) >= 5 && $startsWith( 'JG' ) ) {
		$thirdByte = ord( $chunk[2] );
		if ( $thirdByte >= 3 && $thirdByte <= 31 && ord( $chunk[4] ) == 0 ) {
			return 'image/x-jg';
		}
	}
	// More endian confusion?
	if ( $startsWith( 'MROF' ) ) {
		return 'audio/x-aiff';
	}
	if ( $startsWith( 'FORM' ) && ( $hasAt( 8, 'AIFF' ) || $hasAt( 8, 'AIFC' ) ) ) {
		return 'audio/x-aiff';
	}
	if ( $startsWith( 'RIFF' ) && $hasAt( 8, 'AVI ' ) ) {
		return 'video/avi';
	}
	if ( $startsWith( "\x00\x00\x01\xb3" ) || $startsWith( "\x00\x00\x01\xba" ) ) {
		return 'video/mpeg';
	}
	if ( $startsWith( "\001\000\000\000" ) && $hasAt( 40, ' EMF' ) ) {
		return 'image/x-emf';
	}
	if ( $startsWith( "\xd7\xcd\xc6\x9a" ) ) {
		return 'image/x-wmf';
	}
	if ( $startsWith( "\xca\xfe\xba\xbe" ) ) {
		return 'application/java';
	}
	if ( $startsWith( 'PK' ) ) {
		return 'application/x-zip-compressed';
	}
	if ( $startsWith( "\x1f\x9d" ) ) {
		return 'application/x-compressed';
	}
	if ( $startsWith( "\x1f\x8b" ) ) {
		return 'application/x-gzip-compressed';
	}
	// Skip redundant check for ZIP
	if ( $startsWith( "MThd\000" ) ) {
		return 'audio/mid';
	}
	if ( $startsWith( '%PDF' ) ) {
		return 'application/pdf';
	}
	return false;
}
678
/**
 * Do heuristic checks on the bulk of the data sample.
 *
 * Walks the chunk one byte at a time, classifying each byte into a counter
 * and looking for markup and magic strings that start at that byte.
 *
 * @param string $version IE version name; selects the CDF check (before
 *   'ie07') or the RSS/RDF/Atom checks ('ie07' and later)
 * @param string $chunk The data to scan
 * @return array Two keys:
 *   - 'found': flag name => true for every pattern seen. Possible flags are
 *     xml, scriptlet, html, cdf, rss, rdf-tag, atom, rdf-url, rdf-purl,
 *     xbm1, xbm2, xbm and binhex.
 *   - 'counters': byte class => count, for ctrl, high, low, lf, cr and ff.
 *   Some matches stop the scan early, so the counters may cover only part of
 *   the chunk.
 */
protected function sampleData( $version, $chunk ) {
	$found = [];
	$counters = [
		'ctrl' => 0,
		'high' => 0,
		'low' => 0,
		'lf' => 0,
		'cr' => 0,
		'ff' => 0
	];
	$htmlTags = [
		'html',
		'head',
		'title',
		'body',
		'script',
		'a href',
		'pre',
		'img',
		'plaintext',
		'table'
	];
	$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
	$rdfPurl = 'http://purl.org/rss/1.0/';
	$xbmMagic1 = '#define';
	$xbmMagic2 = '_width';
	$xbmMagic3 = '_bits';
	$binhexMagic = 'converted with BinHex';
	$chunkLength = strlen( $chunk );

	for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
		$curChar = $chunk[$offset];
		// Classify the byte. Everything except printable ASCII is only
		// counted and takes no part in the pattern checks below.
		if ( $curChar == "\x0a" ) {
			$counters['lf']++;
			continue;
		} elseif ( $curChar == "\x0d" ) {
			$counters['cr']++;
			continue;
		} elseif ( $curChar == "\x0c" ) {
			$counters['ff']++;
			continue;
		} elseif ( $curChar == "\t" ) {
			$counters['low']++;
			continue;
		} elseif ( ord( $curChar ) < 32 ) {
			$counters['ctrl']++;
			continue;
		} elseif ( ord( $curChar ) >= 128 ) {
			$counters['high']++;
			continue;
		}

		$counters['low']++;
		if ( $curChar == '<' ) {
			// XML
			$remainder = substr( $chunk, $offset + 1 );
			if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
				// The character right after "<?xml" must be a separator
				$nextChar = substr( $chunk, $offset + 5, 1 );
				if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
					$found['xml'] = true;
				}
			}
			// Scriptlet (JSP)
			if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
				$found['scriptlet'] = true;
				break;
			}
			// HTML
			foreach ( $htmlTags as $tag ) {
				if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
					$found['html'] = true;
				}
			}
			// Skip broken check for additional tags (HR etc.)

			// CHANNEL replaced by RSS, RDF and FEED in IE 7
			if ( $version < 'ie07' ) {
				if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
					$found['cdf'] = true;
				}
			} else {
				// RSS
				if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
					$found['rss'] = true;
					break; // return from SampleData
				}
				if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
					$found['rdf-tag'] = true;
					// no break
				}
				if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
					$found['atom'] = true;
					break;
				}
			}
			continue;
		}
		// Skip broken check for -->

		// RSS URL checks
		// For some reason both URLs must appear before it is recognised
		$remainder = substr( $chunk, $offset );
		if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
			$found['rdf-url'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-purl'] ) // [sic]
			) {
				break;
			}
			continue;
		}

		if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
			// Record the match, mirroring the rdf-url branch above. Without
			// this flag the caller's three-flag RSS 1.0 test can never pass.
			$found['rdf-purl'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-url'] ) // [sic]
			) {
				break;
			}
			continue;
		}

		// XBM checks: "#define", then "_width", then "_bits", in that order
		if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
			$found['xbm1'] = true;
			continue;
		}
		if ( $curChar == '_' ) {
			if ( isset( $found['xbm2'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
					$found['xbm'] = true;
					break;
				}
			} elseif ( isset( $found['xbm1'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
					$found['xbm2'] = true;
				}
			}
		}

		// BinHex (the only case-sensitive string match in this scan)
		if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
			$found['binhex'] = true;
		}
	}
	return [ 'found' => $found, 'counters' => $counters ];
}
832
/**
 * Find the format class of a MIME type in the type table of a version.
 *
 * @param string $version A key of $this->typeTable
 * @param string $type The MIME type to classify
 * @return string 'ambiguous' for an empty type or the literal '(null)',
 *   otherwise the first format class whose list contains the type, or
 *   'unknown' when no list contains it
 */
protected function getDataFormat( $version, $type ) {
	$formatTable = $this->typeTable[$version];

	$isMissing = ( $type == '(null)' || strval( $type ) === '' );
	if ( $isMissing ) {
		return 'ambiguous';
	}

	foreach ( $formatTable as $formatName => $mimeList ) {
		if ( !in_array( $type, $mimeList ) ) {
			continue;
		}
		return $formatName;
	}

	return 'unknown';
}
850}
This class simulates Microsoft Internet Explorer's terribly broken and insecure MIME type detection a...
$versions
IE versions which have been analysed to bring you this class, and for which some substantive differen...
getDataFormat( $version, $type)
checkBinaryHeaders( $version, $chunk)
Check for binary headers at the start of the chunk. Confirmed same in 5 and 7.
sampleData( $version, $chunk)
Do heuristic checks on the bulk of the data sample.
getMimesFromData( $fileName, $chunk, $proposed)
Get the untranslated MIME types for all known versions.
$addedTypes
Changes to the type table in later versions of IE.
$registry
An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a typical Windows installation.
checkTextHeaders( $version, $chunk)
Check for text headers at the start of the chunk. Confirmed same in 5 and 7.
$baseTypeTable
Relevant data taken from the type table in IE 5.
getMimeTypeForVersion( $version, $fileName, $chunk, $proposed)
Get the MIME type for a given named version.
$typeTable
Type table with versions expanded.
translateMimeType( $type)
Translate a MIME type from IE's idiosyncratic private types into more commonly understood type string...
getRealMimesFromData( $fileName, $chunk, $proposed)
Get the MIME types from getMimesFromData(), but convert the result from IE's idiosyncratic private ty...
if(!is_readable( $file)) $ext
Definition router.php:48