Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
92.17% |
153 / 166 |
|
70.59% |
12 / 17 |
CRAP | |
0.00% |
0 / 1 |
| MSCompoundFileReader | |
92.73% |
153 / 165 |
|
70.59% |
12 / 17 |
43.71 | |
0.00% |
0 / 1 |
| readFile | |
37.50% |
3 / 8 |
|
0.00% |
0 / 1 |
2.98 | |||
| readHandle | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
| __construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
| init | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
2 | |||
| sectorOffset | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| decodeClsid | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
| unpackOffset | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| unpack | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
| bin2dec | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| readOffset | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
3.02 | |||
| readSector | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| error | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| fseek | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
| readDifat | |
71.43% |
10 / 14 |
|
0.00% |
0 / 1 |
5.58 | |||
| getNextSectorIdFromFat | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| getFatSector | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
4.02 | |||
| readDirectory | |
100.00% |
44 / 44 |
|
100.00% |
1 / 1 |
10 | |||
| 1 | <?php |
| 2 | /* |
| 3 | * Copyright 2019 Wikimedia Foundation |
| 4 | * |
| 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
| 6 | * not use this file except in compliance with the License. |
| 7 | * You may obtain a copy of the License at |
| 8 | * |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | * |
| 11 | * Unless required by applicable law or agreed to in writing, software distributed |
| 12 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS |
| 13 | * OF ANY KIND, either express or implied. See the License for the |
| 14 | * specific language governing permissions and limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | namespace Wikimedia\Mime; |
| 18 | |
| 19 | use RuntimeException; |
| 20 | |
/**
 * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
 * file, and detect the MIME type.
 *
 * The reader treats its input as untrusted: structural problems are reported
 * through the 'error' / 'errorCode' fields of the result array rather than
 * through PHP errors.
 *
 * References:
 * - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
 * - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
 * - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
 * - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
 * - Python olefile https://github.com/decalage2/olefile
 * - OpenOffice.org's Documentation of the Microsoft Compound Document
 *   File Format https://www.openoffice.org/sc/compdocfileformat.pdf
 *
 * @since 1.33
 * @ingroup Mime
 */
class MSCompoundFileReader {
	/** @var resource Open, seekable stream being parsed */
	private $file;
	/** @var array Decoded header fields, keyed by field name */
	private $header;
	/** @var string|null MIME type derived from stream names, null if none matched */
	private $mime;
	/** @var string|null MIME type derived from the root entry CLSID, null if unknown */
	private $mimeFromClsid;
	/** @var string|null Message of the exception that aborted parsing, if any */
	private $error;
	/** @var int|null One of the self::ERROR_* constants, set together with $error */
	private $errorCode;
	/** @var bool Whether the header, DIFAT and directory were all read successfully */
	private $valid = false;

	/** @var int Sector size in bytes, 1 << header sector_shift */
	private $sectorLength;
	/** @var int[] Sector IDs of the FAT sectors, in FAT order */
	private $difat;
	/** @var int[][] Cache of decoded FAT sectors, indexed by position in $difat */
	private $fat = [];

	private const TYPE_UNALLOCATED = 0;
	private const TYPE_STORAGE = 1;
	private const TYPE_STREAM = 2;
	private const TYPE_ROOT = 5;

	public const ERROR_FILE_OPEN = 1;
	public const ERROR_SEEK = 2;
	public const ERROR_READ = 3;
	public const ERROR_INVALID_SIGNATURE = 4;
	public const ERROR_READ_PAST_END = 5;
	public const ERROR_INVALID_FORMAT = 6;

	/** Sector ID value that terminates a sector chain */
	private const END_OF_CHAIN = 0xFFFFFFFE;

	/**
	 * Smallest accepted sector shift (128-byte sectors). A sector must hold at
	 * least one 128-byte directory entry and a non-zero number of FAT entries.
	 */
	private const MIN_SECTOR_SHIFT = 7;

	/**
	 * Largest accepted sector shift (64 KiB sectors). This is a sanity bound
	 * chosen here so that a corrupt header cannot request huge reads; the
	 * values used by real files (9 and 12) are well inside it.
	 */
	private const MAX_SECTOR_SHIFT = 16;

	private const MIMES_BY_CLSID = [
		// From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
		'00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
		'00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
		'00020906-0000-0000-C000-000000000046' => 'application/msword',
		'64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
	];

	/** Directory entry names that identify the document type */
	private const MIMES_BY_STREAM_NAME = [
		'Workbook' => 'application/vnd.ms-excel',
		'WordDocument' => 'application/msword',
		'PowerPoint Document' => 'application/vnd.ms-powerpoint',
	];

	/**
	 * Read a file by name
	 *
	 * The file is opened for the duration of the call only; the handle is
	 * closed again before returning, whether or not parsing succeeded.
	 *
	 * @param string $fileName The full path to the file
	 * @return array An associative array of information about the file:
	 *    - valid: true if the file is valid, false otherwise
	 *    - error: An error message in English, should be present if valid=false
	 *    - errorCode: One of the self::ERROR_* constants
	 *    - mime: The MIME type detected from the directory contents
	 *    - mimeFromClsid: The MIME type detected from the CLSID on the root
	 *      directory entry
	 */
	public static function readFile( $fileName ) {
		$handle = fopen( $fileName, 'rb' );
		if ( $handle === false ) {
			return [
				'valid' => false,
				'error' => 'file does not exist',
				'errorCode' => self::ERROR_FILE_OPEN
			];
		}
		try {
			return self::readHandle( $handle );
		} finally {
			// We opened the handle, so we are responsible for releasing it
			fclose( $handle );
		}
	}

	/**
	 * Read from an open seekable handle
	 *
	 * The handle is not closed; that remains the caller's responsibility.
	 *
	 * @param resource $fileHandle
	 * @return array An associative array of information about the file:
	 *    - valid: true if the file is valid, false otherwise
	 *    - error: An error message in English, should be present if valid=false
	 *    - errorCode: One of the self::ERROR_* constants
	 *    - mime: The MIME type detected from the directory contents
	 *    - mimeFromClsid: The MIME type detected from the CLSID on the root
	 *      directory entry
	 */
	public static function readHandle( $fileHandle ) {
		$reader = new self( $fileHandle );
		$info = [
			'valid' => $reader->valid,
			'mime' => $reader->mime,
			'mimeFromClsid' => $reader->mimeFromClsid
		];
		if ( $reader->error !== null ) {
			$info['error'] = $reader->error;
			$info['errorCode'] = $reader->errorCode;
		}
		return $info;
	}

	/**
	 * Parse the given handle immediately. Parse failures do not propagate;
	 * they are recorded in $error / $errorCode and leave $valid false.
	 *
	 * @param resource $fileHandle
	 */
	private function __construct( $fileHandle ) {
		$this->file = $fileHandle;
		try {
			$this->init();
		} catch ( RuntimeException $e ) {
			$this->valid = false;
			$this->error = $e->getMessage();
			$this->errorCode = $e->getCode();
		}
	}

	/**
	 * Read and check the header, then load the DIFAT and the directory.
	 *
	 * @throws RuntimeException On any read failure or format violation
	 */
	private function init() {
		$this->header = $this->unpackOffset( 0, [
			'header_signature' => 8,
			'header_clsid' => 16,
			'minor_version' => 2,
			'major_version' => 2,
			'byte_order' => 2,
			'sector_shift' => 2,
			'mini_sector_shift' => 2,
			'reserved' => 6,
			'num_dir_sectors' => 4,
			'num_fat_sectors' => 4,
			'first_dir_sector' => 4,
			'transaction_signature_number' => 4,
			'mini_stream_cutoff_size' => 4,
			'first_mini_fat_sector' => 4,
			'num_mini_fat_sectors' => 4,
			'first_difat_sector' => 4,
			'num_difat_sectors' => 4,
			'difat' => 436,
		] );
		if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
			$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
				self::ERROR_INVALID_SIGNATURE );
		}

		// The shift comes straight from the file. Unchecked, a tiny value leads
		// to a division by zero when indexing the FAT, and a huge one to a
		// zero-length or enormous read; neither would surface as a
		// RuntimeException, so reject out-of-range values here.
		$sectorShift = $this->header['sector_shift'];
		if ( $sectorShift < self::MIN_SECTOR_SHIFT || $sectorShift > self::MAX_SECTOR_SHIFT ) {
			$this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
		}
		$this->sectorLength = 1 << $sectorShift;

		$this->readDifat();
		$this->readDirectory();

		$this->valid = true;
	}

	/**
	 * Get the byte offset in the file at which a sector starts.
	 *
	 * @param int $sectorId
	 * @return int
	 */
	private function sectorOffset( int $sectorId ): int {
		// The +1 skips the leading sector-sized region that holds the header
		return $this->sectorLength * ( $sectorId + 1 );
	}

	/**
	 * Format a 16-byte binary CLSID in the usual upper-case hexadecimal
	 * 8-4-4-4-12 form. The first three groups are stored little-endian.
	 *
	 * @param string $binaryClsid
	 * @return string
	 */
	private function decodeClsid( string $binaryClsid ): string {
		$parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
		return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X",
			$parts['a'],
			$parts['b'],
			$parts['c'],
			$parts['d1'],
			$parts['d2'],
			$parts['d3'],
			$parts['d4'],
			$parts['d5'],
			$parts['d6'],
			$parts['d7'],
			$parts['d8']
		);
	}

	/**
	 * Read a structure from the given file offset and decode it.
	 *
	 * @param int $offset Byte offset in the file
	 * @param int[] $struct Map of field name to field length in bytes
	 * @return array Decoded fields, see unpack()
	 * @throws RuntimeException If the bytes cannot be read
	 */
	private function unpackOffset( $offset, $struct ) {
		$block = $this->readOffset( $offset, array_sum( $struct ) );
		return $this->unpack( $block, 0, $struct );
	}

	/**
	 * Decode consecutive fields from a binary string.
	 *
	 * Fields of up to 4 bytes are decoded as little-endian unsigned integers;
	 * longer fields are returned as raw binary strings.
	 *
	 * @param string $block
	 * @param int $offset Offset of the first field within $block
	 * @param int[] $struct Map of field name to field length in bytes
	 * @return array Map of field name to int or string
	 */
	private function unpack( $block, $offset, $struct ) {
		$data = [];
		foreach ( $struct as $key => $length ) {
			if ( $length > 4 ) {
				$data[$key] = substr( $block, $offset, $length );
			} else {
				$data[$key] = $this->bin2dec( $block, $offset, $length );
			}
			$offset += $length;
		}
		return $data;
	}

	/**
	 * Decode a little-endian unsigned integer from a binary string.
	 *
	 * @param string $str
	 * @param int $offset Offset of the least significant byte
	 * @param int $length Number of bytes
	 * @return int
	 */
	private function bin2dec( string $str, int $offset, int $length ): int {
		$value = 0;
		// Start at the most significant (last) byte and work downwards
		for ( $i = $length - 1; $i >= 0; $i-- ) {
			$value *= 256;
			$value += ord( $str[$offset + $i] );
		}
		return $value;
	}

	/**
	 * Read exactly $length bytes starting at the given file offset.
	 *
	 * @param int $offset
	 * @param int $length
	 * @return string
	 * @throws RuntimeException With ERROR_SEEK, ERROR_READ or ERROR_READ_PAST_END
	 */
	private function readOffset( int $offset, int $length ): string {
		$this->fseek( $offset );
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$block = @fread( $this->file, $length );
		if ( $block === false ) {
			$this->error( 'error reading from file', self::ERROR_READ );
		}
		if ( strlen( $block ) !== $length ) {
			$this->error( 'unable to read the required number of bytes from the file',
				self::ERROR_READ_PAST_END );
		}
		return $block;
	}

	/**
	 * Read the whole sector with the given ID.
	 *
	 * @param int $sectorId
	 * @return string
	 * @throws RuntimeException If the sector cannot be read in full
	 */
	private function readSector( int $sectorId ): string {
		return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
	}

	/**
	 * Abort parsing. The exception is caught in the constructor.
	 *
	 * @param string $message
	 * @param int $code One of the self::ERROR_* constants
	 * @return never
	 * @throws RuntimeException Always
	 */
	private function error( $message, $code ): never {
		throw new RuntimeException( $message, $code );
	}

	/**
	 * Seek to an absolute offset in the file.
	 *
	 * @param int $offset
	 * @throws RuntimeException With ERROR_SEEK if the seek fails
	 */
	private function fseek( int $offset ) {
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$result = @fseek( $this->file, $offset );
		if ( $result !== 0 ) {
			$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
		}
	}

	/**
	 * Build $this->difat, the ordered list of FAT sector IDs.
	 *
	 * The list starts in the header and continues in a chain of DIFAT sectors,
	 * each of which stores the ID of the next one in its last four bytes.
	 *
	 * @throws RuntimeException If a DIFAT sector cannot be read or the chain loops
	 */
	private function readDifat() {
		$binaryDifat = $this->header['difat'];
		$nextDifatSector = $this->header['first_difat_sector'];
		$seenSectorIds = [];
		for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
			// num_difat_sectors is untrusted and may be huge; without this check
			// a chain that points back at itself would be followed, and its
			// contents accumulated, until memory runs out.
			if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
				$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
			}
			$seenSectorIds[$nextDifatSector] = true;

			$block = $this->readSector( $nextDifatSector );
			$binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
			$nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
			if ( $nextDifatSector === self::END_OF_CHAIN ) {
				break;
			}
		}

		$this->difat = [];
		$difatLength = strlen( $binaryDifat );
		for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
			$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
			// Values from 0xFFFFFFFC upwards are not used as sector IDs here;
			// the first one encountered ends the list.
			if ( $fatSector >= 0xFFFFFFFC ) {
				break;
			}
			$this->difat[] = $fatSector;
		}
	}

	/**
	 * Look up the FAT entry for a sector, i.e. the ID of the sector following
	 * it in its chain.
	 *
	 * @param int $sectorId
	 * @return int
	 * @throws RuntimeException If the relevant FAT sector is unavailable
	 */
	private function getNextSectorIdFromFat( int $sectorId ): int {
		// Each FAT entry is 4 bytes
		$entriesPerSector = intdiv( $this->sectorLength, 4 );
		$fatSectorId = intdiv( $sectorId, $entriesPerSector );
		$fatSectorArray = $this->getFatSector( $fatSectorId );
		return $fatSectorArray[$sectorId % $entriesPerSector];
	}

	/**
	 * Get the decoded entries of one FAT sector, reading it on first use and
	 * serving it from $this->fat afterwards.
	 *
	 * @param int $fatSectorId Index into the DIFAT, not an absolute sector ID
	 * @return int[]
	 * @throws RuntimeException If the index is not in the DIFAT or the read fails
	 */
	private function getFatSector( int $fatSectorId ): array {
		if ( !isset( $this->fat[$fatSectorId] ) ) {
			if ( !isset( $this->difat[$fatSectorId] ) ) {
				$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
			}
			$absoluteSectorId = $this->difat[$fatSectorId];
			$block = $this->readSector( $absoluteSectorId );
			$blockLength = strlen( $block );
			$fat = [];
			for ( $pos = 0; $pos < $blockLength; $pos += 4 ) {
				$fat[] = $this->bin2dec( $block, $pos, 4 );
			}
			$this->fat[$fatSectorId] = $fat;
		}
		return $this->fat[$fatSectorId];
	}

	/**
	 * Read the directory sector chain and derive the MIME types from it.
	 *
	 * Sets $this->mimeFromClsid from the CLSID of a root entry and $this->mime
	 * from entries with well-known names. If several entries match, the last
	 * one in the directory wins.
	 *
	 * @throws RuntimeException If a sector cannot be read or the chain loops
	 */
	private function readDirectory() {
		$dirSectorId = $this->header['first_dir_sector'];
		$binaryDir = '';
		$seenSectorIds = [];
		while ( $dirSectorId !== self::END_OF_CHAIN ) {
			if ( isset( $seenSectorIds[$dirSectorId] ) ) {
				$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
			}
			$seenSectorIds[$dirSectorId] = true;

			$binaryDir .= $this->readSector( $dirSectorId );
			$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
		}

		$struct = [
			'name_raw' => 64,
			'name_length' => 2,
			'object_type' => 1,
			'color' => 1,
			'sid_left' => 4,
			'sid_right' => 4,
			'sid_child' => 4,
			'clsid' => 16,
			'state_bits' => 4,
			'create_time_low' => 4,
			'create_time_high' => 4,
			'modify_time_low' => 4,
			'modify_time_high' => 4,
			'first_sector' => 4,
			'size_low' => 4,
			// According to [MS-CFB] size_high may contain garbage due to a
			// bug in a writer; it is not used here
			'size_high' => 4,
		];
		$entryLength = array_sum( $struct );
		$dirLength = strlen( $binaryDir );

		for ( $pos = 0; $pos < $dirLength; $pos += $entryLength ) {
			$entry = $this->unpack( $binaryDir, $pos, $struct );

			$type = $entry['object_type'];
			if ( $type === self::TYPE_UNALLOCATED ) {
				continue;
			}

			// name_length counts the two-byte terminator. Clamp it so that a
			// bogus value can neither run past the name field nor turn into a
			// negative substr() length.
			$nameBytes = max( 0, min( $entry['name_length'], strlen( $entry['name_raw'] ) ) - 2 );
			// Malformed UTF-16 just means the name cannot match anything below
			// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
			$name = @iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );

			$clsid = $this->decodeClsid( $entry['clsid'] );
			if ( $type === self::TYPE_ROOT && isset( self::MIMES_BY_CLSID[$clsid] ) ) {
				$this->mimeFromClsid = self::MIMES_BY_CLSID[$clsid];
			}

			if ( is_string( $name ) && isset( self::MIMES_BY_STREAM_NAME[$name] ) ) {
				$this->mime = self::MIMES_BY_STREAM_NAME[$name];
			}
		}
	}
}
| 379 | |
| 380 | /** @deprecated class alias since 1.43. Registers the global name 'MSCompoundFileReader' as an alias of the namespaced class declared above. */ |
| 381 | class_alias( MSCompoundFileReader::class, 'MSCompoundFileReader' ); |