Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
92.17% covered (success)
92.17%
153 / 166
70.59% covered (warning)
70.59%
12 / 17
CRAP
0.00% covered (danger)
0.00%
0 / 1
MSCompoundFileReader
92.73% covered (success)
92.73%
153 / 165
70.59% covered (warning)
70.59%
12 / 17
43.71
0.00% covered (danger)
0.00%
0 / 1
 readFile
37.50% covered (danger)
37.50%
3 / 8
0.00% covered (danger)
0.00%
0 / 1
2.98
 readHandle
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 init
100.00% covered (success)
100.00%
27 / 27
100.00% covered (success)
100.00%
1 / 1
2
 sectorOffset
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 decodeClsid
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
1
 unpackOffset
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpack
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 bin2dec
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 readOffset
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
3.02
 readSector
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 error
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fseek
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 readDifat
71.43% covered (warning)
71.43%
10 / 14
0.00% covered (danger)
0.00%
0 / 1
5.58
 getNextSectorIdFromFat
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 getFatSector
90.00% covered (success)
90.00%
9 / 10
0.00% covered (danger)
0.00%
0 / 1
4.02
 readDirectory
100.00% covered (success)
100.00%
44 / 44
100.00% covered (success)
100.00%
1 / 1
10
1<?php
2/*
3 * Copyright 2019 Wikimedia Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software distributed
12 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13 * OF ANY KIND, either express or implied. See the License for the
14 * specific language governing permissions and limitations under the License.
15 */
16
17namespace Wikimedia\Mime;
18
19use RuntimeException;
20
21/**
22 * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
23 * file, and detect the MIME type.
24 *
25 * References:
26 *  - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
27 *  - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
28 *  - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
29 *  - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
30 *  - Python olefile https://github.com/decalage2/olefile
31 *  - OpenOffice.org's Documentation of the Microsoft Compound Document
32 *    File Format https://www.openoffice.org/sc/compdocfileformat.pdf
33 *
34 * @since 1.33
35 * @ingroup Mime
36 */
class MSCompoundFileReader {
    /** @var resource The open, seekable handle being read */
    private $file;
    /** @var array Decoded header fields, keyed by field name */
    private array $header = [];
    /** @var string|null MIME type detected from well-known stream names, if any */
    private ?string $mime = null;
    /** @var string|null MIME type detected from the root entry's CLSID, if any */
    private ?string $mimeFromClsid = null;
    /** @var string|null Message of the error that aborted parsing, if any */
    private ?string $error = null;
    /** @var int|null One of the self::ERROR_* constants, set together with $error */
    private ?int $errorCode = null;
    /** @var bool True once the header, DIFAT and directory were all read */
    private bool $valid = false;

    /** @var int Sector size in bytes, i.e. 1 << sector_shift */
    private int $sectorLength = 0;
    /** @var int[] Sector IDs of the FAT sectors, in FAT order */
    private array $difat = [];
    /** @var int[][] Cache of decoded FAT sectors, indexed by position in the DIFAT */
    private array $fat = [];

    private const TYPE_UNALLOCATED = 0;
    private const TYPE_STORAGE = 1;
    private const TYPE_STREAM = 2;
    private const TYPE_ROOT = 5;

    public const ERROR_FILE_OPEN = 1;
    public const ERROR_SEEK = 2;
    public const ERROR_READ = 3;
    public const ERROR_INVALID_SIGNATURE = 4;
    public const ERROR_READ_PAST_END = 5;
    public const ERROR_INVALID_FORMAT = 6;

    /** Marker that terminates a sector chain in the FAT and the DIFAT */
    private const END_OF_CHAIN = 0xFFFFFFFE;
    /** DIFAT entries at or above this value end the list of FAT sectors */
    private const FIRST_NON_FAT_DIFAT_VALUE = 0xFFFFFFFC;

    /**
     * Smallest accepted sector_shift (128-byte sectors). Anything smaller
     * cannot hold a whole 128-byte directory entry.
     */
    private const MIN_SECTOR_SHIFT = 7;
    /**
     * Largest accepted sector_shift (64 KiB sectors). This is far above the
     * 512 and 4096 byte sectors that [MS-CFB] defines, and keeps a corrupt
     * header from requesting an enormous read buffer.
     */
    private const MAX_SECTOR_SHIFT = 16;

    private const MIMES_BY_CLSID = [
        // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
        '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
        '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
        '00020906-0000-0000-C000-000000000046' => 'application/msword',
        '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
    ];

    /** Directory entry names that identify the document type */
    private const MIMES_BY_STREAM_NAME = [
        'Workbook' => 'application/vnd.ms-excel',
        'WordDocument' => 'application/msword',
        'PowerPoint Document' => 'application/vnd.ms-powerpoint',
    ];

    /**
     * Read a file by name
     *
     * The file is opened for the duration of the call and closed again
     * before returning.
     *
     * @param string $fileName The full path to the file
     * @return array An associative array of information about the file:
     *   - valid: true if the file is valid, false otherwise
     *   - error: An error message in English, should be present if valid=false
     *   - errorCode: One of the self::ERROR_* constants
     *   - mime: The MIME type detected from the directory contents, or null
     *   - mimeFromClsid: The MIME type detected from the CLSID on the root
     *     directory entry, or null
     */
    public static function readFile( $fileName ) {
        $handle = fopen( $fileName, 'rb' );
        if ( $handle === false ) {
            return [
                'valid' => false,
                'error' => 'file does not exist',
                'errorCode' => self::ERROR_FILE_OPEN
            ];
        }
        try {
            return self::readHandle( $handle );
        } finally {
            // This method opened the handle, so it is responsible for closing it
            fclose( $handle );
        }
    }

    /**
     * Read from an open seekable handle
     *
     * The handle is not closed; that remains the caller's responsibility.
     *
     * @param resource $fileHandle
     * @return array An associative array of information about the file:
     *   - valid: true if the file is valid, false otherwise
     *   - error: An error message in English, should be present if valid=false
     *   - errorCode: One of the self::ERROR_* constants
     *   - mime: The MIME type detected from the directory contents, or null
     *   - mimeFromClsid: The MIME type detected from the CLSID on the root
     *     directory entry, or null
     */
    public static function readHandle( $fileHandle ) {
        $reader = new self( $fileHandle );
        $info = [
            'valid' => $reader->valid,
            'mime' => $reader->mime,
            'mimeFromClsid' => $reader->mimeFromClsid
        ];
        if ( $reader->error !== null ) {
            $info['error'] = $reader->error;
            $info['errorCode'] = $reader->errorCode;
        }
        return $info;
    }

    /**
     * Parse the file immediately; any parse failure is recorded in
     * $error / $errorCode instead of propagating to the caller.
     *
     * @param resource $fileHandle
     */
    private function __construct( $fileHandle ) {
        $this->file = $fileHandle;
        try {
            $this->init();
        } catch ( RuntimeException $e ) {
            $this->valid = false;
            $this->error = $e->getMessage();
            $this->errorCode = $e->getCode();
        }
    }

    /**
     * Read and validate the header, then load the DIFAT and the directory.
     *
     * @throws RuntimeException via error() on any malformed or truncated input
     */
    private function init(): void {
        $this->header = $this->unpackOffset( 0, [
            'header_signature' => 8,
            'header_clsid' => 16,
            'minor_version' => 2,
            'major_version' => 2,
            'byte_order' => 2,
            'sector_shift' => 2,
            'mini_sector_shift' => 2,
            'reserved' => 6,
            'num_dir_sectors' => 4,
            'num_fat_sectors' => 4,
            'first_dir_sector' => 4,
            'transaction_signature_number' => 4,
            'mini_stream_cutoff_size' => 4,
            'first_mini_fat_sector' => 4,
            'num_mini_fat_sectors' => 4,
            'first_difat_sector' => 4,
            'num_difat_sectors' => 4,
            'difat' => 436,
        ] );
        if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
            $this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
                self::ERROR_INVALID_SIGNATURE );
        }

        // The sector shift comes straight from the file. Without this check a
        // tiny value leads to a division by zero when locating FAT entries, and
        // a huge one to a zero-length or absurdly large read, none of which
        // would be reported as an ordinary parse error.
        $sectorShift = $this->header['sector_shift'];
        if ( $sectorShift < self::MIN_SECTOR_SHIFT || $sectorShift > self::MAX_SECTOR_SHIFT ) {
            $this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
        }
        $this->sectorLength = 1 << $sectorShift;

        $this->readDifat();
        $this->readDirectory();

        $this->valid = true;
    }

    /**
     * Get the byte offset of a sector. Sector 0 starts one sector length
     * into the file, the first slot being taken by the header.
     */
    private function sectorOffset( int $sectorId ): int {
        return $this->sectorLength * ( $sectorId + 1 );
    }

    /**
     * Convert a 16-byte binary CLSID to its canonical upper-case text form,
     * e.g. 00020906-0000-0000-C000-000000000046.
     */
    private function decodeClsid( string $binaryClsid ): string {
        // The first three fields are little-endian integers, the rest are raw bytes
        $parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
        return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X",
            $parts['a'],
            $parts['b'],
            $parts['c'],
            $parts['d1'],
            $parts['d2'],
            $parts['d3'],
            $parts['d4'],
            $parts['d5'],
            $parts['d6'],
            $parts['d7'],
            $parts['d8']
        );
    }

    /**
     * Read a structure from the given file offset and decode it.
     *
     * @param int $offset Byte offset in the file
     * @param int[] $struct Map of field name to field length in bytes
     * @return array Map of field name to decoded value, see unpack()
     */
    private function unpackOffset( int $offset, array $struct ): array {
        $block = $this->readOffset( $offset, array_sum( $struct ) );
        return $this->unpack( $block, 0, $struct );
    }

    /**
     * Decode consecutive fields from a binary string.
     *
     * @param string $block
     * @param int $offset Byte offset of the first field within $block
     * @param int[] $struct Map of field name to field length in bytes
     * @return array Map of field name to value: fields of up to four bytes
     *   become little-endian integers, longer fields stay binary strings
     */
    private function unpack( string $block, int $offset, array $struct ): array {
        $data = [];
        foreach ( $struct as $key => $length ) {
            if ( $length > 4 ) {
                $data[$key] = substr( $block, $offset, $length );
            } else {
                $data[$key] = $this->bin2dec( $block, $offset, $length );
            }
            $offset += $length;
        }
        return $data;
    }

    /**
     * Decode an unsigned little-endian integer of $length bytes at $offset.
     */
    private function bin2dec( string $str, int $offset, int $length ): int {
        $value = 0;
        // Walk from the most significant (last) byte down to the first
        for ( $i = $length - 1; $i >= 0; $i-- ) {
            $value *= 256;
            $value += ord( $str[$offset + $i] );
        }
        return $value;
    }

    /**
     * Read exactly $length bytes starting at the given file offset.
     *
     * @throws RuntimeException via error() if seeking or reading fails, or
     *   if fewer than $length bytes are available
     */
    private function readOffset( int $offset, int $length ): string {
        $this->fseek( $offset );
        // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
        $block = @fread( $this->file, $length );
        if ( $block === false ) {
            $this->error( 'error reading from file', self::ERROR_READ );
        }
        if ( strlen( $block ) !== $length ) {
            $this->error( 'unable to read the required number of bytes from the file',
                self::ERROR_READ_PAST_END );
        }
        return $block;
    }

    /**
     * Read one whole sector.
     */
    private function readSector( int $sectorId ): string {
        return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
    }

    /**
     * Abort parsing. The exception is caught by the constructor, which
     * records the message and code for the caller.
     *
     * @param string $message
     * @param int $code One of the self::ERROR_* constants
     * @return never
     */
    private function error( string $message, int $code ): never {
        throw new RuntimeException( $message, $code );
    }

    /**
     * Seek to an absolute offset, failing with ERROR_SEEK if that is not possible.
     */
    private function fseek( int $offset ): void {
        // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
        $result = @fseek( $this->file, $offset );
        if ( $result !== 0 ) {
            $this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
        }
    }

    /**
     * Build the list of FAT sector IDs from the DIFAT entries in the header
     * plus any chained DIFAT sectors.
     *
     * @throws RuntimeException via error() if the DIFAT chain loops or a
     *   DIFAT sector cannot be read
     */
    private function readDifat(): void {
        $binaryDifat = $this->header['difat'];
        $nextDifatSector = $this->header['first_difat_sector'];
        $seenSectorIds = [];
        for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
            // num_difat_sectors is untrusted and can be huge; without this a
            // sector that links back into the chain would be appended over
            // and over until memory runs out
            if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
                $this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
            }
            $seenSectorIds[$nextDifatSector] = true;

            $block = $this->readSector( $nextDifatSector );
            // The last four bytes of a DIFAT sector link to the next one
            $binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
            $nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
            if ( $nextDifatSector === self::END_OF_CHAIN ) {
                break;
            }
        }

        $this->difat = [];
        $difatLength = strlen( $binaryDifat );
        for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
            $fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
            if ( $fatSector >= self::FIRST_NON_FAT_DIFAT_VALUE ) {
                break;
            }
            $this->difat[] = $fatSector;
        }
    }

    /**
     * Look up the sector that follows $sectorId in its chain.
     */
    private function getNextSectorIdFromFat( int $sectorId ): int {
        $entriesPerSector = intdiv( $this->sectorLength, 4 );
        $fatSectorId = intdiv( $sectorId, $entriesPerSector );
        $fatSectorArray = $this->getFatSector( $fatSectorId );
        return $fatSectorArray[$sectorId % $entriesPerSector];
    }

    /**
     * Get the decoded entries of one FAT sector, reading it on first use.
     *
     * @param int $fatSectorId Index into the DIFAT, not an absolute sector ID
     * @return int[]
     * @throws RuntimeException via error() if the DIFAT has no such entry
     */
    private function getFatSector( int $fatSectorId ): array {
        if ( !isset( $this->fat[$fatSectorId] ) ) {
            if ( !isset( $this->difat[$fatSectorId] ) ) {
                $this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
            }
            $block = $this->readSector( $this->difat[$fatSectorId] );
            $blockLength = strlen( $block );
            $fat = [];
            for ( $pos = 0; $pos < $blockLength; $pos += 4 ) {
                $fat[] = $this->bin2dec( $block, $pos, 4 );
            }
            $this->fat[$fatSectorId] = $fat;
        }
        return $this->fat[$fatSectorId];
    }

    /**
     * Read every directory sector and derive the MIME types from the
     * entries: $mimeFromClsid from the root entry's CLSID, $mime from
     * well-known entry names.
     *
     * @throws RuntimeException via error() if the directory chain loops or
     *   cannot be read
     */
    private function readDirectory(): void {
        $dirSectorId = $this->header['first_dir_sector'];
        $binaryDir = '';
        $seenSectorIds = [];
        while ( $dirSectorId !== self::END_OF_CHAIN ) {
            if ( isset( $seenSectorIds[$dirSectorId] ) ) {
                $this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
            }
            $seenSectorIds[$dirSectorId] = true;

            $binaryDir .= $this->readSector( $dirSectorId );
            $dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
        }

        $struct = [
            'name_raw' => 64,
            'name_length' => 2,
            'object_type' => 1,
            'color' => 1,
            'sid_left' => 4,
            'sid_right' => 4,
            'sid_child' => 4,
            'clsid' => 16,
            'state_bits' => 4,
            'create_time_low' => 4,
            'create_time_high' => 4,
            'modify_time_low' => 4,
            'modify_time_high' => 4,
            'first_sector' => 4,
            'size_low' => 4,
            // According to [MS-CFB] size_high may contain garbage due to a
            // bug in a writer; nothing here relies on it
            'size_high' => 4,
        ];
        $entryLength = array_sum( $struct );
        $dirLength = strlen( $binaryDir );

        for ( $pos = 0; $pos < $dirLength; $pos += $entryLength ) {
            $entry = $this->unpack( $binaryDir, $pos, $struct );

            $type = $entry['object_type'];
            if ( $type === self::TYPE_UNALLOCATED ) {
                continue;
            }

            $clsid = $this->decodeClsid( $entry['clsid'] );
            if ( $type === self::TYPE_ROOT && isset( self::MIMES_BY_CLSID[$clsid] ) ) {
                $this->mimeFromClsid = self::MIMES_BY_CLSID[$clsid];
            }

            $name = $this->decodeName( $entry['name_raw'], $entry['name_length'] );
            if ( isset( self::MIMES_BY_STREAM_NAME[$name] ) ) {
                $this->mime = self::MIMES_BY_STREAM_NAME[$name];
            }
        }
    }

    /**
     * Decode a directory entry name to UTF-8.
     *
     * @param string $rawName The fixed-size UTF-16LE name field
     * @param int $byteLength The stored name length in bytes, which includes
     *   the two-byte terminator
     * @return string The name, or an empty string if it cannot be decoded
     */
    private function decodeName( string $rawName, int $byteLength ): string {
        // The stored length is untrusted: keep it from going negative (which
        // substr() would treat as "strip from the end") or past the field
        $nameBytes = min( max( 0, $byteLength - 2 ), strlen( $rawName ) );
        // Never hand iconv half of a UTF-16 code unit
        $nameBytes -= $nameBytes % 2;

        // Invalid UTF-16 is just an unrecognised name, not worth a PHP notice
        // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
        $name = @iconv( 'UTF-16LE', 'UTF-8', substr( $rawName, 0, $nameBytes ) );
        return $name === false ? '' : $name;
    }
}
379
/**
 * Register the un-namespaced name MSCompoundFileReader as an alias of
 * Wikimedia\Mime\MSCompoundFileReader.
 *
 * @deprecated class alias since 1.43
 */
class_alias( MSCompoundFileReader::class, 'MSCompoundFileReader' );