Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
92.73% covered (success)
92.73%
153 / 165
70.59% covered (warning)
70.59%
12 / 17
CRAP
0.00% covered (danger)
0.00%
0 / 1
MSCompoundFileReader
92.73% covered (success)
92.73%
153 / 165
70.59% covered (warning)
70.59%
12 / 17
43.71
0.00% covered (danger)
0.00%
0 / 1
 readFile
37.50% covered (danger)
37.50%
3 / 8
0.00% covered (danger)
0.00%
0 / 1
2.98
 readHandle
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 init
100.00% covered (success)
100.00%
27 / 27
100.00% covered (success)
100.00%
1 / 1
2
 sectorOffset
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 decodeClsid
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
1
 unpackOffset
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpack
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 bin2dec
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 readOffset
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
3.02
 readSector
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 error
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fseek
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 readDifat
71.43% covered (warning)
71.43%
10 / 14
0.00% covered (danger)
0.00%
0 / 1
5.58
 getNextSectorIdFromFat
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 getFatSector
90.00% covered (success)
90.00%
9 / 10
0.00% covered (danger)
0.00%
0 / 1
4.02
 readDirectory
100.00% covered (success)
100.00%
44 / 44
100.00% covered (success)
100.00%
1 / 1
10
1<?php
2/*
3 * Copyright 2019 Wikimedia Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software distributed
12 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13 * OF ANY KIND, either express or implied. See the License for the
14 * specific language governing permissions and limitations under the License.
15 */
16
17/**
18 * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
19 * file, and detect the MIME type.
20 *
21 * References:
22 *  - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
23 *  - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
24 *  - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
25 *  - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
26 *  - Python olefile https://github.com/decalage2/olefile
27 *  - OpenOffice.org's Documentation of the Microsoft Compound Document
28 *    File Format https://www.openoffice.org/sc/compdocfileformat.pdf
29 *
30 * @since 1.33
31 * @ingroup Mime
32 */
class MSCompoundFileReader {
    /** @var resource Open, seekable stream the compound file is read from */
    private $file;
    /** @var array Decoded fields of the 512-byte file header, keyed by field name */
    private $header;
    /** @var string|null MIME type inferred from well-known stream names in the directory */
    private $mime;
    /** @var string|null MIME type inferred from the CLSID of the root directory entry */
    private $mimeFromClsid;
    /** @var string|null English error message, set when parsing failed */
    private $error;
    /** @var int|null One of the self::ERROR_* constants, set when parsing failed */
    private $errorCode;
    /** @var bool True only once the header, DIFAT and directory were all read successfully */
    private $valid = false;

    /** @var int Sector size in bytes, 1 << header['sector_shift'] */
    private $sectorLength;
    /** @var int[] Sector IDs of the FAT sectors, in FAT order */
    private $difat;
    /** @var int[][] Cache of decoded FAT sectors, keyed by index into the DIFAT */
    private $fat = [];

    private const TYPE_UNALLOCATED = 0;
    private const TYPE_STORAGE = 1;
    private const TYPE_STREAM = 2;
    private const TYPE_ROOT = 5;

    public const ERROR_FILE_OPEN = 1;
    public const ERROR_SEEK = 2;
    public const ERROR_READ = 3;
    public const ERROR_INVALID_SIGNATURE = 4;
    public const ERROR_READ_PAST_END = 5;
    public const ERROR_INVALID_FORMAT = 6;

    /** Sector ID that terminates a sector chain */
    private const SECTOR_END_OF_CHAIN = 0xFFFFFFFE;

    /** DIFAT entries at or above this value end the list of FAT sectors */
    private const DIFAT_FIRST_NON_SECTOR = 0xFFFFFFFC;

    /**
     * Smallest accepted sector shift. A directory entry is 128 bytes (1 << 7),
     * so a smaller sector could not hold a single entry, and a shift below 2
     * would make the entries-per-FAT-sector divisor zero.
     */
    private const MIN_SECTOR_SHIFT = 7;

    /**
     * Largest accepted sector shift (1 MiB sectors). This is a sanity cap so
     * that a hostile header cannot request enormous, zero-length or negative
     * reads via an oversized shift.
     */
    private const MAX_SECTOR_SHIFT = 20;

    private static $mimesByClsid = [
        // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
        '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
        '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
        '00020906-0000-0000-C000-000000000046' => 'application/msword',
        '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
    ];

    /** Directory entry names whose presence identifies the document type */
    private static $mimesByStreamName = [
        'Workbook' => 'application/vnd.ms-excel',
        'WordDocument' => 'application/msword',
        'PowerPoint Document' => 'application/vnd.ms-powerpoint',
    ];

    /**
     * Read a file by name
     *
     * The file is opened in binary mode and is always closed again before
     * this method returns.
     *
     * @param string $fileName The full path to the file
     * @return array An associative array of information about the file:
     *    - valid: true if the file is valid, false otherwise
     *    - error: An error message in English, should be present if valid=false
     *    - errorCode: One of the self::ERROR_* constants
     *    - mime: The MIME type detected from the directory contents
     *    - mimeFromClsid: The MIME type detected from the CLSID on the root
     *      directory entry
     */
    public static function readFile( $fileName ) {
        $handle = fopen( $fileName, 'rb' );
        if ( $handle === false ) {
            return [
                'valid' => false,
                'error' => 'file does not exist',
                'errorCode' => self::ERROR_FILE_OPEN
            ];
        }
        try {
            return self::readHandle( $handle );
        } finally {
            // We opened the handle, so we are responsible for releasing it
            fclose( $handle );
        }
    }

    /**
     * Read from an open seekable handle
     *
     * The handle is not closed; it remains owned by the caller.
     *
     * @param resource $fileHandle
     * @return array An associative array of information about the file:
     *    - valid: true if the file is valid, false otherwise
     *    - error: An error message in English, should be present if valid=false
     *    - errorCode: One of the self::ERROR_* constants
     *    - mime: The MIME type detected from the directory contents
     *    - mimeFromClsid: The MIME type detected from the CLSID on the root
     *      directory entry
     */
    public static function readHandle( $fileHandle ) {
        $reader = new self( $fileHandle );
        $info = [
            'valid' => $reader->valid,
            'mime' => $reader->mime,
            'mimeFromClsid' => $reader->mimeFromClsid
        ];
        if ( $reader->error ) {
            $info['error'] = $reader->error;
            $info['errorCode'] = $reader->errorCode;
        }
        return $info;
    }

    /**
     * Parse the file immediately. Any RuntimeException raised by the parser
     * is converted into the error/errorCode properties instead of propagating.
     *
     * @param resource $fileHandle
     */
    private function __construct( $fileHandle ) {
        $this->file = $fileHandle;
        try {
            $this->init();
        } catch ( RuntimeException $e ) {
            $this->valid = false;
            $this->error = $e->getMessage();
            $this->errorCode = $e->getCode();
        }
    }

    /**
     * Read and validate the header, then load the DIFAT and walk the directory.
     * Sets $this->valid to true only if every step succeeded.
     *
     * @throws RuntimeException via error() on any malformed or unreadable data
     */
    private function init() {
        $this->header = $this->unpackOffset( 0, [
            'header_signature' => 8,
            'header_clsid' => 16,
            'minor_version' => 2,
            'major_version' => 2,
            'byte_order' => 2,
            'sector_shift' => 2,
            'mini_sector_shift' => 2,
            'reserved' => 6,
            'num_dir_sectors' => 4,
            'num_fat_sectors' => 4,
            'first_dir_sector' => 4,
            'transaction_signature_number' => 4,
            'mini_stream_cutoff_size' => 4,
            'first_mini_fat_sector' => 4,
            'num_mini_fat_sectors' => 4,
            'first_difat_sector' => 4,
            'num_difat_sectors' => 4,
            'difat' => 436,
        ] );
        if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
            $this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
                self::ERROR_INVALID_SIGNATURE );
        }

        // The sector shift comes straight from the (untrusted) file. Reject
        // values that would yield a zero, negative or absurdly large sector
        // length before anything is computed from it.
        $sectorShift = $this->header['sector_shift'];
        if ( $sectorShift < self::MIN_SECTOR_SHIFT || $sectorShift > self::MAX_SECTOR_SHIFT ) {
            $this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
        }
        $this->sectorLength = 1 << $sectorShift;

        $this->readDifat();
        $this->readDirectory();

        $this->valid = true;
    }

    /**
     * Byte offset of a sector within the file. The first sector-sized block
     * of the file is skipped, so sector 0 starts one sector length in.
     *
     * @param int $sectorId
     * @return int
     */
    private function sectorOffset( $sectorId ) {
        return $this->sectorLength * ( $sectorId + 1 );
    }

    /**
     * Convert a 16-byte binary CLSID to its canonical upper-case text form,
     * e.g. 00020820-0000-0000-C000-000000000046.
     *
     * @param string $binaryClsid
     * @return string
     */
    private function decodeClsid( $binaryClsid ) {
        // First three fields are little-endian integers, the rest are raw bytes
        $parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
        return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X",
            $parts['a'],
            $parts['b'],
            $parts['c'],
            $parts['d1'],
            $parts['d2'],
            $parts['d3'],
            $parts['d4'],
            $parts['d5'],
            $parts['d6'],
            $parts['d7'],
            $parts['d8']
        );
    }

    /**
     * Read a structure from the file at the given offset and decode it.
     *
     * @param int $offset Byte offset in the file
     * @param int[] $struct Field lengths in bytes, keyed by field name
     * @return array Decoded fields, see unpack()
     */
    private function unpackOffset( $offset, $struct ) {
        $block = $this->readOffset( $offset, array_sum( $struct ) );
        return $this->unpack( $block, 0, $struct );
    }

    /**
     * Decode consecutive fields from a binary string.
     *
     * Fields of up to 4 bytes are decoded as little-endian unsigned integers;
     * longer fields are returned as raw binary strings.
     *
     * @param string $block
     * @param int $offset Offset of the first field within $block
     * @param int[] $struct Field lengths in bytes, keyed by field name
     * @return array
     */
    private function unpack( $block, $offset, $struct ) {
        $data = [];
        foreach ( $struct as $key => $length ) {
            if ( $length > 4 ) {
                $data[$key] = substr( $block, $offset, $length );
            } else {
                $data[$key] = $this->bin2dec( $block, $offset, $length );
            }
            $offset += $length;
        }
        return $data;
    }

    /**
     * Decode a little-endian unsigned integer from a binary string.
     *
     * @param string $str
     * @param int $offset Offset of the least significant byte
     * @param int $length Number of bytes
     * @return int
     */
    private function bin2dec( $str, $offset, $length ) {
        $value = 0;
        // Start at the most significant (last) byte and work backwards
        for ( $i = $length - 1; $i >= 0; $i-- ) {
            $value *= 256;
            $value += ord( $str[$offset + $i] );
        }
        return $value;
    }

    /**
     * Read exactly $length bytes starting at $offset.
     *
     * @param int $offset
     * @param int $length
     * @return string
     * @throws RuntimeException ERROR_SEEK, ERROR_READ or ERROR_READ_PAST_END
     */
    private function readOffset( $offset, $length ) {
        $this->fseek( $offset );
        // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
        $block = @fread( $this->file, $length );
        if ( $block === false ) {
            $this->error( 'error reading from file', self::ERROR_READ );
        }
        if ( strlen( $block ) !== $length ) {
            $this->error( 'unable to read the required number of bytes from the file',
                self::ERROR_READ_PAST_END );
        }
        return $block;
    }

    /**
     * Read one whole sector.
     *
     * @param int $sectorId
     * @return string Exactly $this->sectorLength bytes
     */
    private function readSector( $sectorId ) {
        return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
    }

    /**
     * Abort parsing. The exception is caught by the constructor.
     *
     * @param string $message
     * @param int $code One of the self::ERROR_* constants
     * @return never
     */
    private function error( $message, $code ) {
        throw new RuntimeException( $message, $code );
    }

    /**
     * Seek to an absolute offset in the file.
     *
     * @param int $offset
     * @throws RuntimeException ERROR_SEEK if the stream refuses the seek
     */
    private function fseek( $offset ) {
        // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
        $result = @fseek( $this->file, $offset );
        if ( $result !== 0 ) {
            $this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
        }
    }

    /**
     * Build $this->difat, the ordered list of FAT sector IDs.
     *
     * The list starts with the 436 bytes embedded in the header and continues
     * through up to header['num_difat_sectors'] chained DIFAT sectors. The
     * last 4 bytes of each DIFAT sector hold the ID of the next one.
     *
     * @throws RuntimeException ERROR_INVALID_FORMAT if the chain revisits a sector
     */
    private function readDifat() {
        $binaryDifat = $this->header['difat'];
        $nextDifatSector = $this->header['first_difat_sector'];
        $sectorCount = $this->header['num_difat_sectors'];
        $payloadLength = $this->sectorLength - 4;
        $seenSectorIds = [];

        for ( $i = 0; $i < $sectorCount; $i++ ) {
            // The declared count is untrusted and may be as large as 2^32-1.
            // Without this check a chain that points back at itself would be
            // re-read and appended until memory runs out.
            if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
                $this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
            }
            $seenSectorIds[$nextDifatSector] = true;

            $block = $this->readSector( $nextDifatSector );
            $binaryDifat .= substr( $block, 0, $payloadLength );
            $nextDifatSector = $this->bin2dec( $block, $payloadLength, 4 );
            if ( $nextDifatSector == self::SECTOR_END_OF_CHAIN ) {
                break;
            }
        }

        $this->difat = [];
        $difatLength = strlen( $binaryDifat );
        for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
            $fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
            if ( $fatSector >= self::DIFAT_FIRST_NON_SECTOR ) {
                // First non-sector marker ends the list
                break;
            }
            $this->difat[] = $fatSector;
        }
    }

    /**
     * Look up the FAT entry for a sector, i.e. the ID of the sector that
     * follows it in its chain.
     *
     * @param int $sectorId
     * @return int
     */
    private function getNextSectorIdFromFat( $sectorId ) {
        $entriesPerSector = intdiv( $this->sectorLength, 4 );
        $fatSectorId = intdiv( $sectorId, $entriesPerSector );
        $fatSectorArray = $this->getFatSector( $fatSectorId );
        return $fatSectorArray[$sectorId % $entriesPerSector];
    }

    /**
     * Get one decoded sector of the FAT, reading and caching it on first use.
     *
     * @param int $fatSectorId Index into the DIFAT, not an absolute sector ID
     * @return int[] The 4-byte entries of that FAT sector
     * @throws RuntimeException ERROR_INVALID_FORMAT if the DIFAT has no such entry
     */
    private function getFatSector( $fatSectorId ) {
        if ( !isset( $this->fat[$fatSectorId] ) ) {
            if ( !isset( $this->difat[$fatSectorId] ) ) {
                $this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
            }
            $block = $this->readSector( $this->difat[$fatSectorId] );
            $blockLength = strlen( $block );
            $fat = [];
            for ( $pos = 0; $pos < $blockLength; $pos += 4 ) {
                $fat[] = $this->bin2dec( $block, $pos, 4 );
            }
            $this->fat[$fatSectorId] = $fat;
        }
        return $this->fat[$fatSectorId];
    }

    /**
     * Walk the directory sector chain and derive the MIME types.
     *
     * Sets $this->mimeFromClsid from the root entry's CLSID and $this->mime
     * from the last directory entry whose name is a known stream name.
     *
     * @throws RuntimeException ERROR_INVALID_FORMAT if the chain loops
     */
    private function readDirectory() {
        $dirSectorId = $this->header['first_dir_sector'];
        $binaryDir = '';
        $seenSectorIds = [];
        while ( $dirSectorId !== self::SECTOR_END_OF_CHAIN ) {
            if ( isset( $seenSectorIds[$dirSectorId] ) ) {
                $this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
            }
            $seenSectorIds[$dirSectorId] = true;

            $binaryDir .= $this->readSector( $dirSectorId );
            $dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
        }

        $struct = [
            'name_raw' => 64,
            'name_length' => 2,
            'object_type' => 1,
            'color' => 1,
            'sid_left' => 4,
            'sid_right' => 4,
            'sid_child' => 4,
            'clsid' => 16,
            'state_bits' => 4,
            'create_time_low' => 4,
            'create_time_high' => 4,
            'modify_time_low' => 4,
            'modify_time_high' => 4,
            'first_sector' => 4,
            'size_low' => 4,
            'size_high' => 4,
        ];
        $entryLength = array_sum( $struct );
        $dirLength = strlen( $binaryDir );

        for ( $pos = 0; $pos < $dirLength; $pos += $entryLength ) {
            $entry = $this->unpack( $binaryDir, $pos, $struct );

            // According to [MS-CFB] size_high may contain garbage due to a
            // bug in a writer, it's best to pretend it is zero
            $entry['size_high'] = 0;

            $type = $entry['object_type'];
            if ( $type == self::TYPE_UNALLOCATED ) {
                continue;
            }

            // name_length counts bytes including the two-byte terminator.
            // Clamp at zero so a bogus length below 2 cannot turn into a
            // negative substr() length.
            $nameBytes = max( 0, $entry['name_length'] - 2 );
            // Malformed UTF-16 in an untrusted file should simply fail to
            // match, not raise a notice; iconv() returns false in that case.
            // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
            $name = @iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );

            $clsid = $this->decodeClsid( $entry['clsid'] );
            if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) {
                $this->mimeFromClsid = self::$mimesByClsid[$clsid];
            }

            if ( is_string( $name ) && isset( self::$mimesByStreamName[$name] ) ) {
                $this->mime = self::$mimesByStreamName[$name];
            }
        }
    }
}