Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
92.73% |
153 / 165 |
|
70.59% |
12 / 17 |
CRAP | |
0.00% |
0 / 1 |
MSCompoundFileReader | |
92.73% |
153 / 165 |
|
70.59% |
12 / 17 |
43.71 | |
0.00% |
0 / 1 |
readFile | |
37.50% |
3 / 8 |
|
0.00% |
0 / 1 |
2.98 | |||
readHandle | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
init | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
2 | |||
sectorOffset | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
decodeClsid | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
unpackOffset | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
unpack | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
bin2dec | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
readOffset | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
3.02 | |||
readSector | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
error | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
fseek | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
readDifat | |
71.43% |
10 / 14 |
|
0.00% |
0 / 1 |
5.58 | |||
getNextSectorIdFromFat | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getFatSector | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
4.02 | |||
readDirectory | |
100.00% |
44 / 44 |
|
100.00% |
1 / 1 |
10 |
1 | <?php |
2 | /* |
3 | * Copyright 2019 Wikimedia Foundation |
4 | * |
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
6 | * not use this file except in compliance with the License. |
7 | * You may obtain a copy of the License at |
8 | * |
9 | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | * |
11 | * Unless required by applicable law or agreed to in writing, software distributed |
12 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS |
13 | * OF ANY KIND, either express or implied. See the License for the |
14 | * specific language governing permissions and limitations under the License. |
15 | */ |
16 | |
/**
 * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
 * file, and detect the MIME type.
 *
 * References:
 *  - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
 *  - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
 *  - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
 *  - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
 *  - Python olefile https://github.com/decalage2/olefile
 *  - OpenOffice.org's Documentation of the Microsoft Compound Document
 *    File Format https://www.openoffice.org/sc/compdocfileformat.pdf
 *
 * @since 1.33
 * @ingroup Mime
 */
class MSCompoundFileReader {
	/** @var resource Seekable stream the file is read from */
	private $file;
	/** @var array Decoded fields of the 512-byte file header */
	private $header;
	/** @var string|null MIME type detected from well-known stream names */
	private $mime;
	/** @var string|null MIME type detected from the root entry's CLSID */
	private $mimeFromClsid;
	/** @var string|null Message of the error that aborted parsing, if any */
	private $error;
	/** @var int|null One of the self::ERROR_* constants, if parsing failed */
	private $errorCode;
	/** @var bool True once the header, DIFAT and directory were all read */
	private $valid = false;

	/** @var int Sector size in bytes, 1 << header['sector_shift'] */
	private $sectorLength;
	/** @var array List of sector IDs that hold the FAT, in FAT order */
	private $difat;
	/** @var array[] Cache of decoded FAT sectors, keyed by index into $difat */
	private $fat = [];

	private const TYPE_UNALLOCATED = 0;
	private const TYPE_STORAGE = 1;
	private const TYPE_STREAM = 2;
	private const TYPE_ROOT = 5;

	/** Sector ID that terminates a sector chain */
	private const SECTOR_END_OF_CHAIN = 0xFFFFFFFE;
	/** DIFAT entries at or above this value do not name a FAT sector */
	private const SECTOR_FIRST_SPECIAL = 0xFFFFFFFC;

	/**
	 * Smallest accepted sector shift: 128 bytes, the size of one directory
	 * entry and the minimum given in the OpenOffice.org documentation.
	 */
	private const MIN_SECTOR_SHIFT = 7;
	/**
	 * Largest accepted sector shift (1 MiB sectors). This is a defensive cap
	 * chosen here, well above the 512 and 4096 byte sectors MS-CFB specifies,
	 * so that an untrusted header cannot request enormous read buffers.
	 */
	private const MAX_SECTOR_SHIFT = 20;

	public const ERROR_FILE_OPEN = 1;
	public const ERROR_SEEK = 2;
	public const ERROR_READ = 3;
	public const ERROR_INVALID_SIGNATURE = 4;
	public const ERROR_READ_PAST_END = 5;
	public const ERROR_INVALID_FORMAT = 6;

	private static $mimesByClsid = [
		// From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
		'00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
		'00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
		'00020906-0000-0000-C000-000000000046' => 'application/msword',
		'64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
	];

	/** MIME types keyed by the directory entry name that identifies them */
	private static $mimesByStreamName = [
		'Workbook' => 'application/vnd.ms-excel',
		'WordDocument' => 'application/msword',
		'PowerPoint Document' => 'application/vnd.ms-powerpoint',
	];

	/**
	 * Read a file by name
	 *
	 * The file is opened, parsed and closed again before returning.
	 *
	 * @param string $fileName The full path to the file
	 * @return array An associative array of information about the file:
	 *   - valid: true if the file is valid, false otherwise
	 *   - error: An error message in English, should be present if valid=false
	 *   - errorCode: One of the self::ERROR_* constants
	 *   - mime: The MIME type detected from the directory contents
	 *   - mimeFromClsid: The MIME type detected from the CLSID on the root
	 *     directory entry
	 */
	public static function readFile( $fileName ) {
		// The failure is reported through the return value, so the PHP warning
		// would only duplicate it.
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$handle = @fopen( $fileName, 'rb' );
		if ( $handle === false ) {
			return [
				'valid' => false,
				'error' => 'file does not exist',
				'errorCode' => self::ERROR_FILE_OPEN
			];
		}
		try {
			return self::readHandle( $handle );
		} finally {
			// This method opened the handle, so it is responsible for closing it
			fclose( $handle );
		}
	}

	/**
	 * Read from an open seekable handle
	 *
	 * The handle is not closed; that remains the caller's responsibility.
	 *
	 * @param resource $fileHandle
	 * @return array An associative array of information about the file:
	 *   - valid: true if the file is valid, false otherwise
	 *   - error: An error message in English, should be present if valid=false
	 *   - errorCode: One of the self::ERROR_* constants
	 *   - mime: The MIME type detected from the directory contents
	 *   - mimeFromClsid: The MIME type detected from the CLSID on the root
	 *     directory entry
	 */
	public static function readHandle( $fileHandle ) {
		$reader = new self( $fileHandle );
		$info = [
			'valid' => $reader->valid,
			'mime' => $reader->mime,
			'mimeFromClsid' => $reader->mimeFromClsid
		];
		if ( $reader->error ) {
			$info['error'] = $reader->error;
			$info['errorCode'] = $reader->errorCode;
		}
		return $info;
	}

	/**
	 * Parse the file immediately. Parse failures are recorded in
	 * $error / $errorCode instead of being propagated to the caller.
	 *
	 * @param resource $fileHandle Open seekable handle
	 */
	private function __construct( $fileHandle ) {
		$this->file = $fileHandle;
		try {
			$this->init();
		} catch ( RuntimeException $e ) {
			$this->valid = false;
			$this->error = $e->getMessage();
			$this->errorCode = $e->getCode();
		}
	}

	/**
	 * Read and validate the header, then load the DIFAT and walk the
	 * directory. Sets $valid only if every step succeeded.
	 */
	private function init() {
		$this->header = $this->unpackOffset( 0, [
			'header_signature' => 8,
			'header_clsid' => 16,
			'minor_version' => 2,
			'major_version' => 2,
			'byte_order' => 2,
			'sector_shift' => 2,
			'mini_sector_shift' => 2,
			'reserved' => 6,
			'num_dir_sectors' => 4,
			'num_fat_sectors' => 4,
			'first_dir_sector' => 4,
			'transaction_signature_number' => 4,
			'mini_stream_cutoff_size' => 4,
			'first_mini_fat_sector' => 4,
			'num_mini_fat_sectors' => 4,
			'first_difat_sector' => 4,
			'num_difat_sectors' => 4,
			'difat' => 436,
		] );
		if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
			$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
				self::ERROR_INVALID_SIGNATURE );
		}

		// The shift comes straight from the (untrusted) file. Out-of-range
		// values would yield a zero sector length, sectors too small to hold a
		// directory entry, or huge read requests, so reject them up front.
		$sectorShift = $this->header['sector_shift'];
		if ( $sectorShift < self::MIN_SECTOR_SHIFT || $sectorShift > self::MAX_SECTOR_SHIFT ) {
			$this->error( "unsupported sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
		}
		$this->sectorLength = 1 << $sectorShift;

		$this->readDifat();
		$this->readDirectory();

		$this->valid = true;
	}

	/**
	 * Get the byte offset at which a sector starts. The first sector-sized
	 * slot of the file is skipped, so sector N starts at slot N + 1.
	 *
	 * @param int $sectorId
	 * @return int
	 */
	private function sectorOffset( $sectorId ) {
		return $this->sectorLength * ( $sectorId + 1 );
	}

	/**
	 * Convert a 16-byte binary CLSID to its upper-case hexadecimal string
	 * form, e.g. 00020810-0000-0000-C000-000000000046.
	 *
	 * @param string $binaryClsid
	 * @return string
	 */
	private function decodeClsid( $binaryClsid ) {
		// One little-endian 32-bit field, two little-endian 16-bit fields,
		// then eight individual bytes
		$parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
		return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X",
			$parts['a'],
			$parts['b'],
			$parts['c'],
			$parts['d1'],
			$parts['d2'],
			$parts['d3'],
			$parts['d4'],
			$parts['d5'],
			$parts['d6'],
			$parts['d7'],
			$parts['d8']
		);
	}

	/**
	 * Read a structure from the file at the given offset and decode it.
	 *
	 * @param int $offset Byte offset in the file
	 * @param int[] $struct Field lengths in bytes, keyed by field name
	 * @return array Decoded values keyed by field name
	 */
	private function unpackOffset( $offset, $struct ) {
		$block = $this->readOffset( $offset, array_sum( $struct ) );
		return $this->unpack( $block, 0, $struct );
	}

	/**
	 * Decode consecutive fields from a binary string. Fields longer than
	 * four bytes are returned as raw binary strings; all others are decoded
	 * as little-endian unsigned integers.
	 *
	 * @param string $block
	 * @param int $offset Offset of the first field within $block
	 * @param int[] $struct Field lengths in bytes, keyed by field name
	 * @return array Decoded values keyed by field name
	 */
	private function unpack( $block, $offset, $struct ) {
		$data = [];
		foreach ( $struct as $key => $length ) {
			if ( $length > 4 ) {
				$data[$key] = substr( $block, $offset, $length );
			} else {
				$data[$key] = $this->bin2dec( $block, $offset, $length );
			}
			$offset += $length;
		}
		return $data;
	}

	/**
	 * Decode a little-endian unsigned integer from a binary string.
	 *
	 * @param string $str
	 * @param int $offset Offset of the least significant byte
	 * @param int $length Number of bytes to decode
	 * @return int|float
	 */
	private function bin2dec( $str, $offset, $length ) {
		$value = 0;
		// Start from the most significant (last) byte and work downwards
		for ( $i = $length - 1; $i >= 0; $i-- ) {
			$value *= 256;
			$value += ord( $str[$offset + $i] );
		}
		return $value;
	}

	/**
	 * Read exactly $length bytes starting at $offset.
	 *
	 * @param int $offset
	 * @param int $length
	 * @return string
	 * @throws RuntimeException If seeking or reading fails, or fewer than
	 *   $length bytes are available
	 */
	private function readOffset( $offset, $length ) {
		$this->fseek( $offset );
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$block = @fread( $this->file, $length );
		if ( $block === false ) {
			$this->error( 'error reading from file', self::ERROR_READ );
		}
		if ( strlen( $block ) !== $length ) {
			$this->error( 'unable to read the required number of bytes from the file',
				self::ERROR_READ_PAST_END );
		}
		return $block;
	}

	/**
	 * Read the full contents of one sector.
	 *
	 * @param int $sectorId
	 * @return string
	 */
	private function readSector( $sectorId ) {
		return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
	}

	/**
	 * Abort parsing. The exception is caught by the constructor.
	 *
	 * @param string $message
	 * @param int $code One of the self::ERROR_* constants
	 * @return never
	 */
	private function error( $message, $code ) {
		throw new RuntimeException( $message, $code );
	}

	/**
	 * Seek to an absolute offset in the file.
	 *
	 * @param int $offset
	 * @throws RuntimeException If the seek fails
	 */
	private function fseek( $offset ) {
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$result = @fseek( $this->file, $offset );
		if ( $result !== 0 ) {
			$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
		}
	}

	/**
	 * Build the list of FAT sector IDs from the DIFAT entries stored in the
	 * header plus any chained DIFAT sectors.
	 */
	private function readDifat() {
		$binaryDifat = $this->header['difat'];
		$nextDifatSector = $this->header['first_difat_sector'];
		$seenSectorIds = [];
		for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
			// The sector count is taken from the file, so without this check a
			// chain that points back at itself would be followed, and its
			// contents accumulated in memory, for as long as the count allows.
			if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
				$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
			}
			$seenSectorIds[$nextDifatSector] = true;

			$block = $this->readSector( $nextDifatSector );
			// The last four bytes of a DIFAT sector hold the ID of the next one
			$binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
			$nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
			if ( $nextDifatSector === self::SECTOR_END_OF_CHAIN ) {
				break;
			}
		}

		$this->difat = [];
		$difatLength = strlen( $binaryDifat );
		for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
			$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
			if ( $fatSector >= self::SECTOR_FIRST_SPECIAL ) {
				// First entry that is not a sector ID: the list ends here
				break;
			}
			$this->difat[] = $fatSector;
		}
	}

	/**
	 * Look up the sector that follows the given one in its chain.
	 *
	 * @param int $sectorId
	 * @return int|float The FAT entry for $sectorId
	 */
	private function getNextSectorIdFromFat( $sectorId ) {
		$entriesPerSector = intdiv( $this->sectorLength, 4 );
		$fatSectorId = intdiv( $sectorId, $entriesPerSector );
		$fatSectorArray = $this->getFatSector( $fatSectorId );
		return $fatSectorArray[$sectorId % $entriesPerSector];
	}

	/**
	 * Get the decoded entries of one FAT sector, reading it from the file on
	 * first use and serving it from the cache afterwards.
	 *
	 * @param int $fatSectorId Index into the DIFAT, not an absolute sector ID
	 * @return array List of 32-bit FAT entries
	 * @throws RuntimeException If the DIFAT has no such entry
	 */
	private function getFatSector( $fatSectorId ) {
		if ( !isset( $this->fat[$fatSectorId] ) ) {
			if ( !isset( $this->difat[$fatSectorId] ) ) {
				$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
			}
			$block = $this->readSector( $this->difat[$fatSectorId] );
			$blockLength = strlen( $block );
			$fat = [];
			for ( $pos = 0; $pos < $blockLength; $pos += 4 ) {
				$fat[] = $this->bin2dec( $block, $pos, 4 );
			}
			$this->fat[$fatSectorId] = $fat;
		}
		return $this->fat[$fatSectorId];
	}

	/**
	 * Follow the directory sector chain, decode every directory entry and
	 * derive $mime from entry names and $mimeFromClsid from the root CLSID.
	 */
	private function readDirectory() {
		$dirSectorId = $this->header['first_dir_sector'];
		$binaryDir = '';
		$seenSectorIds = [];
		while ( $dirSectorId !== self::SECTOR_END_OF_CHAIN ) {
			if ( isset( $seenSectorIds[$dirSectorId] ) ) {
				$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
			}
			$seenSectorIds[$dirSectorId] = true;

			$binaryDir .= $this->readSector( $dirSectorId );
			$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
		}

		$struct = [
			'name_raw' => 64,
			'name_length' => 2,
			'object_type' => 1,
			'color' => 1,
			'sid_left' => 4,
			'sid_right' => 4,
			'sid_child' => 4,
			'clsid' => 16,
			'state_bits' => 4,
			'create_time_low' => 4,
			'create_time_high' => 4,
			'modify_time_low' => 4,
			'modify_time_high' => 4,
			'first_sector' => 4,
			'size_low' => 4,
			'size_high' => 4,
		];
		$entryLength = array_sum( $struct );
		$dirLength = strlen( $binaryDir );

		// Only decode complete entries; a truncated trailing entry is ignored
		for ( $pos = 0; $pos + $entryLength <= $dirLength; $pos += $entryLength ) {
			$entry = $this->unpack( $binaryDir, $pos, $struct );

			$type = $entry['object_type'];
			if ( $type == self::TYPE_UNALLOCATED ) {
				continue;
			}

			// name_length counts bytes including the two-byte terminator. It is
			// untrusted: a value below 2 must give an empty name rather than a
			// negative substr() length.
			$nameBytes = $entry['name_length'] - 2;
			if ( $nameBytes > 0 ) {
				// Malformed UTF-16 makes iconv() raise a notice and return
				// false; such a name simply matches nothing below.
				// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
				$name = @iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );
			} else {
				$name = '';
			}

			$clsid = $this->decodeClsid( $entry['clsid'] );
			if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) {
				$this->mimeFromClsid = self::$mimesByClsid[$clsid];
			}

			if ( is_string( $name ) && isset( self::$mimesByStreamName[$name] ) ) {
				$this->mime = self::$mimesByStreamName[$name];
			}
		}
	}
}