MediaWiki master
MSCompoundFileReader.php
Go to the documentation of this file.
1<?php
2/*
3 * Copyright 2019 Wikimedia Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software distributed
12 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13 * OF ANY KIND, either express or implied. See the License for the
14 * specific language governing permissions and limitations under the License.
15 */
16
17namespace Wikimedia\Mime;
18
19use RuntimeException;
20
/** The file handle passed to the constructor; all reads and seeks go through it */
39 private $file;
/** Header fields as unpacked by init(), keyed by field name */
41 private $header;
/** MIME type chosen from a recognised directory entry name, if any was seen */
43 private $mime;
/** MIME type looked up in MIMES_BY_CLSID from a root entry's CLSID, if any matched */
45 private $mimeFromClsid;
/** Message of the RuntimeException caught by the constructor, if parsing failed */
47 private $error;
/** Code of the RuntimeException caught by the constructor (one of the ERROR_* constants) */
49 private $errorCode;
/** Set to true only once init() has run to completion */
51 private $valid = false;
52
/** Sector size in bytes, computed as 1 << header sector_shift */
54 private $sectorLength;
/** List of FAT sector IDs collected by readDifat() */
56 private $difat;
/** Cache of decoded FAT sectors, keyed by index into the DIFAT list */
58 private $fat = [];
59
// Values of the object_type field of a directory entry
60 private const TYPE_UNALLOCATED = 0;
61 private const TYPE_STORAGE = 1;
62 private const TYPE_STREAM = 2;
63 private const TYPE_ROOT = 5;
64
// Values reported in the 'errorCode' key of the result array
65 public const ERROR_FILE_OPEN = 1;
66 public const ERROR_SEEK = 2;
67 public const ERROR_READ = 3;
68 public const ERROR_INVALID_SIGNATURE = 4;
69 public const ERROR_READ_PAST_END = 5;
70 public const ERROR_INVALID_FORMAT = 6;
71
// Maps a CLSID, formatted as by decodeClsid(), to the MIME type it implies
72 private const MIMES_BY_CLSID = [
73 // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
74 '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
75 '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
76 '00020906-0000-0000-C000-000000000046' => 'application/msword',
77 '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
78 ];
79
/**
 * Read a compound file by name and report what was detected.
 *
 * The handle opened here is always closed again before returning.
 *
 * @param string $fileName Path passed to fopen()
 * @return array Associative array. If the file cannot be opened it holds
 *   'valid' => false, 'error' and 'errorCode' (self::ERROR_FILE_OPEN);
 *   otherwise it is the result of self::readHandle() for the opened file.
 */
public static function readFile( $fileName ) {
	// The content is binary, so request an untranslated stream explicitly
	$handle = fopen( $fileName, 'rb' );
	if ( $handle === false ) {
		return [
			'valid' => false,
			'error' => 'file does not exist',
			'errorCode' => self::ERROR_FILE_OPEN
		];
	}
	try {
		return self::readHandle( $handle );
	} finally {
		// We opened the handle, so we are responsible for releasing it
		fclose( $handle );
	}
}
103
/**
 * Parse a compound file from an already opened handle.
 *
 * @param resource $fileHandle Handle the reader will seek and read on
 * @return array Associative array with the keys 'valid', 'mime' and
 *   'mimeFromClsid'; 'error' and 'errorCode' are added when the reader
 *   recorded an error.
 */
public static function readHandle( $fileHandle ) {
	$reader = new self( $fileHandle );

	$info = [];
	$info['valid'] = $reader->valid;
	$info['mime'] = $reader->mime;
	$info['mimeFromClsid'] = $reader->mimeFromClsid;

	if ( !$reader->error ) {
		return $info;
	}

	$info['error'] = $reader->error;
	$info['errorCode'] = $reader->errorCode;
	return $info;
}
129
/**
 * Store the handle and parse the file immediately.
 *
 * A RuntimeException raised while parsing is not propagated; its message
 * and code are recorded on the object and the reader is marked invalid.
 *
 * @param resource $fileHandle
 */
private function __construct( $fileHandle ) {
	$this->file = $fileHandle;
	try {
		$this->init();
	} catch ( RuntimeException $failure ) {
		$this->errorCode = $failure->getCode();
		$this->error = $failure->getMessage();
		$this->valid = false;
	}
}
140
/**
 * Read and check the file header, then load the DIFAT and the directory.
 *
 * Marks the reader as valid only if every step completed.
 *
 * @throws RuntimeException (via error()) on an unreadable or malformed file
 */
private function init() {
	// Field name => size in bytes; the sizes add up to the 512-byte header
	$this->header = $this->unpackOffset( 0, [
		'header_signature' => 8,
		'header_clsid' => 16,
		'minor_version' => 2,
		'major_version' => 2,
		'byte_order' => 2,
		'sector_shift' => 2,
		'mini_sector_shift' => 2,
		'reserved' => 6,
		'num_dir_sectors' => 4,
		'num_fat_sectors' => 4,
		'first_dir_sector' => 4,
		'transaction_signature_number' => 4,
		'mini_stream_cutoff_size' => 4,
		'first_mini_fat_sector' => 4,
		'num_mini_fat_sectors' => 4,
		'first_difat_sector' => 4,
		'num_difat_sectors' => 4,
		'difat' => 436,
	] );
	if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
		$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
			self::ERROR_INVALID_SIGNATURE );
	}

	// The shift comes straight from the (untrusted) file. [MS-CFB] only
	// permits 9 (512-byte sectors) and 12 (4096-byte sectors). Anything else
	// must be rejected here: a shift of 64 or more makes the sector length 0,
	// and a length below 4 leaves zero FAT entries per sector, both of which
	// make later code raise errors that are not RuntimeException and so
	// would escape the constructor; large shifts would request huge reads.
	$sectorShift = $this->header['sector_shift'];
	if ( $sectorShift !== 9 && $sectorShift !== 12 ) {
		$this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
	}
	$this->sectorLength = 1 << $sectorShift;

	$this->readDifat();
	$this->readDirectory();

	$this->valid = true;
}
172
/**
 * Get the byte offset in the file at which a sector starts.
 *
 * Sector N begins N + 1 sector lengths into the file, so sector 0 starts
 * one sector length after the beginning of the file.
 *
 * @param int $sectorId
 * @return int
 */
private function sectorOffset( $sectorId ) {
	$slot = $sectorId + 1;
	return $this->sectorLength * $slot;
}
176
/**
 * Convert a binary CLSID to its upper-case hexadecimal text form,
 * XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX.
 *
 * The input is decoded as one little-endian 32-bit value, two little-endian
 * 16-bit values and eight single bytes.
 *
 * @param string $binaryClsid
 * @return string
 */
private function decodeClsid( $binaryClsid ) {
	$fields = unpack( 'Va/vb/vc/C8d', $binaryClsid );

	// Collect the decoded fields in output order
	$args = [];
	foreach ( [ 'a', 'b', 'c', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8' ] as $key ) {
		$args[] = $fields[$key];
	}

	return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X", ...$args );
}
193
/**
 * Read a structure from the file at the given offset and decode it.
 *
 * @param int $offset Byte offset in the file
 * @param int[] $struct Field name => field size in bytes
 * @return array Decoded fields, see unpack()
 */
private function unpackOffset( $offset, $struct ) {
	$totalLength = array_sum( $struct );
	$raw = $this->readOffset( $offset, $totalLength );
	return $this->unpack( $raw, 0, $struct );
}
203
/**
 * Decode consecutive fields from a binary string.
 *
 * Fields of up to 4 bytes are decoded as integers with bin2dec(); longer
 * fields are returned as raw byte strings.
 *
 * @param string $block Binary data
 * @param int $offset Byte offset of the first field within $block
 * @param int[] $struct Field name => field size in bytes, in storage order
 * @return array Field name => decoded value
 */
private function unpack( $block, $offset, $struct ) {
	$fields = [];
	$cursor = $offset;
	foreach ( $struct as $name => $size ) {
		$fields[$name] = $size <= 4
			? $this->bin2dec( $block, $cursor, $size )
			: substr( $block, $cursor, $size );
		$cursor += $size;
	}
	return $fields;
}
222
/**
 * Decode a little-endian unsigned integer from a binary string.
 *
 * @param string $str Binary data
 * @param int $offset Byte offset of the least significant byte
 * @param int $length Number of bytes to decode
 * @return int
 */
private function bin2dec( $str, $offset, $length ) {
	$result = 0;
	// Walk from the most significant (last) byte down to the first one
	$index = $offset + $length;
	while ( $index > $offset ) {
		$index--;
		$result = $result * 256 + ord( $str[$index] );
	}
	return $result;
}
231
/**
 * Read exactly $length bytes starting at the given file offset.
 *
 * @param int $offset Byte offset in the file
 * @param int $length Number of bytes required
 * @return string
 * @throws RuntimeException (via error()) if seeking or reading fails, or
 *   if fewer bytes than requested were returned
 */
private function readOffset( $offset, $length ) {
	$this->fseek( $offset );
	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	$data = @fread( $this->file, $length );
	if ( $data === false ) {
		$this->error( 'error reading from file', self::ERROR_READ );
	} elseif ( strlen( $data ) !== $length ) {
		$this->error( 'unable to read the required number of bytes from the file',
			self::ERROR_READ_PAST_END );
	}
	return $data;
}
245
/**
 * Read one whole sector from the file.
 *
 * @param int $sectorId
 * @return string The sector's bytes, exactly one sector length long
 * @throws RuntimeException (via error()) if the sector cannot be read in full
 */
private function readSector( $sectorId ) {
	// Use the sector length computed once by init(), the same value
	// sectorOffset() relies on, rather than deriving it again here
	return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
}
249
/**
 * Abort parsing by throwing an exception.
 *
 * @param string $message Human-readable description of the problem
 * @param int $code One of the self::ERROR_* constants
 * @throws RuntimeException always
 */
private function error( $message, $code ) {
	$exception = new RuntimeException( $message, $code );
	throw $exception;
}
258
/**
 * Move the file pointer to an absolute offset.
 *
 * @param int $offset Byte offset in the file
 * @throws RuntimeException (via error()) with code ERROR_SEEK if the
 *   underlying fseek() does not report success
 */
private function fseek( $offset ) {
	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	if ( @fseek( $this->file, $offset ) === 0 ) {
		return;
	}
	$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
}
266
/**
 * Build the list of FAT sector IDs ($this->difat).
 *
 * The list starts with the 436 bytes of DIFAT stored in the header and is
 * extended with the contents of the chained DIFAT sectors. In each such
 * sector the last 4 bytes hold the ID of the next DIFAT sector and the
 * rest holds further entries.
 *
 * @throws RuntimeException (via error()) if a sector cannot be read or the
 *   DIFAT sector chain loops back on itself
 */
private function readDifat() {
	$binaryDifat = $this->header['difat'];
	$nextDifatSector = $this->header['first_difat_sector'];
	$linkOffset = $this->sectorLength - 4;

	// Both the sector count and the chain links come from the file, so a
	// chain that points back at itself must be detected; otherwise it would
	// keep appending data until memory runs out.
	$seenSectorIds = [];
	for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
		if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
			$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$nextDifatSector] = true;

		$block = $this->readSector( $nextDifatSector );
		$binaryDifat .= substr( $block, 0, $linkOffset );
		$nextDifatSector = $this->bin2dec( $block, $linkOffset, 4 );
		// 0xFFFFFFFE marks the end of the chain
		if ( $nextDifatSector == 0xFFFFFFFE ) {
			break;
		}
	}

	$this->difat = [];
	$difatLength = strlen( $binaryDifat );
	for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
		$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
		// Values from 0xFFFFFFFC upwards are not sector IDs; the first one
		// ends the list
		if ( $fatSector >= 0xFFFFFFFC ) {
			break;
		}
		$this->difat[] = $fatSector;
	}
}
289
/**
 * Look up the FAT entry for a sector, i.e. the ID of the sector that
 * follows it in its chain.
 *
 * @param int $sectorId
 * @return int The 4-byte FAT entry stored for $sectorId
 */
private function getNextSectorIdFromFat( $sectorId ) {
	// Each FAT entry is 4 bytes wide
	$perSector = intdiv( $this->sectorLength, 4 );
	$fatSector = $this->getFatSector( intdiv( $sectorId, $perSector ) );
	$indexInSector = $sectorId % $perSector;
	return $fatSector[$indexInSector];
}
296
/**
 * Get the decoded entries of one FAT sector, reading it on first use.
 *
 * @param int $fatSectorId Index into the DIFAT list (not a file sector ID)
 * @return int[] The sector's 4-byte entries, in order
 * @throws RuntimeException (via error()) if the index is not present in the
 *   DIFAT list or the sector cannot be read
 */
private function getFatSector( $fatSectorId ) {
	// Serve repeated requests from the cache
	if ( isset( $this->fat[$fatSectorId] ) ) {
		return $this->fat[$fatSectorId];
	}

	if ( !isset( $this->difat[$fatSectorId] ) ) {
		$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
	}

	$raw = $this->readSector( $this->difat[$fatSectorId] );
	$rawLength = strlen( $raw );
	$entries = [];
	for ( $pos = 0; $pos < $rawLength; $pos += 4 ) {
		$entries[] = $this->bin2dec( $raw, $pos, 4 );
	}

	$this->fat[$fatSectorId] = $entries;
	return $entries;
}
312
/**
 * Read all directory sectors and derive MIME types from the entries.
 *
 * Sets $this->mimeFromClsid when a root entry carries a CLSID listed in
 * MIMES_BY_CLSID, and $this->mime when an entry has one of the recognised
 * names. When several entries match, the last one wins.
 *
 * @throws RuntimeException (via error()) if a sector cannot be read or the
 *   directory sector chain loops
 */
private function readDirectory() {
	// Follow the directory's sector chain through the FAT until the
	// end-of-chain marker, concatenating the sectors
	$dirSectorId = $this->header['first_dir_sector'];
	$binaryDir = '';
	$seenSectorIds = [];
	while ( $dirSectorId !== 0xFFFFFFFE ) {
		if ( isset( $seenSectorIds[$dirSectorId] ) ) {
			$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$dirSectorId] = true;

		$binaryDir .= $this->readSector( $dirSectorId );
		$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
	}

	// Field name => size in bytes of one directory entry
	$struct = [
		'name_raw' => 64,
		'name_length' => 2,
		'object_type' => 1,
		'color' => 1,
		'sid_left' => 4,
		'sid_right' => 4,
		'sid_child' => 4,
		'clsid' => 16,
		'state_bits' => 4,
		'create_time_low' => 4,
		'create_time_high' => 4,
		'modify_time_low' => 4,
		'modify_time_high' => 4,
		'first_sector' => 4,
		'size_low' => 4,
		'size_high' => 4,
	];
	$entryLength = array_sum( $struct );

	$mimesByEntryName = [
		'Workbook' => 'application/vnd.ms-excel',
		'WordDocument' => 'application/msword',
		'PowerPoint Document' => 'application/vnd.ms-powerpoint',
	];

	$dirLength = strlen( $binaryDir );
	for ( $pos = 0; $pos < $dirLength; $pos += $entryLength ) {
		$entry = $this->unpack( $binaryDir, $pos, $struct );

		$type = $entry['object_type'];
		if ( $type === self::TYPE_UNALLOCATED ) {
			continue;
		}

		if ( $type === self::TYPE_ROOT ) {
			$clsid = $this->decodeClsid( $entry['clsid'] );
			if ( isset( self::MIMES_BY_CLSID[$clsid] ) ) {
				$this->mimeFromClsid = self::MIMES_BY_CLSID[$clsid];
			}
		}

		// name_length is a byte count that, per [MS-CFB], includes the
		// two-byte terminator. It is untrusted: never let the length go
		// negative, which would make substr() cut from the end instead.
		$nameBytes = max( 0, $entry['name_length'] - 2 );
		// Malformed UTF-16 must not raise a PHP notice; a failed conversion
		// simply means the entry has no usable name.
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$name = @iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );

		if ( is_string( $name ) && isset( $mimesByEntryName[$name] ) ) {
			$this->mime = $mimesByEntryName[$name];
		}
	}
}
375}
376
// Register the global name 'MSCompoundFileReader' as an alias of the namespaced class
378class_alias( MSCompoundFileReader::class, 'MSCompoundFileReader' );
Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE file, and detect the MIME type.
static readFile( $fileName)
Read a file by name.
static readHandle( $fileHandle)
Read from an open seekable handle.
$header