MediaWiki REL1_37
MSCompoundFileReader.php
Go to the documentation of this file.
1<?php
2/*
3 * Copyright 2019 Wikimedia Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software distributed
12 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13 * OF ANY KIND, either express or implied. See the License for the
14 * specific language governing permissions and limitations under the License.
15 */
16
/** Open handle of the file being parsed; stored by the constructor and passed to fread()/fseek() */
33 private $file;
/** Decoded header fields, as returned by unpackOffset() in init() */
34 private $header;
/** MIME type chosen in readDirectory() from well-known stream names; left at its default when none match */
35 private $mime;
/** Message of the RuntimeException caught by the constructor, if parsing failed */
37 private $error;
/** Code of the RuntimeException caught by the constructor (one of the ERROR_* constants raised via error()) */
38 private $errorCode;
/** Set to true only when init() runs to completion */
39 private $valid = false;
40
/** List of FAT sector IDs collected by readDifat() */
42 private $difat;
/** Cache of decoded FAT sectors, keyed by FAT sector index; filled lazily by getFatSector() */
43 private $fat = [];
44
// Values of a directory entry's object_type field, as compared in readDirectory().
// Only TYPE_UNALLOCATED and TYPE_ROOT are referenced by the visible code.
45 private const TYPE_UNALLOCATED = 0;
46 private const TYPE_STORAGE = 1;
47 private const TYPE_STREAM = 2;
48 private const TYPE_ROOT = 5;
49
// Error codes reported through the 'errorCode' key of the result array.
/** fopen() failed in readFile() */
50 public const ERROR_FILE_OPEN = 1;
/** fseek() reported failure */
51 public const ERROR_SEEK = 2;
/** fread() returned false */
52 public const ERROR_READ = 3;
/** The first 8 bytes of the file are not the compound file signature */
53 public const ERROR_INVALID_SIGNATURE = 4;
/** Fewer bytes than requested could be read */
54 public const ERROR_READ_PAST_END = 5;
/** The file structure is inconsistent (e.g. a FAT loop, or a FAT sector missing from the DIFAT) */
55 public const ERROR_INVALID_FORMAT = 6;
56
/** Map from the root directory entry's CLSID (as formatted by decodeClsid()) to a MIME type */
57 private static $mimesByClsid = [
58 // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
59 '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
60 '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
61 '00020906-0000-0000-C000-000000000046' => 'application/msword',
62 '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
63 ];
64
/**
 * Read a file by name.
 *
 * Opens the file, parses it with readHandle() and closes the handle again
 * before returning.
 *
 * @param string $fileName Path of the file to open
 * @return array Associative array. When the file cannot be opened it holds
 *   only 'valid' (false), 'error' and 'errorCode' (self::ERROR_FILE_OPEN);
 *   otherwise it is the result of readHandle().
 */
public static function readFile( $fileName ) {
	// Binary mode: the parser works on raw bytes, so no newline translation
	// must ever be applied to the stream.
	$handle = fopen( $fileName, 'rb' );
	if ( $handle === false ) {
		return [
			'valid' => false,
			'error' => 'file does not exist',
			'errorCode' => self::ERROR_FILE_OPEN
		];
	}
	try {
		return self::readHandle( $handle );
	} finally {
		// This method opened the handle, so it is responsible for releasing
		// it, whether readHandle() returned or threw.
		fclose( $handle );
	}
}
88
/**
 * Read from an open seekable handle.
 *
 * @param resource $fileHandle Handle to parse; it is not closed here
 * @return array Associative array with the keys 'valid', 'mime' and
 *   'mimeFromClsid', plus 'error' and 'errorCode' when the reader
 *   recorded an error message.
 */
public static function readHandle( $fileHandle ) {
	$reader = new self( $fileHandle );

	$result = [];
	$result['valid'] = $reader->valid;
	$result['mime'] = $reader->mime;
	$result['mimeFromClsid'] = $reader->mimeFromClsid;

	if ( !$reader->error ) {
		return $result;
	}

	// Error details are only present when parsing recorded a message.
	$result['error'] = $reader->error;
	$result['errorCode'] = $reader->errorCode;
	return $result;
}
114
/**
 * Store the handle and parse the file immediately.
 *
 * A RuntimeException thrown while parsing is not propagated: its message
 * and code are stored in $error / $errorCode and $valid is reset to false.
 *
 * @param resource $fileHandle Handle to read from
 */
private function __construct( $fileHandle ) {
	$this->file = $fileHandle;

	try {
		$this->init();
	} catch ( RuntimeException $failure ) {
		$this->errorCode = $failure->getCode();
		$this->error = $failure->getMessage();
		$this->valid = false;
	}
}
125
/**
 * Parse the file: read and check the header, then load the DIFAT and the
 * directory. Marks the reader as valid when everything succeeded.
 *
 * @throws RuntimeException (via error()) when the signature is wrong, the
 *   sector shift is unusable, or any of the reads fails
 */
private function init() {
	// Field name => length in bytes; the lengths add up to the 512 bytes
	// read from offset 0.
	$this->header = $this->unpackOffset( 0, [
		'header_signature' => 8,
		'header_clsid' => 16,
		'minor_version' => 2,
		'major_version' => 2,
		'byte_order' => 2,
		'sector_shift' => 2,
		'mini_sector_shift' => 2,
		'reserved' => 6,
		'num_dir_sectors' => 4,
		'num_fat_sectors' => 4,
		'first_dir_sector' => 4,
		'transaction_signature_number' => 4,
		'mini_stream_cutoff_size' => 4,
		'first_mini_fat_sector' => 4,
		'num_mini_fat_sectors' => 4,
		'first_difat_sector' => 4,
		'num_difat_sectors' => 4,
		'difat' => 436,
	] );
	if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
		$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
			self::ERROR_INVALID_SIGNATURE );
	}

	// The sector shift comes straight from the (untrusted) file and must be
	// validated before it is used:
	// - sectors shorter than 4 bytes give zero FAT entries per sector, which
	//   makes getNextSectorIdFromFat() divide by zero (a DivisionByZeroError,
	//   which the constructor does not catch);
	// - sectors shorter than the 512-byte header read above would overlap it,
	//   since sectorOffset() reserves exactly one sector-sized slot before
	//   sector 0;
	// - very large shifts overflow the shift below or make every sector read
	//   request an enormous buffer.
	// [MS-CFB] only defines 9 and 12; the upper bound here is a deliberately
	// generous sanity limit rather than a value taken from the specification.
	$minSectorShift = 9;
	$maxSectorShift = 20;
	$sectorShift = $this->header['sector_shift'];
	if ( $sectorShift < $minSectorShift || $sectorShift > $maxSectorShift ) {
		$this->error( "unsupported sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
	}

	$this->sectorLength = 1 << $sectorShift;
	$this->readDifat();
	$this->readDirectory();

	$this->valid = true;
}
157
/**
 * Convert a sector ID to a byte offset in the file.
 *
 * Sector N starts N + 1 sector lengths into the file; offset 0 is where
 * init() reads the header.
 *
 * @param int $sectorId
 * @return int Byte offset of the start of the sector
 */
private function sectorOffset( $sectorId ) {
	$slotIndex = $sectorId + 1;
	return $slotIndex * $this->sectorLength;
}
161
/**
 * Format a 16-byte binary CLSID as an upper-case hexadecimal string of the
 * form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX.
 *
 * The first three groups are decoded as little-endian integers (32, 16 and
 * 16 bits); the remaining eight bytes are printed in file order.
 *
 * @param string $binaryClsid
 * @return string
 */
private function decodeClsid( $binaryClsid ) {
	$fields = unpack( 'Va/vb/vc/C8d', $binaryClsid );

	$text = sprintf( "%08X-%04X-%04X-", $fields['a'], $fields['b'], $fields['c'] );
	for ( $byteNumber = 1; $byteNumber <= 8; $byteNumber++ ) {
		$text .= sprintf( "%02X", $fields['d' . $byteNumber] );
		if ( $byteNumber === 2 ) {
			// A dash separates the first two trailing bytes from the last six.
			$text .= '-';
		}
	}
	return $text;
}
178
/**
 * Read a structure from a given offset in the file and decode it.
 *
 * @param int $offset Byte offset in the file
 * @param int[] $struct Map of field name => field length in bytes
 * @return array Map of field name => decoded value, see unpack()
 */
private function unpackOffset( $offset, $struct ) {
	$totalLength = array_sum( $struct );
	$raw = $this->readOffset( $offset, $totalLength );
	return $this->unpack( $raw, 0, $struct );
}
188
/**
 * Decode consecutive fields from a binary string.
 *
 * Fields of up to 4 bytes are decoded as integers with bin2dec(); longer
 * fields are returned as raw byte strings.
 *
 * @param string $block Binary data
 * @param int $offset Byte offset of the first field within $block
 * @param int[] $struct Map of field name => field length in bytes
 * @return array Map of field name => int|string
 */
private function unpack( $block, $offset, $struct ) {
	$fields = [];
	$cursor = $offset;
	foreach ( $struct as $fieldName => $fieldSize ) {
		$fields[$fieldName] = $fieldSize <= 4
			? $this->bin2dec( $block, $cursor, $fieldSize )
			: substr( $block, $cursor, $fieldSize );
		$cursor += $fieldSize;
	}
	return $fields;
}
207
/**
 * Decode a little-endian unsigned integer from a binary string.
 *
 * @param string $str Binary data
 * @param int $offset Byte offset of the least significant byte
 * @param int $length Number of bytes to decode
 * @return int
 */
private function bin2dec( $str, $offset, $length ) {
	$result = 0;
	// Walk from the most significant (last) byte down to the first one.
	$index = $offset + $length;
	while ( $index > $offset ) {
		$index--;
		$result = $result * 256 + ord( $str[$index] );
	}
	return $result;
}
216
/**
 * Read exactly $length bytes starting at a byte offset in the file.
 *
 * @param int $offset Byte offset to seek to
 * @param int $length Number of bytes to read
 * @return string Binary string of exactly $length bytes
 * @throws RuntimeException (via error()) with ERROR_SEEK when seeking
 *   fails, ERROR_READ when reading fails or $length is not positive, and
 *   ERROR_READ_PAST_END when the file ends before $length bytes were read
 */
private function readOffset( $offset, $length ) {
	$this->fseek( $offset );

	// fread() rejects non-positive lengths: with a warning and false on
	// PHP 7, but with a ValueError on PHP 8, which the constructor would not
	// catch. Report it as the read error PHP 7 would have produced.
	if ( $length <= 0 ) {
		$this->error( 'error reading from file', self::ERROR_READ );
	}

	$block = '';
	$readFailed = false;
	Wikimedia\suppressWarnings();
	try {
		// A single fread() may legitimately return fewer bytes than requested
		// on streams that are not plain files, so keep reading until the
		// request is satisfied or no more data arrives.
		while ( strlen( $block ) < $length ) {
			$chunk = fread( $this->file, $length - strlen( $block ) );
			if ( $chunk === false ) {
				$readFailed = true;
				break;
			}
			if ( $chunk === '' ) {
				// End of file
				break;
			}
			$block .= $chunk;
		}
	} finally {
		// Never leave warnings suppressed, even if something above throws.
		Wikimedia\restoreWarnings();
	}

	if ( $readFailed ) {
		$this->error( 'error reading from file', self::ERROR_READ );
	}
	if ( strlen( $block ) !== $length ) {
		$this->error( 'unable to read the required number of bytes from the file',
			self::ERROR_READ_PAST_END );
	}
	return $block;
}
231
/**
 * Read one whole sector from the file.
 *
 * @param int $sectorId
 * @return string Binary contents of the sector
 */
private function readSector( $sectorId ) {
	$start = $this->sectorOffset( $sectorId );
	$byteCount = 1 << $this->header['sector_shift'];
	return $this->readOffset( $start, $byteCount );
}
235
/**
 * Abort parsing by throwing an exception. The constructor catches it and
 * stores the message and code.
 *
 * @param string $message Error description
 * @param int $code One of the self::ERROR_* constants
 * @throws RuntimeException Always
 */
private function error( $message, $code ) {
	$exception = new RuntimeException( $message, $code );
	throw $exception;
}
244
/**
 * Move the file pointer to an absolute byte offset, with PHP warnings
 * suppressed for the duration of the call.
 *
 * @param int $offset
 * @throws RuntimeException (via error()) with ERROR_SEEK when the
 *   underlying fseek() does not return 0
 */
private function fseek( $offset ) {
	Wikimedia\suppressWarnings();
	$status = fseek( $this->file, $offset );
	Wikimedia\restoreWarnings();

	if ( $status === 0 ) {
		return;
	}
	$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
}
253
/**
 * Build $this->difat, the list of sector IDs that hold the FAT.
 *
 * The list starts with the entries stored in the header and continues
 * through the chain of DIFAT sectors. In each DIFAT sector, all but the
 * last four bytes are further entries and the last four bytes are the ID
 * of the next DIFAT sector.
 *
 * @throws RuntimeException (via error()) when a sector cannot be read or
 *   the DIFAT chain revisits a sector
 */
private function readDifat() {
	$binaryDifat = $this->header['difat'];
	$nextDifatSector = $this->header['first_difat_sector'];
	// num_difat_sectors is an untrusted 32-bit value, so it is no useful
	// bound on its own: a chain that points back to itself would be re-read
	// and appended until memory runs out. Track visited sectors, as
	// readDirectory() does for the FAT chain.
	$seenSectorIds = [];
	for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
		if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
			$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$nextDifatSector] = true;

		$block = $this->readSector( $nextDifatSector );
		$binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
		$nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
		// 0xFFFFFFFE marks the end of the chain
		if ( $nextDifatSector === 0xFFFFFFFE ) {
			break;
		}
	}

	$this->difat = [];
	$difatLength = strlen( $binaryDifat );
	for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
		$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
		// Values from 0xFFFFFFFC upwards are not sector IDs; the first one
		// ends the list.
		if ( $fatSector >= 0xFFFFFFFC ) {
			break;
		}
		$this->difat[] = $fatSector;
	}
}
276
/**
 * Look up the FAT entry for a sector, i.e. the ID of the sector that
 * follows it in its chain.
 *
 * @param int $sectorId
 * @return int The FAT entry for $sectorId
 */
private function getNextSectorIdFromFat( $sectorId ) {
	// Each FAT entry is 4 bytes long.
	$perSector = intdiv( $this->sectorLength, 4 );
	$fatEntries = $this->getFatSector( intdiv( $sectorId, $perSector ) );
	$slot = $sectorId % $perSector;
	return $fatEntries[$slot];
}
283
/**
 * Get the decoded entries of one FAT sector, reading and caching it on
 * first use.
 *
 * @param int $fatSectorId Index of the FAT sector within the DIFAT list
 * @return int[] The FAT entries stored in that sector
 * @throws RuntimeException (via error()) with ERROR_INVALID_FORMAT when
 *   the DIFAT has no entry for $fatSectorId
 */
private function getFatSector( $fatSectorId ) {
	if ( isset( $this->fat[$fatSectorId] ) ) {
		return $this->fat[$fatSectorId];
	}

	if ( !isset( $this->difat[$fatSectorId] ) ) {
		$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
	}

	$raw = $this->readSector( $this->difat[$fatSectorId] );
	$rawLength = strlen( $raw );
	$entries = [];
	for ( $byte = 0; $byte < $rawLength; $byte += 4 ) {
		$entries[] = $this->bin2dec( $raw, $byte, 4 );
	}

	$this->fat[$fatSectorId] = $entries;
	return $entries;
}
299
/**
 * Read the directory and derive the MIME type from it.
 *
 * Follows the FAT chain that starts at the header's first_dir_sector,
 * decodes every 128-byte directory entry, and sets:
 *  - $this->mimeFromClsid, when the root entry's CLSID is listed in
 *    self::$mimesByClsid;
 *  - $this->mime, when an entry has one of the well-known stream names.
 *
 * @throws RuntimeException (via error()) when a sector cannot be read or
 *   the directory's FAT chain contains a loop
 */
private function readDirectory() {
	$dirSectorId = $this->header['first_dir_sector'];
	$binaryDir = '';
	$seenSectorIds = [];
	// 0xFFFFFFFE marks the end of the chain
	while ( $dirSectorId !== 0xFFFFFFFE ) {
		if ( isset( $seenSectorIds[$dirSectorId] ) ) {
			$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$dirSectorId] = true;

		$binaryDir .= $this->readSector( $dirSectorId );
		$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
	}

	// Field name => length in bytes
	$struct = [
		'name_raw' => 64,
		'name_length' => 2,
		'object_type' => 1,
		'color' => 1,
		'sid_left' => 4,
		'sid_right' => 4,
		'sid_child' => 4,
		'clsid' => 16,
		'state_bits' => 4,
		'create_time_low' => 4,
		'create_time_high' => 4,
		'modify_time_low' => 4,
		'modify_time_high' => 4,
		'first_sector' => 4,
		'size_low' => 4,
		'size_high' => 4,
	];
	$entryLength = array_sum( $struct );
	$dirLength = strlen( $binaryDir );

	// Only decode complete entries: a trailing fragment shorter than one
	// entry would otherwise be read past the end of the buffer.
	for ( $pos = 0; $pos + $entryLength <= $dirLength; $pos += $entryLength ) {
		$entry = $this->unpack( $binaryDir, $pos, $struct );

		// According to [MS-CFB] size_high may contain garbage due to a
		// bug in a writer, it's best to pretend it is zero
		$entry['size_high'] = 0;

		$type = $entry['object_type'];
		if ( $type === self::TYPE_UNALLOCATED ) {
			continue;
		}

		// name_length is reduced by 2 to drop the UTF-16 terminator. Clamp
		// the result: a corrupt value below 2 would give a negative length,
		// which substr() interprets as "all but the last N bytes".
		$nameBytes = min( max( $entry['name_length'] - 2, 0 ), strlen( $entry['name_raw'] ) );
		// Names come from an untrusted file and may not be valid UTF-16;
		// iconv() then returns false, which simply matches none of the
		// names below, so its notice is not worth surfacing.
		Wikimedia\suppressWarnings();
		$name = iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );
		Wikimedia\restoreWarnings();

		// Only the root entry's CLSID is of interest, so only decode that one.
		if ( $type === self::TYPE_ROOT ) {
			$clsid = $this->decodeClsid( $entry['clsid'] );
			if ( isset( self::$mimesByClsid[$clsid] ) ) {
				$this->mimeFromClsid = self::$mimesByClsid[$clsid];
			}
		}

		if ( $name === 'Workbook' ) {
			$this->mime = 'application/vnd.ms-excel';
		} elseif ( $name === 'WordDocument' ) {
			$this->mime = 'application/msword';
		} elseif ( $name === 'PowerPoint Document' ) {
			$this->mime = 'application/vnd.ms-powerpoint';
		}
	}
}
362}
Read the directory of a Microsoft Compound File Binary file and detect its MIME type.
static readFile( $fileName)
Read a file by name.
unpack( $block, $offset, $struct)
readOffset( $offset, $length)
unpackOffset( $offset, $struct)
bin2dec( $str, $offset, $length)
static readHandle( $fileHandle)
Read from an open seekable handle.