MediaWiki  master
MSCompoundFileReader.php
Go to the documentation of this file.
1 <?php
2 /*
3  * Copyright 2019 Wikimedia Foundation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License"); you may
6  * not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software distributed
12  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13  * OF ANY KIND, either express or implied. See the License for the
14  * specific language governing permissions and limitations under the License.
15  */
16 
/** @var resource Open stream the compound file is read from */
private $file;

/** @var array Header fields, keyed by name, as unpacked in init() */
private $header;

/** @var string|null MIME type chosen from a well-known directory entry name */
private $mime;

/** @var string|null MIME type chosen from the CLSID of the root directory entry */
private $mimeFromClsid;

/** @var string|null Message of the RuntimeException that aborted parsing, if any */
private $error;

/** @var int|null Code of that exception; the ERROR_* constants are used for it */
private $errorCode;

/** @var bool Set to true only once init() has run to completion */
private $valid = false;

/** @var int Size of one sector in bytes (1 << sector_shift) */
private $sectorLength;

/** @var int[] Sector IDs of the FAT sectors, in FAT order */
private $difat;

/** @var int[][] Decoded FAT sectors, cached by FAT sector index */
private $fat = [];

/* Values of the object_type field of a directory entry */
private const TYPE_UNALLOCATED = 0;
private const TYPE_STORAGE = 1;
private const TYPE_STREAM = 2;
private const TYPE_ROOT = 5;

/* Values reported under the 'errorCode' key of the result array */
public const ERROR_FILE_OPEN = 1;
public const ERROR_SEEK = 2;
public const ERROR_READ = 3;
public const ERROR_INVALID_SIGNATURE = 4;
public const ERROR_READ_PAST_END = 5;
public const ERROR_INVALID_FORMAT = 6;

/**
 * Known root-entry CLSIDs (in the textual form produced by decodeClsid())
 * and the MIME type each one stands for.
 *
 * @var string[]
 */
private static $mimesByClsid = [
	// From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
	'00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
	'00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
	'00020906-0000-0000-C000-000000000046' => 'application/msword',
	'64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
];
64 
/**
 * Read a file by name.
 *
 * The file is opened, parsed and closed again before this method returns.
 *
 * @param string $fileName Path of the file to inspect
 * @return array Result array as returned by readHandle(). If the file
 *   cannot be opened, the array has 'valid' => false together with
 *   'error' and 'errorCode' (self::ERROR_FILE_OPEN).
 */
public static function readFile( $fileName ) {
	// The content is binary, so request binary mode explicitly for portability.
	$handle = fopen( $fileName, 'rb' );
	if ( $handle === false ) {
		return [
			'valid' => false,
			'error' => 'file does not exist',
			'errorCode' => self::ERROR_FILE_OPEN
		];
	}
	try {
		return self::readHandle( $handle );
	} finally {
		// All reading happens inside readHandle(), so the handle this method
		// opened is no longer needed and must not be leaked.
		fclose( $handle );
	}
}
88 
/**
 * Read from an open seekable handle.
 *
 * @param resource $fileHandle Stream to parse; it is not closed here
 * @return array Associative array with the keys:
 *   - valid: whether parsing completed without an error
 *   - mime: MIME type derived from a directory entry name, or null
 *   - mimeFromClsid: MIME type derived from the root CLSID, or null
 *   - error, errorCode: present only when an error was recorded
 */
public static function readHandle( $fileHandle ) {
	$instance = new self( $fileHandle );

	$result = [
		'valid' => $instance->valid,
		'mime' => $instance->mime,
		'mimeFromClsid' => $instance->mimeFromClsid
	];

	if ( !$instance->error ) {
		return $result;
	}

	$result['error'] = $instance->error;
	$result['errorCode'] = $instance->errorCode;
	return $result;
}
114 
/**
 * Parse the given stream immediately. A RuntimeException thrown while
 * parsing is not propagated; its message and code are stored instead and
 * the instance is marked invalid.
 *
 * @param resource $fileHandle Stream to read the compound file from
 */
private function __construct( $fileHandle ) {
	$this->file = $fileHandle;

	try {
		$this->init();
	} catch ( RuntimeException $exception ) {
		$this->errorCode = $exception->getCode();
		$this->error = $exception->getMessage();
		$this->valid = false;
	}
}
125 
/**
 * Read and check the file header, then load the DIFAT and the directory.
 * Sets $this->valid to true when everything was read without error.
 *
 * @throws RuntimeException With code ERROR_INVALID_SIGNATURE if the file
 *   does not start with the compound file signature, ERROR_INVALID_FORMAT
 *   if the sector shift is not a permitted value, or any error raised
 *   while reading the header, DIFAT or directory.
 */
private function init() {
	// Field name => length in bytes, in on-disk order (512 bytes in total)
	$this->header = $this->unpackOffset( 0, [
		'header_signature' => 8,
		'header_clsid' => 16,
		'minor_version' => 2,
		'major_version' => 2,
		'byte_order' => 2,
		'sector_shift' => 2,
		'mini_sector_shift' => 2,
		'reserved' => 6,
		'num_dir_sectors' => 4,
		'num_fat_sectors' => 4,
		'first_dir_sector' => 4,
		'transaction_signature_number' => 4,
		'mini_stream_cutoff_size' => 4,
		'first_mini_fat_sector' => 4,
		'num_mini_fat_sectors' => 4,
		'first_difat_sector' => 4,
		'num_difat_sectors' => 4,
		'difat' => 436,
	] );
	if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
		$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
			self::ERROR_INVALID_SIGNATURE );
	}

	// The sector shift comes straight from the (untrusted) file and drives
	// every later offset and read length. [MS-CFB] only permits 9 (512 byte
	// sectors) and 12 (4096 byte sectors). Anything else is rejected here,
	// because tiny or oversized shifts would otherwise cause division by
	// zero, zero-length reads or enormous reads further down, none of which
	// surface as the RuntimeException the constructor handles.
	$sectorShift = $this->header['sector_shift'];
	if ( $sectorShift !== 9 && $sectorShift !== 12 ) {
		$this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
	}
	$this->sectorLength = 1 << $sectorShift;

	$this->readDifat();
	$this->readDirectory();

	$this->valid = true;
}
157 
/**
 * Get the byte offset in the file at which a sector starts.
 *
 * @param int $sectorId
 * @return int
 */
private function sectorOffset( $sectorId ) {
	// Sector 0 starts one sector length into the file, hence the +1
	$sectorsBefore = $sectorId + 1;
	return $sectorsBefore * $this->sectorLength;
}
161 
/**
 * Convert a 16-byte binary CLSID into its upper-case textual form,
 * XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX.
 *
 * @param string $binaryClsid
 * @return string
 */
private function decodeClsid( $binaryClsid ) {
	// One little-endian 32-bit field, two little-endian 16-bit fields,
	// then eight single bytes (keys d1..d8)
	$fields = unpack( 'Va/vb/vc/C8d', $binaryClsid );

	$trailingHex = '';
	for ( $n = 1; $n <= 8; $n++ ) {
		$trailingHex .= sprintf( "%02X", $fields['d' . $n] );
	}

	return sprintf( "%08X-%04X-%04X-%s-%s",
		$fields['a'],
		$fields['b'],
		$fields['c'],
		substr( $trailingHex, 0, 4 ),
		substr( $trailingHex, 4 )
	);
}
178 
/**
 * Read a structure from the file at the given byte offset and unpack it.
 *
 * @param int $offset Byte offset in the file
 * @param int[] $struct Field name => field length in bytes
 * @return array Field name => decoded value, see unpack()
 */
private function unpackOffset( $offset, $struct ) {
	$totalLength = array_sum( $struct );
	$rawData = $this->readOffset( $offset, $totalLength );
	return $this->unpack( $rawData, 0, $struct );
}
188 
/**
 * Split a binary string into named fields.
 *
 * Fields of up to 4 bytes are decoded as little-endian unsigned integers;
 * longer fields are returned as raw binary strings.
 *
 * @param string $block Binary data
 * @param int $offset Byte offset of the first field within $block
 * @param int[] $struct Field name => field length in bytes, in order
 * @return array Field name => int or string
 */
private function unpack( $block, $offset, $struct ) {
	$fields = [];
	$cursor = $offset;
	foreach ( $struct as $fieldName => $fieldLength ) {
		$fields[$fieldName] = $fieldLength > 4
			? substr( $block, $cursor, $fieldLength )
			: $this->bin2dec( $block, $cursor, $fieldLength );
		$cursor += $fieldLength;
	}
	return $fields;
}
207 
/**
 * Decode a little-endian unsigned integer embedded in a binary string.
 *
 * @param string $str Binary data
 * @param int $offset Byte offset of the least significant byte
 * @param int $length Number of bytes making up the integer
 * @return int
 */
private function bin2dec( $str, $offset, $length ) {
	$result = 0;
	// Walk from the most significant (last) byte down to the first one
	$index = $offset + $length;
	while ( $index > $offset ) {
		$index--;
		$result = $result * 256 + ord( $str[$index] );
	}
	return $result;
}
216 
/**
 * Read an exact number of bytes starting at a byte offset in the file.
 *
 * @param int $offset Byte offset to seek to
 * @param int $length Number of bytes required
 * @return string Exactly $length bytes
 * @throws RuntimeException ERROR_SEEK if seeking fails, ERROR_READ if the
 *   read fails, ERROR_READ_PAST_END if fewer bytes were available
 */
private function readOffset( $offset, $length ) {
	$this->fseek( $offset );

	Wikimedia\suppressWarnings();
	$data = fread( $this->file, $length );
	Wikimedia\restoreWarnings();

	if ( $data === false ) {
		$this->error( 'error reading from file', self::ERROR_READ );
	}

	$bytesRead = strlen( $data );
	if ( $bytesRead !== $length ) {
		$this->error( 'unable to read the required number of bytes from the file',
			self::ERROR_READ_PAST_END );
	}

	return $data;
}
231 
/**
 * Read the complete contents of one sector.
 *
 * @param int $sectorId
 * @return string Binary sector data
 * @throws RuntimeException If the sector cannot be read in full
 */
private function readSector( $sectorId ) {
	$start = $this->sectorOffset( $sectorId );
	$sectorSize = 1 << $this->header['sector_shift'];
	return $this->readOffset( $start, $sectorSize );
}
235 
/**
 * Abort parsing by throwing a RuntimeException. This method never returns.
 *
 * @param string $message Human-readable description of the problem
 * @param int $code Exception code; callers pass one of the ERROR_* constants
 * @throws RuntimeException Always
 */
private function error( $message, $code ) {
	$failure = new RuntimeException( $message, $code );
	throw $failure;
}
239 
/**
 * Move the file position to an absolute byte offset, with PHP warnings
 * suppressed for the duration of the seek.
 *
 * @param int $offset
 * @throws RuntimeException ERROR_SEEK if the underlying seek fails
 */
private function fseek( $offset ) {
	Wikimedia\suppressWarnings();
	$status = fseek( $this->file, $offset );
	Wikimedia\restoreWarnings();

	if ( $status === 0 ) {
		return;
	}
	$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
}
248 
/**
 * Build $this->difat, the list of sector IDs that hold the FAT.
 *
 * The list starts with the entries embedded in the header and continues
 * through a chain of DIFAT sectors. In each DIFAT sector the last 4 bytes
 * hold the ID of the next DIFAT sector and the rest holds more entries.
 *
 * @throws RuntimeException ERROR_INVALID_FORMAT if the DIFAT sector chain
 *   revisits a sector, or any read error from readSector()
 */
private function readDifat() {
	$binaryDifat = $this->header['difat'];
	$nextDifatSector = $this->header['first_difat_sector'];
	$seenSectorIds = [];
	for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
		// Both the sector count and the chain pointers are untrusted. Without
		// this check a sector pointing back into the chain would be appended
		// over and over, up to num_difat_sectors times.
		if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
			$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$nextDifatSector] = true;

		$block = $this->readSector( $nextDifatSector );
		$binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
		$nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
		// 0xFFFFFFFE marks the end of the chain
		if ( $nextDifatSector === 0xFFFFFFFE ) {
			break;
		}
	}

	$this->difat = [];
	$difatLength = strlen( $binaryDifat );
	for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
		$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
		// Values from 0xFFFFFFFC upwards are not sector IDs; the first one
		// ends the list
		if ( $fatSector < 0xFFFFFFFC ) {
			$this->difat[] = $fatSector;
		} else {
			break;
		}
	}
}
271 
/**
 * Look up the FAT entry for a sector, i.e. the ID of the sector that
 * follows it in its chain.
 *
 * @param int $sectorId
 * @return int The FAT entry for $sectorId
 * @throws RuntimeException If the required FAT sector cannot be loaded
 */
private function getNextSectorIdFromFat( $sectorId ) {
	// Each FAT entry is 4 bytes wide
	$perSector = intdiv( $this->sectorLength, 4 );
	$table = $this->getFatSector( intdiv( $sectorId, $perSector ) );
	$slot = $sectorId % $perSector;
	return $table[$slot];
}
278 
/**
 * Get one sector of the FAT as a list of integers, loading and caching it
 * on first use.
 *
 * @param int $fatSectorId Index of the FAT sector within the DIFAT
 * @return int[] The entries of that FAT sector
 * @throws RuntimeException ERROR_INVALID_FORMAT if the DIFAT has no such
 *   entry, or any read error from readSector()
 */
private function getFatSector( $fatSectorId ) {
	if ( isset( $this->fat[$fatSectorId] ) ) {
		return $this->fat[$fatSectorId];
	}

	if ( !isset( $this->difat[$fatSectorId] ) ) {
		$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
	}

	$sectorData = $this->readSector( $this->difat[$fatSectorId] );
	$sectorSize = strlen( $sectorData );

	$entries = [];
	for ( $p = 0; $p < $sectorSize; $p += 4 ) {
		$entries[] = $this->bin2dec( $sectorData, $p, 4 );
	}

	$this->fat[$fatSectorId] = $entries;
	return $this->fat[$fatSectorId];
}
294 
/**
 * Read the directory by following its sector chain through the FAT, then
 * scan the entries to set $this->mimeFromClsid (from the root entry's
 * CLSID) and $this->mime (from well-known entry names).
 *
 * @throws RuntimeException ERROR_INVALID_FORMAT if the directory chain
 *   loops, or any error raised while reading sectors or the FAT
 */
private function readDirectory() {
	$dirSectorId = $this->header['first_dir_sector'];
	$binaryDir = '';
	$seenSectorIds = [];
	// 0xFFFFFFFE marks the end of the chain
	while ( $dirSectorId !== 0xFFFFFFFE ) {
		if ( isset( $seenSectorIds[$dirSectorId] ) ) {
			$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$dirSectorId] = true;

		$binaryDir .= $this->readSector( $dirSectorId );
		$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
	}

	// Field name => length in bytes of one directory entry
	$struct = [
		'name_raw' => 64,
		'name_length' => 2,
		'object_type' => 1,
		'color' => 1,
		'sid_left' => 4,
		'sid_right' => 4,
		'sid_child' => 4,
		'clsid' => 16,
		'state_bits' => 4,
		'create_time_low' => 4,
		'create_time_high' => 4,
		'modify_time_low' => 4,
		'modify_time_high' => 4,
		'first_sector' => 4,
		'size_low' => 4,
		'size_high' => 4,
	];
	$entryLength = array_sum( $struct );
	$dirLength = strlen( $binaryDir );

	// Only unpack complete entries; never read past the end of the data
	for ( $pos = 0; $pos + $entryLength <= $dirLength; $pos += $entryLength ) {
		$entry = $this->unpack( $binaryDir, $pos, $struct );

		// According to [MS-CFB] size_high may contain garbage due to a
		// bug in a writer, it's best to pretend it is zero
		$entry['size_high'] = 0;

		$type = $entry['object_type'];
		if ( $type === self::TYPE_UNALLOCATED ) {
			continue;
		}

		// The CLSID is only consulted for the root entry, so only decode it there
		if ( $type === self::TYPE_ROOT ) {
			$clsid = $this->decodeClsid( $entry['clsid'] );
			if ( isset( self::$mimesByClsid[$clsid] ) ) {
				$this->mimeFromClsid = self::$mimesByClsid[$clsid];
			}
		}

		$name = $this->decodeEntryName( $entry['name_raw'], $entry['name_length'] );
		if ( $name === 'Workbook' ) {
			$this->mime = 'application/vnd.ms-excel';
		} elseif ( $name === 'WordDocument' ) {
			$this->mime = 'application/msword';
		} elseif ( $name === 'PowerPoint Document' ) {
			$this->mime = 'application/vnd.ms-powerpoint';
		}
	}
}

/**
 * Decode the UTF-16LE name of a directory entry to UTF-8.
 *
 * @param string $rawName The fixed-size raw name field of the entry
 * @param int $nameLength The entry's name_length field: the length of the
 *   name in bytes, including a 2-byte terminator
 * @return string|null The name, or null if the length field or the
 *   encoded data is unusable
 */
private function decodeEntryName( $rawName, $nameLength ) {
	// Drop the terminator and never trust the length beyond the field size
	$byteLength = min( $nameLength - 2, strlen( $rawName ) );
	// The length field is untrusted: an empty, negative or odd byte count
	// cannot be a UTF-16 string
	if ( $byteLength <= 0 || $byteLength % 2 !== 0 ) {
		return null;
	}

	// iconv() raises a PHP notice on malformed input; a damaged name in an
	// uploaded file should simply not match anything
	Wikimedia\suppressWarnings();
	$name = iconv( 'UTF-16LE', 'UTF-8', substr( $rawName, 0, $byteLength ) );
	Wikimedia\restoreWarnings();

	return $name === false ? null : $name;
}
357 }
MSCompoundFileReader\ERROR_READ
const ERROR_READ
Definition: MSCompoundFileReader.php:52
MSCompoundFileReader\$file
$file
Definition: MSCompoundFileReader.php:33
MSCompoundFileReader\sectorOffset
sectorOffset( $sectorId)
Definition: MSCompoundFileReader.php:158
MSCompoundFileReader\fseek
fseek( $offset)
Definition: MSCompoundFileReader.php:240
MSCompoundFileReader\$mimeFromClsid
$mimeFromClsid
Definition: MSCompoundFileReader.php:36
MSCompoundFileReader\$sectorLength
$sectorLength
Definition: MSCompoundFileReader.php:41
MSCompoundFileReader\unpack
unpack( $block, $offset, $struct)
Definition: MSCompoundFileReader.php:195
MSCompoundFileReader\$header
$header
Definition: MSCompoundFileReader.php:34
MSCompoundFileReader\init
init()
Definition: MSCompoundFileReader.php:126
MSCompoundFileReader\$mime
$mime
Definition: MSCompoundFileReader.php:35
MSCompoundFileReader\readHandle
static readHandle( $fileHandle)
Read from an open seekable handle.
Definition: MSCompoundFileReader.php:101
MSCompoundFileReader\readDirectory
readDirectory()
Definition: MSCompoundFileReader.php:295
MSCompoundFileReader\decodeClsid
decodeClsid( $binaryClsid)
Definition: MSCompoundFileReader.php:162
MSCompoundFileReader\bin2dec
bin2dec( $str, $offset, $length)
Definition: MSCompoundFileReader.php:208
MSCompoundFileReader\getNextSectorIdFromFat
getNextSectorIdFromFat( $sectorId)
Definition: MSCompoundFileReader.php:272
MSCompoundFileReader\__construct
__construct( $fileHandle)
Definition: MSCompoundFileReader.php:115
MSCompoundFileReader\ERROR_FILE_OPEN
const ERROR_FILE_OPEN
Definition: MSCompoundFileReader.php:50
MSCompoundFileReader\readFile
static readFile( $fileName)
Read a file by name.
Definition: MSCompoundFileReader.php:77
MSCompoundFileReader\TYPE_STORAGE
const TYPE_STORAGE
Definition: MSCompoundFileReader.php:46
MSCompoundFileReader\getFatSector
getFatSector( $fatSectorId)
Definition: MSCompoundFileReader.php:279
MSCompoundFileReader\$difat
$difat
Definition: MSCompoundFileReader.php:42
MSCompoundFileReader\$error
$error
Definition: MSCompoundFileReader.php:37
MSCompoundFileReader\readDifat
readDifat()
Definition: MSCompoundFileReader.php:249
MSCompoundFileReader\$valid
$valid
Definition: MSCompoundFileReader.php:39
MSCompoundFileReader\readSector
readSector( $sectorId)
Definition: MSCompoundFileReader.php:232
MSCompoundFileReader\$errorCode
$errorCode
Definition: MSCompoundFileReader.php:38
MSCompoundFileReader
Read the directory of a Microsoft Compound File Binary file, a.k.a.
Definition: MSCompoundFileReader.php:32
MSCompoundFileReader\ERROR_SEEK
const ERROR_SEEK
Definition: MSCompoundFileReader.php:51
MSCompoundFileReader\TYPE_ROOT
const TYPE_ROOT
Definition: MSCompoundFileReader.php:48
MSCompoundFileReader\$mimesByClsid
static $mimesByClsid
Definition: MSCompoundFileReader.php:57
MSCompoundFileReader\ERROR_INVALID_FORMAT
const ERROR_INVALID_FORMAT
Definition: MSCompoundFileReader.php:55
MSCompoundFileReader\ERROR_READ_PAST_END
const ERROR_READ_PAST_END
Definition: MSCompoundFileReader.php:54
MSCompoundFileReader\TYPE_UNALLOCATED
const TYPE_UNALLOCATED
Definition: MSCompoundFileReader.php:45
MSCompoundFileReader\ERROR_INVALID_SIGNATURE
const ERROR_INVALID_SIGNATURE
Definition: MSCompoundFileReader.php:53
MSCompoundFileReader\error
error( $message, $code)
Definition: MSCompoundFileReader.php:236
MSCompoundFileReader\unpackOffset
unpackOffset( $offset, $struct)
Definition: MSCompoundFileReader.php:184
MSCompoundFileReader\$fat
$fat
Definition: MSCompoundFileReader.php:43
MSCompoundFileReader\readOffset
readOffset( $offset, $length)
Definition: MSCompoundFileReader.php:217
$type
$type
Definition: testCompression.php:52
MSCompoundFileReader\TYPE_STREAM
const TYPE_STREAM
Definition: MSCompoundFileReader.php:47