MediaWiki  master
MSCompoundFileReader.php
Go to the documentation of this file.
1 <?php
2 /*
3  * Copyright 2019 Wikimedia Foundation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License"); you may
6  * not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software distributed
12  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13  * OF ANY KIND, either express or implied. See the License for the
14  * specific language governing permissions and limitations under the License.
15  */
16 
17 use Wikimedia\AtEase\AtEase;
18 
/** Directory entry object type that readDirectory() skips */
private const TYPE_UNALLOCATED = 0;
/** Directory entry object type; presumably a storage (folder-like) entry, not referenced in this class */
private const TYPE_STORAGE = 1;
/** Directory entry object type; presumably a stream entry, not referenced in this class */
private const TYPE_STREAM = 2;
/** Directory entry object type whose CLSID is looked up in $mimesByClsid */
private const TYPE_ROOT = 5;

/** Error code: readFile() could not open the file */
public const ERROR_FILE_OPEN = 1;
/** Error code: seeking within the file failed */
public const ERROR_SEEK = 2;
/** Error code: fread() reported a failure */
public const ERROR_READ = 3;
/** Error code: the first 8 bytes are not the compound file signature */
public const ERROR_INVALID_SIGNATURE = 4;
/** Error code: fewer bytes than requested could be read */
public const ERROR_READ_PAST_END = 5;
/** Error code: the file structure is inconsistent (e.g. a FAT loop) */
public const ERROR_INVALID_FORMAT = 6;

/** @var string[] MIME types keyed by the upper-case textual CLSID of the root entry */
private static $mimesByClsid = [
	// From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
	'00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
	'00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
	'00020906-0000-0000-C000-000000000046' => 'application/msword',
	'64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
];

/** @var resource The handle all seeks and reads are performed on */
private $file;
/** @var array Header fields unpacked from offset 0 of the file */
private $header;
/** @var string|null MIME type detected from well-known directory entry names */
private $mime;
/** @var string|null MIME type detected from the CLSID of the root entry */
private $mimeFromClsid;
/** @var string|null Message of the exception that aborted parsing, if any */
private $error;
/** @var int|null Code of the exception that aborted parsing, if any */
private $errorCode;
/** @var bool Set to true once init() has run to completion */
private $valid = false;

/** @var int Sector size in bytes, derived from the header's sector_shift */
private $sectorLength;
/** @var int[] Sector IDs of the FAT sectors, in the order listed by the DIFAT */
private $difat;
/** @var int[][] Decoded FAT sectors, cached by their index in $difat */
private $fat = [];
66 
/**
 * Read a file by name.
 *
 * The file is opened here, handed to readHandle(), and closed again before
 * returning, whatever the outcome of the parse.
 *
 * @param string $fileName Path of the file to inspect
 * @return array The array produced by readHandle(). If the file cannot be
 *   opened, an array with 'valid' => false, 'error' and
 *   'errorCode' => self::ERROR_FILE_OPEN is returned instead.
 */
public static function readFile( $fileName ) {
	// The format is binary, so request untranslated reads explicitly
	$handle = fopen( $fileName, 'rb' );
	if ( $handle === false ) {
		return [
			'valid' => false,
			'error' => 'file does not exist',
			'errorCode' => self::ERROR_FILE_OPEN
		];
	}
	try {
		return self::readHandle( $handle );
	} finally {
		// This method opened the handle, so it is responsible for releasing it
		fclose( $handle );
	}
}
90 
/**
 * Read from an open seekable handle.
 *
 * @param resource $fileHandle Handle the reader will seek and read on
 * @return array An array with the keys:
 *   - valid: whether the whole file structure could be parsed
 *   - mime: MIME type detected from directory entry names, or null
 *   - mimeFromClsid: MIME type detected from the root CLSID, or null
 *   - error / errorCode: only present when parsing recorded an error message
 */
public static function readHandle( $fileHandle ) {
	$parsed = new self( $fileHandle );
	$result = [
		'valid' => $parsed->valid,
		'mime' => $parsed->mime,
		'mimeFromClsid' => $parsed->mimeFromClsid
	];
	if ( !$parsed->error ) {
		return $result;
	}
	$result['error'] = $parsed->error;
	$result['errorCode'] = $parsed->errorCode;
	return $result;
}
116 
/**
 * Parse the given handle immediately. A RuntimeException raised while
 * parsing is not propagated; its message and code are stored on the
 * object and the file is marked as not valid.
 *
 * @param resource $fileHandle Handle to read from
 */
private function __construct( $fileHandle ) {
	$this->file = $fileHandle;
	try {
		$this->init();
	} catch ( RuntimeException $failure ) {
		$this->errorCode = $failure->getCode();
		$this->error = $failure->getMessage();
		$this->valid = false;
	}
}
127 
/**
 * Parse the header, then the DIFAT and the directory. $this->valid is only
 * set once all of these steps succeeded.
 *
 * @throws RuntimeException (via error()) on a bad signature, an implausible
 *   sector size, or any failure in the subsequent reads
 */
private function init() {
	// Field name => width in bytes, in on-disk order starting at offset 0
	$this->header = $this->unpackOffset( 0, [
		'header_signature' => 8,
		'header_clsid' => 16,
		'minor_version' => 2,
		'major_version' => 2,
		'byte_order' => 2,
		'sector_shift' => 2,
		'mini_sector_shift' => 2,
		'reserved' => 6,
		'num_dir_sectors' => 4,
		'num_fat_sectors' => 4,
		'first_dir_sector' => 4,
		'transaction_signature_number' => 4,
		'mini_stream_cutoff_size' => 4,
		'first_mini_fat_sector' => 4,
		'num_mini_fat_sectors' => 4,
		'first_difat_sector' => 4,
		'num_difat_sectors' => 4,
		'difat' => 436,
	] );
	if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
		$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
			self::ERROR_INVALID_SIGNATURE );
	}

	// sector_shift comes straight from the file. Left unchecked, a tiny value
	// yields zero FAT entries per sector (division by zero later on), a value
	// of 64 or more yields a zero sector length, and anything in between asks
	// for absurdly large reads. [MS-CFB] only defines 9 and 12; a somewhat
	// wider window is tolerated here. The lower bound also guarantees that the
	// 512 byte header fits into the slot sectorOffset() skips.
	$sectorShift = $this->header['sector_shift'];
	if ( $sectorShift < 9 || $sectorShift > 16 ) {
		$this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
	}
	$this->sectorLength = 1 << $sectorShift;

	$this->readDifat();
	$this->readDirectory();

	$this->valid = true;
}
159 
/**
 * Translate a sector ID into a byte offset within the file. Sector N starts
 * N + 1 sector lengths into the file (presumably because the first slot is
 * taken by the header).
 *
 * @param int $sectorId
 * @return int Byte offset of the first byte of the sector
 */
private function sectorOffset( $sectorId ) {
	$slotIndex = $sectorId + 1;
	return $this->sectorLength * $slotIndex;
}
163 
/**
 * Convert a 16 byte binary CLSID to its upper-case textual form
 * XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX.
 *
 * The first three groups are read as little-endian integers (32, 16 and
 * 16 bits); the remaining eight bytes are printed in file order.
 *
 * @param string $binaryClsid
 * @return string
 */
private function decodeClsid( $binaryClsid ) {
	$fields = unpack( 'Va/vb/vc/C8d', $binaryClsid );
	$values = [ $fields['a'], $fields['b'], $fields['c'] ];
	// unpack() names the eight trailing bytes d1 .. d8
	for ( $n = 1; $n <= 8; $n++ ) {
		$values[] = $fields["d$n"];
	}
	return vsprintf( '%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X', $values );
}
180 
/**
 * Read a structure from the file and decode it.
 *
 * @param int $offset Byte offset in the file at which the structure starts
 * @param int[] $struct Field widths in bytes, keyed by field name; their
 *   sum is the number of bytes read
 * @return array Decoded fields, see unpack()
 */
private function unpackOffset( $offset, $struct ) {
	$byteCount = array_sum( $struct );
	$raw = $this->readOffset( $offset, $byteCount );
	return $this->unpack( $raw, 0, $struct );
}
190 
/**
 * Decode consecutive fields from a binary string.
 *
 * Fields of up to 4 bytes are decoded as little-endian integers with
 * bin2dec(); wider fields are returned as raw binary strings.
 *
 * @param string $block Binary data
 * @param int $offset Byte offset in $block of the first field
 * @param int[] $struct Field widths in bytes, keyed by field name
 * @return array Decoded values keyed by field name
 */
private function unpack( $block, $offset, $struct ) {
	$fields = [];
	$cursor = $offset;
	foreach ( $struct as $name => $width ) {
		$fields[$name] = $width <= 4
			? $this->bin2dec( $block, $cursor, $width )
			: substr( $block, $cursor, $width );
		$cursor += $width;
	}
	return $fields;
}
209 
/**
 * Decode a little-endian unsigned integer from a binary string.
 *
 * @param string $str Binary data
 * @param int $offset Byte offset of the least significant byte
 * @param int $length Number of bytes to decode
 * @return int|float The decoded number (PHP switches to float by itself if
 *   the value does not fit into a native integer)
 */
private function bin2dec( $str, $offset, $length ) {
	$number = 0;
	// Walk from the most significant (last) byte down to the first one
	$index = $length;
	while ( $index-- > 0 ) {
		$number = $number * 256 + ord( $str[$offset + $index] );
	}
	return $number;
}
218 
/**
 * Read an exact number of bytes from a given position in the file.
 *
 * @param int $offset Byte offset to read from
 * @param int $length Number of bytes to read, must be at least 1
 * @return string Exactly $length bytes
 * @throws RuntimeException (via error()) if the length is not positive, the
 *   seek fails, the read fails, or fewer than $length bytes are available
 */
private function readOffset( $offset, $length ) {
	// fread() throws a ValueError for lengths below 1 on PHP 8. That would
	// bypass the RuntimeException handling of this class, so reject it here.
	if ( $length < 1 ) {
		$this->error( "invalid read length $length", self::ERROR_INVALID_FORMAT );
	}
	$this->fseek( $offset );
	AtEase::suppressWarnings();
	try {
		$block = fread( $this->file, $length );
	} finally {
		// Always re-enable warnings, even if fread() throws
		AtEase::restoreWarnings();
	}
	if ( $block === false ) {
		$this->error( 'error reading from file', self::ERROR_READ );
	}
	if ( strlen( $block ) !== $length ) {
		$this->error( 'unable to read the required number of bytes from the file',
			self::ERROR_READ_PAST_END );
	}
	return $block;
}
233 
/**
 * Read one whole sector.
 *
 * @param int $sectorId
 * @return string The raw contents of the sector, $this->sectorLength bytes
 * @throws RuntimeException (via error()) if the sector cannot be read in full
 */
private function readSector( $sectorId ) {
	// Use the sector length computed once in init(), as sectorOffset() does,
	// rather than deriving it from the header again
	return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
}
237 
/**
 * Abort parsing by raising an exception. This method never returns.
 *
 * @param string $message Description of the problem
 * @param int $code One of the self::ERROR_* constants
 * @throws RuntimeException Always
 */
private function error( $message, $code ) {
	$failure = new RuntimeException( $message, $code );
	throw $failure;
}
246 
/**
 * Move the file position to an absolute offset, with PHP warnings
 * suppressed for the duration of the call.
 *
 * @param int $offset Byte offset from the start of the file
 * @throws RuntimeException (via error()) with code ERROR_SEEK if the
 *   underlying fseek() does not report success
 */
private function fseek( $offset ) {
	AtEase::suppressWarnings();
	$status = fseek( $this->file, $offset );
	AtEase::restoreWarnings();
	if ( $status === 0 ) {
		return;
	}
	$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
}
255 
/**
 * Build $this->difat, the list of sector IDs that hold the FAT.
 *
 * The list starts with the entries embedded in the header and continues
 * through a chain of DIFAT sectors. In each of those, the last 4 bytes hold
 * the ID of the next DIFAT sector and everything before is more entries.
 *
 * @throws RuntimeException (via error()) if a DIFAT sector cannot be read or
 *   the chain of DIFAT sectors loops back on itself
 */
private function readDifat() {
	$binaryDifat = $this->header['difat'];
	$nextDifatSector = $this->header['first_difat_sector'];
	$seenSectorIds = [];
	for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
		// Both the sector count and the chain pointers come from the file. A
		// chain pointing back at itself would otherwise be followed for up to
		// 2^32 iterations, growing $binaryDifat until memory runs out.
		if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
			$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$nextDifatSector] = true;

		$block = $this->readSector( $nextDifatSector );
		$binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
		$nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
		// 0xFFFFFFFE marks the end of the chain
		if ( $nextDifatSector == 0xFFFFFFFE ) {
			break;
		}
	}

	$this->difat = [];
	$difatLength = strlen( $binaryDifat );
	for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
		$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
		// The first value of 0xFFFFFFFC or above ends the list
		if ( $fatSector < 0xFFFFFFFC ) {
			$this->difat[] = $fatSector;
		} else {
			break;
		}
	}
}
278 
/**
 * Look up the FAT entry for a sector, which is the ID of the sector that
 * follows it in its chain.
 *
 * @param int $sectorId
 * @return int The FAT entry stored for $sectorId
 * @throws RuntimeException (via getFatSector()) if the FAT sector holding
 *   the entry is not available
 */
private function getNextSectorIdFromFat( $sectorId ) {
	// Every FAT entry is 4 bytes wide
	$perSector = intdiv( $this->sectorLength, 4 );
	$entries = $this->getFatSector( intdiv( $sectorId, $perSector ) );
	$indexInSector = $sectorId % $perSector;
	return $entries[$indexInSector];
}
285 
/**
 * Get the decoded entries of one FAT sector. The sector is read from the
 * file on first use and served from $this->fat afterwards.
 *
 * @param int $fatSectorId Index into the DIFAT, not an absolute sector ID
 * @return int[] The entries of that FAT sector
 * @throws RuntimeException (via error()) if the DIFAT has no such index, or
 *   the sector cannot be read
 */
private function getFatSector( $fatSectorId ) {
	if ( isset( $this->fat[$fatSectorId] ) ) {
		return $this->fat[$fatSectorId];
	}
	if ( !isset( $this->difat[$fatSectorId] ) ) {
		$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
	}

	$raw = $this->readSector( $this->difat[$fatSectorId] );
	$entries = [];
	// Each entry is a 4 byte little-endian integer
	for ( $at = 0; $at < strlen( $raw ); $at += 4 ) {
		$entries[] = $this->bin2dec( $raw, $at, 4 );
	}

	$this->fat[$fatSectorId] = $entries;
	return $this->fat[$fatSectorId];
}
301 
/**
 * Read the directory and derive the MIME type from it.
 *
 * The directory sectors are collected by following the FAT chain from the
 * header's first_dir_sector. Each 128 byte entry is then inspected:
 *  - the CLSID of the root entry is looked up in self::$mimesByClsid and the
 *    result stored in $this->mimeFromClsid
 *  - well-known entry names set $this->mime; the last match wins
 *
 * @throws RuntimeException (via error()) if the chain loops or a sector
 *   cannot be read
 */
private function readDirectory() {
	$dirSectorId = $this->header['first_dir_sector'];
	$binaryDir = '';
	$seenSectorIds = [];
	// 0xFFFFFFFE marks the end of the chain
	while ( $dirSectorId !== 0xFFFFFFFE ) {
		if ( isset( $seenSectorIds[$dirSectorId] ) ) {
			$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
		}
		$seenSectorIds[$dirSectorId] = true;

		$binaryDir .= $this->readSector( $dirSectorId );
		$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
	}

	// Field name => width in bytes, in on-disk order
	$struct = [
		'name_raw' => 64,
		'name_length' => 2,
		'object_type' => 1,
		'color' => 1,
		'sid_left' => 4,
		'sid_right' => 4,
		'sid_child' => 4,
		'clsid' => 16,
		'state_bits' => 4,
		'create_time_low' => 4,
		'create_time_high' => 4,
		'modify_time_low' => 4,
		'modify_time_high' => 4,
		'first_sector' => 4,
		'size_low' => 4,
		'size_high' => 4,
	];
	$entryLength = array_sum( $struct );
	// Longest possible name in bytes: the whole field minus the 2 byte terminator
	$maxNameBytes = $struct['name_raw'] - 2;

	$mimesByEntryName = [
		'Workbook' => 'application/vnd.ms-excel',
		'WordDocument' => 'application/msword',
		'PowerPoint Document' => 'application/vnd.ms-powerpoint',
	];

	$dirLength = strlen( $binaryDir );
	// Only whole entries are decoded; a truncated trailing fragment is ignored
	for ( $pos = 0; $pos + $entryLength <= $dirLength; $pos += $entryLength ) {
		$entry = $this->unpack( $binaryDir, $pos, $struct );

		// According to [MS-CFB] size_high may contain garbage due to a
		// bug in a writer, it's best to pretend it is zero
		$entry['size_high'] = 0;

		$type = $entry['object_type'];
		if ( $type == self::TYPE_UNALLOCATED ) {
			continue;
		}

		// name_length counts bytes including the 2 byte UTF-16 terminator. It
		// is read from the file, so it may be 0, odd, or larger than the name
		// field. Clamp it to a whole number of UTF-16 code units that fit.
		$nameBytes = max( 0, min( $entry['name_length'] - 2, $maxNameBytes ) );
		$nameBytes -= $nameBytes % 2;
		// iconv() raises a notice and returns false on malformed UTF-16
		AtEase::suppressWarnings();
		try {
			$name = iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );
		} finally {
			AtEase::restoreWarnings();
		}

		$clsid = $this->decodeClsid( $entry['clsid'] );
		if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) {
			$this->mimeFromClsid = self::$mimesByClsid[$clsid];
		}

		if ( is_string( $name ) && isset( $mimesByEntryName[$name] ) ) {
			$this->mime = $mimesByEntryName[$name];
		}
	}
}
364 }
Read the directory of a Microsoft Compound File Binary file (a.k.a. an OLE compound document) and detect its MIME type.
static readFile( $fileName)
Read a file by name.
unpack( $block, $offset, $struct)
readOffset( $offset, $length)
unpackOffset( $offset, $struct)
bin2dec( $str, $offset, $length)
static readHandle( $fileHandle)
Read from an open seekable handle.