MediaWiki  master
DjVuImage.php
Go to the documentation of this file.
1 <?php
30 use Wikimedia\AtEase\AtEase;
31 
/**
 * Support for detecting/validating DjVu image files and getting some
 * basic file metadata (dimensions, version, resolution, gamma, text layer).
 *
 * The binary parsing walks the IFF-style container directly: every chunk
 * starts with a 4-byte identifier followed by a 32-bit big-endian length.
 */
class DjVuImage {

	/**
	 * Value passed as the 'memory' limit to wfShellExec() when running djvutxt.
	 */
	private const DJVUTXT_MEMORY_LIMIT = 300000000;

	/** @var string Path of the file, as handed to the constructor */
	private $mFilename;

	/**
	 * @param string $filename Path of the file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check if the given file is indeed a valid DjVu image file.
	 *
	 * @return bool True when the header information could be parsed
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}

	/**
	 * Return width and height.
	 *
	 * @return array [ 'width' => ..., 'height' => ... ], or an empty array
	 *  when the file could not be parsed
	 */
	public function getImageSize() {
		$info = $this->getInfo();
		if ( $info === false ) {
			return [];
		}

		return [
			'width' => $info['width'],
			'height' => $info['height'],
		];
	}

	// ---------

	/**
	 * For debugging; dump the IFF chunk structure to standard output.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			echo "Unable to open {$this->mFilename}\n";

			return;
		}

		try {
			$header = fread( $file, 12 );
			if ( $header === false || strlen( $header ) < 12 ) {
				// unpack() would fail on a short buffer
				echo "File too short to contain an IFF header\n";

				return;
			}
			$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			echo "{$arr['chunk']} {$arr['chunkLength']}\n";
			$this->dumpForm( $file, $arr['chunkLength'], 1 );
		} finally {
			// Release the handle even if a nested read throws
			fclose( $file );
		}
	}

	/**
	 * Print the chunks of one FORM, recursing into nested FORMs.
	 *
	 * @param resource $file Handle positioned on the FORM's secondary identifier
	 * @param int $length Declared length of the FORM
	 * @param int $indent Nesting depth; each level is printed with 4 more spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			[ $chunk, $chunkLength ] = $this->readChunk( $file );
			if ( $chunk === false ) {
				// End of file, or a truncated chunk header that cannot be decoded
				break;
			}
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				$this->skipChunk( $file, $chunkLength );
			}
		}
	}

	/**
	 * Open the file and parse its header.
	 *
	 * @return array|false Page information as returned by getPageInfo(),
	 *  or false when the file is missing, unreadable or not a DjVu file
	 */
	private function getInfo() {
		AtEase::suppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		AtEase::restoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read" );

			return false;
		}

		try {
			$header = fread( $file, 16 );
			if ( $header === false || strlen( $header ) < 16 ) {
				wfDebug( __METHOD__ . ": too short file header" );

				return false;
			}

			$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			if ( $arr['magic'] !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file" );

				return false;
			}

			$subtype = $arr['subtype'];
			if ( $subtype === 'DJVU' ) {
				// Single-page document
				return $this->getPageInfo( $file );
			}
			if ( $subtype === 'DJVM' ) {
				// Multi-page document
				return $this->getMultiPageInfo( $file, $arr['formLength'] );
			}

			wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$arr['subtype']}'" );

			return false;
		} finally {
			fclose( $file );
		}
	}

	/**
	 * Read one chunk header (4-byte identifier + 32-bit big-endian length).
	 *
	 * @param resource $file
	 * @return array [ string|false $chunk, int $length ]; [ false, 0 ] when
	 *  fewer than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( $header === false || strlen( $header ) < 8 ) {
			return [ false, 0 ];
		}
		$fields = unpack( 'a4chunk/Nlength', $header );

		return [ $fields['chunk'], $fields['length'] ];
	}

	/**
	 * Move the file position past a chunk body and its padding byte, if any.
	 *
	 * @param resource $file
	 * @param int $chunkLength Length of the chunk body
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		$isOdd = ( $chunkLength & 1 ) !== 0;
		if ( $isOdd && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Locate the first page of a multi-page document and return its info.
	 *
	 * @param resource $file Handle positioned right after the DJVM identifier
	 * @param int $formLength Declared length of the enclosing FORM
	 * @return array|false
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			[ $chunk, $length ] = $this->readChunk( $file );
			if ( $chunk === false ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage" );

					return $this->getPageInfo( $file );
				}
				// The 4-byte subtype just read is part of the FORM's length
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages" );

		return false;
	}

	/**
	 * Decode the INFO chunk found at the current file position.
	 *
	 * @param resource $file
	 * @return array|false Keys: width, height, version, resolution, gamma
	 */
	private function getPageInfo( $file ) {
		[ $chunk, $length ] = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length" );

			return false;
		}

		// The declared length comes straight from the file, so never hand it to
		// fread() unbounded: only the first 9 bytes are decoded below, and a
		// crafted header could otherwise request a multi-gigabyte read.
		$wanted = min( $length, 10 );
		$data = fread( $file, $wanted );
		if ( $data === false || strlen( $data ) < $wanted ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off" );

			return false;
		}

		$arr = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return [
			'width' => $arr['width'],
			'height' => $arr['height'],
			'version' => "{$arr['major']}.{$arr['minor']}",
			'resolution' => $arr['resolution'],
			'gamma' => $arr['gamma'] / 10.0 ];
	}

	/**
	 * Return an array describing the DjVu image.
	 *
	 * @return array|null|false False when the file is not valid; null when
	 *  neither the dump tool nor the text tool is configured; otherwise an
	 *  array with a 'data' key (structure from the dump tool) and/or a
	 *  'text' key (list of page texts)
	 */
	public function retrieveMetaData() {
		$config = MediaWikiServices::getInstance()->getMainConfig();
		$djvuDump = $config->get( MainConfigNames::DjvuDump );
		$djvuTxt = $config->get( MainConfigNames::DjvuTxt );
		if ( !$this->isValid() ) {
			return false;
		}

		$json = null;
		if ( isset( $djvuDump ) ) {
			# djvudump is faster than djvutoxml (now abandoned) as of version 3.5
			# https://sourceforge.net/p/djvu/bugs/71/
			$cmd = Shell::escape( $djvuDump ) . ' ' . Shell::escape( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$json = [ 'data' => $this->convertDumpToJSON( $dump ) ];
		}

		# Text layer
		if ( isset( $djvuTxt ) ) {
			$cmd = Shell::escape( $djvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
			$json['text'] = [];
			if ( $retval == 0 ) {
				# Strip some control characters
				# Ignore carriage returns
				$txt = preg_replace( "/\\\\013/", "", $txt );
				# Replace runs of OCR region separators with a single extra line break
				$txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt );

				// Extended (x) pattern: whitespace and #-comments inside it are ignored.
				// The closing marker stays in column 0 so the heredoc parses on every
				// PHP version.
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?> # Text to match is composed of atoms of either:
						\\\\. # - any escaped character
						| # - any character different from " and \
						[^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$matches = [];
				preg_match_all( $reg, $txt, $matches );
				// $matches[1] is absent if the match itself failed
				$json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] ?? [] );
			}
		}

		return $json;
	}

	/**
	 * Turn one captured page text into clean text.
	 *
	 * @param string $match Escaped page text as captured from the djvutxt output
	 * @return string
	 */
	private function pageTextCallback( string $match ) {
		# Get rid of invalid UTF-8
		$clean = UtfNormal\Validator::cleanUp( stripcslashes( $match ) );

		return str_replace( '�', '', $clean );
	}

	/**
	 * Convert the textual output of the dump tool into an array structure.
	 *
	 * @param string $dump
	 * @return array|false [ 'pages' => [ page info, ... ] ], or false when the
	 *  dump is empty, unparseable, indirect, or contains no page
	 */
	private function convertDumpToJSON( $dump ) {
		if ( strval( $dump ) === '' ) {
			return false;
		}

		$dump = str_replace( "\r", '', $dump );
		// strtok() keeps its cursor between calls; parseFormDjvu() continues
		// reading from the same cursor, so the call order here matters.
		$line = strtok( $dump, "\n" );
		$m = [];

		if ( preg_match( '/^( *)FORM:DJVU/', (string)$line, $m ) ) {
			# Single-page
			$parsed = $this->parseFormDjvu( $line );

			return $parsed ? [ 'pages' => [ $parsed ] ] : false;
		}

		if ( !preg_match( '/^( *)FORM:DJVM/', (string)$line, $m ) ) {
			return false;
		}

		# Multi-page
		$parentLevel = strlen( $m[1] );
		$pages = [];
		# Find DIRM
		for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
			if ( strspn( $line, ' ' ) <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
				wfDebug( "Indirect multi-page DjVu document, bad for server!" );

				return false;
			}

			if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
				# Found page
				$parsed = $this->parseFormDjvu( $line );
				if ( !$parsed ) {
					return false;
				}
				$pages[] = $parsed;
			}
		}

		return $pages ? [ 'pages' => $pages ] : false;
	}

	/**
	 * Read the INFO line of one FORM:DJVU entry from the current strtok() cursor.
	 *
	 * @param string $line The FORM:DJVU line itself; only its indentation is used
	 * @return array|false Keys: height, width, dpi, gamma; false if no INFO
	 *  line is found before the entry ends
	 */
	private function parseFormDjvu( $line ) {
		$parentLevel = strspn( $line, ' ' );
		$infoPattern = '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/';

		# Find INFO
		while ( ( $line = strtok( "\n" ) ) !== false ) {
			if ( strspn( $line, ' ' ) <= $parentLevel ) {
				# End of chunk
				break;
			}

			$m = [];
			if ( preg_match( $infoPattern, $line, $m ) ) {
				return [
					'height' => (int)$m[2],
					'width' => (int)$m[1],
					'dpi' => (float)$m[3],
					'gamma' => (float)$m[4],
				];
			}
		}

		# Not found
		return false;
	}
}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
$matches
Support for detecting/validating DjVu image files and getting some basic file metadata (resolution et...
Definition: DjVuImage.php:41
retrieveMetaData()
Return an array describing the DjVu image.
Definition: DjVuImage.php:254
dump()
For debugging; dump the IFF chunk structure.
Definition: DjVuImage.php:90
isValid()
Check if the given file is indeed a valid DjVu image file.
Definition: DjVuImage.php:62
getImageSize()
Return width and height.
Definition: DjVuImage.php:72
__construct( $filename)
Definition: DjVuImage.php:54
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Executes shell commands.
Definition: Shell.php:46
$line
Definition: mcc.php:119
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
$header