MediaWiki  master
DjVuImage.php
Go to the documentation of this file.
1 <?php
30 use Wikimedia\AtEase\AtEase;
31 
/**
 * Support for detecting/validating DjVu image files and getting some
 * basic file metadata (dimensions, format version, resolution, gamma).
 *
 * Basic information is read straight from the file's chunk headers;
 * richer metadata is obtained by shelling out to the configured
 * djvudump / djvutxt binaries (see retrieveMetaData()).
 */
class DjVuImage {

	/**
	 * Value passed as the 'memory' limit to wfShellExec() when running djvutxt.
	 */
	private const DJVUTXT_MEMORY_LIMIT = 300000000;

	/** @var string Path of the file being inspected */
	private $mFilename;

	/**
	 * @param string $filename Path of the file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check if the given file is indeed a valid DjVu image file.
	 *
	 * @return bool True when the header could be parsed by getInfo()
	 */
	public function isValid() {
		$info = $this->getInfo();

		return $info !== false;
	}

	/**
	 * Return width and height.
	 *
	 * @return array [ 'width' => int, 'height' => int ], or an empty array
	 *  when the file could not be parsed
	 */
	public function getImageSize() {
		$data = $this->getInfo();

		if ( $data !== false ) {
			return [
				'width' => $data['width'],
				'height' => $data['height']
			];
		}
		return [];
	}

	// ---------

	/**
	 * For debugging; dump the IFF chunk structure to standard output.
	 *
	 * Prints a one-line diagnostic instead of failing when the file
	 * cannot be opened or is too short to hold the 12-byte header.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			echo "{$this->mFilename}: missing or failed file read\n";

			return;
		}

		try {
			$header = fread( $file, 12 );
			if ( $header === false || strlen( $header ) < 12 ) {
				// unpack() would fail on a partial header
				echo "{$this->mFilename}: too short file header\n";

				return;
			}
			$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo "$chunk $chunkLength\n";
			$this->dumpForm( $file, $chunkLength, 1 );
		} finally {
			// Release the handle on every exit path
			fclose( $file );
		}
	}

	/**
	 * Print the chunks contained in one FORM, recursing into nested FORMs.
	 *
	 * @param resource $file Open handle positioned just after the FORM's length field
	 * @param int $length Length of the FORM as declared in its header
	 * @param int $indent Nesting depth; each level indents the output by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		echo str_repeat( ' ', $indent * 4 ) . "($secondary)\n";
		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
				// End of file, or a truncated header that unpack() cannot parse
				break;
			}
			$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo str_repeat( ' ', $indent * 4 ) . "$chunk $chunkLength\n";

			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				if ( $chunkLength & 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}

	/**
	 * Open the file and extract the basic page information from its header.
	 *
	 * @return array|false Result of getPageInfo() / getMultiPageInfo(), or false
	 *  when the file cannot be opened, is too short, or is not a recognised
	 *  DjVu container
	 */
	private function getInfo() {
		AtEase::suppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		AtEase::restoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header" );
		} else {
			$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );

			$subtype = $arr['subtype'];
			if ( $arr['magic'] !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file" );
			} elseif ( $subtype === 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file );
			} elseif ( $subtype === 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $arr['formLength'] );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$arr['subtype']}'" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read one 8-byte chunk header (4-byte ID plus 32-bit big-endian length).
	 *
	 * @param resource $file
	 * @return array [ string|false $chunkId, int $length ]; [ false, 0 ] when
	 *  fewer than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return [ false, 0 ];
		}
		$arr = unpack( 'a4chunk/Nlength', $header );

		return [ $arr['chunk'], $arr['length'] ];
	}

	/**
	 * Advance the file position past a chunk body and its padding byte, if any.
	 *
	 * @param resource $file
	 * @param int $chunkLength Number of body bytes still to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		if ( ( $chunkLength & 1 ) && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Locate the first FORM:DJVU page of a multi-page document and return its info.
	 *
	 * @param resource $file Handle positioned just after the 16-byte file header
	 * @param int $formLength Length of the outer FORM as declared in the header
	 * @return array|false Result of getPageInfo() for the first page, or false
	 *  when no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			[ $chunk, $length ] = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage" );

					return $this->getPageInfo( $file );
				}
				// The 4-byte subtype has already been consumed
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages" );

		return false;
	}

	/**
	 * Parse the INFO chunk expected at the current file position.
	 *
	 * @param resource $file
	 * @return array|false Array with 'width', 'height', 'version', 'resolution'
	 *  and 'gamma' keys, or false when the INFO chunk is missing, too short
	 *  or cut off
	 */
	private function getPageInfo( $file ) {
		[ $chunk, $length ] = $this->readChunk( $file );
		if ( $chunk !== 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length" );

			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off" );

			return false;
		}

		// Width and height are big-endian 16-bit values; resolution is little-endian
		$arr = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return [
			'width' => $arr['width'],
			'height' => $arr['height'],
			'version' => "{$arr['major']}.{$arr['minor']}",
			'resolution' => $arr['resolution'],
			'gamma' => $arr['gamma'] / 10.0 ];
	}

	/**
	 * Return an array describing the DjVu image.
	 *
	 * Runs the configured djvudump binary to fill the 'data' key and the
	 * configured djvutxt binary to fill the 'text' key (one string per page).
	 *
	 * @return array|null|false False when the file is not a valid DjVu file,
	 *  null when neither binary is configured, otherwise an array with
	 *  'data' and/or 'text' keys
	 */
	public function retrieveMetaData() {
		$config = MediaWikiServices::getInstance()->getMainConfig();
		$djvuDump = $config->get( MainConfigNames::DjvuDump );
		$djvuTxt = $config->get( MainConfigNames::DjvuTxt );
		if ( !$this->isValid() ) {
			return false;
		}

		if ( $djvuDump !== null ) {
			# djvudump is faster than djvutoxml (now abandoned) as of version 3.5
			# https://sourceforge.net/p/djvu/bugs/71/
			$cmd = Shell::escape( $djvuDump ) . ' ' . Shell::escape( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$json = [ 'data' => $this->convertDumpToJSON( $dump ) ];
		} else {
			$json = null;
		}
		# Text layer
		if ( $djvuTxt !== null ) {
			$cmd = Shell::escape( $djvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd" );
			$retval = 0;
			$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
			$json['text'] = [];
			if ( $retval === 0 ) {
				# Strip some control characters
				# Ignore carriage returns
				$txt = preg_replace( "/\\\\013/", "", $txt );
				# Replace runs of OCR region separators with a single extra line break
				$txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt );

				# Extended-mode (x) pattern: whitespace and #-comments inside it are ignored.
				# The literal parentheses must be written as plain \( and \) with no
				# invisible characters after the backslash, or the pattern will not compile.
				$reg = <<<EOR
/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
((?> # Text to match is composed of atoms of either:
\\\\. # - any escaped character
| # - any character different from " and \
[^"\\\\]+
)*?)
"\s*\)
| # Or page can be empty ; in this case, djvutxt dumps ()
\(\s*()\)/sx
EOR;
				$matches = [];
				preg_match_all( $reg, $txt, $matches );
				$json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] );
			}
		}

		return $json;
	}

	/**
	 * Unescape one page of djvutxt output and clean up its encoding.
	 *
	 * @param string $match Escaped page text captured by the regex in retrieveMetaData()
	 * @return string
	 */
	private function pageTextCallback( string $match ) {
		# Get rid of invalid UTF-8
		$val = UtfNormal\Validator::cleanUp( stripcslashes( $match ) );
		# Remove U+FFFD replacement characters; written as an escape so the
		# source stays correct regardless of the file's encoding
		return str_replace( "\u{FFFD}", '', $val );
	}

	/**
	 * Convert the output of djvudump to a PHP array describing the pages.
	 *
	 * @param string $dump Raw djvudump output
	 * @return array|false [ 'pages' => list of page info arrays ], or false
	 *  when the dump is empty, describes an indirect document, or contains
	 *  no parsable page
	 */
	private function convertDumpToJSON( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$dump = str_replace( "\r", '', $dump );
		// Starts a strtok() scan that parseFormDjvu() continues from
		$line = strtok( $dump, "\n" );
		$good = false;
		$result = [];
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			$parsed = $this->parseFormDjvu( $line );
			if ( $parsed ) {
				$good = true;
			} else {
				return false;
			}
			$result['pages'] = [ $parsed ];
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			$result['pages'] = [];
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!" );

					return false;
				}

				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					$parsed = $this->parseFormDjvu( $line );
					if ( $parsed ) {
						$good = true;
					} else {
						return false;
					}
					$result['pages'][] = $parsed;
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		return $result;
	}

	/**
	 * Parse the INFO line of one FORM:DJVU section of djvudump output.
	 *
	 * Continues the strtok() scan started in convertDumpToJSON(), so it
	 * consumes lines from the same dump until the section ends.
	 *
	 * @param string $line The "FORM:DJVU" line that opens the section
	 * @return array|false Array with 'height', 'width', 'dpi' and 'gamma'
	 *  keys, or false when no INFO line was found in the section
	 */
	private function parseFormDjvu( $line ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );
		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match(
				'/^ *INFO *\[\d*] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				return [
					'height' => (int)$m[2],
					'width' => (int)$m[1],
					'dpi' => (float)$m[3],
					'gamma' => (float)$m[4],
				];
			}
			$line = strtok( "\n" );
		}

		# Not found
		return false;
	}
}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
$matches
Support for detecting/validating DjVu image files and getting some basic file metadata (resolution et...
Definition: DjVuImage.php:41
retrieveMetaData()
Return an array describing the DjVu image.
Definition: DjVuImage.php:252
dump()
For debugging; dump the IFF chunk structure.
Definition: DjVuImage.php:89
isValid()
Check if the given file is indeed a valid DjVu image file.
Definition: DjVuImage.php:62
getImageSize()
Return width and height.
Definition: DjVuImage.php:72
__construct( $filename)
Definition: DjVuImage.php:54
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Executes shell commands.
Definition: Shell.php:46
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
$header