MediaWiki  master
DjVuImage.php
Go to the documentation of this file.
1 <?php
28 
/**
 * Support for detecting/validating DjVu image files and getting
 * some basic file metadata (resolution etc).
 *
 * The file is read as a sequence of IFF-style chunks: a 4-byte chunk id,
 * a 32-bit big-endian length, the payload, and one padding byte after
 * odd-length payloads.
 */
class DjVuImage {

	/**
	 * Memory limit for the DjVu description software; passed as the
	 * 'memory' limit to wfShellExec() when running djvutxt.
	 */
	private const DJVUTXT_MEMORY_LIMIT = 300000000;

	/** @var string Path of the file being inspected */
	private $mFilename;

	/**
	 * @param string $filename Path of the DjVu file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check if the given file is indeed a valid DjVu image file.
	 *
	 * @return bool True when the header could be parsed by getInfo()
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}

	/**
	 * Return width and height.
	 *
	 * @return array [ 'width' => int, 'height' => int ], or an empty array
	 *  when the file could not be parsed
	 */
	public function getImageSize() {
		$data = $this->getInfo();
		if ( $data === false ) {
			return [];
		}

		return [
			'width' => $data['width'],
			'height' => $data['height'],
		];
	}

	// ---------

	/**
	 * For debugging; dump the IFF chunk structure.
	 *
	 * Echoes one line per chunk ("<id> <length>"), indented by nesting depth.
	 * Nothing is echoed when the file cannot be opened or is too short to
	 * contain the 12-byte container header.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			// fopen() has already raised a warning describing the failure.
			return;
		}

		try {
			$header = fread( $file, 12 );
			if ( $header === false || strlen( $header ) < 12 ) {
				// Too short to hold magic + chunk id + length; unpack() would fail.
				return;
			}
			$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo "$chunk $chunkLength\n";
			$this->dumpForm( $file, $chunkLength, 1 );
		} finally {
			fclose( $file );
		}
	}

	/**
	 * Echo the secondary id of a FORM chunk followed by all chunks it
	 * contains, recursing into nested FORM chunks.
	 *
	 * @param resource $file Handle positioned just after the FORM chunk header
	 * @param int $length Declared payload length of the FORM chunk
	 * @param int $indent Nesting depth; each level indents by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$prefix = str_repeat( ' ', $indent * 4 );
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		echo $prefix . "($secondary)\n";
		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
				// End of file or truncated chunk header
				break;
			}
			$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			$dataStart = ftell( $file );
			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			}
			// Continue at the declared end of this chunk, including the padding
			// byte that follows an odd-length payload. This also realigns the
			// parent after a nested FORM.
			fseek( $file, $dataStart + $chunkLength + ( $chunkLength & 1 ), SEEK_SET );
		}
	}

	/**
	 * Open the file, check the DjVu container header and read the INFO
	 * chunk of the (first) page.
	 *
	 * @return array|false Page information as returned by getPageInfo(),
	 *  or false when the file is unreadable or not a recognised DjVu file
	 */
	private function getInfo() {
		Wikimedia\suppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		Wikimedia\restoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( $header === false || strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header" );
		} else {
			$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );

			$subtype = $arr['subtype'];
			if ( $arr['magic'] !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file" );
			} elseif ( $subtype === 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file );
			} elseif ( $subtype === 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $arr['formLength'] );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$arr['subtype']}'" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read an 8-byte chunk header at the current file position.
	 *
	 * @param resource $file
	 * @return array [ string|false $chunkId, int $length ]; the id is false
	 *  (and the length 0) when fewer than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( $header === false || strlen( $header ) < 8 ) {
			return [ false, 0 ];
		}

		$arr = unpack( 'a4chunk/Nlength', $header );

		return [ $arr['chunk'], $arr['length'] ];
	}

	/**
	 * Advance the file position past a chunk payload and its padding byte.
	 *
	 * @param resource $file
	 * @param int $chunkLength Number of payload bytes still to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		if ( ( $chunkLength & 1 ) && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Scan a multi-page (DJVM) container for its first FORM:DJVU page and
	 * return that page's information.
	 *
	 * @param resource $file Handle positioned just after the DJVM subtype
	 * @param int $formLength Declared length of the enclosing FORM chunk
	 * @return array|false Result of getPageInfo(), or false if no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			[ $chunk, $length ] = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage" );

					return $this->getPageInfo( $file );
				}
				// The 4-byte subtype just read counts towards the FORM length
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages" );

		return false;
	}

	/**
	 * Read and decode the INFO chunk expected at the current file position.
	 *
	 * @param resource $file
	 * @return array|false Array with the keys 'width', 'height', 'version',
	 *  'resolution' and 'gamma', or false when the chunk is missing, shorter
	 *  than 9 bytes or cut off
	 */
	private function getPageInfo( $file ) {
		[ $chunk, $length ] = $this->readChunk( $file );
		if ( $chunk !== 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length" );

			return false;
		}
		$data = fread( $file, $length );
		if ( $data === false || strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off" );

			return false;
		}

		// Width and height are big-endian 16-bit values, the resolution is
		// little-endian; the remaining fields are single bytes.
		$arr = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		// Newer files have rotation info in byte 10, but we don't use it yet.

		return [
			'width' => $arr['width'],
			'height' => $arr['height'],
			'version' => "{$arr['major']}.{$arr['minor']}",
			'resolution' => $arr['resolution'],
			'gamma' => $arr['gamma'] / 10.0,
		];
	}

	/**
	 * Return an array describing the DjVu image.
	 *
	 * Runs the external tools configured in $wgDjvuDump (structure dump) and
	 * $wgDjvuTxt (text layer) and collects their parsed output.
	 *
	 * @return array|null|false False if the file is not a valid DjVu file,
	 *  null if neither tool is configured, otherwise an array that may
	 *  contain 'data' (result of convertDumpToJSON()) and 'text' (list of
	 *  per-page text strings)
	 */
	public function retrieveMetaData() {
		global $wgDjvuDump, $wgDjvuTxt;

		if ( !$this->isValid() ) {
			return false;
		}

		if ( isset( $wgDjvuDump ) ) {
			// djvudump is faster than djvutoxml (now abandoned) as of version 3.5
			// https://sourceforge.net/p/djvu/bugs/71/
			$cmd = Shell::escape( $wgDjvuDump ) . ' ' . Shell::escape( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$json = [ 'data' => $this->convertDumpToJSON( $dump ) ];
		} else {
			$json = null;
		}

		// Text layer
		if ( isset( $wgDjvuTxt ) ) {
			$cmd = Shell::escape( $wgDjvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
			$json['text'] = [];
			if ( $retval == 0 ) {
				// The patterns below match escape sequences as literal text
				// (a backslash followed by octal digits), not raw control bytes.
				// Drop the sequence \013 (octal 013 is a vertical tab)
				$txt = preg_replace( "/\\\\013/", "", $txt );
				// Replace runs of \035 and \037 (group and unit separator)
				// with a single line break
				$txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt );

				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?>    # Text to match is composed of atoms of either:
					  \\\\. # - any escaped character
					  |     # - any character different from " and \
					  [^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$matches = [];
				preg_match_all( $reg, $txt, $matches );
				$json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] );
			}
		}

		return $json;
	}

	/**
	 * Turn the quoted text of one page, as captured from djvutxt output,
	 * into clean UTF-8.
	 *
	 * @param string $match Captured page text, still backslash-escaped
	 * @return string Unescaped text without U+FFFD replacement characters
	 */
	private function pageTextCallback( string $match ) {
		// Get rid of invalid UTF-8
		$val = UtfNormal\Validator::cleanUp( stripcslashes( $match ) );

		return str_replace( '�', '', $val );
	}

	/**
	 * Parse the textual output of djvudump into a PHP array.
	 *
	 * Despite the name, no JSON string is produced here.
	 *
	 * @param string $dump Output of the djvudump command
	 * @return array|false [ 'pages' => list of arrays from parseFormDjvu() ],
	 *  or false when the dump is empty, describes an indirect multi-page
	 *  document, or contains no parsable page
	 */
	private function convertDumpToJSON( $dump ) {
		if ( strval( $dump ) === '' ) {
			return false;
		}

		$dump = str_replace( "\r", '', $dump );
		// strtok() keeps its position between calls; parseFormDjvu() continues
		// reading lines from this same tokenisation.
		$line = strtok( $dump, "\n" );
		if ( $line === false ) {
			// Nothing but line breaks
			return false;
		}

		if ( preg_match( '/^( *)FORM:DJVU/', $line ) ) {
			// Single-page
			$parsed = $this->parseFormDjvu( $line );

			return $parsed ? [ 'pages' => [ $parsed ] ] : false;
		}

		$m = [];
		if ( !preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			return false;
		}

		// Multi-page
		$parentLevel = strlen( $m[1] );
		$pages = [];
		// Find DIRM
		$line = strtok( "\n" );
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				// End of chunk
				break;
			}

			if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
				wfDebug( "Indirect multi-page DjVu document, bad for server!" );

				return false;
			}

			if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
				// Found page
				$parsed = $this->parseFormDjvu( $line );
				if ( !$parsed ) {
					return false;
				}
				$pages[] = $parsed;
			}
			$line = strtok( "\n" );
		}

		return $pages ? [ 'pages' => $pages ] : false;
	}

	/**
	 * Read the lines nested below a FORM:DJVU line of a djvudump listing
	 * until its INFO line is found.
	 *
	 * Relies on a strtok() tokenisation already started by the caller.
	 *
	 * @param string $line The FORM:DJVU line; its indentation delimits the chunk
	 * @return array|false Array with the keys 'height', 'width', 'dpi' and
	 *  'gamma', or false when the chunk ends without a matching INFO line
	 */
	private function parseFormDjvu( $line ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );
		// Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				// End of chunk
				break;
			}

			$m = [];
			if ( preg_match(
				'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				return [
					'height' => (int)$m[2],
					'width' => (int)$m[1],
					'dpi' => (float)$m[3],
					'gamma' => (float)$m[4],
				];
			}
			$line = strtok( "\n" );
		}

		// Not found
		return false;
	}
}
$wgDjvuTxt
$wgDjvuTxt
Path of the djvutxt DJVU text extraction utility. Enable this and $wgDjvuDump to enable text layer ext...
Definition: DefaultSettings.php:1809
MediaWiki\Shell\Shell
Executes shell commands.
Definition: Shell.php:45
DjVuImage\skipChunk
skipChunk( $file, $chunkLength)
Definition: DjVuImage.php:171
DjVuImage\convertDumpToJSON
convertDumpToJSON( $dump)
Definition: DjVuImage.php:312
DjVuImage
Support for detecting/validating DjVu image files and getting some basic file metadata (resolution et...
Definition: DjVuImage.php:38
$file
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
DjVuImage\$mFilename
string $mFilename
Definition: DjVuImage.php:46
DjVuImage\getPageInfo
getPageInfo( $file)
Definition: DjVuImage.php:209
DjVuImage\DJVUTXT_MEMORY_LIMIT
const DJVUTXT_MEMORY_LIMIT
Memory limit for the DjVu description software.
Definition: DjVuImage.php:43
$matches
$matches
Definition: NoLocalSettings.php:24
DjVuImage\dumpForm
dumpForm( $file, $length, $indent)
Definition: DjVuImage.php:98
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:894
DjVuImage\readChunk
readChunk( $file)
Definition: DjVuImage.php:160
$header
$header
Definition: updateCredits.php:37
$line
$line
Definition: mcc.php:119
DjVuImage\isValid
isValid()
Check if the given file is indeed a valid DjVu image file.
Definition: DjVuImage.php:59
DjVuImage\parseFormDjvu
parseFormDjvu( $line)
Definition: DjVuImage.php:370
DjVuImage\getInfo
getInfo()
Definition: DjVuImage.php:124
DjVuImage\retrieveMetaData
retrieveMetaData()
Return an array describing the DjVu image.
Definition: DjVuImage.php:251
DjVuImage\dump
dump()
For debugging; dump the IFF chunk structure.
Definition: DjVuImage.php:87
DjVuImage\pageTextCallback
pageTextCallback(string $match)
Definition: DjVuImage.php:301
$wgDjvuDump
$wgDjvuDump
Path of the djvudump executable. Enable this and $wgDjvuRenderer to enable djvu rendering. Example: $wg...
Definition: DefaultSettings.php:1795
DjVuImage\getMultiPageInfo
getMultiPageInfo( $file, $formLength)
Definition: DjVuImage.php:180
DjVuImage\__construct
__construct( $filename)
Definition: DjVuImage.php:51
DjVuImage\getImageSize
getImageSize()
Return width and height.
Definition: DjVuImage.php:69
wfShellExec
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
Definition: GlobalFunctions.php:1910