MediaWiki  1.23.15
DjVuImage.php
Go to the documentation of this file.
1 <?php
/**
 * Reader for basic DjVu file information.
 *
 * Parses the chunk structure of a DjVu file directly to obtain the image
 * dimensions, and shells out to the external djvudump / djvutoxml / djvutxt
 * tools (when configured) to build an XML metadata document.
 */
class DjVuImage {
	/**
	 * Value passed as the 'memory' limit to wfShellExec() when running djvutxt.
	 */
	const DJVUTXT_MEMORY_LIMIT = 300000;

	/**
	 * @var string Path of the DjVu file, as given to the constructor.
	 *
	 * Kept public: the property used to be created dynamically by the
	 * constructor, which made it publicly accessible.
	 */
	public $mFilename;

	/**
	 * @param string $filename Path of the DjVu file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check whether the file could be parsed as a DjVu document.
	 *
	 * @return bool True when getInfo() produced an information array
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}

	/**
	 * Return the image dimensions in a getimagesize()-like layout.
	 *
	 * @return array|bool array( width, height, 'DjVu', 'width="W" height="H"' ),
	 *   or false when the file information could not be read
	 */
	public function getImageSize() {
		$data = $this->getInfo();
		if ( $data === false ) {
			return false;
		}

		$width = $data['width'];
		$height = $data['height'];

		return array( $width, $height, 'DjVu',
			"width=\"$width\" height=\"$height\"" );
	}

	// ---------

	/**
	 * Echo the chunk structure of the file (chunk names and lengths),
	 * indenting nested FORM chunks. Debugging aid.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );

			return;
		}

		$header = fread( $file, 12 );
		if ( strlen( $header ) < 12 ) {
			// Not enough data for magic + chunk id + length; nothing to dump.
			wfDebug( __METHOD__ . ": too short file header\n" );
			fclose( $file );

			return;
		}

		$fields = unpack( 'a4magic/a4chunk/NchunkLength', $header );
		echo "{$fields['chunk']} {$fields['chunkLength']}\n";
		$this->dumpForm( $file, $fields['chunkLength'], 1 );
		fclose( $file );
	}

	/**
	 * Echo the contents of one FORM chunk, recursing into nested FORMs.
	 *
	 * @param resource $file Open file handle, positioned at the FORM's secondary id
	 * @param int $length Length of the FORM chunk as stated in its header
	 * @param int $indent Nesting depth; each level is printed with 4 spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( strlen( $chunkHeader ) < 8 ) {
				// End of file, or a truncated chunk header that cannot be unpacked
				break;
			}

			$fields = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $fields['chunk'];
			$chunkLength = $fields['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				// Skips the chunk data and the padding byte after odd-length chunks
				$this->skipChunk( $file, $chunkLength );
			}
		}
	}

	/**
	 * Read the file header and return the information of the (first) page.
	 *
	 * @return array|bool Array with the keys 'width', 'height', 'version',
	 *   'resolution' and 'gamma', or false if the file cannot be opened or
	 *   is not a recognised DjVu file
	 */
	public function getInfo() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$fields = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$subtype = $fields['subtype'];
			$formLength = $fields['formLength'];

			if ( $fields['magic'] !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype === 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file, $formLength );
			} elseif ( $subtype === 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $formLength );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'\n" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read an 8-byte chunk header (4-byte id, 32-bit big-endian length).
	 *
	 * @param resource $file Open file handle
	 * @return array array( chunk id, length ), or array( false, 0 ) when fewer
	 *   than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return array( false, 0 );
		}

		$fields = unpack( 'a4chunk/Nlength', $header );

		return array( $fields['chunk'], $fields['length'] );
	}

	/**
	 * Move the file position past a chunk's data, including the padding byte
	 * that follows a chunk of odd length.
	 *
	 * @param resource $file Open file handle, positioned at the chunk data
	 * @param int $chunkLength Number of data bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised on purpose: '==' binds tighter than '&'.
		if ( ( $chunkLength & 1 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Locate the first FORM:DJVU page inside a multi-page document and
	 * return its information.
	 *
	 * @param resource $file Open file handle, positioned after the DJVM id
	 * @param int $formLength Length of the enclosing FORM chunk
	 * @return array|bool Page information as returned by getPageInfo(),
	 *   or false when no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );

					return $this->getPageInfo( $file, $length );
				}
				// The 4-byte secondary id has already been consumed
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

		return false;
	}

	/**
	 * Read the INFO chunk of a page and decode it.
	 *
	 * @param resource $file Open file handle, positioned at the page's first chunk
	 * @param int $formLength Length of the page's FORM chunk (currently unused)
	 * @return array|bool Array with the keys 'width', 'height', 'version',
	 *   'resolution' and 'gamma', or false when the INFO chunk is missing,
	 *   too short or cut off
	 */
	private function getPageInfo( $file, $formLength ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk !== 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

			return false;
		}

		$fields = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return array(
			'width' => $fields['width'],
			'height' => $fields['height'],
			'version' => "{$fields['major']}.{$fields['minor']}",
			'resolution' => $fields['resolution'],
			'gamma' => $fields['gamma'] / 10.0 );
	}

	/**
	 * Build the XML metadata for the file using the configured external tools.
	 *
	 * Uses $wgDjvuDump if set, otherwise $wgDjvuToXML, for the structural XML;
	 * if $wgDjvuTxt is set and djvutxt exits with status 0, the text layer is
	 * appended as a <DjVuTxt> document.
	 *
	 * @return string|bool|null XML string; false when the djvudump output could
	 *   not be converted; null when no tool is configured
	 */
	public function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
		wfProfileIn( __METHOD__ );

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
			wfProfileIn( 'djvudump' );
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
			wfProfileOut( 'djvudump' );
		} elseif ( isset( $wgDjvuToXML ) ) {
			wfProfileIn( 'djvutoxml' );
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
			wfProfileOut( 'djvutoxml' );
		} else {
			$xml = null;
		}
		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			wfProfileIn( 'djvutxt' );
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, array(), array( 'memory' => self::DJVUTXT_MEMORY_LIMIT ) );
			wfProfileOut( 'djvutxt' );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?> # Text to match is composed of atoms of either:
					\\\\. # - any escaped character
					| # - any character different from " and \
					[^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, array( $this, 'pageTextCallback' ), $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				// NOTE(review): when $xml is null/false (no structural XML), no
				// opening <mw-djvu> tag is inserted here, yet the closing tag is
				// still appended below. Behaviour kept as-is; confirm with callers.
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $xml . $txt . '</mw-djvu>';
			}
		}
		wfProfileOut( __METHOD__ );

		return $xml;
	}

	/**
	 * preg_replace_callback() handler turning one djvutxt page into a
	 * <PAGE value="..." /> element.
	 *
	 * @param array $matches Regex match; index 1 holds the escaped page text
	 * @return string
	 */
	public function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		$val = htmlspecialchars( UtfNormal::cleanUp( stripcslashes( $matches[1] ) ) );
		// "\xEF\xBF\xBD" is U+FFFD (replacement character) in UTF-8; written as
		// an escape so the bytes survive re-encoding of this source file.
		$val = str_replace( array( "\n", "\xEF\xBF\xBD" ), array( '&#10;', '' ), $val );

		return '<PAGE value="' . $val . '" />';
	}

	/**
	 * Convert the textual output of djvudump into a DjVuXML document
	 * containing one OBJECT element per page.
	 *
	 * @param string $dump Output of djvudump
	 * @return string|bool XML string, or false when the dump is empty, describes
	 *   an indirect multi-page document, or contains no usable page
	 */
	public function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		// strtok() keeps its position between calls; parseFormDjvu() continues
		// reading lines from the same tokenizer state.
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( !$this->parseFormDjvu( $line, $xml ) ) {
				return false;
			}
			$good = true;
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( !$this->parseFormDjvu( $line, $xml ) ) {
						return false;
					}
					$good = true;
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";

		return $xml;
	}

	/**
	 * Consume the lines belonging to one FORM:DJVU entry of a djvudump
	 * listing and append an OBJECT element for its INFO line to $xml.
	 *
	 * Reads further lines from the strtok() state set up by convertDumpToXML().
	 *
	 * @param string $line The FORM:DJVU line; its indentation marks the parent level
	 * @param string &$xml XML buffer that the OBJECT element is appended to
	 * @return bool True if an INFO line was found before the chunk ended
	 */
	public function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match(
				'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				$xml .= Xml::tags(
					'OBJECT',
					array(
						#'data' => '',
						#'type' => 'image/x.djvu',
						'height' => $m[2],
						'width' => $m[1],
						#'usemap' => '',
					),
					"\n" .
					Xml::element( 'PARAM', array( 'name' => 'DPI', 'value' => $m[3] ) ) . "\n" .
					Xml::element( 'PARAM', array( 'name' => 'GAMMA', 'value' => $m[4] ) ) . "\n"
				) . "\n";

				return true;
			}
			$line = strtok( "\n" );
		}

		# Not found
		return false;
	}
}
wfShellExec
wfShellExec( $cmd, &$retval=null, $environ=array(), $limits=array(), $options=array())
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
Definition: GlobalFunctions.php:2851
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
Xml\tags
static tags( $element, $attribs=null, $contents)
Same as Xml::element(), but does not escape contents.
Definition: Xml.php:131
UtfNormal\cleanUp
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
Definition: UtfNormal.php:79
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:2434
wfRestoreWarnings
wfRestoreWarnings()
Restore error level to previous value.
Definition: GlobalFunctions.php:2464
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
Xml\element
static element( $element, $attribs=null, $contents='', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:39
version
Prior to version
Definition: maintenance.txt:1
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
list
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
$line
$line
Definition: cdb.php:57
wfDebug
wfDebug( $text, $dest='all')
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:980
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
wfEscapeShellArg
wfEscapeShellArg()
Windows-compatible version of escapeshellarg() Windows doesn't recognise single-quotes in the shell,...
Definition: GlobalFunctions.php:2752
$file
if(PHP_SAPI !='cli') $file
Definition: UtfNormalTest2.php:30
in
Prior to maintenance scripts were a hodgepodge of code that had no cohesion or formal method of action Beginning in
Definition: maintenance.txt:1
$retval
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a account incomplete not yet checked for validity & $retval
Definition: hooks.txt:237
page
do that in ParserLimitReportFormat instead use this to modify the parameters of the image and a DIV can begin in one section and end in another Make sure your code can handle that case gracefully See the EditSectionClearerLink extension for an example zero but section is usually empty its values are the globals values my talk page
Definition: hooks.txt:1961