MediaWiki  1.28.1
DjVuImage.php
Go to the documentation of this file.
1 <?php
36 class DjVuImage {
40  const DJVUTXT_MEMORY_LIMIT = 300000;
41 
/**
 * Remember which file this object inspects. Nothing is opened or read here;
 * the other methods open the path on demand.
 *
 * @param string $filename Path of the DjVu file
 */
public function __construct( $filename ) {
	$this->mFilename = $filename;
}
50 
/**
 * Check whether the file parses as a DjVu image.
 *
 * @return bool True when getInfo() yields header data, false when it
 *  reports failure
 */
public function isValid() {
	return $this->getInfo() !== false;
}
60 
/**
 * Return data in the style of getimagesize().
 *
 * @return array|bool [ width, height, 'DjVu', HTML attribute string ],
 *  or false when getInfo() could not read the file
 */
public function getImageSize() {
	$info = $this->getInfo();
	if ( $info === false ) {
		return false;
	}

	$width = $info['width'];
	$height = $info['height'];
	$attribs = "width=\"$width\" height=\"$height\"";

	return [ $width, $height, 'DjVu', $attribs ];
}
78 
79  // ---------
80 
/**
 * For debugging; dump the IFF chunk structure of the file to standard output.
 *
 * Prints the top-level chunk name and length, then hands the rest of the
 * file to dumpForm(). Prints nothing when the file cannot be opened or is
 * too short to contain the 12-byte top-level header.
 */
function dump() {
	$file = fopen( $this->mFilename, 'rb' );
	if ( $file === false ) {
		// fopen() has already raised a warning; there is nothing to dump.
		return;
	}

	$header = fread( $file, 12 );
	// unpack() needs all 12 bytes (magic, chunk name, big-endian length).
	if ( $header !== false && strlen( $header ) === 12 ) {
		$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
		$chunk = $arr['chunk'];
		$chunkLength = $arr['chunkLength'];
		echo "$chunk $chunkLength\n";
		$this->dumpForm( $file, $chunkLength, 1 );
	}

	fclose( $file );
}
94 
/**
 * Print the chunks contained in one FORM, recursing into nested FORMs.
 *
 * @param resource $file Open file handle positioned just after a FORM's
 *  8-byte chunk header
 * @param int $length Length of the FORM as declared in its header
 * @param int $indent Nesting depth; each level indents output by 4 spaces
 */
private function dumpForm( $file, $length, $indent ) {
	$start = ftell( $file );
	$secondary = fread( $file, 4 );
	$prefix = str_repeat( ' ', $indent * 4 );
	echo $prefix . "($secondary)\n";

	while ( ftell( $file ) - $start < $length ) {
		$chunkHeader = fread( $file, 8 );
		// Stop on end of file and also on a truncated header: unpack()
		// cannot decode fewer than 8 bytes.
		if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
			break;
		}

		$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
		$chunk = $arr['chunk'];
		$chunkLength = $arr['chunkLength'];
		echo $prefix . "$chunk $chunkLength\n";

		if ( $chunk == 'FORM' ) {
			$this->dumpForm( $file, $chunkLength, $indent + 1 );
		} else {
			fseek( $file, $chunkLength, SEEK_CUR );
			// Parenthesised on purpose: "==" binds tighter than "&", so the
			// unparenthesised form only tested the low bit by accident.
			if ( ( $chunkLength & 1 ) === 1 ) {
				// Padding byte between chunks
				fseek( $file, 1, SEEK_CUR );
			}
		}
	}
}
120 
/**
 * Open the file, validate its 16-byte header and read the basic page data.
 *
 * @return array|bool The array built by getPageInfo() (width, height,
 *  version, resolution, gamma), or false when the file cannot be opened,
 *  is too short, is not a DjVu file or has an unrecognized subtype
 */
function getInfo() {
	MediaWiki\suppressWarnings();
	$file = fopen( $this->mFilename, 'rb' );
	MediaWiki\restoreWarnings();
	if ( $file === false ) {
		wfDebug( __METHOD__ . ": missing or failed file read\n" );

		return false;
	}

	$info = false;
	$header = fread( $file, 16 );

	if ( strlen( $header ) >= 16 ) {
		$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );

		if ( $arr['magic'] != 'AT&T' ) {
			wfDebug( __METHOD__ . ": not a DjVu file\n" );
		} else {
			switch ( $arr['subtype'] ) {
				case 'DJVU':
					// Single-page document
					$info = $this->getPageInfo( $file );
					break;
				case 'DJVM':
					// Multi-page document
					$info = $this->getMultiPageInfo( $file, $arr['formLength'] );
					break;
				default:
					wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$arr['subtype']}'\n" );
			}
		}
	} else {
		wfDebug( __METHOD__ . ": too short file header\n" );
	}

	// Every path that opened the file ends up here, so the handle is
	// always released.
	fclose( $file );

	return $info;
}
156 
/**
 * Read one 8-byte chunk header at the current file position.
 *
 * @param resource $file Open file handle
 * @return array [ chunk name, chunk length ], or [ false, 0 ] when fewer
 *  than 8 bytes could be read
 */
private function readChunk( $file ) {
	$raw = fread( $file, 8 );
	if ( strlen( $raw ) >= 8 ) {
		// 4-byte chunk identifier followed by a big-endian 32-bit length
		$fields = unpack( 'a4chunk/Nlength', $raw );

		return [ $fields['chunk'], $fields['length'] ];
	}

	return [ false, 0 ];
}
167 
/**
 * Move the file position past a chunk body and its padding byte, if any.
 *
 * @param resource $file Open file handle positioned at the start of the
 *  chunk body
 * @param int $chunkLength Number of bytes to skip
 */
private function skipChunk( $file, $chunkLength ) {
	fseek( $file, $chunkLength, SEEK_CUR );

	// Parenthesised on purpose: "==" binds tighter than "&", so the
	// unparenthesised form only tested the low bit by accident.
	$isOdd = ( $chunkLength & 0x01 ) === 1;
	if ( $isOdd && !feof( $file ) ) {
		// padding byte
		fseek( $file, 1, SEEK_CUR );
	}
}
176 
/**
 * Scan a multi-page (DJVM) document for its first DJVU page and return
 * that page's information.
 *
 * For now only the first page is inspected, hoping the others are the
 * same size.
 *
 * @param resource $file Open file handle positioned after the 16-byte
 *  file header
 * @param int $formLength Length of the enclosing FORM, used as a scan limit
 * @return array|bool Result of getPageInfo() for the first page, or false
 *  when no page was found
 */
private function getMultiPageInfo( $file, $formLength ) {
	$start = ftell( $file );

	while ( true ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( !$chunk ) {
			break;
		}

		if ( $chunk != 'FORM' ) {
			wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
			$this->skipChunk( $file, $length );
		} else {
			$subtype = fread( $file, 4 );
			if ( $subtype == 'DJVU' ) {
				wfDebug( __METHOD__ . ": found first subpage\n" );

				return $this->getPageInfo( $file );
			}
			// The 4 subtype bytes just read count towards the FORM length.
			$this->skipChunk( $file, $length - 4 );
		}

		// Give up on an empty chunk, at end of file, or once the scan
		// has covered the declared FORM length.
		if ( $length == 0 || feof( $file ) || ftell( $file ) - $start >= $formLength ) {
			break;
		}
	}

	wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

	return false;
}
205 
/**
 * Read and decode the INFO chunk expected at the current file position.
 *
 * @param resource $file Open file handle positioned at a chunk header
 * @return array|bool Array with keys width, height, version ("major.minor"),
 *  resolution and gamma (stored byte divided by 10), or false when the
 *  chunk is not INFO, is shorter than 9 bytes or is cut off
 */
private function getPageInfo( $file ) {
	list( $chunk, $length ) = $this->readChunk( $file );

	if ( $chunk != 'INFO' ) {
		wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

		return false;
	}
	if ( $length < 9 ) {
		wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

		return false;
	}

	$data = fread( $file, $length );
	if ( strlen( $data ) < $length ) {
		wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

		return false;
	}

	// Two big-endian 16-bit sizes, two version bytes, a little-endian
	// 16-bit resolution and one gamma byte: 9 bytes in total.
	$format = implode( '/', [
		'nwidth',
		'nheight',
		'Cminor',
		'Cmajor',
		'vresolution',
		'Cgamma',
	] );
	$fields = unpack( $format, $data );

	# Newer files have rotation info in byte 10, but we don't use it yet.

	$version = "{$fields['major']}.{$fields['minor']}";

	return [
		'width' => $fields['width'],
		'height' => $fields['height'],
		'version' => $version,
		'resolution' => $fields['resolution'],
		'gamma' => $fields['gamma'] / 10.0 ];
}
243 
/**
 * Return an XML string describing the DjVu image.
 *
 * The structure comes from djvudump ($wgDjvuDump, converted with
 * convertDumpToXML()) or, failing that, from djvutoxml ($wgDjvuToXML).
 * When djvutxt ($wgDjvuTxt) is configured and exits with status 0, the
 * text layer is appended and the whole is wrapped in <mw-djvu>.
 *
 * @return string|bool|null False when the file is not a valid DjVu image,
 *  null when neither structure tool is configured and no text layer was
 *  added, otherwise the tool output (which may itself be false when
 *  convertDumpToXML() rejects the dump)
 */
function retrieveMetaData() {
	// Without this declaration the three names are empty locals, every
	// isset() below is false and the method can only ever return null.
	global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;

	if ( !$this->isValid() ) {
		return false;
	}

	if ( isset( $wgDjvuDump ) ) {
		# djvudump is faster as of version 3.5
		# http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
		$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
		$dump = wfShellExec( $cmd );
		$xml = $this->convertDumpToXML( $dump );
	} elseif ( isset( $wgDjvuToXML ) ) {
		$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
			wfEscapeShellArg( $this->mFilename );
		$xml = wfShellExec( $cmd );
	} else {
		$xml = null;
	}
	# Text layer
	if ( isset( $wgDjvuTxt ) ) {
		$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
		wfDebug( __METHOD__ . ": $cmd\n" );
		$retval = '';
		$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
		if ( $retval == 0 ) {
			# Strip some control characters
			$txt = preg_replace( "/[\013\035\037]/", "", $txt );
			// Extended-syntax (x) pattern: the whitespace and "#" comments
			// inside the heredoc are ignored by the regex engine.
			$reg = <<<EOR
				/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
				((?> # Text to match is composed of atoms of either:
				\\\\. # - any escaped character
				| # - any character different from " and \
				[^"\\\\]+
				)*?)
				"\s*\)
				| # Or page can be empty ; in this case, djvutxt dumps ()
				\(\s*()\)/sx
EOR;
			$txt = preg_replace_callback( $reg, [ $this, 'pageTextCallback' ], $txt );
			$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
			// NOTE(review): when $xml is null or false here, the result has
			// a closing </mw-djvu> without an opening tag - confirm callers
			// tolerate that.
			$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
			$xml = $xml . $txt . '</mw-djvu>';
		}
	}

	return $xml;
}
297 
/**
 * preg_replace_callback() handler used by retrieveMetaData(): turn the
 * text of one page into a <PAGE> element.
 *
 * @param array $matches Regex match; index 1 holds the page text with
 *  djvutxt's backslash escapes still in place
 * @return string A '<PAGE value="..." />' element
 */
function pageTextCallback( $matches ) {
	# Get rid of invalid UTF-8, strip control characters
	$val = htmlspecialchars( UtfNormal\Validator::cleanUp( stripcslashes( $matches[1] ) ) );
	// Encode newlines as character references so they survive inside the
	// attribute value, and drop U+FFFD replacement characters.
	$val = str_replace( [ "\n", '�' ], [ '&#10;', '' ], $val );
	return '<PAGE value="' . $val . '" />';
}
304 
/**
 * Hack to temporarily work around djvutoxml bug: build a DjVuXML document
 * from the textual output of djvudump.
 *
 * The dump is walked with strtok(), whose position is shared with
 * parseFormDjvu(); the order of the strtok() calls therefore matters.
 *
 * @param string $dump Output of djvudump
 * @return string|bool The XML document, or false when the dump is empty,
 *  describes an indirect multi-page document, or contains no page whose
 *  INFO line could be parsed
 */
function convertDumpToXML( $dump ) {
	if ( strval( $dump ) == '' ) {
		return false;
	}

	$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

	$dump = str_replace( "\r", '', $dump );
	$line = strtok( $dump, "\n" );
	$m = false;
	$foundPage = false;

	if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
		# Single-page
		if ( !$this->parseFormDjvu( $line, $xml ) ) {
			return false;
		}
		$foundPage = true;
	} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
		# Multi-page
		$parentLevel = strlen( $m[1] );
		# Find DIRM
		for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
			if ( strspn( $line, ' ' ) <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
				wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

				return false;
			}

			if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
				# Found page
				if ( !$this->parseFormDjvu( $line, $xml ) ) {
					return false;
				}
				$foundPage = true;
			}
		}
	}

	if ( !$foundPage ) {
		return false;
	}

	return $xml . "</BODY>\n</DjVuXML>\n";
}
370 
/**
 * Consume the lines of one FORM:DJVU entry from the current strtok()
 * stream and append an <OBJECT> element for it to $xml.
 *
 * @param string $line The FORM:DJVU line itself; its leading spaces give
 *  the nesting level of the form
 * @param string &$xml XML built so far; extended in place on success
 * @return bool True when an INFO line was found and appended, false when
 *  the form ended (or the input ran out) without one
 */
function parseFormDjvu( $line, &$xml ) {
	$parentLevel = strspn( $line, ' ' );
	$infoPattern =
		'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/';

	# Find INFO
	for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
		if ( strspn( $line, ' ' ) <= $parentLevel ) {
			# End of chunk
			break;
		}

		if ( !preg_match( $infoPattern, $line, $m ) ) {
			continue;
		}

		$params = "\n" .
			Xml::element( 'PARAM', [ 'name' => 'DPI', 'value' => $m[3] ] ) . "\n" .
			Xml::element( 'PARAM', [ 'name' => 'GAMMA', 'value' => $m[4] ] ) . "\n";
		$attribs = [
			# 'data' => '',
			# 'type' => 'image/x.djvu',
			'height' => $m[2],
			'width' => $m[1],
			# 'usemap' => '',
		];
		$xml .= Xml::tags( 'OBJECT', $attribs, $params ) . "\n";

		return true;
	}

	# Not found
	return false;
}
410 }
dumpForm($file, $length, $indent)
Definition: DjVuImage.php:95
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
static element($element, $attribs=null, $contents= '', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:39
getPageInfo($file)
Definition: DjVuImage.php:206
null for the local wiki Added in
Definition: hooks.txt:1555
$wgDjvuTxt
Path of the djvutxt DjVu text extraction utility. Enable this and $wgDjvuDump to enable text layer ext...
convertDumpToXML($dump)
Hack to temporarily work around djvutoxml bug.
Definition: DjVuImage.php:310
wfShellExec($cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
when a variable name is used in a it is silently declared as a new local masking the global
Definition: design.txt:93
pageTextCallback($matches)
Definition: DjVuImage.php:298
wfDebug($text, $dest= 'all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
getImageSize()
Return data in the style of getimagesize()
Definition: DjVuImage.php:65
Prior to version
Definition: maintenance.txt:1
Support for detecting/validating DjVu image files and getting some basic file metadata (resolution et...
Definition: DjVuImage.php:36
Unicode normalization routines for working with UTF-8 strings.
Definition: UtfNormal.php:48
dump()
For debugging; dump the IFF chunk structure.
Definition: DjVuImage.php:84
and(b) You must cause any modified files to carry prominent notices stating that You changed the files
__construct($filename)
Constructor.
Definition: DjVuImage.php:47
$header
const DJVUTXT_MEMORY_LIMIT
DJVUTXT_MEMORY_LIMIT Memory limit for the DjVu description software
Definition: DjVuImage.php:40
parseFormDjvu($line, &$xml)
Definition: DjVuImage.php:371
skipChunk($file, $chunkLength)
Definition: DjVuImage.php:168
$wgDjvuDump
Path of the djvudump executable. Enable this and $wgDjvuRenderer to enable DjVu rendering. Example: $wg...
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
static tags($element, $attribs=null, $contents)
Same as Xml::element(), but does not escape contents.
Definition: Xml.php:131
isValid()
Check if the given file is indeed a valid DjVu image file.
Definition: DjVuImage.php:55
$line
Definition: cdb.php:59
$wgDjvuToXML
Path of the djvutoxml executable. This works like djvudump, except much, much slower as of version 3...
wfEscapeShellArg()
Version of escapeshellarg() that works better on Windows.
retrieveMetaData()
Return an XML string describing the DjVu image.
Definition: DjVuImage.php:248
readChunk($file)
Definition: DjVuImage.php:157
getMultiPageInfo($file, $formLength)
Definition: DjVuImage.php:177
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a local account incomplete not yet checked for validity & $retval
Definition: hooks.txt:242
do that in ParserLimitReportFormat instead use this to modify the parameters of the image and a DIV can begin in one section and end in another Make sure your code can handle that case gracefully See the EditSectionClearerLink extension for an example zero but section is usually empty its values are the globals values before the output is cached one of or reset my talk page
Definition: hooks.txt:2491
$matches