MediaWiki  1.33.0
DjVuImage.php
Go to the documentation of this file.
1 <?php
28 
38 class DjVuImage {
42  const DJVUTXT_MEMORY_LIMIT = 300000;
43 
/**
 * Path of the DjVu file this object inspects.
 *
 * Declared explicitly so that the constructor no longer creates a dynamic
 * property (deprecated since PHP 8.2). It stays public because the
 * previously undeclared property was implicitly public.
 *
 * @var string
 */
public $mFilename;

/**
 * @param string $filename The DjVu file name.
 */
public function __construct( $filename ) {
	$this->mFilename = $filename;
}
50 
/**
 * Tell whether the file could be parsed as a DjVu image.
 *
 * @return bool True when getInfo() returned anything other than false.
 */
public function isValid() {
	return $this->getInfo() !== false;
}
60 
/**
 * Return size data as a four-element list: width, height, the string
 * 'DjVu', and an HTML-style width/height attribute string.
 *
 * @return array|bool The list described above, or false when getInfo() fails.
 */
public function getImageSize() {
	$info = $this->getInfo();
	if ( $info === false ) {
		return false;
	}

	$w = $info['width'];
	$h = $info['height'];

	return [ $w, $h, 'DjVu', "width=\"$w\" height=\"$h\"" ];
}
78 
79  // ---------
80 
/**
 * For debugging: echo the IFF chunk structure of the file.
 *
 * Prints the top-level chunk name and length, then lets dumpForm() print
 * the nested chunks. If the file cannot be opened or is too short to hold
 * the 12-byte header, a one-line diagnostic is printed instead.
 */
public function dump() {
	$file = fopen( $this->mFilename, 'rb' );
	if ( $file === false ) {
		echo "Could not open file\n";

		return;
	}

	// The finally block guarantees the handle is released on every path.
	try {
		$header = fread( $file, 12 );
		if ( $header === false || strlen( $header ) < 12 ) {
			// unpack() would fail on a truncated header.
			echo "File too short to contain a DjVu header\n";

			return;
		}
		$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
		$chunk = $arr['chunk'];
		$chunkLength = $arr['chunkLength'];
		echo "$chunk $chunkLength\n";
		$this->dumpForm( $file, $chunkLength, 1 );
	} finally {
		fclose( $file );
	}
}
94 
/**
 * Echo the chunks contained in one FORM chunk, recursing into nested FORMs.
 *
 * Reads the 4-byte secondary identifier at the current position, then
 * walks chunk headers until $length bytes of the FORM have been consumed
 * or a complete 8-byte chunk header can no longer be read.
 *
 * @param resource $file Open file handle positioned just after the FORM header
 * @param int $length Length of the FORM chunk in bytes
 * @param int $indent Nesting depth; each level is printed with 4 more spaces
 */
private function dumpForm( $file, $length, $indent ) {
	$start = ftell( $file );
	$secondary = fread( $file, 4 );
	$pad = str_repeat( ' ', $indent * 4 );
	echo $pad . "($secondary)\n";
	while ( ftell( $file ) - $start < $length ) {
		$chunkHeader = fread( $file, 8 );
		// A truncated header (not only an empty read) must stop the walk,
		// otherwise unpack() fails and a garbage line is printed.
		if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
			break;
		}
		$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
		$chunk = $arr['chunk'];
		$chunkLength = $arr['chunkLength'];
		echo $pad . "$chunk $chunkLength\n";

		if ( $chunk == 'FORM' ) {
			$this->dumpForm( $file, $chunkLength, $indent + 1 );
		} else {
			fseek( $file, $chunkLength, SEEK_CUR );
			if ( ( $chunkLength & 1 ) == 1 ) {
				// Padding byte between chunks
				fseek( $file, 1, SEEK_CUR );
			}
		}
	}
}
120 
/**
 * Open the file, inspect its 16-byte header and return page information.
 *
 * Single-page files (subtype DJVU) are handed to getPageInfo(); multi-page
 * files (subtype DJVM) to getMultiPageInfo(). Every failure is logged
 * through wfDebug() and reported as false.
 *
 * @return array|bool Result of the page-info helper, or false on failure
 */
function getInfo() {
	Wikimedia\suppressWarnings();
	$handle = fopen( $this->mFilename, 'rb' );
	Wikimedia\restoreWarnings();
	if ( $handle === false ) {
		wfDebug( __METHOD__ . ": missing or failed file read\n" );

		return false;
	}

	$result = false;
	$header = fread( $handle, 16 );

	if ( strlen( $header ) >= 16 ) {
		$fields = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
		$subtype = $fields['subtype'];

		if ( $fields['magic'] != 'AT&T' ) {
			wfDebug( __METHOD__ . ": not a DjVu file\n" );
		} else {
			switch ( $subtype ) {
				case 'DJVU':
					// Single-page document
					$result = $this->getPageInfo( $handle );
					break;
				case 'DJVM':
					// Multi-page document
					$result = $this->getMultiPageInfo( $handle, $fields['formLength'] );
					break;
				default:
					wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$subtype}'\n" );
			}
		}
	} else {
		wfDebug( __METHOD__ . ": too short file header\n" );
	}

	fclose( $handle );

	return $result;
}
156 
/**
 * Read an 8-byte chunk header from the current file position.
 *
 * @param resource $file Open file handle
 * @return array Two-element list [ chunk name, chunk length ]; the list is
 *  [ false, 0 ] when fewer than 8 bytes could be read.
 */
private function readChunk( $file ) {
	$raw = fread( $file, 8 );
	if ( strlen( $raw ) >= 8 ) {
		$fields = unpack( 'a4chunk/Nlength', $raw );

		return [ $fields['chunk'], $fields['length'] ];
	}

	return [ false, 0 ];
}
167 
/**
 * Advance the file position past a chunk body of the given length.
 *
 * When the length is odd and the handle is not at end-of-file, one extra
 * padding byte is skipped as well.
 *
 * @param resource $file Open file handle
 * @param int $chunkLength Number of bytes to skip
 */
private function skipChunk( $file, $chunkLength ) {
	fseek( $file, $chunkLength, SEEK_CUR );

	$hasOddLength = ( $chunkLength & 0x01 ) == 1;
	if ( $hasOddLength && !feof( $file ) ) {
		// padding byte
		fseek( $file, 1, SEEK_CUR );
	}
}
176 
/**
 * Locate the first page of a multi-page document and return its info.
 *
 * Only the first FORM chunk with subtype DJVU is examined; the other
 * pages are hoped to have the same size.
 *
 * @param resource $file Open file handle positioned after the 16-byte header
 * @param int $formLength Length of the enclosing FORM chunk
 * @return array|bool Result of getPageInfo() for the first page, or false
 *  when no page was found
 */
private function getMultiPageInfo( $file, $formLength ) {
	$origin = ftell( $file );

	while ( true ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( !$chunk ) {
			break;
		}

		if ( $chunk != 'FORM' ) {
			wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
			$this->skipChunk( $file, $length );
		} else {
			$subtype = fread( $file, 4 );
			if ( $subtype == 'DJVU' ) {
				wfDebug( __METHOD__ . ": found first subpage\n" );

				return $this->getPageInfo( $file );
			}
			// The 4 subtype bytes just read are part of the chunk length.
			$this->skipChunk( $file, $length - 4 );
		}

		// Stop on an empty chunk, at end-of-file, or once the enclosing
		// FORM has been consumed.
		if ( $length == 0 || feof( $file ) || ftell( $file ) - $origin >= $formLength ) {
			break;
		}
	}

	wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

	return false;
}
205 
/**
 * Read the INFO chunk at the current file position and decode it.
 *
 * @param resource $file Open file handle positioned at a chunk header
 * @return array|bool Array with keys width, height, version, resolution
 *  and gamma, or false when the chunk is not INFO, is shorter than 9 bytes
 *  or is cut off.
 */
private function getPageInfo( $file ) {
	list( $chunk, $length ) = $this->readChunk( $file );
	if ( $chunk != 'INFO' ) {
		wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

		return false;
	}

	if ( $length < 9 ) {
		wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

		return false;
	}

	$payload = fread( $file, $length );
	if ( strlen( $payload ) < $length ) {
		wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

		return false;
	}

	# Newer files have rotation info in byte 10, but we don't use it yet.
	$fields = unpack( 'nwidth/nheight/Cminor/Cmajor/vresolution/Cgamma', $payload );
	$version = $fields['major'] . '.' . $fields['minor'];

	return [
		'width' => $fields['width'],
		'height' => $fields['height'],
		'version' => $version,
		'resolution' => $fields['resolution'],
		'gamma' => $fields['gamma'] / 10.0,
	];
}
243 
/**
 * Return an XML string describing the DjVu image.
 *
 * The structure XML comes from djvudump (converted by convertDumpToXML())
 * when $wgDjvuDump is set, otherwise from djvutoxml when $wgDjvuToXML is
 * set. When $wgDjvuTxt is set and the tool exits with status 0, the text
 * layer is appended and the whole document is wrapped in <mw-djvu>.
 *
 * @return string|bool|null XML string; false when the file is not a valid
 *  DjVu image or the dump could not be converted; null when neither
 *  structure tool is configured and no text layer was produced.
 */
public function retrieveMetaData() {
	// Without this declaration the three settings are undefined locals,
	// every isset() below is false and the method always returns null.
	global $wgDjvuDump, $wgDjvuToXML, $wgDjvuTxt;

	if ( !$this->isValid() ) {
		return false;
	}

	if ( isset( $wgDjvuDump ) ) {
		# djvudump is faster as of version 3.5
		# https://sourceforge.net/p/djvu/bugs/71/
		$cmd = Shell::escape( $wgDjvuDump ) . ' ' . Shell::escape( $this->mFilename );
		$dump = wfShellExec( $cmd );
		$xml = $this->convertDumpToXML( $dump );
	} elseif ( isset( $wgDjvuToXML ) ) {
		$cmd = Shell::escape( $wgDjvuToXML ) . ' --without-anno --without-text ' .
			Shell::escape( $this->mFilename );
		$xml = wfShellExec( $cmd );
	} else {
		$xml = null;
	}
	# Text layer
	if ( isset( $wgDjvuTxt ) ) {
		$cmd = Shell::escape( $wgDjvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
		wfDebug( __METHOD__ . ": $cmd\n" );
		$retval = '';
		$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
		if ( $retval == 0 ) {
			# Strip some control characters
			$txt = preg_replace( "/[\013\035\037]/", "", $txt );
			// The closing heredoc marker stays at column 0 for PHP < 7.3.
			$reg = <<<EOR
			/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
			((?> # Text to match is composed of atoms of either:
			\\\\. # - any escaped character
			| # - any character different from " and \
			[^"\\\\]+
			)*?)
			"\s*\)
			| # Or page can be empty ; in this case, djvutxt dumps ()
			\(\s*()\)/sx
EOR;
			$txt = preg_replace_callback( $reg, [ $this, 'pageTextCallback' ], $txt );
			$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
			$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 ) .
				$txt .
				'</mw-djvu>';
		}
	}

	return $xml;
}
298 
/**
 * preg_replace_callback() handler that turns one matched page of text
 * into a <PAGE> element.
 *
 * @param array $matches Regex matches; index 1 holds the page text
 * @return string The <PAGE value="..." /> element
 */
function pageTextCallback( $matches ) {
	# Get rid of invalid UTF-8, strip control characters
	$unescaped = stripcslashes( $matches[1] );
	$cleaned = UtfNormal\Validator::cleanUp( $unescaped );
	$encoded = htmlspecialchars( $cleaned );
	$encoded = str_replace( [ "\n", '�' ], [ '&#10;', '' ], $encoded );

	return '<PAGE value="' . $encoded . '" />';
}
305 
/**
 * Build a DjVuXML document from the text of a chunk dump.
 *
 * The dump is tokenised line by line with strtok(); parseFormDjvu()
 * continues on the same tokenizer state, so the order of strtok() calls
 * here matters.
 *
 * @param string $dump Dump text to convert
 * @return string|bool The XML document, or false when the dump is empty,
 *  describes an indirect multi-page document, or yields no usable page.
 */
function convertDumpToXML( $dump ) {
	if ( strval( $dump ) == '' ) {
		return false;
	}

	$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

	$firstLine = strtok( str_replace( "\r", '', $dump ), "\n" );
	$foundPage = false;

	if ( preg_match( '/^( *)FORM:DJVU/', $firstLine, $m ) ) {
		# Single-page
		if ( !$this->parseFormDjvu( $firstLine, $xml ) ) {
			return false;
		}
		$foundPage = true;
	} elseif ( preg_match( '/^( *)FORM:DJVM/', $firstLine, $m ) ) {
		# Multi-page
		$parentLevel = strlen( $m[1] );
		# Find DIRM
		for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
			if ( strspn( $line, ' ' ) <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
				wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

				return false;
			}

			if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
				# Found page
				if ( !$this->parseFormDjvu( $line, $xml ) ) {
					return false;
				}
				$foundPage = true;
			}
		}
	}

	if ( !$foundPage ) {
		return false;
	}

	return $xml . "</BODY>\n</DjVuXML>\n";
}
371 
/**
 * Scan the lines nested under a FORM:DJVU dump line for its INFO entry
 * and append a matching OBJECT element to the XML.
 *
 * Lines are pulled from the strtok() tokenizer the caller started.
 *
 * @param string $line The FORM:DJVU line; its indentation sets the parent level
 * @param string &$xml XML accumulated so far; the OBJECT element is appended
 * @return bool True when an INFO line was found, false otherwise
 */
function parseFormDjvu( $line, &$xml ) {
	$parentLevel = strspn( $line, ' ' );
	$infoPattern =
		'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/';

	# Find INFO
	for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
		if ( strspn( $line, ' ' ) <= $parentLevel ) {
			# End of chunk
			break;
		}

		if ( !preg_match( $infoPattern, $line, $m ) ) {
			continue;
		}

		$params = "\n" .
			Xml::element( 'PARAM', [ 'name' => 'DPI', 'value' => $m[3] ] ) . "\n" .
			Xml::element( 'PARAM', [ 'name' => 'GAMMA', 'value' => $m[4] ] ) . "\n";
		$attribs = [
			# 'data' => '',
			# 'type' => 'image/x.djvu',
			'height' => $m[2],
			'width' => $m[1],
			# 'usemap' => '',
		];
		$xml .= Xml::tags( 'OBJECT', $attribs, $params ) . "\n";

		return true;
	}

	# Not found
	return false;
}
411 }
$wgDjvuTxt
$wgDjvuTxt
Path of the djvutxt DjVu text extraction utility. Enable this and $wgDjvuDump to enable text layer ext...
Definition: DefaultSettings.php:1604
MediaWiki\Shell\Shell
Executes shell commands.
Definition: Shell.php:44
$file
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Definition: router.php:42
version
Prior to version
Definition: maintenance.txt:1
page
target page
Definition: All_system_messages.txt:1267
php
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$data
$data
Utility to generate mapping file used in mw.Title (phpCharToUpper.json)
Definition: generatePhpCharToUpperMappings.php:13
$matches
$matches
Definition: NoLocalSettings.php:24
in
null for the wiki Added in
Definition: hooks.txt:1588
$wgDjvuToXML
$wgDjvuToXML
Path of the djvutoxml executable. This works like djvudump, except much, much slower as of version 3....
Definition: DefaultSettings.php:1620
Xml\element
static element( $element, $attribs=null, $contents='', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:41
use
as see the revision history and available at free of to any person obtaining a copy of this software and associated documentation to deal in the Software without including without limitation the rights to use
Definition: MIT-LICENSE.txt:10
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:949
list
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
$line
$line
Definition: cdb.php:59
$header
$header
Definition: updateCredits.php:41
Xml\tags
static tags( $element, $attribs, $contents)
Same as Xml::element(), but does not escape contents.
Definition: Xml.php:130
and
and that you know you can do these things To protect your we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights These restrictions translate to certain responsibilities for you if you distribute copies of the or if you modify it For if you distribute copies of such a whether gratis or for a you must give the recipients all the rights that you have You must make sure that receive or can get the source code And you must show them these terms so they know their rights We protect your rights with two and(2) offer you this license which gives you legal permission to copy
$wgDjvuDump
$wgDjvuDump
Path of the djvudump executable. Enable this and $wgDjvuRenderer to enable DjVu rendering. Example: $wg...
Definition: DefaultSettings.php:1590
wfShellExec
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
Definition: GlobalFunctions.php:2168