MediaWiki  master
DjVuHandler.php
Go to the documentation of this file.
1 <?php
25 
/**
 * Handler for DjVu images.
 *
 * Thumbnails are produced by shelling out to the renderer configured in
 * $wgDjvuRenderer (optionally piped through $wgDjvuPostProcessor). Page
 * count, page dimensions and page text are read from the XML metadata that
 * is stored for the file.
 */
class DjVuHandler extends ImageHandler {
	/** File size in bytes above which thumbnailing is reported as expensive. */
	const EXPENSIVE_SIZE_LIMIT = 10485760; // 10MiB

	/**
	 * Check whether the external DjVu tools are configured.
	 *
	 * @return bool True when $wgDjvuRenderer is set together with at least
	 *   one of $wgDjvuDump or $wgDjvuToXML; false otherwise.
	 */
	public function isEnabled() {
		global $wgDjvuRenderer, $wgDjvuDump, $wgDjvuToXML;

		if ( !$wgDjvuRenderer || ( !$wgDjvuDump && !$wgDjvuToXML ) ) {
			wfDebug( "DjVu is disabled, please set \$wgDjvuRenderer and \$wgDjvuDump\n" );

			return false;
		}

		return true;
	}

	/**
	 * DjVu files always have to be rendered before they can be displayed.
	 *
	 * @param File $file Not inspected.
	 * @return bool Always true.
	 */
	public function mustRender( $file ) {
		return true;
	}

	/**
	 * True if creating thumbnails from the file is large or otherwise
	 * resource-intensive.
	 *
	 * @param File $file
	 * @return bool True when the file size exceeds EXPENSIVE_SIZE_LIMIT.
	 */
	public function isExpensiveToThumbnail( $file ) {
		return $file->getSize() > static::EXPENSIVE_SIZE_LIMIT;
	}

	/**
	 * DjVu documents are always treated as multi-page.
	 *
	 * @param File $file Not inspected.
	 * @return bool Always true.
	 */
	public function isMultiPage( $file ) {
		return true;
	}

	/**
	 * Map magic-word names to the handler parameter names they set.
	 *
	 * @return string[] Magic word ID => parameter name.
	 */
	public function getParamMap() {
		return [
			'img_width' => 'width',
			'img_page' => 'page',
		];
	}

	/**
	 * Validate a single handler parameter.
	 *
	 * @param string $name Parameter name; only 'width', 'height' and 'page'
	 *   are accepted.
	 * @param mixed $value Parameter value; must be greater than zero.
	 * @return bool True when the name is known and the value is acceptable.
	 */
	public function validateParam( $name, $value ) {
		if ( $name === 'page' && trim( $value ) !== (string)intval( $value ) ) {
			// Extra junk on the end of page, probably actually a caption
			// e.g. [[File:Foo.djvu|thumb|Page 3 of the document shows foo]]
			return false;
		}

		if ( !in_array( $name, [ 'width', 'height', 'page' ], true ) ) {
			return false;
		}

		if ( $value <= 0 ) {
			return false;
		}

		return true;
	}

	/**
	 * Build the parameter string for a thumbnail, e.g. "page3-120px".
	 *
	 * @param array $params Must contain 'width'; 'page' defaults to 1.
	 * @return string|bool The parameter string, or false when no width is set.
	 */
	public function makeParamString( $params ) {
		$page = $params['page'] ?? 1;
		if ( !isset( $params['width'] ) ) {
			return false;
		}

		return "page{$page}-{$params['width']}px";
	}

	/**
	 * Parse a string in the format produced by makeParamString().
	 *
	 * @param string $str
	 * @return array|bool [ 'width' => ..., 'page' => ... ] with the matched
	 *   digit strings, or false when $str does not match.
	 */
	public function parseParamString( $str ) {
		$m = false;
		if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
			return [ 'width' => $m[2], 'page' => $m[1] ];
		}

		return false;
	}

	/**
	 * Reduce a parameter array to the entries passed on to the thumbnail script.
	 *
	 * @param array $params Must contain 'width' and 'page'.
	 * @return array Array with only the 'width' and 'page' keys.
	 */
	protected function getScriptParams( $params ) {
		return [
			'width' => $params['width'],
			'page' => $params['page'],
		];
	}

	/**
	 * Render one page of a DjVu file to a thumbnail.
	 *
	 * @param File $image Source file.
	 * @param string $dstPath Filesystem path the thumbnail is written to.
	 * @param string $dstUrl URL under which the thumbnail will be served.
	 * @param array $params Transform parameters; passed by reference to
	 *   normaliseParams(), after which 'width', 'height', 'page',
	 *   'physicalWidth' and 'physicalHeight' are read from it.
	 * @param int $flags Bit field; with self::TRANSFORM_LATER set, no
	 *   rendering is done and only the output object is returned.
	 * @return MediaTransformOutput A ThumbnailImage on success (or when
	 *   rendering is deferred), a TransformParameterError for bad parameters,
	 *   or a MediaTransformError when rendering fails.
	 */
	public function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
		global $wgDjvuRenderer, $wgDjvuPostProcessor;

		if ( !$this->normaliseParams( $image, $params ) ) {
			return new TransformParameterError( $params );
		}
		$width = $params['width'];
		$height = $params['height'];
		$page = $params['page'];

		if ( $flags & self::TRANSFORM_LATER ) {
			$params = [
				'width' => $width,
				'height' => $height,
				'page' => $page
			];

			return new ThumbnailImage( $image, $dstUrl, $dstPath, $params );
		}

		if ( !wfMkdirParents( dirname( $dstPath ), null, __METHOD__ ) ) {
			return new MediaTransformError(
				'thumbnail_error',
				$width,
				$height,
				wfMessage( 'thumbnail_dest_directory' )
			);
		}

		// Get local copy source for shell scripts
		// Thumbnail extraction is very inefficient for large files.
		// Provide a way to pool count limit the number of downloaders.
		if ( $image->getSize() >= 1e7 ) { // 10MB
			$work = new PoolCounterWorkViaCallback( 'GetLocalFileCopy', sha1( $image->getName() ),
				[
					'doWork' => function () use ( $image ) {
						return $image->getLocalRefPath();
					}
				]
			);
			$srcPath = $work->execute();
		} else {
			$srcPath = $image->getLocalRefPath();
		}

		if ( $srcPath === false ) { // Failed to get local copy
			wfDebugLog( 'thumbnail',
				sprintf( 'Thumbnail failed on %s: could not get local copy of "%s"',
					wfHostname(), $image->getName() ) );

			return new MediaTransformError( 'thumbnail_error',
				$width, $height,
				wfMessage( 'filemissing' )
			);
		}

		# Use a subshell (brackets) to aggregate stderr from both pipeline commands
		# before redirecting it to the overall stdout. This works in both Linux and Windows XP.
		$cmd = '(' . Shell::escape(
			$wgDjvuRenderer,
			"-format=ppm",
			"-page={$page}",
			"-size={$params['physicalWidth']}x{$params['physicalHeight']}",
			$srcPath );
		if ( $wgDjvuPostProcessor ) {
			// The post processor is a configured shell command, so it is
			// appended verbatim rather than escaped as a single argument.
			$cmd .= " | {$wgDjvuPostProcessor}";
		}
		$cmd .= ' > ' . Shell::escape( $dstPath ) . ') 2>&1';
		wfDebug( __METHOD__ . ": $cmd\n" );
		$retval = '';
		$err = wfShellExec( $cmd, $retval );

		// removeBadFile() checks for zero-sized thumbnails.
		$removed = $this->removeBadFile( $dstPath, $retval );
		if ( $retval != 0 || $removed ) {
			$this->logErrorForExternalProcess( $retval, $err, $cmd );

			return new MediaTransformError( 'thumbnail_error', $width, $height, $err );
		}

		$params = [
			'width' => $width,
			'height' => $height,
			'page' => $page
		];

		return new ThumbnailImage( $image, $dstUrl, $dstPath, $params );
	}

	/**
	 * Cache an instance of DjVuImage in an Image object, return that instance.
	 *
	 * @param File|bool $image When falsy, a fresh uncached DjVuImage is
	 *   returned; otherwise the instance is stored on $image->dejaImage.
	 * @param string $path Path passed to the DjVuImage constructor.
	 * @return DjVuImage
	 */
	public function getDjVuImage( $image, $path ) {
		if ( !$image ) {
			$deja = new DjVuImage( $path );
		} elseif ( !isset( $image->dejaImage ) ) {
			$deja = $image->dejaImage = new DjVuImage( $path );
		} else {
			$deja = $image->dejaImage;
		}

		return $deja;
	}

	/**
	 * Get metadata, unserializing it if necessary.
	 *
	 * @param File $file
	 * @return string|bool The XML string, or false when the stored metadata
	 *   records an extraction error.
	 * @throws MWException When the metadata unserializes to an array that has
	 *   neither an 'error' nor an 'xml' key.
	 */
	private function getUnserializedMetadata( File $file ) {
		$metadata = $file->getMetadata();
		// Compare the full 5-byte "<?xml" prefix; a 3-byte prefix can never
		// equal a 5-byte string.
		if ( substr( $metadata, 0, 5 ) === '<?xml' ) {
			// Old style. Not serialized but instead just a raw string of XML.
			return $metadata;
		}

		Wikimedia\suppressWarnings();
		$unser = unserialize( $metadata );
		Wikimedia\restoreWarnings();
		if ( is_array( $unser ) ) {
			if ( isset( $unser['error'] ) ) {
				return false;
			} elseif ( isset( $unser['xml'] ) ) {
				return $unser['xml'];
			} else {
				// Should never ever reach here.
				throw new MWException( "Error unserializing DjVu metadata." );
			}
		}

		// unserialize failed. Guess it wasn't really serialized after all,
		return $metadata;
	}

	/**
	 * Cache a document tree for the DjVu XML metadata.
	 *
	 * Both trees are parsed together and stored on the file object
	 * ($image->djvuTextTree and $image->dejaMetaTree).
	 *
	 * @param File $image
	 * @param bool $gettext True to return the text tree instead of the
	 *   metadata tree.
	 * @return SimpleXMLElement|bool The requested tree, or false when the
	 *   metadata is invalid or the tree could not be extracted.
	 */
	public function getMetaTree( $image, $gettext = false ) {
		if ( $gettext && isset( $image->djvuTextTree ) ) {
			return $image->djvuTextTree;
		}
		if ( !$gettext && isset( $image->dejaMetaTree ) ) {
			return $image->dejaMetaTree;
		}

		$metadata = $this->getUnserializedMetadata( $image );
		if ( !$this->isMetadataValid( $image, $metadata ) ) {
			wfDebug( "DjVu XML metadata is invalid or missing, should have been fixed in upgradeRow\n" );

			return false;
		}

		$trees = $this->extractTreesFromMetadata( $metadata );
		$image->djvuTextTree = $trees['TextTree'];
		$image->dejaMetaTree = $trees['MetaTree'];

		if ( $gettext ) {
			return $image->djvuTextTree;
		}

		return $image->dejaMetaTree;
	}

	/**
	 * Extracts metadata and text trees from metadata XML in string form.
	 *
	 * @param string $metadata XML metadata as a string.
	 * @return array [ 'MetaTree' => ..., 'TextTree' => ... ]; an entry is
	 *   false when that tree is absent or the XML could not be parsed.
	 */
	protected function extractTreesFromMetadata( $metadata ) {
		Wikimedia\suppressWarnings();
		try {
			// Set to false rather than null to avoid further attempts
			$metaTree = false;
			$textTree = false;
			$tree = new SimpleXMLElement( $metadata, LIBXML_PARSEHUGE );
			if ( $tree->getName() == 'mw-djvu' ) {
				// Wrapper document: pick the two known children out of it.
				foreach ( $tree->children() as $b ) {
					if ( $b->getName() == 'DjVuTxt' ) {
						// @todo File::djvuTextTree and File::dejaMetaTree are declared
						// dynamically. Add a public File::$data to facilitate this?
						$textTree = $b;
					} elseif ( $b->getName() == 'DjVuXML' ) {
						$metaTree = $b;
					}
				}
			} else {
				// No wrapper: the whole document is the metadata tree.
				$metaTree = $tree;
			}
		} catch ( Exception $e ) {
			wfDebug( "Bogus multipage XML metadata\n" );
		}
		Wikimedia\restoreWarnings();

		return [ 'MetaTree' => $metaTree, 'TextTree' => $textTree ];
	}

	/**
	 * Get the image size by delegating to DjVuImage::getImageSize().
	 *
	 * @param File|bool $image See getDjVuImage().
	 * @param string $path
	 * @return mixed Whatever DjVuImage::getImageSize() returns.
	 */
	public function getImageSize( $image, $path ) {
		return $this->getDjVuImage( $image, $path )->getImageSize();
	}

	/**
	 * Get the extension and MIME type used for thumbnails.
	 *
	 * None of the arguments influence the result: the extension is always
	 * $wgDjvuOutputExtension and the MIME type is looked up from it once per
	 * request and kept in a static variable.
	 *
	 * @param string $ext Unused.
	 * @param string $mime Unused.
	 * @param array|null $params Unused.
	 * @return array [ extension, MIME type as returned by
	 *   guessTypesForExtension() ]
	 */
	public function getThumbType( $ext, $mime, $params = null ) {
		global $wgDjvuOutputExtension;
		// Kept under its own name so it does not shadow the $mime parameter.
		static $djvuMime;
		if ( !isset( $djvuMime ) ) {
			$magic = \MediaWiki\MediaWikiServices::getInstance()->getMimeAnalyzer();
			$djvuMime = $magic->guessTypesForExtension( $wgDjvuOutputExtension );
		}

		return [ $wgDjvuOutputExtension, $djvuMime ];
	}

	/**
	 * Extract the metadata of a DjVu file and serialize it for storage.
	 *
	 * @param File|bool $image See getDjVuImage().
	 * @param string $path
	 * @return string Serialized array holding either an 'xml' key with the
	 *   retrieved XML or an 'error' key when retrieval failed.
	 */
	public function getMetadata( $image, $path ) {
		wfDebug( "Getting DjVu metadata for $path\n" );

		$xml = $this->getDjVuImage( $image, $path )->retrieveMetaData();
		if ( $xml === false ) {
			// Special value so that we don't repetitively try and decode a broken file.
			return serialize( [ 'error' => 'Error extracting metadata' ] );
		}

		return serialize( [ 'xml' => $xml ] );
	}

	/**
	 * Get the identifier of the metadata format.
	 *
	 * @param File $image Not inspected.
	 * @return string Always 'djvuxml'.
	 */
	public function getMetadataType( $image ) {
		return 'djvuxml';
	}

	/**
	 * Check whether stored metadata is usable.
	 *
	 * @param File $image Not inspected.
	 * @param string|bool $metadata
	 * @return bool False for empty metadata or a serialized empty array.
	 */
	public function isMetadataValid( $image, $metadata ) {
		return !empty( $metadata ) && $metadata != serialize( [] );
	}

	/**
	 * Get the number of pages in the document.
	 *
	 * @param File $image
	 * @return int|bool Page count, or false when no dimension info is available.
	 */
	public function pageCount( File $image ) {
		$info = $this->getDimensionInfo( $image );

		return $info ? $info['pageCount'] : false;
	}

	/**
	 * Get the dimensions of one page.
	 *
	 * @param File $image
	 * @param int $page 1-based page number.
	 * @return array|bool The stored entry for that page (an array with
	 *   'width' and 'height', or false for a page without an OBJECT node),
	 *   or false when the page is unknown.
	 */
	public function getPageDimensions( File $image, $page ) {
		$index = $page - 1; // MW starts pages at 1

		$info = $this->getDimensionInfo( $image );
		if ( $info && isset( $info['dimensionsByPage'][$index] ) ) {
			return $info['dimensionsByPage'][$index];
		}

		return false;
	}

	/**
	 * Get the page count and per-page dimensions of a file.
	 *
	 * The result is looked up in the main WAN object cache under a key built
	 * from the file's SHA-1, with TTL_INDEFINITE for both the TTL and the
	 * 'pcTTL' option; on a miss it is computed from the metadata tree.
	 *
	 * @param File $file
	 * @return array|bool See getDimensionInfoFromMetaTree().
	 */
	protected function getDimensionInfo( File $file ) {
		$cache = \MediaWiki\MediaWikiServices::getInstance()->getMainWANObjectCache();

		return $cache->getWithSetCallback(
			$cache->makeKey( 'file-djvu', 'dimensions', $file->getSha1() ),
			$cache::TTL_INDEFINITE,
			function () use ( $file ) {
				$tree = $this->getMetaTree( $file );

				return $this->getDimensionInfoFromMetaTree( $tree );
			},
			[ 'pcTTL' => $cache::TTL_INDEFINITE ]
		);
	}

	/**
	 * Given an XML metadata tree, returns dimension information about the
	 * document.
	 *
	 * @param SimpleXMLElement|bool $metatree The file's metadata tree.
	 * @return array|bool False for a falsy tree; otherwise
	 *   [ 'pageCount' => int, 'dimensionsByPage' => array ] where each
	 *   per-page entry is [ 'width' => int, 'height' => int ] or false.
	 */
	protected function getDimensionInfoFromMetaTree( $metatree ) {
		if ( !$metatree ) {
			return false;
		}

		$dimsByPage = [];
		// The count comes from every OBJECT node in the document, while the
		// dimensions are read from the OBJECT children of the first BODY.
		$count = count( $metatree->xpath( '//OBJECT' ) );
		for ( $i = 0; $i < $count; $i++ ) {
			$o = $metatree->BODY[0]->OBJECT[$i];
			if ( $o ) {
				$dimsByPage[$i] = [
					'width' => (int)$o['width'],
					'height' => (int)$o['height'],
				];
			} else {
				$dimsByPage[$i] = false;
			}
		}

		return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
	}

	/**
	 * Get the text of one page from the text tree.
	 *
	 * @param File $image
	 * @param int $page 1-based page number.
	 * @return SimpleXMLElement|bool The 'value' attribute of the matching
	 *   PAGE node, or false when the tree or the page is missing.
	 */
	public function getPageText( File $image, $page ) {
		$tree = $this->getMetaTree( $image, true );
		if ( !$tree ) {
			return false;
		}

		$o = $tree->BODY[0]->PAGE[$page - 1];
		if ( $o ) {
			$txt = $o['value'];

			return $txt;
		}

		return false;
	}
}
getMetaTree( $image, $gettext=false)
Cache a document tree for the DjVu XML metadata.
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
mustRender( $file)
Definition: DjVuHandler.php:52
getPageText(File $image, $page)
normaliseParams( $image, &$params)
getDimensionInfo(File $file)
removeBadFile( $dstPath, $retval=0)
Check for zero-sized thumbnails.
serialize()
getUnserializedMetadata(File $file)
Get metadata, unserializing it if necessary.
getDimensionInfoFromMetaTree( $metatree)
Given an XML metadata tree, returns dimension information about the document.
makeParamString( $params)
wfHostname()
Fetch server name for use in error reporting etc.
isMetadataValid( $image, $metadata)
isMultiPage( $file)
Definition: DjVuHandler.php:69
extractTreesFromMetadata( $metadata)
Extracts metadata and text trees from metadata XML in string form.
pageCount(File $image)
Handler for DjVu images.
Definition: DjVuHandler.php:31
static getInstance()
Returns the global default instance of the top level service locator.
execute( $skipcache=false)
Get the result of the work (whatever it is), or the result of the error() function.
getPageDimensions(File $image, $page)
isExpensiveToThumbnail( $file)
True if creating thumbnails from the file is large or otherwise resource-intensive.
Definition: DjVuHandler.php:61
parseParamString( $str)
validateParam( $name, $value)
Definition: DjVuHandler.php:88
Shortcut class for parameter validation errors.
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
Support for detecting/validating DjVu image files and getting some basic file metadata (resolution et...
Definition: DjVuImage.php:38
Convenience class for dealing with PoolCounters using callbacks.
getDjVuImage( $image, $path)
Cache an instance of DjVuImage in an Image object, return that instance.
getImageSize( $image, $path)
const EXPENSIVE_SIZE_LIMIT
Definition: DjVuHandler.php:32
getMetadata()
Get handler-specific metadata Overridden by LocalFile, UnregisteredLocalFile STUB.
Definition: File.php:663
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Media transform output for images.
getMetadata( $image, $path)
$wgDjvuRenderer
Path of the ddjvu DJVU renderer Enable this and $wgDjvuDump to enable djvu rendering example: $wgDjvu...
$cache
Definition: mcc.php:33
$wgDjvuOutputExtension
File extension for the DJVU post processor output.
unserialize( $serialized)
getSha1()
Get the SHA-1 base 36 hash of the file.
Definition: File.php:2142
getThumbType( $ext, $mime, $params=null)
$wgDjvuDump
Path of the djvudump executable Enable this and $wgDjvuRenderer to enable djvu rendering example: $wg...
Media handler abstract base class for images.
getMetadataType( $image)
getScriptParams( $params)
wfMkdirParents( $dir, $mode=null, $caller=null)
Make directory, and make all parent directories if they don't exist.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not...
logErrorForExternalProcess( $retval, $err, $cmd)
Log an error that occurred in an external process.
if(!is_readable( $file)) $ext
Definition: router.php:48
$wgDjvuToXML
Path of the djvutoxml executable This works like djvudump except much, much slower as of version 3...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Implements some public methods and some protected utility functions which are required by multiple ch...
Definition: File.php:61
doTransform( $image, $dstPath, $dstUrl, $params, $flags=0)
Basic media transform error class.
$wgDjvuPostProcessor
Shell command for the DJVU post processor Default: pnmtojpeg, since ddjvu generates ppm output Set th...