MediaWiki REL1_31
DjVuImage.php
Go to the documentation of this file.
1<?php
/**
 * Support for detecting/validating DjVu image files and for getting some
 * basic file metadata (dimensions, version, resolution, gamma).
 *
 * The binary header is parsed directly from the file as a sequence of
 * IFF-style chunks (4-byte identifier followed by a 32-bit big-endian
 * length); richer metadata is obtained by shelling out to the external
 * djvudump / djvutoxml / djvutxt tools when they are configured.
 */
class DjVuImage {
	/**
	 * Memory limit for the DjVu description software; passed as the
	 * 'memory' limit to wfShellExec() when running djvutxt.
	 */
	const DJVUTXT_MEMORY_LIMIT = 300000;

	/**
	 * @var string Path of the DjVu file being inspected. Declared explicitly
	 *  so the constructor does not create a dynamic property.
	 */
	public $mFilename;

	/**
	 * @param string $filename The DjVu file name.
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check if the given file is indeed a valid DjVu image file.
	 *
	 * @return bool True when getInfo() could parse the header
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}

	/**
	 * Return data in the style of getimagesize().
	 *
	 * @return array|bool [ width, height, 'DjVu', width/height attribute string ],
	 *  or false if the file could not be parsed
	 */
	public function getImageSize() {
		$data = $this->getInfo();
		if ( $data === false ) {
			return false;
		}

		$width = $data['width'];
		$height = $data['height'];

		return [ $width, $height, 'DjVu',
			"width=\"$width\" height=\"$height\"" ];
	}

	// ---------

	/**
	 * For debugging; dump the IFF chunk structure to standard output.
	 *
	 * Prints nothing if the file cannot be opened or is too short to
	 * contain the 12-byte magic/chunk/length header.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			// fopen() has already raised its own warning; nothing to dump.
			return;
		}

		$header = fread( $file, 12 );
		// Only unpack a complete header; a short read would make unpack() fail.
		if ( $header !== false && strlen( $header ) === 12 ) {
			$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo "$chunk $chunkLength\n";
			$this->dumpForm( $file, $chunkLength, 1 );
		}
		fclose( $file );
	}

	/**
	 * Print the chunks contained in a FORM, recursing into nested FORMs.
	 *
	 * @param resource $file Open file handle positioned at the FORM's secondary identifier
	 * @param int $length Declared length of the FORM's contents
	 * @param int $indent Nesting depth; each level indents the output by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			// Stop on EOF or a truncated chunk header instead of feeding
			// unpack() fewer than the 8 bytes its format requires.
			if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
				break;
			}
			$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				// Parenthesised explicitly: `==` binds tighter than `&` in PHP.
				if ( ( $chunkLength & 1 ) === 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}

	/**
	 * Read the file header and return the basic metadata of the (first) page.
	 *
	 * @return array|bool Array with keys 'width', 'height', 'version',
	 *  'resolution' and 'gamma', or false if the file is missing, too short,
	 *  not a DjVu file, or of an unrecognized subtype
	 */
	public function getInfo() {
		Wikimedia\suppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		Wikimedia\restoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$subtype = $arr['subtype'];

			if ( $arr['magic'] !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype === 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file );
			} elseif ( $subtype === 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $arr['formLength'] );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'\n" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read one chunk header (identifier + length) at the current position.
	 *
	 * @param resource $file Open file handle
	 * @return array [ chunk identifier, length ], or [ false, 0 ] if fewer
	 *  than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return [ false, 0 ];
		}

		$arr = unpack( 'a4chunk/Nlength', $header );

		return [ $arr['chunk'], $arr['length'] ];
	}

	/**
	 * Seek past the body of a chunk, including the padding byte that
	 * follows an odd-length chunk.
	 *
	 * @param resource $file Open file handle positioned at the chunk body
	 * @param int $chunkLength Number of body bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised explicitly: `==` binds tighter than `&` in PHP.
		if ( ( $chunkLength & 1 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Find the first page of a multi-page (DJVM) document and return its info.
	 *
	 * @param resource $file Open file handle positioned just after the DJVM subtype
	 * @param int $formLength Declared length of the enclosing FORM
	 * @return array|bool Page info as returned by getPageInfo(), or false
	 *  if no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( $chunk === false ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );

					return $this->getPageInfo( $file );
				}
				// The 4-byte subtype just read counts towards the chunk length.
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

		return false;
	}

	/**
	 * Parse the INFO chunk expected at the current file position.
	 *
	 * @param resource $file Open file handle positioned at a page's first chunk
	 * @return array|bool Array with keys 'width', 'height', 'version',
	 *  'resolution' and 'gamma', or false if the chunk is not INFO,
	 *  is shorter than 9 bytes, or is cut off
	 */
	private function getPageInfo( $file ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk !== 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

			return false;
		}

		// Width and height are big-endian 16-bit values, the resolution is
		// little-endian ('v'); the remaining fields are single bytes.
		$arr = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return [
			'width' => $arr['width'],
			'height' => $arr['height'],
			'version' => "{$arr['major']}.{$arr['minor']}",
			'resolution' => $arr['resolution'],
			'gamma' => $arr['gamma'] / 10.0 ];
	}

	/**
	 * Return an XML string describing the DjVu image.
	 *
	 * Uses djvudump when $wgDjvuDump is set, otherwise djvutoxml when
	 * $wgDjvuToXML is set. When $wgDjvuTxt is set and djvutxt succeeds, the
	 * text layer is appended and the result is wrapped in <mw-djvu>.
	 *
	 * @return string|bool|null XML metadata; false if the file is not a valid
	 *  DjVu file or the dump could not be converted; null if no tool is configured
	 */
	public function retrieveMetaData() {
		// Without this declaration the isset() checks below would test
		// undefined locals and no external tool would ever be run.
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;

		if ( !$this->isValid() ) {
			return false;
		}

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# https://sourceforge.net/p/djvu/bugs/71/
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
		} elseif ( isset( $wgDjvuToXML ) ) {
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
		} else {
			$xml = null;
		}
		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
				/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
				((?>    # Text to match is composed of atoms of either:
				  \\\\. #  - any escaped character
				  |     #  - any character different from " and \
				  [^"\\\\]+
				)*?)
				"\s*\)
				| # Or page can be empty ; in this case, djvutxt dumps ()
				\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, [ $this, 'pageTextCallback' ], $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				// Cast so a null/false $xml is handled as an empty subject
				// (same result as before, without passing null to preg_replace()).
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", (string)$xml, 1 );
				$xml = $xml . $txt . '</mw-djvu>';
			}
		}

		return $xml;
	}

	/**
	 * Callback for preg_replace_callback() in retrieveMetaData(): turn the
	 * text of one page into a PAGE element.
	 *
	 * @param array $matches Regex matches; index 1 holds the escaped page text
	 * @return string A <PAGE value="..." /> element
	 */
	public function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		$val = htmlspecialchars( UtfNormal\Validator::cleanUp( stripcslashes( $matches[1] ) ) );
		$val = str_replace( [ "\n", '�' ], [ '&#10;', '' ], $val );
		return '<PAGE value="' . $val . '" />';
	}

	/**
	 * Hack to temporarily work around djvutoxml bug: build a DjVuXML
	 * document from the textual output of djvudump.
	 *
	 * The dump is walked line by line with strtok(); parseFormDjvu()
	 * continues from the same tokenizer state, so the order of the
	 * strtok() calls here must not be changed.
	 *
	 * @param string $dump Output of djvudump
	 * @return string|bool XML document, or false if the dump is empty,
	 *  describes an indirect multi-page document, or contains no parsable page
	 */
	public function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( $this->parseFormDjvu( $line, $xml ) ) {
				$good = true;
			} else {
				return false;
			}
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( $this->parseFormDjvu( $line, $xml ) ) {
						$good = true;
					} else {
						return false;
					}
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";

		return $xml;
	}

	/**
	 * Consume the lines of one FORM:DJVU section from the current strtok()
	 * state and append an OBJECT element for its INFO line to $xml.
	 *
	 * @param string $line The FORM:DJVU line; its indentation marks the parent level
	 * @param string &$xml XML accumulated so far; appended to on success
	 * @return bool True if an INFO line was found before the section ended
	 */
	public function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match(
				'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				$xml .= Xml::tags(
					'OBJECT',
					[
						# 'data' => '',
						# 'type' => 'image/x.djvu',
						'height' => $m[2],
						'width' => $m[1],
						# 'usemap' => '',
					],
					"\n" .
					Xml::element( 'PARAM', [ 'name' => 'DPI', 'value' => $m[3] ] ) . "\n" .
					Xml::element( 'PARAM', [ 'name' => 'GAMMA', 'value' => $m[4] ] ) . "\n"
				) . "\n";

				return true;
			}
			$line = strtok( "\n" );
		}

		# Not found
		return false;
	}
}
$wgDjvuToXML
Path of the djvutoxml executable. This works like djvudump except much, much slower as of version 3.5.
$wgDjvuTxt
Path of the djvutxt DjVu text extraction utility. Enable this and $wgDjvuDump to enable text layer extraction.
$wgDjvuDump
Path of the djvudump executable. Enable this and $wgDjvuRenderer to enable DjVu rendering. Example: $wg...
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...
wfEscapeShellArg()
Version of escapeshellarg() that works better on Windows.
$line
Definition cdb.php:59
Support for detecting/validating DjVu image files and getting some basic file metadata (resolution et...
Definition DjVuImage.php:36
getPageInfo( $file)
convertDumpToXML( $dump)
Hack to temporarily work around djvutoxml bug.
parseFormDjvu( $line, &$xml)
retrieveMetaData()
Return an XML string describing the DjVu image.
const DJVUTXT_MEMORY_LIMIT
@const DJVUTXT_MEMORY_LIMIT Memory limit for the DjVu description software
Definition DjVuImage.php:40
skipChunk( $file, $chunkLength)
dumpForm( $file, $length, $indent)
Definition DjVuImage.php:93
pageTextCallback( $matches)
dump()
For debugging; dump the IFF chunk structure.
Definition DjVuImage.php:82
isValid()
Check if the given file is indeed a valid DjVu image file.
Definition DjVuImage.php:53
getMultiPageInfo( $file, $formLength)
readChunk( $file)
getImageSize()
Return data in the style of getimagesize()
Definition DjVuImage.php:63
__construct( $filename)
Definition DjVuImage.php:45
Unicode normalization routines for working with UTF-8 strings.
Definition UtfNormal.php:48
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user. For example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred-update object and putting those objects on a global list.
Definition deferred.txt:11
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a local account incomplete not yet checked for validity & $retval
Definition hooks.txt:266
Prior to version
$header