Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 224 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
DjVuImage | |
0.00% |
0 / 224 |
|
0.00% |
0 / 14 |
4290 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isValid | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getImageSize | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
dump | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
dumpForm | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
30 | |||
getInfo | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
42 | |||
readChunk | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
skipChunk | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
getMultiPageInfo | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
42 | |||
getPageInfo | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
20 | |||
retrieveMetaData | |
0.00% |
0 / 67 |
|
0.00% |
0 / 1 |
342 | |||
pageTextCallback | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
convertDumpToJSON | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
132 | |||
parseFormDjvu | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | /** |
3 | * DjVu image handler. |
4 | * |
5 | * Copyright © 2006 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Media |
25 | */ |
26 | |
27 | use MediaWiki\MainConfigNames; |
28 | use MediaWiki\MediaWikiServices; |
29 | use MediaWiki\Shell\Shell; |
30 | use Wikimedia\AtEase\AtEase; |
31 | |
32 | /** |
33 | * Support for detecting/validating DjVu image files and getting |
34 | * some basic file metadata (resolution etc) |
35 | * |
36 | * File format docs are available in source package for DjVuLibre: |
37 | * http://djvulibre.djvuzone.org/ |
38 | * |
39 | * @ingroup Media |
40 | */ |
class DjVuImage {

	/**
	 * Memory limit for the DjVu description software
	 */
	private const DJVUTXT_MEMORY_LIMIT = 300_000_000;

	/** @var string Path of the DjVu file this object inspects */
	private $mFilename;

	/**
	 * @param string $filename The DjVu file name.
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check if the given file is indeed a valid DjVu image file
	 * @return bool
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}

	/**
	 * Return width and height
	 * @return array An array with "width" and "height" keys, or an empty array on failure.
	 */
	public function getImageSize() {
		$data = $this->getInfo();
		if ( $data === false ) {
			return [];
		}

		return [
			'width' => $data['width'],
			'height' => $data['height']
		];
	}

	// ---------

	/**
	 * For debugging; dump the IFF chunk structure
	 *
	 * Echoes the outermost chunk name and length, then the nested chunk tree.
	 * Nothing is printed when the file cannot be opened or is too short to
	 * contain the 12-byte header (magic, chunk name, chunk length).
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			// fopen() has already raised its own warning; there is nothing to dump.
			return;
		}

		$header = fread( $file, 12 );
		if ( $header === false || strlen( $header ) < 12 ) {
			// Truncated file: unpack() would fail on a short header.
			fclose( $file );
			return;
		}

		$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
		$chunk = $arr['chunk'];
		$chunkLength = $arr['chunkLength'];
		echo "$chunk $chunkLength\n";
		$this->dumpForm( $file, $chunkLength, 1 );
		fclose( $file );
	}

	/**
	 * Print the chunks contained in a FORM chunk, recursing into nested FORMs.
	 *
	 * @param resource $file Open file handle positioned at the FORM's secondary identifier
	 * @param int $length Length of the FORM chunk as declared in its header
	 * @param int $indent Nesting depth; each level is printed with four more spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
				// End of file, or a chunk header cut off mid-way: stop instead of
				// feeding a short string to unpack().
				break;
			}
			$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				if ( $chunkLength & 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}

	/**
	 * Read the file header and the INFO chunk of the (first) page.
	 *
	 * @return array|false The array built by getPageInfo(), or false when the file
	 *  cannot be opened, is too short, is not a DjVu file, or has no usable page.
	 */
	private function getInfo() {
		AtEase::suppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		AtEase::restoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header" );
		} else {
			$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );

			$subtype = $arr['subtype'];
			if ( $arr['magic'] !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file" );
			} elseif ( $subtype === 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file );
			} elseif ( $subtype === 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $arr['formLength'] );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read one 8-byte chunk header (4-byte name, 32-bit big-endian length).
	 *
	 * @param resource $file Open file handle positioned at a chunk header
	 * @return array Two elements: the chunk name and its length, or
	 *  [ false, 0 ] when fewer than 8 bytes could be read.
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return [ false, 0 ];
		}
		$arr = unpack( 'a4chunk/Nlength', $header );

		return [ $arr['chunk'], $arr['length'] ];
	}

	/**
	 * Advance the file position past a chunk body and its padding byte, if any.
	 *
	 * @param resource $file Open file handle positioned at the start of the chunk body
	 * @param int $chunkLength Number of body bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		if ( ( $chunkLength & 1 ) && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Locate the first FORM:DJVU page inside a multi-page document and read its INFO.
	 *
	 * @param resource $file Open file handle positioned just after the 16-byte file header
	 * @param int $formLength Length of the enclosing FORM chunk
	 * @return array|false Page information from getPageInfo(), or false if no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			[ $chunk, $length ] = $this->readChunk( $file );
			if ( $chunk === false ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				if ( $length < 4 ) {
					// A FORM must at least hold its 4-byte subtype; skipping
					// $length - 4 bytes would otherwise seek backwards.
					wfDebug( __METHOD__ . ": FORM chunk too short to hold a subtype" );
					break;
				}
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage" );

					return $this->getPageInfo( $file );
				}
				// The subtype has already been consumed from the chunk body.
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages" );

		return false;
	}

	/**
	 * Read and decode the INFO chunk of a page.
	 *
	 * @param resource $file Open file handle positioned at the page's first chunk header
	 * @return array|false Array with 'width', 'height', 'version', 'resolution' and
	 *  'gamma' keys, or false if the INFO chunk is missing, too short or cut off.
	 */
	private function getPageInfo( $file ) {
		[ $chunk, $length ] = $this->readChunk( $file );
		if ( $chunk !== 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length" );

			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off" );

			return false;
		}

		// Width and height are 16-bit big-endian, the resolution is 16-bit
		// little-endian, and the remaining fields are single bytes.
		$arr = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return [
			'width' => $arr['width'],
			'height' => $arr['height'],
			'version' => "{$arr['major']}.{$arr['minor']}",
			'resolution' => $arr['resolution'],
			'gamma' => $arr['gamma'] / 10.0 ];
	}

	/**
	 * Return an array describing the DjVu image
	 *
	 * Runs the configured djvudump and/or djvutxt tools, either through a boxed
	 * shell command or through wfShellExec(), and converts their output.
	 *
	 * @return array|null|false False if the file is not valid or the boxed command
	 *  failed; an empty array if neither tool is configured; otherwise an array
	 *  with a 'text' key (list of page texts) and, when the dump could be parsed,
	 *  a 'data' key.
	 */
	public function retrieveMetaData() {
		$config = MediaWikiServices::getInstance()->getMainConfig();
		$djvuDump = $config->get( MainConfigNames::DjvuDump );
		$djvuTxt = $config->get( MainConfigNames::DjvuTxt );
		$djvuUseBoxedCommand = $config->get( MainConfigNames::DjvuUseBoxedCommand );
		$shell = $config->get( MainConfigNames::ShellboxShell );
		if ( !$this->isValid() ) {
			return false;
		}

		if ( $djvuTxt === null && $djvuDump === null ) {
			return [];
		}

		$txt = null;
		$dump = null;

		if ( $djvuUseBoxedCommand ) {
			$command = MediaWikiServices::getInstance()->getShellCommandFactory()
				->createBoxed( 'djvu' )
				->disableNetwork()
				->firejailDefaultSeccomp()
				->routeName( 'djvu-metadata' )
				->params( $shell, 'scripts/retrieveDjvuMetaData.sh' )
				->inputFileFromFile(
					'scripts/retrieveDjvuMetaData.sh',
					__DIR__ . '/scripts/retrieveDjvuMetaData.sh' )
				->inputFileFromFile( 'file.djvu', $this->mFilename )
				->memoryLimit( self::DJVUTXT_MEMORY_LIMIT );
			// Only request the outputs whose tool is configured.
			$env = [];
			if ( $djvuDump !== null ) {
				$env['DJVU_DUMP'] = $djvuDump;
				$command->outputFileToString( 'dump' );
			}
			if ( $djvuTxt !== null ) {
				$env['DJVU_TXT'] = $djvuTxt;
				$command->outputFileToString( 'txt' );
			}

			$result = $command
				->environment( $env )
				->execute();
			if ( $result->getExitCode() !== 0 ) {
				wfDebug( 'retrieveDjvuMetaData failed with exit code ' . $result->getExitCode() );
				return false;
			}
			if ( $djvuDump !== null ) {
				if ( $result->wasReceived( 'dump' ) ) {
					$dump = $result->getFileContents( 'dump' );
				} else {
					wfDebug( __METHOD__ . ": did not receive dump file" );
				}
			}

			if ( $djvuTxt !== null ) {
				if ( $result->wasReceived( 'txt' ) ) {
					$txt = $result->getFileContents( 'txt' );
				} else {
					wfDebug( __METHOD__ . ": did not receive text file" );
				}
			}
		} else { // No boxedcommand
			if ( $djvuDump !== null ) {
				# djvudump is faster than djvutoxml (now abandoned) as of version 3.5
				# https://sourceforge.net/p/djvu/bugs/71/
				$cmd = Shell::escape( $djvuDump ) . ' ' . Shell::escape( $this->mFilename );
				$dump = wfShellExec( $cmd );
			}
			if ( $djvuTxt !== null ) {
				$cmd = Shell::escape( $djvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
				wfDebug( __METHOD__ . ": $cmd" );
				$retval = 0;
				$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
				if ( $retval !== 0 ) {
					// A failed djvutxt run is treated as "no text layer".
					$txt = null;
				}
			}
		}

		# Convert dump to array
		$json = [];
		if ( $dump !== null ) {
			$data = $this->convertDumpToJSON( $dump );
			if ( $data !== false ) {
				$json = [ 'data' => $data ];
			}
		}

		# Text layer
		$json['text'] = $txt !== null ? $this->extractPageTexts( $txt ) : [];

		return $json;
	}

	/**
	 * Split the output of djvutxt into one cleaned-up string per page.
	 *
	 * @param string $txt Raw output of "djvutxt --detail=page"
	 * @return string[] Page texts in the order they were matched
	 */
	private function extractPageTexts( $txt ) {
		# Strip some control characters
		# Remove the literal escape sequence \013 (octal 013 is a vertical tab)
		$txt = preg_replace( "/\\\\013/", "", $txt );
		# Replace runs of OCR region separators with a single extra line break
		$txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt );

		$reg = <<<EOR
			/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
			((?>    # Text to match is composed of atoms of either:
				\\\\. # - any escaped character
				|     # - any character different from " and \
				[^"\\\\]+
			)*?)
			"\s*\)
			| # Or page can be empty ; in this case, djvutxt dumps ()
			\(\s*()\)/sx
EOR;
		$matches = [];
		preg_match_all( $reg, $txt, $matches );

		return array_map( [ $this, 'pageTextCallback' ], $matches[1] );
	}

	/**
	 * Unescape one page's text and remove invalid UTF-8 from it.
	 *
	 * @param string $match Escaped page text as captured from the djvutxt output
	 * @return string
	 */
	private function pageTextCallback( string $match ) {
		# Get rid of invalid UTF-8
		$val = UtfNormal\Validator::cleanUp( stripcslashes( $match ) );
		return str_replace( '�', '', $val );
	}

	/**
	 * Convert the textual output of djvudump into a page list.
	 *
	 * Tokenises the dump with strtok(); parseFormDjvu() continues reading from
	 * the same tokeniser state, so the two methods must be used together.
	 *
	 * @param string $dump
	 * @return array|false Array with a 'pages' key, or false if the dump is empty,
	 *  unrecognised, describes an indirect document, or contains a page without
	 *  a parsable INFO line.
	 */
	private function convertDumpToJSON( $dump ) {
		if ( (string)$dump === '' ) {
			return false;
		}

		$dump = str_replace( "\r", '', $dump );
		$line = strtok( $dump, "\n" );
		if ( $line === false ) {
			// The dump consisted of line breaks only.
			return false;
		}

		$m = [];
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			$parsed = $this->parseFormDjvu( $line );

			return $parsed ? [ 'pages' => [ $parsed ] ] : false;
		}

		if ( !preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			return false;
		}

		# Multi-page
		$parentLevel = strlen( $m[1] );
		$pages = [];
		# Find DIRM
		$line = strtok( "\n" );
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
				wfDebug( "Indirect multi-page DjVu document, bad for server!" );

				return false;
			}

			if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
				# Found page
				$parsed = $this->parseFormDjvu( $line );
				if ( !$parsed ) {
					return false;
				}
				$pages[] = $parsed;
			}
			$line = strtok( "\n" );
		}

		// A multi-page document without any page is reported as a failure.
		return $pages ? [ 'pages' => $pages ] : false;
	}

	/**
	 * Read the INFO line belonging to a FORM:DJVU entry of a djvudump listing.
	 *
	 * Continues the strtok() tokenisation started by convertDumpToJSON().
	 *
	 * @param string $line The FORM:DJVU line; its indentation defines the parent level
	 * @return array|false Array with 'height', 'width', 'dpi' and 'gamma' keys, or
	 *  false if no INFO line was found among the more deeply indented lines.
	 */
	private function parseFormDjvu( $line ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );
		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match(
				'/^ *INFO *\[\d*] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				return [
					'height' => (int)$m[2],
					'width' => (int)$m[1],
					'dpi' => (float)$m[3],
					'gamma' => (float)$m[4],
				];
			}
			$line = strtok( "\n" );
		}

		# Not found
		return false;
	}
}