46 private const DJVUTXT_MEMORY_LIMIT = 300000000;
55 $this->mFilename = $filename;
63 $info = $this->getInfo();
65 return $info !==
false;
73 $data = $this->getInfo();
75 if ( $data !==
false ) {
77 'width' => $data[
'width'],
78 'height' => $data[
'height']
91 $file = fopen( $this->mFilename,
'rb' );
93 $arr = unpack(
'a4magic/a4chunk/NchunkLength',
$header );
94 $chunk = $arr[
'chunk'];
95 $chunkLength = $arr[
'chunkLength'];
96 echo
"$chunk $chunkLength\n";
97 $this->dumpForm(
$file, $chunkLength, 1 );
101 private function dumpForm(
$file, $length, $indent ) {
102 $start = ftell(
$file );
103 $secondary = fread(
$file, 4 );
104 echo str_repeat(
' ', $indent * 4 ) .
"($secondary)\n";
105 while ( ftell(
$file ) - $start < $length ) {
106 $chunkHeader = fread(
$file, 8 );
107 if ( $chunkHeader ==
'' ) {
110 $arr = unpack(
'a4chunk/NchunkLength', $chunkHeader );
111 $chunk = $arr[
'chunk'];
112 $chunkLength = $arr[
'chunkLength'];
113 echo str_repeat(
' ', $indent * 4 ) .
"$chunk $chunkLength\n";
115 if ( $chunk ==
'FORM' ) {
116 $this->dumpForm(
$file, $chunkLength, $indent + 1 );
118 fseek(
$file, $chunkLength, SEEK_CUR );
119 if ( $chunkLength & 1 ) {
121 fseek(
$file, 1, SEEK_CUR );
127 private function getInfo() {
128 AtEase::suppressWarnings();
129 $file = fopen( $this->mFilename,
'rb' );
130 AtEase::restoreWarnings();
131 if (
$file ===
false ) {
132 wfDebug( __METHOD__ .
": missing or failed file read" );
140 if ( strlen(
$header ) < 16 ) {
141 wfDebug( __METHOD__ .
": too short file header" );
143 $arr = unpack(
'a4magic/a4form/NformLength/a4subtype',
$header );
145 $subtype = $arr[
'subtype'];
146 if ( $arr[
'magic'] !=
'AT&T' ) {
147 wfDebug( __METHOD__ .
": not a DjVu file" );
148 } elseif ( $subtype ==
'DJVU' ) {
150 $info = $this->getPageInfo(
$file );
151 } elseif ( $subtype ==
'DJVM' ) {
153 $info = $this->getMultiPageInfo(
$file, $arr[
'formLength'] );
155 wfDebug( __METHOD__ .
": unrecognized DJVU file type '{$arr['subtype']}'" );
163 private function readChunk(
$file ) {
168 $arr = unpack(
'a4chunk/Nlength',
$header );
170 return [ $arr[
'chunk'], $arr[
'length'] ];
174 private function skipChunk(
$file, $chunkLength ) {
175 fseek(
$file, $chunkLength, SEEK_CUR );
177 if ( ( $chunkLength & 1 ) && !feof(
$file ) ) {
179 fseek(
$file, 1, SEEK_CUR );
183 private function getMultiPageInfo(
$file, $formLength ) {
186 $start = ftell(
$file );
188 list( $chunk, $length ) = $this->readChunk(
$file );
193 if ( $chunk ==
'FORM' ) {
194 $subtype = fread(
$file, 4 );
195 if ( $subtype ==
'DJVU' ) {
196 wfDebug( __METHOD__ .
": found first subpage" );
198 return $this->getPageInfo(
$file );
200 $this->skipChunk(
$file, $length - 4 );
202 wfDebug( __METHOD__ .
": skipping '$chunk' chunk" );
203 $this->skipChunk(
$file, $length );
205 }
while ( $length != 0 && !feof(
$file ) && ftell(
$file ) - $start < $formLength );
207 wfDebug( __METHOD__ .
": multi-page DJVU file contained no pages" );
212 private function getPageInfo(
$file ) {
213 list( $chunk, $length ) = $this->readChunk(
$file );
214 if ( $chunk !=
'INFO' ) {
215 wfDebug( __METHOD__ .
": expected INFO chunk, got '$chunk'" );
221 wfDebug( __METHOD__ .
": INFO should be 9 or 10 bytes, found $length" );
225 $data = fread(
$file, $length );
226 if ( strlen( $data ) < $length ) {
227 wfDebug( __METHOD__ .
": INFO chunk cut off" );
240 # Newer files have rotation info in byte 10, but we don't use it yet.
243 'width' => $arr[
'width'],
244 'height' => $arr[
'height'],
245 'version' =>
"{$arr['major']}.{$arr['minor']}",
246 'resolution' => $arr[
'resolution'],
247 'gamma' => $arr[
'gamma'] / 10.0 ];
255 $djvuDump = MediaWikiServices::getInstance()->getMainConfig()->get( MainConfigNames::DjvuDump );
256 $djvuTxt = MediaWikiServices::getInstance()->getMainConfig()->get( MainConfigNames::DjvuTxt );
261 if ( isset( $djvuDump ) ) {
262 # djvudump is faster than djvutoxml (now abandoned) as of version 3.5
264 $cmd = Shell::escape( $djvuDump ) .
' ' . Shell::escape( $this->mFilename );
266 $json = [
'data' => $this->convertDumpToJSON( $dump ) ];
271 if ( isset( $djvuTxt ) ) {
272 $cmd = Shell::escape( $djvuTxt ) .
' --detail=page ' . Shell::escape( $this->mFilename );
273 wfDebug( __METHOD__ .
": $cmd" );
275 $txt =
wfShellExec( $cmd, $retval, [], [
'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
277 if ( $retval == 0 ) {
278 # Strip some control characters
279 # Ignore carriage returns
280 $txt = preg_replace(
"/\\\\013/",
"", $txt );
281 # Replace runs of OCR region separators with a single extra line break
282 $txt = preg_replace(
"/(?:\\\\(035|037))+/",
"\n", $txt );
285 /\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*
"
286 ((?> # Text to match is composed of atoms of either:
287 \\\\. # - any escaped character
288 | # - any character different from " and \
292 | # Or page can be empty ; in
this case, djvutxt dumps ()
296 preg_match_all( $reg, $txt,
$matches );
297 $json[
'text'] = array_map( [ $this,
'pageTextCallback' ],
$matches[1] );
304 private function pageTextCallback(
string $match ) {
305 # Get rid of invalid UTF-8
306 $val = UtfNormal\Validator::cleanUp( stripcslashes( $match ) );
307 $val = str_replace(
'�',
'', $val );
315 private function convertDumpToJSON( $dump ) {
316 if ( strval( $dump ) ==
'' ) {
320 $dump = str_replace(
"\r",
'', $dump );
321 $line = strtok( $dump,
"\n" );
325 if ( preg_match(
'/^( *)FORM:DJVU/',
$line, $m ) ) {
327 $parsed = $this->parseFormDjvu(
$line );
333 $result[
'pages'] = [ $parsed ];
334 } elseif ( preg_match(
'/^( *)FORM:DJVM/',
$line, $m ) ) {
336 $parentLevel = strlen( $m[1] );
338 $line = strtok(
"\n" );
339 $result[
'pages'] = [];
340 while (
$line !==
false ) {
341 $childLevel = strspn(
$line,
' ' );
342 if ( $childLevel <= $parentLevel ) {
347 if ( preg_match(
'/^ *DIRM.*indirect/',
$line ) ) {
348 wfDebug(
"Indirect multi-page DjVu document, bad for server!" );
353 if ( preg_match(
'/^ *FORM:DJVU/',
$line ) ) {
355 $parsed = $this->parseFormDjvu(
$line );
361 $result[
'pages'][] = $parsed;
363 $line = strtok(
"\n" );
373 private function parseFormDjvu(
$line ) {
374 $parentLevel = strspn(
$line,
' ' );
375 $line = strtok(
"\n" );
377 while (
$line !==
false ) {
378 $childLevel = strspn(
$line,
' ' );
379 if ( $childLevel <= $parentLevel ) {
385 '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
390 'height' => (int)$m[2],
391 'width' => (
int)$m[1],
392 'dpi' => (float)$m[3],
393 'gamma' => (
float)$m[4],
396 $line = strtok(
"\n" );
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfShellExec( $cmd, &$retval=null, $environ=[], $limits=[], $options=[])
Execute a shell command, with time and memory limits mirrored from the PHP configuration if supported...