MediaWiki
REL1_31
PdfImage.php
Go to the documentation of this file.
1
<?php
22
use
MediaWiki\Logger\LoggerFactory
;
23
use
MediaWiki\Shell\Shell
;
24
use UtfNormal\Validator;
25
31
/**
 * Extracts page geometry, document metadata and the text layer of a PDF file
 * by running the external commands configured in $wgPdfInfo and $wgPdftoText.
 */
class PdfImage {

	/**
	 * @var string File name passed as an argument to the external commands
	 */
	private $mFilename;

	/**
	 * @param string $filename File name of the PDF to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * @return bool Always true; no validation of the file is performed here
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the dimensions of page 1 in the layout of PHP's getimagesize().
	 *
	 * @return array|bool [ width, height, 'Pdf', 'width="W" height="H"' ],
	 *  or false when no usable page size could be determined
	 */
	public function getImageSize() {
		$data = $this->retrieveMetaData();
		$size = self::getPageSize( $data, 1 );

		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];
		return [
			$width,
			$height,
			'Pdf',
			"width=\"$width\" height=\"$height\""
		];
	}

	/**
	 * Compute the pixel dimensions of one page from parsed pdfinfo data.
	 *
	 * The per-page entry ($data['pages'][$page]) is preferred; the document
	 * level 'Page size' / 'Page rot' entries are used as a fallback. The raw
	 * numbers are divided by 72 and multiplied by $wgPdfHandlerDpi
	 * (presumably converting PostScript points to pixels).
	 *
	 * @param array|null|bool $data Structure produced by retrieveMetaData()
	 * @param int $page Page number used as key into $data['pages']
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when the
	 *  page size is missing or not in the expected "<number> x <number> ..." form
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		if ( isset( $data['pages'][$page]['Page size'] ) ) {
			$rawSize = $data['pages'][$page]['Page size'];
		} elseif ( isset( $data['Page size'] ) ) {
			$rawSize = $data['Page size'];
		} else {
			$rawSize = false;
		}

		if ( !$rawSize || !is_string( $rawSize ) ) {
			return false;
		}

		if ( isset( $data['pages'][$page]['Page rot'] ) ) {
			$rawRotation = $data['pages'][$page]['Page rot'];
		} elseif ( isset( $data['Page rot'] ) ) {
			$rawRotation = $data['Page rot'];
		} else {
			$rawRotation = 0;
		}

		// explode() never returns an empty array, so the shape has to be
		// checked explicitly: exactly two parts are needed.
		$parts = explode( 'x', $rawSize, 2 );
		if ( count( $parts ) !== 2 ) {
			return false;
		}

		$rawWidth = trim( $parts[0] );
		// The height is followed by further text after a space; keep the
		// first word only.
		$heightBits = explode( ' ', trim( $parts[1] ), 2 );
		$rawHeight = trim( $heightBits[0] );

		// Arithmetic on non-numeric strings throws a TypeError on PHP 8.
		if ( !is_numeric( $rawWidth ) || !is_numeric( $rawHeight ) ) {
			return false;
		}

		$width = intval( (float)$rawWidth / 72 * $wgPdfHandlerDpi );
		$height = intval( (float)$rawHeight / 72 * $wgPdfHandlerDpi );

		// An unparsable rotation is treated as "not rotated".
		$rotation = is_numeric( $rawRotation ) ? (float)$rawRotation : 0.0;
		// Truncate explicitly before the bit test; an odd number of quarter
		// turns means the page is shown in landscape.
		if ( intval( $rotation / 90 ) & 1 ) {
			// Swap width and height for landscape pages
			list( $width, $height ) = [ $height, $width ];
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Run the configured external tools and collect their output.
	 *
	 * When $wgPdfInfo is set, it is executed twice (XMP metadata, then page
	 * sizes) and the combined output is parsed by convertDumpToArray(). When
	 * $wgPdftoText is set and exits with code 0, its output is split on form
	 * feeds and stored per page under the 'text' key.
	 *
	 * @return array|bool|null Parsed data; null when neither tool produced
	 *  anything, false when the pdfinfo output was empty and no text layer
	 *  was read
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmdMeta = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-meta', # Report XMP metadata
				$this->mFilename,
			];
			$resultMeta = Shell::command( $cmdMeta )
				->execute();

			$cmdPages = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-l', '9999999', # Report page sizes for all pages
				$this->mFilename,
			];
			$resultPages = Shell::command( $cmdPages )
				->execute();

			// NOTE(review): the -meta output precedes the page listing here. If it
			// contains a "Metadata:" line, convertDumpToArray() appends every later
			// line (including the page listing) to the xmp value - confirm against
			// the pdfinfo version in use.
			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
			$data = $this->convertDumpToArray( $dump );
		} else {
			$data = null;
		}

		// Read text layer. A falsy setting (null, false, '') means the tool is
		// not configured, consistent with the $wgPdfInfo check above.
		if ( $wgPdftoText ) {
			$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
			$result = Shell::command( $cmd )
				->execute();
			$retval = $result->getExitCode();
			$txt = $result->getStdout();
			if ( $retval == 0 ) {
				$txt = str_replace( "\r\n", "\n", $txt );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				// $data may be null or false at this point; writing an offset
				// on false is deprecated as of PHP 8.1, so normalise first.
				if ( !is_array( $data ) ) {
					$data = [];
				}
				$data['text'] = $pages;
			}
		}

		return $data;
	}

	/**
	 * Parse the textual output of pdfinfo into an array.
	 *
	 * Each "key: value" line becomes an entry; "Page N size" / "Page N rot"
	 * lines are grouped under $data['pages'][N]. After a "Metadata:" line all
	 * remaining lines are collected verbatim, then the whole array is passed
	 * through postProcessDump().
	 *
	 * @param string $dump Raw pdfinfo output
	 * @return array|bool Parsed data, or false when $dump is empty
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) === '' ) {
			return false;
		}

		$lines = explode( "\n", $dump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) < 2 ) {
				// No colon on this line: nothing to record.
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$field = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$field] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		$data = $this->postProcessDump( $data );
		return $data;
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * Selected pdfinfo keys are mapped to metadata field names and registered
	 * with a BitmapMetadataHandler as 'native' data; XMP, when present and the
	 * xml extension is available, is parsed and added as well. The 'xmp' entry
	 * is removed and the merged result stored under 'mergedMetadata'.
	 *
	 * @param array $data Key/value data built by convertDumpToArray()
	 * @return array $data without 'xmp' and with 'mergedMetadata' added
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			// Numeric-looking keys are stored as integers by PHP; compare as
			// strings so that e.g. key 0 cannot loosely match a case label.
			switch ( (string)$key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords
					// (but keep a literal "0", which is a valid keyword).
					$keyList = array_filter(
						explode( ' ', $val ),
						function ( $keyword ) {
							return $keyword !== '';
						}
					);
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php
					// should add a hook a there.
					// For reference, if encrypted this fields value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					if ( !is_array( $val ) ) {
						// Not the per-page structure; nothing to collect.
						break;
					}
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && function_exists( 'xml_parser_create_ns' ) ) {
			// func exists verifies that the xml extension required for XMPReader
			// is present (Almost always is present)
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via XMPInfo hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition
GlobalFunctions.php:1980
$matches
$matches
Definition
NoLocalSettings.php:24
$line
$line
Definition
cdb.php:59
BitmapMetadataHandler
Class to deal with reconciling and extracting metadata from bitmap images.
Definition
BitmapMetadataHandler.php:36
MediaWiki\Logger\LoggerFactory
PSR-3 logger instance factory.
Definition
LoggerFactory.php:45
MediaWiki\Shell\Shell
Executes shell commands.
Definition
Shell.php:44
PdfImage
inspired by djvuimage from Brion Vibber modified and written by xarax
Definition
PdfImage.php:31
PdfImage\getPageSize
static getPageSize( $data, $page)
Definition
PdfImage.php:73
PdfImage\postProcessDump
postProcessDump(array $data)
Postprocess the metadata (convert xmp into useful form, etc)
Definition
PdfImage.php:226
PdfImage\getImageSize
getImageSize()
Definition
PdfImage.php:55
PdfImage\__construct
__construct( $filename)
Definition
PdfImage.php:41
PdfImage\retrieveMetaData
retrieveMetaData()
Definition
PdfImage.php:118
PdfImage\$mFilename
string $mFilename
Definition
PdfImage.php:36
PdfImage\convertDumpToArray
convertDumpToArray( $dump)
Definition
PdfImage.php:174
PdfImage\isValid
isValid()
Definition
PdfImage.php:48
XMPReader
Class for reading xmp data containing properties relevant to images, and spitting out an array that F...
Definition
XMP.php:53
pages
The ContentHandler facility adds support for arbitrary content types on wiki pages
Definition
contenthandler.txt:1
$retval
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a local account incomplete not yet checked for validity & $retval
Definition
hooks.txt:266
$lines
$lines
Definition
router.php:61
$value
$value
Definition
styleTest.css.php:45
$t
$t
Definition
testCompression.php:69
$type
$type
Definition
testCompression.php:48
extensions
PdfHandler
includes
PdfImage.php
Generated on Mon Nov 25 2024 15:34:22 for MediaWiki by
1.10.0