MediaWiki REL1_34
PdfImage.php
Go to the documentation of this file.
1<?php
24use UtfNormal\Validator;
25use Wikimedia\XMPReader\Reader as XMPReader;
26
/**
 * Reads page geometry, document metadata and the text layer of a PDF file by
 * running the external tools configured in $wgPdfInfo and $wgPdftoText.
 *
 * Inspired by djvuimage from Brion Vibber, modified and written by xarax.
 */
class PdfImage {

	/**
	 * @var string Path of the PDF file, as passed to the constructor
	 */
	private $mFilename;

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * No validation is performed; every file is reported as valid.
	 *
	 * @return bool Always true
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the pixel size of the first page, in getimagesize()-like form.
	 *
	 * @return array|bool [ width, height, 'Pdf', html attribute string ],
	 *  or false when the size of page 1 cannot be determined
	 */
	public function getImageSize() {
		$data = $this->retrieveMetaData();
		$size = self::getPageSize( $data, 1 );

		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];
		return [ $width, $height, 'Pdf',
			"width=\"$width\" height=\"$height\"" ];
	}

	/**
	 * Compute the pixel dimensions of a page at $wgPdfHandlerDpi.
	 *
	 * The size string is expected to look like "<width> x <height> <unit...>";
	 * both numbers are divided by 72 and multiplied by $wgPdfHandlerDpi.
	 * Per-page values take precedence over document-wide ones.
	 *
	 * @param array|bool|null $data Metadata as produced by retrieveMetaData()
	 * @param int $page Page number to look up under $data['pages']
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when no
	 *  usable size information is available
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		$sizeSpec = self::getPageField( $data, $page, 'Page size', false );
		if ( !is_string( $sizeSpec ) || $sizeSpec === '' ) {
			return false;
		}

		// explode() never returns an empty array, so check that there really
		// are two components instead of testing the result for truthiness.
		$parts = explode( 'x', $sizeSpec, 2 );
		if ( count( $parts ) !== 2 ) {
			return false;
		}

		// The height is followed by a unit and possibly a paper format name;
		// only the first whitespace-separated token is the number.
		$heightTokens = explode( ' ', trim( $parts[1] ), 2 );

		// floatval() rather than arithmetic on raw strings: malformed numbers
		// become 0 without warnings (PHP 7) or TypeErrors (PHP 8).
		$width = intval( floatval( trim( $parts[0] ) ) / 72 * $wgPdfHandlerDpi );
		$height = intval( floatval( trim( $heightTokens[0] ) ) / 72 * $wgPdfHandlerDpi );

		$rotation = self::getPageField( $data, $page, 'Page rot', 0 );
		if ( intdiv( (int)$rotation, 90 ) & 1 ) {
			// An odd number of quarter turns: swap width and height
			// for landscape pages
			return [
				'width' => $height,
				'height' => $width
			];
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Look up a field for a page, falling back to the document-wide value.
	 *
	 * @param array|bool|null $data Metadata array (non-arrays yield $default)
	 * @param int $page Page number
	 * @param string $field Field name, e.g. 'Page size' or 'Page rot'
	 * @param mixed $default Value returned when the field is set nowhere
	 * @return mixed
	 */
	private static function getPageField( $data, $page, $field, $default ) {
		if ( isset( $data['pages'][$page][$field] ) ) {
			return $data['pages'][$page][$field];
		}
		if ( isset( $data[$field] ) ) {
			return $data[$field];
		}
		return $default;
	}

	/**
	 * Collect metadata and the text layer of the file.
	 *
	 * Runs $wgPdfInfo twice (XMP metadata, then per-page information) and
	 * parses the combined output; then runs $wgPdftoText and stores the
	 * per-page text under the 'text' key.
	 *
	 * @return array|bool|null Parsed data; null when $wgPdfInfo is not
	 *  configured and no text was extracted, false when the info dump was
	 *  empty and no text was extracted
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmdMeta = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-meta', # Report XMP metadata
				$this->mFilename, # The file to inspect
			];
			$resultMeta = Shell::command( $cmdMeta )
				->execute();

			$cmdPages = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-l', '9999999', # Report page sizes for all pages
				$this->mFilename, # The file to inspect
			];
			$resultPages = Shell::command( $cmdPages )
				->execute();

			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
			$data = $this->convertDumpToArray( $dump );
		} else {
			$data = null;
		}

		// Read text layer. Test for a usable value, not mere existence, so
		// that a tool disabled with false or '' is not executed.
		if ( $wgPdftoText ) {
			$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
			$result = Shell::command( $cmd )
				->execute();
			if ( $result->getExitCode() == 0 ) {
				$txt = str_replace( "\r\n", "\n", $result->getStdout() );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				if ( !is_array( $data ) ) {
					// $data is null or false here; make the conversion
					// explicit instead of relying on auto-vivification.
					$data = [];
				}
				$data['text'] = $pages;
			}
		}
		return $data;
	}

	/**
	 * Parse a "Key: value" dump into an array and post-process it.
	 *
	 * Lines matching "Page <n> size" / "Page <n> rot" are stored under
	 * $data['pages'][<n>]; everything after a "Metadata:" line is gathered
	 * verbatim as XMP before being handed to postProcessDump().
	 *
	 * @param string $dump Raw tool output
	 * @return array|bool Parsed data, or false if the dump is empty
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$lines = explode( "\n", $dump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				// @phan-suppress-next-line PhanTypeInvalidDimOffset weird loop
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) < 2 ) {
				// Not a key/value line
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$field = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$field] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * Maps selected keys of the dump to metadata item names, merges them with
	 * any parsed XMP via BitmapMetadataHandler, removes the raw 'xmp' entry
	 * and stores the merged result under 'mergedMetadata'.
	 *
	 * @param array $data Data as parsed by convertDumpToArray()
	 * @return array $data without 'xmp' and with 'mergedMetadata' added
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			switch ( $key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords,
					// but keep non-empty falsy ones such as "0".
					$keyList = array_filter(
						explode( ' ', $val ),
						static function ( $keyword ) {
							return $keyword !== '';
						}
					);
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php;
					// we should add a hook there.
					// For reference, if encrypted this field's value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					if ( !is_array( $val ) ) {
						// Only the per-page structure can be iterated
						break;
					}
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							// Used as a set: keys are the distinct size strings
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && XMPReader::isSupported() ) {
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via a hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
$line
Definition cdb.php:59
Class to deal with reconciling and extracting metadata from bitmap images.
PSR-3 logger instance factory.
Executes shell commands.
Definition Shell.php:44
inspired by djvuimage from Brion Vibber modified and written by xarax
Definition PdfImage.php:32
static getPageSize( $data, $page)
Definition PdfImage.php:74
postProcessDump(array $data)
Postprocess the metadata (convert xmp into useful form, etc)
Definition PdfImage.php:228
getImageSize()
Definition PdfImage.php:56
__construct( $filename)
Definition PdfImage.php:42
retrieveMetaData()
Definition PdfImage.php:119
string $mFilename
Definition PdfImage.php:37
convertDumpToArray( $dump)
Definition PdfImage.php:175
$lines
Definition router.php:61