MediaWiki REL1_31
PdfImage.php
Go to the documentation of this file.
1<?php
24use UtfNormal\Validator;
25
/**
 * Extracts page sizes, document metadata and the text layer of a PDF file
 * by running the configured pdfinfo and pdftotext commands.
 *
 * Inspired by djvuimage from Brion Vibber, modified and written by xarax.
 */
class PdfImage {

	/**
	 * @var string Path of the PDF file handed to the external commands
	 */
	private $mFilename;

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * @return bool Always true; no validation is performed here
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the dimensions of the first page.
	 *
	 * @return array|bool [ width, height, 'Pdf', 'width="…" height="…"' ],
	 *  or false when no usable page size could be determined
	 */
	public function getImageSize() {
		$data = $this->retrieveMetaData();
		$size = self::getPageSize( $data, 1 );

		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];
		return [ $width, $height, 'Pdf',
			"width=\"$width\" height=\"$height\"" ];
	}

	/**
	 * Compute the size of one page, scaled from 72 units per inch to
	 * $wgPdfHandlerDpi.
	 *
	 * A per-page 'Page size' / 'Page rot' entry under $data['pages'][$page]
	 * takes precedence over the document-wide 'Page size' / 'Page rot' keys.
	 *
	 * @param array|bool|null $data Metadata as returned by retrieveMetaData()
	 * @param int $page Page number to look up in $data['pages']
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when
	 *  the size is missing or not of the form "<number> x <number> ..."
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		if ( isset( $data['pages'][$page]['Page size'] ) ) {
			$pageSize = $data['pages'][$page]['Page size'];
		} elseif ( isset( $data['Page size'] ) ) {
			$pageSize = $data['Page size'];
		} else {
			return false;
		}

		if ( !is_string( $pageSize ) ) {
			return false;
		}

		if ( isset( $data['pages'][$page]['Page rot'] ) ) {
			$rotation = $data['pages'][$page]['Page rot'];
		} elseif ( isset( $data['Page rot'] ) ) {
			$rotation = $data['Page rot'];
		} else {
			$rotation = 0;
		}

		// explode() never returns an empty array, so the number of parts
		// has to be checked explicitly to detect a value without an 'x'.
		$parts = explode( 'x', $pageSize, 2 );
		if ( count( $parts ) !== 2 ) {
			return false;
		}

		$rawWidth = trim( $parts[0] );
		// Anything after the first space following the height is dropped
		$heightBits = explode( ' ', trim( $parts[1] ), 2 );
		$rawHeight = trim( $heightBits[0] );

		// Arithmetic on non-numeric strings warns (PHP 7) or throws (PHP 8)
		if ( !is_numeric( $rawWidth ) || !is_numeric( $rawHeight ) ) {
			return false;
		}

		$width = intval( $rawWidth / 72 * $wgPdfHandlerDpi );
		$height = intval( $rawHeight / 72 * $wgPdfHandlerDpi );

		// An odd number of quarter turns means the page is displayed sideways.
		// Truncate explicitly so '&' is never applied to a fractional float.
		$quarterTurns = is_numeric( $rotation ) ? intval( $rotation / 90 ) : 0;
		if ( $quarterTurns & 1 ) {
			// Swap width and height for landscape pages
			return [
				'width' => $height,
				'height' => $width
			];
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Run pdfinfo (if $wgPdfInfo is set) and pdftotext (if $wgPdftoText is
	 * set) on the file and collect their output.
	 *
	 * @return array|bool|null Parsed pdfinfo data, plus a 'text' key holding
	 *  the per-page text when pdftotext exits with code 0. Without text,
	 *  this is null when pdfinfo is not configured and false when pdfinfo
	 *  produced no output.
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmdMeta = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-meta', # Report XMP metadata
				$this->mFilename,
			];
			$resultMeta = Shell::command( $cmdMeta )
				->execute();

			$cmdPages = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-l', '9999999', # Report page sizes for all pages
				$this->mFilename,
			];
			$resultPages = Shell::command( $cmdPages )
				->execute();

			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
			$data = $this->convertDumpToArray( $dump );
		} else {
			$data = null;
		}

		// Read text layer. Truthiness check (rather than isset) so that an
		// empty or false setting skips the call, matching the $wgPdfInfo test.
		if ( $wgPdftoText ) {
			$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
			$result = Shell::command( $cmd )
				->execute();
			$retval = $result->getExitCode();
			$txt = $result->getStdout();
			if ( $retval == 0 ) {
				$txt = str_replace( "\r\n", "\n", $txt );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				if ( !is_array( $data ) ) {
					// $data is null or false when no pdfinfo data is available;
					// start from an empty array instead of writing an offset
					// onto a non-array value.
					$data = [];
				}
				$data['text'] = $pages;
			}
		}
		return $data;
	}

	/**
	 * Parse "key: value" lines of pdfinfo output into an array and pass the
	 * result through postProcessDump().
	 *
	 * "Page N size" / "Page N rot" lines are stored under
	 * $data['pages'][N]['Page size'|'Page rot']; everything after a
	 * "Metadata:" line is collected verbatim under the 'xmp' key before
	 * post-processing.
	 *
	 * @param string $dump Combined pdfinfo output
	 * @return array|bool Parsed data, or false if $dump is empty
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) === '' ) {
			return false;
		}

		$lines = explode( "\n", $dump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				$data['xmp'] .= "\n$line";
				continue;
			}
			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) < 2 ) {
				// Not a "key: value" line
				continue;
			}
			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}
			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$field = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$field] = $value;
			} else {
				$data[$key] = $value;
			}
		}
		$data = $this->postProcessDump( $data );
		return $data;
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * Maps selected pdfinfo keys onto metadata item names, merges them with
	 * any parsed XMP data via BitmapMetadataHandler, removes the raw 'xmp'
	 * key and stores the merged result under 'mergedMetadata'.
	 *
	 * @param array $data Parsed pdfinfo data from convertDumpToArray()
	 * @return array $data with 'xmp' removed and 'mergedMetadata' added
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			// Cast to string: numeric-looking keys are stored as integers, and
			// a loose switch comparison of an integer against the case labels
			// can match the wrong case on PHP 7.
			switch ( (string)$key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords.
					// Only empty strings are removed (a keyword "0" is kept)
					// and the list is reindexed so it has no gaps.
					$keyList = array_values( array_filter(
						explode( ' ', $val ),
						function ( $keyword ) {
							return $keyword !== '';
						}
					) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				// NOTE(review): pdfinfo appears to label these fields
				// 'ModDate' / 'CreationDate' - confirm that the 'ModTime' and
				// 'CreationTime' keys below are ever present in the output.
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php;
					// a hook should be added there.
					// For reference, if encrypted this field's value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					if ( !is_array( $val ) ) {
						// A literal "pages: ..." line would leave a string here
						break;
					}
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && function_exists( 'xml_parser_create_ns' ) ) {
			// func exists verifies that the xml extension required for XMPReader
			// is present (Almost always is present)
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via XMPInfo hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
$line
Definition cdb.php:59
Class to deal with reconciling and extracting metadata from bitmap images.
PSR-3 logger instance factory.
Executes shell commands.
Definition Shell.php:44
inspired by djvuimage from Brion Vibber modified and written by xarax
Definition PdfImage.php:31
static getPageSize( $data, $page)
Definition PdfImage.php:73
postProcessDump(array $data)
Postprocess the metadata (convert xmp into useful form, etc)
Definition PdfImage.php:226
getImageSize()
Definition PdfImage.php:55
__construct( $filename)
Definition PdfImage.php:41
retrieveMetaData()
Definition PdfImage.php:118
string $mFilename
Definition PdfImage.php:36
convertDumpToArray( $dump)
Definition PdfImage.php:174
Class for reading xmp data containing properties relevant to images, and spitting out an array that F...
Definition XMP.php:53
The ContentHandler facility adds support for arbitrary content types on wiki pages
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a local account incomplete not yet checked for validity & $retval
Definition hooks.txt:266
$lines
Definition router.php:61