MediaWiki REL1_33
PdfImage.php
Go to the documentation of this file.
1<?php
26
/**
 * Reads page size, document metadata and the text layer of a PDF file by
 * running the external commands configured in $wgPdfInfo and $wgPdftoText.
 *
 * Inspired by djvuimage from Brion Vibber; modified and written by xarax.
 */
class PdfImage {

	/**
	 * @var string Path of the PDF file, passed as an argument to the external commands
	 */
	private $mFilename;

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Report whether the file is usable. No check is performed here; this
	 * always answers true.
	 *
	 * @return bool Always true
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the pixel dimensions of the first page.
	 *
	 * @return array|bool [ width, height, 'Pdf', HTML width/height attribute string ],
	 *  or false when no page size could be determined
	 */
	public function getImageSize() {
		$size = self::getPageSize( $this->retrieveMetaData(), 1 );
		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];
		return [ $width, $height, 'Pdf',
			"width=\"$width\" height=\"$height\"" ];
	}

	/**
	 * Compute the pixel size of one page from the parsed pdfinfo data.
	 *
	 * A per-page entry under $data['pages'][$page] takes precedence over the
	 * document-wide 'Page size' / 'Page rot' entries. The size string is
	 * expected to look like "<width> x <height> <anything>"; the two numbers
	 * are divided by 72 (presumably PDF points per inch) and multiplied by
	 * $wgPdfHandlerDpi.
	 *
	 * @param array|bool|null $data Metadata as produced by retrieveMetaData()
	 * @param int $page Page number used as key into $data['pages']
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when the
	 *  size is missing or cannot be parsed
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		if ( isset( $data['pages'][$page]['Page size'] ) ) {
			$sizeSpec = $data['pages'][$page]['Page size'];
		} elseif ( isset( $data['Page size'] ) ) {
			$sizeSpec = $data['Page size'];
		} else {
			return false;
		}
		if ( !is_string( $sizeSpec ) || !$sizeSpec ) {
			return false;
		}

		if ( isset( $data['pages'][$page]['Page rot'] ) ) {
			$rotation = $data['pages'][$page]['Page rot'];
		} elseif ( isset( $data['Page rot'] ) ) {
			$rotation = $data['Page rot'];
		} else {
			$rotation = 0;
		}

		// explode() never returns an empty array, so the piece count has to be
		// checked explicitly to detect a size string without an 'x' separator.
		$parts = explode( 'x', $sizeSpec, 2 );
		if ( count( $parts ) !== 2 ) {
			return false;
		}

		$rawWidth = trim( $parts[0] );
		// Whatever follows the height after a space is not part of the number.
		$heightBits = explode( ' ', trim( $parts[1] ), 2 );
		$rawHeight = trim( $heightBits[0] );

		// Arithmetic on non-numeric strings warns on PHP 7 and throws on PHP 8.
		if ( !is_numeric( $rawWidth ) || !is_numeric( $rawHeight ) ) {
			return false;
		}

		$width = intval( $rawWidth / 72 * $wgPdfHandlerDpi );
		$height = intval( $rawHeight / 72 * $wgPdfHandlerDpi );

		// An odd number of quarter turns swaps the two dimensions.
		// The explicit cast keeps the truncation the bitwise operator used to
		// apply implicitly.
		if ( ( (int)( (float)$rotation / 90 ) ) & 1 ) {
			// Swap width and height for landscape pages
			list( $width, $height ) = [ $height, $width ];
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Collect metadata and the text layer of the file.
	 *
	 * Runs $wgPdfInfo twice (XMP metadata, then per-page data) and parses the
	 * concatenated output; then runs $wgPdftoText and stores the per-page text
	 * under the 'text' key when that command exits with status 0.
	 *
	 * @return array|bool|null Parsed data; null when $wgPdfInfo is not set and no
	 *  text was extracted; false when pdfinfo produced no output and no text
	 *  was extracted
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		$data = null;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmdMeta = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-meta', # Report XMP metadata
				$this->mFilename,
			];
			$resultMeta = Shell::command( $cmdMeta )
				->execute();

			$cmdPages = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-l', '9999999', # Report page sizes for all pages
				$this->mFilename,
			];
			$resultPages = Shell::command( $cmdPages )
				->execute();

			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
			$data = $this->convertDumpToArray( $dump );
		}

		// Read text layer. A truthiness check (rather than isset) keeps a
		// setting of false or '' from launching a command.
		if ( $wgPdftoText ) {
			$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
			$result = Shell::command( $cmd )
				->execute();
			if ( $result->getExitCode() == 0 ) {
				$txt = str_replace( "\r\n", "\n", $result->getStdout() );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				// $data is null or false when no pdfinfo data is available;
				// start from an explicit array instead of auto-vivifying.
				if ( !is_array( $data ) ) {
					$data = [];
				}
				$data['text'] = $pages;
			}
		}

		return $data;
	}

	/**
	 * Parse the textual output of pdfinfo into an array.
	 *
	 * Each "Key: value" line becomes an entry; "Page N size" / "Page N rot"
	 * lines are grouped under $data['pages'][N]; everything after a
	 * "Metadata:" line is gathered as XMP. The result is then handed to
	 * postProcessDump().
	 *
	 * @param string $dump Output of the pdfinfo command(s)
	 * @return array|bool Parsed data, or false when $dump is empty
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( explode( "\n", $dump ) as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				// @phan-suppress-next-line PhanTypeInvalidDimOffset weird loop
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) < 2 ) {
				// Not a "key: value" line
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$field = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$field] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc).
	 *
	 * Maps selected pdfinfo keys onto metadata item names, merges them with
	 * any XMP results through BitmapMetadataHandler, removes the raw 'xmp'
	 * entry and stores the merged result under 'mergedMetadata'.
	 *
	 * @param array $data Parsed pdfinfo data from convertDumpToArray()
	 * @return array $data without 'xmp' and with 'mergedMetadata' added
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			switch ( $key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords.
					// Only empty strings are removed, so a keyword of "0"
					// survives, and the list is reindexed from zero.
					$keyList = array_values( array_filter(
						explode( ' ', $val ),
						function ( $keyword ) {
							return $keyword !== '';
						}
					) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php
					// should add a hook a there.
					// For reference, if encrypted this fields value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							// Used as a set: the size string is the key
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && XMPReader::isSupported() ) {
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via a hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			foreach ( $xmp->getResults() as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}

		// The raw XMP packet is not kept once it has been merged.
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}
and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
$line
Definition cdb.php:59
Class to deal with reconciling and extracting metadata from bitmap images.
PSR-3 logger instance factory.
Executes shell commands.
Definition Shell.php:44
inspired by djvuimage from Brion Vibber modified and written by xarax
Definition PdfImage.php:32
static getPageSize( $data, $page)
Definition PdfImage.php:74
postProcessDump(array $data)
Postprocess the metadata (convert xmp into useful form, etc)
Definition PdfImage.php:228
getImageSize()
Definition PdfImage.php:56
__construct( $filename)
Definition PdfImage.php:42
retrieveMetaData()
Definition PdfImage.php:119
string $mFilename
Definition PdfImage.php:37
convertDumpToArray( $dump)
Definition PdfImage.php:175
The ContentHandler facility adds support for arbitrary content types on wiki pages
$data
Utility to generate mapping file used in mw.Title (phpCharToUpper.json)
The wiki should then use memcached to cache various data. To use multiple servers, just add more items to the array. To increase the weight of a server, make its entry a subarray, e.g. array("192.168.0.1:11211", 2).
$lines
Definition router.php:61