MediaWiki REL1_32
PdfImage.php
Go to the documentation of this file.
1<?php
24use UtfNormal\Validator;
25use Wikimedia\XMPReader\Reader as XMPReader;
26
/**
 * Reads page dimensions, document metadata and the text layer of a PDF file
 * by running the external commands configured in $wgPdfInfo and $wgPdftoText.
 *
 * Inspired by djvuimage from Brion Vibber; modified and written by xarax.
 */
class PdfImage {

	/**
	 * @var string Path of the PDF file, as passed to the constructor
	 */
	private $mFilename;

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Report whether the file is usable. No check is performed here.
	 *
	 * @return bool Always true
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the dimensions of the first page.
	 *
	 * @return array|bool [ width, height, 'Pdf', 'width="W" height="H"' ],
	 *  or false when no page size could be determined
	 */
	public function getImageSize() {
		$data = $this->retrieveMetaData();
		$size = self::getPageSize( $data, 1 );

		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];

		return [
			$width,
			$height,
			'Pdf',
			"width=\"$width\" height=\"$height\""
		];
	}

	/**
	 * Compute the size of one page from previously retrieved metadata.
	 *
	 * The per-page entry ($data['pages'][$page]) is preferred; the
	 * document-wide 'Page size' / 'Page rot' entries are the fallback.
	 * The reported numbers are scaled by $wgPdfHandlerDpi / 72
	 * (presumably pdfinfo reports points, i.e. 1/72 inch - confirm against
	 * the pdfinfo documentation).
	 *
	 * @param array|bool|null $data Metadata as returned by retrieveMetaData()
	 * @param int $page Page number to look up
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when
	 *  the page size is missing or is not of the form "<width> x <height> ..."
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		if ( isset( $data['pages'][$page]['Page size'] ) ) {
			$pageSize = $data['pages'][$page]['Page size'];
		} elseif ( isset( $data['Page size'] ) ) {
			$pageSize = $data['Page size'];
		} else {
			$pageSize = false;
		}

		if ( !$pageSize ) {
			return false;
		}

		if ( isset( $data['pages'][$page]['Page rot'] ) ) {
			$rotation = $data['pages'][$page]['Page rot'];
		} elseif ( isset( $data['Page rot'] ) ) {
			$rotation = $data['Page rot'];
		} else {
			$rotation = 0;
		}

		// explode() always returns at least one element, so the number of
		// parts has to be checked explicitly: without an 'x' separator
		// there is no height to read.
		$parts = explode( 'x', $pageSize, 2 );
		if ( count( $parts ) < 2 ) {
			return false;
		}

		// The height is followed by a space and further text, which is dropped.
		$heightParts = explode( ' ', trim( $parts[1] ), 2 );

		// Cast explicitly: arithmetic on a non-numeric string warns on
		// PHP 7 and throws a TypeError on PHP 8.
		$width = intval( (float)trim( $parts[0] ) / 72 * $wgPdfHandlerDpi );
		$height = intval( (float)trim( $heightParts[0] ) / 72 * $wgPdfHandlerDpi );

		if ( (int)( (int)$rotation / 90 ) & 1 ) {
			// Odd multiple of 90 degrees: swap width and height for
			// landscape pages
			$swap = $width;
			$width = $height;
			$height = $swap;
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Run the configured external tools on the file and collect the results.
	 *
	 * When $wgPdfInfo is set, its output is parsed by convertDumpToArray().
	 * When $wgPdftoText is set and the command exits with status 0, the
	 * text of each page (split on form feed) is stored under the 'text' key.
	 *
	 * @return array|bool|null Parsed data; null when $wgPdfInfo is not set
	 *  and no text was extracted, false when the pdfinfo output was empty
	 *  and no text was extracted
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmdMeta = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-meta', # Report XMP metadata
				$this->mFilename, # The file to inspect
			];
			$resultMeta = Shell::command( $cmdMeta )
				->execute();

			$cmdPages = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-l', '9999999', # Report page sizes for all pages
				$this->mFilename, # The file to inspect
			];
			$resultPages = Shell::command( $cmdPages )
				->execute();

			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
			$data = $this->convertDumpToArray( $dump );
		} else {
			$data = null;
		}

		// Read text layer
		if ( isset( $wgPdftoText ) ) {
			$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
			$result = Shell::command( $cmd )
				->execute();
			$retval = $result->getExitCode();
			$txt = $result->getStdout();
			if ( $retval == 0 ) {
				$txt = str_replace( "\r\n", "\n", $txt );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				if ( !is_array( $data ) ) {
					// $data is null or false here; start from an explicit
					// array instead of relying on auto-vivification.
					$data = [];
				}
				$data['text'] = $pages;
			}
		}

		return $data;
	}

	/**
	 * Parse the textual pdfinfo output into an array.
	 *
	 * @param string $dump Concatenated pdfinfo output
	 * @return array|bool Result of postProcessDump() on the parsed lines,
	 *  or false when $dump is empty
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$lines = explode( "\n", $dump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) <= 1 ) {
				// No colon on this line, so it is not a key/value pair
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$field = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$field] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * Selected pdfinfo fields are renamed and added to a
	 * BitmapMetadataHandler as 'native' metadata. If an 'xmp' entry is
	 * present and XMPReader is supported, each section of its parsed
	 * results is added as well. The 'xmp' entry is then removed and the
	 * handler's metadata array is stored under 'mergedMetadata'.
	 *
	 * @param array $data Key/value pairs parsed from the pdfinfo output
	 * @return array $data without 'xmp' and with 'mergedMetadata' added
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			// Numeric-looking keys are stored as integers by PHP; compare
			// as strings so that a key of 0 cannot loosely match a
			// case label on PHP 7.
			switch ( (string)$key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords
					$keyList = array_filter( explode( ' ', $val ) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php;
					// a hook should be added there.
					// For reference, if encrypted this field's value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && XMPReader::isSupported() ) {
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via a hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();

		return $data;
	}
}
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
$line
Definition cdb.php:59
Class to deal with reconciling and extracting metadata from bitmap images.
PSR-3 logger instance factory.
Executes shell commands.
Definition Shell.php:44
inspired by djvuimage from Brion Vibber modified and written by xarax
Definition PdfImage.php:32
static getPageSize( $data, $page)
Definition PdfImage.php:74
postProcessDump(array $data)
Postprocess the metadata (convert xmp into useful form, etc)
Definition PdfImage.php:227
getImageSize()
Definition PdfImage.php:56
__construct( $filename)
Definition PdfImage.php:42
retrieveMetaData()
Definition PdfImage.php:119
string $mFilename
Definition PdfImage.php:37
convertDumpToArray( $dump)
Definition PdfImage.php:175
The ContentHandler facility adds support for arbitrary content types on wiki pages
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a local account incomplete not yet checked for validity & $retval
Definition hooks.txt:266
The wiki should then use memcached to cache various data To use multiple just add more items to the array To increase the weight of a make its entry a array("192.168.0.1:11211", 2))
$lines
Definition router.php:61