MediaWiki  1.34.0
PdfImage.php
Go to the documentation of this file.
1 <?php
24 use UtfNormal\Validator;
25 use Wikimedia\XMPReader\Reader as XMPReader;
26 
/**
 * Extracts page geometry, document metadata and the text layer of a PDF file
 * by running the external tools configured in $wgPdfInfo and $wgPdftoText.
 */
class PdfImage {

	/**
	 * @var string Path of the PDF file that is handed to the external tools
	 */
	private $mFilename;

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * No validation is performed here; every file is reported as valid.
	 *
	 * @return bool Always true
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the pixel dimensions of the first page.
	 *
	 * @return array|bool [ width, height, 'Pdf', HTML width/height attribute
	 *  string ], or false when no page size could be determined
	 */
	public function getImageSize() {
		$data = $this->retrieveMetaData();
		$size = self::getPageSize( $data, 1 );

		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];
		return [ $width, $height, 'Pdf',
			"width=\"$width\" height=\"$height\"" ];
	}

	/**
	 * Compute the pixel size of one page at $wgPdfHandlerDpi.
	 *
	 * The per-page entry ($data['pages'][$page]) is preferred; the
	 * document-level 'Page size' / 'Page rot' values are the fallback.
	 * Sizes are divided by 72 before being scaled by the DPI setting.
	 *
	 * @param array|bool|null $data Metadata as returned by retrieveMetaData()
	 * @param int $page Page number used as key into $data['pages']
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when
	 *  the size is missing or cannot be parsed
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		$pageSize = $data['pages'][$page]['Page size'] ?? $data['Page size'] ?? false;
		if ( !$pageSize || !is_string( $pageSize ) ) {
			return false;
		}

		$rotation = $data['pages'][$page]['Page rot'] ?? $data['Page rot'] ?? 0;

		// Expected shape: "<width> x <height>[ <unit and paper name>]"
		$parts = explode( 'x', $pageSize, 2 );
		if ( count( $parts ) !== 2 ) {
			// No separator at all, so there is no height to read
			return false;
		}

		$rawWidth = trim( $parts[0] );
		// Anything after the first space of the height part is not a number
		$rawHeight = explode( ' ', trim( $parts[1] ), 2 )[0];
		if ( !is_numeric( $rawWidth ) || !is_numeric( $rawHeight ) ) {
			// Arithmetic on a non-numeric string is a TypeError on PHP 8
			return false;
		}

		$width = intval( (float)$rawWidth / 72 * $wgPdfHandlerDpi );
		$height = intval( (float)$rawHeight / 72 * $wgPdfHandlerDpi );

		// An odd number of quarter turns means the page is shown sideways.
		// intdiv() keeps the operand of "&" an integer.
		if ( intdiv( (int)$rotation, 90 ) & 1 ) {
			// Swap width and height for landscape pages
			list( $width, $height ) = [ $height, $width ];
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Run the configured external tools and collect their output.
	 *
	 * When $wgPdfInfo is set, it is run twice (XMP metadata, then per-page
	 * data) and the combined output is parsed by convertDumpToArray().
	 * When $wgPdftoText is set and exits with status 0, the text layer is
	 * stored under the 'text' key as one cleaned string per page.
	 *
	 * @return array|bool|null Parsed data; null when neither tool produced
	 *  anything, false when the pdfinfo output was empty and no text layer
	 *  was read
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		$data = null;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmdMeta = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-meta', # Report XMP metadata
				$this->mFilename
			];
			$resultMeta = Shell::command( $cmdMeta )
				->execute();

			$cmdPages = [
				$wgPdfInfo,
				'-enc', 'UTF-8', # Report metadata as UTF-8 text...
				'-l', '9999999', # Report page sizes for all pages
				$this->mFilename
			];
			$resultPages = Shell::command( $cmdPages )
				->execute();

			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
			$data = $this->convertDumpToArray( $dump );
		}

		// Read text layer. A truthiness check is used so that a tool
		// disabled with false or '' is not executed.
		if ( $wgPdftoText ) {
			$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
			$result = Shell::command( $cmd )
				->execute();
			if ( $result->getExitCode() == 0 ) {
				$txt = str_replace( "\r\n", "\n", $result->getStdout() );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				if ( !is_array( $data ) ) {
					// $data is null or false here; start from an explicit array
					// instead of relying on auto-vivification
					$data = [];
				}
				$data['text'] = $pages;
			}
		}

		return $data;
	}

	/**
	 * Parse "Key: value" tool output into an array.
	 *
	 * Lines are split on the first colon. "Page N size" / "Page N rot" keys
	 * are stored under $data['pages'][N]. Once a "Metadata:" line is seen,
	 * every remaining line is appended to the 'xmp' entry. The result is
	 * passed through postProcessDump() before being returned.
	 *
	 * @param string $dump Raw tool output
	 * @return array|bool Parsed data, or false if $dump is empty
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$data = [];
		// Set once the "Metadata:" marker has been seen
		$inMetadata = false;

		foreach ( explode( "\n", $dump ) as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				// @phan-suppress-next-line PhanTypeInvalidDimOffset weird loop
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) < 2 ) {
				// Not a key/value line
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$field = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$field] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * Known keys are mapped to metadata item names and added to a
	 * BitmapMetadataHandler as 'native' metadata. If an 'xmp' entry exists
	 * and XMPReader is supported, it is parsed and added as well. The 'xmp'
	 * entry is then removed and the merged result is stored under
	 * 'mergedMetadata'.
	 *
	 * @param array $data Parsed key/value data from convertDumpToArray()
	 * @return array $data without 'xmp' and with 'mergedMetadata' added
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			// Numeric-looking keys are stored as integers by PHP; compare as
			// strings so that e.g. key 0 cannot loosely match a case label.
			switch ( (string)$key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords,
					// but keep a literal "0" and return a gap-free list.
					$keyList = array_values( array_filter(
						explode( ' ', $val ),
						static function ( $keyword ) {
							return $keyword !== '';
						}
					) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				// pdfinfo labels the date fields "ModDate" and "CreationDate";
				// the "...Time" spellings are still accepted.
				case 'ModDate':
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationDate':
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php;
					// a hook should be added there.
					// For reference, if encrypted this field's value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					if ( !is_array( $val ) ) {
						// A plain "pages: ..." line is not per-page data
						break;
					}
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && XMPReader::isSupported() ) {
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via a hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}
MediaWiki\Shell\Shell
Executes shell commands.
Definition: Shell.php:44
PdfImage\$mFilename
string $mFilename
Definition: PdfImage.php:37
PdfImage\retrieveMetaData
retrieveMetaData()
Definition: PdfImage.php:119
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1869
PdfImage\getPageSize
static getPageSize( $data, $page)
Definition: PdfImage.php:74
PdfImage\convertDumpToArray
convertDumpToArray( $dump)
Definition: PdfImage.php:175
PdfImage
Inspired by DjVuImage by Brion Vibber; modified and written by xarax.
Definition: PdfImage.php:32
BitmapMetadataHandler
Class to deal with reconciling and extracting metadata from bitmap images.
Definition: BitmapMetadataHandler.php:37
MediaWiki\Logger\LoggerFactory
PSR-3 logger instance factory.
Definition: LoggerFactory.php:45
$matches
$matches
Definition: NoLocalSettings.php:24
$t
$t
Definition: make-normalization-table.php:143
$lines
$lines
Definition: router.php:61
PdfImage\__construct
__construct( $filename)
Definition: PdfImage.php:42
$line
$line
Definition: cdb.php:59
PdfImage\postProcessDump
postProcessDump(array $data)
Postprocess the metadata (convert xmp into useful form, etc)
Definition: PdfImage.php:228
PdfImage\getImageSize
getImageSize()
Definition: PdfImage.php:56
$type
$type
Definition: testCompression.php:48
PdfImage\isValid
isValid()
Definition: PdfImage.php:49