Code Coverage for /workspace/src/extensions/PdfHandler/includes/PdfImage.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 158	0.00% covered (danger)	0.00%	0 / 6	CRAP	0.00% covered (danger)	0.00%	0 / 1
PdfImage	0.00% covered (danger)	0.00%	0 / 158	0.00% covered (danger)	0.00%	0 / 6	2352	0.00% covered (danger)	0.00%	0 / 1
__construct	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
isValid	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
getPageSize	0.00% covered (danger)	0.00%	0 / 25	0.00% covered (danger)	0.00%	0 / 1	56
retrieveMetaData	0.00% covered (danger)	0.00%	0 / 48	0.00% covered (danger)	0.00%	0 / 1	72
convertDumpToArray	0.00% covered (danger)	0.00%	0 / 25	0.00% covered (danger)	0.00%	0 / 1	90
postProcessDump	0.00% covered (danger)	0.00%	0 / 58	0.00% covered (danger)	0.00%	0 / 1	506

1	<?php
2	/**
3	*
4	* Copyright © 2007 Xarax <jodeldi@gmx.de>
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License along
17	* with this program; if not, write to the Free Software Foundation, Inc.,
18	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19	* http://www.gnu.org/copyleft/gpl.html
20	*/
21
22	namespace MediaWiki\Extension\PdfHandler;
23
24	use BitmapMetadataHandler;
25	use MediaWiki\Logger\LoggerFactory;
26	use MediaWiki\MediaWikiServices;
27	use UtfNormal\Validator;
28	use Wikimedia\XMPReader\Reader as XMPReader;
29
30	/**
31	* inspired by djvuimage from Brion Vibber
32	* modified and written by xarax
33	*/
34
class PdfImage {

	/**
	 * Path of the PDF file that is handed to the metadata extraction script.
	 *
	 * @var string
	 */
	private $mFilename;

	/**
	 * Metadata keys related to page count, page size and page rotation.
	 *
	 * getPageSize() reads 'pages', 'Page size' and 'Page rot' from the
	 * metadata array. NOTE(review): this constant is not referenced inside
	 * this class; presumably callers use it to select the subset of metadata
	 * needed for sizing - confirm against callers.
	 */
	public const ITEMS_FOR_PAGE_SIZE = [ 'Pages', 'pages', 'Page size', 'Page rot' ];

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Report whether the file is valid. No check is performed here, so this
	 * always returns true.
	 *
	 * @return bool
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Compute the pixel dimensions of a page from extracted metadata.
	 *
	 * The per-page entry ($data['pages'][$page]) takes precedence over the
	 * document-wide 'Page size' / 'Page rot' entries. The size string is
	 * expected to look like "<width> x <height> <anything>"; both numbers are
	 * truncated to integers, divided by 72 and multiplied by
	 * $wgPdfHandlerDpi. Width and height are swapped when the rotation is an
	 * odd multiple of 90.
	 *
	 * @param array $data Metadata as produced by retrieveMetaData()
	 * @param int $page Page number used as key into $data['pages']
	 * @return array|false [ 'width' => int, 'height' => int ], or false when
	 *  no page size is known or the size string has no 'x' separator
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		$pageSize = $data['pages'][$page]['Page size'] ?? $data['Page size'] ?? false;
		if ( !$pageSize ) {
			return false;
		}

		$pageRotation = $data['pages'][$page]['Page rot'] ?? $data['Page rot'] ?? 0;

		$size = explode( 'x', $pageSize, 2 );
		if ( count( $size ) < 2 ) {
			// Without a separator there is no height to read; treat the
			// size as unknown rather than reporting a 0x0 page.
			return false;
		}

		$width = intval( (int)trim( $size[0] ) / 72 * $wgPdfHandlerDpi );
		// The height is followed by a unit and optionally a paper format
		// name; only the leading number is of interest.
		$heightParts = explode( ' ', trim( $size[1] ), 2 );
		$height = intval( (int)trim( $heightParts[0] ) / 72 * $wgPdfHandlerDpi );

		// Integer division keeps the bitwise test well-defined for rotations
		// that are not exact multiples of 90.
		if ( intdiv( (int)$pageRotation, 90 ) & 1 ) {
			// Swap width and height for landscape pages
			[ $width, $height ] = [ $height, $width ];
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Run scripts/retrieveMetaData.sh on the file in a boxed shell command
	 * and convert its output files into a metadata array.
	 *
	 * The 'meta' and 'pages' output files are passed to
	 * convertDumpToArray(). When the 'text_exit_code' output file was
	 * received and contains 0, and the 'text' output is non-empty, the text
	 * is split on form feeds and stored as a list of per-page strings under
	 * the 'text' key.
	 *
	 * A non-zero exit code of the script is only logged via wfDebug().
	 *
	 * @return array
	 */
	public function retrieveMetaData(): array {
		global $wgPdfInfo, $wgPdftoText, $wgShellboxShell;

		$command = MediaWikiServices::getInstance()->getShellCommandFactory()
			->createBoxed( 'pdfhandler' )
			->disableNetwork()
			->firejailDefaultSeccomp()
			->routeName( 'pdfhandler-metadata' );

		$result = $command
			->params( $wgShellboxShell, 'scripts/retrieveMetaData.sh' )
			->inputFileFromFile(
				'scripts/retrieveMetaData.sh',
				__DIR__ . '/../scripts/retrieveMetaData.sh' )
			->inputFileFromFile( 'file.pdf', $this->mFilename )
			->outputFileToString( 'meta' )
			->outputFileToString( 'pages' )
			->outputFileToString( 'text' )
			->outputFileToString( 'text_exit_code' )
			->environment( [
				'PDFHANDLER_INFO' => $wgPdfInfo,
				'PDFHANDLER_TOTEXT' => $wgPdftoText,
			] )
			->execute();

		// Record in statsd
		MediaWikiServices::getInstance()->getStatsFactory()
			->getCounter( 'pdfhandler_shell_retrievemetadata_total' )
			->copyToStatsdAt( 'pdfhandler.shell.retrieve_meta_data' )
			->increment();

		// Metadata retrieval is allowed to fail, but we'd like to know why
		if ( $result->getExitCode() != 0 ) {
			wfDebug( __METHOD__ . ': retrieveMetaData.sh' .
				"\n\nExitcode: " . $result->getExitCode() . "\n\n"
				. $result->getStderr() );
		}

		$resultMeta = $result->getFileContents( 'meta' );
		$resultPages = $result->getFileContents( 'pages' );
		if ( $resultMeta !== null || $resultPages !== null ) {
			$data = $this->convertDumpToArray(
				$resultMeta ?? '',
				$resultPages ?? ''
			);
		} else {
			$data = [];
		}

		// Read text layer
		$retval = $result->wasReceived( 'text_exit_code' )
			? (int)trim( $result->getFileContents( 'text_exit_code' ) )
			: 1;
		$txt = $result->getFileContents( 'text' );
		// The text file may be missing even if the exit code file exists,
		// so guard against null before looking at the content.
		if ( $retval === 0 && $txt !== null && $txt !== '' ) {
			$txt = str_replace( "\r\n", "\n", $txt );
			$pages = explode( "\f", $txt );
			foreach ( $pages as $page => $pageText ) {
				// Get rid of invalid UTF-8, strip control characters
				// Note we need to do this per page, as \f page feed would be stripped.
				$pages[$page] = Validator::cleanUp( $pageText );
			}
			$data['text'] = $pages;
		}

		return $data;
	}

	/**
	 * Parse the "key: value" dump into an array and post-process it.
	 *
	 * Lines matching "Page <n> size" / "Page <n> rot" are stored under
	 * $data['pages'][<n>]['Page size'|'Page rot']; every other
	 * "key: value" line is stored as $data[key]. Everything after a
	 * "Metadata:" line is collected verbatim into $data['xmp']. A non-empty
	 * $metaDump replaces whatever was collected into $data['xmp'].
	 *
	 * @param string $metaDump XMP data; used as 'xmp' when non-empty after trimming
	 * @param string $infoDump Line based "key: value" dump
	 * @return array Result of postProcessDump(), or an empty array when
	 *  $infoDump is empty (regardless of $metaDump)
	 */
	protected function convertDumpToArray( $metaDump, $infoDump ): array {
		if ( strval( $infoDump ) === '' ) {
			return [];
		}

		$lines = explode( "\n", $infoDump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) <= 1 ) {
				// Not a "key: value" line
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$pageKey = $matches[2] == 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$pageKey] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		$metaDump = trim( $metaDump );
		if ( $metaDump !== '' ) {
			$data['xmp'] = $metaDump;
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * This is used to generate the metadata table at the bottom
	 * of the image description page.
	 *
	 * Known keys of $data are mapped to metadata item names and added to a
	 * BitmapMetadataHandler as 'native' metadata; if an 'xmp' entry exists
	 * and XMP reading is supported, its parsed sections are added as well.
	 * The 'xmp' entry is removed from the returned array and the merged
	 * result is stored under 'mergedMetadata'.
	 *
	 * @param array $data metadata
	 * @return array post-processed metadata
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			switch ( $key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords.
					// Only empty strings are removed (so a keyword like "0"
					// survives) and the result is reindexed as a list.
					$keyList = array_values( array_filter(
						explode( ' ', $val ),
						static function ( $keyword ) {
							return $keyword !== '';
						}
					) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && XMPReader::isSupported() ) {
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via a hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}