Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 158 |
|
0.00% |
0 / 6 |
CRAP | |
0.00% |
0 / 1 |
PdfImage | |
0.00% |
0 / 158 |
|
0.00% |
0 / 6 |
2352 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isValid | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getPageSize | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
56 | |||
retrieveMetaData | |
0.00% |
0 / 48 |
|
0.00% |
0 / 1 |
72 | |||
convertDumpToArray | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
90 | |||
postProcessDump | |
0.00% |
0 / 58 |
|
0.00% |
0 / 1 |
506 |
1 | <?php |
2 | /** |
3 | * |
4 | * Copyright © 2007 Xarax <jodeldi@gmx.de> |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation; either version 2 of the License, or |
9 | * (at your option) any later version. |
10 | * |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU General Public License along |
17 | * with this program; if not, write to the Free Software Foundation, Inc., |
18 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
19 | * http://www.gnu.org/copyleft/gpl.html |
20 | */ |
21 | |
22 | namespace MediaWiki\Extension\PdfHandler; |
23 | |
24 | use BitmapMetadataHandler; |
25 | use MediaWiki\Logger\LoggerFactory; |
26 | use MediaWiki\MediaWikiServices; |
27 | use UtfNormal\Validator; |
28 | use Wikimedia\XMPReader\Reader as XMPReader; |
29 | |
30 | /** |
31 | * inspired by djvuimage from Brion Vibber |
32 | * modified and written by xarax |
33 | */ |
34 | |
class PdfImage {

	/**
	 * Filesystem path to the PDF file this object describes.
	 * @var string
	 */
	private $mFilename;

	// pdfinfo metadata keys consulted when computing a page's rendered size
	// (per-page values live under the 'pages' key; see getPageSize()).
	public const ITEMS_FOR_PAGE_SIZE = [ 'Pages', 'pages', 'Page size', 'Page rot' ];

	/**
	 * @param string $filename Path to the PDF file on disk
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Whether this file is usable as a PDF.
	 *
	 * Always returns true here; actual failures surface later when
	 * metadata extraction runs (see retrieveMetaData()).
	 *
	 * @return bool
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Compute the pixel dimensions of one page, scaling from PDF points
	 * (1/72 inch) to the configured $wgPdfHandlerDpi.
	 *
	 * @param array $data Metadata array as produced by retrieveMetaData()
	 * @param int $page Page number (key into $data['pages'])
	 * @return array|bool [ 'width' => int, 'height' => int ] on success,
	 *   or false when no page-size information is available
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		// Prefer the per-page size; fall back to the document-level value.
		if ( isset( $data['pages'][$page]['Page size'] ) ) {
			$pageSize = $data['pages'][$page]['Page size'];
		} elseif ( isset( $data['Page size'] ) ) {
			$pageSize = $data['Page size'];
		} else {
			$pageSize = false;
		}

		if ( $pageSize ) {
			// Rotation resolves the same way: per-page, then document-level,
			// then assume unrotated.
			if ( isset( $data['pages'][$page]['Page rot'] ) ) {
				$pageRotation = $data['pages'][$page]['Page rot'];
			} elseif ( isset( $data['Page rot'] ) ) {
				$pageRotation = $data['Page rot'];
			} else {
				$pageRotation = 0;
			}
			// Page size is of the form "<width> x <height> pts ..."
			// (pdfinfo output); split on the 'x' separator.
			$size = explode( 'x', $pageSize, 2 );

			$width = intval( (int)trim( $size[0] ) / 72 * $wgPdfHandlerDpi );
			// The height half still carries trailing text (e.g. "792 pts");
			// take only the leading number.
			$height = explode( ' ', trim( $size[1] ), 2 );
			$height = intval( (int)trim( $height[0] ) / 72 * $wgPdfHandlerDpi );
			// An odd multiple of 90 degrees means the page displays sideways.
			if ( ( $pageRotation / 90 ) & 1 ) {
				// Swap width and height for landscape pages
				$temp = $width;
				$width = $height;
				$height = $temp;
			}

			return [
				'width' => $width,
				'height' => $height
			];
		}

		return false;
	}

	/**
	 * Run pdfinfo and pdftotext (via the bundled retrieveMetaData.sh,
	 * inside a sandboxed Shellbox command) against the file, and return
	 * the parsed metadata.
	 *
	 * Extraction failures are tolerated: on a non-zero exit code this
	 * logs the stderr output and returns whatever (possibly empty)
	 * metadata was produced.
	 *
	 * @return array Metadata from convertDumpToArray(), plus a 'text'
	 *   key (array of per-page text) when the text layer was extracted
	 */
	public function retrieveMetaData(): array {
		global $wgPdfInfo, $wgPdftoText, $wgShellboxShell;

		// Sandboxed command: no network, seccomp-restricted.
		$command = MediaWikiServices::getInstance()->getShellCommandFactory()
			->createBoxed( 'pdfhandler' )
			->disableNetwork()
			->firejailDefaultSeccomp()
			->routeName( 'pdfhandler-metadata' );

		// The script writes four output files: pdfinfo document metadata
		// ('meta'), per-page info ('pages'), the extracted text layer
		// ('text') and pdftotext's exit status ('text_exit_code').
		$result = $command
			->params( $wgShellboxShell, 'scripts/retrieveMetaData.sh' )
			->inputFileFromFile(
				'scripts/retrieveMetaData.sh',
				__DIR__ . '/../scripts/retrieveMetaData.sh' )
			->inputFileFromFile( 'file.pdf', $this->mFilename )
			->outputFileToString( 'meta' )
			->outputFileToString( 'pages' )
			->outputFileToString( 'text' )
			->outputFileToString( 'text_exit_code' )
			->environment( [
				'PDFHANDLER_INFO' => $wgPdfInfo,
				'PDFHANDLER_TOTEXT' => $wgPdftoText,
			] )
			->execute();

		// Record in statsd
		MediaWikiServices::getInstance()->getStatsFactory()
			->getCounter( 'pdfhandler_shell_retrievemetadata_total' )
			->copyToStatsdAt( 'pdfhandler.shell.retrieve_meta_data' )
			->increment();

		// Metadata retrieval is allowed to fail, but we'd like to know why
		if ( $result->getExitCode() != 0 ) {
			wfDebug( __METHOD__ . ': retrieveMetaData.sh' .
				"\n\nExitcode: " . $result->getExitCode() . "\n\n"
				. $result->getStderr() );
		}

		$resultMeta = $result->getFileContents( 'meta' );
		$resultPages = $result->getFileContents( 'pages' );
		if ( $resultMeta !== null || $resultPages !== null ) {
			$data = $this->convertDumpToArray(
				$resultMeta ?? '',
				$resultPages ?? ''
			);
		} else {
			$data = [];
		}

		// Read text layer; treat a missing exit-code file as failure (1).
		$retval = $result->wasReceived( 'text_exit_code' )
			? (int)trim( $result->getFileContents( 'text_exit_code' ) )
			: 1;
		$txt = $result->getFileContents( 'text' );
		if ( $retval == 0 && strlen( $txt ) ) {
			$txt = str_replace( "\r\n", "\n", $txt );
			// pdftotext separates pages with form feeds.
			$pages = explode( "\f", $txt );
			foreach ( $pages as $page => $pageText ) {
				// Get rid of invalid UTF-8, strip control characters
				// Note we need to do this per page, as \f page feed would be stripped.
				$pages[$page] = Validator::cleanUp( $pageText );
			}
			$data['text'] = $pages;
		}

		return $data;
	}

	/**
	 * Parse the raw pdfinfo output into a keyed array.
	 *
	 * @param string $metaDump Raw XMP metadata dump (may be empty)
	 * @param string $infoDump "Key: value" lines from pdfinfo
	 * @return array Post-processed metadata (see postProcessDump());
	 *   empty array when $infoDump is empty
	 */
	protected function convertDumpToArray( $metaDump, $infoDump ): array {
		if ( strval( $infoDump ) === '' ) {
			return [];
		}

		$lines = explode( "\n", $infoDump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				$data['xmp'] .= "\n$line";
				continue;
			}
			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) > 1 ) {
				$key = trim( $bits[0] );
				if ( $key === 'Metadata' ) {
					$inMetadata = true;
					$data['xmp'] = '';
					continue;
				}
				$value = trim( $bits[1] );
				$matches = [];
				// "Page xx rot" will be in poppler 0.20's pdfinfo output
				// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
				if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
					$data['pages'][$matches[1]][$matches[2] == 'size' ? 'Page size' : 'Page rot'] = $value;
				} else {
					$data[$key] = $value;
				}
			}
		}
		// A separately-dumped XMP blob takes precedence over any inline
		// "Metadata:" section gathered above.
		$metaDump = trim( $metaDump );
		if ( $metaDump !== '' ) {
			$data['xmp'] = $metaDump;
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * This is used to generate the metadata table at the bottom
	 * of the image description page.
	 *
	 * Maps pdfinfo keys onto EXIF-style property names, merges in any
	 * parsed XMP properties, and stores the combined result under
	 * 'mergedMetadata' (removing the raw 'xmp' blob).
	 *
	 * @param array $data metadata
	 * @return array post-processed metadata
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			switch ( $key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords
					$keyList = array_filter( explode( ' ', $val ) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}

		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && XMPReader::isSupported() ) {
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via a hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}