Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
39.67% |
73 / 184 |
|
12.50% |
2 / 16 |
CRAP | |
0.00% |
0 / 1 |
ParserFileProcessingHookHandlers | |
39.67% |
73 / 184 |
|
12.50% |
2 / 16 |
992.56 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
onParserTestGlobals | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
onParserModifyImageHTML | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
onParserAfterTidy | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
11.32 | |||
findBestImages | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
10.02 | |||
addPageImageCandidateToParserOutput | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
processThisTitle | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
calcWidth | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
getScore | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
scoreFromTable | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
4.02 | |||
isImageFree | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
fetchFileMetadata | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getRatio | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
getDenylist | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
30 | |||
getDbDenylist | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
getUrlDenylist | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 |
1 | <?php |
2 | |
3 | namespace PageImages\Hooks; |
4 | |
5 | use Exception; |
6 | use File; |
7 | use FormatMetadata; |
8 | use MediaWiki\Config\Config; |
9 | use MediaWiki\Context\DerivativeContext; |
10 | use MediaWiki\Hook\ParserAfterTidyHook; |
11 | use MediaWiki\Hook\ParserModifyImageHTMLHook; |
12 | use MediaWiki\Hook\ParserTestGlobalsHook; |
13 | use MediaWiki\Http\HttpRequestFactory; |
14 | use MediaWiki\Linker\LinksMigration; |
15 | use MediaWiki\MainConfigNames; |
16 | use MediaWiki\Page\PageReference; |
17 | use MediaWiki\Parser\Parser; |
18 | use MediaWiki\Parser\ParserOutput; |
19 | use MediaWiki\Title\TitleFactory; |
20 | use PageImages\PageImageCandidate; |
21 | use PageImages\PageImages; |
22 | use RepoGroup; |
23 | use RuntimeException; |
24 | use Wikimedia\ObjectCache\WANObjectCache; |
25 | use Wikimedia\Rdbms\IConnectionProvider; |
26 | |
27 | /** |
28 | * Handlers for parser hooks. |
29 | * |
30 | * The ParserModifyImageHTML hook handler collects candidate images, and marks |
31 | * them with a temporary HTML comment in the parser output. |
32 | * |
33 | * The ParserAfterTidy hook handler processes the candidate images, identifying |
34 | * the best image and the best free image. If $wgPageImagesLeadSectionOnly is |
35 | * set, images following the first section header are discarded. It removes the |
36 | * temporary comments and saves the resulting best images to page_props. |
37 | * |
38 | * The various query interfaces will retrieve the lead image from page_props. |
39 | * |
40 | * @license WTFPL |
41 | * @author Max Semenik |
42 | * @author Thiemo Kreuz |
43 | */ |
44 | class ParserFileProcessingHookHandlers implements |
45 | ParserAfterTidyHook, |
46 | ParserModifyImageHTMLHook, |
47 | ParserTestGlobalsHook |
48 | { |
49 | private const CANDIDATE_REGEX = '/<!--MW-PAGEIMAGES-CANDIDATE-([0-9]+)-->/'; |
50 | |
51 | protected Config $config; |
52 | private RepoGroup $repoGroup; |
53 | private WANObjectCache $mainWANObjectCache; |
54 | private HttpRequestFactory $httpRequestFactory; |
55 | private IConnectionProvider $connectionProvider; |
56 | private TitleFactory $titleFactory; |
57 | private LinksMigration $linksMigration; |
58 | |
/**
 * Stores the services this hook handler works with.
 *
 * @param Config $config Source of the PageImages* and core settings read by this class
 * @param RepoGroup $repoGroup Used to look up files by name
 * @param WANObjectCache $mainWANObjectCache Cache holding the merged denylist
 * @param HttpRequestFactory $httpRequestFactory Used to fetch URL based denylists
 * @param IConnectionProvider $connectionProvider Used to query database based denylists
 * @param TitleFactory $titleFactory Used to build titles for denylist pages and entries
 * @param LinksMigration $linksMigration Supplies field names and query info for pagelinks
 */
public function __construct(
	Config $config,
	RepoGroup $repoGroup,
	WANObjectCache $mainWANObjectCache,
	HttpRequestFactory $httpRequestFactory,
	IConnectionProvider $connectionProvider,
	TitleFactory $titleFactory,
	LinksMigration $linksMigration
) {
	// Settings
	$this->config = $config;

	// File lookup
	$this->repoGroup = $repoGroup;

	// Denylist retrieval and caching
	$this->mainWANObjectCache = $mainWANObjectCache;
	$this->httpRequestFactory = $httpRequestFactory;
	$this->connectionProvider = $connectionProvider;
	$this->titleFactory = $titleFactory;
	$this->linksMigration = $linksMigration;
}
76 | |
/**
 * ParserTestGlobals hook handler. Adds default values for the PageImages
 * settings used by this class; entries already present in $globals are kept.
 *
 * @param array &$globals
 */
public function onParserTestGlobals( &$globals ) {
	$scoreTables = [
		'width' => [
			200 => 10,
			1000 => 20
		],
		'position' => [],
		'ratio' => [],
		'galleryImageWidth' => []
	];

	$defaults = [
		'wgPageImagesScores' => $scoreTables,
		'wgPageImagesLeadSectionOnly' => true
	];

	// Array union: keys that already exist in $globals are not overwritten
	$globals += $defaults;
}
94 | |
/**
 * ParserModifyImageHTML hook handler. Records the image as a candidate in the
 * parser output and appends a marker comment carrying the candidate's index to
 * the image HTML, so that the position of the image can be determined later.
 *
 * Nothing happens when the parser has no page or the page's namespace is not
 * one that is processed.
 *
 * @param Parser $parser
 * @param File $file
 * @param array $params
 * @param string &$html
 */
public function onParserModifyImageHTML(
	Parser $parser,
	File $file,
	array $params,
	string &$html
): void {
	$page = $parser->getPage();
	$shouldProcess = $page && $this->processThisTitle( $page );
	if ( !$shouldProcess ) {
		return;
	}

	// Make sure a display width is present before the candidate is built
	$this->calcWidth( $params, $file );

	$candidate = PageImageCandidate::newFromFileAndParams( $file, $params );
	$index = $this->addPageImageCandidateToParserOutput( $candidate, $parser->getOutput() );

	$html .= "<!--MW-PAGEIMAGES-CANDIDATE-$index-->";
}
123 | |
/**
 * ParserAfterTidy hook handler. Removes the candidate marker comments from the
 * text and from the indicators, and stores the best image and the best free
 * image as page properties.
 *
 * If PageImagesLeadSectionOnly is set, only candidates whose marker appears
 * before the first '<mw:editsection' in the text are considered.
 *
 * If the marker removal fails with a PCRE error, the text is left untouched and
 * no page properties are stored.
 *
 * @param Parser $parser
 * @param string &$text
 */
public function onParserAfterTidy( $parser, &$text ) {
	$parserOutput = $parser->getOutput();
	$allImages = $parserOutput->getExtensionData( 'pageImages' );
	if ( !$allImages ) {
		return;
	}

	// Byte offset at which the lead section ends, or false if there is no limit
	$leadEndPos = $this->config->get( 'PageImagesLeadSectionOnly' )
		? strpos( $text, '<mw:editsection' )
		: false;

	// Find and remove our special comments
	$images = [];
	$cleanedText = preg_replace_callback(
		self::CANDIDATE_REGEX,
		static function ( $m ) use ( $allImages, &$images, $leadEndPos ) {
			// With PREG_OFFSET_CAPTURE every match is a [ text, offset ] pair
			$offset = $m[0][1];
			$id = intval( $m[1][0] );
			$inLead = $leadEndPos === false || $offset < $leadEndPos;
			if ( $inLead && isset( $allImages[$id] ) ) {
				$images[] = PageImageCandidate::newFromArray( $allImages[$id] );
			}
			return '';
		},
		$text, -1, $count, PREG_OFFSET_CAPTURE
	);

	if ( $cleanedText === null ) {
		// preg_replace_callback() returns null on error. Assigning that to $text would
		// wipe the page HTML, and the candidate list may be incomplete, so bail out.
		wfLogWarning( __METHOD__ . ': removing candidate markers failed, PCRE error ' . preg_last_error() );
		return;
	}
	$text = $cleanedText;

	[ $bestImageName, $freeImageName ] = $this->findBestImages( $images );

	if ( $freeImageName ) {
		$parserOutput->setPageProperty( PageImages::getPropName( true ), $freeImageName );
	}

	// Only store the image if it's not free. Free image (if any) has already been stored above.
	if ( $bestImageName && $bestImageName !== $freeImageName ) {
		$parserOutput->setPageProperty( PageImages::getPropName( false ), $bestImageName );
	}

	// Strip comments from indicators (T298930)
	foreach ( $parserOutput->getIndicators() as $id => $value ) {
		$stripped = preg_replace( self::CANDIDATE_REGEX, '', $value );
		// A null result signals a PCRE error; keep the indicator as it is in that case
		if ( $stripped !== null && $stripped !== $value ) {
			$parserOutput->setIndicator( $id, $stripped );
		}
	}
	// We may have comments in TOC data - Parser::cleanupTocLine strips them for us.
}
179 | |
/**
 * Picks the highest scoring image and the highest scoring free image from a
 * list of candidates. Only images with a score above zero are eligible.
 *
 * @param PageImageCandidate[] $images
 * @return array{string|false,string|false} The best image, and the best free image
 */
private function findBestImages( array $images ) {
	$bestImageName = false;
	$freeImageName = false;

	if ( !$images ) {
		return [ $bestImageName, $freeImageName ];
	}

	// Score every candidate; a file that occurs several times keeps its highest score
	$scores = [];
	$position = 0;
	foreach ( $images as $candidate ) {
		$candidateScore = $this->getScore( $candidate, $position );
		$position++;

		$fileName = $candidate->getFileName();
		$previous = $scores[$fileName] ?? -1;
		$scores[$fileName] = max( $previous, $candidateScore );
	}

	foreach ( $scores as $name => $score ) {
		if ( !( $score > 0 ) ) {
			continue;
		}

		if ( !$bestImageName || $score > $scores[$bestImageName] ) {
			$bestImageName = $name;
		}

		// The licence check only runs for images that would beat the current best free one
		$beatsFree = !$freeImageName || $score > $scores[$freeImageName];
		if ( $beatsFree && $this->isImageFree( $name ) ) {
			$freeImageName = $name;
		}
	}

	return [ $bestImageName, $freeImageName ];
}
217 | |
/**
 * Appends the serialized $image to the 'pageImages' extension data of
 * $parserOutput.
 *
 * @param PageImageCandidate $image
 * @param ParserOutput $parserOutput
 * @return int Number of stored candidates minus one, i.e. the index of the new entry
 */
private function addPageImageCandidateToParserOutput(
	PageImageCandidate $image,
	ParserOutput $parserOutput
) {
	$stored = $parserOutput->getExtensionData( 'pageImages' );
	if ( !$stored ) {
		// Nothing recorded yet
		$stored = [];
	}

	$stored[] = $image->jsonSerialize();
	$parserOutput->setExtensionData( 'pageImages', $stored );

	return count( $stored ) - 1;
}
234 | |
/**
 * Returns true if data for this title should be saved, i.e. if the namespace
 * of the page is listed in the PageImagesNamespaces setting.
 *
 * The setting is read on every call. A function-level static cache would be
 * shared by all instances for the lifetime of the process and would keep
 * returning the first configuration seen, even when a different Config is
 * injected or the setting is overridden later (e.g. between tests).
 *
 * @param PageReference $pageReference
 *
 * @return bool
 */
private function processThisTitle( PageReference $pageReference ) {
	// Flip the list so the namespace can be looked up by key
	$namespaces = array_flip( $this->config->get( 'PageImagesNamespaces' ) );

	return isset( $namespaces[$pageReference->getNamespace()] );
}
248 | |
/**
 * Fills in $params['handler']['width'] when it is not set. This is an estimate
 * of the displayed size; it deliberately does not replicate the core size
 * calculation, because the editor's intention matters more than the exact number.
 *
 * The width is derived, in this order, from:
 * - the requested height, scaled by the file's aspect ratio,
 * - the default thumbnail size for thumb/thumbnail/frameless images (250 if
 *   that cannot be looked up),
 * - the width of the file itself.
 *
 * @param array[] &$params
 * @param File $file
 */
private function calcWidth( array &$params, File $file ) {
	if ( isset( $params['handler']['width'] ) ) {
		// Explicit width given, nothing to estimate
		return;
	}

	$hasUsableHeight = isset( $params['handler']['height'] ) && $file->getHeight() > 0;
	if ( $hasUsableHeight ) {
		$scale = $params['handler']['height'] / $file->getHeight();
		$width = $file->getWidth() * $scale;
	} else {
		$isThumbLike = isset( $params['frame']['thumbnail'] )
			|| isset( $params['frame']['thumb'] )
			|| isset( $params['frame']['frameless'] );

		if ( $isThumbLike ) {
			$thumbLimits = $this->config->get( MainConfigNames::ThumbLimits );
			$defaultUserOptions = $this->config->get( MainConfigNames::DefaultUserOptions );
			$width = $thumbLimits[$defaultUserOptions['thumbsize']]
				?? 250;
		} else {
			$width = $file->getWidth();
		}
	}

	$params['handler']['width'] = $width;
}
277 | |
/**
 * Calculates the score of a candidate image; higher is better. Images with the
 * class "notpageimage" and images on the denylist get -1000.
 *
 * The score is the sum of a width score (the handler width for standalone
 * images, the full width for gallery images), an optional bonus for the
 * position on the page, and a score for the width/height ratio.
 *
 * @param PageImageCandidate $image The candidate to score
 * @param int $position Image order on page
 *
 * @return float
 */
protected function getScore( PageImageCandidate $image, $position ) {
	// Exclude images with class="notpageimage"
	if ( preg_match( '/(?:^|\s)notpageimage(?=\s|$)/', $image->getFrameClass() ) ) {
		return -1000;
	}

	$tables = $this->config->get( 'PageImagesScores' );

	$handlerWidth = $image->getHandlerWidth();
	// A handler width means a standalone image, otherwise the image is from a gallery
	$score = $handlerWidth
		? $this->scoreFromTable( $handlerWidth, $tables['width'] )
		: $this->scoreFromTable( $image->getFullWidth(), $tables['galleryImageWidth'] );

	$score += $tables['position'][$position] ?? 0;

	// The ratio table is keyed by ratio times ten, truncated to an integer
	$ratio = intval( $this->getRatio( $image ) * 10 );
	$score += $this->scoreFromTable( $ratio, $tables['ratio'] );

	$denylist = $this->getDenylist();

	return isset( $denylist[$image->getFileName()] ) ? -1000 : $score;
}
316 | |
/**
 * Looks up a score in a table of ranges. The table maps upper boundaries to
 * scores; the result is the score of the smallest boundary that is not below
 * $value. If $value exceeds every boundary, the score of the largest boundary
 * is used. An empty table yields 0.
 *
 * @param int $value The number that the various bounds are compared against
 *  to calculate the score
 * @param float[] $scores Table of scores for different ranges of $value
 *
 * @return float
 */
protected function scoreFromTable( $value, array $scores ) {
	// Sort the boundaries in increasing order, so that the first boundary that is
	// large enough is the correct one.
	ksort( $scores, SORT_NUMERIC );

	$selected = 0;
	foreach ( $scores as $upperBoundary => $candidate ) {
		$selected = $candidate;

		if ( $upperBoundary >= $value ) {
			break;
		}
	}

	if ( !is_numeric( $selected ) ) {
		wfLogWarning( 'The PageImagesScores setting must only contain numeric values!' );
	}

	return (float)$selected;
}
346 | |
/**
 * Check whether image's copyright allows it to be used freely. An image that
 * cannot be found is treated as free.
 *
 * @param string $fileName Name of the image file
 * @return bool
 */
protected function isImageFree( $fileName ) {
	$file = $this->repoGroup->findFile( $fileName );
	if ( !$file ) {
		return true;
	}

	// Process copyright metadata from CommonsMetadata, if present.
	// Image is considered free if the value is '0' or unset.
	$metadata = $this->fetchFileMetadata( $file );

	return empty( $metadata['NonFree']['value'] );
}
362 | |
/**
 * Fetches the extended metadata of a file, using a single language (English).
 *
 * @param File $file File to fetch metadata from
 * @return array
 */
protected function fetchFileMetadata( $file ) {
	$formatter = new FormatMetadata;
	// we don't care about the language, and specifying singleLanguage is slightly faster
	$formatter->setSingleLanguage( true );

	// we don't care about the language, so avoid splitting the cache by selecting English
	$englishContext = new DerivativeContext( $formatter->getContext() );
	$englishContext->setLanguage( 'en' );
	$formatter->setContext( $englishContext );

	return $formatter->fetchExtendedMetadata( $file );
}
379 | |
/**
 * Returns the width/height ratio of the full size image, or 0 if either
 * dimension is not a positive number.
 *
 * @param PageImageCandidate $image
 *
 * @return float|int
 */
protected function getRatio( PageImageCandidate $image ) {
	$width = $image->getFullWidth();
	$height = $image->getFullHeight();

	if ( $width > 0 && $height > 0 ) {
		return $width / $height;
	}

	return 0;
}
392 | |
/**
 * Returns the images that are denylisted from influencing this extension's
 * output. The list is merged from all sources in the PageImagesDenylist
 * setting and cached for PageImagesDenylistExpiry.
 *
 * @return int[] Flipped associative array in format "image BDB key" => int
 * @throws Exception
 */
protected function getDenylist() {
	$cache = $this->mainWANObjectCache;
	$cacheKey = $cache->makeKey( 'pageimages-denylist' );
	$ttl = $this->config->get( 'PageImagesDenylistExpiry' );

	return $cache->getWithSetCallback(
		$cacheKey,
		$ttl,
		function () {
			$names = [];

			foreach ( $this->config->get( 'PageImagesDenylist' ) as $source ) {
				$type = $source['type'];

				// Loose comparison, the same way a switch statement compares
				if ( $type == 'db' ) {
					$entries = $this->getDbDenylist( $source['db'], $source['page'] );
				} elseif ( $type == 'url' ) {
					$entries = $this->getUrlDenylist( $source['url'] );
				} else {
					throw new RuntimeException(
						"unrecognized image denylist type '{$source['type']}'"
					);
				}

				$names = array_merge( $names, $entries );
			}

			// Flip, so that membership can be tested with isset()
			return array_flip( $names );
		}
	);
}
430 | |
/**
 * Returns the titles of the files that the given denylist page links to,
 * according to the pagelinks data of the given database. The result is empty
 * if the page name is not a valid title or the page is not found.
 *
 * @param string|false $dbName Database name or false for current database
 * @param string $page
 *
 * @return string[]
 */
private function getDbDenylist( $dbName, $page ) {
	$title = $this->titleFactory->newFromText( $page );
	if ( !$title || !$title->canExist() ) {
		return [];
	}

	$dbr = $this->connectionProvider->getReplicaDatabase( $dbName );

	// Step 1: resolve the denylist page to its page ID
	$pageConds = [
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey()
	];
	$pageId = $dbr->newSelectQueryBuilder()
		->select( 'page_id' )
		->from( 'page' )
		->where( $pageConds )
		->caller( __METHOD__ )
		->fetchField();
	if ( !$pageId ) {
		return [];
	}

	// Step 2: collect the file namespace links going out from that page
	[ $blNamespace, $blTitle ] = $this->linksMigration->getTitleFields( 'pagelinks' );
	$queryInfo = $this->linksMigration->getQueryInfo( 'pagelinks' );
	$linkConds = [ 'pl_from' => (int)$pageId, $blNamespace => NS_FILE ];

	return $dbr->newSelectQueryBuilder()
		->select( $blTitle )
		->tables( $queryInfo['tables'] )
		->joinConds( $queryInfo['joins'] )
		->where( $linkConds )
		->caller( __METHOD__ )
		->fetchFieldValues();
}
464 | |
/**
 * Returns list of images on given remote denylist page.
 * Not quite 100% bulletproof due to localised namespaces and so on.
 *
 * The page is fetched with a timeout of 3 seconds and scanned for links of the
 * form "[[:Name.ext", where "ext" is one of the configured file extensions.
 * Names that do not form a valid file title are skipped. The result is empty if
 * the request yields no text.
 *
 * @param string $url
 *
 * @return string[] DB keys of the listed files
 */
private function getUrlDenylist( $url ) {
	$list = [];
	$text = $this->httpRequestFactory->get( $url, [ 'timeout' => 3 ], __METHOD__ );

	// The extensions come from configuration and are matched literally. Quote them,
	// so that an extension containing a regex metacharacter or the delimiter can
	// neither break the pattern nor change its meaning.
	$quotedExtensions = array_map(
		static function ( $extension ) {
			return preg_quote( $extension, '/' );
		},
		$this->config->get( 'FileExtensions' )
	);
	$regex = '/\[\[:([^|\#]*?\.(?:' . implode( '|', $quotedExtensions ) . '))/i';

	if ( $text && preg_match_all( $regex, $text, $matches ) ) {
		foreach ( $matches[1] as $s ) {
			$t = $this->titleFactory->makeTitleSafe( NS_FILE, $s );

			if ( $t ) {
				$list[] = $t->getDBkey();
			}
		}
	}

	return $list;
}
492 | |
493 | } |