Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
38.89% |
70 / 180 |
|
12.50% |
2 / 16 |
CRAP | |
0.00% |
0 / 1 |
| ParserFileProcessingHookHandlers | |
38.89% |
70 / 180 |
|
12.50% |
2 / 16 |
1029.24 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| onParserTestGlobals | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| onParserModifyImageHTML | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
| onParserAfterTidy | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
11.32 | |||
| findBestImages | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
10.02 | |||
| addPageImageCandidateToParserOutput | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| processThisTitle | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| calcWidth | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
| getScore | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
6 | |||
| scoreFromTable | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
4.02 | |||
| isImageFree | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
| fetchFileMetadata | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| getRatio | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
| getDenylist | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
30 | |||
| getDbDenylist | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
| getUrlDenylist | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace PageImages\Hooks; |
| 4 | |
| 5 | use Exception; |
| 6 | use MediaWiki\Config\Config; |
| 7 | use MediaWiki\Context\DerivativeContext; |
| 8 | use MediaWiki\FileRepo\File\File; |
| 9 | use MediaWiki\FileRepo\RepoGroup; |
| 10 | use MediaWiki\Hook\ParserTestGlobalsHook; |
| 11 | use MediaWiki\Http\HttpRequestFactory; |
| 12 | use MediaWiki\Linker\LinksMigration; |
| 13 | use MediaWiki\MainConfigNames; |
| 14 | use MediaWiki\Media\FormatMetadata; |
| 15 | use MediaWiki\Page\PageReference; |
| 16 | use MediaWiki\Parser\Hook\ParserAfterTidyHook; |
| 17 | use MediaWiki\Parser\Hook\ParserModifyImageHTMLHook; |
| 18 | use MediaWiki\Parser\Parser; |
| 19 | use MediaWiki\Parser\ParserOutput; |
| 20 | use MediaWiki\Title\TitleFactory; |
| 21 | use PageImages\PageImageCandidate; |
| 22 | use PageImages\PageImages; |
| 23 | use RuntimeException; |
| 24 | use Wikimedia\ObjectCache\WANObjectCache; |
| 25 | use Wikimedia\Rdbms\IConnectionProvider; |
| 26 | |
| 27 | /** |
| 28 | * Handlers for parser hooks. |
| 29 | * |
| 30 | * The ParserModifyImageHTML hook handler collects candidate images, and marks |
| 31 | * them with a temporary HTML comment in the parser output. |
| 32 | * |
| 33 | * The ParserAfterTidy hook handler processes the candidate images, identifying |
| 34 | * the best image and the best free image. If $wgPageImagesLeadSectionOnly is |
| 35 | * set, images following the first section header are discarded. It removes the |
| 36 | * temporary comments and saves the resulting best images to page_props. |
| 37 | * |
| 38 | * The various query interfaces will retrieve the lead image from page_props. |
| 39 | * |
| 40 | * @license WTFPL |
| 41 | * @author Max Semenik |
| 42 | * @author Thiemo Kreuz |
| 43 | */ |
| 44 | class ParserFileProcessingHookHandlers implements |
| 45 | ParserAfterTidyHook, |
| 46 | ParserModifyImageHTMLHook, |
| 47 | ParserTestGlobalsHook |
| 48 | { |
| 49 | private const CANDIDATE_REGEX = '/<!--MW-PAGEIMAGES-CANDIDATE-([0-9]+)-->/'; |
| 50 | |
/**
 * Receives the services this handler depends on; all of them are stored
 * through constructor property promotion.
 *
 * @param Config $config Source of the PageImages* settings and of the core
 *  ThumbLimits, DefaultUserOptions and FileExtensions settings
 * @param RepoGroup $repoGroup Used to look up files when checking whether an image is free
 * @param WANObjectCache $mainWANObjectCache Cache holding the assembled denylist
 * @param HttpRequestFactory $httpRequestFactory Used to download "url" type denylists
 * @param IConnectionProvider $connectionProvider Used to query "db" type denylists
 * @param TitleFactory $titleFactory Used to resolve denylist page and file titles
 * @param LinksMigration $linksMigration Supplies the pagelinks fields and joins for denylist queries
 */
public function __construct(
	protected Config $config,
	private readonly RepoGroup $repoGroup,
	private readonly WANObjectCache $mainWANObjectCache,
	private readonly HttpRequestFactory $httpRequestFactory,
	private readonly IConnectionProvider $connectionProvider,
	private readonly TitleFactory $titleFactory,
	private readonly LinksMigration $linksMigration,
) {
}
| 61 | |
/**
 * ParserTestGlobals hook handler. Supplies the PageImages settings that
 * parser tests run with.
 *
 * The defaults are merged with the array union operator, so any key that is
 * already present in $globals keeps its existing value.
 *
 * @param array &$globals Map of global variable name => value, extended in place
 */
public function onParserTestGlobals( &$globals ) {
	$scoreTables = [
		'width' => [
			200 => 10,
			1000 => 20,
		],
		'position' => [],
		'ratio' => [],
		'galleryImageWidth' => [],
	];

	$globals += [
		'wgPageImagesScores' => $scoreTables,
		'wgPageImagesLeadSectionOnly' => true,
	];
}
| 79 | |
/**
 * ParserModifyImageHTML hook handler. Records the image as a page image
 * candidate and appends a marker comment to its HTML, so that
 * onParserAfterTidy() can later tell whether the image was in the lead section.
 *
 * Nothing is recorded when the parser has no page set, or when the page's
 * namespace is not enabled for page images.
 *
 * @param Parser $parser
 * @param File $file The file being rendered
 * @param array $params Image parameters; a missing display width is estimated
 *  on this local copy before the candidate is built
 * @param string &$html Rendered image HTML; the marker comment is appended to it
 */
public function onParserModifyImageHTML(
	Parser $parser,
	File $file,
	array $params,
	string &$html
): void {
	// Parser::getPage() is nullable; processThisTitle() requires a PageReference.
	$page = $parser->getPage();
	if ( $page === null || !$this->processThisTitle( $page ) ) {
		return;
	}

	$this->calcWidth( $params, $file );

	$index = $this->addPageImageCandidateToParserOutput(
		PageImageCandidate::newFromFileAndParams( $file, $params ),
		$parser->getOutput()
	);
	// The number in the marker is the candidate's index in the 'pageImages' extension data.
	$html .= "<!--MW-PAGEIMAGES-CANDIDATE-$index-->";
}
| 102 | |
/**
 * ParserAfterTidy hook handler. Strips the candidate marker comments from the
 * page text, discards candidates that were not in the lead section (when
 * $wgPageImagesLeadSectionOnly is set), and stores the best image and the best
 * free image as page properties.
 *
 * If the marker replacement fails with a PCRE error, the text is left untouched
 * and no page properties are written.
 *
 * @param Parser $parser
 * @param string &$text Page HTML; marker comments are removed in place
 */
public function onParserAfterTidy( $parser, &$text ) {
	$parserOutput = $parser->getOutput();
	$allImages = $parserOutput->getExtensionData( 'pageImages' );
	if ( !$allImages ) {
		return;
	}

	// In lead-section-only mode the first edit section marker ends the lead.
	// false means "no cut-off": the mode is off, or the text has no such marker.
	$leadEndPos = $this->config->get( 'PageImagesLeadSectionOnly' )
		? strpos( $text, '<mw:editsection' )
		: false;

	// Find and remove our special comments
	$images = [];
	$cleanedText = preg_replace_callback(
		self::CANDIDATE_REGEX,
		static function ( array $m ) use ( $allImages, &$images, $leadEndPos ): string {
			// With PREG_OFFSET_CAPTURE every group is a [ matched text, byte offset ] pair.
			$offset = $m[0][1];
			$id = (int)$m[1][0];
			$inLead = $leadEndPos === false || $offset < $leadEndPos;
			if ( $inLead && isset( $allImages[$id] ) ) {
				$images[] = PageImageCandidate::newFromArray( $allImages[$id] );
			}
			return '';
		},
		$text,
		flags: PREG_OFFSET_CAPTURE
	);
	if ( $cleanedText === null ) {
		// preg_replace_callback() returns null on error; assigning that would blank the page.
		wfLogWarning( 'PageImages: could not strip candidate markers: ' . preg_last_error_msg() );
		return;
	}
	$text = $cleanedText;

	[ $bestImageName, $freeImageName ] = $this->findBestImages( $images );

	if ( $freeImageName ) {
		$parserOutput->setPageProperty( PageImages::getPropName( true ), $freeImageName );
	}

	// Only store the image if it's not free. Free image (if any) has already been stored above.
	if ( $bestImageName && $bestImageName !== $freeImageName ) {
		$parserOutput->setPageProperty( PageImages::getPropName( false ), $bestImageName );
	}

	// Strip comments from indicators (T298930)
	foreach ( $parserOutput->getIndicators() as $id => $value ) {
		$stripped = preg_replace( self::CANDIDATE_REGEX, '', $value );
		// A null result signals a PCRE error; keep the indicator as it is in that case.
		if ( $stripped !== null && $stripped !== $value ) {
			$parserOutput->setIndicator( $id, $stripped );
		}
	}
	// We may have comments in TOC data - Parser::cleanupTocLine strips them for us.
}
| 158 | |
/**
 * Pick the winning images from a list of candidates.
 *
 * Each candidate is scored with its position in the list; a file that occurs
 * several times keeps its highest score. Only files with a score above zero
 * can win, and on equal scores the file seen first is kept.
 *
 * @param PageImageCandidate[] $images
 * @return array{string|false,string|false} The best image, and the best free image
 */
private function findBestImages( array $images ) {
	if ( $images === [] ) {
		return [ false, false ];
	}

	// Highest score per file name. The position passed to getScore() counts
	// candidates from zero, regardless of the array's own keys.
	$scores = [];
	$position = 0;
	foreach ( $images as $candidate ) {
		$candidateScore = $this->getScore( $candidate, $position );
		$position++;
		$fileName = $candidate->getFileName();
		$previous = $scores[$fileName] ?? -1;
		$scores[$fileName] = max( $previous, $candidateScore );
	}

	$bestImageName = false;
	$freeImageName = false;
	foreach ( $scores as $name => $score ) {
		if ( !( $score > 0 ) ) {
			continue;
		}

		if ( !$bestImageName || $score > $scores[$bestImageName] ) {
			$bestImageName = $name;
		}

		// The licence lookup only runs when this file would actually take the lead.
		$beatsCurrentFree = !$freeImageName || $score > $scores[$freeImageName];
		if ( $beatsCurrentFree && $this->isImageFree( $name ) ) {
			$freeImageName = $name;
		}
	}

	return [ $bestImageName, $freeImageName ];
}
| 196 | |
/**
 * Appends the serialized candidate to the 'pageImages' extension data of
 * $parserOutput.
 *
 * @param PageImageCandidate $image
 * @param ParserOutput $parserOutput
 * @return int Number of stored candidates minus one, which the caller embeds
 *  in the marker comment as the candidate's index
 */
private function addPageImageCandidateToParserOutput(
	PageImageCandidate $image,
	ParserOutput $parserOutput
): int {
	$stored = $parserOutput->getExtensionData( 'pageImages' );
	$candidates = $stored ?: [];
	$candidates[] = $image->jsonSerialize();
	$parserOutput->setExtensionData( 'pageImages', $candidates );

	$newIndex = count( $candidates ) - 1;
	return $newIndex;
}
| 209 | |
/**
 * Returns true if data for this title should be saved, i.e. if the page's
 * namespace is listed in $wgPageImagesNamespaces.
 *
 * The setting is read on every call instead of being cached in a
 * function-level static: such a static is shared by all instances and would
 * keep returning the first namespace list it saw, even for an instance built
 * with a different Config.
 *
 * @param PageReference $pageReference
 * @return bool
 */
private function processThisTitle( PageReference $pageReference ): bool {
	// Flipping turns the namespace ids into array keys, so the lookup is a plain isset().
	$enabledNamespaces = array_flip( $this->config->get( 'PageImagesNamespaces' ) );

	return isset( $enabledNamespaces[$pageReference->getNamespace()] );
}
| 219 | |
/**
 * Fills in $params['handler']['width'] when the wikitext did not give one.
 *
 * This is an estimate of the displayed size. It deliberately does not follow
 * core's size calculation exactly: the editor's intention matters more than
 * the precise number.
 *
 * - An explicit width is left alone.
 * - A height with a known file height scales the file width proportionally.
 * - Thumbnail, thumb and frameless frames use the default thumbnail size,
 *   falling back to 250.
 * - Anything else uses the file's own width.
 *
 * @param array &$params Image parameters, modified in place
 * @param File $file
 */
private function calcWidth( array &$params, File $file ): void {
	if ( isset( $params['handler']['width'] ) ) {
		return;
	}

	if ( isset( $params['handler']['height'] ) && $file->getHeight() > 0 ) {
		$fileWidth = $file->getWidth();
		$scale = $params['handler']['height'] / $file->getHeight();
		$params['handler']['width'] = $fileWidth * $scale;
		return;
	}

	$usesThumbSize = isset( $params['frame']['thumbnail'] )
		|| isset( $params['frame']['thumb'] )
		|| isset( $params['frame']['frameless'] );
	if ( !$usesThumbSize ) {
		$params['handler']['width'] = $file->getWidth();
		return;
	}

	$thumbLimits = $this->config->get( MainConfigNames::ThumbLimits );
	$defaultUserOptions = $this->config->get( MainConfigNames::DefaultUserOptions );
	$params['handler']['width'] = $thumbLimits[$defaultUserOptions['thumbsize']] ?? 250;
}
| 245 | |
/**
 * Score a candidate image; higher is better, and a score below zero means
 * the image must not be used at all.
 *
 * The score is the sum of the width, position and ratio table lookups from
 * $wgPageImagesScores. class="pageimage" adds 1000, while
 * class="notpageimage" or a denylist entry forces -1000.
 *
 * @param PageImageCandidate $image Candidate to score
 * @param int $position Image order on page
 * @return float|int -1000 for excluded images, otherwise the summed score
 */
protected function getScore( PageImageCandidate $image, int $position ) {
	$classes = preg_split( '/\s+/', $image->getFrameClass(), -1, PREG_SPLIT_NO_EMPTY );
	// class="notpageimage" opts the image out before any table is consulted
	if ( in_array( 'notpageimage', $classes, true ) ) {
		return -1000;
	}

	$scoreTables = $this->config->get( 'PageImagesScores' );

	// A handler width marks a standalone image; without one the image comes from a gallery.
	$handlerWidth = $image->getHandlerWidth();
	$score = $handlerWidth
		? $this->scoreFromTable( $handlerWidth, $scoreTables['width'] )
		: $this->scoreFromTable( $image->getFullWidth(), $scoreTables['galleryImageWidth'] );

	$score += $scoreTables['position'][$position] ?? 0;

	// The ratio table is keyed by the width/height ratio multiplied by ten.
	$ratioKey = (int)( $this->getRatio( $image ) * 10 );
	$score += $this->scoreFromTable( $ratioKey, $scoreTables['ratio'] );

	// T91683: Prefer images with class="pageimage". The bonus is added to the score instead of
	// returned straight away, so the tables still break ties between several such images.
	if ( in_array( 'pageimage', $classes, true ) ) {
		$score += 1000;
	}

	// A denylisted file loses no matter what it has scored so far.
	return isset( $this->getDenylist()[$image->getFileName()] ) ? -1000 : $score;
}
| 290 | |
/**
 * Look up a score in a table of ranges.
 *
 * The table maps an upper boundary to a score. After sorting by boundary, the
 * score of the first boundary that is not below $value is returned. A value
 * above every boundary gets the score of the highest boundary, and an empty
 * table yields 0.
 *
 * @param int $value The number that the various bounds are compared against
 *  to calculate the score
 * @param float[] $scores Table of scores for different ranges of $value
 * @return float
 */
protected function scoreFromTable( int $value, array $scores ) {
	// Stopping at the first match only works with boundaries in increasing order.
	ksort( $scores, SORT_NUMERIC );

	$selected = 0;
	foreach ( $scores as $upperBoundary => $rangeScore ) {
		$selected = $rangeScore;
		if ( $upperBoundary >= $value ) {
			break;
		}
	}

	if ( !is_numeric( $selected ) ) {
		wfLogWarning( 'The PageImagesScores setting must only contain numeric values!' );
	}

	return (float)$selected;
}
| 319 | |
/**
 * Check whether image's copyright allows it to be used freely.
 *
 * A file that cannot be found in the repo group counts as free.
 *
 * @param string $fileName Name of the image file
 * @return bool
 */
protected function isImageFree( string $fileName ): bool {
	$file = $this->repoGroup->findFile( $fileName );
	if ( !$file ) {
		return true;
	}

	// Copyright metadata from CommonsMetadata, if present: the image is
	// non-free only when 'NonFree' carries a non-empty value ('0' or unset means free).
	$metadata = $this->fetchFileMetadata( $file );
	return empty( $metadata['NonFree']['value'] );
}
| 335 | |
/**
 * Fetch the extended metadata of a file through FormatMetadata.
 *
 * @param File $file File to fetch metadata from
 * @return array
 */
protected function fetchFileMetadata( File $file ): array {
	$formatter = new FormatMetadata();

	// The language is irrelevant here, so pin it to English to avoid splitting the cache
	$englishContext = new DerivativeContext( $formatter->getContext() );
	$englishContext->setLanguage( 'en' );
	$formatter->setContext( $englishContext );

	// ...and for the same reason use single-language mode, which is slightly faster
	$formatter->setSingleLanguage( true );

	return $formatter->fetchExtendedMetadata( $file );
}
| 352 | |
/**
 * Width/height ratio of the full-size image.
 *
 * @param PageImageCandidate $image
 * @return float|int The ratio, or 0 unless both dimensions are greater than zero
 */
protected function getRatio( PageImageCandidate $image ) {
	$fullWidth = $image->getFullWidth();
	$fullHeight = $image->getFullHeight();

	if ( $fullWidth > 0 && $fullHeight > 0 ) {
		return $fullWidth / $fullHeight;
	}

	return 0;
}
| 364 | |
/**
 * Return the images that are denylisted from influencing this extension's output.
 *
 * The list is obtained through the WAN object cache under the
 * 'pageimages-denylist' key, with $wgPageImagesDenylistExpiry passed as the
 * expiry. The callback assembles it from every source configured in
 * $wgPageImagesDenylist.
 *
 * @return int[] Flipped associative array in format "image DB key" => int
 * @throws RuntimeException If a configured source has an unknown type
 */
protected function getDenylist(): array {
	$cache = $this->mainWANObjectCache;

	return $cache->getWithSetCallback(
		$cache->makeKey( 'pageimages-denylist' ),
		$this->config->get( 'PageImagesDenylistExpiry' ),
		function () {
			// One list of file names per configured source, merged at the end
			$perSource = [];
			foreach ( $this->config->get( 'PageImagesDenylist' ) as $source ) {
				$type = $source['type'];
				if ( $type == 'db' ) {
					$perSource[] = $this->getDbDenylist( $source['db'], $source['page'] );
				} elseif ( $type == 'url' ) {
					$perSource[] = $this->getUrlDenylist( $source['url'] );
				} else {
					throw new RuntimeException(
						"unrecognized image denylist type '{$source['type']}'"
					);
				}
			}

			// Flipped, so that callers can test membership with isset()
			return array_flip( array_merge( [], ...$perSource ) );
		}
	);
}
| 402 | |
/**
 * Return the files linked from the given denylist page.
 *
 * Looks up the page id in the page table, then selects the titles of all
 * pagelinks rows from that page which point into the File namespace.
 *
 * @param string|false $dbName Database name or false for current database
 * @param string $page Title text of the denylist page
 * @return string[] Link target titles; empty if the title is invalid or the
 *  page does not exist
 */
private function getDbDenylist( $dbName, string $page ): array {
	$title = $this->titleFactory->newFromText( $page );
	if ( !$title || !$title->canExist() ) {
		return [];
	}

	$dbr = $this->connectionProvider->getReplicaDatabase( $dbName );

	$pageConds = [
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
	];
	$pageId = $dbr->newSelectQueryBuilder()
		->select( 'page_id' )
		->from( 'page' )
		->where( $pageConds )
		->caller( __METHOD__ )
		->fetchField();
	if ( !$pageId ) {
		return [];
	}

	// LinksMigration decides which fields and joins the pagelinks query needs
	[ $targetNamespaceField, $targetTitleField ] = $this->linksMigration->getTitleFields( 'pagelinks' );
	$linksQuery = $this->linksMigration->getQueryInfo( 'pagelinks' );

	$linkConds = [ 'pl_from' => (int)$pageId, $targetNamespaceField => NS_FILE ];

	return $dbr->newSelectQueryBuilder()
		->select( $targetTitleField )
		->tables( $linksQuery['tables'] )
		->joinConds( $linksQuery['joins'] )
		->where( $linkConds )
		->caller( __METHOD__ )
		->fetchFieldValues();
}
| 435 | |
/**
 * Return list of images on given remote denylist page.
 *
 * The page is downloaded with a three second timeout and scanned for
 * "[[:Name.ext" links whose extension is one of $wgFileExtensions. Not fully
 * robust: localised namespace prefixes and similar variations are taken as
 * written, so the list should be maintained with care.
 *
 * @param string $url
 * @return string[] DB keys of the listed files; empty if the download fails,
 *  nothing matches, or no file extensions are configured
 */
private function getUrlDenylist( string $url ): array {
	$fileExtensions = $this->config->get( MainConfigNames::FileExtensions );
	if ( !$fileExtensions ) {
		// An empty alternation would match every "[[:name." — there is nothing valid to find.
		return [];
	}

	$text = $this->httpRequestFactory->get( $url, [ 'timeout' => 3 ], __METHOD__ );
	if ( !$text ) {
		return [];
	}

	// Extensions come from configuration, so escape them before building the pattern.
	$quotedExtensions = array_map(
		static fn ( $extension ): string => preg_quote( (string)$extension, '/' ),
		$fileExtensions
	);
	$regex = '/\[\[:([^|\#]*?\.(?:' . implode( '|', $quotedExtensions ) . '))/i';

	if ( !preg_match_all( $regex, $text, $matches ) ) {
		return [];
	}

	$list = [];
	foreach ( $matches[1] as $linkTarget ) {
		$fileTitle = $this->titleFactory->makeTitleSafe( NS_FILE, $linkTarget );
		if ( $fileTitle ) {
			$list[] = $fileTitle->getDBkey();
		}
	}

	return $list;
}
| 463 | |
| 464 | } |