Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
88.55% covered (warning)
88.55%
201 / 227
42.86% covered (danger)
42.86%
6 / 14
CRAP
0.00% covered (danger)
0.00%
0 / 1
ApiDetailRetriever
88.55% covered (warning)
88.55%
201 / 227
42.86% covered (danger)
42.86%
6 / 14
61.88
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
2
 sendApiRequest
57.14% covered (warning)
57.14%
4 / 7
0.00% covered (danger)
0.00%
0 / 1
2.31
 getImportDetails
98.21% covered (success)
98.21%
55 / 56
0.00% covered (danger)
0.00%
0 / 1
8
 reduceTitleList
81.82% covered (warning)
81.82%
9 / 11
0.00% covered (danger)
0.00%
0 / 1
1.01
 getMoreRevisions
79.31% covered (warning)
79.31%
23 / 29
0.00% covered (danger)
0.00%
0 / 1
9.72
 checkRevisionCount
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
5
 checkMaxRevisionAggregatedBytes
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
4
 getFileRevisionsFromImageInfo
65.00% covered (warning)
65.00%
13 / 20
0.00% covered (danger)
0.00%
0 / 1
10.74
 getTextRevisionsFromRevisionsInfo
64.29% covered (warning)
64.29%
9 / 14
0.00% covered (danger)
0.00%
0 / 1
6.14
 getBaseParams
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
1
 addTextRevisionsToParams
100.00% covered (success)
100.00%
21 / 21
100.00% covered (success)
100.00%
1 / 1
3
 addFileRevisionsToParams
100.00% covered (success)
100.00%
22 / 22
100.00% covered (success)
100.00%
1 / 1
3
 addTemplatesToParams
75.00% covered (warning)
75.00%
3 / 4
0.00% covered (danger)
0.00%
0 / 1
3.14
 addCategoriesToParams
75.00% covered (warning)
75.00%
3 / 4
0.00% covered (danger)
0.00%
0 / 1
3.14
1<?php
2
3namespace FileImporter\Remote\MediaWiki;
4
5use FileImporter\Data\FileRevision;
6use FileImporter\Data\FileRevisions;
7use FileImporter\Data\ImportDetails;
8use FileImporter\Data\SourceUrl;
9use FileImporter\Data\TextRevision;
10use FileImporter\Data\TextRevisions;
11use FileImporter\Exceptions\HttpRequestException;
12use FileImporter\Exceptions\ImportException;
13use FileImporter\Exceptions\LocalizedImportException;
14use FileImporter\Interfaces\DetailRetriever;
15use FileImporter\Services\Http\HttpRequestExecutor;
16use MediaWiki\Config\ConfigException;
17use MediaWiki\MediaWikiServices;
18use MediaWiki\Revision\SlotRecord;
19use MediaWiki\Title\TitleValue;
20use Psr\Log\LoggerInterface;
21use Psr\Log\NullLogger;
22
23/**
24 * @license GPL-2.0-or-later
25 * @author Addshore
26 */
27class ApiDetailRetriever implements DetailRetriever {
28    use MediaWikiSourceUrlParser;
29
30    private HttpApiLookup $httpApiLookup;
31    private HttpRequestExecutor $httpRequestExecutor;
32    /** @var int */
33    private $maxBytes;
34    private LoggerInterface $logger;
35    /**
36     * @var string Placeholder name replacing usernames that have been suppressed as part of
37     * a steward action on the source site.
38     */
39    private $suppressedUsername;
40    /** @var int */
41    private $maxRevisions;
42    /** @var int */
43    private $maxAggregatedBytes;
44
45    private const API_RESULT_LIMIT = 500;
46    private const MAX_REVISIONS = 100;
47    private const MAX_AGGREGATED_BYTES = 250000000;
48
49    /**
50     * @param HttpApiLookup $httpApiLookup
51     * @param HttpRequestExecutor $httpRequestExecutor
52     * @param int $maxBytes
53     * @param LoggerInterface|null $logger
54     *
55     * @throws ConfigException when $wgFileImporterAccountForSuppressedUsername is invalid
56     */
57    public function __construct(
58        HttpApiLookup $httpApiLookup,
59        HttpRequestExecutor $httpRequestExecutor,
60        $maxBytes,
61        LoggerInterface $logger = null
62    ) {
63        $this->httpApiLookup = $httpApiLookup;
64        $this->httpRequestExecutor = $httpRequestExecutor;
65        $this->maxBytes = $maxBytes;
66        $this->logger = $logger ?? new NullLogger();
67
68        $config = MediaWikiServices::getInstance()->getMainConfig();
69
70        $this->maxRevisions = (int)$config->get( 'FileImporterMaxRevisions' );
71        $this->maxAggregatedBytes = (int)$config->get( 'FileImporterMaxAggregatedBytes' );
72        $this->suppressedUsername = $config->get( 'FileImporterAccountForSuppressedUsername' );
73        if ( !MediaWikiServices::getInstance()->getUserNameUtils()->isValid( $this->suppressedUsername ) ) {
74            throw new ConfigException(
75                'Invalid username configured in wgFileImporterAccountForSuppressedUsername: "' .
76                $this->suppressedUsername . '"'
77            );
78        }
79    }
80
81    /**
82     * @return array[]
83     * @throws ImportException when the request failed
84     */
85    private function sendApiRequest( SourceUrl $sourceUrl, array $apiParameters ) {
86        $apiUrl = $this->httpApiLookup->getApiUrl( $sourceUrl );
87
88        try {
89            $imageInfoRequest = $this->httpRequestExecutor->execute( $apiUrl, $apiParameters );
90        } catch ( HttpRequestException $e ) {
91            throw new LocalizedImportException( [ 'fileimporter-api-failedtogetinfo',
92                $apiUrl ], $e );
93        }
94        $requestData = json_decode( $imageInfoRequest->getContent(), true );
95        return $requestData;
96    }
97
98    /**
99     * @throws ImportException e.g. when the file couldn't be found
100     */
101    public function getImportDetails( SourceUrl $sourceUrl ): ImportDetails {
102        $params = $this->getBaseParams( $sourceUrl );
103        $params = $this->addFileRevisionsToParams( $params );
104        $params = $this->addTextRevisionsToParams( $params );
105        $params = $this->addTemplatesToParams( $params );
106        $params = $this->addCategoriesToParams( $params );
107
108        $requestData = $this->sendApiRequest( $sourceUrl, $params );
109
110        if ( count( $requestData['query']['pages'] ?? [] ) !== 1 ) {
111            $this->logger->warning(
112                'No pages returned by the API',
113                [
114                    'sourceUrl' => $sourceUrl->getUrl(),
115                    'apiParameters' => $params,
116                ]
117            );
118            throw new LocalizedImportException( 'fileimporter-api-nopagesreturned' );
119        }
120
121        /** @var array $pageInfoData */
122        $pageInfoData = end( $requestData['query']['pages'] );
123        '@phan-var array $pageInfoData';
124
125        if ( array_key_exists( 'missing', $pageInfoData ) ) {
126            if (
127                array_key_exists( 'imagerepository', $pageInfoData ) &&
128                $pageInfoData['imagerepository'] == 'shared'
129            ) {
130                throw new LocalizedImportException(
131                    [ 'fileimporter-cantimportfromsharedrepo', $sourceUrl->getHost() ]
132                );
133            }
134            throw new LocalizedImportException( 'fileimporter-cantimportmissingfile' );
135        }
136
137        if ( empty( $pageInfoData['imageinfo'] ) || empty( $pageInfoData['revisions'] ) ) {
138            $this->logger->warning(
139                'Bad image or revision info returned by the API',
140                [
141                    'sourceUrl' => $sourceUrl->getUrl(),
142                    'apiParameters' => $params,
143                ]
144            );
145            throw new LocalizedImportException( 'fileimporter-api-badinfo' );
146        }
147
148        // FIXME: Isn't this misplaced here, *before* more revisions are fetched?
149        $this->checkRevisionCount( $sourceUrl, $pageInfoData );
150        $this->checkMaxRevisionAggregatedBytes( $pageInfoData );
151
152        while ( array_key_exists( 'continue', $requestData ) ) {
153            $this->getMoreRevisions( $sourceUrl, $requestData, $pageInfoData );
154        }
155
156        $pageTitle = $pageInfoData['title'];
157        $pageLanguage = $pageInfoData['pagelanguagehtmlcode'] ?? null;
158
159        // @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
160        $imageInfoData = $pageInfoData['imageinfo'];
161        $revisionsData = $pageInfoData['revisions'];
162        $fileRevisions = $this->getFileRevisionsFromImageInfo( $imageInfoData, $pageTitle );
163        $textRevisions = $this->getTextRevisionsFromRevisionsInfo( $revisionsData, $pageTitle );
164        $templates = $this->reduceTitleList( $pageInfoData['templates'] ?? [], NS_TEMPLATE );
165        $categories = $this->reduceTitleList( $pageInfoData['categories'] ?? [], NS_CATEGORY );
166
167        $splitTitle = explode( ':', $pageInfoData['title'] );
168        $titleAfterColon = end( $splitTitle );
169
170        $importDetails = new ImportDetails(
171            $sourceUrl,
172            new TitleValue( NS_FILE, $titleAfterColon ),
173            $textRevisions,
174            $fileRevisions
175        );
176        // FIXME: Better use constructor parameters instead of setters?
177        $importDetails->setPageLanguage( $pageLanguage );
178        $importDetails->setTemplates( $templates );
179        $importDetails->setCategories( $categories );
180
181        return $importDetails;
182    }
183
184    /**
185     * @param array[] $titles
186     * @param int $namespace
187     *
188     * @return string[]
189     */
190    private function reduceTitleList( array $titles, int $namespace ): array {
191        return array_map(
192            static function ( array $title ): string {
193                return $title['title'];
194            },
195            array_filter(
196                $titles,
197                static function ( array $title ) use ( $namespace ): bool {
198                    return $title['ns'] === $namespace;
199                }
200            )
201        );
202    }
203
204    /**
205     * Fetches the next set of revisions unless the number of revisions
206     * exceeds the max revisions limit
207     *
208     * @param SourceUrl $sourceUrl
209     * @param array[] &$requestData
210     * @param array[] &$pageInfoData
211     *
212     * @throws ImportException
213     */
214    private function getMoreRevisions(
215        SourceUrl $sourceUrl,
216        array &$requestData,
217        array &$pageInfoData
218    ): void {
219        $rvContinue = $requestData['continue']['rvcontinue'] ?? null;
220        $iiStart = $requestData['continue']['iistart'] ?? null;
221        $tlContinue = $requestData['continue']['tlcontinue'] ?? null;
222        $clContinue = $requestData['continue']['clcontinue'] ?? null;
223
224        $params = $this->getBaseParams( $sourceUrl );
225
226        if ( $iiStart ) {
227            $params = $this->addFileRevisionsToParams( $params, $iiStart );
228        }
229
230        if ( $rvContinue ) {
231            $params = $this->addTextRevisionsToParams( $params, $rvContinue );
232        }
233
234        if ( $tlContinue ) {
235            $params = $this->addTemplatesToParams( $params, $tlContinue );
236        }
237
238        if ( $clContinue ) {
239            $params = $this->addCategoriesToParams( $params, $clContinue );
240        }
241
242        $requestData = $this->sendApiRequest( $sourceUrl, $params );
243
244        $newPageInfoData = end( $requestData['query']['pages'] );
245
246        if ( array_key_exists( 'revisions', $newPageInfoData ) ) {
247            $pageInfoData['revisions'] =
248                array_merge( $pageInfoData['revisions'], $newPageInfoData['revisions'] );
249        }
250
251        if ( array_key_exists( 'imageinfo', $newPageInfoData ) ) {
252            $pageInfoData['imageinfo'] =
253                array_merge( $pageInfoData['imageinfo'], $newPageInfoData['imageinfo'] );
254        }
255
256        if ( array_key_exists( 'templates', $newPageInfoData ) ) {
257            $pageInfoData['templates'] =
258                array_merge( $pageInfoData['templates'], $newPageInfoData['templates'] );
259        }
260
261        if ( array_key_exists( 'categories', $newPageInfoData ) ) {
262            $pageInfoData['categories'] =
263                array_merge( $pageInfoData['categories'], $newPageInfoData['categories'] );
264        }
265
266        $this->checkRevisionCount( $sourceUrl, $pageInfoData );
267        $this->checkMaxRevisionAggregatedBytes( $pageInfoData );
268    }
269
270    /**
271     * Throws an exception if the number of revisions to be imported exceeds
272     * the maximum revision limit
273     *
274     * @param SourceUrl $sourceUrl
275     * @param array[] $pageInfoData
276     *
277     * @throws ImportException when exceeding the acceptable maximum
278     */
279    private function checkRevisionCount( SourceUrl $sourceUrl, array $pageInfoData ): void {
280        if ( count( $pageInfoData['revisions'] ) > $this->maxRevisions ||
281            count( $pageInfoData['imageinfo'] ) > $this->maxRevisions ||
282            count( $pageInfoData['revisions'] ) > static::MAX_REVISIONS ||
283            count( $pageInfoData['imageinfo'] ) > static::MAX_REVISIONS ) {
284            $this->logger->warning(
285                'Too many revisions were being fetched',
286                [
287                    'sourceUrl' => $sourceUrl->getUrl(),
288                ]
289            );
290
291            throw new LocalizedImportException( 'fileimporter-api-toomanyrevisions' );
292        }
293    }
294
295    /**
296     * @param array[] $pageInfoData
297     * @phan-param array{imageinfo:array{size:int}[]} $pageInfoData
298     *
299     * @throws ImportException when exceeding the maximum file size
300     */
301    private function checkMaxRevisionAggregatedBytes( array $pageInfoData ): void {
302        $aggregatedFileBytes = 0;
303        foreach ( $pageInfoData['imageinfo'] as $fileVersion ) {
304            $aggregatedFileBytes += $fileVersion['size'] ?? 0;
305            if ( $aggregatedFileBytes > $this->maxAggregatedBytes ||
306                $aggregatedFileBytes > static::MAX_AGGREGATED_BYTES ) {
307                $versions = count( $pageInfoData['imageinfo'] );
308                throw new LocalizedImportException( [ 'fileimporter-filetoolarge', $versions ] );
309            }
310        }
311    }
312
313    /**
314     * @param array[] $imageInfo
315     * @param string $pageTitle
316     *
317     * @throws ImportException when the file is not acceptable, e.g. hidden or to big
318     */
319    private function getFileRevisionsFromImageInfo( array $imageInfo, string $pageTitle ): FileRevisions {
320        $revisions = [];
321        foreach ( $imageInfo as $revisionInfo ) {
322            if ( array_key_exists( 'filehidden', $revisionInfo ) ) {
323                throw new LocalizedImportException( 'fileimporter-cantimportfilehidden' );
324            }
325
326            if ( array_key_exists( 'filemissing', $revisionInfo ) ) {
327                throw new LocalizedImportException( 'fileimporter-filemissinginrevision' );
328            }
329
330            if ( array_key_exists( 'userhidden', $revisionInfo ) ) {
331                $revisionInfo['user'] ??= $this->suppressedUsername;
332            }
333
334            if ( ( $revisionInfo['size'] ?? 0 ) > $this->maxBytes ) {
335                $versions = count( $imageInfo );
336                throw new LocalizedImportException( [ 'fileimporter-filetoolarge', $versions ] );
337            }
338
339            if ( isset( $revisionInfo['sha1'] ) ) {
340                // Convert from API sha1 format to DB sha1 format. The conversion can be se inside
341                // ApiQueryImageInfo.
342                // * API sha1 format is base 16 padded to 40 chars
343                // * DB sha1 format is base 36 padded to 31 chars
344                $revisionInfo['sha1'] = \Wikimedia\base_convert( $revisionInfo['sha1'], 16, 36, 31 );
345            }
346
347            if ( array_key_exists( 'commenthidden', $revisionInfo ) ) {
348                $revisionInfo['comment'] ??=
349                    wfMessage( 'fileimporter-revision-removed-comment' )->plain();
350            }
351
352            $revisionInfo['name'] = $pageTitle;
353            $revisionInfo['description'] = $revisionInfo['comment'] ?? null;
354
355            $revisions[] = new FileRevision( $revisionInfo );
356        }
357        return new FileRevisions( $revisions );
358    }
359
360    /**
361     * @param array[] $revisionsInfo
362     * @param string $pageTitle
363     */
364    private function getTextRevisionsFromRevisionsInfo( array $revisionsInfo, string $pageTitle ): TextRevisions {
365        $revisions = [];
366        foreach ( $revisionsInfo as $revisionInfo ) {
367            if ( array_key_exists( 'userhidden', $revisionInfo ) ) {
368                $revisionInfo['user'] ??= $this->suppressedUsername;
369            }
370
371            if ( array_key_exists( 'texthidden', $revisionInfo ) ) {
372                $revisionInfo['slots'][SlotRecord::MAIN]['content'] ??=
373                    wfMessage( 'fileimporter-revision-removed-text' )->plain();
374            }
375
376            if ( array_key_exists( 'commenthidden', $revisionInfo ) ) {
377                $revisionInfo['comment'] ??=
378                    wfMessage( 'fileimporter-revision-removed-comment' )->plain();
379            }
380
381            $revisionInfo['minor'] = array_key_exists( 'minor', $revisionInfo );
382            $revisionInfo['title'] = $pageTitle;
383            $revisions[] = new TextRevision( $revisionInfo );
384        }
385        return new TextRevisions( $revisions );
386    }
387
388    private function getBaseParams( SourceUrl $sourceUrl ): array {
389        return [
390            'action' => 'query',
391            'errorformat' => 'plaintext',
392            'format' => 'json',
393            'formatversion' => '2',
394            'titles' => $this->parseTitleFromSourceUrl( $sourceUrl ),
395            'prop' => 'info'
396        ];
397    }
398
399    /**
400     * Adds to params base the properties for getting Text Revisions
401     */
402    private function addTextRevisionsToParams( array $params, string $rvContinue = null ): array {
403        $params['prop'] .= ( $params['prop'] ) ? '|revisions' : 'revisions';
404
405        if ( $rvContinue ) {
406            $params['rvcontinue'] = $rvContinue;
407        }
408
409        return $params + [
410            'rvlimit' => static::API_RESULT_LIMIT,
411            'rvdir' => 'newer',
412            'rvslots' => SlotRecord::MAIN,
413            'rvprop' => implode(
414                '|',
415                [
416                    'flags',
417                    'timestamp',
418                    'user',
419                    'sha1',
420                    'contentmodel',
421                    'comment',
422                    'content',
423                    'tags',
424                ]
425            )
426        ];
427    }
428
429    /**
430     * Adds to params base the properties for getting File Revisions
431     */
432    private function addFileRevisionsToParams( array $params, string $iiStart = null ): array {
433        $params['prop'] .= ( $params['prop'] ) ? '|imageinfo' : 'imageinfo';
434
435        if ( $iiStart ) {
436            $params['iistart'] = $iiStart;
437        }
438
439        return $params + [
440            'iilimit' => static::API_RESULT_LIMIT,
441            'iiurlwidth' => 800,
442            'iiurlheight' => 400,
443            'iiprop' => implode(
444                '|',
445                [
446                    'timestamp',
447                    'user',
448                    'userid',
449                    'comment',
450                    'canonicaltitle',
451                    'url',
452                    'size',
453                    'sha1',
454                    'archivename',
455                ]
456            )
457        ];
458    }
459
460    /**
461     * Adds to params base the properties for getting Templates
462     */
463    private function addTemplatesToParams( array $params, string $tlContinue = null ): array {
464        $params['prop'] .= ( $params['prop'] ) ? '|templates' : 'templates';
465
466        if ( $tlContinue ) {
467            $params['tlcontinue'] = $tlContinue;
468        }
469
470        return $params + [ 'tlnamespace' => NS_TEMPLATE, 'tllimit' => static::API_RESULT_LIMIT ];
471    }
472
473    /**
474     * Adds to params base the properties for getting Categories
475     */
476    private function addCategoriesToParams( array $params, string $clContinue = null ): array {
477        $params['prop'] .= ( $params['prop'] ) ? '|categories' : 'categories';
478
479        if ( $clContinue ) {
480            $params['clcontinue'] = $clContinue;
481        }
482
483        return $params + [ 'cllimit' => static::API_RESULT_LIMIT ];
484    }
485
486}