Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
89.74% |
70 / 78 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
MediaModerationFileLookup | |
89.74% |
70 / 78 |
|
75.00% |
6 / 8 |
26.73 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getFileSelectQueryBuilder | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
getTimestampFieldForTable | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
getSha1FieldForTable | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
getRowCountForTimestamp | |
76.92% |
10 / 13 |
|
0.00% |
0 / 1 |
3.11 | |||
performBatchQuery | |
66.67% |
10 / 15 |
|
0.00% |
0 / 1 |
2.15 | |||
getBatchOfFileRows | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
5 | |||
getFileObjectsForSha1 | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\MediaModeration\Services; |
4 | |
5 | use ArchivedFile; |
6 | use Generator; |
7 | use InvalidArgumentException; |
8 | use LocalFile; |
9 | use LocalRepo; |
10 | use MediaWiki\FileRepo\File\FileSelectQueryBuilder; |
11 | use Wikimedia\Rdbms\FakeResultWrapper; |
12 | use Wikimedia\Rdbms\IResultWrapper; |
13 | use Wikimedia\Rdbms\SelectQueryBuilder; |
14 | |
/**
 * Looks up rows from the 'image', 'oldimage' and 'filearchive' tables of the
 * local repo and turns the rows matching a SHA-1 into file objects.
 */
class MediaModerationFileLookup {

	public const TABLES_USED_FOR_LOOKUP = [
		'image',
		'oldimage',
		'filearchive',
	];

	/** Maps each supported table to the name of its upload timestamp field. */
	private const TIMESTAMP_FIELDS = [
		'image' => 'img_timestamp',
		'oldimage' => 'oi_timestamp',
		'filearchive' => 'fa_timestamp',
	];

	/** Maps each supported table to the name of its SHA-1 field. */
	private const SHA1_FIELDS = [
		'image' => 'img_sha1',
		'oldimage' => 'oi_sha1',
		'filearchive' => 'fa_sha1',
	];

	private LocalRepo $localRepo;
	private MediaModerationFileFactory $mediaModerationFileFactory;

	/**
	 * @param LocalRepo $localRepo Provides the replica DB connection used for every query in this class
	 * @param MediaModerationFileFactory $mediaModerationFileFactory Used to convert rows into file objects
	 */
	public function __construct(
		LocalRepo $localRepo,
		MediaModerationFileFactory $mediaModerationFileFactory
	) {
		$this->localRepo = $localRepo;
		$this->mediaModerationFileFactory = $mediaModerationFileFactory;
	}

	/**
	 * Gets the appropriate FileSelectQueryBuilder for the given image $table
	 *
	 * @param string $table One of 'image', 'oldimage', or 'filearchive'
	 * @return FileSelectQueryBuilder
	 * @throws InvalidArgumentException If an unrecognised $table is provided
	 */
	public function getFileSelectQueryBuilder( string $table ): FileSelectQueryBuilder {
		// The replica DB is only requested once the table name is known to be valid, so that
		// an unrecognised table fails without touching the repo.
		if ( $table === 'image' ) {
			return FileSelectQueryBuilder::newForFile( $this->localRepo->getReplicaDB() );
		}
		if ( $table === 'oldimage' ) {
			return FileSelectQueryBuilder::newForOldFile( $this->localRepo->getReplicaDB() );
		}
		if ( $table === 'filearchive' ) {
			return FileSelectQueryBuilder::newForArchivedFile( $this->localRepo->getReplicaDB() );
		}
		throw new InvalidArgumentException( "Unrecognised image table '$table'." );
	}

	/**
	 * Gets the timestamp field for the provided $table.
	 *
	 * @param string $table One of 'image', 'oldimage', or 'filearchive'
	 * @return string The timestamp field name
	 * @throws InvalidArgumentException If $table is not one of the three valid options.
	 */
	public function getTimestampFieldForTable( string $table ): string {
		if ( !isset( self::TIMESTAMP_FIELDS[$table] ) ) {
			throw new InvalidArgumentException( "Unrecognised image table '$table'." );
		}
		return self::TIMESTAMP_FIELDS[$table];
	}

	/**
	 * Gets the SHA-1 field for the provided $table.
	 *
	 * @param string $table One of 'image', 'oldimage', or 'filearchive'
	 * @return string The SHA-1 field name
	 * @throws InvalidArgumentException If $table is not one of the three valid options.
	 */
	private function getSha1FieldForTable( string $table ): string {
		if ( !isset( self::SHA1_FIELDS[$table] ) ) {
			throw new InvalidArgumentException( "Unrecognised image table '$table'." );
		}
		return self::SHA1_FIELDS[$table];
	}

	/**
	 * Returns the row count for rows that have a given
	 * timestamp and optionally a given SHA-1 value.
	 *
	 * Used to prevent issues with paging by timestamp
	 * when the row count being used to page is less
	 * than the row count of rows with a given timestamp.
	 *
	 * @param string $table The table to get the row count from (one of image, oldimage, or filearchive).
	 * @param string $timestamp The given timestamp in a TS_MW format. The count will only include rows
	 *   with this exact timestamp. If an empty (falsy) value is provided, no timestamp condition is added
	 *   and rows with any timestamp are counted.
	 * @param string|null $sha1 If provided, filter the count to only include rows with the given SHA-1 value.
	 *   To not filter by SHA-1, provide null.
	 * @return int
	 * @throws InvalidArgumentException If an unrecognised $table is provided
	 */
	public function getRowCountForTimestamp( string $table, string $timestamp, ?string $sha1 ): int {
		$fileSelectQueryBuilder = $this->getFileSelectQueryBuilder( $table )
			->clearFields()
			->field( 'COUNT(*)' );
		if ( $timestamp ) {
			$fileSelectQueryBuilder->where( [
				$this->getTimestampFieldForTable( $table ) => $this->localRepo->getReplicaDB()->timestamp( $timestamp ),
			] );
		}
		if ( $sha1 !== null ) {
			$fileSelectQueryBuilder->where( [
				$this->getSha1FieldForTable( $table ) => $sha1,
			] );
		}
		// Cast explicitly so that the declared int return type does not depend on implicit
		// coercion of whatever scalar the database layer hands back for COUNT(*).
		return (int)$fileSelectQueryBuilder->caller( __METHOD__ )
			->fetchField();
	}

	/**
	 * Actually performs the SELECT query to get a batch of rows from the given $table.
	 * Used by ::getBatchOfFileRows.
	 *
	 * @param string $table The table to get the batch from (one of image, oldimage, or filearchive).
	 * @param string $startTimestamp The timestamp which to start this batch at (cannot have been used for
	 *   a previous batch to prevent infinite loops). Provide the empty string to start with the newest timestamp.
	 * @param string $sha1 The SHA-1 which rows must have to be selected
	 * @param int $batchSize The maximum number of rows to select
	 *
	 * @return IResultWrapper
	 */
	protected function performBatchQuery(
		string $table, string $startTimestamp, string $sha1, int $batchSize
	): IResultWrapper {
		$timestampField = $this->getTimestampFieldForTable( $table );
		// Only select rows with the given $sha1
		$queryBuilder = $this->getFileSelectQueryBuilder( $table )
			->where( [
				$this->getSha1FieldForTable( $table ) => $sha1,
			] );
		if ( $startTimestamp ) {
			// Only select rows that have a timestamp equal to or under the $startTimestamp.
			$queryBuilder->where( $this->localRepo->getReplicaDB()->expr(
				$timestampField,
				'<=',
				$startTimestamp
			) );
		}
		// Select at most $batchSize rows, newest first.
		return $queryBuilder
			->orderBy( $timestampField, SelectQueryBuilder::SORT_DESC )
			->limit( $batchSize )
			->caller( __METHOD__ )
			->fetchResultSet();
	}

	/**
	 * Returns a batch of rows from $table. The batches will go from largest timestamp to smallest timestamp,
	 * and are constructed such to not return the same row in more than one batch.
	 *
	 * @param string $table The table to get the batch from (one of image, oldimage, or filearchive).
	 * @param string $startTimestamp The timestamp which to start this batch at (cannot have been used for
	 *   a previous batch to prevent infinite loops). Provide the empty string to start with the newest timestamp.
	 * @param string $sha1 The SHA-1 which rows must have to be selected
	 * @param int $batchSize The maximum number of rows return in the batch
	 *
	 * @return array First item being the IResultWrapper and the second being the value for $startTimestamp of the
	 *   next batch, or false if there are no further batches.
	 */
	protected function getBatchOfFileRows(
		string $table, string $startTimestamp, string $sha1, int $batchSize
	): array {
		$timestampField = $this->getTimestampFieldForTable( $table );
		// Check that rows with the $sha1 and $startTimestamp do not exceed $batchSize. Otherwise, raise the $batchSize
		// to prevent infinite loops.
		// This is only done when a start timestamp is set: without one the count is not filtered by
		// timestamp, so it would be the total number of rows with this SHA-1 and the first batch would
		// be enlarged to hold every row. The first batch cannot loop, because the timestamp it hands to
		// the next batch is always non-empty and is then subject to this check.
		if ( $startTimestamp ) {
			$rowsWithStartTimestamp = $this->getRowCountForTimestamp( $table, $startTimestamp, $sha1 );
			if ( $rowsWithStartTimestamp > $batchSize ) {
				// Increase the batch size to account for this, as if not the next batch would start with
				// the same rows causing an infinite loop.
				$batchSize = $rowsWithStartTimestamp;
			}
		}
		// Get the batch which contains $batchSize + 1 rows. The added row is to ensure proper paging.
		$rowLimit = $batchSize + 1;
		$resultWrapper = $this->performBatchQuery( $table, $startTimestamp, $sha1, $rowLimit );
		$rowCount = $resultWrapper->count();
		if ( $rowCount < $rowLimit ) {
			// If the row count returned is less than the number of rows asked for, then just return the result
			// set along with the indication that no more batches can be found.
			return [ $resultWrapper, false ];
		}
		// Get the smallest timestamp in this batch, which is on the last row as the rows are sorted
		// by timestamp descending.
		$resultWrapper->seek( $rowCount - 1 );
		$timestampToRemoveFromBatch = $resultWrapper->fetchRow()[$timestampField];
		// Rewind the results wrapper to the first row.
		$resultWrapper->rewind();
		// To ensure that a given file is only present within one batch, the batches must contain upload timestamp
		// values which are not present in any other batch. This is because the upload timestamp is the way to separate
		// the results when batching, and starting a new batch with a timestamp used in a previous batch would mean
		// some of the files are listed twice.
		// Removing the rows with the smallest timestamp addresses this by ensuring all the files with that
		// timestamp are in the next batch.
		$resultsToReturn = [];
		foreach ( $resultWrapper as $row ) {
			$rowAsArray = (array)$row;
			if ( $rowAsArray[$timestampField] !== $timestampToRemoveFromBatch ) {
				$resultsToReturn[] = $row;
			}
		}
		// Return the modified results.
		return [ new FakeResultWrapper( $resultsToReturn ), $timestampToRemoveFromBatch ];
	}

	/**
	 * Gets LocalFile, OldFile, and ArchivedFile objects with a given SHA-1 from the
	 * local wiki.
	 *
	 * This method generates these entries instead of returning all objects as
	 * only one object is usually needed for the purposes of scanning. In the
	 * case of the image not having a thumbnail or otherwise having a problem,
	 * further images may be needed.
	 *
	 * The order in which these objects are generated is the order of the tables
	 * in self::TABLES_USED_FOR_LOOKUP and then their upload timestamp starting
	 * with the newest file.
	 *
	 * @param string $sha1 The SHA-1 used for lookup
	 * @param int $batchSize The number of files to select per select query. Increase this
	 *   number if you intend to use all the files returned by this query.
	 * @return Generator<LocalFile|ArchivedFile>
	 */
	public function getFileObjectsForSha1( string $sha1, int $batchSize = 5 ): Generator {
		// Process each image table in the order defined in ::TABLES_USED_FOR_LOOKUP
		foreach ( self::TABLES_USED_FOR_LOOKUP as $table ) {
			// Lookup rows from the image $table where the SHA-1 for that row is
			// the same as in $sha1. Order these by upload timestamp and limit
			// each batch selected from the DB to $batchSize rows.
			// The empty string makes the first batch start at the newest timestamp.
			$startTimestampForBatch = '';
			do {
				[ $batch, $startTimestampForBatch ] = $this->getBatchOfFileRows(
					$table,
					$startTimestampForBatch,
					$sha1,
					$batchSize
				);
				foreach ( $batch as $row ) {
					// Yield the row as a LocalFile or ArchivedFile object.
					yield $this->mediaModerationFileFactory->getFileObjectForRow( $row, $table );
				}
				// ::getBatchOfFileRows returns false as the next timestamp once the table is exhausted.
			} while ( $startTimestampForBatch );
		}
	}
}