Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.24% |
179 / 186 |
|
90.00% |
9 / 10 |
CRAP | |
0.00% |
0 / 1 |
ImportExistingFilesToScanTable | |
99.44% |
179 / 180 |
|
90.00% |
9 / 10 |
32 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
1 | |||
getUpdateKey | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doDBUpdates | |
97.92% |
47 / 48 |
|
0.00% |
0 / 1 |
7 | |||
generateDBUpdatesReturnValue | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
getTablesToProcess | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
getEstimatedNumberOfBatchesForTable | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
4 | |||
getTemporaryBatchSize | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
2 | |||
getFileSelectQueryBuilder | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 | |||
getRowsForBatch | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
3 | |||
performBatch | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\MediaModeration\Maintenance; |
4 | |
5 | use LoggedUpdateMaintenance; |
6 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationDatabaseLookup; |
7 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileFactory; |
8 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileLookup; |
9 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileProcessor; |
10 | use MediaWiki\FileRepo\File\FileSelectQueryBuilder; |
11 | use Wikimedia\Rdbms\IReadableDatabase; |
12 | use Wikimedia\Rdbms\SelectQueryBuilder; |
13 | |
// Resolve the MediaWiki installation path: use the MW_INSTALL_PATH environment
// variable when it is set, otherwise fall back to three directories above this file.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP !== false ) ? $IP : __DIR__ . '/../../..';
// Load the maintenance script framework from the resolved installation path.
require_once $IP . '/maintenance/Maintenance.php';
19 | |
/**
 * Maintenance script that populates the mediamoderation_scan table with the
 * files that already exist on the wiki.
 *
 * Files are read from each table listed in self::TABLES_TO_IMPORT_FROM in
 * batches ordered by timestamp (oldest first). Each file that has a SHA-1 and
 * is not yet present in the scan table is inserted via MediaModerationFileProcessor.
 */
class ImportExistingFilesToScanTable extends LoggedUpdateMaintenance {

	/** @var string[] The DB tables that have images to be imported to mediamoderation_scan. */
	public const TABLES_TO_IMPORT_FROM = [
		'image',
		'filearchive',
		'oldimage',
	];

	/** @var int The batch size used when ::getBatchSize returns null. */
	private const DEFAULT_BATCH_SIZE = 200;

	private IReadableDatabase $dbr;
	private MediaModerationFileProcessor $mediaModerationFileProcessor;
	private MediaModerationDatabaseLookup $mediaModerationDatabaseLookup;
	private MediaModerationFileFactory $mediaModerationFileFactory;
	private MediaModerationFileLookup $mediaModerationFileLookup;

	/**
	 * Registers the description and the command line options of the script
	 * ('sleep', 'start-timestamp', 'table' and 'mark-complete').
	 */
	public function __construct() {
		parent::__construct();
		$this->requireExtension( 'MediaModeration' );
		$this->addDescription( 'Populates the mediamoderation_scan table with existing images from the wiki.' );
		$this->addOption(
			'sleep',
			'Sleep time (in seconds) between every batch. Default: 1',
			false,
			true
		);
		$this->addOption(
			'start-timestamp',
			'The timestamp which to start importing files from. Default is for no timestamp start point ' .
			'(which means importing all images)',
			false,
			true
		);
		$this->addOption(
			'table',
			'Allows specifying which table(s) files should be imported from. Default is all supported tables.',
			false,
			true,
			false,
			true
		);
		$this->addOption(
			'mark-complete',
			'Allows controlling whether this script should be considered completely run for the purposes ' .
			'of the updatelog. If provided the script will be marked as complete. ' .
			"Default is to consider the script completely run if the 'table' and 'start-timestamp' options were left " .
			'as the default and the script does not error out.',
		);
	}

	/** @inheritDoc */
	protected function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Imports the files from every requested table, batch by batch, and then
	 * reports whether the run counts as complete.
	 *
	 * @inheritDoc
	 * @return bool False if the 'table' option was invalid; otherwise the value
	 *   of ::generateDBUpdatesReturnValue.
	 */
	protected function doDBUpdates() {
		$services = $this->getServiceContainer();
		$loadBalancerFactory = $services->getDBLoadBalancerFactory();
		$this->dbr = $loadBalancerFactory->getReplicaDatabase();
		$this->mediaModerationFileFactory = $services->get( 'MediaModerationFileFactory' );
		$this->mediaModerationFileProcessor = $services->get( 'MediaModerationFileProcessor' );
		$this->mediaModerationDatabaseLookup = $services->get( 'MediaModerationDatabaseLookup' );
		$this->mediaModerationFileLookup = $services->get( 'MediaModerationFileLookup' );

		// Get the list of tables to import images from.
		$tablesToProcess = $this->getTablesToProcess();
		if ( $tablesToProcess === false ) {
			// If the value is false, then return false
			// as this indicates the script did not run.
			return false;
		}

		// The pause between batches does not change during a run, so read the option once.
		$sleepSeconds = intval( $this->getOption( 'sleep', 1 ) );

		foreach ( $tablesToProcess as $table ) {
			$batchSize = $this->getEffectiveBatchSize();
			$this->output( "Now importing rows from the table '$table' in batches of $batchSize.\n" );
			// Every table starts again from the user supplied start timestamp (if any).
			$previousBatchFinalTimestamp = $this->getOption( 'start-timestamp', '' );
			if ( $previousBatchFinalTimestamp ) {
				$this->output(
					"Starting from timestamp $previousBatchFinalTimestamp and importing files with a " .
					"greater timestamp.\n"
				);
			}
			$expectedNumberOfBatches = $this->getEstimatedNumberOfBatchesForTable(
				$table, $previousBatchFinalTimestamp
			);
			$batchNo = 1;
			do {
				$outputString = "Batch $batchNo of ~$expectedNumberOfBatches";
				if ( $previousBatchFinalTimestamp ) {
					$outputString .= " with rows starting at timestamp $previousBatchFinalTimestamp";
				}
				$this->output( $outputString . ".\n" );
				[
					$filesLeft,
					$previousBatchFinalTimestamp
				] = $this->performBatch( $table, $previousBatchFinalTimestamp );
				sleep( $sleepSeconds );
				$loadBalancerFactory->waitForReplication();
				$batchNo += 1;
			} while ( $filesLeft );
		}
		$returnValue = $this->generateDBUpdatesReturnValue();
		if ( $this->hasOption( 'force' ) ) {
			// If the script was run with the force option, then don't
			// print out about how the script has been or has not been
			// marked as completed.
			return $returnValue;
		}
		if ( $returnValue ) {
			$this->output( "Script marked as completed (added to updatelog).\n" );
		} else {
			$this->output(
				'Script not marked as completed (not added to updatelog). The script was marked as not complete ' .
				"because not all the images on the wiki were processed in this run of the script.\n" .
				'To mark the script as complete and not have it run again through update.php, make sure to run the ' .
				"script again with the 'mark-complete' option specified. You should only do this once you are sure " .
				"that all the images on the wiki have been imported.\n"
			);
		}
		return $returnValue;
	}

	/**
	 * Return the value that should be used as the return value
	 * of ::doDBUpdates. This value depends on the options
	 * passed to the script.
	 *
	 * If true is returned, it should only be done if the script
	 * either has imported all images or the caller of the maintenance
	 * script has specifically intended for the script to be marked as
	 * complete.
	 *
	 * @return bool
	 */
	protected function generateDBUpdatesReturnValue(): bool {
		// An explicit request to mark the script as complete always wins.
		if ( $this->hasOption( 'mark-complete' ) ) {
			return true;
		}
		// A start timestamp means that older files may have been skipped.
		if ( $this->getOption( 'start-timestamp', '' ) !== '' ) {
			return false;
		}
		// The run is complete only if every supported table was requested. Check
		// membership rather than array equality, so that the order in which the
		// tables were given (or a repeated table) does not affect the result.
		$requestedTables = (array)$this->getOption( 'table', self::TABLES_TO_IMPORT_FROM );
		return array_diff( self::TABLES_TO_IMPORT_FROM, $requestedTables ) === [];
	}

	/**
	 * Processes the user supplied list of tables to process,
	 * with the default being all supported tables.
	 *
	 * Prints an error if the supplied arguments are invalid.
	 *
	 * @return false|array The list of tables to process, or false if the list was not valid.
	 */
	protected function getTablesToProcess() {
		$tablesToProcess = $this->getOption( 'table', self::TABLES_TO_IMPORT_FROM );
		if ( !count( $tablesToProcess ) ) {
			$this->error( "The array of tables to have images imported from cannot be empty.\n" );
			return false;
		}
		foreach ( $tablesToProcess as $table ) {
			// Strict comparison: only the exact table names are acceptable.
			if ( !in_array( $table, self::TABLES_TO_IMPORT_FROM, true ) ) {
				$this->error( "The table option value '$table' is not a valid table to import images from.\n" );
				return false;
			}
		}
		return $tablesToProcess;
	}

	/**
	 * Returns the batch size to use for a normal batch: the value of
	 * ::getBatchSize, or self::DEFAULT_BATCH_SIZE if that value is null.
	 *
	 * @return int
	 */
	private function getEffectiveBatchSize(): int {
		return $this->getBatchSize() ?? self::DEFAULT_BATCH_SIZE;
	}

	/**
	 * Gets the expected number of batches needed to process a table.
	 * This is used just for visual display and the actual number of batches
	 * may be higher or lower.
	 *
	 * @param string $table
	 * @param string $startTimestamp The timestamp that the processing will start from.
	 * @return int
	 */
	protected function getEstimatedNumberOfBatchesForTable( string $table, string $startTimestamp ): int {
		// Get the row count for the $table.
		$queryBuilder = $this->dbr->newSelectQueryBuilder()
			->select( 'COUNT(*)' )
			->from( $table );
		if ( $startTimestamp ) {
			$queryBuilder->where( $this->dbr->expr(
				$this->mediaModerationFileLookup->getTimestampFieldForTable( $table ),
				'>=',
				$this->dbr->timestamp( $startTimestamp )
			) );
		}
		$rowCountInTable = (int)$queryBuilder
			->caller( __METHOD__ )
			->fetchField();
		// If the row count is zero, then one batch will be performed.
		if ( $rowCountInTable === 0 ) {
			return 1;
		}
		// The expected batch count is the row count divided by the batch size,
		// rounded up, plus one extra (empty) batch when the batch size divides
		// the row count exactly. Both cases equal floor( rows / batchSize ) + 1,
		// which is computed here with integer arithmetic. This may differ from
		// the actual batch count, as the batch size may be temporarily increased
		// to prevent infinite loops.
		return intdiv( $rowCountInTable, $this->getEffectiveBatchSize() ) + 1;
	}

	/**
	 * Gets the temporary batch size for use by ::getFileSelectQueryBuilder when
	 * it is called with $shouldRaiseBatchSize as the boolean 'true'.
	 *
	 * If the number of files with the $previousBatchFinalTimestamp is at least
	 * the normal batch size, this is one more than that number of files.
	 * Otherwise the normal batch size is returned.
	 *
	 * @param FileSelectQueryBuilder $fileSelectQueryBuilder A clone of the FileSelectQueryBuilder that is being
	 *   built in ::getFileSelectQueryBuilder. This needs to be a clone, because this method modifies
	 *   the query builder it is given.
	 * @param string $timestampField
	 * @param string $previousBatchFinalTimestamp
	 * @return int
	 */
	protected function getTemporaryBatchSize(
		FileSelectQueryBuilder $fileSelectQueryBuilder, string $timestampField, string $previousBatchFinalTimestamp
	): int {
		$filesWithTheCutoffTimestamp = (int)$fileSelectQueryBuilder
			->clearFields()
			->field( 'COUNT(*)' )
			->where( $this->dbr->expr( $timestampField, '=', $this->dbr->timestamp( $previousBatchFinalTimestamp ) ) )
			->caller( __METHOD__ )
			->fetchField();
		$normalBatchSize = $this->getEffectiveBatchSize();
		// Only raise the batch size if the files sharing the cutoff timestamp would
		// fill a normal batch; otherwise the normal batch size is already sufficient.
		if ( $filesWithTheCutoffTimestamp >= $normalBatchSize ) {
			$batchSize = $filesWithTheCutoffTimestamp + 1;
			$this->output(
				"Temporarily raised the batch size to $batchSize due to files with the same upload timestamp. " .
				"This is done to prevent an infinite loop. Consider raising the batch size to avoid this.\n"
			);
			return $batchSize;
		}
		return $normalBatchSize;
	}

	/**
	 * Gets the appropriate FileSelectQueryBuilder for the $table and
	 * applies the WHERE conditions, ORDER BY and LIMIT.
	 *
	 * @param string $table The table name currently being processed.
	 * @param string $previousBatchFinalTimestamp The timestamp which the last batch stopped at. This
	 *   is used to filter for files with this timestamp or a newer timestamp.
	 * @param bool $shouldRaiseBatchSize Used to indicate that the previous batch ended and started on
	 *   the same timestamp, so this batch should reattempt that timestamp
	 *   but with a temporarily raised batch size to account for this.
	 * @return FileSelectQueryBuilder
	 */
	protected function getFileSelectQueryBuilder(
		string $table, string $previousBatchFinalTimestamp, bool $shouldRaiseBatchSize
	): FileSelectQueryBuilder {
		// Get the appropriate FileSelectQueryBuilder using MediaModerationFileLookup::getFileSelectQueryBuilder
		$fileSelectQueryBuilder = $this->mediaModerationFileLookup->getFileSelectQueryBuilder( $table );
		$timestampField = $this->mediaModerationFileLookup->getTimestampFieldForTable( $table );
		$batchSize = $this->getEffectiveBatchSize();
		if ( $shouldRaiseBatchSize ) {
			// If the previous batch started and ended on the same timestamp,
			// then temporarily raise the batch count to 1 more than the number
			// of files with this timestamp to avoid an infinite loop.
			// A clone is passed as ::getTemporaryBatchSize modifies the builder.
			$batchSize = $this->getTemporaryBatchSize(
				clone $fileSelectQueryBuilder,
				$timestampField,
				$previousBatchFinalTimestamp
			);
		}
		// If the timestamp is not empty, filter for entries with a timestamp
		// equal to or greater than the cutoff timestamp.
		if ( $previousBatchFinalTimestamp ) {
			$fileSelectQueryBuilder
				->where( $this->dbr->expr(
					$timestampField, '>=', $this->dbr->timestamp( $previousBatchFinalTimestamp ) ) );
		}
		// Order by the timestamp (oldest to newest) and set the limit as the batch size.
		$fileSelectQueryBuilder
			->orderBy( $timestampField, SelectQueryBuilder::SORT_ASC )
			->limit( $batchSize );
		return $fileSelectQueryBuilder;
	}

	/**
	 * Gets the rows for a batch along with the timestamp for the last file in the batch.
	 *
	 * @param string $table
	 * @param string $previousBatchFinalTimestamp
	 * @return array The rows for the batch, the timestamp for the last file in the results list, and the
	 *   LIMIT used for the batch.
	 */
	protected function getRowsForBatch( string $table, string $previousBatchFinalTimestamp ): array {
		// Get the FileSelectQueryBuilder with everything but the caller specified.
		$fileSelectQueryBuilder = $this->getFileSelectQueryBuilder(
			$table, $previousBatchFinalTimestamp, false
		);
		// Specify the caller and then get the rows from the DB.
		$rows = $fileSelectQueryBuilder
			->caller( __METHOD__ )
			->fetchResultSet();
		// If no rows were found, the timestamp passed in is returned unchanged.
		$lastFileTimestamp = $previousBatchFinalTimestamp;
		if ( $rows->numRows() ) {
			// Look at the last (newest) file in this batch.
			$rows->seek( $rows->numRows() - 1 );
			$lastFileObject = $this->mediaModerationFileFactory->getFileObjectForRow( $rows->fetchObject(), $table );
			// Check whether the last file in this batch has the same timestamp as in
			// $previousBatchFinalTimestamp. If so, then the batch made no progress and
			// repeating it would process the same files over and over again.
			// NOTE(review): this is a strict string comparison, so it presumably only
			// matches when both timestamps use the same format - confirm.
			if ( $previousBatchFinalTimestamp === $lastFileObject->getTimestamp() ) {
				// Run the query for this batch again, this time with a
				// temporarily raised batch size, and use those rows instead.
				$fileSelectQueryBuilder = $this->getFileSelectQueryBuilder(
					$table, $previousBatchFinalTimestamp, true
				);
				$rows = $fileSelectQueryBuilder
					->caller( __METHOD__ )
					->fetchResultSet();
				$rows->seek( $rows->numRows() - 1 );
				$lastFileObject = $this->mediaModerationFileFactory->getFileObjectForRow(
					$rows->fetchObject(), $table
				);
			}
			// Store the timestamp for the last file, and return it to the caller later in this method.
			$lastFileTimestamp = $lastFileObject->getTimestamp();
			// Reset the position of the pointer for the caller to be able to use a foreach loop on $rows.
			$rows->rewind();
		}
		// The LIMIT is read back from the query builder that produced $rows, so
		// that it reflects a temporarily raised batch size.
		return [ $rows, $lastFileTimestamp, $fileSelectQueryBuilder->getQueryInfo()['options']['LIMIT'] ];
	}

	/**
	 * Perform a batch of imports to the mediamoderation_scan table from the $table
	 * starting at the $previousBatchFinalTimestamp going towards newer files.
	 *
	 * @param string $table
	 * @param string $previousBatchFinalTimestamp
	 * @return array First value is whether another batch should be run, second value is the new value of
	 *   $previousBatchFinalTimestamp.
	 */
	protected function performBatch( string $table, string $previousBatchFinalTimestamp ): array {
		[ $rows, $lastFileTimestamp, $batchSizeUsedForBatch ] = $this->getRowsForBatch(
			$table, $previousBatchFinalTimestamp
		);
		foreach ( $rows as $row ) {
			// Get the File or ArchivedFile object for this $row.
			$fileObject = $this->mediaModerationFileFactory->getFileObjectForRow( $row, $table );
			// Exclude any file that has a SHA-1 value that is false or empty.
			// This can happen in some filearchive rows where the image no
			// longer exists.
			// Also check if the SHA-1 exists in the scan table using a replica DB
			// before attempting to insert the file to reduce the number of
			// unnecessary reads on the primary DB.
			if (
				$fileObject->getSha1() &&
				!$this->mediaModerationDatabaseLookup->fileExistsInScanTable( $fileObject )
			) {
				$this->mediaModerationFileProcessor->insertFile( $fileObject );
			}
		}
		// Return false as the first item of the array if the number of rows processed
		// was less than the batch size. This will happen when there are no more images
		// to process.
		return [
			$rows->count() >= $batchSizeUsedForBatch,
			$lastFileTimestamp
		];
	}
}
394 | |
// Name the maintenance class defined in this file, then include the file
// referenced by the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably it is
// defined by the Maintenance.php file required above and runs $maintClass - confirm.
$maintClass = ImportExistingFilesToScanTable::class;
require_once RUN_MAINTENANCE_IF_MAIN;