Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.76% |
179 / 185 |
|
100.00% |
10 / 10 |
CRAP | |
100.00% |
1 / 1 |
ImportExistingFilesToScanTable | |
100.00% |
179 / 179 |
|
100.00% |
10 / 10 |
32 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
1 | |||
getUpdateKey | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doDBUpdates | |
100.00% |
47 / 47 |
|
100.00% |
1 / 1 |
7 | |||
generateDBUpdatesReturnValue | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
getTablesToProcess | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
getEstimatedNumberOfBatchesForTable | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
4 | |||
getTemporaryBatchSize | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
2 | |||
getFileSelectQueryBuilder | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 | |||
getRowsForBatch | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
3 | |||
performBatch | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\MediaModeration\Maintenance; |
4 | |
5 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationDatabaseLookup; |
6 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileFactory; |
7 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileLookup; |
8 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileProcessor; |
9 | use MediaWiki\FileRepo\File\FileSelectQueryBuilder; |
10 | use MediaWiki\Maintenance\LoggedUpdateMaintenance; |
11 | use Wikimedia\Rdbms\IReadableDatabase; |
12 | use Wikimedia\Rdbms\SelectQueryBuilder; |
13 | |
14 | $IP = getenv( 'MW_INSTALL_PATH' ); |
15 | if ( $IP === false ) { |
16 | $IP = __DIR__ . '/../../..'; |
17 | } |
18 | require_once "$IP/maintenance/Maintenance.php"; |
19 | |
/**
 * Maintenance script that populates the mediamoderation_scan table with the
 * files referenced by the tables listed in self::TABLES_TO_IMPORT_FROM.
 *
 * Rows are read in batches ordered by their timestamp field (oldest first).
 * Each batch starts at the timestamp that the previous batch ended on.
 */
class ImportExistingFilesToScanTable extends LoggedUpdateMaintenance {

	/** @var string[] The DB tables that have images to be imported to mediamoderation_scan. */
	public const TABLES_TO_IMPORT_FROM = [
		'image',
		'filearchive',
		'oldimage',
	];

	/** @var int The batch size used when no usable batch size has been set on the script. */
	private const DEFAULT_BATCH_SIZE = 200;

	private IReadableDatabase $dbr;
	private MediaModerationFileProcessor $mediaModerationFileProcessor;
	private MediaModerationDatabaseLookup $mediaModerationDatabaseLookup;
	private MediaModerationFileFactory $mediaModerationFileFactory;
	private MediaModerationFileLookup $mediaModerationFileLookup;

	/**
	 * Registers the description and the command line options of the script.
	 */
	public function __construct() {
		parent::__construct();
		$this->requireExtension( 'MediaModeration' );
		$this->addDescription( 'Populates the mediamoderation_scan table with existing images from the wiki.' );
		$this->addOption(
			'sleep',
			'Sleep time (in seconds) between every batch. Default: 1',
			false,
			true
		);
		$this->addOption(
			'start-timestamp',
			'The timestamp which to start importing files from. Default is for no timestamp start point ' .
			'(which means importing all images)',
			false,
			true
		);
		$this->addOption(
			'table',
			'Allows specifying which table(s) files should be imported from. Default is all supported tables.',
			false,
			true,
			false,
			true
		);
		$this->addOption(
			'mark-complete',
			'Allows controlling whether this script should be considered completely run for the purposes ' .
			'of the updatelog. If provided the script will be marked as complete. ' .
			"Default is to consider the script completely run if the 'table' and 'start-timestamp' options were left " .
			'as the default and the script does not error out.'
		);
	}

	/** @inheritDoc */
	protected function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Imports the files from every requested table, batch by batch, and then
	 * reports whether the run counts as complete.
	 *
	 * @inheritDoc
	 */
	protected function doDBUpdates() {
		$services = $this->getServiceContainer();
		$this->dbr = $this->getReplicaDB();
		$this->mediaModerationFileFactory = $services->get( 'MediaModerationFileFactory' );
		$this->mediaModerationFileProcessor = $services->get( 'MediaModerationFileProcessor' );
		$this->mediaModerationDatabaseLookup = $services->get( 'MediaModerationDatabaseLookup' );
		$this->mediaModerationFileLookup = $services->get( 'MediaModerationFileLookup' );

		// Get the list of tables to import images from.
		$tablesToProcess = $this->getTablesToProcess();
		if ( $tablesToProcess === false ) {
			// The supplied list was invalid (an error has already been printed),
			// so return false to indicate that the script did not run.
			return false;
		}

		// These values do not change while the script runs, so read them once.
		$batchSize = $this->getEffectiveBatchSize();
		$startTimestamp = (string)$this->getOption( 'start-timestamp', '' );
		// sleep() throws a ValueError for negative values on PHP 8, so clamp at zero.
		$sleepSeconds = max( 0, intval( $this->getOption( 'sleep', 1 ) ) );

		foreach ( $tablesToProcess as $table ) {
			$this->output( "Now importing rows from the table '$table' in batches of $batchSize.\n" );
			// Every table starts again from the user supplied start timestamp.
			$previousBatchFinalTimestamp = $startTimestamp;
			if ( $previousBatchFinalTimestamp ) {
				$this->output(
					"Starting from timestamp $previousBatchFinalTimestamp and importing files with a " .
					"greater timestamp.\n"
				);
			}
			$expectedNumberOfBatches = $this->getEstimatedNumberOfBatchesForTable(
				$table, $previousBatchFinalTimestamp
			);
			$batchNo = 1;
			do {
				$outputString = "Batch $batchNo of ~$expectedNumberOfBatches";
				if ( $previousBatchFinalTimestamp ) {
					$outputString .= " with rows starting at timestamp $previousBatchFinalTimestamp";
				}
				$this->output( $outputString . ".\n" );
				[
					$filesLeft,
					$previousBatchFinalTimestamp
				] = $this->performBatch( $table, $previousBatchFinalTimestamp );
				sleep( $sleepSeconds );
				$this->waitForReplication();
				$batchNo += 1;
			} while ( $filesLeft );
		}

		$returnValue = $this->generateDBUpdatesReturnValue();
		if ( $this->hasOption( 'force' ) ) {
			// If the script was run with the force option, then don't
			// print out about how the script has been or has not been
			// marked as completed.
			return $returnValue;
		}
		if ( $returnValue ) {
			$this->output( "Script marked as completed (added to updatelog).\n" );
		} else {
			$this->output(
				'Script not marked as completed (not added to updatelog). The script was marked as not complete ' .
				"because not all the images on the wiki were processed in this run of the script.\n" .
				'To mark the script as complete and not have it run again through update.php, make sure to run the ' .
				"script again with the 'mark-complete' option specified. You should only do this once you are sure " .
				"that all the images on the wiki have been imported.\n"
			);
		}
		return $returnValue;
	}

	/**
	 * Gets the batch size to use for this run.
	 *
	 * Falls back to self::DEFAULT_BATCH_SIZE when no batch size has been set or
	 * when the configured value is not a positive number. A batch size of zero
	 * would otherwise cause a division by zero when estimating the number of
	 * batches, and the "rows found >= LIMIT" check in ::performBatch could
	 * never become false.
	 *
	 * @return int A batch size that is always at least 1.
	 */
	private function getEffectiveBatchSize(): int {
		$batchSize = (int)( $this->getBatchSize() ?? self::DEFAULT_BATCH_SIZE );
		return $batchSize > 0 ? $batchSize : self::DEFAULT_BATCH_SIZE;
	}

	/**
	 * Return the value that should be used as the return value
	 * of ::doDBUpdates. This value depends on the options
	 * passed to the script.
	 *
	 * If true is returned, it should only be done if the script
	 * either has imported all images or the caller of the maintenance
	 * script has specifically intended for the script to be marked as
	 * complete.
	 *
	 * @return bool
	 */
	protected function generateDBUpdatesReturnValue(): bool {
		// An explicit request to mark the script as complete always wins.
		if ( $this->hasOption( 'mark-complete' ) ) {
			return true;
		}
		// A start timestamp means that older files were skipped.
		if ( $this->getOption( 'start-timestamp', '' ) !== '' ) {
			return false;
		}
		// The run is complete if every supported table was requested. This is
		// checked as a set, so neither the order in which the tables were
		// given nor duplicate values affect the result.
		$requestedTables = (array)$this->getOption( 'table', self::TABLES_TO_IMPORT_FROM );
		return array_diff( self::TABLES_TO_IMPORT_FROM, $requestedTables ) === [];
	}

	/**
	 * Processes the user supplied list of tables to process,
	 * with the default being all supported tables.
	 *
	 * Prints an error if the supplied arguments are invalid.
	 *
	 * @return false|array The list of tables to process (without duplicates), or
	 *   false if the list was not valid.
	 */
	protected function getTablesToProcess() {
		$tablesToProcess = (array)$this->getOption( 'table', self::TABLES_TO_IMPORT_FROM );
		if ( !count( $tablesToProcess ) ) {
			$this->error( "The array of tables to have images imported from cannot be empty.\n" );
			return false;
		}
		foreach ( $tablesToProcess as $table ) {
			// Strict comparison, so that a non-string value can never be
			// loosely matched against one of the table names.
			if ( !in_array( $table, self::TABLES_TO_IMPORT_FROM, true ) ) {
				$this->error( "The table option value '$table' is not a valid table to import images from.\n" );
				return false;
			}
		}
		// Drop repeated values so that no table is imported more than once.
		return array_values( array_unique( $tablesToProcess ) );
	}

	/**
	 * Gets the expected number of batches needed to process a table.
	 * This is used just for visual display and the actual number of batches
	 * may be higher or lower.
	 *
	 * @param string $table
	 * @param string $startTimestamp The timestamp that the processing will start from.
	 * @return int
	 */
	protected function getEstimatedNumberOfBatchesForTable( string $table, string $startTimestamp ): int {
		// Get the row count for the $table.
		$queryBuilder = $this->dbr->newSelectQueryBuilder()
			->select( 'COUNT(*)' )
			->from( $table );
		if ( $startTimestamp ) {
			$queryBuilder->where( $this->dbr->expr(
				$this->mediaModerationFileLookup->getTimestampFieldForTable( $table ),
				'>=',
				$this->dbr->timestamp( $startTimestamp )
			) );
		}
		$rowCountInTable = (int)$queryBuilder
			->caller( __METHOD__ )
			->fetchField();
		// The number of full batches plus one more batch. The extra batch is either
		// the final partially filled batch or, when the batch size divides the row
		// count without a remainder (which includes a row count of zero), the
		// closing batch that finds fewer rows than the batch size. This may differ
		// from the actual batch count, as the batch size can be temporarily
		// increased to prevent infinite loops.
		return intdiv( max( 0, $rowCountInTable ), $this->getEffectiveBatchSize() ) + 1;
	}

	/**
	 * Gets the temporary batch size for use by ::getFileSelectQueryBuilder if it was
	 * called with $shouldRaiseBatchSize as the boolean 'true'. This is one more than
	 * the number of files with the $previousBatchFinalTimestamp.
	 *
	 * @param FileSelectQueryBuilder $fileSelectQueryBuilder A clone of the FileSelectQueryBuilder that
	 *   is being built in ::getFileSelectQueryBuilder. It has to be a clone because this method
	 *   replaces the fields of the query builder and adds a condition to it.
	 * @param string $timestampField
	 * @param string $previousBatchFinalTimestamp
	 * @return int
	 */
	protected function getTemporaryBatchSize(
		FileSelectQueryBuilder $fileSelectQueryBuilder, string $timestampField, string $previousBatchFinalTimestamp
	): int {
		$normalBatchSize = $this->getEffectiveBatchSize();
		$filesWithTheCutoffTimestamp = (int)$fileSelectQueryBuilder
			->clearFields()
			->field( 'COUNT(*)' )
			->where( $this->dbr->expr( $timestampField, '=', $this->dbr->timestamp( $previousBatchFinalTimestamp ) ) )
			->caller( __METHOD__ )
			->fetchField();
		// Sanity check that the new batch size would actually be larger (otherwise
		// leave the batch size as is as it will be fine).
		if ( $filesWithTheCutoffTimestamp < $normalBatchSize ) {
			return $normalBatchSize;
		}
		$batchSize = $filesWithTheCutoffTimestamp + 1;
		$this->output(
			"Temporarily raised the batch size to $batchSize due to files with the same upload timestamp. " .
			"This is done to prevent an infinite loop. Consider raising the batch size to avoid this.\n"
		);
		return $batchSize;
	}

	/**
	 * Gets the appropriate FileSelectQueryBuilder for the $table and
	 * applies the WHERE conditions, ORDER BY and LIMIT.
	 *
	 * @param string $table The table name currently being processed.
	 * @param string $previousBatchFinalTimestamp The timestamp which the last batch stopped at. This
	 *   is used to filter for files with this timestamp or a newer timestamp.
	 * @param bool $shouldRaiseBatchSize Used to indicate that the batch ended on the same timestamp
	 *   that it started on, so the batch should be reattempted for that timestamp
	 *   but with a temporarily raised batch size to account for this.
	 * @return FileSelectQueryBuilder
	 */
	protected function getFileSelectQueryBuilder(
		string $table, string $previousBatchFinalTimestamp, bool $shouldRaiseBatchSize
	): FileSelectQueryBuilder {
		// Get the appropriate FileSelectQueryBuilder using MediaModerationFileLookup::getFileSelectQueryBuilder
		$fileSelectQueryBuilder = $this->mediaModerationFileLookup->getFileSelectQueryBuilder( $table );
		$timestampField = $this->mediaModerationFileLookup->getTimestampFieldForTable( $table );
		$batchSize = $this->getEffectiveBatchSize();
		if ( $shouldRaiseBatchSize ) {
			// If the batch started and ended on the same timestamp,
			// then temporarily raise the batch size to 1 more than the number
			// of files with this timestamp to avoid an infinite loop.
			$batchSize = $this->getTemporaryBatchSize(
				clone $fileSelectQueryBuilder,
				$timestampField,
				$previousBatchFinalTimestamp
			);
		}
		// If the timestamp is not empty, filter for entries with a timestamp that is
		// the same as or greater than the cutoff timestamp. Files with exactly the
		// cutoff timestamp are included because the previous batch may not have
		// reached all of them.
		if ( $previousBatchFinalTimestamp ) {
			$fileSelectQueryBuilder->where( $this->dbr->expr(
				$timestampField, '>=', $this->dbr->timestamp( $previousBatchFinalTimestamp )
			) );
		}
		// Order by the timestamp (oldest to newest) and set the limit as the batch size.
		$fileSelectQueryBuilder
			->orderBy( $timestampField, SelectQueryBuilder::SORT_ASC )
			->limit( $batchSize );
		return $fileSelectQueryBuilder;
	}

	/**
	 * Gets the timestamp of the file described by the last row of a result set.
	 *
	 * The position of the result set is rewound afterwards so that the caller
	 * can still iterate over it from the start.
	 *
	 * @param mixed $rows The result set as returned by FileSelectQueryBuilder::fetchResultSet.
	 * @param string $table The table that the rows were selected from.
	 * @return mixed|null The value of ::getTimestamp for the file in the last row, or null
	 *   if the result set has no rows.
	 */
	private function getTimestampOfLastRow( $rows, string $table ) {
		$rowCount = $rows->numRows();
		if ( !$rowCount ) {
			return null;
		}
		$rows->seek( $rowCount - 1 );
		$lastFileObject = $this->mediaModerationFileFactory->getFileObjectForRow( $rows->fetchObject(), $table );
		$rows->rewind();
		return $lastFileObject->getTimestamp();
	}

	/**
	 * Gets the rows for a batch along with the timestamp for the last file in the batch.
	 *
	 * @param string $table
	 * @param string $previousBatchFinalTimestamp
	 * @return array The rows for the batch, the timestamp for the last file in the results list (or
	 *   $previousBatchFinalTimestamp if no rows were found), and the LIMIT used for the batch.
	 */
	protected function getRowsForBatch( string $table, string $previousBatchFinalTimestamp ): array {
		// Get the FileSelectQueryBuilder with everything but the caller specified,
		// then specify the caller and get the rows from the DB.
		$fileSelectQueryBuilder = $this->getFileSelectQueryBuilder(
			$table, $previousBatchFinalTimestamp, false
		);
		$rows = $fileSelectQueryBuilder
			->caller( __METHOD__ )
			->fetchResultSet();
		$lastFileTimestamp = $this->getTimestampOfLastRow( $rows, $table );

		// If the last file in this batch has the same timestamp as the one the batch
		// started from, then the batch made no progress. Fetch the batch again with a
		// temporarily raised batch size to prevent an infinite loop which would be
		// caused by processing the same files over and over again with that timestamp.
		if ( $lastFileTimestamp !== null && $lastFileTimestamp === $previousBatchFinalTimestamp ) {
			$fileSelectQueryBuilder = $this->getFileSelectQueryBuilder(
				$table, $previousBatchFinalTimestamp, true
			);
			$rows = $fileSelectQueryBuilder
				->caller( __METHOD__ )
				->fetchResultSet();
			// The second query may find no rows if they were removed in the meantime,
			// in which case null is returned here and the previous timestamp is kept.
			$lastFileTimestamp = $this->getTimestampOfLastRow( $rows, $table );
		}

		return [
			$rows,
			$lastFileTimestamp ?? $previousBatchFinalTimestamp,
			$fileSelectQueryBuilder->getQueryInfo()['options']['LIMIT']
		];
	}

	/**
	 * Perform a batch of imports to the mediamoderation_scan table from the $table
	 * starting at the $previousBatchFinalTimestamp going towards newer files.
	 *
	 * @param string $table
	 * @param string $previousBatchFinalTimestamp
	 * @return array First value is whether another batch should be run and the second value is
	 *   the new value of $previousBatchFinalTimestamp.
	 */
	protected function performBatch( string $table, string $previousBatchFinalTimestamp ): array {
		[ $rows, $lastFileTimestamp, $batchSizeUsedForBatch ] = $this->getRowsForBatch(
			$table, $previousBatchFinalTimestamp
		);
		foreach ( $rows as $row ) {
			// Get the File or ArchivedFile object for this $row.
			$fileObject = $this->mediaModerationFileFactory->getFileObjectForRow( $row, $table );
			// Exclude any file that has a SHA-1 value that is false or empty.
			// This can happen in some filearchive rows where the image no
			// longer exists.
			if ( !$fileObject->getSha1() ) {
				continue;
			}
			// Also check if the SHA-1 exists in the scan table using a replica DB
			// before attempting to insert the file to reduce the number of
			// unnecessary reads on the primary DB.
			if ( $this->mediaModerationDatabaseLookup->fileExistsInScanTable( $fileObject ) ) {
				continue;
			}
			$this->mediaModerationFileProcessor->insertFile( $fileObject );
		}
		// Return false as the first item of the array if the number of rows processed
		// was less than the batch size. This will happen when there are no more images
		// to process.
		return [
			$rows->count() >= $batchSizeUsedForBatch,
			$lastFileTimestamp
		];
	}
}
393 | |
394 | $maintClass = ImportExistingFilesToScanTable::class; |
395 | require_once RUN_MAINTENANCE_IF_MAIN; |