Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.76% |
179 / 185 |
|
100.00% |
8 / 8 |
CRAP | |
100.00% |
1 / 1 |
ScanFilesInScanTable | |
100.00% |
179 / 179 |
|
100.00% |
8 / 8 |
32 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
60 / 60 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
4 | |||
maybeOutputVerboseScanResult | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
8 | |||
initServices | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
parseLastCheckedTimestamp | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
7 | |||
generateSha1ValuesForScan | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
2 | |||
waitForJobQueueSize | |
100.00% |
32 / 32 |
|
100.00% |
1 / 1 |
7 | |||
pollSha1ValuesForScanCompletion | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\MediaModeration\Maintenance; |
4 | |
5 | use IDBAccessObject; |
6 | use JobQueueError; |
7 | use JobQueueGroup; |
8 | use JobSpecification; |
9 | use Maintenance; |
10 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationDatabaseLookup; |
11 | use MediaWiki\Extension\MediaModeration\Services\MediaModerationFileScanner; |
12 | use MediaWiki\Status\StatusFormatter; |
13 | use RequestContext; |
14 | use StatusValue; |
15 | use Wikimedia\Rdbms\LBFactory; |
16 | use Wikimedia\Rdbms\SelectQueryBuilder; |
17 | use Wikimedia\Timestamp\ConvertibleTimestamp; |
18 | |
// Resolve the MediaWiki installation path: prefer the MW_INSTALL_PATH environment
// variable and otherwise fall back to the directory three levels above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;
require_once $IP . '/maintenance/Maintenance.php';
24 | |
/**
 * Scans files referenced by their SHA-1 value in the mediamoderation_scan table.
 *
 * SHA-1 values are fetched in batches. Each value is either scanned synchronously
 * in this process or, when --use-jobqueue is given, pushed to the job queue as a
 * 'mediaModerationScanFileJob' job, after which the script polls the
 * mediamoderation_scan table to find out when the queued scans have completed.
 */
class ScanFilesInScanTable extends Maintenance {

	/** Number of polls performed by default before giving up on the jobs being tracked. */
	private const DEFAULT_MAX_POLLS = 60;

	private LBFactory $loadBalancerFactory;
	private MediaModerationDatabaseLookup $mediaModerationDatabaseLookup;
	private MediaModerationFileScanner $mediaModerationFileScanner;
	private JobQueueGroup $jobQueueGroup;
	private StatusFormatter $statusFormatter;

	/**
	 * @var string|null Timestamp (date followed by 000000) that rows must have last been checked
	 *   before to be selected, or null when only never-checked rows are wanted.
	 */
	private ?string $lastChecked;
	/** @var array If --use-jobqueue is specified, holds the SHA-1 values currently being processed by the job queue. */
	private array $sha1ValuesBeingProcessed = [];

	public function __construct() {
		parent::__construct();

		$this->requireExtension( 'MediaModeration' );
		$this->addDescription( 'Maintenance script to scan files listed in the mediamoderation_scan table.' );

		// This option carries a value, so it has to be registered as taking an argument. Otherwise
		// "--last-checked <value>" is parsed as a bare flag followed by a stray positional argument.
		$this->addOption(
			'last-checked',
			'Only scan files (referenced by their SHA-1 value internally) where the last attempted scan ' .
			'was before this date (including never checked files). The default is to filter for files last attempted ' .
			"to be scanned before today. To only scan files that have never been scanned before specify 'never'. The " .
			'accepted format is YYYYMMDD or a timestamp supported by ConvertibleTimestamp. Files that have been ' .
			'successfully scanned (i.e. the match status is not null) are not re-scanned by this script.',
			false,
			true
		);
		$this->addOption(
			'use-jobqueue',
			'Scan files concurrently using the job queue. Each job scans one SHA-1 and jobs are added in ' .
			'batches of --batch-size. The script waits to add more jobs until the number of jobs left processing ' .
			'is equal to or less than --poll-until. Using the job queue increases the speed of scanning, but ' .
			'disables output to console about the status of scans as these are handled by jobs which produce ' .
			'no console output.'
		);
		$this->addOption(
			'sleep',
			'Sleep time (in seconds) between every batch of SHA-1 values scanned. Default: 1',
			false,
			true
		);
		$this->addOption(
			'poll-sleep',
			'Sleep time (in seconds) between every poll to check for completed scanning jobs. This is done ' .
			'so that the script does not add more jobs to scan SHA-1 values until the number of SHA-1 values ' .
			'currently being processed is equal to or less than --poll-until. Does nothing if the --use-jobqueue ' .
			'option is not specified. Default: 1',
			false,
			true
		);
		$this->addOption(
			'poll-until',
			'If --use-jobqueue is specified, used to wait until there are this many or fewer SHA-1s ' .
			'currently being processed by the job queue. This is checked via polling and the speed of polling is ' .
			'controlled by --poll-sleep. The default for this option is half of the value of --batch-size (which ' .
			'is 200 by default).',
			false,
			true
		);
		$this->addOption(
			'max-polls',
			'If --use-jobqueue is specified, then this controls the number of times that the status of ' .
			'scans in the job queue are polled. If the number of times polled exceeds this value the array that ' .
			'tracks the SHA-1 values currently being processed is emptied to avoid failed jobs causing the script ' .
			'to infinitely loop. Default: ' . self::DEFAULT_MAX_POLLS,
			false,
			true
		);
		$this->addOption(
			'verbose',
			'Enables verbose mode which prints out information once a SHA-1 has finished being scanned. ' .
			'If --use-jobqueue is specified, this instead prints out information about the jobs being queued.',
			false,
			false,
			'v'
		);

		$this->setBatchSize( 200 );
	}

	/**
	 * Entry point: scans every SHA-1 yielded by generateSha1ValuesForScan(), either
	 * directly or by queueing a scan job when --use-jobqueue is set.
	 */
	public function execute() {
		$this->initServices();
		$this->parseLastCheckedTimestamp();

		$useJobQueue = $this->hasOption( 'use-jobqueue' );
		foreach ( $this->generateSha1ValuesForScan() as $sha1 ) {
			if ( !$useJobQueue ) {
				$scanStatus = $this->mediaModerationFileScanner->scanSha1( $sha1 );
				$this->maybeOutputVerboseScanResult( $sha1, $scanStatus );
				continue;
			}
			// Push scan jobs to the job queue if --use-jobqueue is set.
			// To monitor the status of scans when using the job queue it
			// is intended that the user monitors statsd / the logging channel.
			try {
				$this->jobQueueGroup->push( new JobSpecification(
					'mediaModerationScanFileJob',
					[ 'sha1' => $sha1 ]
				) );
			} catch ( JobQueueError $e ) {
				// If the job failed to be inserted, then catch the exception and sleep as this can occur if the
				// server is experiencing instability. The failure is deliberately non-fatal, but in verbose
				// mode it is reported so that the operator can see that this SHA-1 was not queued.
				if ( $this->hasOption( 'verbose' ) ) {
					$this->error( "Failed to queue a scan job for SHA-1 $sha1: {$e->getMessage()}\n" );
				}
				sleep( intval( $this->getOption( 'sleep', 1 ) ) );
			}
		}
	}

	/**
	 * Outputs verbose information about the status of a scan for a provided SHA-1 if
	 * verbose mode is enabled via the --verbose command line argument.
	 *
	 * @param string $sha1 The SHA-1 that was just checked
	 * @param StatusValue $checkResult The StatusValue as returned by MediaModerationFileScanner::scanSha1
	 * @return void
	 */
	protected function maybeOutputVerboseScanResult( string $sha1, StatusValue $checkResult ) {
		if ( !$this->hasOption( 'verbose' ) ) {
			return;
		}
		// Output any warnings or errors.
		$errorCount = count( $checkResult->getErrors() );
		if ( !$checkResult->isGood() && $errorCount ) {
			$this->error( "SHA-1 $sha1\n" );
			$formattedErrors = $this->statusFormatter->getWikiText( $checkResult );
			if ( $errorCount === 1 ) {
				// A single message is prefixed and newline-terminated here; with several
				// messages the formatter's output is printed unchanged.
				$this->error( '* ' . $formattedErrors . "\n" );
			} else {
				$this->error( $formattedErrors );
			}
		}
		$scanValue = $checkResult->getValue();
		if ( $scanValue === null ) {
			// If the scan failed, make this an error output.
			$this->error( "SHA-1 $sha1: Scan failed.\n" );
			return;
		}
		$this->output( "SHA-1 $sha1: " . ( $scanValue ? "Positive match.\n" : "No match.\n" ) );
	}

	/**
	 * Fetches the services used by this script from the service container and
	 * stores them on the instance.
	 *
	 * @return void
	 */
	protected function initServices() {
		$services = $this->getServiceContainer();
		$this->loadBalancerFactory = $services->getDBLoadBalancerFactory();
		$this->mediaModerationDatabaseLookup = $services->get( 'MediaModerationDatabaseLookup' );
		$this->mediaModerationFileScanner = $services->get( 'MediaModerationFileScanner' );
		$this->jobQueueGroup = $services->getJobQueueGroup();
		$this->statusFormatter = $services->getFormatterFactory()->getStatusFormatter( RequestContext::getMain() );
	}

	/**
	 * Parse the 'last-checked' timestamp provided via the command line,
	 * and cause a fatal error if it cannot be parsed.
	 *
	 * Accepts the string "never" (stored as null), a date in YYYYMMDD form, or any
	 * timestamp understood by ConvertibleTimestamp::convert. For the latter two the
	 * date part is kept and the time part is set to 000000. The current date is rejected.
	 *
	 * @return void
	 */
	protected function parseLastCheckedTimestamp() {
		$lastChecked = $this->getOption(
			'last-checked',
			// Subtract one day from the current date for the default of 'last-checked'
			ConvertibleTimestamp::time() - 60 * 60 * 24
		);
		// If the 'last-checked' option is the string "never", then convert this to null.
		if ( $lastChecked === "never" ) {
			$this->lastChecked = null;
			return;
		}
		if ( strlen( (string)$lastChecked ) === 8 && $lastChecked === strval( intval( $lastChecked ) ) ) {
			// The 'last-checked' argument is likely to be in the form YYYYMMDD because:
			// * The length of the argument is 8 (which is the length of a YYYYMMDD format)
			// * The argument survives a round trip through intval() and strval() unchanged
			//   (thus it must be an integer in string form).
			$date = $lastChecked;
		} elseif ( ConvertibleTimestamp::convert( TS_MW, $lastChecked ) ) {
			// If the 'last-checked' argument is recognised as a timestamp by ConvertibleTimestamp::convert,
			// then get the date part and discard the time part.
			$date = $this->mediaModerationDatabaseLookup->getDateFromTimestamp( $lastChecked );
		} else {
			// The 'last-checked' argument could not be parsed, so raise an error
			$this->fatalError(
				'The --last-checked argument passed to this script could not be parsed. This can take a ' .
				'timestamp in string form, or a date in YYYYMMDD format.'
			);
			// Only reachable if fatalError() returns (e.g. when it is stubbed out).
			return;
		}
		$today = $this->mediaModerationDatabaseLookup->getDateFromTimestamp( ConvertibleTimestamp::now() );
		if ( $date === $today ) {
			$this->fatalError( 'The --last-checked argument cannot be the current date.' );
		}
		// Convert the date to a TS_MW timestamp by adding 000000 to the end (the time component).
		$this->lastChecked = $date . '000000';
	}

	/**
	 * Generates the SHA-1 values to be scanned. This function pauses for the
	 * specified number of seconds after each batch of SHA-1 values, then (with
	 * --use-jobqueue) waits for queued jobs to drain and finally waits for replication.
	 *
	 * Batches are requested until an empty batch is returned.
	 *
	 * @return \Generator
	 */
	protected function generateSha1ValuesForScan(): \Generator {
		do {
			$batch = $this->mediaModerationDatabaseLookup->getSha1ValuesForScan(
				$this->getBatchSize() ?? 200,
				$this->lastChecked,
				SelectQueryBuilder::SORT_ASC,
				// SHA-1 values already handed to the job queue are passed along with each request.
				$this->sha1ValuesBeingProcessed,
				MediaModerationDatabaseLookup::NULL_MATCH_STATUS
			);
			// Store the number of rows returned to determine if another batch should be performed.
			$lastBatchRowCount = count( $batch );
			yield from $batch;
			// Sleep for the number of seconds specified in the 'sleep' option.
			sleep( intval( $this->getOption( 'sleep', 1 ) ) );
			if ( $this->hasOption( 'use-jobqueue' ) ) {
				// Wait until the number of SHA-1 values being processed drops to the --poll-until threshold.
				$this->waitForJobQueueSize( $batch );
			}
			// Wait for replication so that updates to the mms_is_match and mms_last_checked
			// on the rows processed in this batch are replicated to replica DBs.
			$this->loadBalancerFactory->waitForReplication();
		} while ( $lastBatchRowCount !== 0 );
	}

	/**
	 * Waits for the number of SHA-1 values currently being processed using jobs to be equal to or
	 * less than --poll-until (by default half the batch size), polling at most --max-polls times.
	 *
	 * If the limit on polls is reached while still above the threshold, the internal list of SHA-1
	 * values being processed is cleared, because it is assumed to be out of sync with the job queue.
	 *
	 * When in verbose mode this method also prints out information about the SHA-1 values being processed.
	 *
	 * @param array $batch The new batch of SHA-1s being processed. If no batch was added,
	 *   specify an empty array.
	 * @return void
	 */
	protected function waitForJobQueueSize( array $batch ) {
		$pollUntil = intval( $this->getOption( 'poll-until', floor( ( $this->getBatchSize() ?? 200 ) / 2 ) ) );
		// Read once and normalise to an integer so that the loop condition and the message below agree.
		$maxPolls = intval( $this->getOption( 'max-polls', self::DEFAULT_MAX_POLLS ) );
		$verbose = $this->hasOption( 'verbose' );
		if ( $verbose ) {
			// If in verbose mode, print out the batch that was just added to the console.
			$batchSize = count( $batch );
			$this->output(
				"Added $batchSize SHA-1 value(s) for scanning via the job queue: " .
				implode( ', ', $batch ) . "\n"
			);
		}
		// Add the new SHA-1 values being processed by the job queue to the array keeping track
		// of the job queue count. Needed because JobQueueEventBus does not return the current
		// job queue count.
		$this->sha1ValuesBeingProcessed = array_merge( $this->sha1ValuesBeingProcessed, $batch );
		if ( !count( $this->sha1ValuesBeingProcessed ) ) {
			// Return early if sha1ValuesBeingProcessed is empty, as we have nothing to wait for.
			return;
		}
		// Poll until few enough SHA-1 values remain outstanding or we have polled --max-polls times.
		$numberOfTimesPolled = 0;
		do {
			if ( $verbose ) {
				// If in verbose mode, print out how many jobs are currently processing and how many we are
				// waiting to complete before adding more.
				$sha1sBeingProcessedCount = count( $this->sha1ValuesBeingProcessed );
				$this->output(
					"$sha1sBeingProcessedCount SHA-1 value(s) currently being processed via jobs. " .
					"Waiting until there are $pollUntil or less SHA-1 value(s) being processed before " .
					"adding more jobs.\n"
				);
			}
			// Drop every tracked SHA-1 that the database reports as having been checked.
			$this->sha1ValuesBeingProcessed = array_diff(
				$this->sha1ValuesBeingProcessed, $this->pollSha1ValuesForScanCompletion()
			);
			sleep( intval( $this->getOption( 'poll-sleep', 1 ) ) );
			$numberOfTimesPolled++;
			$aboveThreshold = count( $this->sha1ValuesBeingProcessed ) > $pollUntil;
		} while ( $aboveThreshold && $numberOfTimesPolled < $maxPolls );
		// If we ran out of polls while still above the threshold, then reset the internal array of SHA-1s
		// being processed as it is probably out of sync with the actual number of jobs running. When the
		// threshold was reached (even on the final poll) the array is kept, so that SHA-1 values which are
		// still being processed remain tracked.
		if ( $aboveThreshold ) {
			if ( $verbose ) {
				$this->error(
					'The internal array of SHA-1 values being processed has been cleared as more than ' .
					"$maxPolls polls have occurred.\n"
				);
			}
			$this->sha1ValuesBeingProcessed = [];
		}
	}

	/**
	 * Queries the mediamoderation_scan table for the tracked SHA-1 values whose scan has
	 * completed since this script selected them.
	 *
	 * A row counts as completed when mms_last_checked is later than the --last-checked date, or,
	 * when only never-checked rows were selected (--last-checked never), when mms_last_checked
	 * is no longer NULL.
	 *
	 * @return array The mms_sha1 values of the completed rows
	 */
	protected function pollSha1ValuesForScanCompletion(): array {
		$dbr = $this->mediaModerationDatabaseLookup->getDb( IDBAccessObject::READ_NORMAL );
		// Wait for replication to occur to avoid polling a out-of-date replica DB.
		$this->loadBalancerFactory->waitForReplication();
		if ( $this->lastChecked === null ) {
			// There is no cut-off date to compare against, so any recorded check means the scan was attempted.
			$completedCondition = $dbr->expr( 'mms_last_checked', '!=', null );
		} else {
			$completedCondition = $dbr->expr(
				'mms_last_checked',
				'>',
				$this->mediaModerationDatabaseLookup->getDateFromTimestamp( $this->lastChecked )
			);
		}
		$queryBuilder = $dbr->newSelectQueryBuilder()
			->select( 'mms_sha1' )
			->from( 'mediamoderation_scan' )
			->where( [ $completedCondition ] );
		if ( count( $this->sha1ValuesBeingProcessed ) ) {
			// Restrict the lookup to the SHA-1 values this script is tracking.
			$queryBuilder->andWhere( [
				'mms_sha1' => $this->sha1ValuesBeingProcessed
			] );
		}
		return $queryBuilder->fetchFieldValues();
	}
}
342 | |
// Name the maintenance class defined in this file, then include the runner
// referenced by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = \MediaWiki\Extension\MediaModeration\Maintenance\ScanFilesInScanTable::class;
require_once RUN_MAINTENANCE_IF_MAIN;