Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
89.61% |
69 / 77 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
| MediaModerationFileLookup | |
89.61% |
69 / 77 |
|
75.00% |
6 / 8 |
26.76 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getFileSelectQueryBuilder | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
| getTimestampFieldForTable | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
| getSha1FieldForTable | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
| getRowCountForTimestamp | |
76.92% |
10 / 13 |
|
0.00% |
0 / 1 |
3.11 | |||
| performBatchQuery | |
66.67% |
10 / 15 |
|
0.00% |
0 / 1 |
2.15 | |||
| getBatchOfFileRows | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
5 | |||
| getFileObjectsForSha1 | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
3 | |||
| 1 | <?php |
| 2 | declare( strict_types=1 ); |
| 3 | |
| 4 | namespace MediaWiki\Extension\MediaModeration\Services; |
| 5 | |
| 6 | use Generator; |
| 7 | use InvalidArgumentException; |
| 8 | use MediaWiki\FileRepo\File\ArchivedFile; |
| 9 | use MediaWiki\FileRepo\File\FileSelectQueryBuilder; |
| 10 | use MediaWiki\FileRepo\File\LocalFile; |
| 11 | use MediaWiki\FileRepo\LocalRepo; |
| 12 | use Wikimedia\Rdbms\FakeResultWrapper; |
| 13 | use Wikimedia\Rdbms\IResultWrapper; |
| 14 | use Wikimedia\Rdbms\SelectQueryBuilder; |
| 15 | |
/**
 * Looks up rows and file objects from the local wiki's 'image', 'oldimage'
 * and 'filearchive' tables, primarily by SHA-1 and upload timestamp.
 */
class MediaModerationFileLookup {

	/** @var string[] The image tables searched by ::getFileObjectsForSha1, in lookup order. */
	public const TABLES_USED_FOR_LOOKUP = [
		'image',
		'oldimage',
		'filearchive',
	];

	public function __construct(
		private readonly LocalRepo $localRepo,
		private readonly MediaModerationFileFactory $mediaModerationFileFactory,
	) {
	}

	/**
	 * Gets the appropriate FileSelectQueryBuilder for the given image $table
	 *
	 * @param string $table One of 'image', 'oldimage', or 'filearchive'
	 * @return FileSelectQueryBuilder
	 * @throws InvalidArgumentException If an unrecognised $table is provided
	 */
	public function getFileSelectQueryBuilder( string $table ): FileSelectQueryBuilder {
		// Match arms are evaluated lazily, so the replica DB is only requested for a recognised table.
		return match ( $table ) {
			'image' => FileSelectQueryBuilder::newForFile( $this->localRepo->getReplicaDB() ),
			'oldimage' => FileSelectQueryBuilder::newForOldFile( $this->localRepo->getReplicaDB() ),
			'filearchive' => FileSelectQueryBuilder::newForArchivedFile( $this->localRepo->getReplicaDB() ),
			default => throw new InvalidArgumentException( "Unrecognised image table '$table'." ),
		};
	}

	/**
	 * Gets the timestamp field for the provided $table.
	 *
	 * @param string $table One of 'image', 'oldimage', or 'filearchive'
	 * @return string The timestamp field name
	 * @throws InvalidArgumentException If $table is not one of the three valid options.
	 */
	public function getTimestampFieldForTable( string $table ): string {
		return match ( $table ) {
			'image' => 'img_timestamp',
			'oldimage' => 'oi_timestamp',
			'filearchive' => 'fa_timestamp',
			default => throw new InvalidArgumentException( "Unrecognised image table '$table'." ),
		};
	}

	/**
	 * Gets the SHA-1 field for the provided $table.
	 *
	 * @param string $table One of 'image', 'oldimage', or 'filearchive'
	 * @return string The SHA-1 field name
	 * @throws InvalidArgumentException If $table is not one of the three valid options.
	 */
	private function getSha1FieldForTable( string $table ): string {
		return match ( $table ) {
			'image' => 'img_sha1',
			'oldimage' => 'oi_sha1',
			'filearchive' => 'fa_sha1',
			default => throw new InvalidArgumentException( "Unrecognised image table '$table'." ),
		};
	}

	/**
	 * Returns the row count for rows that have a given
	 * timestamp and optionally a given SHA-1 value.
	 *
	 * Used to prevent issues with paging by timestamp
	 * when the row count being used to page is less
	 * than the row count of rows with a given timestamp.
	 *
	 * @param string $table The table to get the row count from (one of image, oldimage, or filearchive).
	 * @param string $timestamp The given timestamp in a TS_MW format. The count will only include rows
	 *   with this exact timestamp. If this is an empty (falsy) string, no timestamp condition is added
	 *   and rows with any timestamp are counted.
	 * @param string|null $sha1 If provided, filter the count to only include rows with the given SHA-1 value.
	 *   To not filter by SHA-1, provide null.
	 * @return int
	 * @throws InvalidArgumentException If an unrecognised $table is provided
	 */
	public function getRowCountForTimestamp( string $table, string $timestamp, ?string $sha1 ): int {
		$countQueryBuilder = $this->getFileSelectQueryBuilder( $table )
			->clearFields()
			->field( 'COUNT(*)' );
		if ( $timestamp ) {
			// Convert the timestamp into the format used by the database before comparing.
			$countQueryBuilder->where( [
				$this->getTimestampFieldForTable( $table ) => $this->localRepo->getReplicaDB()->timestamp( $timestamp ),
			] );
		}
		if ( $sha1 !== null ) {
			$countQueryBuilder->where( [
				$this->getSha1FieldForTable( $table ) => $sha1,
			] );
		}
		return (int)$countQueryBuilder
			->caller( __METHOD__ )
			->fetchField();
	}

	/**
	 * Actually performs the SELECT query to get a batch of rows from the given $table.
	 * Used by ::getBatchOfFileRows.
	 *
	 * @param string $table The table to get the batch from (one of image, oldimage, or filearchive).
	 * @param string $startTimestamp The timestamp which to start this batch at (cannot have been used for
	 *   a previous batch to prevent infinite loops). Provide the empty string to start with the newest timestamp.
	 * @param string $sha1 The SHA-1 which rows must have to be selected
	 * @param int $batchSize The maximum number of rows to select
	 *
	 * @return IResultWrapper
	 * @throws InvalidArgumentException If an unrecognised $table is provided
	 */
	protected function performBatchQuery(
		string $table, string $startTimestamp, string $sha1, int $batchSize
	): IResultWrapper {
		// Only select rows with the given $sha1
		$queryBuilder = $this->getFileSelectQueryBuilder( $table )
			->where( [
				$this->getSha1FieldForTable( $table ) => $sha1,
			] );
		$timestampField = $this->getTimestampFieldForTable( $table );
		if ( $startTimestamp ) {
			// Only select rows that have a timestamp at or under the $startTimestamp. The value is
			// passed through ::timestamp() so that it is compared in the same form that
			// ::getRowCountForTimestamp uses for its equality condition.
			$dbr = $this->localRepo->getReplicaDB();
			$queryBuilder->where( $dbr->expr(
				$timestampField,
				'<=',
				$dbr->timestamp( $startTimestamp )
			) );
		}
		// Select $batchSize rows, newest first.
		return $queryBuilder
			->orderBy( $timestampField, SelectQueryBuilder::SORT_DESC )
			->limit( $batchSize )
			->caller( __METHOD__ )
			->fetchResultSet();
	}

	/**
	 * Returns a batch of rows from $table. The batches will go from largest timestamp to smallest timestamp,
	 * and are constructed such to not return the same row in more than one batch.
	 *
	 * @param string $table The table to get the batch from (one of image, oldimage, or filearchive).
	 * @param string $startTimestamp The timestamp which to start this batch at (cannot have been used for
	 *   a previous batch to prevent infinite loops). Provide the empty string to start with the newest timestamp.
	 * @param string $sha1 The SHA-1 which rows must have to be selected
	 * @param int $batchSize The maximum number of rows return in the batch. This is raised internally when
	 *   more than $batchSize rows share the $startTimestamp.
	 *
	 * @return array First item being the IResultWrapper and the second being the value for $startTimestamp of the
	 *   next batch, or false if there are no more batches.
	 * @phan-return array{0:IResultWrapper,1:string|false}
	 * @throws InvalidArgumentException If an unrecognised $table is provided
	 */
	protected function getBatchOfFileRows(
		string $table, string $startTimestamp, string $sha1, int $batchSize
	): array {
		$timestampField = $this->getTimestampFieldForTable( $table );
		// Check that rows with the $sha1 and $startTimestamp do not exceed $batchSize. Otherwise, raise the
		// $batchSize, as if not the next batch would start with the same rows causing an infinite loop.
		//
		// This is only done when there is a $startTimestamp. Without one, ::getRowCountForTimestamp would
		// count every row with the $sha1, which would raise the batch size to the total row count and cause
		// all matching rows to be loaded in the first query. Skipping the check for the first batch is safe:
		// if more than $batchSize rows share the newest timestamp, they are all deferred to the next batch
		// (which has that timestamp as its $startTimestamp and so is covered by this check).
		if ( $startTimestamp ) {
			$rowsWithStartTimestamp = $this->getRowCountForTimestamp( $table, $startTimestamp, $sha1 );
			if ( $rowsWithStartTimestamp > $batchSize ) {
				$batchSize = $rowsWithStartTimestamp;
			}
		}
		// Get the batch which contains $batchSize + 1 rows. The added row is to ensure proper paging.
		$resultWrapper = $this->performBatchQuery( $table, $startTimestamp, $sha1, $batchSize + 1 );
		$rowCount = $resultWrapper->count();
		if ( $rowCount < $batchSize + 1 ) {
			// If the row count returned is less than the number of rows asked for, then just return the
			// result set along with the indication that no more batches can be found.
			return [ $resultWrapper, false ];
		}
		// Get the smallest timestamp in this batch (the rows are ordered newest first, so it is on the last row).
		$resultWrapper->seek( $rowCount - 1 );
		$timestampToRemoveFromBatch = $resultWrapper->fetchRow()[$timestampField];
		// Rewind the results wrapper to the first row.
		$resultWrapper->rewind();
		// To ensure that a given file is only present within one batch, the batches must contain upload timestamp
		// values which are not present in any other batch. This is because the upload timestamp is the way to separate
		// the results when batching, and starting a new batch with a timestamp used in a previous batch would mean
		// some of the files are listed twice.
		// Removing the rows with the smallest timestamp addresses this by ensuring all the files with that
		// timestamp are in the next batch.
		$resultsToReturn = [];
		foreach ( $resultWrapper as $row ) {
			$rowAsArray = (array)$row;
			if ( $rowAsArray[$timestampField] !== $timestampToRemoveFromBatch ) {
				$resultsToReturn[] = $row;
			}
		}
		// Return the modified results.
		return [ new FakeResultWrapper( $resultsToReturn ), $timestampToRemoveFromBatch ];
	}

	/**
	 * Gets LocalFile, OldFile, and ArchivedFile objects with a given SHA-1 from the
	 * local wiki.
	 *
	 * This method generates these entries instead of returning all objects as
	 * only one object is usually needed for the purposes of scanning. In the
	 * case of the image not having a thumbnail or otherwise having a problem,
	 * further images may be needed.
	 *
	 * The order in which these objects are generated is the order of the tables
	 * in self::TABLES_USED_FOR_LOOKUP and then their upload timestamp starting
	 * with the newest file.
	 *
	 * @param string $sha1 The SHA-1 used for lookup
	 * @param int $batchSize The number of files to select per select query. Increase this
	 *   number if you intend to use all the files returned by this query.
	 * @return Generator<LocalFile|ArchivedFile>
	 */
	public function getFileObjectsForSha1( string $sha1, int $batchSize = 5 ): Generator {
		// Process each image table in the order defined in ::TABLES_USED_FOR_LOOKUP
		foreach ( self::TABLES_USED_FOR_LOOKUP as $table ) {
			// Lookup rows from the image $table where the SHA-1 for that row is
			// the same as in $sha1. Order these by upload timestamp and limit
			// each batch selected from the DB to $batchSize rows.
			// The empty string makes the first batch start at the newest timestamp.
			$startTimestampForBatch = '';
			do {
				[ $batch, $startTimestampForBatch ] = $this->getBatchOfFileRows(
					$table,
					$startTimestampForBatch,
					$sha1,
					$batchSize
				);
				foreach ( $batch as $row ) {
					// Yield the row as a LocalFile or ArchivedFile object.
					yield $this->mediaModerationFileFactory->getFileObjectForRow( $row, $table );
				}
				// ::getBatchOfFileRows gives false as the next timestamp once the table is exhausted.
			} while ( $startTimestampForBatch );
		}
	}
}