Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
79.03% |
260 / 329 |
|
50.00% |
11 / 22 |
CRAP | |
0.00% |
0 / 1 |
UtteranceStore | |
79.03% |
260 / 329 |
|
50.00% |
11 / 22 |
93.21 | |
0.00% |
0 / 1 |
__construct | |
61.54% |
8 / 13 |
|
0.00% |
0 / 1 |
2.23 | |||
getFileBackend | |
8.00% |
2 / 25 |
|
0.00% |
0 / 1 |
16.46 | |||
findUtterance | |
80.00% |
24 / 30 |
|
0.00% |
0 / 1 |
5.20 | |||
retrieveUtteranceMetadata | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
3 | |||
retrieveFileContents | |
62.50% |
5 / 8 |
|
0.00% |
0 / 1 |
2.21 | |||
createUtterance | |
100.00% |
35 / 35 |
|
100.00% |
1 / 1 |
1 | |||
storeFile | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
5.54 | |||
flushUtterancesByExpirationDate | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
flushUtterancesByPage | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
flushUtterancesByLanguageAndVoice | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
flushUtterances | |
85.00% |
34 / 40 |
|
0.00% |
0 / 1 |
9.27 | |||
deleteFileBackendFile | |
42.86% |
9 / 21 |
|
0.00% |
0 / 1 |
4.68 | |||
urlPathFactory | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
audioUrlPrefixFactory | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
audioUrlFactory | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
synthesisMetadataUrlFactory | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
flushUtterancesByExpirationDateOnFile | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
2.01 | |||
recurseFlushUtterancesByExpirationDateOnFile | |
91.11% |
41 / 45 |
|
0.00% |
0 / 1 |
7.03 | |||
getWikispeechUtteranceExpirationTimestamp | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
getWikispeechUtteranceTimeToLiveDays | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isWikispeechUtteranceUseSwiftFileBackendExpiring | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
evaluateRemoteWikiHash | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Wikispeech\Utterance; |
4 | |
5 | /** |
6 | * @file |
7 | * @ingroup Extensions |
8 | * @license GPL-2.0-or-later |
9 | */ |
10 | |
11 | use Config; |
12 | use ExternalStoreException; |
13 | use FileBackend; |
14 | use FSFileBackend; |
15 | use MediaWiki\Logger\LoggerFactory; |
16 | use MediaWiki\MediaWikiServices; |
17 | use MWTimestamp; |
18 | use Psr\Log\LoggerInterface; |
19 | use SwiftFileBackend; |
20 | use WikiMap; |
21 | use Wikimedia\Rdbms\IDatabase; |
22 | use Wikimedia\Rdbms\ILoadBalancer; |
23 | use Wikimedia\Rdbms\IResultWrapper; |
24 | |
25 | /** |
26 | * Keeps track of utterances in persistent layers. |
27 | * |
28 | * Utterance metadata (i.e. segment hash, page id, language, etc) is stored in a database table. |
29 | * Utterance audio (synthesised voice audio) is stored as an opus file in file backend.
30 | * Synthesis metadata (tokens, etc) is stored as a JSON file in file backend. |
31 | * |
32 | * (.opus and .json suffixes are added in the file backend store, although this class is agnostic
33 | * regarding the actual data encoding and formats.)
34 | * |
35 | * @since 0.1.5 |
36 | */ |
37 | class UtteranceStore { |
38 | |
39 | /** @var string Name of database table that keeps track of utterance metadata. */ |
40 | public const UTTERANCE_TABLE = 'wikispeech_utterance'; |
41 | |
42 | /** @var LoggerInterface */ |
43 | private $logger; |
44 | |
45 | /** |
46 | * Don't use this directly, access @see getFileBackend |
47 | * @var FileBackend Used to store utterance audio and synthesis metadata. |
48 | */ |
49 | private $fileBackend; |
50 | |
51 | /** |
52 | * @var ILoadBalancer |
53 | */ |
54 | private $dbLoadBalancer; |
55 | |
56 | /** @var string Name of container (sort of path prefix) used for files in backend. */ |
57 | private $fileBackendContainerName; |
58 | |
59 | /** @var Config */ |
60 | private $config; |
61 | |
/**
 * Sets up the logger, the extension configuration, the file backend
 * container name and the database load balancer.
 *
 * The file backend itself is not created here; it is resolved lazily
 * by getFileBackend().
 */
public function __construct() {
	$services = MediaWikiServices::getInstance();

	$this->logger = LoggerFactory::getInstance( 'Wikispeech' );

	// @todo don't create, add as constructor parameter
	// Refer to https://phabricator.wikimedia.org/T264165
	$this->config = $services->getConfigFactory()->makeConfig( 'wikispeech' );

	$configuredContainerName = $this->config
		->get( 'WikispeechUtteranceFileBackendContainerName' );
	if ( $configuredContainerName ) {
		$this->fileBackendContainerName = $configuredContainerName;
	} else {
		// Nothing configured: use the built-in default and make a note of it.
		$this->fileBackendContainerName = "wikispeech-utterances";
		$this->logger->info(
			__METHOD__ . ': Falling back on container name {containerName}',
			[ 'containerName' => $this->fileBackendContainerName ]
		);
	}

	$this->dbLoadBalancer = $services->getDBLoadBalancer();
}
83 | |
/**
 * Lazily resolves the file backend used for utterance audio and
 * synthesis metadata, and caches it for subsequent calls.
 *
 * If WikispeechUtteranceFileBackendName is set, the backend with that name
 * is fetched from the file backend group. Otherwise an FSFileBackend rooted
 * in "$wgUploadDirectory/wikispeech_utterances" is created.
 *
 * @since 0.1.5
 * @return FileBackend
 * @throws ExternalStoreException If defined file backend group does not exist.
 */
private function getFileBackend() {
	global $wgUploadDirectory;

	if ( $this->fileBackend ) {
		// Already resolved on an earlier call.
		return $this->fileBackend;
	}

	/** @var string Name of file backend group in LocalSettings.php to use. */
	$fileBackendName = $this->config->get( 'WikispeechUtteranceFileBackendName' );

	if ( $fileBackendName ) {
		$configuredBackend = MediaWikiServices::getInstance()
			->getFileBackendGroup()
			->get( $fileBackendName );
		if ( !$configuredBackend ) {
			throw new ExternalStoreException(
				"No file backend group in LocalSettings.php named $fileBackendName."
			);
		}
		$this->fileBackend = $configuredBackend;
		return $this->fileBackend;
	}

	// No backend configured: fall back on local file system storage.
	$fileBackendName = 'wikispeech-backend';
	$fallbackDir = "$wgUploadDirectory/wikispeech_utterances";
	$this->logger->info(
		__METHOD__ . ': No file backend defined in LocalSettings.php. Falling back ' .
			'on FS storage backend named {name} in {dir}.',
		[
			'name' => $fileBackendName,
			'dir' => $fallbackDir
		]
	);
	$this->fileBackend = new FSFileBackend( [
		'name' => $fileBackendName,
		'wikiId' => WikiMap::getCurrentWikiId(),
		'basePath' => $fallbackDir
	] );
	return $this->fileBackend;
}
124 | |
/**
 * Looks up an utterance for a segment of a page in a given language and
 * voice, and loads its synthesis metadata and (unless omitted) its audio
 * from the file backend.
 *
 * If the database row exists but one of the requested files cannot be read
 * from the file backend, a warning is logged and null is returned.
 *
 * @since 0.1.5
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param int $pageId Mediawiki page ID.
 * @param string $language ISO-639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param bool $omitAudio If true, then no audio is returned.
 * @return Utterance|null Utterance found, or null if non-existing.
 */
public function findUtterance(
	?string $consumerUrl,
	int $pageId,
	string $language,
	string $voice,
	string $segmentHash,
	bool $omitAudio = false
): ?Utterance {
	$utterance = $this->retrieveUtteranceMetadata(
		$consumerUrl,
		$pageId,
		$language,
		$voice,
		$segmentHash
	);
	if ( !$utterance ) {
		return null;
	}

	$utteranceId = $utterance->getUtteranceId();

	// @note We might want to keep this as separate function calls,
	// allowing the user to request when needed, and perhaps
	// pass a stream straight down from file backend to user
	// rather than bouncing it via RAM.
	// Not sure if this is an existing thing in PHP though.

	// The storage paths are resolved outside of the try block on purpose:
	// a failure to resolve the file backend must propagate to the caller
	// rather than being logged as a missing file.
	$audioSrc = $omitAudio ? null : $this->audioUrlFactory( $utteranceId );
	$synthesisMetadataSrc = $this->synthesisMetadataUrlFactory( $utteranceId );

	try {
		if ( !$omitAudio ) {
			$audio = $this->retrieveFileContents( $audioSrc, $utteranceId, 'audio file' );
			$utterance->setAudio( $audio );
		}
		$synthesisMetadata = $this->retrieveFileContents(
			$synthesisMetadataSrc,
			$utteranceId,
			'synthesis metadata file'
		);
		$utterance->setSynthesisMetadata( $synthesisMetadata );
	} catch ( ExternalStoreException $e ) {
		// Database and file backend are out of sync for this utterance.
		$this->logger->warning( __METHOD__ . ': ' . $e->getMessage() );
		return null;
	}

	return $utterance;
}
194 | |
/**
 * Retrieves the utterance metadata from the database for a given segment in a page,
 * using a specific voice and language.
 *
 * If several rows match, the most recently stored one is returned.
 * Audio and synthesis metadata are not loaded by this method.
 *
 * @since 0.1.5
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param int $pageId Mediawiki page ID.
 * @param string $language ISO-639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @return Utterance|null Utterance or null if not found in database
 */
public function retrieveUtteranceMetadata(
	?string $consumerUrl,
	int $pageId,
	string $language,
	string $voice,
	string $segmentHash
): ?Utterance {
	$remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
	$dbr = $this->dbLoadBalancer->getConnection( DB_REPLICA );
	$row = $dbr->selectRow(
		self::UTTERANCE_TABLE,
		[
			'wsu_utterance_id',
			'wsu_remote_wiki_hash',
			'wsu_page_id',
			'wsu_lang',
			'wsu_voice',
			'wsu_seg_hash',
			'wsu_date_stored'
		],
		[
			'wsu_remote_wiki_hash' => $remoteWikiHash,
			'wsu_page_id' => $pageId,
			'wsu_lang' => $language,
			'wsu_voice' => $voice,
			'wsu_seg_hash' => $segmentHash
		],
		__METHOD__,
		// ORDER BY has to be given as a keyed option and must reference the
		// real column name for the newest row to be selected.
		[ 'ORDER BY' => 'wsu_date_stored DESC' ]
	);
	if ( !$row ) {
		return null;
	}
	return new Utterance(
		intval( $row->wsu_utterance_id ),
		// Keep null as null: it marks an utterance of a local page.
		$row->wsu_remote_wiki_hash === null ? null : strval( $row->wsu_remote_wiki_hash ),
		intval( $row->wsu_page_id ),
		strval( $row->wsu_lang ),
		strval( $row->wsu_voice ),
		strval( $row->wsu_seg_hash ),
		MWTimestamp::getInstance( $row->wsu_date_stored )
	);
}
247 | |
/**
 * Retrieve the file contents from the backend.
 *
 * @since 0.1.5
 * @param string $src Storage path of the file to read.
 * @param int $utteranceId Identity of the owning utterance, used in the error message only.
 * @param string $type Human readable kind of file, used in the error message only.
 * @return mixed File contents
 * @throws ExternalStoreException If the file backend reports a failure reading the file.
 */
public function retrieveFileContents( $src, $utteranceId, $type ) {
	$content = $this->getFileBackend()->getFileContents( [
		'src' => $src
	] );
	// Strict comparison: a legitimately empty (or otherwise falsy) file content
	// must not be mistaken for the failure sentinel.
	if ( $content === FileBackend::CONTENT_FAIL ) {
		// @note Consider queuing job to flush inconsistencies from database.
		throw new ExternalStoreException(
			"Inconsistency! Database contains utterance with ID $utteranceId " .
			"that does not exist as $type named $src in file backend." );
	}
	return $content;
}
270 | |
/**
 * Persists a new utterance: inserts its metadata row in the database, then
 * writes the audio and the synthesis metadata as two files in the file backend.
 *
 * After a successful store, the expiration flush job queue is given the
 * chance to queue a job.
 *
 * @since 0.1.5
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param int $pageId Mediawiki page ID.
 * @param string $language ISO 639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param string $audio Utterance audio.
 * @param string $synthesisMetadata JSON form metadata about the audio.
 * @return Utterance Inserted utterance.
 * @throws ExternalStoreException If unable to prepare or create files in file backend.
 */
public function createUtterance(
	?string $consumerUrl,
	int $pageId,
	string $language,
	string $voice,
	string $segmentHash,
	string $audio,
	string $synthesisMetadata
): Utterance {
	$remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
	$dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();

	// 1. Metadata row in the database.
	$dateStored = $dbw->timestamp();
	$dbw->insert(
		self::UTTERANCE_TABLE,
		[
			'wsu_remote_wiki_hash' => $remoteWikiHash,
			'wsu_page_id' => $pageId,
			'wsu_lang' => $language,
			'wsu_voice' => $voice,
			'wsu_seg_hash' => $segmentHash,
			'wsu_date_stored' => $dateStored
		],
		__METHOD__
	);
	$utterance = new Utterance(
		intval( $dbw->insertId() ),
		$remoteWikiHash,
		$pageId,
		$language,
		$voice,
		$segmentHash,
		MWTimestamp::getInstance( $dateStored )
	);
	$utteranceId = $utterance->getUtteranceId();

	// 2. Audio file in the file backend.
	$audioUrl = $this->audioUrlFactory( $utteranceId );
	$this->storeFile( $audioUrl, $audio, 'audio file' );
	$utterance->setAudio( $audio );

	// 3. Synthesis metadata file in the file backend.
	$synthesisMetadataUrl = $this->synthesisMetadataUrlFactory( $utteranceId );
	$this->storeFile( $synthesisMetadataUrl, $synthesisMetadata, 'synthesis metadata file' );
	$utterance->setSynthesisMetadata( $synthesisMetadata );

	// 4. Let the flush job queue decide whether it is time to queue a job.
	( new FlushUtterancesFromStoreByExpirationJobQueue() )->maybeQueueJob();

	return $utterance;
}
336 | |
/**
 * Writes content to a file in the file backend, preparing the
 * containing directory first.
 *
 * When Swift expiry is enabled in the configuration and the backend is a
 * SwiftFileBackend, the file is created with an X-Delete-After header
 * derived from the configured time to live.
 *
 * @since 0.1.5
 * @param string $fileUrl Storage path of the file to create.
 * @param mixed $content Content to write.
 * @param string $type Human readable kind of file, used in error messages only.
 * @throws ExternalStoreException If the directory cannot be prepared or the file cannot be created.
 */
public function storeFile( $fileUrl, $content, $type ) {
	$backend = $this->getFileBackend();

	$prepareStatus = $backend->prepare( [
		'dir' => dirname( $fileUrl ),
		'noAccess' => 1,
		'noListing' => 1
	] );
	if ( !$prepareStatus->isOK() ) {
		throw new ExternalStoreException( "Failed to prepare $type: $fileUrl." );
	}

	$createParams = [
		'dst' => $fileUrl,
		'content' => $content
	];
	$expireInSwift = $this->isWikispeechUtteranceUseSwiftFileBackendExpiring()
		&& $backend instanceof SwiftFileBackend;
	if ( $expireInSwift ) {
		// Mark files in Swift for automatic removal after TTL.
		// See $this->flushUtterances for code that skips forced removal if backend is Swift.
		$ttlSeconds = $this->getWikispeechUtteranceTimeToLiveDays() * 60 * 60 * 24;
		$createParams['headers'] = [
			// number of seconds from now
			'X-Delete-After' => $ttlSeconds
		];
	}

	$createStatus = $backend->create( $createParams );
	if ( !$createStatus->isOK() ) {
		throw new ExternalStoreException( "Failed to create $type: $fileUrl." );
	}
}
372 | |
/**
 * Clears database and file backend of utterances older than a given age.
 *
 * Selects every utterance whose wsu_date_stored is less than or equal to the
 * given expiration date and hands the result to flushUtterances().
 *
 * @since 0.1.5
 * @param MWTimestamp $expirationDate Utterances stored at or before this instant are flushed.
 * @return int Number of utterances flushed.
 */
public function flushUtterancesByExpirationDate( $expirationDate ) {
	$dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
	// Convert to the database's own timestamp format (the same conversion that is
	// used when the row is written) and quote the literal, instead of
	// concatenating a bare value into the SQL condition.
	$quotedExpirationDate = $dbw->addQuotes(
		$dbw->timestamp( $expirationDate->getTimestamp( TS_MW ) )
	);
	$results = $dbw->select(
		self::UTTERANCE_TABLE,
		[ 'wsu_utterance_id' ],
		[ 'wsu_date_stored <= ' . $quotedExpirationDate ],
		__METHOD__
	);
	return $this->flushUtterances( $dbw, $results );
}
389 | |
/**
 * Removes every utterance belonging to one page, both from the database
 * and from the file backend.
 *
 * @since 0.1.5
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param int $pageId Mediawiki page ID.
 * @return int Number of utterances flushed.
 */
public function flushUtterancesByPage(
	?string $consumerUrl,
	int $pageId
): int {
	$dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
	$pageConditions = [
		'wsu_remote_wiki_hash' => self::evaluateRemoteWikiHash( $consumerUrl ),
		'wsu_page_id' => $pageId
	];
	$matchingUtterances = $dbw->select(
		self::UTTERANCE_TABLE,
		[ 'wsu_utterance_id' ],
		$pageConditions,
		__METHOD__
	);
	return $this->flushUtterances( $dbw, $matchingUtterances );
}
414 | |
/**
 * Removes every utterance of a language, optionally limited to one voice,
 * both from the database and from the file backend.
 *
 * @since 0.1.5
 * @param string $language ISO 639.
 * @param string|null $voice Optional name of synthesis voice to limit flush to.
 *  If not set, utterances of all voices in the language are flushed.
 * @return int Number of utterances flushed.
 */
public function flushUtterancesByLanguageAndVoice( $language, $voice = null ) {
	// Loose comparison is intentional here: an empty string is treated
	// the same way as null, i.e. as "no voice given".
	$conditions = $voice != null
		? [ 'wsu_lang' => $language, 'wsu_voice' => $voice ]
		: [ 'wsu_lang' => $language ];

	$dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
	$matchingUtterances = $dbw->select(
		self::UTTERANCE_TABLE,
		[ 'wsu_utterance_id' ],
		$conditions,
		__METHOD__
	);
	return $this->flushUtterances( $dbw, $matchingUtterances );
}
437 | |
/**
 * Flushes utterances listed in a result set containing
 * at least the wsu_utterance_id column.
 *
 * An utterance only counts towards the return value if it was
 * successfully deleted in all layers, i.e. the utterance metadata database
 * row as well as the utterance audio and synthesis metadata files.
 * E.g. if the audio file was missing and thus could not be explicitly
 * removed, the utterance is not counted even though the database row and the
 * synthesis metadata file were removed. Such a case also produces a
 * warning in the log.
 *
 * If Swift expiry is enabled and the backend is a SwiftFileBackend, no
 * file deletion is attempted and the files are regarded as deleted.
 *
 * @note Consider if database should be flushing within a transaction.
 *
 * @since 0.1.5
 * @param IDatabase $dbw Writable database connection.
 * @param IResultWrapper $results Result set.
 * @return int Number of utterances that were successfully flushed in all layers.
 */
private function flushUtterances( $dbw, $results ) {
	if ( !$results ) {
		return 0;
	}

	// TTL is set when creating files in Swift, so no need to invoke any delete I/O operations.
	$filesExpireInSwift =
		$this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
		$this->getFileBackend() instanceof SwiftFileBackend;

	$flushedCount = 0;
	foreach ( $results as $row ) {
		$utteranceId = $row->wsu_utterance_id;

		// 1. delete in database
		$rowDeleted = $dbw->delete(
			self::UTTERANCE_TABLE,
			[ 'wsu_utterance_id' => $utteranceId ],
			__METHOD__
		);
		if ( $rowDeleted ) {
			$this->logger->debug(
				__METHOD__ . ': Flushed out utterance with id {utteranceId} from database',
				[ 'utteranceId' => $utteranceId ]
			);
		} else {
			$this->logger->warning(
				__METHOD__ . ': Failed to delete utterance {utteranceId} from database.',
				[ 'utteranceId' => $utteranceId ]
			);
		}

		// 2. delete in file store.
		// The files are regarded as deleted when they were marked for automatic
		// deletion using TTL in the Swift create operation.
		$filesDeleted = true;
		if ( !$filesExpireInSwift ) {
			// Both deletions are always attempted, even if the first one fails.
			$audioDeleted = $this->deleteFileBackendFile(
				$this->audioUrlFactory( $utteranceId ),
				$utteranceId,
				'audio file'
			);
			$synthesisMetadataDeleted = $this->deleteFileBackendFile(
				$this->synthesisMetadataUrlFactory( $utteranceId ),
				$utteranceId,
				'synthesis metadata file'
			);
			$filesDeleted = $audioDeleted && $synthesisMetadataDeleted;
		}

		if ( $rowDeleted && $filesDeleted ) {
			$flushedCount++;
		}
	}

	return $flushedCount;
}
517 | |
/**
 * Deletes a single file from the file backend and afterwards asks the
 * backend to clean the directory the file was located in.
 *
 * A warning is logged, and false is returned, if the file does not exist
 * or if the backend fails to delete it.
 *
 * @since 0.1.5
 * @param string $src Storage path of the file to delete.
 * @param int $utteranceId Identity of the owning utterance, used for logging only.
 * @param string $type Human readable kind of file, used for logging only.
 * @return bool If successfully deleted
 */
private function deleteFileBackendFile( $src, $utteranceId, $type ) {
	$fileBackend = $this->getFileBackend();
	$fileParams = [
		'src' => $src
	];

	if ( !$fileBackend->fileExists( $fileParams ) ) {
		$this->logger->warning( __METHOD__ . ': ' .
			'Attempted to delete non existing {type} for utterance {utteranceId}.', [
				'utteranceId' => $utteranceId,
				'type' => $type
			] );
		return false;
	}

	if ( !$fileBackend->delete( $fileParams )->isOK() ) {
		$this->logger->warning( __METHOD__ . ': ' .
			'Unable to delete {type} for utterance with identity {utteranceId}.', [
				'utteranceId' => $utteranceId,
				'type' => $type
			] );
		return false;
	}

	// Clean the directory that actually contained the file. The full storage
	// path is required here, the same way storeFile() prepares dirname( $fileUrl );
	// the bare relative path from urlPathFactory() does not address a directory
	// in the backend.
	$fileBackend->clean( [ 'dir' => dirname( $src ) ] );

	$this->logger->debug( __METHOD__ . ': ' .
		'Flushed out file {src}', [ 'src' => $src ] );
	return true;
}
552 | |
/**
 * Builds a deterministic directory path from an utterance identity:
 * every character of the identity except the last three becomes one
 * directory level. This limits a directory to 1000 identities
 * (2000 files, as both .json and .opus are stored) and 10 subdirectories.
 *
 * Overloading a directory with files often causes performance problems.
 *
 * 1 -> /
 * 12 -> /
 * 123 -> /
 * 1234 -> /1/
 * 12345 -> /1/2/
 * 123456 -> /1/2/3/
 * 1234567 -> /1/2/3/4/
 *
 * @since 0.1.5
 * @param int $utteranceId
 * @return string Path
 */
private function urlPathFactory( $utteranceId ) {
	$idText = strval( $utteranceId );
	$directoryLevels = strlen( $idText ) - 3;
	if ( $directoryLevels <= 0 ) {
		// Three characters or fewer: the file lives in the root directory.
		return '/';
	}
	$leadingCharacters = str_split( substr( $idText, 0, $directoryLevels ) );
	return '/' . implode( '/', $leadingCharacters ) . '/';
}
581 | |
/**
 * Builds the storage path of an utterance without any file suffix:
 * container storage path, followed by the directory path of the
 * utterance, followed by the utterance identity.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string Suffix-less url used to access objects of the utterance in file store
 */
private function audioUrlPrefixFactory( $utteranceId ) {
	$containerPath = $this->getFileBackend()
		->getContainerStoragePath( $this->fileBackendContainerName );
	$directoryPath = $this->urlPathFactory( $utteranceId );
	return $containerPath . $directoryPath . $utteranceId;
}
591 | |
/**
 * Builds the storage path of the audio file (.opus) of an utterance.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function audioUrlFactory( $utteranceId ) {
	$prefix = $this->audioUrlPrefixFactory( $utteranceId );
	return $prefix . '.opus';
}
600 | |
/**
 * Builds the storage path of the synthesis metadata file (.json) of an utterance.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function synthesisMetadataUrlFactory( $utteranceId ) {
	$prefix = $this->audioUrlPrefixFactory( $utteranceId );
	return $prefix . '.json';
}
609 | |
/**
 * Walks the utterance container in the file backend and removes
 * every file whose timestamp is at or before the expiration timestamp.
 *
 * @since 0.1.7
 * @param MWTimestamp|null $expiredTimestamp File timestamp <= to this value is orphaned.
 *  Defaults to config value.
 * @return int Number of expired files flushed
 */
public function flushUtterancesByExpirationDateOnFile( $expiredTimestamp = null ) {
	// @note Either this method, or the job,
	// should probably call `flushUtterancesByExpirationDate`
	// to ensure we are not deleting a bunch of files
	// which were scheduled to be deleted together with their db-entries anyway.

	$effectiveTimestamp = $expiredTimestamp
		?: $this->getWikispeechUtteranceExpirationTimestamp();

	$fileBackend = $this->getFileBackend();
	$containerPath = $fileBackend
		->getContainerStoragePath( $this->fileBackendContainerName );

	return $this->recurseFlushUtterancesByExpirationDateOnFile(
		$fileBackend,
		$containerPath,
		$effectiveTimestamp
	);
}
635 | |
/**
 * Depth-first removal of expired files below a directory: subdirectories
 * are processed first, then the files directly in the directory, and
 * finally the backend is asked to clean the directory itself.
 *
 * @since 0.1.7
 * @param FileBackend $fileBackend Backend to list and delete files in.
 * @param string $directory Storage path of the directory to process.
 * @param MWTimestamp $expiredTimestamp Files with a timestamp <= this value are deleted.
 * @return int Number of expired files flushed
 */
private function recurseFlushUtterancesByExpirationDateOnFile(
	$fileBackend,
	$directory,
	$expiredTimestamp
) {
	$this->logger->debug(
		__METHOD__ . ': Processing directory {directory}',
		[ 'directory' => $directory ]
	);
	$deletedCount = 0;

	// Descend into subdirectories first.
	$subdirectories = $fileBackend->getDirectoryList( [
		'dir' => $directory,
		'topOnly' => true,
	] );
	foreach ( $subdirectories ?: [] as $subdirectory ) {
		$deletedCount += $this->recurseFlushUtterancesByExpirationDateOnFile(
			$fileBackend,
			$directory . '/' . $subdirectory,
			$expiredTimestamp
		);
	}

	// Then handle the files located directly in this directory.
	$files = $fileBackend->getFileList( [
		'dir' => $directory,
		'topOnly' => true,
		'adviseStat' => false
	] );
	foreach ( $files ?: [] as $file ) {
		$fileParams = [ 'src' => $directory . '/' . $file ];
		$fileTimestamp = new MWTimestamp( $fileBackend->getFileTimestamp( $fileParams ) );
		$this->logger->debug(
			__METHOD__ . ': Processing file {src} with timestamp {timestamp}',
			[
				'src' => $file,
				'timestamp' => $fileTimestamp,
				'expiredTimestamp' => $expiredTimestamp
			]
		);

		if ( !( $fileTimestamp <= $expiredTimestamp ) ) {
			// Not expired yet, leave the file alone.
			continue;
		}

		if ( $fileBackend->delete( $fileParams )->isOK() ) {
			$deletedCount++;
			$this->logger->debug(
				__METHOD__ . ': Deleted expired file {file} #{num}',
				[
					'file' => $file,
					'num' => $deletedCount
				]
			);
		} else {
			$this->logger->warning(
				__METHOD__ . ': Unable to delete expired file {file}',
				[ 'file' => $file ]
			);
		}
	}

	// Finally ask the backend to clean this directory.
	$this->getFileBackend()->clean( [ 'dir' => $directory ] );
	return $deletedCount;
}
701 | |
/**
 * Calculates the point in time that lies WikispeechUtteranceTimeToLiveDays
 * days before now.
 *
 * @return MWTimestamp Utterance parts with timestamp <= this is expired.
 */
public function getWikispeechUtteranceExpirationTimestamp(): MWTimestamp {
	$timeToLiveDays = $this->getWikispeechUtteranceTimeToLiveDays();
	// Relative date expression understood by strtotime, e.g. "-31days".
	$expirationUnixTime = strtotime( "-{$timeToLiveDays}days" );
	return MWTimestamp::getInstance( $expirationUnixTime );
}
712 | |
/**
 * Reads the WikispeechUtteranceTimeToLiveDays setting as an integer.
 *
 * @return int Number of days an utterance is to exist before being flushed out.
 */
private function getWikispeechUtteranceTimeToLiveDays(): int {
	$configuredDays = $this->config->get( 'WikispeechUtteranceTimeToLiveDays' );
	return (int)$configuredDays;
}
719 | |
/**
 * Reads the WikispeechUtteranceUseSwiftFileBackendExpiring setting as a boolean.
 *
 * The value is cast explicitly so that a setting that is null (or otherwise
 * not a boolean) yields a boolean instead of violating the declared return type.
 *
 * @return bool Value of the setting, converted to boolean.
 */
private function isWikispeechUtteranceUseSwiftFileBackendExpiring(): bool {
	return (bool)$this->config->get( 'WikispeechUtteranceUseSwiftFileBackendExpiring' );
}
726 | |
/**
 * Evaluates the hash of a gadget consumer URL,
 * i.e. of the remote wiki where the page is located.
 *
 * The digest is calculated over the lower cased host, the port and the
 * path of the URL, in that order; parts missing from the URL are skipped.
 * Other parts of the URL do not contribute to the hash.
 *
 * Making changes to this function will probably invalidate all existing cached utterances.
 *
 * @since 0.1.9
 * @param string|null $consumerUrl
 * @return string|null SHA256 message digest, or null if no consumer URL was given.
 */
public static function evaluateRemoteWikiHash( ?string $consumerUrl ): ?string {
	if ( $consumerUrl === null ) {
		return null;
	}

	$urlParts = parse_url( $consumerUrl );

	// Hashing the concatenation yields the same digest as feeding the
	// parts one by one to an incremental hash context.
	$digestInput = '';
	foreach ( [ 'host', 'port', 'path' ] as $partName ) {
		if ( !isset( $urlParts[$partName] ) ) {
			continue;
		}
		$part = strval( $urlParts[$partName] );
		$digestInput .= $partName === 'host' ? mb_strtolower( $part ) : $part;
	}

	return hash( 'sha256', $digestInput );
}
754 | |
755 | } |