Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.80% |
303 / 375 |
|
46.15% |
12 / 26 |
CRAP | |
0.00% |
0 / 1 |
UtteranceStore | |
80.80% |
303 / 375 |
|
46.15% |
12 / 26 |
102.70 | |
0.00% |
0 / 1 |
__construct | |
61.54% |
8 / 13 |
|
0.00% |
0 / 1 |
2.23 | |||
getFileBackend | |
8.00% |
2 / 25 |
|
0.00% |
0 / 1 |
16.46 | |||
findUtterance | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
2 | |||
findMessageUtterance | |
90.91% |
10 / 11 |
|
0.00% |
0 / 1 |
2.00 | |||
loadUtteranceAudio | |
71.43% |
15 / 21 |
|
0.00% |
0 / 1 |
4.37 | |||
retrieveUtteranceMetadata | |
97.44% |
38 / 39 |
|
0.00% |
0 / 1 |
6 | |||
retrieveFileContents | |
62.50% |
5 / 8 |
|
0.00% |
0 / 1 |
2.21 | |||
createUtterance | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
2.00 | |||
createMessageUtterance | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
storeUtterance | |
100.00% |
37 / 37 |
|
100.00% |
1 / 1 |
1 | |||
storeFile | |
73.68% |
14 / 19 |
|
0.00% |
0 / 1 |
5.46 | |||
flushUtterancesByExpirationDate | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
flushUtterancesByPage | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
flushUtterancesByLanguageAndVoice | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
flushUtterances | |
85.00% |
34 / 40 |
|
0.00% |
0 / 1 |
9.27 | |||
deleteFileBackendFile | |
42.86% |
9 / 21 |
|
0.00% |
0 / 1 |
4.68 | |||
urlPathFactory | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
audioUrlPrefixFactory | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
audioUrlFactory | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
synthesisMetadataUrlFactory | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
flushUtterancesByExpirationDateOnFile | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
2.01 | |||
recurseFlushUtterancesByExpirationDateOnFile | |
91.11% |
41 / 45 |
|
0.00% |
0 / 1 |
7.03 | |||
getWikispeechUtteranceExpirationTimestamp | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
getWikispeechUtteranceTimeToLiveDays | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isWikispeechUtteranceUseSwiftFileBackendExpiring | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
evaluateRemoteWikiHash | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Wikispeech\Utterance; |
4 | |
5 | /** |
6 | * @file |
7 | * @ingroup Extensions |
8 | * @license GPL-2.0-or-later |
9 | */ |
10 | |
11 | use Config; |
12 | use ExternalStoreException; |
13 | use FileBackend; |
14 | use FSFileBackend; |
15 | use MediaWiki\Logger\LoggerFactory; |
16 | use MediaWiki\MediaWikiServices; |
17 | use MediaWiki\WikiMap\WikiMap; |
18 | use MWTimestamp; |
19 | use Psr\Log\LoggerInterface; |
20 | use RuntimeException; |
21 | use SwiftFileBackend; |
22 | use Wikimedia\Rdbms\IDatabase; |
23 | use Wikimedia\Rdbms\ILoadBalancer; |
24 | use Wikimedia\Rdbms\IResultWrapper; |
25 | |
26 | /** |
27 | * Keeps track of utterances in persistent layers. |
28 | * |
29 | * Utterance metadata (i.e. segment hash, page id, language, etc) is stored in a database table. |
30 | * Utterance audio (synthesised voice audio) is stored as an opus file in file backend. |
31 | * Synthesis metadata (tokens, etc) is stored as a JSON file in file backend. |
32 | * |
33 | * (.opus and .json suffixes are added in the file backend store although this class is agnostic |
34 | * regarding the actual data encoding and formats.) |
35 | * |
36 | * @since 0.1.13 Introduces messageKey as parameter for system error messages. |
37 | * @since 0.1.5 |
38 | */ |
39 | class UtteranceStore { |
40 | |
41 | /** @var string Name of database table that keeps track of utterance metadata. */ |
42 | public const UTTERANCE_TABLE = 'wikispeech_utterance'; |
43 | |
44 | /** @var LoggerInterface */ |
45 | private $logger; |
46 | |
47 | /** |
48 | * Don't use this directly, access @see getFileBackend |
49 | * @var FileBackend Used to store utterance audio and synthesis metadata. |
50 | */ |
51 | private $fileBackend; |
52 | |
53 | /** |
54 | * @var ILoadBalancer |
55 | */ |
56 | private $dbLoadBalancer; |
57 | |
58 | /** @var string Name of container (sort of path prefix) used for files in backend. */ |
59 | private $fileBackendContainerName; |
60 | |
61 | /** @var Config */ |
62 | private $config; |
63 | |
64 | public function __construct() { |
65 | $this->logger = LoggerFactory::getInstance( 'Wikispeech' ); |
66 | |
67 | // @todo don't create, add as constructor parameter |
68 | // Refer to https://phabricator.wikimedia.org/T264165 |
69 | $this->config = MediaWikiServices::getInstance() |
70 | ->getConfigFactory() |
71 | ->makeConfig( 'wikispeech' ); |
72 | |
73 | $this->fileBackendContainerName = $this->config |
74 | ->get( 'WikispeechUtteranceFileBackendContainerName' ); |
75 | if ( !$this->fileBackendContainerName ) { |
76 | $this->fileBackendContainerName = "wikispeech-utterances"; |
77 | $this->logger->info( __METHOD__ . ': ' . |
78 | 'Falling back on container name {containerName}', [ |
79 | 'containerName' => $this->fileBackendContainerName |
80 | ] ); |
81 | } |
82 | |
83 | $this->dbLoadBalancer = MediaWikiServices::getInstance()->getDBLoadBalancer(); |
84 | } |
85 | |
86 | /** |
87 | * @since 0.1.5 |
88 | * @return FileBackend |
89 | * @throws ExternalStoreException If defined file backend group does not exists. |
90 | */ |
91 | private function getFileBackend() { |
92 | global $wgUploadDirectory; |
93 | if ( !$this->fileBackend ) { |
94 | |
95 | /** @var string Name of file backend group in LocalSettings.php to use. */ |
96 | $fileBackendName = $this->config->get( 'WikispeechUtteranceFileBackendName' ); |
97 | if ( !$fileBackendName ) { |
98 | $fileBackendName = 'wikispeech-backend'; |
99 | $fallbackDir = "$wgUploadDirectory/wikispeech_utterances"; |
100 | $this->logger->info( __METHOD__ . ': ' . |
101 | 'No file backend defined in LocalSettings.php. Falling back ' . |
102 | 'on FS storage backend named {name} in {dir}.', [ |
103 | 'name' => $fileBackendName, |
104 | 'dir' => $fallbackDir |
105 | ] ); |
106 | $this->fileBackend = new FSFileBackend( [ |
107 | 'name' => $fileBackendName, |
108 | 'wikiId' => WikiMap::getCurrentWikiId(), |
109 | 'basePath' => $fallbackDir |
110 | ] ); |
111 | } else { |
112 | $fileBackend = MediaWikiServices::getInstance() |
113 | ->getFileBackendGroup() |
114 | ->get( $fileBackendName ); |
115 | if ( $fileBackend ) { |
116 | $this->fileBackend = $fileBackend; |
117 | } else { |
118 | throw new ExternalStoreException( |
119 | "No file backend group in LocalSettings.php named $fileBackendName." |
120 | ); |
121 | } |
122 | } |
123 | } |
124 | return $this->fileBackend; |
125 | } |
126 | |
127 | /** |
128 | * Retrieves an utterance for a given segment in a page, using a specific |
129 | * voice and language. |
130 | * |
131 | * @since 0.1.13 |
132 | * @param string|null $consumerUrl Remote wiki where page is located, or null if local. |
133 | * @param int $pageId Mediawiki page ID. |
134 | * @param string $language ISO-639. |
135 | * @param string $voice Name of synthesis voice. |
136 | * @param string $segmentHash Hash of segment representing utterance. |
137 | * @param bool $omitAudio If true, then no audio is returned. |
138 | * @return Utterance|null Utterance found, or null if non-existing. |
139 | */ |
140 | public function findUtterance( |
141 | ?string $consumerUrl, |
142 | int $pageId, |
143 | string $language, |
144 | string $voice, |
145 | string $segmentHash, |
146 | bool $omitAudio = false |
147 | ): ?Utterance { |
148 | $utterance = $this->retrieveUtteranceMetadata( |
149 | $consumerUrl, |
150 | $pageId, |
151 | null, |
152 | $language, |
153 | $voice, |
154 | $segmentHash |
155 | ); |
156 | if ( !$utterance ) { |
157 | return null; |
158 | } |
159 | |
160 | return $this->loadUtteranceAudio( $utterance, $omitAudio ); |
161 | } |
162 | |
163 | /** |
164 | * Retrieves an utterance for a specific error message |
165 | * |
166 | * @since 0.1.13 |
167 | * @param string|null $consumerUrl Remote wiki where page is located, or null if local. |
168 | * @param string $messageKey Message key for system message. |
169 | * @param string $language ISO-639. |
170 | * @param string $voice Name of synthesis voice. |
171 | * @param string $segmentHash Hash of segment representing utterance. |
172 | * @param bool $omitAudio If true, then no audio is returned. |
173 | * @return Utterance|null Utterance found, or null if non-existing. |
174 | */ |
175 | public function findMessageUtterance( |
176 | ?string $consumerUrl, |
177 | string $messageKey, |
178 | string $language, |
179 | string $voice, |
180 | string $segmentHash, |
181 | bool $omitAudio = false |
182 | ) { |
183 | $utterance = $this->retrieveUtteranceMetadata( |
184 | $consumerUrl, |
185 | 0, |
186 | $messageKey, |
187 | $language, |
188 | $voice, |
189 | $segmentHash |
190 | ); |
191 | if ( !$utterance ) { |
192 | return null; |
193 | } |
194 | |
195 | return $this->loadUtteranceAudio( $utterance, $omitAudio ); |
196 | } |
197 | |
198 | /** |
199 | * Loads utterance audio and synthesis metadata |
200 | * |
201 | * @since 0.1.13 |
202 | */ |
203 | private function loadUtteranceAudio( Utterance $utterance, bool $omitAudio ): ?Utterance { |
204 | $utteranceId = $utterance->getUtteranceId(); |
205 | |
206 | // @note We might want to keep this as separate function calls, |
207 | // allowing the user to request when needed, and perhaps |
208 | // pass a stream straight down from file backend to user |
209 | // rather than bouncing it via RAM. |
210 | // Not sure if this is an existing thing in PHP though. |
211 | |
212 | if ( !$omitAudio ) { |
213 | $audioSrc = $this->audioUrlFactory( $utteranceId ); |
214 | try { |
215 | $utterance->setAudio( $this->retrieveFileContents( |
216 | $audioSrc, |
217 | $utteranceId, |
218 | 'audio file' |
219 | ) ); |
220 | } catch ( ExternalStoreException $e ) { |
221 | $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() ); |
222 | return null; |
223 | } |
224 | } |
225 | |
226 | $synthesisMetadataSrc = $this->synthesisMetadataUrlFactory( $utteranceId ); |
227 | try { |
228 | $utterance->setSynthesisMetadata( $this->retrieveFileContents( |
229 | $synthesisMetadataSrc, |
230 | $utteranceId, |
231 | 'synthesis metadata file' |
232 | ) ); |
233 | } catch ( ExternalStoreException $e ) { |
234 | $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() ); |
235 | return null; |
236 | } |
237 | |
238 | return $utterance; |
239 | } |
240 | |
241 | /** |
242 | * Retrieves the utterance metadata from the database for a given segment in a page, |
243 | * using a specific voice and language. |
244 | * |
245 | * @since 0.1.13 Optional parameter messageKey |
246 | * @since 0.1.5 |
247 | * @param string|null $consumerUrl Remote wiki where page is located, or null if local. |
248 | * @param int $pageId Mediawiki page ID. |
249 | * @param string|null $messageKey Mediawiki message key. |
250 | * @param string $language ISO-639. |
251 | * @param string $voice Name of synthesis voice. |
252 | * @param string $segmentHash Hash of segment representing utterance. |
253 | * @return Utterance|null Utterance or null if not found in database |
254 | */ |
255 | public function retrieveUtteranceMetadata( |
256 | ?string $consumerUrl, |
257 | int $pageId, |
258 | ?string $messageKey, |
259 | string $language, |
260 | string $voice, |
261 | string $segmentHash |
262 | ): ?Utterance { |
263 | $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl ); |
264 | $dbr = $this->dbLoadBalancer->getConnection( DB_REPLICA ); |
265 | |
266 | $conditions = [ |
267 | 'wsu_remote_wiki_hash' => $remoteWikiHash, |
268 | 'wsu_lang' => $language, |
269 | 'wsu_voice' => $voice, |
270 | 'wsu_seg_hash' => $segmentHash |
271 | ]; |
272 | |
273 | if ( $pageId > 0 ) { |
274 | $conditions['wsu_page_id'] = $pageId; |
275 | } else { |
276 | if ( $messageKey === null ) { |
277 | throw new RuntimeException( 'If pageId is 0, messageKey must be provided.' ); |
278 | } |
279 | $conditions['wsu_message_key'] = $messageKey; |
280 | $conditions['wsu_page_id'] = 0; |
281 | } |
282 | |
283 | $row = $dbr->selectRow( self::UTTERANCE_TABLE, [ |
284 | 'wsu_utterance_id', |
285 | 'wsu_remote_wiki_hash', |
286 | 'wsu_message_key', |
287 | 'wsu_page_id', |
288 | 'wsu_lang', |
289 | 'wsu_voice', |
290 | 'wsu_seg_hash', |
291 | 'wsu_date_stored' |
292 | ], $conditions, __METHOD__, [ |
293 | 'ORDER BY date_stored DESC', |
294 | ] ); |
295 | if ( !$row ) { |
296 | return null; |
297 | } |
298 | $utterance = new Utterance( |
299 | intval( $row->wsu_utterance_id ), |
300 | $row->wsu_remote_wiki_hash === null ? null : strval( $row->wsu_remote_wiki_hash ), |
301 | $row->wsu_message_key ? strval( $row->wsu_message_key ) : null, |
302 | intval( $row->wsu_page_id ), |
303 | strval( $row->wsu_lang ), |
304 | strval( $row->wsu_voice ), |
305 | strval( $row->wsu_seg_hash ), |
306 | MWTimestamp::getInstance( $row->wsu_date_stored ) |
307 | ); |
308 | return $utterance; |
309 | } |
310 | |
311 | /** |
312 | * Retrieve the file contents from the backend. |
313 | * |
314 | * @since 0.1.5 |
315 | * @param string $src |
316 | * @param int $utteranceId |
317 | * @param string $type |
318 | * @return mixed File contents |
319 | * @throws ExternalStoreException |
320 | */ |
321 | public function retrieveFileContents( $src, $utteranceId, $type ) { |
322 | $content = $this->getFileBackend()->getFileContents( [ |
323 | 'src' => $src |
324 | ] ); |
325 | if ( $content == FileBackend::CONTENT_FAIL ) { |
326 | // @note Consider queuing job to flush inconsistencies from database. |
327 | throw new ExternalStoreException( |
328 | "Inconsistency! Database contains utterance with ID $utteranceId " . |
329 | "that does not exist as $type named $src in file backend." ); |
330 | } |
331 | return $content; |
332 | } |
333 | |
334 | /** |
335 | * Creates an utterance in the database. |
336 | * |
337 | * @since 0.1.13 |
338 | * @param string|null $consumerUrl |
339 | * @param int $pageId Mediawiki page ID. |
340 | * @param string $language ISO 639. |
341 | * @param string $voice Name of synthesis voice. |
342 | * @param string $segmentHash Hash of segment representing utterance. |
343 | * @param string $audio Utterance audio. |
344 | * @param string $synthesisMetadata JSON form metadata about the audio. |
345 | * @return Utterance Inserted utterance. |
346 | * @throws ExternalStoreException If unable to prepare or create files in file backend. |
347 | */ |
348 | public function createUtterance( |
349 | ?string $consumerUrl, |
350 | int $pageId, |
351 | string $language, |
352 | string $voice, |
353 | string $segmentHash, |
354 | string $audio, |
355 | string $synthesisMetadata |
356 | ): Utterance { |
357 | if ( $pageId === 0 ) { |
358 | throw new RuntimeException( 'Page ID must not be 0 when creating regular utterance.' ); |
359 | } |
360 | |
361 | return $this->storeUtterance( |
362 | $consumerUrl, |
363 | $pageId, |
364 | null, |
365 | $language, |
366 | $voice, |
367 | $segmentHash, |
368 | $audio, |
369 | $synthesisMetadata |
370 | ); |
371 | } |
372 | |
373 | /** |
374 | * Creates a system error utterance in the database and prepares for storing. |
375 | * |
376 | * @since 0.1.13 |
377 | * @param string|null $consumerUrl |
378 | * @param string|null $messageKey Mediawiki message key. |
379 | * @param string $language ISO 639. |
380 | * @param string $voice Name of synthesis voice. |
381 | * @param string $segmentHash Hash of segment representing utterance. |
382 | * @param string $audio Utterance audio. |
383 | * @param string $synthesisMetadata JSON form metadata about the audio. |
384 | * @return Utterance Inserted utterance. |
385 | * @throws ExternalStoreException If unable to prepare or create files in file backend. |
386 | */ |
387 | public function createMessageUtterance( |
388 | ?string $consumerUrl, |
389 | ?string $messageKey, |
390 | string $language, |
391 | string $voice, |
392 | string $segmentHash, |
393 | string $audio, |
394 | string $synthesisMetadata |
395 | ) { |
396 | return $this->storeUtterance( |
397 | $consumerUrl, |
398 | 0, |
399 | $messageKey, |
400 | $language, |
401 | $voice, |
402 | $segmentHash, |
403 | $audio, |
404 | $synthesisMetadata |
405 | ); |
406 | } |
407 | |
408 | /** |
409 | * Stores a created utterance. |
410 | * |
411 | * @since 0.1.13 |
412 | * @param string|null $consumerUrl |
413 | * @param int $pageId Mediawiki page ID. |
414 | * @param string|null $messageKey Mediawiki message key. |
415 | * @param string $language ISO 639. |
416 | * @param string $voice Name of synthesis voice. |
417 | * @param string $segmentHash Hash of segment representing utterance. |
418 | * @param string $audio Utterance audio. |
419 | * @param string $synthesisMetadata JSON form metadata about the audio. |
420 | * @return Utterance Inserted utterance. |
421 | * @throws ExternalStoreException If unable to prepare or create files in file backend. |
422 | */ |
423 | private function storeUtterance( |
424 | ?string $consumerUrl, |
425 | int $pageId, |
426 | ?string $messageKey, |
427 | string $language, |
428 | string $voice, |
429 | string $segmentHash, |
430 | string $audio, |
431 | string $synthesisMetadata |
432 | ) { |
433 | $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl ); |
434 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
435 | $rows = [ |
436 | 'wsu_remote_wiki_hash' => $remoteWikiHash, |
437 | 'wsu_page_id' => $pageId, |
438 | 'wsu_message_key' => $messageKey, |
439 | 'wsu_lang' => $language, |
440 | 'wsu_voice' => $voice, |
441 | 'wsu_seg_hash' => $segmentHash, |
442 | 'wsu_date_stored' => $dbw->timestamp() |
443 | ]; |
444 | $dbw->insert( self::UTTERANCE_TABLE, $rows, __METHOD__ ); |
445 | $utterance = new Utterance( |
446 | intval( $dbw->insertId() ), |
447 | $remoteWikiHash, |
448 | $messageKey, |
449 | $pageId, |
450 | $language, |
451 | $voice, |
452 | $segmentHash, |
453 | MWTimestamp::getInstance( $rows['wsu_date_stored'] ) |
454 | ); |
455 | |
456 | // create audio file |
457 | $this->storeFile( |
458 | $this->audioUrlFactory( $utterance->getUtteranceId() ), |
459 | $audio, |
460 | 'audio file' |
461 | ); |
462 | $utterance->setAudio( $audio ); |
463 | |
464 | // create synthesis metadata file |
465 | $this->storeFile( |
466 | $this->synthesisMetadataUrlFactory( $utterance->getUtteranceId() ), |
467 | $synthesisMetadata, |
468 | 'synthesis metadata file' |
469 | ); |
470 | $utterance->setSynthesisMetadata( $synthesisMetadata ); |
471 | |
472 | $jobQueue = new FlushUtterancesFromStoreByExpirationJobQueue(); |
473 | $jobQueue->maybeQueueJob(); |
474 | |
475 | return $utterance; |
476 | } |
477 | |
478 | /** |
479 | * Store a file in the backend. |
480 | * |
481 | * @since 0.1.5 |
482 | * @param string $fileUrl |
483 | * @param mixed $content |
484 | * @param string $type |
485 | * @throws ExternalStoreException |
486 | */ |
487 | public function storeFile( $fileUrl, $content, $type ) { |
488 | $fileBackend = $this->getFileBackend(); |
489 | |
490 | if ( !$fileBackend->prepare( [ |
491 | 'dir' => dirname( $fileUrl ), |
492 | 'noAccess' => 1, |
493 | 'noListing' => 1 |
494 | ] )->isOK() ) { |
495 | throw new ExternalStoreException( "Failed to prepare $type: $fileUrl." ); |
496 | } |
497 | $opts = [ |
498 | 'dst' => $fileUrl, |
499 | 'content' => $content, |
500 | 'overwrite' => true |
501 | ]; |
502 | if ( $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() && |
503 | $fileBackend instanceof SwiftFileBackend ) { |
504 | // Mark files in Swift for automatic removal after TTL. |
505 | // See $this->flushUtterances for code that skips forced removal if backend is Swift. |
506 | $opts['headers'] = [ |
507 | // number of seconds from now |
508 | 'X-Delete-After' => $this->getWikispeechUtteranceTimeToLiveDays() * 60 * 60 * 24 |
509 | ]; |
510 | } |
511 | if ( !$fileBackend->create( $opts )->isOK() ) { |
512 | throw new ExternalStoreException( "Failed to create $type: $fileUrl." ); |
513 | } |
514 | } |
515 | |
516 | /** |
517 | * Clears database and file backend of utterances older than a given age. |
518 | * |
519 | * @since 0.1.5 |
520 | * @param MWTimestamp $expirationDate |
521 | * @return int Number of utterances flushed. |
522 | */ |
523 | public function flushUtterancesByExpirationDate( $expirationDate ) { |
524 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
525 | $results = $dbw->select( self::UTTERANCE_TABLE, |
526 | [ 'wsu_utterance_id' ], |
527 | [ 1 => 'wsu_date_stored <= ' . $expirationDate->getTimestamp( TS_MW ) ], |
528 | __METHOD__ |
529 | ); |
530 | return $this->flushUtterances( $dbw, $results ); |
531 | } |
532 | |
533 | /** |
534 | * Clears database and file backend of all utterances for a given page. |
535 | * |
536 | * @since 0.1.5 |
537 | * @param string|null $consumerUrl |
538 | * @param int $pageId Mediawiki page ID. |
539 | * @return int Number of utterances flushed. |
540 | */ |
541 | public function flushUtterancesByPage( |
542 | ?string $consumerUrl, |
543 | int $pageId |
544 | ): int { |
545 | $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl ); |
546 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
547 | $results = $dbw->select( self::UTTERANCE_TABLE, |
548 | [ 'wsu_utterance_id' ], |
549 | [ |
550 | 'wsu_remote_wiki_hash' => $remoteWikiHash, |
551 | 'wsu_page_id' => $pageId |
552 | ], |
553 | __METHOD__ |
554 | ); |
555 | return $this->flushUtterances( $dbw, $results ); |
556 | } |
557 | |
558 | /** |
559 | * Clears database and file backend of all utterances for a given language and voice. |
560 | * If no voice is set, then all voices will be removed. |
561 | * |
562 | * @since 0.1.5 |
563 | * @param string $language ISO 639. |
564 | * @param string|null $voice Optional name of synthesis voice to limit flush to. |
565 | * @return int Number of utterances flushed. |
566 | */ |
567 | public function flushUtterancesByLanguageAndVoice( $language, $voice = null ) { |
568 | $conditions = [ |
569 | 'wsu_lang' => $language |
570 | ]; |
571 | if ( $voice != null ) { |
572 | $conditions['wsu_voice'] = $voice; |
573 | } |
574 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
575 | $results = $dbw->select( self::UTTERANCE_TABLE, |
576 | [ 'wsu_utterance_id' ], $conditions, __METHOD__ |
577 | ); |
578 | return $this->flushUtterances( $dbw, $results ); |
579 | } |
580 | |
581 | /** |
582 | * Flushes utterances listed in a result set containing |
583 | * at least the wsu_utterance_id column. |
584 | * |
585 | * In order for return value to increase, the utterance must have been |
586 | * successfully deleted in all layers, i.e. utterance metadata database row, |
587 | * utterance audio and synthesis metadata from file store. |
588 | * E.g. if the utterance audio file is missing and thus not explicitly removed, |
589 | * but at the same time we managed to remove the utterance metadata from database |
590 | * and also removed the synthesis metadata file, this will not count as a |
591 | * successfully removed utterance. It would however be removed from all layers |
592 | * and it would also cause an out-of-sync warning in the log. |
593 | * |
594 | * @note Consider if database should be flushing within a transaction. |
595 | * |
596 | * @since 0.1.5 |
597 | * @param IDatabase $dbw Writable database connection. |
598 | * @param IResultWrapper $results Result set. |
599 | * @return int Number of utterances that were successfully flushed in all layers. |
600 | */ |
601 | private function flushUtterances( $dbw, $results ) { |
602 | if ( !$results ) { |
603 | return 0; |
604 | } |
605 | |
606 | // TTL is set when creating files in Swift, so no need to invoke any delete I/O operations. |
607 | $flushInFileBackend = !( |
608 | $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() && |
609 | $this->getFileBackend() instanceof SwiftFileBackend |
610 | ); |
611 | |
612 | $successfullyFlushedCounter = 0; |
613 | foreach ( $results as $row ) { |
614 | $utteranceId = $row->wsu_utterance_id; |
615 | |
616 | // 1. delete in database |
617 | $successfullyDeletedTableRow = $dbw->delete( |
618 | self::UTTERANCE_TABLE, |
619 | [ 'wsu_utterance_id' => $utteranceId ], |
620 | __METHOD__ |
621 | ); |
622 | if ( !$successfullyDeletedTableRow ) { |
623 | $this->logger->warning( __METHOD__ . ': ' . |
624 | 'Failed to delete utterance {utteranceId} from database.', [ |
625 | 'utteranceId' => $utteranceId |
626 | ] ); |
627 | } else { |
628 | $this->logger->debug( __METHOD__ . ': ' . |
629 | 'Flushed out utterance with id {utteranceId} from database', [ |
630 | 'utteranceId' => $utteranceId |
631 | ] ); |
632 | } |
633 | |
634 | // 2. delete in file store. |
635 | if ( $flushInFileBackend ) { |
636 | $successfullyDeletedAudioFile = $this->deleteFileBackendFile( |
637 | $this->audioUrlFactory( $utteranceId ), |
638 | $utteranceId, |
639 | 'audio file' |
640 | ); |
641 | $successfullyDeletedSynthesisMetadataFile = $this->deleteFileBackendFile( |
642 | $this->synthesisMetadataUrlFactory( $utteranceId ), |
643 | $utteranceId, |
644 | 'synthesis metadata file' |
645 | ); |
646 | $successfullyDeletedFiles = |
647 | $successfullyDeletedAudioFile && $successfullyDeletedSynthesisMetadataFile; |
648 | } else { |
649 | // The files were marked for automatic deletion using TTL in the Swift create operation. |
650 | $successfullyDeletedFiles = true; |
651 | } |
652 | |
653 | if ( $successfullyDeletedTableRow && $successfullyDeletedFiles ) { |
654 | $successfullyFlushedCounter++; |
655 | } |
656 | } |
657 | |
658 | return $successfullyFlushedCounter; |
659 | } |
660 | |
661 | /** |
662 | * @since 0.1.5 |
663 | * @param string $src |
664 | * @param int $utteranceId |
665 | * @param string $type |
666 | * @return bool If successfully deleted |
667 | */ |
668 | private function deleteFileBackendFile( $src, $utteranceId, $type ) { |
669 | $synthesisMetadataFile = [ |
670 | 'src' => $src |
671 | ]; |
672 | if ( $this->getFileBackend()->fileExists( $synthesisMetadataFile ) ) { |
673 | if ( !$this->getFileBackend()->delete( $synthesisMetadataFile )->isOK() ) { |
674 | $this->logger->warning( __METHOD__ . ': ' . |
675 | 'Unable to delete {type} for utterance with identity {utteranceId}.', [ |
676 | 'utteranceId' => $utteranceId, |
677 | 'type' => $type |
678 | ] ); |
679 | return false; |
680 | } else { |
681 | $this->getFileBackend()->clean( [ 'dir' => $this->urlPathFactory( $utteranceId ) ] ); |
682 | } |
683 | } else { |
684 | $this->logger->warning( __METHOD__ . ': ' . |
685 | 'Attempted to delete non existing {type} for utterance {utteranceId}.', [ |
686 | 'utteranceId' => $utteranceId, |
687 | 'type' => $type |
688 | ] ); |
689 | return false; |
690 | } |
691 | $this->logger->debug( __METHOD__ . ': ' . |
692 | 'Flushed out file {src}', [ 'src' => $src ] ); |
693 | return true; |
694 | } |
695 | |
696 | /** |
697 | * Creates a deterministic path based on utterance identity, |
698 | * causing no more than 1000 files and 10 subdirectories per directory. |
699 | * (Actually, 2000 files, as we store both .json and .opus) |
700 | * |
701 | * Overloading a directory with files often cause performance problems. |
702 | * |
703 | * 1 -> / |
704 | * 12 -> / |
705 | * 123 -> / |
706 | * 1234 -> /1/ |
707 | * 12345 -> /1/2/ |
708 | * 123456 -> /1/2/3/ |
709 | * 1234567 -> /1/2/3/4/ |
710 | * |
711 | * @since 0.1.5 |
712 | * @param int $utteranceId |
713 | * @return string Path |
714 | */ |
715 | private function urlPathFactory( $utteranceId ) { |
716 | $path = '/'; |
717 | $utteranceIdText = strval( $utteranceId ); |
718 | $utteranceIdTextLength = strlen( $utteranceIdText ); |
719 | for ( $index = 0; $index < $utteranceIdTextLength - 3; $index++ ) { |
720 | $path .= substr( $utteranceIdText, $index, 1 ) . '/'; |
721 | } |
722 | return $path; |
723 | } |
724 | |
725 | /** |
726 | * @since 0.1.5 |
727 | * @param int $utteranceId Utterance identity. |
728 | * @return string url used to access object in file store |
729 | */ |
730 | private function audioUrlPrefixFactory( $utteranceId ) { |
731 | return $this->getFileBackend()->getContainerStoragePath( $this->fileBackendContainerName ) |
732 | . $this->urlPathFactory( $utteranceId ) . $utteranceId; |
733 | } |
734 | |
735 | /** |
736 | * @since 0.1.5 |
737 | * @param int $utteranceId Utterance identity. |
738 | * @return string url used to access object in file store |
739 | */ |
740 | private function audioUrlFactory( $utteranceId ) { |
741 | return $this->audioUrlPrefixFactory( $utteranceId ) . '.opus'; |
742 | } |
743 | |
744 | /** |
745 | * @since 0.1.5 |
746 | * @param int $utteranceId Utterance identity. |
747 | * @return string url used to access object in file store |
748 | */ |
749 | private function synthesisMetadataUrlFactory( $utteranceId ) { |
750 | return $this->audioUrlPrefixFactory( $utteranceId ) . '.json'; |
751 | } |
752 | |
753 | /** |
754 | * Removes expired utterance and synthesis metadata from the file backend. |
755 | * |
756 | * @since 0.1.7 |
757 | * @param MWTimestamp|null $expiredTimestamp File timestamp <= to this value is orphaned. |
758 | * Defaults to config value. |
759 | * @return int Number of expired files flushed |
760 | */ |
761 | public function flushUtterancesByExpirationDateOnFile( $expiredTimestamp = null ) { |
762 | // @note Either this method, or the job, |
763 | // should probably call `flushUtterancesByExpirationDate` |
764 | // to ensure we are not deleting a bunch of files |
765 | // which were scheduled to be deleted together with their db-entries anyway. |
766 | |
767 | if ( !$expiredTimestamp ) { |
768 | $expiredTimestamp = $this->getWikispeechUtteranceExpirationTimestamp(); |
769 | } |
770 | $fileBackend = $this->getFileBackend(); |
771 | return $this->recurseFlushUtterancesByExpirationDateOnFile( |
772 | $fileBackend, |
773 | $this->getFileBackend() |
774 | ->getContainerStoragePath( $this->fileBackendContainerName ), |
775 | $expiredTimestamp |
776 | ); |
777 | } |
778 | |
779 | /** |
780 | * @since 0.1.7 |
781 | * @param FileBackend $fileBackend |
782 | * @param string $directory |
783 | * @param MWTimestamp $expiredTimestamp |
784 | * @return int Number of expired files flushed |
785 | */ |
786 | private function recurseFlushUtterancesByExpirationDateOnFile( |
787 | $fileBackend, |
788 | $directory, |
789 | $expiredTimestamp |
790 | ) { |
791 | $this->logger->debug( __METHOD__ . ': ' . |
792 | 'Processing directory {directory}', [ 'directory' => $directory ] ); |
793 | $removedFilesCounter = 0; |
794 | $subdirectories = $fileBackend->getDirectoryList( [ |
795 | 'dir' => $directory, |
796 | 'topOnly' => true, |
797 | ] ); |
798 | if ( $subdirectories ) { |
799 | foreach ( $subdirectories as $subdirectory ) { |
800 | $removedFilesCounter += $this->recurseFlushUtterancesByExpirationDateOnFile( |
801 | $fileBackend, |
802 | $directory . '/' . $subdirectory, |
803 | $expiredTimestamp |
804 | ); |
805 | } |
806 | } |
807 | $files = $fileBackend->getFileList( [ |
808 | 'dir' => $directory, |
809 | 'topOnly' => true, |
810 | 'adviseStat' => false |
811 | ] ); |
812 | if ( $files ) { |
813 | foreach ( $files as $file ) { |
814 | $src = [ 'src' => $directory . '/' . $file ]; |
815 | $timestamp = new MWTimestamp( $fileBackend->getFileTimestamp( $src ) ); |
816 | $this->logger->debug( __METHOD__ . ': ' . |
817 | 'Processing file {src} with timestamp {timestamp}', [ |
818 | 'src' => $file, |
819 | 'timestamp' => $timestamp, |
820 | 'expiredTimestamp' => $expiredTimestamp |
821 | ] ); |
822 | if ( $timestamp <= $expiredTimestamp ) { |
823 | if ( $fileBackend->delete( $src )->isOK() ) { |
824 | $removedFilesCounter++; |
825 | $this->logger->debug( __METHOD__ . ': ' . |
826 | 'Deleted expired file {file} #{num}', [ |
827 | 'file' => $file, |
828 | 'num' => $removedFilesCounter |
829 | ] |
830 | ); |
831 | } else { |
832 | $this->logger->warning( __METHOD__ . ': ' . |
833 | 'Unable to delete expired file {file}', |
834 | [ 'file' => $file ] |
835 | ); |
836 | } |
837 | } |
838 | unset( $timestamp ); |
839 | } |
840 | } |
841 | $this->getFileBackend()->clean( [ 'dir' => $directory ] ); |
842 | return $removedFilesCounter; |
843 | } |
844 | |
845 | /** |
846 | * Calculates historic timestamp on now-WikispeechUtteranceTimeToLiveDays |
847 | * |
848 | * @return MWTimestamp Utterance parts with timestamp <= this is expired. |
849 | */ |
850 | public function getWikispeechUtteranceExpirationTimestamp(): MWTimestamp { |
851 | return MWTimestamp::getInstance( |
852 | strtotime( '-' . $this->getWikispeechUtteranceTimeToLiveDays() . 'days' ) |
853 | ); |
854 | } |
855 | |
856 | /** |
857 | * @return int Number of days an utterance is to exist before being flushed out. |
858 | */ |
859 | private function getWikispeechUtteranceTimeToLiveDays(): int { |
860 | return intval( $this->config->get( 'WikispeechUtteranceTimeToLiveDays' ) ); |
861 | } |
862 | |
863 | /** |
864 | * @return bool |
865 | */ |
866 | private function isWikispeechUtteranceUseSwiftFileBackendExpiring(): bool { |
867 | return $this->config->get( 'WikispeechUtteranceUseSwiftFileBackendExpiring' ); |
868 | } |
869 | |
870 | /** |
871 | * Used to evaluate hash of gadget consumer URL, |
872 | * the remote wiki where the page is located. |
873 | * |
874 | * Making changes to this function will probably invalidate all existing cached utterances. |
875 | * |
876 | * @since 0.1.9 |
877 | * @param string|null $consumerUrl |
878 | * @return string|null SHA256 message digest |
879 | */ |
880 | public static function evaluateRemoteWikiHash( ?string $consumerUrl ): ?string { |
881 | if ( $consumerUrl === null ) { |
882 | return null; |
883 | } |
884 | $context = hash_init( 'sha256' ); |
885 | $urlParts = parse_url( $consumerUrl ); |
886 | if ( isset( $urlParts['host'] ) ) { |
887 | hash_update( $context, mb_strtolower( $urlParts['host'] ) ); |
888 | } |
889 | if ( isset( $urlParts['port'] ) ) { |
890 | hash_update( $context, strval( $urlParts['port'] ) ); |
891 | } |
892 | if ( isset( $urlParts['path'] ) ) { |
893 | hash_update( $context, $urlParts['path'] ); |
894 | } |
895 | return hash_final( $context ); |
896 | } |
897 | |
898 | } |