Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
80.80% covered (warning)
80.80%
303 / 375
46.15% covered (danger)
46.15%
12 / 26
CRAP
0.00% covered (danger)
0.00%
0 / 1
UtteranceStore
80.80% covered (warning)
80.80%
303 / 375
46.15% covered (danger)
46.15%
12 / 26
102.70
0.00% covered (danger)
0.00%
0 / 1
 __construct
61.54% covered (warning)
61.54%
8 / 13
0.00% covered (danger)
0.00%
0 / 1
2.23
 getFileBackend
8.00% covered (danger)
8.00%
2 / 25
0.00% covered (danger)
0.00%
0 / 1
16.46
 findUtterance
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
2
 findMessageUtterance
90.91% covered (success)
90.91%
10 / 11
0.00% covered (danger)
0.00%
0 / 1
2.00
 loadUtteranceAudio
71.43% covered (warning)
71.43%
15 / 21
0.00% covered (danger)
0.00%
0 / 1
4.37
 retrieveUtteranceMetadata
97.44% covered (success)
97.44%
38 / 39
0.00% covered (danger)
0.00%
0 / 1
6
 retrieveFileContents
62.50% covered (warning)
62.50%
5 / 8
0.00% covered (danger)
0.00%
0 / 1
2.21
 createUtterance
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
2.00
 createMessageUtterance
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 storeUtterance
100.00% covered (success)
100.00%
37 / 37
100.00% covered (success)
100.00%
1 / 1
1
 storeFile
73.68% covered (warning)
73.68%
14 / 19
0.00% covered (danger)
0.00%
0 / 1
5.46
 flushUtterancesByExpirationDate
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByPage
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByLanguageAndVoice
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 flushUtterances
85.00% covered (warning)
85.00%
34 / 40
0.00% covered (danger)
0.00%
0 / 1
9.27
 deleteFileBackendFile
42.86% covered (danger)
42.86%
9 / 21
0.00% covered (danger)
0.00%
0 / 1
4.68
 urlPathFactory
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 audioUrlPrefixFactory
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 audioUrlFactory
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 synthesisMetadataUrlFactory
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByExpirationDateOnFile
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
2.01
 recurseFlushUtterancesByExpirationDateOnFile
91.11% covered (success)
91.11%
41 / 45
0.00% covered (danger)
0.00%
0 / 1
7.03
 getWikispeechUtteranceExpirationTimestamp
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 getWikispeechUtteranceTimeToLiveDays
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 isWikispeechUtteranceUseSwiftFileBackendExpiring
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 evaluateRemoteWikiHash
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3namespace MediaWiki\Wikispeech\Utterance;
4
5/**
6 * @file
7 * @ingroup Extensions
8 * @license GPL-2.0-or-later
9 */
10
11use Config;
12use ExternalStoreException;
13use FileBackend;
14use FSFileBackend;
15use MediaWiki\Logger\LoggerFactory;
16use MediaWiki\MediaWikiServices;
17use MediaWiki\WikiMap\WikiMap;
18use MWTimestamp;
19use Psr\Log\LoggerInterface;
20use RuntimeException;
21use SwiftFileBackend;
22use Wikimedia\Rdbms\IDatabase;
23use Wikimedia\Rdbms\ILoadBalancer;
24use Wikimedia\Rdbms\IResultWrapper;
25
26/**
27 * Keeps track of utterances in persistent layers.
28 *
29 * Utterance metadata (i.e. segment hash, page id, language, etc) is stored in a database table.
30 * Utterance audio (synthesised voice audio) is stored as an opus file in file backend.
31 * Synthesis metadata (tokens, etc) is stored as a JSON file in file backend.
32 *
33 * (.opus and .json suffixes are added in the file backend store although this class is agnostic
34 * regarding the actual data encoding and formats.)
35 *
36 * @since 0.1.13 Introduces messageKey as parameter for system error messages.
37 * @since 0.1.5
38 */
39class UtteranceStore {
40
41    /** @var string Name of database table that keeps track of utterance metadata. */
42    public const UTTERANCE_TABLE = 'wikispeech_utterance';
43
44    /** @var LoggerInterface */
45    private $logger;
46
47    /**
48     * Don't use this directly, access @see getFileBackend
49     * @var FileBackend Used to store utterance audio and synthesis metadata.
50     */
51    private $fileBackend;
52
53    /**
54     * @var ILoadBalancer
55     */
56    private $dbLoadBalancer;
57
58    /** @var string Name of container (sort of path prefix) used for files in backend. */
59    private $fileBackendContainerName;
60
61    /** @var Config */
62    private $config;
63
/**
 * Wires up the logger, the 'wikispeech' config, the file backend container name
 * and the database load balancer from the global service container.
 *
 * When no container name is configured, 'wikispeech-utterances' is used and
 * the fallback is logged at info level.
 */
public function __construct() {
    $this->logger = LoggerFactory::getInstance( 'Wikispeech' );
    $services = MediaWikiServices::getInstance();

    // @todo don't create, add as constructor parameter
    // Refer to https://phabricator.wikimedia.org/T264165
    $this->config = $services->getConfigFactory()->makeConfig( 'wikispeech' );

    $containerName = $this->config->get( 'WikispeechUtteranceFileBackendContainerName' );
    $this->fileBackendContainerName = $containerName;
    if ( !$containerName ) {
        $this->fileBackendContainerName = "wikispeech-utterances";
        $this->logger->info(
            __METHOD__ . ': Falling back on container name {containerName}',
            [ 'containerName' => $this->fileBackendContainerName ]
        );
    }

    $this->dbLoadBalancer = $services->getDBLoadBalancer();
}
85
/**
 * Lazily resolves the file backend used for utterance audio and synthesis metadata.
 *
 * The backend is looked up once and then cached on the instance. If
 * 'WikispeechUtteranceFileBackendName' is configured, the backend with that name is
 * fetched from the file backend group; otherwise an FS backend named
 * 'wikispeech-backend' rooted at "$wgUploadDirectory/wikispeech_utterances" is created.
 *
 * @since 0.1.5
 * @return FileBackend
 * @throws ExternalStoreException If defined file backend group does not exists.
 */
private function getFileBackend() {
    global $wgUploadDirectory;

    if ( $this->fileBackend ) {
        return $this->fileBackend;
    }

    /** @var string Name of file backend group in LocalSettings.php to use. */
    $backendName = $this->config->get( 'WikispeechUtteranceFileBackendName' );

    if ( $backendName ) {
        $configuredBackend = MediaWikiServices::getInstance()
            ->getFileBackendGroup()
            ->get( $backendName );
        if ( !$configuredBackend ) {
            throw new ExternalStoreException(
                "No file backend group in LocalSettings.php named $backendName."
            );
        }
        $this->fileBackend = $configuredBackend;
        return $this->fileBackend;
    }

    // Nothing configured: fall back on plain file system storage.
    $backendName = 'wikispeech-backend';
    $fallbackDir = "$wgUploadDirectory/wikispeech_utterances";
    $this->logger->info(
        __METHOD__ . ': No file backend defined in LocalSettings.php. Falling back ' .
        'on FS storage backend named {name} in {dir}.',
        [
            'name' => $backendName,
            'dir' => $fallbackDir
        ]
    );
    $this->fileBackend = new FSFileBackend( [
        'name' => $backendName,
        'wikiId' => WikiMap::getCurrentWikiId(),
        'basePath' => $fallbackDir
    ] );

    return $this->fileBackend;
}
126
/**
 * Looks up the utterance of a page segment for a given language and voice,
 * then loads its synthesis metadata and (unless omitted) its audio.
 *
 * @since 0.1.13
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param int $pageId Mediawiki page ID.
 * @param string $language ISO-639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param bool $omitAudio If true, then no audio is returned.
 * @return Utterance|null Utterance found, or null if there is no database row
 *  or the backing files could not be read.
 */
public function findUtterance(
    ?string $consumerUrl,
    int $pageId,
    string $language,
    string $voice,
    string $segmentHash,
    bool $omitAudio = false
): ?Utterance {
    // No message key: this is a lookup by page.
    $metadata = $this->retrieveUtteranceMetadata(
        $consumerUrl, $pageId, null, $language, $voice, $segmentHash
    );

    return $metadata
        ? $this->loadUtteranceAudio( $metadata, $omitAudio )
        : null;
}
162
/**
 * Looks up the utterance of a system (error) message for a given language and voice,
 * then loads its synthesis metadata and (unless omitted) its audio.
 *
 * Message utterances are stored with page ID 0 and identified by their message key.
 *
 * @since 0.1.13
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param string $messageKey Message key for system message.
 * @param string $language ISO-639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param bool $omitAudio If true, then no audio is returned.
 * @return Utterance|null Utterance found, or null if there is no database row
 *  or the backing files could not be read.
 */
public function findMessageUtterance(
    ?string $consumerUrl,
    string $messageKey,
    string $language,
    string $voice,
    string $segmentHash,
    bool $omitAudio = false
) {
    $messagePageId = 0;
    $metadata = $this->retrieveUtteranceMetadata(
        $consumerUrl, $messagePageId, $messageKey, $language, $voice, $segmentHash
    );

    return $metadata
        ? $this->loadUtteranceAudio( $metadata, $omitAudio )
        : null;
}
197
/**
 * Populates an utterance with its audio (optional) and synthesis metadata
 * read from the file backend.
 *
 * A file that cannot be read is logged as a warning and makes the whole
 * load fail with null rather than returning a half-populated result.
 *
 * @since 0.1.13
 * @param Utterance $utterance Utterance whose ID identifies the files; modified in place.
 * @param bool $omitAudio If true, the audio file is not read.
 * @return Utterance|null The same utterance, or null if a file could not be read.
 */
private function loadUtteranceAudio( Utterance $utterance, bool $omitAudio ): ?Utterance {
    $id = $utterance->getUtteranceId();

    // @note We might want to keep this as separate function calls,
    // allowing the user to request when needed, and perhaps
    // pass a stream straight down from file backend to user
    // rather than bouncing it via RAM.
    // Not sure if this is an existing thing in PHP though.

    // Paths are resolved outside the try block on purpose: a failure to
    // resolve the file backend must propagate, not be reported as a missing file.
    $audioSrc = $omitAudio ? null : $this->audioUrlFactory( $id );
    $metadataSrc = $this->synthesisMetadataUrlFactory( $id );

    try {
        if ( $audioSrc !== null ) {
            $audio = $this->retrieveFileContents( $audioSrc, $id, 'audio file' );
            $utterance->setAudio( $audio );
        }
        $metadata = $this->retrieveFileContents(
            $metadataSrc, $id, 'synthesis metadata file'
        );
        $utterance->setSynthesisMetadata( $metadata );
    } catch ( ExternalStoreException $e ) {
        $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() );
        return null;
    }

    return $utterance;
}
240
/**
 * Retrieves the utterance metadata from the database for a given segment in a page,
 * using a specific voice and language.
 *
 * A positive $pageId selects a page utterance. Any other $pageId selects a system
 * message utterance, which is matched on page ID 0 together with $messageKey.
 * If several rows match, the most recently stored one is returned.
 *
 * @since 0.1.13 Optional parameter messageKey
 * @since 0.1.5
 * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
 * @param int $pageId Mediawiki page ID.
 * @param string|null $messageKey Mediawiki message key.
 * @param string $language ISO-639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @return Utterance|null Utterance or null if not found in database
 * @throws RuntimeException If $pageId is not positive and $messageKey is null.
 */
public function retrieveUtteranceMetadata(
    ?string $consumerUrl,
    int $pageId,
    ?string $messageKey,
    string $language,
    string $voice,
    string $segmentHash
): ?Utterance {
    $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
    $dbr = $this->dbLoadBalancer->getConnection( DB_REPLICA );

    $conditions = [
        'wsu_remote_wiki_hash' => $remoteWikiHash,
        'wsu_lang' => $language,
        'wsu_voice' => $voice,
        'wsu_seg_hash' => $segmentHash
    ];

    if ( $pageId > 0 ) {
        $conditions['wsu_page_id'] = $pageId;
    } else {
        if ( $messageKey === null ) {
            throw new RuntimeException( 'If pageId is 0, messageKey must be provided.' );
        }
        $conditions['wsu_message_key'] = $messageKey;
        $conditions['wsu_page_id'] = 0;
    }

    $row = $dbr->selectRow( self::UTTERANCE_TABLE, [
        'wsu_utterance_id',
        'wsu_remote_wiki_hash',
        'wsu_message_key',
        'wsu_page_id',
        'wsu_lang',
        'wsu_voice',
        'wsu_seg_hash',
        'wsu_date_stored'
    ], $conditions, __METHOD__, [
        // The ordering must be passed as a keyed option and name the real column;
        // a bare 'ORDER BY …' list element is not applied to the query.
        'ORDER BY' => 'wsu_date_stored DESC',
    ] );
    if ( !$row ) {
        return null;
    }

    return new Utterance(
        intval( $row->wsu_utterance_id ),
        $row->wsu_remote_wiki_hash === null ? null : strval( $row->wsu_remote_wiki_hash ),
        $row->wsu_message_key ? strval( $row->wsu_message_key ) : null,
        intval( $row->wsu_page_id ),
        strval( $row->wsu_lang ),
        strval( $row->wsu_voice ),
        strval( $row->wsu_seg_hash ),
        MWTimestamp::getInstance( $row->wsu_date_stored )
    );
}
310
/**
 * Retrieve the file contents from the backend.
 *
 * @since 0.1.5
 * @param string $src Storage path of the file to read.
 * @param int $utteranceId Utterance the file belongs to; only used in the error message.
 * @param string $type Human readable file kind; only used in the error message.
 * @return mixed File contents
 * @throws ExternalStoreException If the backend reports that the contents could not be read.
 */
public function retrieveFileContents( $src, $utteranceId, $type ) {
    $content = $this->getFileBackend()->getFileContents( [
        'src' => $src
    ] );
    // Compare strictly: with a loose comparison, legitimately falsy contents
    // such as '' or '0' would be mistaken for the failure sentinel.
    if ( $content === FileBackend::CONTENT_FAIL ) {
        // @note Consider queuing job to flush inconsistencies from database.
        throw new ExternalStoreException(
            "Inconsistency! Database contains utterance with ID $utteranceId " .
            "that does not exist as $type named $src in file backend." );
    }
    return $content;
}
333
/**
 * Creates an utterance for a page segment: inserts its metadata in the
 * database and writes its audio and synthesis metadata to the file backend.
 *
 * @since 0.1.13
 * @param string|null $consumerUrl
 * @param int $pageId Mediawiki page ID. Must not be 0, which is reserved for
 *  system message utterances.
 * @param string $language ISO 639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param string $audio Utterance audio.
 * @param string $synthesisMetadata JSON form metadata about the audio.
 * @return Utterance Inserted utterance.
 * @throws RuntimeException If $pageId is 0.
 * @throws ExternalStoreException If unable to prepare or create files in file backend.
 */
public function createUtterance(
    ?string $consumerUrl,
    int $pageId,
    string $language,
    string $voice,
    string $segmentHash,
    string $audio,
    string $synthesisMetadata
): Utterance {
    if ( $pageId !== 0 ) {
        // Page utterances carry no message key.
        $messageKey = null;
        return $this->storeUtterance(
            $consumerUrl, $pageId, $messageKey, $language, $voice,
            $segmentHash, $audio, $synthesisMetadata
        );
    }

    throw new RuntimeException( 'Page ID must not be 0 when creating regular utterance.' );
}
372
/**
 * Creates an utterance for a system (error) message: inserts its metadata in
 * the database under page ID 0 and writes its files to the file backend.
 *
 * NOTE(review): $messageKey is nullable here, but retrieveUtteranceMetadata()
 * refuses a null key when the page ID is 0, so an utterance stored with a null
 * key cannot be looked up again through that method — confirm whether null
 * should be rejected.
 *
 * @since 0.1.13
 * @param string|null $consumerUrl
 * @param string|null $messageKey Mediawiki message key.
 * @param string $language ISO 639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param string $audio Utterance audio.
 * @param string $synthesisMetadata JSON form metadata about the audio.
 * @return Utterance Inserted utterance.
 * @throws ExternalStoreException If unable to prepare or create files in file backend.
 */
public function createMessageUtterance(
    ?string $consumerUrl,
    ?string $messageKey,
    string $language,
    string $voice,
    string $segmentHash,
    string $audio,
    string $synthesisMetadata
) {
    // Message utterances are not tied to a page.
    $messagePageId = 0;

    return $this->storeUtterance(
        $consumerUrl, $messagePageId, $messageKey, $language, $voice,
        $segmentHash, $audio, $synthesisMetadata
    );
}
407
/**
 * Persists an utterance in all layers.
 *
 * Inserts the metadata row in the primary database, writes the audio and the
 * synthesis metadata files under paths derived from the new utterance ID, and
 * finally gives the expiration flush job queue a chance to schedule a job.
 *
 * @since 0.1.13
 * @param string|null $consumerUrl
 * @param int $pageId Mediawiki page ID.
 * @param string|null $messageKey Mediawiki message key.
 * @param string $language ISO 639.
 * @param string $voice Name of synthesis voice.
 * @param string $segmentHash Hash of segment representing utterance.
 * @param string $audio Utterance audio.
 * @param string $synthesisMetadata JSON form metadata about the audio.
 * @return Utterance Inserted utterance.
 * @throws ExternalStoreException If unable to prepare or create files in file backend.
 */
private function storeUtterance(
    ?string $consumerUrl,
    int $pageId,
    ?string $messageKey,
    string $language,
    string $voice,
    string $segmentHash,
    string $audio,
    string $synthesisMetadata
) {
    $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
    $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
    $dateStored = $dbw->timestamp();

    $dbw->insert(
        self::UTTERANCE_TABLE,
        [
            'wsu_remote_wiki_hash' => $remoteWikiHash,
            'wsu_page_id' => $pageId,
            'wsu_message_key' => $messageKey,
            'wsu_lang' => $language,
            'wsu_voice' => $voice,
            'wsu_seg_hash' => $segmentHash,
            'wsu_date_stored' => $dateStored
        ],
        __METHOD__
    );

    $utterance = new Utterance(
        intval( $dbw->insertId() ),
        $remoteWikiHash,
        $messageKey,
        $pageId,
        $language,
        $voice,
        $segmentHash,
        MWTimestamp::getInstance( $dateStored )
    );

    // Audio first, then synthesis metadata; each is attached to the
    // utterance only after its file was written.
    $audioUrl = $this->audioUrlFactory( $utterance->getUtteranceId() );
    $this->storeFile( $audioUrl, $audio, 'audio file' );
    $utterance->setAudio( $audio );

    $metadataUrl = $this->synthesisMetadataUrlFactory( $utterance->getUtteranceId() );
    $this->storeFile( $metadataUrl, $synthesisMetadata, 'synthesis metadata file' );
    $utterance->setSynthesisMetadata( $synthesisMetadata );

    ( new FlushUtterancesFromStoreByExpirationJobQueue() )->maybeQueueJob();

    return $utterance;
}
477
/**
 * Store a file in the backend.
 *
 * The parent directory is prepared (no access, no listing) and the file is
 * created, overwriting any existing file at the same path.
 *
 * @since 0.1.5
 * @param string $fileUrl Storage path of the file to create.
 * @param mixed $content File contents.
 * @param string $type Human readable file kind; only used in error messages.
 * @throws ExternalStoreException If the directory cannot be prepared or the
 *  file cannot be created.
 */
public function storeFile( $fileUrl, $content, $type ) {
    $fileBackend = $this->getFileBackend();

    if ( !$fileBackend->prepare( [
        'dir' => dirname( $fileUrl ),
        'noAccess' => 1,
        'noListing' => 1
    ] )->isOK() ) {
        throw new ExternalStoreException( "Failed to prepare $type $fileUrl." );
    }
    $opts = [
        'dst' => $fileUrl,
        'content' => $content,
        'overwrite' => true
    ];
    if ( $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
        $fileBackend instanceof SwiftFileBackend ) {
        // Mark files in Swift for automatic removal after TTL.
        // See $this->flushUtterances for code that skips forced removal if backend is Swift.
        $opts['headers'] = [
            // number of seconds from now
            'X-Delete-After' => $this->getWikispeechUtteranceTimeToLiveDays() * 60 * 60 * 24
        ];
    }
    if ( !$fileBackend->create( $opts )->isOK() ) {
        throw new ExternalStoreException( "Failed to create $type $fileUrl." );
    }
}
515
/**
 * Clears database and file backend of utterances older than a given age.
 *
 * @since 0.1.5
 * @param MWTimestamp $expirationDate Utterances stored at or before this
 *  point in time are flushed.
 * @return int Number of utterances flushed.
 */
public function flushUtterancesByExpirationDate( $expirationDate ) {
    $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
    // Convert to the database's own timestamp format (the same conversion used
    // when the row was stored) and quote it, rather than concatenating a raw
    // value into the SQL condition.
    $cutoff = $dbw->timestamp( $expirationDate->getTimestamp( TS_MW ) );
    $results = $dbw->select( self::UTTERANCE_TABLE,
        [ 'wsu_utterance_id' ],
        [ 'wsu_date_stored <= ' . $dbw->addQuotes( $cutoff ) ],
        __METHOD__
    );
    return $this->flushUtterances( $dbw, $results );
}
532
/**
 * Clears database and file backend of all utterances for a given page.
 *
 * @since 0.1.5
 * @param string|null $consumerUrl
 * @param int $pageId Mediawiki page ID.
 * @return int Number of utterances flushed.
 */
public function flushUtterancesByPage(
    ?string $consumerUrl,
    int $pageId
): int {
    $pageConditions = [
        'wsu_remote_wiki_hash' => self::evaluateRemoteWikiHash( $consumerUrl ),
        'wsu_page_id' => $pageId
    ];
    $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
    $matches = $dbw->select(
        self::UTTERANCE_TABLE,
        [ 'wsu_utterance_id' ],
        $pageConditions,
        __METHOD__
    );

    return $this->flushUtterances( $dbw, $matches );
}
557
/**
 * Clears database and file backend of all utterances for a given language and voice.
 * If no voice is set, then utterances of all voices of the language are removed.
 *
 * @since 0.1.5
 * @param string $language ISO 639.
 * @param string|null $voice Optional name of synthesis voice to limit flush to.
 * @return int Number of utterances flushed.
 */
public function flushUtterancesByLanguageAndVoice( $language, $voice = null ) {
    $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();

    $filter = [ 'wsu_lang' => $language ];
    // Loose comparison is deliberate here: it is what decides whether a voice "is set".
    if ( $voice != null ) {
        $filter['wsu_voice'] = $voice;
    }

    $matches = $dbw->select(
        self::UTTERANCE_TABLE,
        [ 'wsu_utterance_id' ],
        $filter,
        __METHOD__
    );

    return $this->flushUtterances( $dbw, $matches );
}
580
/**
 * Flushes utterances listed in a result set containing
 * at least the wsu_utterance_id column.
 *
 * An utterance only counts towards the return value when it was successfully
 * deleted in all layers: the metadata row in the database, the audio file and
 * the synthesis metadata file. Deletion is still attempted in every layer
 * even if an earlier one failed, so e.g. a missing audio file does not stop
 * the database row and the synthesis metadata file from being removed; it
 * just means the utterance is not counted, and a warning is logged.
 *
 * @note Consider if database should be flushing within a transaction.
 *
 * @since 0.1.5
 * @param IDatabase $dbw Writable database connection.
 * @param IResultWrapper $results Result set.
 * @return int Number of utterances that were successfully flushed in all layers.
 */
private function flushUtterances( $dbw, $results ) {
    if ( !$results ) {
        return 0;
    }

    // TTL is set when creating files in Swift, so no need to invoke any delete I/O operations.
    $filesExpireInSwift = $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
        $this->getFileBackend() instanceof SwiftFileBackend;

    $flushed = 0;
    foreach ( $results as $row ) {
        $id = $row->wsu_utterance_id;

        // 1. delete in database
        $rowDeleted = $dbw->delete(
            self::UTTERANCE_TABLE,
            [ 'wsu_utterance_id' => $id ],
            __METHOD__
        );
        if ( $rowDeleted ) {
            $this->logger->debug(
                __METHOD__ . ': Flushed out utterance with id {utteranceId} from database',
                [ 'utteranceId' => $id ]
            );
        } else {
            $this->logger->warning(
                __METHOD__ . ': Failed to delete utterance {utteranceId} from database.',
                [ 'utteranceId' => $id ]
            );
        }

        // 2. delete in file store.
        // When Swift expires the files by itself they count as deleted.
        $filesDeleted = true;
        if ( !$filesExpireInSwift ) {
            // Both deletions are always attempted; neither is short-circuited.
            $audioDeleted = $this->deleteFileBackendFile(
                $this->audioUrlFactory( $id ), $id, 'audio file'
            );
            $metadataDeleted = $this->deleteFileBackendFile(
                $this->synthesisMetadataUrlFactory( $id ), $id, 'synthesis metadata file'
            );
            $filesDeleted = $audioDeleted && $metadataDeleted;
        }

        if ( $rowDeleted && $filesDeleted ) {
            $flushed++;
        }
    }

    return $flushed;
}
660
/**
 * Deletes a single file from the file backend and then asks the backend to
 * clean up the directory that contained it.
 *
 * A missing file and a failed deletion are both logged as warnings and
 * reported as failure. The outcome of the directory clean-up is not checked.
 *
 * @since 0.1.5
 * @param string $src Storage path of the file to delete.
 * @param int $utteranceId Utterance the file belongs to; only used for logging.
 * @param string $type Human readable file kind; only used for logging.
 * @return bool If successfully deleted
 */
private function deleteFileBackendFile( $src, $utteranceId, $type ) {
    $fileBackend = $this->getFileBackend();
    $file = [
        'src' => $src
    ];

    if ( !$fileBackend->fileExists( $file ) ) {
        $this->logger->warning( __METHOD__ . ': ' .
            'Attempted to delete non existing {type} for utterance {utteranceId}.', [
                'utteranceId' => $utteranceId,
                'type' => $type
            ] );
        return false;
    }

    if ( !$fileBackend->delete( $file )->isOK() ) {
        $this->logger->warning( __METHOD__ . ': ' .
            'Unable to delete {type} for utterance with identity {utteranceId}.', [
                'utteranceId' => $utteranceId,
                'type' => $type
            ] );
        return false;
    }

    // Clean the directory the file actually lived in. This is the same
    // dirname() that storeFile() prepares; the bare shard path from
    // urlPathFactory() lacks the container storage prefix.
    $fileBackend->clean( [ 'dir' => dirname( $src ) ] );

    $this->logger->debug( __METHOD__ . ': ' .
        'Flushed out file {src}', [ 'src' => $src ] );
    return true;
}
695
/**
 * Creates a deterministic path based on utterance identity,
 * causing no more than 1000 files and 10 subdirectories per directory.
 * (Actually, 2000 files, as we store both .json and .opus)
 *
 * Overloading a directory with files often cause performance problems.
 *
 * Every character of the identity except the last three becomes one
 * directory level:
 *
 * 1 -> /
 * 12 -> /
 * 123 -> /
 * 1234 -> /1/
 * 12345 -> /1/2/
 * 123456 -> /1/2/3/
 * 1234567 -> /1/2/3/4/
 *
 * @since 0.1.5
 * @param int $utteranceId
 * @return string Path
 */
private function urlPathFactory( $utteranceId ) {
    $idText = strval( $utteranceId );
    // Everything but the three trailing characters; empty for short identities.
    $shardChars = substr( $idText, 0, max( 0, strlen( $idText ) - 3 ) );
    if ( $shardChars === '' ) {
        return '/';
    }
    return '/' . implode( '/', str_split( $shardChars ) ) . '/';
}
724
/**
 * Builds the extension-less storage path shared by all files of an utterance:
 * container storage path, followed by the sharded directory path, followed by
 * the utterance identity.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function audioUrlPrefixFactory( $utteranceId ) {
    $containerPath = $this->getFileBackend()
        ->getContainerStoragePath( $this->fileBackendContainerName );
    $shardPath = $this->urlPathFactory( $utteranceId );

    return $containerPath . $shardPath . $utteranceId;
}
734
/**
 * Builds the storage path of the audio (.opus) file of an utterance.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function audioUrlFactory( $utteranceId ) {
    $prefix = $this->audioUrlPrefixFactory( $utteranceId );
    return $prefix . '.opus';
}
743
/**
 * Builds the storage path of the synthesis metadata (.json) file of an utterance.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function synthesisMetadataUrlFactory( $utteranceId ) {
    $prefix = $this->audioUrlPrefixFactory( $utteranceId );
    return $prefix . '.json';
}
752
/**
 * Removes expired utterance and synthesis metadata from the file backend,
 * walking the utterance container from its root.
 *
 * @since 0.1.7
 * @param MWTimestamp|null $expiredTimestamp File timestamp <= to this value is orphaned.
 *  Defaults to config value.
 * @return int Number of expired files flushed
 */
public function flushUtterancesByExpirationDateOnFile( $expiredTimestamp = null ) {
    // @note Either this method, or the job,
    // should probably call `flushUtterancesByExpirationDate`
    // to ensure we are not deleting a bunch of files
    // which were scheduled to be deleted together with their db-entries anyway.

    $cutoff = $expiredTimestamp ?: $this->getWikispeechUtteranceExpirationTimestamp();

    $fileBackend = $this->getFileBackend();
    $containerRoot = $fileBackend->getContainerStoragePath( $this->fileBackendContainerName );

    return $this->recurseFlushUtterancesByExpirationDateOnFile(
        $fileBackend,
        $containerRoot,
        $cutoff
    );
}
778
/**
 * Deletes expired files in a directory of the file backend and, depth first,
 * in all of its subdirectories. Finally asks the backend to clean the directory.
 *
 * @since 0.1.7
 * @param FileBackend $fileBackend Backend used for all listing, deletion and cleaning.
 * @param string $directory Storage path of the directory to process.
 * @param MWTimestamp $expiredTimestamp File timestamp <= to this value is orphaned.
 * @return int Number of expired files flushed
 */
private function recurseFlushUtterancesByExpirationDateOnFile(
    $fileBackend,
    $directory,
    $expiredTimestamp
) {
    $this->logger->debug( __METHOD__ . ': ' .
        'Processing directory {directory}', [ 'directory' => $directory ] );
    $removedFilesCounter = 0;
    // Subdirectories are processed before the files of this directory.
    $subdirectories = $fileBackend->getDirectoryList( [
        'dir' => $directory,
        'topOnly' => true,
    ] );
    if ( $subdirectories ) {
        foreach ( $subdirectories as $subdirectory ) {
            $removedFilesCounter += $this->recurseFlushUtterancesByExpirationDateOnFile(
                $fileBackend,
                $directory . '/' . $subdirectory,
                $expiredTimestamp
            );
        }
    }
    $files = $fileBackend->getFileList( [
        'dir' => $directory,
        'topOnly' => true,
        'adviseStat' => false
    ] );
    if ( $files ) {
        foreach ( $files as $file ) {
            $src = [ 'src' => $directory . '/' . $file ];
            $rawTimestamp = $fileBackend->getFileTimestamp( $src );
            if ( $rawTimestamp === false ) {
                // The backend could not tell the age of the file. MWTimestamp
                // would interpret false as the current time, so report the
                // problem and leave the file alone rather than guessing.
                $this->logger->warning( __METHOD__ . ': ' .
                    'Unable to read timestamp of file {file}',
                    [ 'file' => $file ]
                );
                continue;
            }
            $timestamp = new MWTimestamp( $rawTimestamp );
            $this->logger->debug( __METHOD__ . ': ' .
                'Processing file {src} with timestamp {timestamp}', [
                    'src' => $file,
                    'timestamp' => $timestamp,
                    'expiredTimestamp' => $expiredTimestamp
                ] );
            if ( $timestamp <= $expiredTimestamp ) {
                if ( $fileBackend->delete( $src )->isOK() ) {
                    $removedFilesCounter++;
                    $this->logger->debug( __METHOD__ . ': ' .
                        'Deleted expired file {file} #{num}', [
                            'file' => $file,
                            'num' => $removedFilesCounter
                        ]
                    );
                } else {
                    $this->logger->warning( __METHOD__ . ': ' .
                        'Unable to delete expired file {file}',
                        [ 'file' => $file ]
                    );
                }
            }
        }
    }
    // Use the backend that was handed in, the same one that listed and
    // deleted the files, instead of resolving a backend a second time.
    $fileBackend->clean( [ 'dir' => $directory ] );
    return $removedFilesCounter;
}
844
/**
 * Calculates the historic timestamp that lies
 * WikispeechUtteranceTimeToLiveDays days before now.
 *
 * @return MWTimestamp Utterance parts with timestamp <= this is expired.
 */
public function getWikispeechUtteranceExpirationTimestamp(): MWTimestamp {
    $timeToLiveDays = $this->getWikispeechUtteranceTimeToLiveDays();
    // Relative date expression such as '-31days', evaluated against current time.
    $expirationUnixTime = strtotime( '-' . $timeToLiveDays . 'days' );
    return MWTimestamp::getInstance( $expirationUnixTime );
}
855
/**
 * Reads the config value WikispeechUtteranceTimeToLiveDays as an integer.
 *
 * @return int Number of days an utterance is to exist before being flushed out.
 */
private function getWikispeechUtteranceTimeToLiveDays(): int {
    $configuredDays = $this->config->get( 'WikispeechUtteranceTimeToLiveDays' );
    return (int)$configuredDays;
}
862
/**
 * Reads the config value WikispeechUtteranceUseSwiftFileBackendExpiring.
 *
 * @return bool The configured value.
 */
private function isWikispeechUtteranceUseSwiftFileBackendExpiring(): bool {
    $useExpiring = $this->config->get( 'WikispeechUtteranceUseSwiftFileBackendExpiring' );
    return $useExpiring;
}
869
/**
 * Used to evaluate hash of gadget consumer URL,
 * the remote wiki where the page is located.
 *
 * The digest covers, in this order and only when present in the URL:
 * the lower cased host, the port and the path. Other URL components,
 * such as scheme and query, do not affect the result.
 *
 * Making changes to this function will probably invalidate all existing cached utterances.
 *
 * @since 0.1.9
 * @param string|null $consumerUrl
 * @return string|null SHA256 message digest, or null if no URL was given.
 */
public static function evaluateRemoteWikiHash( ?string $consumerUrl ): ?string {
    if ( $consumerUrl === null ) {
        return null;
    }
    $urlParts = parse_url( $consumerUrl );
    $digest = hash_init( 'sha256' );
    // URL components fed to the digest, in order, each with the function
    // that prepares its value (null meaning the value is used as is).
    $normalizers = [
        'host' => 'mb_strtolower',
        'port' => 'strval',
        'path' => null,
    ];
    foreach ( $normalizers as $component => $normalizer ) {
        if ( !isset( $urlParts[$component] ) ) {
            continue;
        }
        $value = $urlParts[$component];
        hash_update( $digest, $normalizer === null ? $value : $normalizer( $value ) );
    }
    return hash_final( $digest );
}
897
898}