Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
79.03% covered (warning)
79.03%
260 / 329
50.00% covered (danger)
50.00%
11 / 22
CRAP
0.00% covered (danger)
0.00%
0 / 1
UtteranceStore
79.03% covered (warning)
79.03%
260 / 329
50.00% covered (danger)
50.00%
11 / 22
93.21
0.00% covered (danger)
0.00%
0 / 1
 __construct
61.54% covered (warning)
61.54%
8 / 13
0.00% covered (danger)
0.00%
0 / 1
2.23
 getFileBackend
8.00% covered (danger)
8.00%
2 / 25
0.00% covered (danger)
0.00%
0 / 1
16.46
 findUtterance
80.00% covered (warning)
80.00%
24 / 30
0.00% covered (danger)
0.00%
0 / 1
5.20
 retrieveUtteranceMetadata
100.00% covered (success)
100.00%
31 / 31
100.00% covered (success)
100.00%
1 / 1
3
 retrieveFileContents
62.50% covered (warning)
62.50%
5 / 8
0.00% covered (danger)
0.00%
0 / 1
2.21
 createUtterance
100.00% covered (success)
100.00%
35 / 35
100.00% covered (success)
100.00%
1 / 1
1
 storeFile
72.22% covered (warning)
72.22%
13 / 18
0.00% covered (danger)
0.00%
0 / 1
5.54
 flushUtterancesByExpirationDate
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByPage
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByLanguageAndVoice
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 flushUtterances
85.00% covered (warning)
85.00%
34 / 40
0.00% covered (danger)
0.00%
0 / 1
9.27
 deleteFileBackendFile
42.86% covered (danger)
42.86%
9 / 21
0.00% covered (danger)
0.00%
0 / 1
4.68
 urlPathFactory
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 audioUrlPrefixFactory
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 audioUrlFactory
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 synthesisMetadataUrlFactory
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByExpirationDateOnFile
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
2.01
 recurseFlushUtterancesByExpirationDateOnFile
91.11% covered (success)
91.11%
41 / 45
0.00% covered (danger)
0.00%
0 / 1
7.03
 getWikispeechUtteranceExpirationTimestamp
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 getWikispeechUtteranceTimeToLiveDays
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 isWikispeechUtteranceUseSwiftFileBackendExpiring
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 evaluateRemoteWikiHash
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3namespace MediaWiki\Wikispeech\Utterance;
4
5/**
6 * @file
7 * @ingroup Extensions
8 * @license GPL-2.0-or-later
9 */
10
11use Config;
12use ExternalStoreException;
13use FileBackend;
14use FSFileBackend;
15use MediaWiki\Logger\LoggerFactory;
16use MediaWiki\MediaWikiServices;
17use MWTimestamp;
18use Psr\Log\LoggerInterface;
19use SwiftFileBackend;
20use WikiMap;
21use Wikimedia\Rdbms\IDatabase;
22use Wikimedia\Rdbms\ILoadBalancer;
23use Wikimedia\Rdbms\IResultWrapper;
24
25/**
26 * Keeps track of utterances in persistent layers.
27 *
28 * Utterance metadata (i.e. segment hash, page id, language, etc) is stored in a database table.
29 * Utterance audio (synthesised voice audio) is stored as an opus file in file backend.
30 * Synthesis metadata (tokens, etc) is stored as a JSON file in file backend.
31 *
32 * (.opus and .json suffixes are added in the file backend store, although this class is agnostic
33 * regarding the actual data encoding and formats.)
34 *
35 * @since 0.1.5
36 */
37class UtteranceStore {
38
39    /** @var string Name of database table that keeps track of utterance metadata. */
40    public const UTTERANCE_TABLE = 'wikispeech_utterance';
41
42    /** @var LoggerInterface */
43    private $logger;
44
45    /**
46     * Don't use this directly, access @see getFileBackend
47     * @var FileBackend Used to store utterance audio and synthesis metadata.
48     */
49    private $fileBackend;
50
51    /**
52     * @var ILoadBalancer
53     */
54    private $dbLoadBalancer;
55
56    /** @var string Name of container (sort of path prefix) used for files in backend. */
57    private $fileBackendContainerName;
58
59    /** @var Config */
60    private $config;
61
62    public function __construct() {
63        $this->logger = LoggerFactory::getInstance( 'Wikispeech' );
64
65        // @todo don't create, add as constructor parameter
66        // Refer to https://phabricator.wikimedia.org/T264165
67        $this->config = MediaWikiServices::getInstance()
68            ->getConfigFactory()
69            ->makeConfig( 'wikispeech' );
70
71        $this->fileBackendContainerName = $this->config
72            ->get( 'WikispeechUtteranceFileBackendContainerName' );
73        if ( !$this->fileBackendContainerName ) {
74            $this->fileBackendContainerName = "wikispeech-utterances";
75            $this->logger->info( __METHOD__ . ': ' .
76                'Falling back on container name {containerName}', [
77                    'containerName' => $this->fileBackendContainerName
78            ] );
79        }
80
81        $this->dbLoadBalancer = MediaWikiServices::getInstance()->getDBLoadBalancer();
82    }
83
84    /**
85     * @since 0.1.5
86     * @return FileBackend
87     * @throws ExternalStoreException If defined file backend group does not exists.
88     */
89    private function getFileBackend() {
90        global $wgUploadDirectory;
91        if ( !$this->fileBackend ) {
92
93            /** @var string Name of file backend group in LocalSettings.php to use. */
94            $fileBackendName = $this->config->get( 'WikispeechUtteranceFileBackendName' );
95            if ( !$fileBackendName ) {
96                $fileBackendName = 'wikispeech-backend';
97                $fallbackDir = "$wgUploadDirectory/wikispeech_utterances";
98                $this->logger->info( __METHOD__ . ': ' .
99                    'No file backend defined in LocalSettings.php. Falling back ' .
100                    'on FS storage backend named {name} in {dir}.', [
101                        'name' => $fileBackendName,
102                        'dir' => $fallbackDir
103                ] );
104                $this->fileBackend = new FSFileBackend( [
105                    'name' => $fileBackendName,
106                    'wikiId' => WikiMap::getCurrentWikiId(),
107                    'basePath' => $fallbackDir
108                ] );
109            } else {
110                $fileBackend = MediaWikiServices::getInstance()
111                    ->getFileBackendGroup()
112                    ->get( $fileBackendName );
113                if ( $fileBackend ) {
114                    $this->fileBackend = $fileBackend;
115                } else {
116                    throw new ExternalStoreException(
117                        "No file backend group in LocalSettings.php named $fileBackendName."
118                    );
119                }
120            }
121        }
122        return $this->fileBackend;
123    }
124
125    /**
126     * Retrieves an utterance for a given segment in a page, using a specific
127     * voice and language.
128     *
129     * @since 0.1.5
130     * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
131     * @param int $pageId Mediawiki page ID.
132     * @param string $language ISO-639.
133     * @param string $voice Name of synthesis voice.
134     * @param string $segmentHash Hash of segment representing utterance.
135     * @param bool $omitAudio If true, then no audio is returned.
136     * @return Utterance|null Utterance found, or null if non-existing.
137     */
138    public function findUtterance(
139        ?string $consumerUrl,
140        int $pageId,
141        string $language,
142        string $voice,
143        string $segmentHash,
144        bool $omitAudio = false
145    ): ?Utterance {
146        $utterance = $this->retrieveUtteranceMetadata(
147            $consumerUrl,
148            $pageId,
149            $language,
150            $voice,
151            $segmentHash
152        );
153        if ( !$utterance ) {
154            return null;
155        }
156
157        // load utterance audio and synthesis metadata
158        $utteranceId = $utterance->getUtteranceId();
159
160        // @note We might want to keep this as separate function calls,
161        // allowing the user to request when needed, and perhaps
162        // pass a stream straight down from file backend to user
163        // rather than bouncing it via RAM.
164        // Not sure if this is an existing thing in PHP though.
165
166        if ( !$omitAudio ) {
167            $audioSrc = $this->audioUrlFactory( $utteranceId );
168            try {
169                $utterance->setAudio( $this->retrieveFileContents(
170                    $audioSrc,
171                    $utteranceId,
172                    'audio file'
173                ) );
174            } catch ( ExternalStoreException $e ) {
175                $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() );
176                return null;
177            }
178        }
179
180        $synthesisMetadataSrc = $this->synthesisMetadataUrlFactory( $utteranceId );
181        try {
182            $utterance->setSynthesisMetadata( $this->retrieveFileContents(
183                $synthesisMetadataSrc,
184                $utteranceId,
185                'synthesis metadata file'
186            ) );
187        } catch ( ExternalStoreException $e ) {
188            $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() );
189            return null;
190        }
191
192        return $utterance;
193    }
194
195    /**
196     * Retrieves the utterance metadata from the database for a given segment in a page,
197     * using a specific voice and language.
198     *
199     * @since 0.1.5
200     * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
201     * @param int $pageId Mediawiki page ID.
202     * @param string $language ISO-639.
203     * @param string $voice Name of synthesis voice.
204     * @param string $segmentHash Hash of segment representing utterance.
205     * @return Utterance|null Utterance or null if not found in database
206     */
207    public function retrieveUtteranceMetadata(
208        ?string $consumerUrl,
209        int $pageId,
210        string $language,
211        string $voice,
212        string $segmentHash
213    ): ?Utterance {
214        $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
215        $dbr = $this->dbLoadBalancer->getConnection( DB_REPLICA );
216        $row = $dbr->selectRow( self::UTTERANCE_TABLE, [
217            'wsu_utterance_id',
218            'wsu_remote_wiki_hash',
219            'wsu_page_id',
220            'wsu_lang',
221            'wsu_voice',
222            'wsu_seg_hash',
223            'wsu_date_stored'
224        ], [
225            'wsu_remote_wiki_hash' => $remoteWikiHash,
226            'wsu_page_id' => $pageId,
227            'wsu_lang' => $language,
228            'wsu_voice' => $voice,
229            'wsu_seg_hash' => $segmentHash
230        ], __METHOD__, [
231            'ORDER BY date_stored DESC',
232        ] );
233        if ( !$row ) {
234            return null;
235        }
236        $utterance = new Utterance(
237            intval( $row->wsu_utterance_id ),
238            $row->wsu_remote_wiki_hash === null ? null : strval( $row->wsu_remote_wiki_hash ),
239            intval( $row->wsu_page_id ),
240            strval( $row->wsu_lang ),
241            strval( $row->wsu_voice ),
242            strval( $row->wsu_seg_hash ),
243            MWTimestamp::getInstance( $row->wsu_date_stored )
244        );
245        return $utterance;
246    }
247
248    /**
249     * Retrieve the file contents from the backend.
250     *
251     * @since 0.1.5
252     * @param string $src
253     * @param int $utteranceId
254     * @param string $type
255     * @return mixed File contents
256     * @throws ExternalStoreException
257     */
258    public function retrieveFileContents( $src, $utteranceId, $type ) {
259        $content = $this->getFileBackend()->getFileContents( [
260            'src' => $src
261        ] );
262        if ( $content == FileBackend::CONTENT_FAIL ) {
263            // @note Consider queuing job to flush inconsistencies from database.
264            throw new ExternalStoreException(
265                "Inconsistency! Database contains utterance with ID $utteranceId " .
266                "that does not exist as $type named $src in file backend." );
267        }
268        return $content;
269    }
270
271    /**
272     * Creates an utterance in the database.
273     *
274     * @since 0.1.5
275     * @param string|null $consumerUrl
276     * @param int $pageId Mediawiki page ID.
277     * @param string $language ISO 639.
278     * @param string $voice Name of synthesis voice.
279     * @param string $segmentHash Hash of segment representing utterance.
280     * @param string $audio Utterance audio.
281     * @param string $synthesisMetadata JSON form metadata about the audio.
282     * @return Utterance Inserted utterance.
283     * @throws ExternalStoreException If unable to prepare or create files in file backend.
284     */
285    public function createUtterance(
286        ?string $consumerUrl,
287        int $pageId,
288        string $language,
289        string $voice,
290        string $segmentHash,
291        string $audio,
292        string $synthesisMetadata
293    ): Utterance {
294        $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
295        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
296        $rows = [
297            'wsu_remote_wiki_hash' => $remoteWikiHash,
298            'wsu_page_id' => $pageId,
299            'wsu_lang' => $language,
300            'wsu_voice' => $voice,
301            'wsu_seg_hash' => $segmentHash,
302            'wsu_date_stored' => $dbw->timestamp()
303        ];
304        $dbw->insert( self::UTTERANCE_TABLE, $rows, __METHOD__ );
305        $utterance = new Utterance(
306            intval( $dbw->insertId() ),
307            $remoteWikiHash,
308            $pageId,
309            $language,
310            $voice,
311            $segmentHash,
312            MWTimestamp::getInstance( $rows['wsu_date_stored'] )
313        );
314
315        // create audio file
316        $this->storeFile(
317            $this->audioUrlFactory( $utterance->getUtteranceId() ),
318            $audio,
319            'audio file'
320        );
321        $utterance->setAudio( $audio );
322
323        // create synthesis metadata file
324        $this->storeFile(
325            $this->synthesisMetadataUrlFactory( $utterance->getUtteranceId() ),
326            $synthesisMetadata,
327            'synthesis metadata file'
328        );
329        $utterance->setSynthesisMetadata( $synthesisMetadata );
330
331        $jobQueue = new FlushUtterancesFromStoreByExpirationJobQueue();
332        $jobQueue->maybeQueueJob();
333
334        return $utterance;
335    }
336
337    /**
338     * Store a file in the backend.
339     *
340     * @since 0.1.5
341     * @param string $fileUrl
342     * @param mixed $content
343     * @param string $type
344     * @throws ExternalStoreException
345     */
346    public function storeFile( $fileUrl, $content, $type ) {
347        $fileBackend = $this->getFileBackend();
348        if ( !$fileBackend->prepare( [
349            'dir' => dirname( $fileUrl ),
350            'noAccess' => 1,
351            'noListing' => 1
352        ] )->isOK() ) {
353            throw new ExternalStoreException( "Failed to prepare $type$fileUrl." );
354        }
355        $opts = [
356            'dst' => $fileUrl,
357            'content' => $content
358        ];
359        if ( $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
360            $fileBackend instanceof SwiftFileBackend ) {
361            // Mark files in Swift for automatic removal after TTL.
362            // See $this->flushUtterances for code that skips forced removal if backend is Swift.
363            $opts['headers'] = [
364                // number of seconds from now
365                'X-Delete-After' => $this->getWikispeechUtteranceTimeToLiveDays() * 60 * 60 * 24
366            ];
367        }
368        if ( !$fileBackend->create( $opts )->isOK() ) {
369            throw new ExternalStoreException( "Failed to create $type$fileUrl." );
370        }
371    }
372
373    /**
374     * Clears database and file backend of utterances older than a given age.
375     *
376     * @since 0.1.5
377     * @param MWTimestamp $expirationDate
378     * @return int Number of utterances flushed.
379     */
380    public function flushUtterancesByExpirationDate( $expirationDate ) {
381        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
382        $results = $dbw->select( self::UTTERANCE_TABLE,
383            [ 'wsu_utterance_id' ],
384            [ 1 => 'wsu_date_stored <= ' . $expirationDate->getTimestamp( TS_MW ) ],
385            __METHOD__
386        );
387        return $this->flushUtterances( $dbw, $results );
388    }
389
390    /**
391     * Clears database and file backend of all utterances for a given page.
392     *
393     * @since 0.1.5
394     * @param string|null $consumerUrl
395     * @param int $pageId Mediawiki page ID.
396     * @return int Number of utterances flushed.
397     */
398    public function flushUtterancesByPage(
399        ?string $consumerUrl,
400        int $pageId
401    ): int {
402        $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
403        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
404        $results = $dbw->select( self::UTTERANCE_TABLE,
405            [ 'wsu_utterance_id' ],
406            [
407                'wsu_remote_wiki_hash' => $remoteWikiHash,
408                'wsu_page_id' => $pageId
409            ],
410            __METHOD__
411        );
412        return $this->flushUtterances( $dbw, $results );
413    }
414
415    /**
416     * Clears database and file backend of all utterances for a given language and voice.
417     * If no voice is set, then all voices will be removed.
418     *
419     * @since 0.1.5
420     * @param string $language ISO 639.
421     * @param string|null $voice Optional name of synthesis voice to limit flush to.
422     * @return int Number of utterances flushed.
423     */
424    public function flushUtterancesByLanguageAndVoice( $language, $voice = null ) {
425        $conditions = [
426            'wsu_lang' => $language
427        ];
428        if ( $voice != null ) {
429            $conditions['wsu_voice'] = $voice;
430        }
431        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
432        $results = $dbw->select( self::UTTERANCE_TABLE,
433            [ 'wsu_utterance_id' ], $conditions, __METHOD__
434        );
435        return $this->flushUtterances( $dbw, $results );
436    }
437
438    /**
439     * Flushes utterances listed in a result set containing
440     * at least the wsu_utterance_id column.
441     *
442     * In order for return value to increase, the utterance must have been
443     * successfully deleted in all layers, i.e. utterance metadata database row,
444     * utterance audio and synthesis metadata from file store.
445     * E.g. if the utterance audio file is missing and thus not explicitly removed,
446     * but at the same time we managed to remove the utterance metadata from database
447     * and also removed the synthesis metadata file, this will not count as a
448     * successfully removed utterance. It would however be removed from all layers
449     * and it would also cause an out-of-sync warning in the log.
450     *
451     * @note Consider if database should be flushing within a transaction.
452     *
453     * @since 0.1.5
454     * @param IDatabase $dbw Writable database connection.
455     * @param IResultWrapper $results Result set.
456     * @return int Number of utterances that were successfully flushed in all layers.
457     */
458    private function flushUtterances( $dbw, $results ) {
459        if ( !$results ) {
460            return 0;
461        }
462
463        // TTL is set when creating files in Swift, so no need to invoke any delete I/O operations.
464        $flushInFileBackend = !(
465            $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
466            $this->getFileBackend() instanceof SwiftFileBackend
467        );
468
469        $successfullyFlushedCounter = 0;
470        foreach ( $results as $row ) {
471            $utteranceId = $row->wsu_utterance_id;
472
473            // 1. delete in database
474            $successfullyDeletedTableRow = $dbw->delete(
475                self::UTTERANCE_TABLE,
476                [ 'wsu_utterance_id' => $utteranceId ],
477                __METHOD__
478            );
479            if ( !$successfullyDeletedTableRow ) {
480                $this->logger->warning( __METHOD__ . ': ' .
481                    'Failed to delete utterance {utteranceId} from database.', [
482                        'utteranceId' => $utteranceId
483                ] );
484            } else {
485                $this->logger->debug( __METHOD__ . ': ' .
486                    'Flushed out utterance with id {utteranceId} from database', [
487                        'utteranceId' => $utteranceId
488                ] );
489            }
490
491            // 2. delete in file store.
492            if ( $flushInFileBackend ) {
493                $successfullyDeletedAudioFile = $this->deleteFileBackendFile(
494                    $this->audioUrlFactory( $utteranceId ),
495                    $utteranceId,
496                    'audio file'
497                );
498                $successfullyDeletedSynthesisMetadataFile = $this->deleteFileBackendFile(
499                    $this->synthesisMetadataUrlFactory( $utteranceId ),
500                    $utteranceId,
501                    'synthesis metadata file'
502                );
503                $successfullyDeletedFiles =
504                    $successfullyDeletedAudioFile && $successfullyDeletedSynthesisMetadataFile;
505            } else {
506                // The files were marked for automatic deletion using TTL in the Swift create operation.
507                $successfullyDeletedFiles = true;
508            }
509
510            if ( $successfullyDeletedTableRow && $successfullyDeletedFiles ) {
511                $successfullyFlushedCounter++;
512            }
513        }
514
515        return $successfullyFlushedCounter;
516    }
517
518    /**
519     * @since 0.1.5
520     * @param string $src
521     * @param int $utteranceId
522     * @param string $type
523     * @return bool If successfully deleted
524     */
525    private function deleteFileBackendFile( $src, $utteranceId, $type ) {
526        $synthesisMetadataFile = [
527            'src' => $src
528        ];
529        if ( $this->getFileBackend()->fileExists( $synthesisMetadataFile ) ) {
530            if ( !$this->getFileBackend()->delete( $synthesisMetadataFile )->isOK() ) {
531                $this->logger->warning( __METHOD__ . ': ' .
532                    'Unable to delete {type} for utterance with identity {utteranceId}.', [
533                        'utteranceId' => $utteranceId,
534                        'type' => $type
535                ] );
536                return false;
537            } else {
538                $this->getFileBackend()->clean( [ 'dir' => $this->urlPathFactory( $utteranceId ) ] );
539            }
540        } else {
541            $this->logger->warning( __METHOD__ . ': ' .
542                'Attempted to delete non existing {type} for utterance {utteranceId}.', [
543                    'utteranceId' => $utteranceId,
544                    'type' => $type
545            ] );
546            return false;
547        }
548        $this->logger->debug( __METHOD__ . ': ' .
549            'Flushed out file {src}', [ 'src' => $src ] );
550        return true;
551    }
552
553    /**
554     * Creates a deterministic path based on utterance identity,
555     * causing no more than 1000 files and 10 subdirectories per directory.
556     * (Actually, 2000 files, as we store both .json and .opus)
557     *
558     * Overloading a directory with files often cause performance problems.
559     *
560     * 1 -> /
561     * 12 -> /
562     * 123 -> /
563     * 1234 -> /1/
564     * 12345 -> /1/2/
565     * 123456 -> /1/2/3/
566     * 1234567 -> /1/2/3/4/
567     *
568     * @since 0.1.5
569     * @param int $utteranceId
570     * @return string Path
571     */
572    private function urlPathFactory( $utteranceId ) {
573        $path = '/';
574        $utteranceIdText = strval( $utteranceId );
575        $utteranceIdTextLength = strlen( $utteranceIdText );
576        for ( $index = 0; $index < $utteranceIdTextLength - 3; $index++ ) {
577            $path .= substr( $utteranceIdText, $index, 1 ) . '/';
578        }
579        return $path;
580    }
581
582    /**
583     * @since 0.1.5
584     * @param int $utteranceId Utterance identity.
585     * @return string url used to access object in file store
586     */
587    private function audioUrlPrefixFactory( $utteranceId ) {
588        return $this->getFileBackend()->getContainerStoragePath( $this->fileBackendContainerName )
589            . $this->urlPathFactory( $utteranceId ) . $utteranceId;
590    }
591
592    /**
593     * @since 0.1.5
594     * @param int $utteranceId Utterance identity.
595     * @return string url used to access object in file store
596     */
597    private function audioUrlFactory( $utteranceId ) {
598        return $this->audioUrlPrefixFactory( $utteranceId ) . '.opus';
599    }
600
601    /**
602     * @since 0.1.5
603     * @param int $utteranceId Utterance identity.
604     * @return string url used to access object in file store
605     */
606    private function synthesisMetadataUrlFactory( $utteranceId ) {
607        return $this->audioUrlPrefixFactory( $utteranceId ) . '.json';
608    }
609
610    /**
611     * Removes expired utterance and synthesis metadata from the file backend.
612     *
613     * @since 0.1.7
614     * @param MWTimestamp|null $expiredTimestamp File timestamp <= to this value is orphaned.
615     *  Defaults to config value.
616     * @return int Number of expired files flushed
617     */
618    public function flushUtterancesByExpirationDateOnFile( $expiredTimestamp = null ) {
619        // @note Either this method, or the job,
620        // should probably call `flushUtterancesByExpirationDate`
621        // to ensure we are not deleting a bunch of files
622        // which were scheduled to be deleted together with their db-entries anyway.
623
624        if ( !$expiredTimestamp ) {
625            $expiredTimestamp = $this->getWikispeechUtteranceExpirationTimestamp();
626        }
627        $fileBackend = $this->getFileBackend();
628        return $this->recurseFlushUtterancesByExpirationDateOnFile(
629            $fileBackend,
630            $this->getFileBackend()
631                ->getContainerStoragePath( $this->fileBackendContainerName ),
632            $expiredTimestamp
633        );
634    }
635
636    /**
637     * @since 0.1.7
638     * @param FileBackend $fileBackend
639     * @param string $directory
640     * @param MWTimestamp $expiredTimestamp
641     * @return int Number of expired files flushed
642     */
643    private function recurseFlushUtterancesByExpirationDateOnFile(
644        $fileBackend,
645        $directory,
646        $expiredTimestamp
647    ) {
648        $this->logger->debug( __METHOD__ . ': ' .
649            'Processing directory {directory}', [ 'directory' => $directory ] );
650        $removedFilesCounter = 0;
651        $subdirectories = $fileBackend->getDirectoryList( [
652            'dir' => $directory,
653            'topOnly' => true,
654        ] );
655        if ( $subdirectories ) {
656            foreach ( $subdirectories as $subdirectory ) {
657                $removedFilesCounter += $this->recurseFlushUtterancesByExpirationDateOnFile(
658                    $fileBackend,
659                    $directory . '/' . $subdirectory,
660                    $expiredTimestamp
661                );
662            }
663        }
664        $files = $fileBackend->getFileList( [
665            'dir' => $directory,
666            'topOnly' => true,
667            'adviseStat' => false
668        ] );
669        if ( $files ) {
670            foreach ( $files as $file ) {
671                $src = [ 'src' => $directory . '/' . $file ];
672                $timestamp = new MWTimestamp( $fileBackend->getFileTimestamp( $src ) );
673                $this->logger->debug( __METHOD__ . ': ' .
674                    'Processing file {src} with timestamp {timestamp}', [
675                    'src' => $file,
676                    'timestamp' => $timestamp,
677                    'expiredTimestamp' => $expiredTimestamp
678                ] );
679                if ( $timestamp <= $expiredTimestamp ) {
680                    if ( $fileBackend->delete( $src )->isOK() ) {
681                        $removedFilesCounter++;
682                        $this->logger->debug( __METHOD__ . ': ' .
683                            'Deleted expired file {file} #{num}', [
684                                'file' => $file,
685                                'num' => $removedFilesCounter
686                            ]
687                        );
688                    } else {
689                        $this->logger->warning( __METHOD__ . ': ' .
690                            'Unable to delete expired file {file}',
691                            [ 'file' => $file ]
692                        );
693                    }
694                }
695                unset( $timestamp );
696            }
697        }
698        $this->getFileBackend()->clean( [ 'dir' => $directory ] );
699        return $removedFilesCounter;
700    }
701
702    /**
703     * Calculates historic timestamp on now-WikispeechUtteranceTimeToLiveDays
704     *
705     * @return MWTimestamp Utterance parts with timestamp <= this is expired.
706     */
707    public function getWikispeechUtteranceExpirationTimestamp(): MWTimestamp {
708        return MWTimestamp::getInstance(
709            strtotime( '-' . $this->getWikispeechUtteranceTimeToLiveDays() . 'days' )
710        );
711    }
712
713    /**
714     * @return int Number of days an utterance is to exist before being flushed out.
715     */
716    private function getWikispeechUtteranceTimeToLiveDays(): int {
717        return intval( $this->config->get( 'WikispeechUtteranceTimeToLiveDays' ) );
718    }
719
720    /**
721     * @return bool
722     */
723    private function isWikispeechUtteranceUseSwiftFileBackendExpiring(): bool {
724        return $this->config->get( 'WikispeechUtteranceUseSwiftFileBackendExpiring' );
725    }
726
727    /**
728     * Used to evaluate hash of gadget consumer URL,
729     * the remote wiki where the page is located.
730     *
731     * Making changes to this function will probably invalidate all existing cached utterances.
732     *
733     * @since 0.1.9
734     * @param string|null $consumerUrl
735     * @return string|null SHA256 message digest
736     */
737    public static function evaluateRemoteWikiHash( ?string $consumerUrl ): ?string {
738        if ( $consumerUrl === null ) {
739            return null;
740        }
741        $context = hash_init( 'sha256' );
742        $urlParts = parse_url( $consumerUrl );
743        if ( isset( $urlParts['host'] ) ) {
744            hash_update( $context, mb_strtolower( $urlParts['host'] ) );
745        }
746        if ( isset( $urlParts['port'] ) ) {
747            hash_update( $context, strval( $urlParts['port'] ) );
748        }
749        if ( isset( $urlParts['path'] ) ) {
750            hash_update( $context, $urlParts['path'] );
751        }
752        return hash_final( $context );
753    }
754
755}