Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
81.40% covered (warning)
81.40%
315 / 387
48.15% covered (danger)
48.15%
13 / 27
CRAP
0.00% covered (danger)
0.00%
0 / 1
UtteranceStore
81.40% covered (warning)
81.40%
315 / 387
48.15% covered (danger)
48.15%
13 / 27
103.46
0.00% covered (danger)
0.00%
0 / 1
 __construct
61.54% covered (warning)
61.54%
8 / 13
0.00% covered (danger)
0.00%
0 / 1
2.23
 getFileBackend
8.00% covered (danger)
8.00%
2 / 25
0.00% covered (danger)
0.00%
0 / 1
16.46
 findUtterance
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
2
 findMessageUtterance
90.91% covered (success)
90.91%
10 / 11
0.00% covered (danger)
0.00%
0 / 1
2.00
 loadUtteranceAudio
71.43% covered (warning)
71.43%
15 / 21
0.00% covered (danger)
0.00%
0 / 1
4.37
 retrieveUtteranceMetadata
97.44% covered (success)
97.44%
38 / 39
0.00% covered (danger)
0.00%
0 / 1
6
 retrieveFileContents
62.50% covered (warning)
62.50%
5 / 8
0.00% covered (danger)
0.00%
0 / 1
2.21
 createUtterance
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
2.00
 createMessageUtterance
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 storeUtterance
100.00% covered (success)
100.00%
37 / 37
100.00% covered (success)
100.00%
1 / 1
1
 storeFile
73.68% covered (warning)
73.68%
14 / 19
0.00% covered (danger)
0.00%
0 / 1
5.46
 flushUtterancesByExpirationDate
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByPage
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
2
 flushUtterancesByLanguageAndVoice
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
2
 flushMessageUtterances
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterances
85.00% covered (warning)
85.00%
34 / 40
0.00% covered (danger)
0.00%
0 / 1
9.27
 deleteFileBackendFile
42.86% covered (danger)
42.86%
9 / 21
0.00% covered (danger)
0.00%
0 / 1
4.68
 urlPathFactory
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 audioUrlPrefixFactory
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 audioUrlFactory
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 synthesisMetadataUrlFactory
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 flushUtterancesByExpirationDateOnFile
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
2.01
 recurseFlushUtterancesByExpirationDateOnFile
91.11% covered (success)
91.11%
41 / 45
0.00% covered (danger)
0.00%
0 / 1
7.03
 getWikispeechUtteranceExpirationTimestamp
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 getWikispeechUtteranceTimeToLiveDays
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 isWikispeechUtteranceUseSwiftFileBackendExpiring
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 evaluateRemoteWikiHash
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3namespace MediaWiki\Wikispeech\Utterance;
4
5/**
6 * @file
7 * @ingroup Extensions
8 * @license GPL-2.0-or-later
9 */
10
11use Config;
12use ExternalStoreException;
13use FileBackend;
14use FSFileBackend;
15use MediaWiki\Logger\LoggerFactory;
16use MediaWiki\MediaWikiServices;
17use MediaWiki\WikiMap\WikiMap;
18use MWTimestamp;
19use Psr\Log\LoggerInterface;
20use RuntimeException;
21use SwiftFileBackend;
22use Wikimedia\Rdbms\IDatabase;
23use Wikimedia\Rdbms\ILoadBalancer;
24use Wikimedia\Rdbms\IResultWrapper;
25
26/**
27 * Keeps track of utterances in persistent layers.
28 *
29 * Utterance metadata (i.e. segment hash, page id, language, etc) is stored in a database table.
30 * Utterance audio (synthesised voice audio) is stored as an opus file in file backend.
31 * Synthesis metadata (tokens, etc) is stored as a JSON file in file backend.
32 *
33 * (.opus and .json suffixes are added in the file backend store, although this class is agnostic
34 * regarding the actual data encoding and formats.)
35 *
36 * @since 0.1.13 Introduces messageKey as parameter for system error messages.
37 * @since 0.1.5
38 */
39class UtteranceStore {
40
41    /** @var string Name of database table that keeps track of utterance metadata. */
42    public const UTTERANCE_TABLE = 'wikispeech_utterance';
43
44    /** @var LoggerInterface */
45    private $logger;
46
47    /**
48     * Don't use this directly, access @see getFileBackend
49     * @var FileBackend Used to store utterance audio and synthesis metadata.
50     */
51    private $fileBackend;
52
53    /**
54     * @var ILoadBalancer
55     */
56    private $dbLoadBalancer;
57
58    /** @var string Name of container (sort of path prefix) used for files in backend. */
59    private $fileBackendContainerName;
60
61    /** @var Config */
62    private $config;
63
64    public function __construct() {
65        $this->logger = LoggerFactory::getInstance( 'Wikispeech' );
66
67        // @todo don't create, add as constructor parameter
68        // Refer to https://phabricator.wikimedia.org/T264165
69        $this->config = MediaWikiServices::getInstance()
70            ->getConfigFactory()
71            ->makeConfig( 'wikispeech' );
72
73        $this->fileBackendContainerName = $this->config
74            ->get( 'WikispeechUtteranceFileBackendContainerName' );
75        if ( !$this->fileBackendContainerName ) {
76            $this->fileBackendContainerName = "wikispeech-utterances";
77            $this->logger->info( __METHOD__ . ': ' .
78                'Falling back on container name {containerName}', [
79                    'containerName' => $this->fileBackendContainerName
80                ] );
81        }
82
83        $this->dbLoadBalancer = MediaWikiServices::getInstance()->getDBLoadBalancer();
84    }
85
86    /**
87     * @since 0.1.5
88     * @return FileBackend
89     * @throws ExternalStoreException If defined file backend group does not exists.
90     */
91    private function getFileBackend() {
92        global $wgUploadDirectory;
93        if ( !$this->fileBackend ) {
94
95            /** @var string Name of file backend group in LocalSettings.php to use. */
96            $fileBackendName = $this->config->get( 'WikispeechUtteranceFileBackendName' );
97            if ( !$fileBackendName ) {
98                $fileBackendName = 'wikispeech-backend';
99                $fallbackDir = "$wgUploadDirectory/wikispeech_utterances";
100                $this->logger->info( __METHOD__ . ': ' .
101                    'No file backend defined in LocalSettings.php. Falling back ' .
102                    'on FS storage backend named {name} in {dir}.', [
103                        'name' => $fileBackendName,
104                        'dir' => $fallbackDir
105                    ] );
106                $this->fileBackend = new FSFileBackend( [
107                    'name' => $fileBackendName,
108                    'wikiId' => WikiMap::getCurrentWikiId(),
109                    'basePath' => $fallbackDir
110                ] );
111            } else {
112                $fileBackend = MediaWikiServices::getInstance()
113                    ->getFileBackendGroup()
114                    ->get( $fileBackendName );
115                if ( $fileBackend ) {
116                    $this->fileBackend = $fileBackend;
117                } else {
118                    throw new ExternalStoreException(
119                        "No file backend group in LocalSettings.php named $fileBackendName."
120                    );
121                }
122            }
123        }
124        return $this->fileBackend;
125    }
126
127    /**
128     * Retrieves an utterance for a given segment in a page, using a specific
129     * voice and language.
130     *
131     * @since 0.1.13
132     * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
133     * @param int $pageId Mediawiki page ID.
134     * @param string $language ISO-639.
135     * @param string $voice Name of synthesis voice.
136     * @param string $segmentHash Hash of segment representing utterance.
137     * @param bool $omitAudio If true, then no audio is returned.
138     * @return Utterance|null Utterance found, or null if non-existing.
139     */
140    public function findUtterance(
141        ?string $consumerUrl,
142        int $pageId,
143        string $language,
144        string $voice,
145        string $segmentHash,
146        bool $omitAudio = false
147    ): ?Utterance {
148        $utterance = $this->retrieveUtteranceMetadata(
149            $consumerUrl,
150            $pageId,
151            null,
152            $language,
153            $voice,
154            $segmentHash
155        );
156        if ( !$utterance ) {
157            return null;
158        }
159
160        return $this->loadUtteranceAudio( $utterance, $omitAudio );
161    }
162
163    /**
164     * Retrieves an utterance for a specific error message
165     *
166     * @since 0.1.13
167     * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
168     * @param string $messageKey Message key for system message.
169     * @param string $language ISO-639.
170     * @param string $voice Name of synthesis voice.
171     * @param string $segmentHash Hash of segment representing utterance.
172     * @param bool $omitAudio If true, then no audio is returned.
173     * @return Utterance|null Utterance found, or null if non-existing.
174     */
175    public function findMessageUtterance(
176        ?string $consumerUrl,
177        string $messageKey,
178        string $language,
179        string $voice,
180        string $segmentHash,
181        bool $omitAudio = false
182    ) {
183        $utterance = $this->retrieveUtteranceMetadata(
184            $consumerUrl,
185            0,
186            $messageKey,
187            $language,
188            $voice,
189            $segmentHash
190        );
191        if ( !$utterance ) {
192            return null;
193        }
194
195        return $this->loadUtteranceAudio( $utterance, $omitAudio );
196    }
197
198    /**
199     * Loads utterance audio and synthesis metadata
200     *
201     * @since 0.1.13
202     */
203    private function loadUtteranceAudio( Utterance $utterance, bool $omitAudio ): ?Utterance {
204        $utteranceId = $utterance->getUtteranceId();
205
206        // @note We might want to keep this as separate function calls,
207        // allowing the user to request when needed, and perhaps
208        // pass a stream straight down from file backend to user
209        // rather than bouncing it via RAM.
210        // Not sure if this is an existing thing in PHP though.
211
212        if ( !$omitAudio ) {
213            $audioSrc = $this->audioUrlFactory( $utteranceId );
214            try {
215                $utterance->setAudio( $this->retrieveFileContents(
216                    $audioSrc,
217                    $utteranceId,
218                    'audio file'
219                ) );
220            } catch ( ExternalStoreException $e ) {
221                $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() );
222                return null;
223            }
224        }
225
226        $synthesisMetadataSrc = $this->synthesisMetadataUrlFactory( $utteranceId );
227        try {
228            $utterance->setSynthesisMetadata( $this->retrieveFileContents(
229                $synthesisMetadataSrc,
230                $utteranceId,
231                'synthesis metadata file'
232            ) );
233        } catch ( ExternalStoreException $e ) {
234            $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() );
235            return null;
236        }
237
238        return $utterance;
239    }
240
241    /**
242     * Retrieves the utterance metadata from the database for a given segment in a page,
243     * using a specific voice and language.
244     *
245     * @since 0.1.13 Optional parameter messageKey
246     * @since 0.1.5
247     * @param string|null $consumerUrl Remote wiki where page is located, or null if local.
248     * @param int $pageId Mediawiki page ID.
249     * @param string|null $messageKey Mediawiki message key.
250     * @param string $language ISO-639.
251     * @param string $voice Name of synthesis voice.
252     * @param string $segmentHash Hash of segment representing utterance.
253     * @return Utterance|null Utterance or null if not found in database
254     */
255    public function retrieveUtteranceMetadata(
256        ?string $consumerUrl,
257        int $pageId,
258        ?string $messageKey,
259        string $language,
260        string $voice,
261        string $segmentHash
262    ): ?Utterance {
263        $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
264        $dbr = $this->dbLoadBalancer->getConnection( DB_REPLICA );
265
266        $conditions = [
267            'wsu_remote_wiki_hash' => $remoteWikiHash,
268            'wsu_lang' => $language,
269            'wsu_voice' => $voice,
270            'wsu_seg_hash' => $segmentHash
271        ];
272
273        if ( $pageId > 0 ) {
274            $conditions['wsu_page_id'] = $pageId;
275        } else {
276            if ( $messageKey === null ) {
277                throw new RuntimeException( 'If pageId is 0, messageKey must be provided.' );
278            }
279            $conditions['wsu_message_key'] = $messageKey;
280            $conditions['wsu_page_id'] = 0;
281        }
282
283        $row = $dbr->selectRow( self::UTTERANCE_TABLE, [
284            'wsu_utterance_id',
285            'wsu_remote_wiki_hash',
286            'wsu_message_key',
287            'wsu_page_id',
288            'wsu_lang',
289            'wsu_voice',
290            'wsu_seg_hash',
291            'wsu_date_stored'
292        ], $conditions, __METHOD__, [
293            'ORDER BY date_stored DESC',
294        ] );
295        if ( !$row ) {
296            return null;
297        }
298        $utterance = new Utterance(
299            intval( $row->wsu_utterance_id ),
300            $row->wsu_remote_wiki_hash === null ? null : strval( $row->wsu_remote_wiki_hash ),
301            $row->wsu_message_key ? strval( $row->wsu_message_key ) : null,
302            intval( $row->wsu_page_id ),
303            strval( $row->wsu_lang ),
304            strval( $row->wsu_voice ),
305            strval( $row->wsu_seg_hash ),
306            MWTimestamp::getInstance( $row->wsu_date_stored )
307        );
308        return $utterance;
309    }
310
311    /**
312     * Retrieve the file contents from the backend.
313     *
314     * @since 0.1.5
315     * @param string $src
316     * @param int $utteranceId
317     * @param string $type
318     * @return mixed File contents
319     * @throws ExternalStoreException
320     */
321    public function retrieveFileContents( $src, $utteranceId, $type ) {
322        $content = $this->getFileBackend()->getFileContents( [
323            'src' => $src
324        ] );
325        if ( $content == FileBackend::CONTENT_FAIL ) {
326            // @note Consider queuing job to flush inconsistencies from database.
327            throw new ExternalStoreException(
328                "Inconsistency! Database contains utterance with ID $utteranceId " .
329                "that does not exist as $type named $src in file backend." );
330        }
331        return $content;
332    }
333
334    /**
335     * Creates an utterance in the database.
336     *
337     * @since 0.1.13
338     * @param string|null $consumerUrl
339     * @param int $pageId Mediawiki page ID.
340     * @param string $language ISO 639.
341     * @param string $voice Name of synthesis voice.
342     * @param string $segmentHash Hash of segment representing utterance.
343     * @param string $audio Utterance audio.
344     * @param string $synthesisMetadata JSON form metadata about the audio.
345     * @return Utterance Inserted utterance.
346     * @throws ExternalStoreException If unable to prepare or create files in file backend.
347     */
348    public function createUtterance(
349        ?string $consumerUrl,
350        int $pageId,
351        string $language,
352        string $voice,
353        string $segmentHash,
354        string $audio,
355        string $synthesisMetadata
356    ): Utterance {
357        if ( $pageId === 0 ) {
358            throw new RuntimeException( 'Page ID must not be 0 when creating regular utterance.' );
359        }
360
361        return $this->storeUtterance(
362            $consumerUrl,
363            $pageId,
364            null,
365            $language,
366            $voice,
367            $segmentHash,
368            $audio,
369            $synthesisMetadata
370        );
371    }
372
373    /**
374     * Creates a system error utterance in the database and prepares for storing.
375     *
376     * @since 0.1.13
377     * @param string|null $consumerUrl
378     * @param string|null $messageKey Mediawiki message key.
379     * @param string $language ISO 639.
380     * @param string $voice Name of synthesis voice.
381     * @param string $segmentHash Hash of segment representing utterance.
382     * @param string $audio Utterance audio.
383     * @param string $synthesisMetadata JSON form metadata about the audio.
384     * @return Utterance Inserted utterance.
385     * @throws ExternalStoreException If unable to prepare or create files in file backend.
386     */
387    public function createMessageUtterance(
388        ?string $consumerUrl,
389        ?string $messageKey,
390        string $language,
391        string $voice,
392        string $segmentHash,
393        string $audio,
394        string $synthesisMetadata
395    ) {
396        return $this->storeUtterance(
397            $consumerUrl,
398            0,
399            $messageKey,
400            $language,
401            $voice,
402            $segmentHash,
403            $audio,
404            $synthesisMetadata
405        );
406    }
407
408    /**
409     * Stores a created utterance.
410     *
411     * @since 0.1.13
412     * @param string|null $consumerUrl
413     * @param int $pageId Mediawiki page ID.
414     * @param string|null $messageKey Mediawiki message key.
415     * @param string $language ISO 639.
416     * @param string $voice Name of synthesis voice.
417     * @param string $segmentHash Hash of segment representing utterance.
418     * @param string $audio Utterance audio.
419     * @param string $synthesisMetadata JSON form metadata about the audio.
420     * @return Utterance Inserted utterance.
421     * @throws ExternalStoreException If unable to prepare or create files in file backend.
422     */
423    private function storeUtterance(
424        ?string $consumerUrl,
425        int $pageId,
426        ?string $messageKey,
427        string $language,
428        string $voice,
429        string $segmentHash,
430        string $audio,
431        string $synthesisMetadata
432    ) {
433        $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
434        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
435        $rows = [
436            'wsu_remote_wiki_hash' => $remoteWikiHash,
437            'wsu_page_id' => $pageId,
438            'wsu_message_key' => $messageKey,
439            'wsu_lang' => $language,
440            'wsu_voice' => $voice,
441            'wsu_seg_hash' => $segmentHash,
442            'wsu_date_stored' => $dbw->timestamp()
443        ];
444        $dbw->insert( self::UTTERANCE_TABLE, $rows, __METHOD__ );
445        $utterance = new Utterance(
446            intval( $dbw->insertId() ),
447            $remoteWikiHash,
448            $messageKey,
449            $pageId,
450            $language,
451            $voice,
452            $segmentHash,
453            MWTimestamp::getInstance( $rows['wsu_date_stored'] )
454        );
455
456        // create audio file
457        $this->storeFile(
458            $this->audioUrlFactory( $utterance->getUtteranceId() ),
459            $audio,
460            'audio file'
461        );
462        $utterance->setAudio( $audio );
463
464        // create synthesis metadata file
465        $this->storeFile(
466            $this->synthesisMetadataUrlFactory( $utterance->getUtteranceId() ),
467            $synthesisMetadata,
468            'synthesis metadata file'
469        );
470        $utterance->setSynthesisMetadata( $synthesisMetadata );
471
472        $jobQueue = new FlushUtterancesFromStoreByExpirationJobQueue();
473        $jobQueue->maybeQueueJob();
474
475        return $utterance;
476    }
477
478    /**
479     * Store a file in the backend.
480     *
481     * @since 0.1.5
482     * @param string $fileUrl
483     * @param mixed $content
484     * @param string $type
485     * @throws ExternalStoreException
486     */
487    public function storeFile( $fileUrl, $content, $type ) {
488        $fileBackend = $this->getFileBackend();
489
490        if ( !$fileBackend->prepare( [
491            'dir' => dirname( $fileUrl ),
492            'noAccess' => 1,
493            'noListing' => 1
494        ] )->isOK() ) {
495            throw new ExternalStoreException( "Failed to prepare $type$fileUrl." );
496        }
497        $opts = [
498            'dst' => $fileUrl,
499            'content' => $content,
500            'overwrite' => true
501        ];
502        if ( $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
503            $fileBackend instanceof SwiftFileBackend ) {
504            // Mark files in Swift for automatic removal after TTL.
505            // See $this->flushUtterances for code that skips forced removal if backend is Swift.
506            $opts['headers'] = [
507                // number of seconds from now
508                'X-Delete-After' => $this->getWikispeechUtteranceTimeToLiveDays() * 60 * 60 * 24
509            ];
510        }
511        if ( !$fileBackend->create( $opts )->isOK() ) {
512            throw new ExternalStoreException( "Failed to create $type$fileUrl." );
513        }
514    }
515
516    /**
517     * Clears database and file backend of utterances older than a given age.
518     *
519     * @since 0.1.5
520     * @param MWTimestamp $expirationDate
521     * @return int Number of utterances flushed.
522     */
523    public function flushUtterancesByExpirationDate( $expirationDate ) {
524        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
525        $results = $dbw->select( self::UTTERANCE_TABLE,
526            [ 'wsu_utterance_id' ],
527            [ 1 => 'wsu_date_stored <= ' . $expirationDate->getTimestamp( TS_MW ),
528                'wsu_message_key' => null,
529            ],
530            __METHOD__
531        );
532        return $this->flushUtterances( $dbw, $results );
533    }
534
535    /**
536     * Clears database and file backend of all utterances for a given page.
537     *
538     * @since 0.1.5
539     * @param string|null $consumerUrl
540     * @param int $pageId Mediawiki page ID.
541     * @return int Number of utterances flushed.
542     */
543    public function flushUtterancesByPage(
544        ?string $consumerUrl,
545        int $pageId
546    ): int {
547        $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl );
548
549        if ( $consumerUrl ) {
550            $this->logger->info( __METHOD__ . ": Flushing utterances for page $pageId at $consumerUrl" );
551        } else {
552            $this->logger->info( __METHOD__ . ": Flushing utterances for page $pageId" );
553        }
554
555        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
556        $results = $dbw->select( self::UTTERANCE_TABLE,
557            [ 'wsu_utterance_id' ],
558            [
559                'wsu_remote_wiki_hash' => $remoteWikiHash,
560                'wsu_page_id' => $pageId,
561                'wsu_message_key' => null,
562            ],
563            __METHOD__
564        );
565        return $this->flushUtterances( $dbw, $results );
566    }
567
568    /**
569     * Clears database and file backend of all utterances for a given language and voice.
570     * If no voice is set, then all voices will be removed.
571     *
572     * @since 0.1.5
573     * @param string $language ISO 639.
574     * @param string|null $voice Optional name of synthesis voice to limit flush to.
575     * @return int Number of utterances flushed.
576     */
577    public function flushUtterancesByLanguageAndVoice( $language, $voice = null ) {
578        $conditions = [
579            'wsu_lang' => $language,
580            'wsu_message_key' => null,
581        ];
582        if ( $voice != null ) {
583            $conditions['wsu_voice'] = $voice;
584        }
585        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
586        $results = $dbw->select( self::UTTERANCE_TABLE,
587            [ 'wsu_utterance_id' ], $conditions, __METHOD__
588        );
589        return $this->flushUtterances( $dbw, $results );
590    }
591
592    /**
593     * Clears database and file backend of all message utterances.
594     *
595     * @since 0.1.14
596     * @return int Number of utterances flushed.
597     */
598    public function flushMessageUtterances() {
599        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
600        $results = $dbw->select( self::UTTERANCE_TABLE,
601            [ 'wsu_utterance_id' ], [ 'wsu_message_key IS NOT NULL' ], __METHOD__
602        );
603        return $this->flushUtterances( $dbw, $results );
604    }
605
606    /**
607     * Flushes utterances listed in a result set containing
608     * at least the wsu_utterance_id column.
609     *
610     * In order for return value to increase, the utterance must have been
611     * successfully deleted in all layers, i.e. utterance metadata database row,
612     * utterance audio and synthesis metadata from file store.
613     * E.g. if the utterance audio file is missing and thus not explicitly removed,
614     * but at the same time we managed to remove the utterance metadata from database
615     * and also removed the synthesis metadata file, this will not count as a
616     * successfully removed utterance. It would however be removed from all layers
617     * and it would also cause an out-of-sync warning in the log.
618     *
619     * @note Consider if database should be flushing within a transaction.
620     *
621     * @since 0.1.5
622     * @param IDatabase $dbw Writable database connection.
623     * @param IResultWrapper $results Result set.
624     * @return int Number of utterances that were successfully flushed in all layers.
625     */
626    private function flushUtterances( $dbw, $results ) {
627        if ( !$results ) {
628            return 0;
629        }
630
631        // TTL is set when creating files in Swift, so no need to invoke any delete I/O operations.
632        $flushInFileBackend = !(
633            $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() &&
634            $this->getFileBackend() instanceof SwiftFileBackend
635        );
636
637        $successfullyFlushedCounter = 0;
638        foreach ( $results as $row ) {
639            $utteranceId = $row->wsu_utterance_id;
640
641            // 1. delete in database
642            $successfullyDeletedTableRow = $dbw->delete(
643                self::UTTERANCE_TABLE,
644                [ 'wsu_utterance_id' => $utteranceId ],
645                __METHOD__
646            );
647            if ( !$successfullyDeletedTableRow ) {
648                $this->logger->warning( __METHOD__ . ': ' .
649                    'Failed to delete utterance {utteranceId} from database.', [
650                        'utteranceId' => $utteranceId
651                    ] );
652            } else {
653                $this->logger->debug( __METHOD__ . ': ' .
654                    'Flushed out utterance with id {utteranceId} from database', [
655                        'utteranceId' => $utteranceId
656                    ] );
657            }
658
659            // 2. delete in file store.
660            if ( $flushInFileBackend ) {
661                $successfullyDeletedAudioFile = $this->deleteFileBackendFile(
662                    $this->audioUrlFactory( $utteranceId ),
663                    $utteranceId,
664                    'audio file'
665                );
666                $successfullyDeletedSynthesisMetadataFile = $this->deleteFileBackendFile(
667                    $this->synthesisMetadataUrlFactory( $utteranceId ),
668                    $utteranceId,
669                    'synthesis metadata file'
670                );
671                $successfullyDeletedFiles =
672                    $successfullyDeletedAudioFile && $successfullyDeletedSynthesisMetadataFile;
673            } else {
674                // The files were marked for automatic deletion using TTL in the Swift create operation.
675                $successfullyDeletedFiles = true;
676            }
677
678            if ( $successfullyDeletedTableRow && $successfullyDeletedFiles ) {
679                $successfullyFlushedCounter++;
680            }
681        }
682
683        return $successfullyFlushedCounter;
684    }
685
686    /**
687     * @since 0.1.5
688     * @param string $src
689     * @param int $utteranceId
690     * @param string $type
691     * @return bool If successfully deleted
692     */
693    private function deleteFileBackendFile( $src, $utteranceId, $type ) {
694        $synthesisMetadataFile = [
695            'src' => $src
696        ];
697        if ( $this->getFileBackend()->fileExists( $synthesisMetadataFile ) ) {
698            if ( !$this->getFileBackend()->delete( $synthesisMetadataFile )->isOK() ) {
699                $this->logger->warning( __METHOD__ . ': ' .
700                    'Unable to delete {type} for utterance with identity {utteranceId}.', [
701                        'utteranceId' => $utteranceId,
702                        'type' => $type
703                    ] );
704                return false;
705            } else {
706                $this->getFileBackend()->clean( [ 'dir' => $this->urlPathFactory( $utteranceId ) ] );
707            }
708        } else {
709            $this->logger->warning( __METHOD__ . ': ' .
710                'Attempted to delete non existing {type} for utterance {utteranceId}.', [
711                    'utteranceId' => $utteranceId,
712                    'type' => $type
713                ] );
714            return false;
715        }
716        $this->logger->debug( __METHOD__ . ': ' .
717            'Flushed out file {src}', [ 'src' => $src ] );
718        return true;
719    }
720
721    /**
722     * Creates a deterministic path based on utterance identity,
723     * causing no more than 1000 files and 10 subdirectories per directory.
724     * (Actually, 2000 files, as we store both .json and .opus)
725     *
726     * Overloading a directory with files often cause performance problems.
727     *
728     * 1 -> /
729     * 12 -> /
730     * 123 -> /
731     * 1234 -> /1/
732     * 12345 -> /1/2/
733     * 123456 -> /1/2/3/
734     * 1234567 -> /1/2/3/4/
735     *
736     * @since 0.1.5
737     * @param int $utteranceId
738     * @return string Path
739     */
740    private function urlPathFactory( $utteranceId ) {
741        $path = '/';
742        $utteranceIdText = strval( $utteranceId );
743        $utteranceIdTextLength = strlen( $utteranceIdText );
744        for ( $index = 0; $index < $utteranceIdTextLength - 3; $index++ ) {
745            $path .= substr( $utteranceIdText, $index, 1 ) . '/';
746        }
747        return $path;
748    }
749
/**
 * Builds the extension-less storage path of an utterance: the storage path
 * of the container, followed by the directory path derived from the
 * identity and finally the identity itself.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url prefix used to access objects of the utterance in file store
 */
private function audioUrlPrefixFactory( $utteranceId ) {
    $containerPath = $this->getFileBackend()
        ->getContainerStoragePath( $this->fileBackendContainerName );
    $directoryPath = $this->urlPathFactory( $utteranceId );
    return $containerPath . $directoryPath . $utteranceId;
}
759
/**
 * Builds the storage path of the audio file of an utterance,
 * i.e. the utterance url prefix with the suffix '.opus'.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function audioUrlFactory( $utteranceId ) {
    $prefix = $this->audioUrlPrefixFactory( $utteranceId );
    return $prefix . '.opus';
}
768
/**
 * Builds the storage path of the synthesis metadata file of an utterance,
 * i.e. the utterance url prefix with the suffix '.json'.
 *
 * @since 0.1.5
 * @param int $utteranceId Utterance identity.
 * @return string url used to access object in file store
 */
private function synthesisMetadataUrlFactory( $utteranceId ) {
    $prefix = $this->audioUrlPrefixFactory( $utteranceId );
    return $prefix . '.json';
}
777
/**
 * Removes expired utterance and synthesis metadata from the file backend.
 *
 * Walks the whole container, starting at its storage path, and deletes
 * every file whose timestamp is not newer than the expiration timestamp.
 *
 * @since 0.1.7
 * @param MWTimestamp|null $expiredTimestamp File timestamp <= to this value is orphaned.
 *  Defaults to config value.
 * @return int Number of expired files flushed
 */
public function flushUtterancesByExpirationDateOnFile( $expiredTimestamp = null ) {
    // @note Either this method, or the job,
    // should probably call `flushUtterancesByExpirationDate`
    // to ensure we are not deleting a bunch of files
    // which were scheduled to be deleted together with their db-entries anyway.

    if ( !$expiredTimestamp ) {
        $expiredTimestamp = $this->getWikispeechUtteranceExpirationTimestamp();
    }
    // Resolve the backend once, so that the root path and the backend handed
    // to the recursion are guaranteed to come from the same instance.
    $fileBackend = $this->getFileBackend();
    return $this->recurseFlushUtterancesByExpirationDateOnFile(
        $fileBackend,
        $fileBackend->getContainerStoragePath( $this->fileBackendContainerName ),
        $expiredTimestamp
    );
}
803
/**
 * Deletes expired files below a directory of the file backend, depth first.
 *
 * All subdirectories are processed before the files located directly in
 * the directory. When done, the backend is asked to clean the directory.
 *
 * @since 0.1.7
 * @param FileBackend $fileBackend Backend used for listing, deleting and cleaning.
 * @param string $directory Storage path of the directory to process.
 * @param MWTimestamp $expiredTimestamp File timestamp <= to this value is expired.
 * @return int Number of expired files flushed
 */
private function recurseFlushUtterancesByExpirationDateOnFile(
    $fileBackend,
    $directory,
    $expiredTimestamp
) {
    $this->logger->debug( __METHOD__ . ': ' .
        'Processing directory {directory}', [ 'directory' => $directory ] );
    $removedFilesCounter = 0;
    $subdirectories = $fileBackend->getDirectoryList( [
        'dir' => $directory,
        'topOnly' => true,
    ] );
    if ( $subdirectories ) {
        foreach ( $subdirectories as $subdirectory ) {
            $removedFilesCounter += $this->recurseFlushUtterancesByExpirationDateOnFile(
                $fileBackend,
                $directory . '/' . $subdirectory,
                $expiredTimestamp
            );
        }
    }
    $files = $fileBackend->getFileList( [
        'dir' => $directory,
        'topOnly' => true,
        'adviseStat' => false
    ] );
    if ( $files ) {
        foreach ( $files as $file ) {
            $src = [ 'src' => $directory . '/' . $file ];
            $fileTimestamp = $fileBackend->getFileTimestamp( $src );
            if ( !$fileTimestamp ) {
                // The backend could not report the age of the file. Skip it
                // explicitly instead of handing a falsy value to MWTimestamp.
                // NOTE(review): MWTimestamp presumably reads a falsy value as
                // "now", which made the outcome depend on the expiration
                // timestamp - confirm.
                $this->logger->warning( __METHOD__ . ': ' .
                    'Unable to read timestamp of file {file}',
                    [ 'file' => $file ]
                );
                continue;
            }
            $timestamp = new MWTimestamp( $fileTimestamp );
            $this->logger->debug( __METHOD__ . ': ' .
                'Processing file {src} with timestamp {timestamp}', [
                    'src' => $file,
                    'timestamp' => $timestamp,
                    'expiredTimestamp' => $expiredTimestamp
                ] );
            if ( $timestamp <= $expiredTimestamp ) {
                if ( $fileBackend->delete( $src )->isOK() ) {
                    $removedFilesCounter++;
                    $this->logger->debug( __METHOD__ . ': ' .
                        'Deleted expired file {file} #{num}', [
                            'file' => $file,
                            'num' => $removedFilesCounter
                        ]
                    );
                } else {
                    $this->logger->warning( __METHOD__ . ': ' .
                        'Unable to delete expired file {file}',
                        [ 'file' => $file ]
                    );
                }
            }
        }
    }
    // Clean using the very backend that was listed and deleted from above,
    // rather than looking the backend up again.
    $fileBackend->clean( [ 'dir' => $directory ] );
    return $removedFilesCounter;
}
869
/**
 * Calculates the historic timestamp that lies
 * WikispeechUtteranceTimeToLiveDays days before now.
 *
 * @return MWTimestamp Utterance parts with timestamp <= this is expired.
 */
public function getWikispeechUtteranceExpirationTimestamp(): MWTimestamp {
    $timeToLiveDays = $this->getWikispeechUtteranceTimeToLiveDays();
    // Relative date expression, evaluated against the current time.
    $expirationTime = strtotime( '-' . $timeToLiveDays . 'days' );
    return MWTimestamp::getInstance( $expirationTime );
}
880
/**
 * Reads the configuration value 'WikispeechUtteranceTimeToLiveDays'
 * and converts it to an integer.
 *
 * @return int Number of days an utterance is to exist before being flushed out.
 */
private function getWikispeechUtteranceTimeToLiveDays(): int {
    $configuredDays = $this->config->get( 'WikispeechUtteranceTimeToLiveDays' );
    return (int)$configuredDays;
}
887
/**
 * Reads the configuration value 'WikispeechUtteranceUseSwiftFileBackendExpiring'.
 *
 * @return bool The configured value.
 */
private function isWikispeechUtteranceUseSwiftFileBackendExpiring(): bool {
    $useSwiftExpiring = $this->config->get( 'WikispeechUtteranceUseSwiftFileBackendExpiring' );
    return $useSwiftExpiring;
}
894
/**
 * Used to evaluate hash of gadget consumer URL,
 * the remote wiki where the page is located.
 *
 * The digest covers, in this order and without any separators, the
 * lower cased host, the port and the path of the URL. Components missing
 * from the URL are left out. Other parts of the URL, such as the scheme,
 * do not influence the digest.
 *
 * Making changes to this function will probably invalidate all existing cached utterances.
 *
 * @since 0.1.9
 * @param string|null $consumerUrl
 * @return string|null SHA256 message digest, or null if no URL was supplied.
 */
public static function evaluateRemoteWikiHash( ?string $consumerUrl ): ?string {
    if ( $consumerUrl === null ) {
        return null;
    }
    $urlParts = parse_url( $consumerUrl );
    $digestInput = '';
    foreach ( [ 'host', 'port', 'path' ] as $component ) {
        // Also covers an unparsable URL, in which case nothing is set.
        if ( !isset( $urlParts[$component] ) ) {
            continue;
        }
        $value = strval( $urlParts[$component] );
        $digestInput .= $component === 'host' ? mb_strtolower( $value ) : $value;
    }
    return hash( 'sha256', $digestInput );
}
922
923}