Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
81.40% |
315 / 387 |
|
48.15% |
13 / 27 |
CRAP | |
0.00% |
0 / 1 |
| UtteranceStore | |
81.40% |
315 / 387 |
|
48.15% |
13 / 27 |
103.46 | |
0.00% |
0 / 1 |
| __construct | |
61.54% |
8 / 13 |
|
0.00% |
0 / 1 |
2.23 | |||
| getFileBackend | |
8.00% |
2 / 25 |
|
0.00% |
0 / 1 |
16.46 | |||
| findUtterance | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
2 | |||
| findMessageUtterance | |
90.91% |
10 / 11 |
|
0.00% |
0 / 1 |
2.00 | |||
| loadUtteranceAudio | |
71.43% |
15 / 21 |
|
0.00% |
0 / 1 |
4.37 | |||
| retrieveUtteranceMetadata | |
97.44% |
38 / 39 |
|
0.00% |
0 / 1 |
6 | |||
| retrieveFileContents | |
62.50% |
5 / 8 |
|
0.00% |
0 / 1 |
2.21 | |||
| createUtterance | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
2.00 | |||
| createMessageUtterance | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
| storeUtterance | |
100.00% |
37 / 37 |
|
100.00% |
1 / 1 |
1 | |||
| storeFile | |
73.68% |
14 / 19 |
|
0.00% |
0 / 1 |
5.46 | |||
| flushUtterancesByExpirationDate | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
| flushUtterancesByPage | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
2 | |||
| flushUtterancesByLanguageAndVoice | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
2 | |||
| flushMessageUtterances | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| flushUtterances | |
85.00% |
34 / 40 |
|
0.00% |
0 / 1 |
9.27 | |||
| deleteFileBackendFile | |
42.86% |
9 / 21 |
|
0.00% |
0 / 1 |
4.68 | |||
| urlPathFactory | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
| audioUrlPrefixFactory | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| audioUrlFactory | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| synthesisMetadataUrlFactory | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| flushUtterancesByExpirationDateOnFile | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
2.01 | |||
| recurseFlushUtterancesByExpirationDateOnFile | |
91.11% |
41 / 45 |
|
0.00% |
0 / 1 |
7.03 | |||
| getWikispeechUtteranceExpirationTimestamp | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| getWikispeechUtteranceTimeToLiveDays | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| isWikispeechUtteranceUseSwiftFileBackendExpiring | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| evaluateRemoteWikiHash | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Wikispeech\Utterance; |
| 4 | |
| 5 | /** |
| 6 | * @file |
| 7 | * @ingroup Extensions |
| 8 | * @license GPL-2.0-or-later |
| 9 | */ |
| 10 | |
| 11 | use Config; |
| 12 | use ExternalStoreException; |
| 13 | use FileBackend; |
| 14 | use FSFileBackend; |
| 15 | use MediaWiki\Logger\LoggerFactory; |
| 16 | use MediaWiki\MediaWikiServices; |
| 17 | use MediaWiki\WikiMap\WikiMap; |
| 18 | use MWTimestamp; |
| 19 | use Psr\Log\LoggerInterface; |
| 20 | use RuntimeException; |
| 21 | use SwiftFileBackend; |
| 22 | use Wikimedia\Rdbms\IDatabase; |
| 23 | use Wikimedia\Rdbms\ILoadBalancer; |
| 24 | use Wikimedia\Rdbms\IResultWrapper; |
| 25 | |
| 26 | /** |
| 27 | * Keeps track of utterances in persistent layers. |
| 28 | * |
| 29 | * Utterance metadata (i.e. segment hash, page id, language, etc) is stored in a database table. |
| 30 | * Utterance audio is (synthesised voice audio) is stored as an opus file in file backend. |
| 31 | * Synthesis metadata (tokens, etc) is stored as a JSON file in file backend. |
| 32 | * |
| 33 | * (.opus and .json suffixes are added in file backed store although this class is agnostic |
| 34 | * regarding to the actual data encoding and formats.) |
| 35 | * |
| 36 | * @since 0.1.13 Introduces messageKey as parameter for system error messages. |
| 37 | * @since 0.1.5 |
| 38 | */ |
| 39 | class UtteranceStore { |
| 40 | |
| 41 | /** @var string Name of database table that keeps track of utterance metadata. */ |
| 42 | public const UTTERANCE_TABLE = 'wikispeech_utterance'; |
| 43 | |
| 44 | /** @var LoggerInterface */ |
| 45 | private $logger; |
| 46 | |
| 47 | /** |
| 48 | * Don't use this directly, access @see getFileBackend |
| 49 | * @var FileBackend Used to store utterance audio and synthesis metadata. |
| 50 | */ |
| 51 | private $fileBackend; |
| 52 | |
| 53 | /** |
| 54 | * @var ILoadBalancer |
| 55 | */ |
| 56 | private $dbLoadBalancer; |
| 57 | |
| 58 | /** @var string Name of container (sort of path prefix) used for files in backend. */ |
| 59 | private $fileBackendContainerName; |
| 60 | |
| 61 | /** @var Config */ |
| 62 | private $config; |
| 63 | |
| 64 | public function __construct() { |
| 65 | $this->logger = LoggerFactory::getInstance( 'Wikispeech' ); |
| 66 | |
| 67 | // @todo don't create, add as constructor parameter |
| 68 | // Refer to https://phabricator.wikimedia.org/T264165 |
| 69 | $this->config = MediaWikiServices::getInstance() |
| 70 | ->getConfigFactory() |
| 71 | ->makeConfig( 'wikispeech' ); |
| 72 | |
| 73 | $this->fileBackendContainerName = $this->config |
| 74 | ->get( 'WikispeechUtteranceFileBackendContainerName' ); |
| 75 | if ( !$this->fileBackendContainerName ) { |
| 76 | $this->fileBackendContainerName = "wikispeech-utterances"; |
| 77 | $this->logger->info( __METHOD__ . ': ' . |
| 78 | 'Falling back on container name {containerName}', [ |
| 79 | 'containerName' => $this->fileBackendContainerName |
| 80 | ] ); |
| 81 | } |
| 82 | |
| 83 | $this->dbLoadBalancer = MediaWikiServices::getInstance()->getDBLoadBalancer(); |
| 84 | } |
| 85 | |
| 86 | /** |
| 87 | * @since 0.1.5 |
| 88 | * @return FileBackend |
| 89 | * @throws ExternalStoreException If defined file backend group does not exists. |
| 90 | */ |
| 91 | private function getFileBackend() { |
| 92 | global $wgUploadDirectory; |
| 93 | if ( !$this->fileBackend ) { |
| 94 | |
| 95 | /** @var string Name of file backend group in LocalSettings.php to use. */ |
| 96 | $fileBackendName = $this->config->get( 'WikispeechUtteranceFileBackendName' ); |
| 97 | if ( !$fileBackendName ) { |
| 98 | $fileBackendName = 'wikispeech-backend'; |
| 99 | $fallbackDir = "$wgUploadDirectory/wikispeech_utterances"; |
| 100 | $this->logger->info( __METHOD__ . ': ' . |
| 101 | 'No file backend defined in LocalSettings.php. Falling back ' . |
| 102 | 'on FS storage backend named {name} in {dir}.', [ |
| 103 | 'name' => $fileBackendName, |
| 104 | 'dir' => $fallbackDir |
| 105 | ] ); |
| 106 | $this->fileBackend = new FSFileBackend( [ |
| 107 | 'name' => $fileBackendName, |
| 108 | 'wikiId' => WikiMap::getCurrentWikiId(), |
| 109 | 'basePath' => $fallbackDir |
| 110 | ] ); |
| 111 | } else { |
| 112 | $fileBackend = MediaWikiServices::getInstance() |
| 113 | ->getFileBackendGroup() |
| 114 | ->get( $fileBackendName ); |
| 115 | if ( $fileBackend ) { |
| 116 | $this->fileBackend = $fileBackend; |
| 117 | } else { |
| 118 | throw new ExternalStoreException( |
| 119 | "No file backend group in LocalSettings.php named $fileBackendName." |
| 120 | ); |
| 121 | } |
| 122 | } |
| 123 | } |
| 124 | return $this->fileBackend; |
| 125 | } |
| 126 | |
| 127 | /** |
| 128 | * Retrieves an utterance for a given segment in a page, using a specific |
| 129 | * voice and language. |
| 130 | * |
| 131 | * @since 0.1.13 |
| 132 | * @param string|null $consumerUrl Remote wiki where page is located, or null if local. |
| 133 | * @param int $pageId Mediawiki page ID. |
| 134 | * @param string $language ISO-639. |
| 135 | * @param string $voice Name of synthesis voice. |
| 136 | * @param string $segmentHash Hash of segment representing utterance. |
| 137 | * @param bool $omitAudio If true, then no audio is returned. |
| 138 | * @return Utterance|null Utterance found, or null if non-existing. |
| 139 | */ |
| 140 | public function findUtterance( |
| 141 | ?string $consumerUrl, |
| 142 | int $pageId, |
| 143 | string $language, |
| 144 | string $voice, |
| 145 | string $segmentHash, |
| 146 | bool $omitAudio = false |
| 147 | ): ?Utterance { |
| 148 | $utterance = $this->retrieveUtteranceMetadata( |
| 149 | $consumerUrl, |
| 150 | $pageId, |
| 151 | null, |
| 152 | $language, |
| 153 | $voice, |
| 154 | $segmentHash |
| 155 | ); |
| 156 | if ( !$utterance ) { |
| 157 | return null; |
| 158 | } |
| 159 | |
| 160 | return $this->loadUtteranceAudio( $utterance, $omitAudio ); |
| 161 | } |
| 162 | |
| 163 | /** |
| 164 | * Retrieves an utterance for a specific error message |
| 165 | * |
| 166 | * @since 0.1.13 |
| 167 | * @param string|null $consumerUrl Remote wiki where page is located, or null if local. |
| 168 | * @param string $messageKey Message key for system message. |
| 169 | * @param string $language ISO-639. |
| 170 | * @param string $voice Name of synthesis voice. |
| 171 | * @param string $segmentHash Hash of segment representing utterance. |
| 172 | * @param bool $omitAudio If true, then no audio is returned. |
| 173 | * @return Utterance|null Utterance found, or null if non-existing. |
| 174 | */ |
| 175 | public function findMessageUtterance( |
| 176 | ?string $consumerUrl, |
| 177 | string $messageKey, |
| 178 | string $language, |
| 179 | string $voice, |
| 180 | string $segmentHash, |
| 181 | bool $omitAudio = false |
| 182 | ) { |
| 183 | $utterance = $this->retrieveUtteranceMetadata( |
| 184 | $consumerUrl, |
| 185 | 0, |
| 186 | $messageKey, |
| 187 | $language, |
| 188 | $voice, |
| 189 | $segmentHash |
| 190 | ); |
| 191 | if ( !$utterance ) { |
| 192 | return null; |
| 193 | } |
| 194 | |
| 195 | return $this->loadUtteranceAudio( $utterance, $omitAudio ); |
| 196 | } |
| 197 | |
| 198 | /** |
| 199 | * Loads utterance audio and synthesis metadata |
| 200 | * |
| 201 | * @since 0.1.13 |
| 202 | */ |
| 203 | private function loadUtteranceAudio( Utterance $utterance, bool $omitAudio ): ?Utterance { |
| 204 | $utteranceId = $utterance->getUtteranceId(); |
| 205 | |
| 206 | // @note We might want to keep this as separate function calls, |
| 207 | // allowing the user to request when needed, and perhaps |
| 208 | // pass a stream straight down from file backend to user |
| 209 | // rather than bouncing it via RAM. |
| 210 | // Not sure if this is an existing thing in PHP though. |
| 211 | |
| 212 | if ( !$omitAudio ) { |
| 213 | $audioSrc = $this->audioUrlFactory( $utteranceId ); |
| 214 | try { |
| 215 | $utterance->setAudio( $this->retrieveFileContents( |
| 216 | $audioSrc, |
| 217 | $utteranceId, |
| 218 | 'audio file' |
| 219 | ) ); |
| 220 | } catch ( ExternalStoreException $e ) { |
| 221 | $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() ); |
| 222 | return null; |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | $synthesisMetadataSrc = $this->synthesisMetadataUrlFactory( $utteranceId ); |
| 227 | try { |
| 228 | $utterance->setSynthesisMetadata( $this->retrieveFileContents( |
| 229 | $synthesisMetadataSrc, |
| 230 | $utteranceId, |
| 231 | 'synthesis metadata file' |
| 232 | ) ); |
| 233 | } catch ( ExternalStoreException $e ) { |
| 234 | $this->logger->warning( __METHOD__ . ': ' . $e->getMessage() ); |
| 235 | return null; |
| 236 | } |
| 237 | |
| 238 | return $utterance; |
| 239 | } |
| 240 | |
| 241 | /** |
| 242 | * Retrieves the utterance metadata from the database for a given segment in a page, |
| 243 | * using a specific voice and language. |
| 244 | * |
| 245 | * @since 0.1.13 Optional parameter messageKey |
| 246 | * @since 0.1.5 |
| 247 | * @param string|null $consumerUrl Remote wiki where page is located, or null if local. |
| 248 | * @param int $pageId Mediawiki page ID. |
| 249 | * @param string|null $messageKey Mediawiki message key. |
| 250 | * @param string $language ISO-639. |
| 251 | * @param string $voice Name of synthesis voice. |
| 252 | * @param string $segmentHash Hash of segment representing utterance. |
| 253 | * @return Utterance|null Utterance or null if not found in database |
| 254 | */ |
| 255 | public function retrieveUtteranceMetadata( |
| 256 | ?string $consumerUrl, |
| 257 | int $pageId, |
| 258 | ?string $messageKey, |
| 259 | string $language, |
| 260 | string $voice, |
| 261 | string $segmentHash |
| 262 | ): ?Utterance { |
| 263 | $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl ); |
| 264 | $dbr = $this->dbLoadBalancer->getConnection( DB_REPLICA ); |
| 265 | |
| 266 | $conditions = [ |
| 267 | 'wsu_remote_wiki_hash' => $remoteWikiHash, |
| 268 | 'wsu_lang' => $language, |
| 269 | 'wsu_voice' => $voice, |
| 270 | 'wsu_seg_hash' => $segmentHash |
| 271 | ]; |
| 272 | |
| 273 | if ( $pageId > 0 ) { |
| 274 | $conditions['wsu_page_id'] = $pageId; |
| 275 | } else { |
| 276 | if ( $messageKey === null ) { |
| 277 | throw new RuntimeException( 'If pageId is 0, messageKey must be provided.' ); |
| 278 | } |
| 279 | $conditions['wsu_message_key'] = $messageKey; |
| 280 | $conditions['wsu_page_id'] = 0; |
| 281 | } |
| 282 | |
| 283 | $row = $dbr->selectRow( self::UTTERANCE_TABLE, [ |
| 284 | 'wsu_utterance_id', |
| 285 | 'wsu_remote_wiki_hash', |
| 286 | 'wsu_message_key', |
| 287 | 'wsu_page_id', |
| 288 | 'wsu_lang', |
| 289 | 'wsu_voice', |
| 290 | 'wsu_seg_hash', |
| 291 | 'wsu_date_stored' |
| 292 | ], $conditions, __METHOD__, [ |
| 293 | 'ORDER BY date_stored DESC', |
| 294 | ] ); |
| 295 | if ( !$row ) { |
| 296 | return null; |
| 297 | } |
| 298 | $utterance = new Utterance( |
| 299 | intval( $row->wsu_utterance_id ), |
| 300 | $row->wsu_remote_wiki_hash === null ? null : strval( $row->wsu_remote_wiki_hash ), |
| 301 | $row->wsu_message_key ? strval( $row->wsu_message_key ) : null, |
| 302 | intval( $row->wsu_page_id ), |
| 303 | strval( $row->wsu_lang ), |
| 304 | strval( $row->wsu_voice ), |
| 305 | strval( $row->wsu_seg_hash ), |
| 306 | MWTimestamp::getInstance( $row->wsu_date_stored ) |
| 307 | ); |
| 308 | return $utterance; |
| 309 | } |
| 310 | |
| 311 | /** |
| 312 | * Retrieve the file contents from the backend. |
| 313 | * |
| 314 | * @since 0.1.5 |
| 315 | * @param string $src |
| 316 | * @param int $utteranceId |
| 317 | * @param string $type |
| 318 | * @return mixed File contents |
| 319 | * @throws ExternalStoreException |
| 320 | */ |
| 321 | public function retrieveFileContents( $src, $utteranceId, $type ) { |
| 322 | $content = $this->getFileBackend()->getFileContents( [ |
| 323 | 'src' => $src |
| 324 | ] ); |
| 325 | if ( $content == FileBackend::CONTENT_FAIL ) { |
| 326 | // @note Consider queuing job to flush inconsistencies from database. |
| 327 | throw new ExternalStoreException( |
| 328 | "Inconsistency! Database contains utterance with ID $utteranceId " . |
| 329 | "that does not exist as $type named $src in file backend." ); |
| 330 | } |
| 331 | return $content; |
| 332 | } |
| 333 | |
| 334 | /** |
| 335 | * Creates an utterance in the database. |
| 336 | * |
| 337 | * @since 0.1.13 |
| 338 | * @param string|null $consumerUrl |
| 339 | * @param int $pageId Mediawiki page ID. |
| 340 | * @param string $language ISO 639. |
| 341 | * @param string $voice Name of synthesis voice. |
| 342 | * @param string $segmentHash Hash of segment representing utterance. |
| 343 | * @param string $audio Utterance audio. |
| 344 | * @param string $synthesisMetadata JSON form metadata about the audio. |
| 345 | * @return Utterance Inserted utterance. |
| 346 | * @throws ExternalStoreException If unable to prepare or create files in file backend. |
| 347 | */ |
| 348 | public function createUtterance( |
| 349 | ?string $consumerUrl, |
| 350 | int $pageId, |
| 351 | string $language, |
| 352 | string $voice, |
| 353 | string $segmentHash, |
| 354 | string $audio, |
| 355 | string $synthesisMetadata |
| 356 | ): Utterance { |
| 357 | if ( $pageId === 0 ) { |
| 358 | throw new RuntimeException( 'Page ID must not be 0 when creating regular utterance.' ); |
| 359 | } |
| 360 | |
| 361 | return $this->storeUtterance( |
| 362 | $consumerUrl, |
| 363 | $pageId, |
| 364 | null, |
| 365 | $language, |
| 366 | $voice, |
| 367 | $segmentHash, |
| 368 | $audio, |
| 369 | $synthesisMetadata |
| 370 | ); |
| 371 | } |
| 372 | |
| 373 | /** |
| 374 | * Creates a system error utterance in the database and prepares for storing. |
| 375 | * |
| 376 | * @since 0.1.13 |
| 377 | * @param string|null $consumerUrl |
| 378 | * @param string|null $messageKey Mediawiki message key. |
| 379 | * @param string $language ISO 639. |
| 380 | * @param string $voice Name of synthesis voice. |
| 381 | * @param string $segmentHash Hash of segment representing utterance. |
| 382 | * @param string $audio Utterance audio. |
| 383 | * @param string $synthesisMetadata JSON form metadata about the audio. |
| 384 | * @return Utterance Inserted utterance. |
| 385 | * @throws ExternalStoreException If unable to prepare or create files in file backend. |
| 386 | */ |
| 387 | public function createMessageUtterance( |
| 388 | ?string $consumerUrl, |
| 389 | ?string $messageKey, |
| 390 | string $language, |
| 391 | string $voice, |
| 392 | string $segmentHash, |
| 393 | string $audio, |
| 394 | string $synthesisMetadata |
| 395 | ) { |
| 396 | return $this->storeUtterance( |
| 397 | $consumerUrl, |
| 398 | 0, |
| 399 | $messageKey, |
| 400 | $language, |
| 401 | $voice, |
| 402 | $segmentHash, |
| 403 | $audio, |
| 404 | $synthesisMetadata |
| 405 | ); |
| 406 | } |
| 407 | |
| 408 | /** |
| 409 | * Stores a created utterance. |
| 410 | * |
| 411 | * @since 0.1.13 |
| 412 | * @param string|null $consumerUrl |
| 413 | * @param int $pageId Mediawiki page ID. |
| 414 | * @param string|null $messageKey Mediawiki message key. |
| 415 | * @param string $language ISO 639. |
| 416 | * @param string $voice Name of synthesis voice. |
| 417 | * @param string $segmentHash Hash of segment representing utterance. |
| 418 | * @param string $audio Utterance audio. |
| 419 | * @param string $synthesisMetadata JSON form metadata about the audio. |
| 420 | * @return Utterance Inserted utterance. |
| 421 | * @throws ExternalStoreException If unable to prepare or create files in file backend. |
| 422 | */ |
| 423 | private function storeUtterance( |
| 424 | ?string $consumerUrl, |
| 425 | int $pageId, |
| 426 | ?string $messageKey, |
| 427 | string $language, |
| 428 | string $voice, |
| 429 | string $segmentHash, |
| 430 | string $audio, |
| 431 | string $synthesisMetadata |
| 432 | ) { |
| 433 | $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl ); |
| 434 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
| 435 | $rows = [ |
| 436 | 'wsu_remote_wiki_hash' => $remoteWikiHash, |
| 437 | 'wsu_page_id' => $pageId, |
| 438 | 'wsu_message_key' => $messageKey, |
| 439 | 'wsu_lang' => $language, |
| 440 | 'wsu_voice' => $voice, |
| 441 | 'wsu_seg_hash' => $segmentHash, |
| 442 | 'wsu_date_stored' => $dbw->timestamp() |
| 443 | ]; |
| 444 | $dbw->insert( self::UTTERANCE_TABLE, $rows, __METHOD__ ); |
| 445 | $utterance = new Utterance( |
| 446 | intval( $dbw->insertId() ), |
| 447 | $remoteWikiHash, |
| 448 | $messageKey, |
| 449 | $pageId, |
| 450 | $language, |
| 451 | $voice, |
| 452 | $segmentHash, |
| 453 | MWTimestamp::getInstance( $rows['wsu_date_stored'] ) |
| 454 | ); |
| 455 | |
| 456 | // create audio file |
| 457 | $this->storeFile( |
| 458 | $this->audioUrlFactory( $utterance->getUtteranceId() ), |
| 459 | $audio, |
| 460 | 'audio file' |
| 461 | ); |
| 462 | $utterance->setAudio( $audio ); |
| 463 | |
| 464 | // create synthesis metadata file |
| 465 | $this->storeFile( |
| 466 | $this->synthesisMetadataUrlFactory( $utterance->getUtteranceId() ), |
| 467 | $synthesisMetadata, |
| 468 | 'synthesis metadata file' |
| 469 | ); |
| 470 | $utterance->setSynthesisMetadata( $synthesisMetadata ); |
| 471 | |
| 472 | $jobQueue = new FlushUtterancesFromStoreByExpirationJobQueue(); |
| 473 | $jobQueue->maybeQueueJob(); |
| 474 | |
| 475 | return $utterance; |
| 476 | } |
| 477 | |
| 478 | /** |
| 479 | * Store a file in the backend. |
| 480 | * |
| 481 | * @since 0.1.5 |
| 482 | * @param string $fileUrl |
| 483 | * @param mixed $content |
| 484 | * @param string $type |
| 485 | * @throws ExternalStoreException |
| 486 | */ |
| 487 | public function storeFile( $fileUrl, $content, $type ) { |
| 488 | $fileBackend = $this->getFileBackend(); |
| 489 | |
| 490 | if ( !$fileBackend->prepare( [ |
| 491 | 'dir' => dirname( $fileUrl ), |
| 492 | 'noAccess' => 1, |
| 493 | 'noListing' => 1 |
| 494 | ] )->isOK() ) { |
| 495 | throw new ExternalStoreException( "Failed to prepare $type: $fileUrl." ); |
| 496 | } |
| 497 | $opts = [ |
| 498 | 'dst' => $fileUrl, |
| 499 | 'content' => $content, |
| 500 | 'overwrite' => true |
| 501 | ]; |
| 502 | if ( $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() && |
| 503 | $fileBackend instanceof SwiftFileBackend ) { |
| 504 | // Mark files in Swift for automatic removal after TTL. |
| 505 | // See $this->flushUtterances for code that skips forced removal if backend is Swift. |
| 506 | $opts['headers'] = [ |
| 507 | // number of seconds from now |
| 508 | 'X-Delete-After' => $this->getWikispeechUtteranceTimeToLiveDays() * 60 * 60 * 24 |
| 509 | ]; |
| 510 | } |
| 511 | if ( !$fileBackend->create( $opts )->isOK() ) { |
| 512 | throw new ExternalStoreException( "Failed to create $type: $fileUrl." ); |
| 513 | } |
| 514 | } |
| 515 | |
| 516 | /** |
| 517 | * Clears database and file backend of utterances older than a given age. |
| 518 | * |
| 519 | * @since 0.1.5 |
| 520 | * @param MWTimestamp $expirationDate |
| 521 | * @return int Number of utterances flushed. |
| 522 | */ |
| 523 | public function flushUtterancesByExpirationDate( $expirationDate ) { |
| 524 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
| 525 | $results = $dbw->select( self::UTTERANCE_TABLE, |
| 526 | [ 'wsu_utterance_id' ], |
| 527 | [ 1 => 'wsu_date_stored <= ' . $expirationDate->getTimestamp( TS_MW ), |
| 528 | 'wsu_message_key' => null, |
| 529 | ], |
| 530 | __METHOD__ |
| 531 | ); |
| 532 | return $this->flushUtterances( $dbw, $results ); |
| 533 | } |
| 534 | |
| 535 | /** |
| 536 | * Clears database and file backend of all utterances for a given page. |
| 537 | * |
| 538 | * @since 0.1.5 |
| 539 | * @param string|null $consumerUrl |
| 540 | * @param int $pageId Mediawiki page ID. |
| 541 | * @return int Number of utterances flushed. |
| 542 | */ |
| 543 | public function flushUtterancesByPage( |
| 544 | ?string $consumerUrl, |
| 545 | int $pageId |
| 546 | ): int { |
| 547 | $remoteWikiHash = self::evaluateRemoteWikiHash( $consumerUrl ); |
| 548 | |
| 549 | if ( $consumerUrl ) { |
| 550 | $this->logger->info( __METHOD__ . ": Flushing utterances for page $pageId at $consumerUrl" ); |
| 551 | } else { |
| 552 | $this->logger->info( __METHOD__ . ": Flushing utterances for page $pageId" ); |
| 553 | } |
| 554 | |
| 555 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
| 556 | $results = $dbw->select( self::UTTERANCE_TABLE, |
| 557 | [ 'wsu_utterance_id' ], |
| 558 | [ |
| 559 | 'wsu_remote_wiki_hash' => $remoteWikiHash, |
| 560 | 'wsu_page_id' => $pageId, |
| 561 | 'wsu_message_key' => null, |
| 562 | ], |
| 563 | __METHOD__ |
| 564 | ); |
| 565 | return $this->flushUtterances( $dbw, $results ); |
| 566 | } |
| 567 | |
| 568 | /** |
| 569 | * Clears database and file backend of all utterances for a given language and voice. |
| 570 | * If no voice is set, then all voices will be removed. |
| 571 | * |
| 572 | * @since 0.1.5 |
| 573 | * @param string $language ISO 639. |
| 574 | * @param string|null $voice Optional name of synthesis voice to limit flush to. |
| 575 | * @return int Number of utterances flushed. |
| 576 | */ |
| 577 | public function flushUtterancesByLanguageAndVoice( $language, $voice = null ) { |
| 578 | $conditions = [ |
| 579 | 'wsu_lang' => $language, |
| 580 | 'wsu_message_key' => null, |
| 581 | ]; |
| 582 | if ( $voice != null ) { |
| 583 | $conditions['wsu_voice'] = $voice; |
| 584 | } |
| 585 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
| 586 | $results = $dbw->select( self::UTTERANCE_TABLE, |
| 587 | [ 'wsu_utterance_id' ], $conditions, __METHOD__ |
| 588 | ); |
| 589 | return $this->flushUtterances( $dbw, $results ); |
| 590 | } |
| 591 | |
| 592 | /** |
| 593 | * Clears database and file backend of all message utterances. |
| 594 | * |
| 595 | * @since 0.1.14 |
| 596 | * @return int Number of utterances flushed. |
| 597 | */ |
| 598 | public function flushMessageUtterances() { |
| 599 | $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase(); |
| 600 | $results = $dbw->select( self::UTTERANCE_TABLE, |
| 601 | [ 'wsu_utterance_id' ], [ 'wsu_message_key IS NOT NULL' ], __METHOD__ |
| 602 | ); |
| 603 | return $this->flushUtterances( $dbw, $results ); |
| 604 | } |
| 605 | |
| 606 | /** |
| 607 | * Flushes utterances listed in a result set containing |
| 608 | * at least the wsu_utterance_id column. |
| 609 | * |
| 610 | * In order for return value to increase, the utterance must have been |
| 611 | * successfully deleted in all layers, i.e. utterance metadata database row, |
| 612 | * utterance audio and synthesis metadata from file store. |
| 613 | * E.g. if the utterance audio file is missing and thus not explicitly removed, |
| 614 | * but at the same time we managed to remove the utterance metadata from database |
| 615 | * and also removed the synthesis metadata file, this will not count as a |
| 616 | * successfully removed utterance. It would however be removed from all layers |
| 617 | * and it would also cause an out-of-sync warning in the log. |
| 618 | * |
| 619 | * @note Consider if database should be flushing within a transaction. |
| 620 | * |
| 621 | * @since 0.1.5 |
| 622 | * @param IDatabase $dbw Writable database connection. |
| 623 | * @param IResultWrapper $results Result set. |
| 624 | * @return int Number of utterances that were successfully flushed in all layers. |
| 625 | */ |
| 626 | private function flushUtterances( $dbw, $results ) { |
| 627 | if ( !$results ) { |
| 628 | return 0; |
| 629 | } |
| 630 | |
| 631 | // TTL is set when creating files in Swift, so no need to invoke any delete I/O operations. |
| 632 | $flushInFileBackend = !( |
| 633 | $this->isWikispeechUtteranceUseSwiftFileBackendExpiring() && |
| 634 | $this->getFileBackend() instanceof SwiftFileBackend |
| 635 | ); |
| 636 | |
| 637 | $successfullyFlushedCounter = 0; |
| 638 | foreach ( $results as $row ) { |
| 639 | $utteranceId = $row->wsu_utterance_id; |
| 640 | |
| 641 | // 1. delete in database |
| 642 | $successfullyDeletedTableRow = $dbw->delete( |
| 643 | self::UTTERANCE_TABLE, |
| 644 | [ 'wsu_utterance_id' => $utteranceId ], |
| 645 | __METHOD__ |
| 646 | ); |
| 647 | if ( !$successfullyDeletedTableRow ) { |
| 648 | $this->logger->warning( __METHOD__ . ': ' . |
| 649 | 'Failed to delete utterance {utteranceId} from database.', [ |
| 650 | 'utteranceId' => $utteranceId |
| 651 | ] ); |
| 652 | } else { |
| 653 | $this->logger->debug( __METHOD__ . ': ' . |
| 654 | 'Flushed out utterance with id {utteranceId} from database', [ |
| 655 | 'utteranceId' => $utteranceId |
| 656 | ] ); |
| 657 | } |
| 658 | |
| 659 | // 2. delete in file store. |
| 660 | if ( $flushInFileBackend ) { |
| 661 | $successfullyDeletedAudioFile = $this->deleteFileBackendFile( |
| 662 | $this->audioUrlFactory( $utteranceId ), |
| 663 | $utteranceId, |
| 664 | 'audio file' |
| 665 | ); |
| 666 | $successfullyDeletedSynthesisMetadataFile = $this->deleteFileBackendFile( |
| 667 | $this->synthesisMetadataUrlFactory( $utteranceId ), |
| 668 | $utteranceId, |
| 669 | 'synthesis metadata file' |
| 670 | ); |
| 671 | $successfullyDeletedFiles = |
| 672 | $successfullyDeletedAudioFile && $successfullyDeletedSynthesisMetadataFile; |
| 673 | } else { |
| 674 | // The files were marked for automatic deletion using TTL in the Swift create operation. |
| 675 | $successfullyDeletedFiles = true; |
| 676 | } |
| 677 | |
| 678 | if ( $successfullyDeletedTableRow && $successfullyDeletedFiles ) { |
| 679 | $successfullyFlushedCounter++; |
| 680 | } |
| 681 | } |
| 682 | |
| 683 | return $successfullyFlushedCounter; |
| 684 | } |
| 685 | |
| 686 | /** |
| 687 | * @since 0.1.5 |
| 688 | * @param string $src |
| 689 | * @param int $utteranceId |
| 690 | * @param string $type |
| 691 | * @return bool If successfully deleted |
| 692 | */ |
| 693 | private function deleteFileBackendFile( $src, $utteranceId, $type ) { |
| 694 | $synthesisMetadataFile = [ |
| 695 | 'src' => $src |
| 696 | ]; |
| 697 | if ( $this->getFileBackend()->fileExists( $synthesisMetadataFile ) ) { |
| 698 | if ( !$this->getFileBackend()->delete( $synthesisMetadataFile )->isOK() ) { |
| 699 | $this->logger->warning( __METHOD__ . ': ' . |
| 700 | 'Unable to delete {type} for utterance with identity {utteranceId}.', [ |
| 701 | 'utteranceId' => $utteranceId, |
| 702 | 'type' => $type |
| 703 | ] ); |
| 704 | return false; |
| 705 | } else { |
| 706 | $this->getFileBackend()->clean( [ 'dir' => $this->urlPathFactory( $utteranceId ) ] ); |
| 707 | } |
| 708 | } else { |
| 709 | $this->logger->warning( __METHOD__ . ': ' . |
| 710 | 'Attempted to delete non existing {type} for utterance {utteranceId}.', [ |
| 711 | 'utteranceId' => $utteranceId, |
| 712 | 'type' => $type |
| 713 | ] ); |
| 714 | return false; |
| 715 | } |
| 716 | $this->logger->debug( __METHOD__ . ': ' . |
| 717 | 'Flushed out file {src}', [ 'src' => $src ] ); |
| 718 | return true; |
| 719 | } |
| 720 | |
| 721 | /** |
| 722 | * Creates a deterministic path based on utterance identity, |
| 723 | * causing no more than 1000 files and 10 subdirectories per directory. |
| 724 | * (Actually, 2000 files, as we store both .json and .opus) |
| 725 | * |
| 726 | * Overloading a directory with files often cause performance problems. |
| 727 | * |
| 728 | * 1 -> / |
| 729 | * 12 -> / |
| 730 | * 123 -> / |
| 731 | * 1234 -> /1/ |
| 732 | * 12345 -> /1/2/ |
| 733 | * 123456 -> /1/2/3/ |
| 734 | * 1234567 -> /1/2/3/4/ |
| 735 | * |
| 736 | * @since 0.1.5 |
| 737 | * @param int $utteranceId |
| 738 | * @return string Path |
| 739 | */ |
| 740 | private function urlPathFactory( $utteranceId ) { |
| 741 | $path = '/'; |
| 742 | $utteranceIdText = strval( $utteranceId ); |
| 743 | $utteranceIdTextLength = strlen( $utteranceIdText ); |
| 744 | for ( $index = 0; $index < $utteranceIdTextLength - 3; $index++ ) { |
| 745 | $path .= substr( $utteranceIdText, $index, 1 ) . '/'; |
| 746 | } |
| 747 | return $path; |
| 748 | } |
| 749 | |
| 750 | /** |
| 751 | * @since 0.1.5 |
| 752 | * @param int $utteranceId Utterance identity. |
| 753 | * @return string url used to access object in file store |
| 754 | */ |
| 755 | private function audioUrlPrefixFactory( $utteranceId ) { |
| 756 | return $this->getFileBackend()->getContainerStoragePath( $this->fileBackendContainerName ) |
| 757 | . $this->urlPathFactory( $utteranceId ) . $utteranceId; |
| 758 | } |
| 759 | |
| 760 | /** |
| 761 | * @since 0.1.5 |
| 762 | * @param int $utteranceId Utterance identity. |
| 763 | * @return string url used to access object in file store |
| 764 | */ |
| 765 | private function audioUrlFactory( $utteranceId ) { |
| 766 | return $this->audioUrlPrefixFactory( $utteranceId ) . '.opus'; |
| 767 | } |
| 768 | |
| 769 | /** |
| 770 | * @since 0.1.5 |
| 771 | * @param int $utteranceId Utterance identity. |
| 772 | * @return string url used to access object in file store |
| 773 | */ |
| 774 | private function synthesisMetadataUrlFactory( $utteranceId ) { |
| 775 | return $this->audioUrlPrefixFactory( $utteranceId ) . '.json'; |
| 776 | } |
| 777 | |
| 778 | /** |
| 779 | * Removes expired utterance and synthesis metadata from the file backend. |
| 780 | * |
| 781 | * @since 0.1.7 |
| 782 | * @param MWTimestamp|null $expiredTimestamp File timestamp <= to this value is orphaned. |
| 783 | * Defaults to config value. |
| 784 | * @return int Number of expired files flushed |
| 785 | */ |
| 786 | public function flushUtterancesByExpirationDateOnFile( $expiredTimestamp = null ) { |
| 787 | // @note Either this method, or the job, |
| 788 | // should probably call `flushUtterancesByExpirationDate` |
| 789 | // to ensure we are not deleting a bunch of files |
| 790 | // which were scheduled to be deleted together with their db-entries anyway. |
| 791 | |
| 792 | if ( !$expiredTimestamp ) { |
| 793 | $expiredTimestamp = $this->getWikispeechUtteranceExpirationTimestamp(); |
| 794 | } |
| 795 | $fileBackend = $this->getFileBackend(); |
| 796 | return $this->recurseFlushUtterancesByExpirationDateOnFile( |
| 797 | $fileBackend, |
| 798 | $this->getFileBackend() |
| 799 | ->getContainerStoragePath( $this->fileBackendContainerName ), |
| 800 | $expiredTimestamp |
| 801 | ); |
| 802 | } |
| 803 | |
| 804 | /** |
| 805 | * @since 0.1.7 |
| 806 | * @param FileBackend $fileBackend |
| 807 | * @param string $directory |
| 808 | * @param MWTimestamp $expiredTimestamp |
| 809 | * @return int Number of expired files flushed |
| 810 | */ |
| 811 | private function recurseFlushUtterancesByExpirationDateOnFile( |
| 812 | $fileBackend, |
| 813 | $directory, |
| 814 | $expiredTimestamp |
| 815 | ) { |
| 816 | $this->logger->debug( __METHOD__ . ': ' . |
| 817 | 'Processing directory {directory}', [ 'directory' => $directory ] ); |
| 818 | $removedFilesCounter = 0; |
| 819 | $subdirectories = $fileBackend->getDirectoryList( [ |
| 820 | 'dir' => $directory, |
| 821 | 'topOnly' => true, |
| 822 | ] ); |
| 823 | if ( $subdirectories ) { |
| 824 | foreach ( $subdirectories as $subdirectory ) { |
| 825 | $removedFilesCounter += $this->recurseFlushUtterancesByExpirationDateOnFile( |
| 826 | $fileBackend, |
| 827 | $directory . '/' . $subdirectory, |
| 828 | $expiredTimestamp |
| 829 | ); |
| 830 | } |
| 831 | } |
| 832 | $files = $fileBackend->getFileList( [ |
| 833 | 'dir' => $directory, |
| 834 | 'topOnly' => true, |
| 835 | 'adviseStat' => false |
| 836 | ] ); |
| 837 | if ( $files ) { |
| 838 | foreach ( $files as $file ) { |
| 839 | $src = [ 'src' => $directory . '/' . $file ]; |
| 840 | $timestamp = new MWTimestamp( $fileBackend->getFileTimestamp( $src ) ); |
| 841 | $this->logger->debug( __METHOD__ . ': ' . |
| 842 | 'Processing file {src} with timestamp {timestamp}', [ |
| 843 | 'src' => $file, |
| 844 | 'timestamp' => $timestamp, |
| 845 | 'expiredTimestamp' => $expiredTimestamp |
| 846 | ] ); |
| 847 | if ( $timestamp <= $expiredTimestamp ) { |
| 848 | if ( $fileBackend->delete( $src )->isOK() ) { |
| 849 | $removedFilesCounter++; |
| 850 | $this->logger->debug( __METHOD__ . ': ' . |
| 851 | 'Deleted expired file {file} #{num}', [ |
| 852 | 'file' => $file, |
| 853 | 'num' => $removedFilesCounter |
| 854 | ] |
| 855 | ); |
| 856 | } else { |
| 857 | $this->logger->warning( __METHOD__ . ': ' . |
| 858 | 'Unable to delete expired file {file}', |
| 859 | [ 'file' => $file ] |
| 860 | ); |
| 861 | } |
| 862 | } |
| 863 | unset( $timestamp ); |
| 864 | } |
| 865 | } |
| 866 | $this->getFileBackend()->clean( [ 'dir' => $directory ] ); |
| 867 | return $removedFilesCounter; |
| 868 | } |
| 869 | |
| 870 | /** |
| 871 | * Calculates historic timestamp on now-WikispeechUtteranceTimeToLiveDays |
| 872 | * |
| 873 | * @return MWTimestamp Utterance parts with timestamp <= this is expired. |
| 874 | */ |
| 875 | public function getWikispeechUtteranceExpirationTimestamp(): MWTimestamp { |
| 876 | return MWTimestamp::getInstance( |
| 877 | strtotime( '-' . $this->getWikispeechUtteranceTimeToLiveDays() . 'days' ) |
| 878 | ); |
| 879 | } |
| 880 | |
| 881 | /** |
| 882 | * @return int Number of days an utterance is to exist before being flushed out. |
| 883 | */ |
| 884 | private function getWikispeechUtteranceTimeToLiveDays(): int { |
| 885 | return intval( $this->config->get( 'WikispeechUtteranceTimeToLiveDays' ) ); |
| 886 | } |
| 887 | |
| 888 | /** |
| 889 | * @return bool |
| 890 | */ |
| 891 | private function isWikispeechUtteranceUseSwiftFileBackendExpiring(): bool { |
| 892 | return $this->config->get( 'WikispeechUtteranceUseSwiftFileBackendExpiring' ); |
| 893 | } |
| 894 | |
| 895 | /** |
| 896 | * Used to evaluate hash of gadget consumer URL, |
| 897 | * the remote wiki where the page is located. |
| 898 | * |
| 899 | * Making changes to this function will probably invalidate all existing cached utterances. |
| 900 | * |
| 901 | * @since 0.1.9 |
| 902 | * @param string|null $consumerUrl |
| 903 | * @return string|null SHA256 message digest |
| 904 | */ |
| 905 | public static function evaluateRemoteWikiHash( ?string $consumerUrl ): ?string { |
| 906 | if ( $consumerUrl === null ) { |
| 907 | return null; |
| 908 | } |
| 909 | $context = hash_init( 'sha256' ); |
| 910 | $urlParts = parse_url( $consumerUrl ); |
| 911 | if ( isset( $urlParts['host'] ) ) { |
| 912 | hash_update( $context, mb_strtolower( $urlParts['host'] ) ); |
| 913 | } |
| 914 | if ( isset( $urlParts['port'] ) ) { |
| 915 | hash_update( $context, strval( $urlParts['port'] ) ); |
| 916 | } |
| 917 | if ( isset( $urlParts['path'] ) ) { |
| 918 | hash_update( $context, $urlParts['path'] ); |
| 919 | } |
| 920 | return hash_final( $context ); |
| 921 | } |
| 922 | |
| 923 | } |