Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 152 |
|
0.00% |
0 / 15 |
CRAP | |
0.00% |
0 / 1 |
DatabaseTtmServer | |
0.00% |
0 / 152 |
|
0.00% |
0 / 15 |
1260 | |
0.00% |
0 / 1 |
getDB | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
update | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
72 | |||
insertSource | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
filterForFulltext | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
30 | |||
beginBootstrap | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
beginBatch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
batchInsertDefinitions | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
batchInsertTranslations | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
6 | |||
endBatch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
endBootstrap | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isLocalSuggestion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
expandLocation | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
query | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
6 | |||
processQueryResults | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
30 | |||
setDoReIndex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace MediaWiki\Extension\Translate\TtmServer; |
5 | |
6 | use MediaWiki\Extension\Translate\MessageLoading\MessageHandle; |
7 | use MediaWiki\Extension\Translate\Utilities\StringComparators\EditDistanceStringComparator; |
8 | use MediaWiki\MediaWikiServices; |
9 | use MediaWiki\Title\Title; |
10 | use MediaWiki\WikiMap\WikiMap; |
11 | use TTMServer; |
12 | use Wikimedia\Rdbms\DBQueryError; |
13 | use Wikimedia\Rdbms\IDatabase; |
14 | use Wikimedia\Rdbms\IResultWrapper; |
15 | |
/**
 * MySQL/MariaDB-based backend for translation memory.
 * @author Niklas Laxström
 * @copyright Copyright © 2012-2013, Niklas Laxström
 * @license GPL-2.0-or-later
 * @ingroup TTMServer
 */
class DatabaseTtmServer extends TTMServer implements WritableTtmServer, ReadableTtmServer {
	/**
	 * Source ids (tms_sid) inserted by batchInsertDefinitions(), indexed by batch key.
	 * Initialised to an empty array so that reading it before beginBatch() cannot
	 * hit an uninitialised typed property.
	 * @var int[]
	 */
	private array $sids = [];

	/**
	 * Get a connection to the database named in this server's configuration.
	 *
	 * @param int $mode DB_REPLICA (default) or DB_PRIMARY
	 * @return IDatabase
	 */
	private function getDB( int $mode = DB_REPLICA ): IDatabase {
		return MediaWikiServices::getInstance()->getDBLoadBalancer()->getConnection(
			$mode, 'ttmserver', $this->config['database']
		);
	}

	/**
	 * Replace the stored translation of one message in one language.
	 *
	 * @param MessageHandle $handle Message whose translation changed
	 * @param string|null $targetText New translation, or null to only remove the old one
	 * @return bool False when nothing was done (invalid handle, empty language code,
	 *  target language equals the source language, or missing/blank definition);
	 *  true after the old translation was deleted and the new one (if any) inserted.
	 */
	public function update( MessageHandle $handle, ?string $targetText ): bool {
		if ( !$handle->isValid() || $handle->getCode() === '' ) {
			return false;
		}

		$mKey = $handle->getKey();
		$group = $handle->getGroup();
		$targetLanguage = $handle->getCode();
		$sourceLanguage = $group->getSourceLanguage();

		// Skip definitions to not slow down mass imports etc.
		// These will be added when the first translation is made
		if ( $targetLanguage === $sourceLanguage ) {
			return false;
		}

		$definition = $group->getMessage( $mKey, $sourceLanguage );
		if ( !is_string( $definition ) || !strlen( trim( $definition ) ) ) {
			return false;
		}

		$context = Title::makeTitle( $handle->getTitle()->getNamespace(), $mKey );
		$dbw = $this->getDB( DB_PRIMARY );
		/* Check that the definition exists and fetch the sid. If not, add
		 * the definition and retrieve the sid. If the definition changes,
		 * we will create a new entry - otherwise we could at some point
		 * get suggestions which do not match the original definition any
		 * longer. The old translations are still kept until purged by
		 * rerunning the bootstrap script. */
		$sid = $dbw->newSelectQueryBuilder()
			->select( 'tms_sid' )
			->from( 'translate_tms' )
			->where( [
				'tms_context' => $context->getPrefixedText(),
				'tms_text' => $definition,
			] )
			->caller( __METHOD__ )
			->fetchField();
		if ( $sid === false ) {
			$sid = $this->insertSource( $context, $sourceLanguage, $definition );
		}

		// Delete old translations for this message if any. Could also use replace
		$deleteConditions = [
			'tmt_sid' => $sid,
			'tmt_lang' => $targetLanguage,
		];
		$dbw->delete( 'translate_tmt', $deleteConditions, __METHOD__ );

		// Insert the new translation
		if ( $targetText !== null ) {
			$row = $deleteConditions + [
				'tmt_text' => $targetText,
			];

			$dbw->insert( 'translate_tmt', $row, __METHOD__ );
		}

		return true;
	}

	/**
	 * Insert a source text (definition) row into translate_tms and, when the text
	 * yields fulltext tokens, a matching row into translate_tmf.
	 *
	 * @param Title $context Title identifying the message
	 * @param string $sourceLanguage Language code of the definition
	 * @param string $text Definition text
	 * @return int Insert id of the new translate_tms row
	 */
	private function insertSource( Title $context, string $sourceLanguage, string $text ): int {
		$row = [
			'tms_lang' => $sourceLanguage,
			'tms_len' => mb_strlen( $text ),
			'tms_text' => $text,
			'tms_context' => $context->getPrefixedText(),
		];

		$dbw = $this->getDB( DB_PRIMARY );
		$dbw->insert( 'translate_tms', $row, __METHOD__ );
		$sid = $dbw->insertId();

		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( count( $fulltext ) ) {
			$row = [
				'tmf_sid' => $sid,
				'tmf_text' => implode( ' ', $fulltext ),
			];
			$dbw->insert( 'translate_tmf', $row, __METHOD__ );
		}

		return $sid;
	}

	/**
	 * Tokenizes the text for fulltext search. Tries to find the most useful tokens.
	 *
	 * Non-alphanumeric characters are replaced by spaces, the text is segmented and
	 * lower-cased by the language object and then split on whitespace. Texts with
	 * fewer than four segments produce no tokens. Segments whose byte length is
	 * below 4 or above 15 are dropped, and at most ten unique segments are returned.
	 *
	 * @param string $languageCode Language code of the text
	 * @param string $input Text to tokenize
	 * @return string[] List of tokens; empty when the text is not suitable
	 */
	protected function filterForFulltext( string $languageCode, string $input ): array {
		$lang = MediaWikiServices::getInstance()->getLanguageFactory()->getLanguage( $languageCode );

		// preg_replace() returns null on failure (for example invalid UTF-8
		// together with the /u modifier). Treat that as "no usable tokens"
		// instead of passing null on to the language helpers.
		$text = preg_replace( '/[^[:alnum:]]/u', ' ', $input );
		if ( $text === null ) {
			return [];
		}

		$text = $lang->segmentByWord( $text );
		$text = $lang->lc( $text );
		$segments = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
		if ( $segments === false || count( $segments ) < 4 ) {
			return [];
		}

		foreach ( $segments as $i => $segment ) {
			// Yes strlen
			$len = strlen( $segment );
			if ( $len < 4 || $len > 15 ) {
				unset( $segments[$i] );
			}
		}

		return array_slice( array_unique( $segments ), 0, 10 );
	}

	/**
	 * Prepare for a full rebuild: empty all three translation memory tables and
	 * drop the fulltext index on translate_tmf (it is recreated in endBootstrap()).
	 */
	public function beginBootstrap(): void {
		$dbw = $this->getDB( DB_PRIMARY );
		$dbw->delete( 'translate_tms', IDatabase::ALL_ROWS, __METHOD__ );
		$dbw->delete( 'translate_tmt', IDatabase::ALL_ROWS, __METHOD__ );
		$dbw->delete( 'translate_tmf', IDatabase::ALL_ROWS, __METHOD__ );
		$table = $dbw->tableName( 'translate_tmf' );
		try {
			$dbw->query( "DROP INDEX tmf_text ON $table", __METHOD__ );
		} catch ( DBQueryError $e ) {
			// Deliberately ignored: perhaps the script was aborted before it
			// got chance to add the index back.
		}
	}

	/** Start a new batch by forgetting the source ids of the previous one. */
	public function beginBatch(): void {
		$this->sids = [];
	}

	/**
	 * Insert the definitions of a batch and remember their source ids by batch key
	 * for the following batchInsertTranslations() calls.
	 *
	 * @param array $batch Map of batch key to [ MessageHandle, language code, text ]
	 */
	public function batchInsertDefinitions( array $batch ): void {
		$mwInstance = MediaWikiServices::getInstance();
		$titleFactory = $mwInstance->getTitleFactory();
		foreach ( $batch as $key => $item ) {
			[ $handle, $language, $text ] = $item;
			$context = $titleFactory->makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
			$this->sids[$key] = $this->insertSource( $context, $language, $text );
		}

		$mwInstance->getDBLoadBalancerFactory()->waitForReplication( [ 'ifWritesSince' => 10 ] );
	}

	/**
	 * Insert the translations of a batch, linking each one to the source id stored
	 * under the same batch key by batchInsertDefinitions().
	 *
	 * @param array $batch Map of batch key to [ (unused), language code, text ]
	 */
	public function batchInsertTranslations( array $batch ): void {
		$rows = [];
		foreach ( $batch as $key => $data ) {
			[ , $language, $text ] = $data;
			$rows[] = [
				'tmt_sid' => $this->sids[$key],
				'tmt_lang' => $language,
				'tmt_text' => $text,
			];
		}

		$dbw = $this->getDB( DB_PRIMARY );
		$dbw->insert( 'translate_tmt', $rows, __METHOD__ );

		MediaWikiServices::getInstance()
			->getDBLoadBalancerFactory()
			->waitForReplication( [ 'ifWritesSince' => 10 ] );
	}

	/** Nothing to finalise per batch for this backend. */
	public function endBatch(): void {
	}

	/** Finish a full rebuild by recreating the fulltext index on translate_tmf. */
	public function endBootstrap(): void {
		$dbw = $this->getDB( DB_PRIMARY );
		$table = $dbw->tableName( 'translate_tmf' );
		$dbw->query( "CREATE FULLTEXT INDEX tmf_text ON $table (tmf_text)", __METHOD__ );
	}

	/* Reading interface */

	/**
	 * @param array $suggestion Suggestion as produced by query()
	 * @return bool Always true for this backend
	 */
	public function isLocalSuggestion( array $suggestion ): bool {
		return true;
	}

	/**
	 * Turn the 'location' of a suggestion into a canonical URL.
	 *
	 * @param array $suggestion Suggestion as produced by query(); 'location' is read
	 * @return string Canonical URL of the title named by the location
	 * @throws \InvalidArgumentException When the location is not a valid title
	 */
	public function expandLocation( array $suggestion ): string {
		$title = Title::newFromText( $suggestion['location'] );
		if ( $title === null ) {
			// Title::newFromText() returns null for unparsable input; fail with a
			// clear message instead of a "member function on null" fatal.
			throw new \InvalidArgumentException(
				"Suggestion location is not a valid title: {$suggestion['location']}"
			);
		}

		return $title->getCanonicalURL();
	}

	/**
	 * Find translation suggestions for a text.
	 *
	 * @param string $sourceLanguage Language code of $text
	 * @param string $targetLanguage Language code of the wanted translations
	 * @param string $text Text to find suggestions for
	 * @return array Suggestions as returned by processQueryResults()
	 */
	public function query( string $sourceLanguage, string $targetLanguage, string $text ): array {
		// Calculate the bounds of the string length which are able
		// to satisfy the cutoff percentage in edit distance.
		// Cast to int so that only plain integers end up in the SQL below.
		$len = mb_strlen( $text );
		$min = (int)ceil( max( $len * $this->config['cutoff'], 2 ) );
		$max = (int)floor( $len / $this->config['cutoff'] );

		$dbr = $this->getDB();
		$tables = [ 'translate_tmt', 'translate_tms' ];
		$fields = [ 'tms_context', 'tms_text', 'tmt_lang', 'tmt_text' ];

		$conditions = [
			'tms_lang' => $sourceLanguage,
			'tmt_lang' => $targetLanguage,
			"tms_len BETWEEN $min AND $max",
			'tms_sid = tmt_sid',
		];

		// Use the fulltext index to narrow the results further when the text
		// yields usable tokens.
		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( $fulltext ) {
			$tables[] = 'translate_tmf';
			$list = implode( ' ', $fulltext );
			$conditions[] = 'tmf_sid = tmt_sid';
			// Let the database layer quote the search string rather than
			// relying on the tokenizer having removed all special characters.
			$conditions[] = 'MATCH(tmf_text) AGAINST( ' . $dbr->addQuotes( $list ) . ' )';
		}

		$res = $dbr->newSelectQueryBuilder()
			->tables( $tables )
			->select( $fields )
			->where( $conditions )
			->caller( __METHOD__ )
			->fetchResultSet();

		return $this->processQueryResults( $res, $text, $targetLanguage );
	}

	/**
	 * Score the candidate rows against the text and keep those reaching the
	 * configured cutoff. Processing stops after five seconds.
	 *
	 * @param IResultWrapper $res Rows with tms_text, tmt_text and tms_context
	 * @param string $text Text the suggestions are for
	 * @param string $targetLanguage Language code appended to each location
	 * @return array Suggestions passed through TTMServer::sortSuggestions()
	 */
	private function processQueryResults( IResultWrapper $res, string $text, string $targetLanguage ): array {
		$timeLimit = microtime( true ) + 5;

		$lenA = mb_strlen( $text );
		$results = [];
		$stringComparator = new EditDistanceStringComparator();
		foreach ( $res as $row ) {
			if ( microtime( true ) > $timeLimit ) {
				// Having no suggestions is better than preventing translation
				// altogether by timing out the request :(
				break;
			}

			$a = $text;
			$b = $row->tms_text;
			$lenB = mb_strlen( $b );
			$len = min( $lenA, $lenB );
			if ( $len === 0 ) {
				// An empty string cannot be a meaningful match and would
				// cause a division by zero in the quality calculation.
				continue;
			}

			if ( $len > 600 ) {
				// Too expensive to compare, assume the worst distance:
				// two strings of length 1500 ~ 10s
				// two strings of length 2250 ~ 30s
				$dist = $len;
			} else {
				$dist = $stringComparator->levenshtein( $a, $b, $lenA, $lenB );
			}
			$quality = 1 - ( $dist * 0.9 / $len );

			if ( $quality >= $this->config['cutoff'] ) {
				$results[] = [
					'source' => $row->tms_text,
					'target' => $row->tmt_text,
					'context' => $row->tms_context,
					'location' => $row->tms_context . '/' . $targetLanguage,
					'quality' => $quality,
					'wiki' => $row->tms_wiki ?? WikiMap::getCurrentWikiId(),
				];
			}
		}

		return TTMServer::sortSuggestions( $results );
	}

	/** Re-indexing is not applicable to this backend; intentionally a no-op. */
	public function setDoReIndex(): void {
	}
}