Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
EntitySearch.php
1<?php
2declare( strict_types = 1 );
3
4namespace MediaWiki\Extension\Translate\TranslatorInterface;
5
6use Collation;
7use MalformedTitleException;
10use NamespaceInfo;
11use SplMinHeap;
12use TitleFormatter;
13use TitleParser;
14use WANObjectCache;
15use Wikimedia\LightweightObjectStore\ExpirationAwareness;
16
// Separates the label from the group id inside one row of the static message group haystack
24 private const FIELD_DELIMITER = "\x7F";
// Terminates each row of a haystack string (one entity per row)
25 private const ROW_DELIMITER = "\n";
26
/** @var WANObjectCache */
28 private $cache;
/** @var Collation */
30 private $collation;
/** @var MessageGroups */
32 private $messageGroupFactory;
/** @var NamespaceInfo */
34 private $namespaceInfo;
/** @var MessageIndex */
36 private $messageIndex;
/** @var TitleParser */
38 private $titleParser;
/** @var TitleFormatter */
40 private $titleFormatter;
41
/**
 * Keep references to the injected services used by the search methods.
 *
 * @param WANObjectCache $cache Cache holding the generated haystack strings
 * @param Collation $collation Provides the sort keys used to order haystack rows
 * @param MessageGroups $messageGroupFactory Source of message groups and their cache check key
 * @param NamespaceInfo $namespaceInfo Resolves namespace ids to canonical names
 * @param MessageIndex $messageIndex Source of message keys and their cache check key
 * @param TitleParser $titleParser Parses result labels into titles
 * @param TitleFormatter $titleFormatter Formats parsed titles for display
 */
public function __construct(
	WANObjectCache $cache,
	Collation $collation,
	MessageGroups $messageGroupFactory,
	NamespaceInfo $namespaceInfo,
	MessageIndex $messageIndex,
	TitleParser $titleParser,
	TitleFormatter $titleFormatter
) {
	// Data sources for the haystacks
	$this->messageGroupFactory = $messageGroupFactory;
	$this->messageIndex = $messageIndex;
	$this->namespaceInfo = $namespaceInfo;

	// Ordering and caching of the haystacks
	$this->collation = $collation;
	$this->cache = $cache;

	// Pretty-printing of search results
	$this->titleParser = $titleParser;
	$this->titleFormatter = $titleFormatter;
}
59
/**
 * Search the static message groups by label.
 *
 * Labels that start with the query are listed first. If there is still room,
 * labels that contain the query at a word boundary follow. Matching is
 * case-insensitive and the query is treated as literal text, not as a
 * regular expression.
 *
 * @param string $query Text to search for
 * @param int $maxResults Maximum number of results to return
 * @return array[] List of [ 'label' => string, 'group' => string ] entries
 */
public function searchStaticMessageGroups( string $query, int $maxResults ): array {
	// The limit is only checked after a result has been added, so without this
	// guard a non-positive limit would still return the first match.
	if ( $maxResults <= 0 ) {
		return [];
	}

	$cache = $this->cache;
	// None of the static groups currently use language-dependent labels. This
	// may need revisiting later and splitting the cache by language.
	$key = $cache->makeKey( 'Translate', 'EntitySearch', 'static-groups' );
	$haystack = $cache->getWithSetCallback(
		$key,
		ExpirationAwareness::TTL_WEEK,
		function (): string {
			return $this->getStaticMessageGroupsHaystack();
		},
		[
			// Calling touchCheckKey() on this key purges the cache
			'checkKeys' => [ $this->messageGroupFactory->getCacheKey() ],
			// Avoid querying cache servers multiple times in a web request
			'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
		]
	);

	// Algorithm: The haystack is one big string with one entity per line. Run
	// preg_match_all over it up to twice: first to collect prefix matches (to
	// show them first), then to match words if more results are needed.
	$delimiter = self::FIELD_DELIMITER;
	$anything = "[^$delimiter\n]";
	$query = preg_quote( $query, '/' );
	$patterns = [
		// Prefix match
		"/^($query$anything*)$delimiter($anything+)$/miu",
		// Word match
		"/^($anything*\b$query$anything*)$delimiter($anything+)$/miu",
	];

	$results = [];
	foreach ( $patterns as $pattern ) {
		preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );
		foreach ( $matches as [ , $label, $groupId ] ) {
			// Index by $groupId to avoid duplicates from the prefix match and the word match
			$results[$groupId] = [
				'label' => $label,
				'group' => $groupId,
			];

			if ( count( $results ) >= $maxResults ) {
				// Enough results: skip the remaining matches and the remaining pattern
				break 2;
			}
		}
	}

	return array_values( $results );
}
118
/**
 * Search message prefixes.
 *
 * Each result is either a full message title, or a prefix followed by "*"
 * that stands for several messages, together with the number of messages
 * it covers. Matching is case-insensitive and the query is treated as
 * literal text that must start at a word boundary.
 *
 * @param string $query Text to search for
 * @param int $maxResults Number of results after which processing stops
 *  (the last prefix is still counted in full)
 * @return array[] List of [ 'pattern' => string, 'count' => int ] entries
 */
public function searchMessages( string $query, int $maxResults ): array {
	// Optimized based on requirements:
	// * "Natural" sorting of results
	// * No need to show which message group things belong to
	// * Match at any point in the message
	// * Return full keys of prefixes that match multiple messages

	// Algorithm: Construct one big string (haystack) with one entity per line.
	// Then run preg_match_all over it. Because we will have many more matches
	// than search results, this may be more efficient than calling preg_match
	// repeatedly in a loop. On the other hand, it can use a lot of memory to
	// construct the array for all the matches.
	$haystack = $this->getMessagesHaystack();
	$rowDelimiter = self::ROW_DELIMITER;
	$anything = "[^$rowDelimiter]";
	$query = preg_quote( $query, '/' );

	// Word match. The capture group is the line up to and including the query.
	$pattern = "/^($anything*\b$query)$anything*$/miu";
	preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );

	// Each entry: [ label, number of messages, is exact match, first full line seen ]
	$results = [];
	$previousPrefixMatch = null;
	foreach ( $matches as [ $full, $prefixMatch ] ) {
		// This is a bit tricky. If we are at the maximum results, continue processing
		// until the prefix changes, to get an accurate count
		if ( count( $results ) >= $maxResults && $previousPrefixMatch !== $prefixMatch ) {
			break;
		}

		if ( $full === $prefixMatch ) {
			$results[$full] = [ $full, 1, true, $full ];
		} else {
			$wildcard = "$prefixMatch*";
			if ( !isset( $results[$wildcard] ) ) {
				$results[$wildcard] = [ $wildcard, 0, false, $full ];
			}
			$results[$wildcard][1]++;
		}
		$previousPrefixMatch = $prefixMatch;
	}

	// Build the output list by value; no reference into $results is left behind.
	$output = [];
	foreach ( $results as [ $label, $count, $isExact, $full ] ) {
		// Convert partial matches with single results to full match
		$text = ( $count === 1 && !$isExact ) ? $full : $label;

		// Pretty format the title when it parses; otherwise keep the raw text
		try {
			$title = $this->titleParser->parseTitle( $text );
			$text = $this->titleFormatter->getPrefixedText( $title );
		} catch ( MalformedTitleException $e ) {
			// Deliberately ignored: $text still holds the unformatted value
		}

		$output[] = [
			'pattern' => $text,
			'count' => $count
		];
	}

	return $output;
}
184
/**
 * Match messages matching a pattern.
 *
 * Every "*" in the query stands for any run of characters within one row;
 * all other characters are matched literally and case-insensitively. The
 * pattern is anchored at the start of a row only, so each returned string is
 * the portion of the row that the pattern covered.
 *
 * @param string $query Pattern, optionally containing "*" wildcards
 * @return string[] Matched strings in haystack order
 */
public function matchMessages( string $query ): array {
	$wildcard = '[^' . self::ROW_DELIMITER . ']*';

	// Quote the literal pieces between the wildcards one by one, then glue
	// them back together with the wildcard expression.
	$literalParts = array_map(
		static function ( string $part ): string {
			return preg_quote( $part, '/' );
		},
		explode( '*', $query )
	);
	$regex = '/^' . implode( $wildcard, $literalParts ) . '/miu';

	preg_match_all( $regex, $this->getMessagesHaystack(), $rows, PREG_SET_ORDER );

	return array_column( $rows, 0 );
}
201
/**
 * Build the searchable string for message groups.
 *
 * The result has one row per group, "<label><FIELD_DELIMITER><group id><ROW_DELIMITER>",
 * with rows ordered by the collation sort key of the label.
 *
 * @return string
 */
private function getStaticMessageGroupsHaystack(): string {
	// Characters that would break row or field boundaries are dropped from labels
	$stripMap = [ self::FIELD_DELIMITER => '', self::ROW_DELIMITER => '' ];

	// Different groups may share a label (or sort key), so a heap of tuples
	// is used for ordering rather than a map keyed by sort key.
	$heap = new SplMinHeap();
	foreach ( $this->messageGroupFactory->getGroups() as $messageGroup ) {
		$cleanLabel = strtr( $messageGroup->getLabel(), $stripMap );
		$heap->insert( [
			$this->collation->getSortKey( $cleanLabel ),
			$cleanLabel,
			$messageGroup->getId()
		] );
	}

	$rows = '';
	while ( !$heap->isEmpty() ) {
		[ , $rowLabel, $rowGroupId ] = $heap->extract();
		$rows .= $rowLabel . self::FIELD_DELIMITER . $rowGroupId . self::ROW_DELIMITER;
	}

	return $rows;
}
222
/**
 * Build the searchable string for message keys, bypassing the cache.
 *
 * The result has one row per message index key, "<canonical namespace name>:<key><ROW_DELIMITER>",
 * with rows ordered by the collation sort key of the row text.
 *
 * @return string
 */
private function getMessageHaystackUncached(): string {
	// Namespace id => canonical name, filled lazily while iterating
	$canonicalNames = [];
	$heap = new SplMinHeap();

	foreach ( $this->messageIndex->getKeys() as $indexKey ) {
		// Normalize "_" to " " so that \b in regexp matches words separated by underscores
		$spacedKey = strtr( $indexKey, '_', ' ' );

		[ $namespaceId, $messageKey ] = explode( ':', $spacedKey, 2 );
		$canonicalNames[$namespaceId] = $canonicalNames[$namespaceId]
			?? $this->namespaceInfo->getCanonicalName( (int)$namespaceId );

		// Ensure there are no special chars that will break matching
		$rowText = str_replace(
			self::ROW_DELIMITER,
			'',
			$canonicalNames[$namespaceId] . ":$messageKey"
		);

		$heap->insert( [ $this->collation->getSortKey( $rowText ), $rowText ] );
	}

	$rows = '';
	while ( !$heap->isEmpty() ) {
		[ , $rowText ] = $heap->extract();
		$rows .= $rowText . self::ROW_DELIMITER;
	}

	return $rows;
}
250
/**
 * Get the searchable string for message keys, using the cache when possible.
 *
 * @return string
 */
private function getMessagesHaystack(): string {
	$options = [
		// Calling touchCheckKey() on this key purges the cache
		'checkKeys' => [ $this->messageIndex->getStatusCacheKey() ],
		// Avoid querying cache servers multiple times in a web request
		'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
	];

	// This can get rather large. On translatewiki.net it is multiple megabytes
	// uncompressed. With compression (assumed to happen implicitly in the
	// caching layer) it's under a megabyte.
	$regenerate = function (): string {
		return $this->getMessageHaystackUncached();
	};

	return $this->cache->getWithSetCallback(
		$this->cache->makeKey( 'Translate', 'EntitySearch', 'messages' ),
		ExpirationAwareness::TTL_WEEK,
		$regenerate,
		$options
	);
}
271}
// Service wiring table: maps each 'Translate:*' service name to a static factory closure
// that constructs the service. Factories that declare a MediaWikiServices parameter pull
// their dependencies (core services and other 'Translate:*' services) from it.
// NOTE(review): this return statement follows the closing brace of the class above and
// looks like it originates from a separate service wiring file — confirm before relying
// on it in this location.
return[ 'Translate:ConfigHelper'=> static function():ConfigHelper { return new ConfigHelper();}, 'Translate:CsvTranslationImporter'=> static function(MediaWikiServices $services):CsvTranslationImporter { return new CsvTranslationImporter( $services->getWikiPageFactory());}, 'Translate:EntitySearch'=> static function(MediaWikiServices $services):EntitySearch { return new EntitySearch($services->getMainWANObjectCache(), $services->getCollationFactory() ->makeCollation( 'uca-default-u-kn'), MessageGroups::singleton(), $services->getNamespaceInfo(), $services->get( 'Translate:MessageIndex'), $services->getTitleParser(), $services->getTitleFormatter());}, 'Translate:ExternalMessageSourceStateImporter'=> static function(MediaWikiServices $services):ExternalMessageSourceStateImporter { return new ExternalMessageSourceStateImporter($services->getMainConfig(), $services->get( 'Translate:GroupSynchronizationCache'), $services->getJobQueueGroup(), LoggerFactory::getInstance( 'Translate.GroupSynchronization'), $services->get( 'Translate:MessageIndex'));}, 'Translate:FileFormatFactory'=> static function(MediaWikiServices $services):FileFormatFactory { return new FileFormatFactory( $services->getObjectFactory());}, 'Translate:GroupSynchronizationCache'=> static function(MediaWikiServices $services):GroupSynchronizationCache { return new GroupSynchronizationCache( $services->get( 'Translate:PersistentCache'));}, 'Translate:HookRunner'=> static function(MediaWikiServices $services):HookRunner { return new HookRunner( $services->getHookContainer());}, 'Translate:MessageBundleStore'=> static function(MediaWikiServices $services):MessageBundleStore { return new MessageBundleStore($services->get( 'Translate:RevTagStore'), $services->getJobQueueGroup(), $services->getLanguageNameUtils(), $services->get( 'Translate:MessageIndex'));}, 'Translate:MessageGroupReviewStore'=> static function(MediaWikiServices $services):MessageGroupReviewStore { return new 
MessageGroupReviewStore($services->getDBLoadBalancer(), $services->get( 'Translate:HookRunner'));}, 'Translate:MessageGroupStatsTableFactory'=> static function(MediaWikiServices $services):MessageGroupStatsTableFactory { return new MessageGroupStatsTableFactory($services->get( 'Translate:ProgressStatsTableFactory'), $services->getDBLoadBalancer(), $services->getLinkRenderer(), $services->get( 'Translate:MessageGroupReviewStore'), $services->getMainConfig() ->get( 'TranslateWorkflowStates') !==false);}, 'Translate:MessageIndex'=> static function(MediaWikiServices $services):MessageIndex { $params=$services->getMainConfig() ->get( 'TranslateMessageIndex');if(is_string( $params)) { $params=(array) $params;} $class=array_shift( $params);return new $class( $params);}, 'Translate:MessagePrefixStats'=> static function(MediaWikiServices $services):MessagePrefixStats { return new MessagePrefixStats( $services->getTitleParser());}, 'Translate:ParsingPlaceholderFactory'=> static function():ParsingPlaceholderFactory { return new ParsingPlaceholderFactory();}, 'Translate:PersistentCache'=> static function(MediaWikiServices $services):PersistentCache { return new PersistentDatabaseCache($services->getDBLoadBalancer(), $services->getJsonCodec());}, 'Translate:ProgressStatsTableFactory'=> static function(MediaWikiServices $services):ProgressStatsTableFactory { return new ProgressStatsTableFactory($services->getLinkRenderer(), $services->get( 'Translate:ConfigHelper'));}, 'Translate:RevTagStore'=> static function(MediaWikiServices $services):RevTagStore { return new RevTagStore($services->getDBLoadBalancerFactory());}, 'Translate:SubpageListBuilder'=> static function(MediaWikiServices $services):SubpageListBuilder { return new SubpageListBuilder($services->get( 'Translate:TranslatableBundleFactory'), $services->getLinkBatchFactory());}, 'Translate:TranslatableBundleExporter'=> static function(MediaWikiServices $services):TranslatableBundleExporter { return new 
TranslatableBundleExporter($services->get( 'Translate:SubpageListBuilder'), $services->getWikiExporterFactory(), $services->getDBLoadBalancer());}, 'Translate:TranslatableBundleFactory'=> static function(MediaWikiServices $services):TranslatableBundleFactory { return new TranslatableBundleFactory($services->get( 'Translate:TranslatablePageStore'), $services->get( 'Translate:MessageBundleStore'));}, 'Translate:TranslatableBundleImporter'=> static function(MediaWikiServices $services):TranslatableBundleImporter { return new TranslatableBundleImporter($services->getWikiImporterFactory(), $services->get( 'Translate:TranslatablePageParser'), $services->getRevisionLookup());}, 'Translate:TranslatableBundleMover'=> static function(MediaWikiServices $services):TranslatableBundleMover { return new TranslatableBundleMover($services->getMovePageFactory(), $services->getJobQueueGroup(), $services->getLinkBatchFactory(), $services->get( 'Translate:TranslatableBundleFactory'), $services->get( 'Translate:SubpageListBuilder'), $services->getMainConfig() ->get( 'TranslatePageMoveLimit'));}, 'Translate:TranslatableBundleStatusStore'=> static function(MediaWikiServices $services):TranslatableBundleStatusStore { return new TranslatableBundleStatusStore($services->getDBLoadBalancer() ->getConnection(DB_PRIMARY), $services->getCollationFactory() ->makeCollation( 'uca-default-u-kn'), $services->getDBLoadBalancer() ->getMaintenanceConnectionRef(DB_PRIMARY));}, 'Translate:TranslatablePageParser'=> static function(MediaWikiServices $services):TranslatablePageParser { return new TranslatablePageParser($services->get( 'Translate:ParsingPlaceholderFactory'));}, 'Translate:TranslatablePageStore'=> static function(MediaWikiServices $services):TranslatablePageStore { return new TranslatablePageStore($services->get( 'Translate:MessageIndex'), $services->getJobQueueGroup(), $services->get( 'Translate:RevTagStore'), $services->getDBLoadBalancer(), $services->get( 
'Translate:TranslatableBundleStatusStore'), $services->get( 'Translate:TranslatablePageParser'),);}, 'Translate:TranslationStashReader'=> static function(MediaWikiServices $services):TranslationStashReader { $db=$services->getDBLoadBalancer() ->getConnection(DB_REPLICA);return new TranslationStashStorage( $db);}, 'Translate:TranslationStatsDataProvider'=> static function(MediaWikiServices $services):TranslationStatsDataProvider { return new TranslationStatsDataProvider(new ServiceOptions(TranslationStatsDataProvider::CONSTRUCTOR_OPTIONS, $services->getMainConfig()), $services->getObjectFactory(), $services->getDBLoadBalancer());}, 'Translate:TranslationUnitStoreFactory'=> static function(MediaWikiServices $services):TranslationUnitStoreFactory { return new TranslationUnitStoreFactory( $services->getDBLoadBalancer());}, 'Translate:TranslatorActivity'=> static function(MediaWikiServices $services):TranslatorActivity { $query=new TranslatorActivityQuery($services->getMainConfig(), $services->getDBLoadBalancer());return new TranslatorActivity($services->getMainObjectStash(), $query, $services->getJobQueueGroup());}, 'Translate:TtmServerFactory'=> static function(MediaWikiServices $services):TtmServerFactory { $config=$services->getMainConfig();$default=$config->get( 'TranslateTranslationDefaultService');if( $default===false) { $default=null;} return new TtmServerFactory( $config->get( 'TranslateTranslationServices'), $default);}]
@phpcs-require-sorted-array
Factory class for accessing message groups individually by id or all of them as a list.
Service for searching message groups and message keys.
searchMessages(string $query, int $maxResults)
Search message prefixes.
matchMessages(string $query)
Match messages matching a pattern.
Creates a database of keys in all groups, so that namespace and key can be used to get the groups they belong to.