Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
87.77% |
122 / 139 |
|
62.50% |
5 / 8 |
CRAP | |
0.00% |
0 / 1 |
EntitySearch | |
87.77% |
122 / 139 |
|
62.50% |
5 / 8 |
30.54 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
searchStaticMessageGroups | |
92.86% |
39 / 42 |
|
0.00% |
0 / 1 |
7.02 | |||
searchMessages | |
86.67% |
26 / 30 |
|
0.00% |
0 / 1 |
11.29 | |||
matchMessages | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
getStaticMessageGroupsHaystack | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
3 | |||
getMessageHaystackUncached | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
4 | |||
getMessagesHaystack | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
getGroupTypes | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace MediaWiki\Extension\Translate\TranslatorInterface; |
5 | |
6 | use AggregateMessageGroup; |
7 | use Collation; |
8 | use MalformedTitleException; |
9 | use MediaWiki\Extension\Translate\MessageBundleTranslation\MessageBundleMessageGroup; |
10 | use MediaWiki\Extension\Translate\MessageGroupProcessing\MessageGroups; |
11 | use MediaWiki\Extension\Translate\MessageLoading\MessageIndex; |
12 | use NamespaceInfo; |
13 | use SplMinHeap; |
14 | use TitleFormatter; |
15 | use TitleParser; |
16 | use WANObjectCache; |
17 | use Wikimedia\LightweightObjectStore\ExpirationAwareness; |
18 | use WikiPageMessageGroup; |
19 | |
/**
 * Service for searching message groups and message keys.
 *
 * Both searches operate on a "haystack": one big string with one entity per row. The
 * haystacks are built once, stored in the object cache and then scanned with regular
 * expressions.
 *
 * @author Niklas Laxström
 * @license GPL-2.0-or-later
 * @since 2021.10
 */
class EntitySearch {
	/** Separates the fields of a row in the message group haystack */
	private const FIELD_DELIMITER = "\x7F";
	/** Separates the rows of both haystacks */
	private const ROW_DELIMITER = "\n";

	private WANObjectCache $cache;
	private Collation $collation;
	private MessageGroups $messageGroupFactory;
	private NamespaceInfo $namespaceInfo;
	private MessageIndex $messageIndex;
	private TitleParser $titleParser;
	private TitleFormatter $titleFormatter;
	private const TYPE_AGGREGATE = AggregateMessageGroup::class;
	private const TYPE_MESSAGE_BUNDLE = MessageBundleMessageGroup::class;
	private const TYPE_WIKIPAGE = WikiPageMessageGroup::class;
	/** Type code stored in the haystack for groups whose class is not listed in TYPE_MAPPING */
	private const TYPE_OTHERS = 'o';
	/** Message group class => [ public type name, one character type code used in the haystack ] */
	private const TYPE_MAPPING = [
		self::TYPE_AGGREGATE => [ 'aggregate-groups', 'a' ],
		self::TYPE_MESSAGE_BUNDLE => [ 'message-bundles', 'm' ],
		self::TYPE_WIKIPAGE => [ 'translatable-pages', 'w' ]
	];
	/** @var array<string,string> Public type name => one character type code */
	private array $mappedTypes;

	public function __construct(
		WANObjectCache $cache,
		Collation $collation,
		MessageGroups $messageGroupFactory,
		NamespaceInfo $namespaceInfo,
		MessageIndex $messageIndex,
		TitleParser $titleParser,
		TitleFormatter $titleFormatter
	) {
		$this->cache = $cache;
		$this->collation = $collation;
		$this->messageGroupFactory = $messageGroupFactory;
		$this->namespaceInfo = $namespaceInfo;
		$this->messageIndex = $messageIndex;
		$this->titleParser = $titleParser;
		$this->titleFormatter = $titleFormatter;
		$this->mappedTypes = $this->getGroupTypes();
	}

	/**
	 * Search message groups by label.
	 *
	 * Groups whose label starts with the query are listed first, followed by groups whose
	 * label contains the query at a word boundary. Matching is case-insensitive and the
	 * query is matched literally.
	 *
	 * @param string $query Text to search for
	 * @param int $maxResults Maximum number of results to return; zero or less returns nothing
	 * @param string[] $types Type names (keys of getGroupTypes()) to restrict the search to.
	 *  An empty array means no restriction. Unknown names are ignored; if none of the given
	 *  names is known, nothing is returned.
	 * @return array[] List of [ 'label' => string, 'group' => string ]
	 */
	public function searchStaticMessageGroups( string $query, int $maxResults, array $types = [] ): array {
		if ( $maxResults <= 0 ) {
			// The limit is only checked after adding a result, so without this guard a
			// non-positive limit would still produce one result.
			return [];
		}

		$delimiter = self::FIELD_DELIMITER;
		$anything = "[^$delimiter\n]";

		$groupTypeFilter = $anything;
		if ( $types !== [] ) {
			$typeCodes = [];
			foreach ( $types as $filter ) {
				if ( is_string( $filter ) && isset( $this->mappedTypes[$filter] ) ) {
					$typeCodes[] = $this->mappedTypes[$filter];
				}
			}

			if ( $typeCodes === [] ) {
				// A filter was requested, but no group can have any of the requested types.
				// Building an empty character class "[]" here would break the pattern.
				return [];
			}

			$groupTypeFilter = '[' . implode( $typeCodes ) . ']';
		}

		$cache = $this->cache;
		// None of the static groups currently use language-dependent labels. This
		// may need revisiting later and splitting the cache by language.
		$key = $cache->makeKey( 'Translate', 'EntitySearch', 'static-groups', '-v2' );
		$haystack = $cache->getWithSetCallback(
			$key,
			ExpirationAwareness::TTL_WEEK,
			function (): string {
				return $this->getStaticMessageGroupsHaystack();
			},
			[
				// Calling touchCheckKey() on this key purges the cache
				'checkKeys' => [ $this->messageGroupFactory->getCacheKey() ],
				// Avoid querying cache servers multiple times in a web request
				'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
			]
		);

		// Algorithm: Construct one big string with one entity per line. Then run
		// preg_match_all twice over it, first to collect prefix match (to show them
		// first), then to match words if more results are needed.
		$query = preg_quote( $query, '/' );
		$patterns = [
			// Prefix match
			"/^$groupTypeFilter$delimiter($query$anything*)$delimiter($anything+)$/miu",
			// Word match
			"/^$groupTypeFilter$delimiter($anything*\b$query$anything*)$delimiter($anything+)$/miu",
		];

		$results = [];
		foreach ( $patterns as $pattern ) {
			// Pre-initialize: if the pattern fails to compile (e.g. the query is not valid
			// UTF-8) preg_match_all leaves $matches untouched.
			$matches = [];
			preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );
			foreach ( $matches as [ , $label, $groupId ] ) {
				// Index by $groupId to avoid duplicates from the prefix match and the word match
				$results[$groupId] = [
					'label' => $label,
					'group' => $groupId,
				];

				if ( count( $results ) >= $maxResults ) {
					return array_values( $results );
				}
			}
		}

		return array_values( $results );
	}

	/**
	 * Search message prefixes. Results are collapsed into prefix patterns when possible.
	 *
	 * @param string $query Text to search for; matched literally, case-insensitively and at
	 *  a word boundary
	 * @param int $maxResults Maximum number of results to return; zero or less returns nothing
	 * @return array[] List of [ 'pattern' => string, 'count' => int ], where pattern is either
	 *  a full message title or a prefix followed by '*', and count is the number of messages
	 *  the pattern stands for
	 */
	public function searchMessages( string $query, int $maxResults ): array {
		if ( $maxResults <= 0 ) {
			// Nothing could be returned; skip loading and scanning the haystack
			return [];
		}

		// Optimized based on requirements:
		// * "Natural" sorting of results
		// * No need to show which message group things belong to
		// * Match at any point in the message
		// * Return full keys of prefixes that match multiple messages

		// Algorithm: Construct one big string (haystack) with one entity per line.
		// Then run preg_match_all over it. Because we will have many more matches
		// than search results, this may be more efficient than calling preg_match
		// repeatedly in a loop. On the other hand, it can use a lot of memory to
		// construct the array for all the matches.
		$haystack = $this->getMessagesHaystack();
		$rowDelimiter = self::ROW_DELIMITER;
		$anything = "[^$rowDelimiter]";
		$query = preg_quote( $query, '/' );

		// Word match. The capture group holds the row up to and including the query.
		$regex = "/^($anything*\b$query)$anything*$/miu";
		// Pre-initialize: a pattern that fails to compile leaves $matches untouched
		$matches = [];
		preg_match_all( $regex, $haystack, $matches, PREG_SET_ORDER );

		// Each entry: [ display pattern, match count, is exact match, first full match ]
		$results = [];
		$previousPrefixMatch = null;
		foreach ( $matches as [ $full, $prefixMatch ] ) {
			// This is a bit tricky. If we are at the maximum results, continue processing
			// until the prefix changes, to get an accurate count
			if ( count( $results ) >= $maxResults && $previousPrefixMatch !== $prefixMatch ) {
				break;
			}

			if ( $full === $prefixMatch ) {
				$results[$full] = [ $full, 1, true, $full ];
			} else {
				if ( !isset( $results["$prefixMatch*"] ) ) {
					$results["$prefixMatch*"] = [ "$prefixMatch*", 0, false, $full ];
				}
				$results["$prefixMatch*"][1]++;
			}
			$previousPrefixMatch = $prefixMatch;
		}

		$formatted = [];
		foreach ( $results as [ $displayPattern, $count, $isExact, $full ] ) {
			// Convert partial matches with single results to full match
			$text = ( $count === 1 && !$isExact ) ? $full : $displayPattern;

			// Drop unnecessary fields, pretty format title
			try {
				$title = $this->titleParser->parseTitle( $text );
				$label = $this->titleFormatter->getPrefixedText( $title );
			} catch ( MalformedTitleException $e ) {
				$label = $text;
			}

			$formatted[] = [
				'pattern' => $label,
				'count' => $count
			];
		}

		return $formatted;
	}

	/**
	 * Match messages matching a pattern. '*' is the wildcard for anything.
	 *
	 * The whole message has to match the pattern: without a wildcard only the message equal
	 * to the query (ignoring case) is returned.
	 *
	 * @param string $query Pattern to match; everything except '*' is matched literally
	 * @return string[] Matching rows of the message haystack
	 */
	public function matchMessages( string $query ): array {
		$haystack = $this->getMessagesHaystack();
		$rowDelimiter = self::ROW_DELIMITER;
		$anything = "[^$rowDelimiter]*";

		// Quote the literal parts separately so that only the wildcards become regex syntax
		$literalParts = array_map(
			static function ( string $part ): string {
				return preg_quote( $part, '/' );
			},
			explode( '*', $query )
		);
		$query = implode( $anything, $literalParts );

		// Anchor both ends. Without the end anchor a query without a trailing wildcard
		// would match the beginning of longer messages and return truncated rows.
		$pattern = "/^$query$/miu";
		// Pre-initialize: a pattern that fails to compile leaves $matches untouched
		$matches = [];
		preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );
		return array_column( $matches, 0 );
	}

	/**
	 * Build the haystack of message groups, ordered by the collation sort key of the label.
	 *
	 * @return string Rows of "<type code><FIELD_DELIMITER><label><FIELD_DELIMITER><group id>",
	 *  each terminated by ROW_DELIMITER
	 */
	private function getStaticMessageGroupsHaystack(): string {
		$groups = $this->messageGroupFactory->getGroups();
		$data = new SplMinHeap();
		foreach ( $groups as $group ) {
			$label = $group->getLabel();
			// Ensure there are no special chars that will break matching
			$label = strtr( $label, [ self::FIELD_DELIMITER => '', self::ROW_DELIMITER => '' ] );
			$sortKey = $this->collation->getSortKey( $label );
			// It is unlikely that different groups have the same label (or sort key),
			// but it's possible, so cannot use a hashmap here.
			$groupType = get_class( $group );
			$type = self::TYPE_MAPPING[$groupType][1] ?? self::TYPE_OTHERS;
			// NOTE(review): the group id is stored as is; this assumes ids never contain
			// the field or row delimiter.
			$data->insert( [ $sortKey, $label, $group->getId(), $type ] );
		}

		$haystack = '';
		foreach ( $data as [ , $label, $groupId, $type ] ) {
			$haystack
				.= $type
				. self::FIELD_DELIMITER
				. $label
				. self::FIELD_DELIMITER
				. $groupId
				. self::ROW_DELIMITER;
		}
		return $haystack;
	}

	/**
	 * Build the haystack of message keys, ordered by collation sort key.
	 *
	 * @return string Rows of "<canonical namespace name>:<message key>", each terminated by
	 *  ROW_DELIMITER
	 */
	private function getMessageHaystackUncached(): string {
		// Namespace id => canonical namespace name, to look up each namespace only once
		$namespaceMap = [];
		$data = new SplMinHeap();
		$keys = $this->messageIndex->getKeys();
		foreach ( $keys as $key ) {
			// Normalize "_" to " " so that \b in regexp matches words separated by underscores
			$key = strtr( $key, [ '_' => ' ' ] );

			// NOTE(review): assumes every key has the form "<namespace id>:<message key>"
			[ $namespaceId, $label ] = explode( ':', $key, 2 );
			if ( !isset( $namespaceMap[$namespaceId] ) ) {
				$namespaceMap[$namespaceId] = $this->namespaceInfo->getCanonicalName( (int)$namespaceId );
			}
			$label = $namespaceMap[$namespaceId] . ":$label";

			// Ensure there are no special chars that will break matching
			$label = strtr( $label, [ self::ROW_DELIMITER => '' ] );
			$sortKey = $this->collation->getSortKey( $label );
			$data->insert( [ $sortKey, $label ] );
		}

		$haystack = '';
		foreach ( $data as [ , $label ] ) {
			$haystack .= $label . self::ROW_DELIMITER;
		}

		return $haystack;
	}

	/**
	 * Get the haystack of message keys from the cache, building it when it is not cached.
	 *
	 * @return string See getMessageHaystackUncached()
	 */
	private function getMessagesHaystack(): string {
		$cache = $this->cache;
		$key = $cache->makeKey( 'Translate', 'EntitySearch', 'messages' );
		return $cache->getWithSetCallback(
			$key,
			ExpirationAwareness::TTL_WEEK,
			function (): string {
				// This can get rather large. On translatewiki.net it is multiple megabytes
				// uncompressed. With compression (assumed to happen implicitly in the
				// caching layer) it's under a megabyte.
				return $this->getMessageHaystackUncached();
			},
			[
				// Calling touchCheckKey() on this key purges the cache
				'checkKeys' => [ $this->messageIndex->getStatusCacheKey() ],
				// Avoid querying cache servers multiple times in a web request
				'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
			]
		);
	}

	/**
	 * Get the message group types that searchStaticMessageGroups() can filter by.
	 *
	 * @return array<string,string> Public type name => one character type code
	 */
	public function getGroupTypes(): array {
		return array_column( self::TYPE_MAPPING, 1, 0 );
	}
}