Translate extension for MediaWiki
 
Loading...
Searching...
No Matches
EntitySearch.php
1<?php
2declare( strict_types = 1 );
3
4namespace MediaWiki\Extension\Translate\TranslatorInterface;
5
6use Collation;
7use MalformedTitleException;
10use NamespaceInfo;
11use SplMinHeap;
12use TitleFormatter;
13use TitleParser;
14use WANObjectCache;
15use Wikimedia\LightweightObjectStore\ExpirationAwareness;
16
24 private const FIELD_DELIMITER = "\x7F";
25 private const ROW_DELIMITER = "\n";
26
28 private $cache;
30 private $collation;
32 private $messageGroupFactory;
34 private $namespaceInfo;
36 private $messageIndex;
38 private $titleParser;
40 private $titleFormatter;
41
/**
 * Store the services this search helper depends on. No work is done here;
 * the haystacks are built lazily by the search methods.
 *
 * @param WANObjectCache $cache Cache that holds the pre-built haystack strings
 * @param Collation $collation Provides the sort keys used to order haystack rows
 * @param MessageGroups $messageGroupFactory Source of the message groups and of the
 *  check key for the group haystack
 * @param NamespaceInfo $namespaceInfo Maps namespace ids to canonical names
 * @param MessageIndex $messageIndex Source of the message keys and of the check key
 *  for the message haystack
 * @param TitleParser $titleParser Parses matched message labels into titles
 * @param TitleFormatter $titleFormatter Formats parsed titles for display
 */
public function __construct(
	WANObjectCache $cache,
	Collation $collation,
	MessageGroups $messageGroupFactory,
	NamespaceInfo $namespaceInfo,
	MessageIndex $messageIndex,
	TitleParser $titleParser,
	TitleFormatter $titleFormatter
) {
	// Shared by both haystack builders and both search methods
	$this->cache = $cache;
	$this->collation = $collation;

	// Data sources for the haystacks
	$this->messageGroupFactory = $messageGroupFactory;
	$this->messageIndex = $messageIndex;
	$this->namespaceInfo = $namespaceInfo;

	// Used when pretty-printing message search results
	$this->titleParser = $titleParser;
	$this->titleFormatter = $titleFormatter;
}
59
/**
 * Search message groups by label.
 *
 * Labels that begin with the query are listed first, followed by labels in which
 * the query begins at a word boundary. Matching is case-insensitive and the query
 * is matched literally (it is escaped before being placed in the pattern).
 *
 * @param string $query Text to search for
 * @param int $maxResults Maximum number of results; values below 1 yield no results
 * @return array[] List of [ 'label' => string, 'group' => string ], at most
 *  $maxResults entries, each group listed once
 */
public function searchStaticMessageGroups( string $query, int $maxResults ): array {
	// Without this guard a non-positive limit would still return one result, because
	// the limit is only checked after a result has been added.
	if ( $maxResults <= 0 ) {
		return [];
	}

	$cache = $this->cache;
	// None of the static groups currently use language-dependent labels. This
	// may need revisiting later and splitting the cache by language.
	$key = $cache->makeKey( 'Translate', 'EntitySearch', 'static-groups' );
	$haystack = $cache->getWithSetCallback(
		$key,
		ExpirationAwareness::TTL_WEEK,
		function (): string {
			return $this->getStaticMessageGroupsHaystack();
		},
		[
			// Calling touchCheckKey() on this key purges the cache
			'checkKeys' => [ $this->messageGroupFactory->getCacheKey() ],
			// Avoid querying cache servers multiple times in a web request
			'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
		]
	);

	// Algorithm: The haystack is one big string with one entity per line. Run
	// preg_match_all over it once per pattern: first to collect prefix matches (to
	// show them first), then to match words if more results are needed.
	$delimiter = self::FIELD_DELIMITER;
	$anything = "[^$delimiter\n]";
	$query = preg_quote( $query, '/' );
	$patterns = [
		// Prefix match
		"/^($query$anything*)$delimiter($anything+)$/miu",
		// Word match
		"/^($anything*\b$query$anything*)$delimiter($anything+)$/miu",
	];

	$results = [];
	foreach ( $patterns as $pattern ) {
		$matches = [];
		// preg_match_all returns false on failure and 0 when nothing matched; in both
		// cases there is nothing to collect for this pattern.
		if ( !preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER ) ) {
			continue;
		}

		foreach ( $matches as [ , $label, $groupId ] ) {
			// Index by $groupId to avoid duplicates from the prefix match and the word match
			$results[$groupId] = [
				'label' => $label,
				'group' => $groupId,
			];

			if ( count( $results ) >= $maxResults ) {
				break 2;
			}
		}
	}

	return array_values( $results );
}
118
/**
 * Search message keys.
 *
 * The query is matched literally and case-insensitively wherever it starts at a
 * word boundary. Messages that share the same text up to and including the match
 * are collapsed into a single "prefix*" entry with a count. A message that ends
 * exactly at the match is listed on its own, and a prefix that matched only one
 * message is shown as that message's full title.
 *
 * @param string $query Text to search for
 * @param int $maxResults Soft limit on the number of entries: once it is reached,
 *  processing continues only while the matched prefix stays the same, so that the
 *  count of the last entry is accurate
 * @return array[] List of [ 'pattern' => string, 'count' => int ]
 */
public function searchMessages( string $query, int $maxResults ): array {
	// Optimized based on requirements:
	// * "Natural" sorting of results
	// * No need to show which message group things belong to
	// * Match at any point in the message
	// * Return full keys of prefixes that match multiple messages

	$cache = $this->cache;
	$key = $cache->makeKey( 'Translate', 'EntitySearch', 'messages' );
	$haystack = $cache->getWithSetCallback(
		$key,
		ExpirationAwareness::TTL_WEEK,
		function (): string {
			// This can get rather large. On translatewiki.net it is multiple megabytes
			// uncompressed. With compression (assumed to happen implicitly in the
			// caching layer) it's under a megabyte.
			return $this->getMessagesHaystack();
		},
		[
			// Calling touchCheckKey() on this key purges the cache
			'checkKeys' => [ $this->messageIndex->getStatusCacheKey() ],
			// Avoid querying cache servers multiple times in a web request
			'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
		]
	);

	// Algorithm: The haystack is one big string with one entity per line. Run
	// preg_match_all over it. Because we will have many more matches than search
	// results, this may be more efficient than calling preg_match iteratively.
	// On the other hand, it can use a lot of memory to construct the array for
	// all the matches.
	$rowDelimiter = self::ROW_DELIMITER;
	$anything = "[^$rowDelimiter]";
	$query = preg_quote( $query, '/' );

	// Word match
	$pattern = "/^($anything*\b$query)$anything*$/miu";
	$matches = [];
	// preg_match_all returns false on failure and 0 when nothing matched; either way
	// there are no results to report.
	if ( !preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER ) ) {
		return [];
	}

	// Each entry is [ label, number of messages, is exact match, first full match ]
	$results = [];
	$previousPrefixMatch = null;
	foreach ( $matches as [ $full, $prefixMatch ] ) {
		// This is a bit tricky. If we are at the maximum results, continue processing
		// until the prefix changes, to get an accurate count
		if ( count( $results ) >= $maxResults && $previousPrefixMatch !== $prefixMatch ) {
			break;
		}

		if ( $full === $prefixMatch ) {
			// The message ends right at the match: list it as itself
			$results[$full] = [ $full, 1, true, $full ];
		} else {
			$prefixKey = "$prefixMatch*";
			if ( !isset( $results[$prefixKey] ) ) {
				$results[$prefixKey] = [ $prefixKey, 0, false, $full ];
			}
			$results[$prefixKey][1]++;
		}
		$previousPrefixMatch = $prefixMatch;
	}

	// Build the output list by value; no loop reference is left behind.
	$formatted = [];
	foreach ( $results as [ $label, $count, $isExact, $full ] ) {
		// Convert partial matches with single results to full match
		if ( $count === 1 && !$isExact ) {
			$label = $full;
		}

		// Drop unnecessary fields, pretty format title
		try {
			$title = $this->titleParser->parseTitle( $label );
			$label = $this->titleFormatter->getPrefixedText( $title );
		} catch ( MalformedTitleException $e ) {
			// Not parseable as a title: keep the raw label
		}

		$formatted[] = [
			'pattern' => $label,
			'count' => $count
		];
	}

	return $formatted;
}
200
/**
 * Build the searchable string for message groups: one row per group in the form
 * label, FIELD_DELIMITER, group id, ROW_DELIMITER, with rows ordered by the
 * collation sort key of the label.
 *
 * @return string
 */
private function getStaticMessageGroupsHaystack(): string {
	// Ensure there are no special chars that will break matching
	$stripDelimiters = [ self::FIELD_DELIMITER => '', self::ROW_DELIMITER => '' ];

	// It is unlikely that different groups have the same label (or sort key),
	// but it's possible, so a hashmap keyed by sort key cannot be used here.
	$sorted = new SplMinHeap();
	foreach ( $this->messageGroupFactory->getGroups() as $messageGroup ) {
		$cleanLabel = strtr( $messageGroup->getLabel(), $stripDelimiters );
		$sorted->insert( [
			$this->collation->getSortKey( $cleanLabel ),
			$cleanLabel,
			$messageGroup->getId()
		] );
	}

	// Drain the heap smallest-first, appending one row per group
	$rows = '';
	while ( !$sorted->isEmpty() ) {
		[ , $cleanLabel, $id ] = $sorted->extract();
		$rows .= $cleanLabel . self::FIELD_DELIMITER . $id . self::ROW_DELIMITER;
	}

	return $rows;
}
221
/**
 * Build the searchable string for message keys: one "Namespace:key" label per
 * row, each terminated by ROW_DELIMITER, ordered by the collation sort key of
 * the label.
 *
 * @return string
 */
private function getMessagesHaystack(): string {
	// Canonical namespace names, looked up once per namespace id
	$canonicalNames = [];
	$sorted = new SplMinHeap();

	foreach ( $this->messageIndex->getKeys() as $indexKey ) {
		// Normalize "_" to " " so that \b in regexp matches words separated by underscores
		$indexKey = strtr( $indexKey, [ '_' => ' ' ] );

		// NOTE(review): assumes every key has the form "<namespace id>:<message key>" -- confirm
		[ $nsId, $messageKey ] = explode( ':', $indexKey, 2 );
		if ( !isset( $canonicalNames[$nsId] ) ) {
			$canonicalNames[$nsId] = $this->namespaceInfo->getCanonicalName( (int)$nsId );
		}

		// Ensure there are no special chars that will break matching
		$row = strtr(
			$canonicalNames[$nsId] . ':' . $messageKey,
			[ self::ROW_DELIMITER => '' ]
		);
		$sorted->insert( [ $this->collation->getSortKey( $row ), $row ] );
	}

	// Drain the heap smallest-first, appending one row per message
	$rows = '';
	while ( !$sorted->isEmpty() ) {
		[ , $row ] = $sorted->extract();
		$rows .= $row . self::ROW_DELIMITER;
	}

	return $rows;
}
249}
Service for searching message groups and message keys.
Factory class for accessing message groups individually by id or all of them as a list.
Creates a database of keys in all groups, so that namespace and key can be used to get the groups the...