Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.37% |
121 / 123 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
MediaSearchEntitiesFetcher | |
98.37% |
121 / 123 |
|
75.00% |
6 / 8 |
23 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
get | |
96.67% |
29 / 30 |
|
0.00% |
0 / 1 |
8 | |||
gatherEntitySearchRequests | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
1 | |||
gatherTitleMatchRequests | |
94.44% |
17 / 18 |
|
0.00% |
0 / 1 |
2.00 | |||
mbUcFirst | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
transformTitleMatchResult | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
addToTransformedResponses | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
2 | |||
transformEntitySearchResult | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | namespace Wikibase\MediaInfo\Search; |
4 | |
5 | use MultiHttpClient; |
6 | |
/**
 * Looks up wikibase entities matching a set of search terms by combining two
 * kinds of HTTP API lookups, all executed in a single MultiHttpClient batch:
 *  - a full-text entity search against $entitySearchUrl;
 *  - (optionally) an exact page title lookup against $titleMatchUrl.
 */
class MediaSearchEntitiesFetcher {
	/** @var MultiHttpClient Client used to execute all requests in one batch */
	protected $multiHttpClient;

	/** @var string API endpoint that entity search requests are sent to */
	protected $entitySearchUrl;

	/**
	 * @var string sprintf() pattern for the title match API endpoint; it is
	 *  formatted with the input language. Title matching is skipped when empty.
	 */
	protected $titleMatchUrl;

	/** @var string Language code sent as 'uselang' and used in the title match URL */
	protected $inputLanguage;

	/** @var string Language code sent as 'wbetlanguage' (language of labels/aliases) */
	protected $outputLanguage;

	/**
	 * @param MultiHttpClient $multiHttpClient
	 * @param string $entitySearchUrl
	 * @param string $titleMatchUrl sprintf() pattern; empty string disables title matching
	 * @param string $inputLanguage
	 * @param string $outputLanguage
	 */
	public function __construct(
		MultiHttpClient $multiHttpClient,
		string $entitySearchUrl,
		string $titleMatchUrl,
		string $inputLanguage,
		string $outputLanguage
	) {
		$this->multiHttpClient = $multiHttpClient;
		$this->entitySearchUrl = $entitySearchUrl;
		$this->titleMatchUrl = $titleMatchUrl;
		$this->inputLanguage = $inputLanguage;
		$this->outputLanguage = $outputLanguage;
	}

	/**
	 * Find wikibase entities that match given search queries and return their ids,
	 * along with a (normalized, between 0-1) score indicating how good of a match
	 * they are.
	 *
	 * @param array $searchQueries Search terms (array keys are ignored)
	 * @return array Map of search term => [ entity id => [ 'entityId', 'score',
	 *  'synonyms' ] ], with the entities of every term sorted by descending score
	 */
	public function get( array $searchQueries ): array {
		if ( count( $searchQueries ) === 0 ) {
			return [];
		}

		// array_map() preserves the keys of $searchQueries, and array_merge() lets
		// later string keys overwrite earlier ones: without array_values(), a
		// string-keyed input would silently lose every entity search request to
		// the title match request sharing its key
		$requests = array_merge(
			array_values( $this->gatherEntitySearchRequests( $searchQueries ) ),
			array_values( $this->gatherTitleMatchRequests( $searchQueries ) )
		);
		$responses = $this->multiHttpClient->runMulti( $requests );

		$transformedResponses = array_fill_keys(
			array_values( $searchQueries ),
			[]
		);
		foreach ( $responses as $response ) {
			// a failed request or a non-JSON/scalar body is treated as "no results"
			$body = json_decode( (string)( $response['response']['body'] ?? '' ), true );
			if ( !is_array( $body ) ) {
				$body = [];
			}

			// '_term' and '_type' are the markers set on the request when it was built
			$term = $response['_term'];
			if ( $response['_type'] === 'entitySearch' ) {
				// iterate each result
				foreach ( $body['query']['pages'] ?? [] as $result ) {
					$transformedResponses[$term] = $this->addToTransformedResponses(
						$transformedResponses[$term],
						$this->transformEntitySearchResult( $result )
					);
				}
			} else {
				$titleMatch = $this->transformTitleMatchResult( $body );
				if ( $titleMatch ) {
					$transformedResponses[$term] = $this->addToTransformedResponses(
						$transformedResponses[$term],
						$titleMatch
					);
				}
			}
		}

		// Sort items by score.
		foreach ( $transformedResponses as $i => $term ) {
			$scores = array_column( $term, 'score' );
			array_multisort( $scores, SORT_DESC, $transformedResponses[$i] );
		}

		return $transformedResponses;
	}

	/**
	 * Build one entity search request per search query.
	 *
	 * @param array $searchQueries
	 * @return array[] Request descriptions, keyed like $searchQueries; '_term' and
	 *  '_type' are private markers used to identify the response afterwards
	 */
	private function gatherEntitySearchRequests( array $searchQueries ): array {
		return array_map( function ( $query ) {
			$params = [
				'format' => 'json',
				'action' => 'query',
				'generator' => 'search',
				'gsrsearch' => $query,
				'gsrnamespace' => 0,
				'gsrlimit' => 50,
				'gsrprop' => 'snippet|titlesnippet|extensiondata',
				'uselang' => $this->inputLanguage,
				'prop' => 'entityterms',
				'wbetterms' => 'alias|label',
				'wbetlanguage' => $this->outputLanguage,
			];

			return [
				'method' => 'GET',
				'_term' => $query,
				'_type' => 'entitySearch',
				'url' => $this->entitySearchUrl . '?' . http_build_query( $params ),
			];
		}, $searchQueries );
	}

	/**
	 * Build one title match request per search query, or none at all when no
	 * title match URL has been configured.
	 *
	 * @param array $searchQueries
	 * @return array[] Request descriptions, keyed like $searchQueries; '_term' and
	 *  '_type' are private markers used to identify the response afterwards
	 */
	private function gatherTitleMatchRequests( array $searchQueries ): array {
		if ( !$this->titleMatchUrl ) {
			return [];
		}
		return array_map( function ( $query ) {
			$params = [
				'format' => 'json',
				'action' => 'query',
				// ucfirst() the string, and strip quotes (in case the query comes from
				// a phrase query)
				'titles' => $this->mbUcFirst( trim( $query, " \n\r\t\v\0\"" ) ),
				'prop' => 'pageprops',
				'redirects' => 1,
			];

			return [
				'method' => 'GET',
				'_term' => $query,
				'_type' => 'titleMatch',
				'url' => sprintf( $this->titleMatchUrl, $this->inputLanguage ) . '?' .
					http_build_query( $params ),
			];
		}, $searchQueries );
	}

	/**
	 * Replicates php's ucfirst() function with multibyte support.
	 *
	 * @param string $str The string being converted.
	 * @param null|string $encoding Optional encoding parameter is the character encoding.
	 *  If it is omitted, the internal character encoding value will be used.
	 *
	 * @return string The input string with first character uppercased.
	 * @see https://github.com/cofirazak/phpMissingFunctions/blob/master/src/StringFunc.php
	 */
	public function mbUcFirst( string $str, ?string $encoding = null ): string {
		if ( $encoding === null ) {
			$encoding = mb_internal_encoding();
		}

		return mb_strtoupper( mb_substr( $str, 0, 1, $encoding ), $encoding ) .
			mb_substr( $str, 1, null, $encoding );
	}

	/**
	 * Turn a decoded title match response into a result item.
	 *
	 * Only the first page of the response is considered; it yields a result
	 * (with the maximum score of 1.0) when it has a 'wikibase_item' page prop.
	 *
	 * @param array $result Decoded response body
	 * @return array|null [ 'entityId', 'score', 'synonyms' ], or null when the
	 *  response has no page linked to an entity
	 */
	private function transformTitleMatchResult( array $result ): ?array {
		if ( isset( $result['query']['pages'] ) ) {
			$page = array_shift( $result['query']['pages'] );
			if ( isset( $page['pageprops']['wikibase_item'] ) ) {
				return [
					'entityId' => $page['pageprops']['wikibase_item'],
					'score' => 1.0,
					// titles that the API reports redirects resolved 'to'
					'synonyms' => array_column( $result['query']['redirects'] ?? [], 'to' ),
				];
			}
		}
		return null;
	}

	/**
	 * Add an item to a collection keyed by entity id. When the entity is already
	 * present, both entries are combined: synonyms are concatenated (existing
	 * ones first, duplicates are kept) and the highest score wins.
	 *
	 * @param array $collection Map of entity id => item
	 * @param array $item Item with 'entityId', 'score' and (optionally) 'synonyms'
	 * @return array The updated collection
	 */
	private function addToTransformedResponses( array $collection, array $item ): array {
		$entityId = $item['entityId'];
		if ( !isset( $collection[$entityId] ) ) {
			$collection[$entityId] = $item;
			return $collection;
		}

		$existing = $collection[$entityId];
		$collection[$entityId] = [
			'entityId' => $entityId,
			'synonyms' => array_merge(
				$existing['synonyms'] ?? [],
				$item['synonyms'] ?? []
			),
			'score' => max( $existing['score'], $item['score'] ),
		];
		return $collection;
	}

	/**
	 * Turn a single page of an entity search response into a result item.
	 *
	 * @param array $result One entry of the response's 'query.pages'; 'title' is
	 *  used as the entity id, 'index' as the (1-based) rank of the result
	 * @return array [ 'entityId', 'score', 'synonyms' ]
	 */
	protected function transformEntitySearchResult( array $result ): array {
		// unfortunately, the search API doesn't return an actual score
		// (for relevancy of the match), which means that we have no way
		// of telling which results are awesome matches and which are only
		// somewhat relevant
		// since we can't rely on the order to tell us much about how
		// relevant a result is (except for relative to one another), and
		// we don't know the actual score of these results, we'll try to
		// approximate a term frequency - it won't be great, but at least
		// we'll be able to tell which of "cat" and "Pirates of Catalonia"
		// most resemble "cat"
		// the highlight will either be in extensiondata (in the case
		// of a matching alias), snippet (for descriptions), or
		// titlesnippet (for labels)
		// any of these may be absent from a result; an empty snippet simply
		// contributes a term frequency of 0
		$snippets = [
			$result['snippet'] ?? '',
			$result['titlesnippet'] ?? '',
			$result['extensiondata']['wikibase']['extrasnippet'] ?? ''
		];

		$maxTermFrequency = 0;
		foreach ( $snippets as $snippet ) {
			$snippet = (string)$snippet;
			// let's figure out how much of the snippet actually matched
			// the search term based on the highlight
			$source = preg_replace( '/<span class="searchmatch">(.*?)<\/span>/', '$1', $snippet );
			$omitted = preg_replace( '/<span class="searchmatch">.*?<\/span>/', '', $snippet );
			// (string) casts guard against preg_replace() returning null on failure
			$sourceLength = mb_strlen( (string)$source );
			$termFrequency = $sourceLength === 0 ? 0 : 1 - mb_strlen( (string)$omitted ) / $sourceLength;
			$maxTermFrequency = max( $maxTermFrequency, $termFrequency );
		}

		// average the order in which results were returned (because that
		// takes into account additional factors such as popularity of
		// the page) and the naive term frequency to calculate how relevant
		// the results are relative to one another
		// a result without a usable index gets no rank bonus (instead of
		// triggering a division by zero)
		$index = $result['index'] ?? 0;
		$relativeOrder = $index > 0 ? 1 / $index : 0;

		$synonyms = [];
		if ( isset( $result['entityterms'] ) ) {
			$synonyms = array_merge(
				$synonyms,
				$result['entityterms']['label'] ?? [],
				$result['entityterms']['alias'] ?? []
			);
		}

		return [
			'entityId' => $result['title'],
			'score' => ( $relativeOrder + $maxTermFrequency ) / 2,
			'synonyms' => $synonyms,
		];
	}
}