Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
98.37% covered (success)
98.37%
121 / 123
75.00% covered (warning)
75.00%
6 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
MediaSearchEntitiesFetcher
98.37% covered (success)
98.37%
121 / 123
75.00% covered (warning)
75.00%
6 / 8
23
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 get
96.67% covered (success)
96.67%
29 / 30
0.00% covered (danger)
0.00%
0 / 1
8
 gatherEntitySearchRequests
100.00% covered (success)
100.00%
21 / 21
100.00% covered (success)
100.00%
1 / 1
1
 gatherTitleMatchRequests
94.44% covered (success)
94.44%
17 / 18
0.00% covered (danger)
0.00%
0 / 1
2.00
 mbUcFirst
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 transformTitleMatchResult
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 addToTransformedResponses
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
2
 transformEntitySearchResult
100.00% covered (success)
100.00%
24 / 24
100.00% covered (success)
100.00%
1 / 1
4
1<?php
2
3namespace Wikibase\MediaInfo\Search;
4
5use MultiHttpClient;
6
/**
 * Resolves free-text search terms to wikibase entity ids.
 *
 * For every term two kinds of HTTP requests are built and run in one batch
 * through the injected MultiHttpClient:
 *  - an "entitySearch" request (generator=search against $entitySearchUrl), and
 *  - a "titleMatch" request (exact page title lookup against $titleMatchUrl),
 *    which is skipped entirely when $titleMatchUrl is empty.
 *
 * The responses are merged per term, de-duplicated per entity id and sorted
 * by descending score.
 */
class MediaSearchEntitiesFetcher {
    /** @var MultiHttpClient */
    protected $multiHttpClient;

    /** @var string */
    protected $entitySearchUrl;

    /** @var string */
    protected $titleMatchUrl;

    /** @var string */
    protected $inputLanguage;

    /** @var string */
    protected $outputLanguage;

    /**
     * @param MultiHttpClient $multiHttpClient Client used to run all requests in one batch
     * @param string $entitySearchUrl API endpoint for the entity search requests
     * @param string $titleMatchUrl sprintf() template for the title match endpoint; it
     *     receives $inputLanguage as its only argument. Empty string disables title matching.
     * @param string $inputLanguage Language of the search terms (sent as `uselang`)
     * @param string $outputLanguage Language of the returned labels/aliases (`wbetlanguage`)
     */
    public function __construct(
        MultiHttpClient $multiHttpClient,
        string $entitySearchUrl,
        string $titleMatchUrl,
        string $inputLanguage,
        string $outputLanguage
    ) {
        $this->multiHttpClient = $multiHttpClient;
        $this->entitySearchUrl = $entitySearchUrl;
        $this->titleMatchUrl = $titleMatchUrl;
        $this->inputLanguage = $inputLanguage;
        $this->outputLanguage = $outputLanguage;
    }

    /**
     * Find wikibase entities that match given search queries and return their ids,
     * along with a (normalized, between 0-1) score indicating how good of a match
     * they are.
     *
     * @param array $searchQueries List of search terms
     * @return array Map of search term => ( map of entity id =>
     *     [ 'entityId' => string, 'score' => int|float, 'synonyms' => string[] ] ),
     *     each inner map sorted by descending score. Terms without any match map to [].
     */
    public function get( array $searchQueries ): array {
        if ( count( $searchQueries ) === 0 ) {
            return [];
        }

        // Re-index both request lists before merging: array_merge() overwrites
        // entries that share a string key, which would silently drop the entity
        // search requests whenever $searchQueries is an associative array.
        $requests = array_merge(
            array_values( $this->gatherEntitySearchRequests( $searchQueries ) ),
            array_values( $this->gatherTitleMatchRequests( $searchQueries ) )
        );
        $responses = $this->multiHttpClient->runMulti( $requests );

        $transformedResponses = array_fill_keys(
            array_values( $searchQueries ),
            []
        );
        foreach ( $responses as $response ) {
            // A failed request or an unexpected payload (empty body, invalid JSON,
            // or JSON that is not an object/array) is treated as "no results".
            $body = json_decode( $response['response']['body'] ?? '', true );
            if ( !is_array( $body ) ) {
                $body = [];
            }

            $term = $response['_term'];
            if ( $response['_type'] === 'entitySearch' ) {
                // iterate each result
                foreach ( $body['query']['pages'] ?? [] as $result ) {
                    $transformedResponses[$term] = $this->addToTransformedResponses(
                        $transformedResponses[$term],
                        $this->transformEntitySearchResult( $result )
                    );
                }
                continue;
            }

            $titleMatch = $this->transformTitleMatchResult( $body );
            if ( $titleMatch ) {
                $transformedResponses[$term] = $this->addToTransformedResponses(
                    $transformedResponses[$term],
                    $titleMatch
                );
            }
        }

        // Sort items by score.
        foreach ( $transformedResponses as $term => $items ) {
            $scores = array_column( $items, 'score' );
            array_multisort( $scores, SORT_DESC, $transformedResponses[$term] );
        }

        return $transformedResponses;
    }

    /**
     * Build one entity search request per search term.
     *
     * @param array $searchQueries
     * @return array[] Request descriptions; the `_term` and `_type` entries are
     *     read back from the responses in get().
     */
    private function gatherEntitySearchRequests( array $searchQueries ): array {
        return array_map( function ( $query ) {
            $params = [
                'format' => 'json',
                'action' => 'query',
                'generator' => 'search',
                'gsrsearch' => $query,
                'gsrnamespace' => 0,
                'gsrlimit' => 50,
                'gsrprop' => 'snippet|titlesnippet|extensiondata',
                'uselang' => $this->inputLanguage,
                'prop' => 'entityterms',
                'wbetterms' => 'alias|label',
                'wbetlanguage' => $this->outputLanguage,
            ];

            return [
                'method' => 'GET',
                '_term' => $query,
                '_type' => 'entitySearch',
                'url' => $this->entitySearchUrl . '?' . http_build_query( $params ),
            ];
        }, $searchQueries );
    }

    /**
     * Build one title match request per search term, or none at all when no
     * title match URL has been configured.
     *
     * @param array $searchQueries
     * @return array[] Request descriptions; the `_term` and `_type` entries are
     *     read back from the responses in get().
     */
    private function gatherTitleMatchRequests( array $searchQueries ): array {
        if ( !$this->titleMatchUrl ) {
            return [];
        }
        return array_map( function ( $query ) {
            $params = [
                'format' => 'json',
                'action' => 'query',
                // ucfirst() the string, and strip quotes (in case the query comes from
                // a phrase query)
                'titles' => $this->mbUcFirst( trim( $query, " \n\r\t\v\0\"" ) ),
                'prop' => 'pageprops',
                'redirects' => 1,
            ];

            return [
                'method' => 'GET',
                '_term' => $query,
                '_type' => 'titleMatch',
                'url' => sprintf( $this->titleMatchUrl, $this->inputLanguage ) . '?' .
                         http_build_query( $params ),
            ];
        }, $searchQueries );
    }

    /**
     * Replicates php's ucfirst() function with multibyte support.
     *
     * @param string $str The string being converted.
     * @param null|string $encoding Optional encoding parameter is the character encoding.
     *     If it is omitted, the internal character encoding value will be used.
     *
     * @return string The input string with first character uppercased.
     * @see https://github.com/cofirazak/phpMissingFunctions/blob/master/src/StringFunc.php
     */
    public function mbUcFirst( string $str, ?string $encoding = null ): string {
        if ( $encoding === null ) {
            $encoding = mb_internal_encoding();
        }

        return mb_strtoupper( mb_substr( $str, 0, 1, $encoding ), $encoding ) .
               mb_substr( $str, 1, null, $encoding );
    }

    /**
     * Turn a decoded title match response into a result item.
     *
     * Only the first page of the response is inspected.
     *
     * @param array $result Decoded response body
     * @return array|null Item with a fixed score of 1.0 and the redirect targets as
     *     synonyms, or null when the first page has no `wikibase_item` page prop.
     */
    private function transformTitleMatchResult( array $result ): ?array {
        if ( isset( $result['query']['pages'] ) ) {
            $page = array_shift( $result['query']['pages'] );
            if ( isset( $page['pageprops']['wikibase_item'] ) ) {
                return [
                    'entityId' => $page['pageprops']['wikibase_item'],
                    'score' => 1.0,
                    'synonyms' => array_column( $result['query']['redirects'] ?? [], 'to' ),
                ];
            }
        }
        return null;
    }

    /**
     * Add an item to a collection keyed by entity id.
     *
     * If the entity is already present, the synonyms of both entries are
     * concatenated and the higher of the two scores is kept.
     *
     * @param array $collection Map of entity id => item
     * @param array $item Item with at least `entityId` and `score`
     * @return array The updated collection
     */
    private function addToTransformedResponses( array $collection, array $item ): array {
        $entityId = $item['entityId'];
        if ( !isset( $collection[$entityId] ) ) {
            $collection[$entityId] = $item;
            return $collection;
        }

        $existing = $collection[$entityId];
        $collection[$entityId] = [
            'entityId' => $entityId,
            'synonyms' => array_merge(
                $existing['synonyms'] ?? [],
                $item['synonyms'] ?? []
            ),
            'score' => max( $existing['score'], $item['score'] ),
        ];
        return $collection;
    }

    /**
     * Turn a single entity search result into a result item with an
     * approximated relevance score.
     *
     * @param array $result One page of the search response
     * @return array [ 'entityId' => string, 'score' => int|float, 'synonyms' => string[] ]
     */
    protected function transformEntitySearchResult( array $result ): array {
        // unfortunately, the search API doesn't return an actual score
        // (for relevancy of the match), which means that we have no way
        // of telling which results are awesome matches and which are only
        // somewhat relevant
        // since we can't rely on the order to tell us much about how
        // relevant a result is (except for relative to one another), and
        // we don't know the actual score of these results, we'll try to
        // approximate a term frequency - it won't be great, but at least
        // we'll be able to tell which of "cat" and "Pirates of Catalonia"
        // most resemble "cat"
        // the highlight will either be in extensiondata (in the case
        // of a matching alias), snippet (for descriptions), or
        // titlesnippet (for labels)
        $snippets = [
            $result['snippet'] ?? '',
            $result['titlesnippet'] ?? '',
            $result['extensiondata']['wikibase']['extrasnippet'] ?? ''
        ];

        $maxTermFrequency = 0;
        foreach ( $snippets as $snippet ) {
            // an absent or empty snippet contributes a term frequency of 0,
            // which can never raise the maximum
            if ( !is_string( $snippet ) || $snippet === '' ) {
                continue;
            }

            // let's figure out how much of the snippet actually matched
            // the search term based on the highlight
            $source = preg_replace( '/<span class="searchmatch">(.*?)<\/span>/', '$1', $snippet );
            $omitted = preg_replace( '/<span class="searchmatch">.*?<\/span>/', '', $snippet );
            if ( $source === null || $omitted === null || $source === '' ) {
                // preg_replace() failed, or nothing is left to measure against;
                // skipping also avoids dividing by a zero length
                continue;
            }

            $termFrequency = 1 - mb_strlen( $omitted ) / mb_strlen( $source );
            $maxTermFrequency = max( $maxTermFrequency, $termFrequency );
        }

        // average the order in which results were returned (because that
        // takes into account additional factors such as popularity of
        // the page) and the naive term frequency to calculate how relevant
        // the results are relative to one another
        // a missing or non-positive index contributes nothing instead of
        // triggering a division by zero
        $index = $result['index'] ?? 0;
        $relativeOrder = ( is_numeric( $index ) && $index > 0 ) ? 1 / $index : 0;

        $synonyms = [];
        if ( isset( $result['entityterms'] ) ) {
            $synonyms = array_merge(
                $synonyms,
                $result['entityterms']['label'] ?? [],
                $result['entityterms']['alias'] ?? []
            );
        }

        return [
            'entityId' => $result['title'],
            'score' => ( $relativeOrder + $maxTermFrequency ) / 2,
            'synonyms' => $synonyms,
        ];
    }
}