Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
108 / 108 |
|
100.00% |
10 / 10 |
CRAP | |
100.00% |
1 / 1 |
InterwikiTablePrefixLookup | |
100.00% |
108 / 108 |
|
100.00% |
10 / 10 |
32 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
getPrefix | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
getPrefixFromLegacyConfig | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
3 | |||
getPrefixFromInterwikiTable | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
getTwoHopPrefixThroughIntermediary | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
fetchSecondHopPrefix | |
100.00% |
44 / 44 |
|
100.00% |
1 / 1 |
7 | |||
prefetchInterwikiMap | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
prefetchParentDomainToHostMap | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
getParentDomain | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
isSmaller | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace FileImporter\Remote\MediaWiki; |
4 | |
5 | use FileImporter\Data\SourceUrl; |
6 | use FileImporter\Exceptions\HttpRequestException; |
7 | use FileImporter\Interfaces\LinkPrefixLookup; |
8 | use FileImporter\Services\Http\HttpRequestExecutor; |
9 | use MediaWiki\Interwiki\InterwikiLookup; |
10 | use Psr\Log\LoggerInterface; |
11 | use Psr\Log\NullLogger; |
12 | |
13 | /** |
14 | * This LinkPrefixLookup implementation will allow interwiki references |
15 | * from MediaWiki websites that are contained in the interwiki table. |
16 | * |
17 | * @license GPL-2.0-or-later |
18 | * @author Christoph Jauera <christoph.jauera@wikimedia.de> |
19 | */ |
20 | class InterwikiTablePrefixLookup implements LinkPrefixLookup { |
21 | |
22 | private InterwikiLookup $interwikiLookup; |
23 | private HttpApiLookup $httpApiLookup; |
24 | private HttpRequestExecutor $httpRequestExecutor; |
25 | /** @var array<string,string>|null Array mapping full host name to interwiki prefix */ |
26 | private $interwikiTableMap = null; |
27 | /** |
28 | * @var array<string,string>|null Array mapping parent domain to a representative host name. The |
29 | * idea is that, for example, a site matching *.wiktionary.* will have interwiki links to each |
30 | * language version of Wiktionary. |
31 | */ |
32 | private $parentDomainToUrlMap = null; |
33 | /** @var string[] */ |
34 | private array $interWikiConfigMap; |
35 | private LoggerInterface $logger; |
36 | |
/**
 * Stores the collaborators used by the individual lookup strategies.
 *
 * @param InterwikiLookup $interwikiLookup Access to the local interwiki table
 * @param HttpApiLookup $httpApiLookup Resolves an API URL for wikis whose interwiki entry has none
 * @param HttpRequestExecutor $httpRequestExecutor Performs the API request against the first hop
 * @param string[] $interWikiConfigMap Hardcoded map of host name to interwiki prefix
 * @param LoggerInterface|null $logger Optional; a NullLogger is used when omitted
 */
public function __construct(
	InterwikiLookup $interwikiLookup,
	HttpApiLookup $httpApiLookup,
	HttpRequestExecutor $httpRequestExecutor,
	array $interWikiConfigMap = [],
	?LoggerInterface $logger = null
) {
	// Fall back to a logger that discards everything so callers never have to null-check.
	$this->logger = $logger === null ? new NullLogger() : $logger;
	$this->interWikiConfigMap = $interWikiConfigMap;
	$this->httpRequestExecutor = $httpRequestExecutor;
	$this->httpApiLookup = $httpApiLookup;
	$this->interwikiLookup = $interwikiLookup;
}
57 | |
/**
 * Tries each lookup strategy in turn (legacy configuration, local interwiki table, two-hop
 * lookup through an intermediary wiki) and stops at the first one that yields a prefix.
 *
 * @inheritDoc
 * @return string Interwiki prefix or empty string on failure.
 */
public function getPrefix( SourceUrl $sourceUrl ): string {
	$host = $sourceUrl->getHost();

	// TODO: Wrap this class in a caching lookup to save each successful host -> prefix mapping.

	$prefix = $this->getPrefixFromLegacyConfig( $host );
	if ( $prefix === null ) {
		$prefix = $this->getPrefixFromInterwikiTable( $host );
	}
	if ( $prefix === null ) {
		// Most expensive strategy, only attempted when the cheaper ones found nothing.
		$prefix = $this->getTwoHopPrefixThroughIntermediary( $host );
	}

	return $prefix === null ? '' : $prefix;
}
72 | |
/**
 * Looks the host up in the hardcoded configuration map.
 *
 * Only the first segment of a configured "a:b" prefix is validated against the interwiki
 * lookup; the complete configured value is what gets returned.
 *
 * @deprecated This configuration will go away once dynamic lookup is in place.
 *
 * @param string $host
 *
 * @return string|null The configured prefix, or null when the host is not configured or its
 *  first prefix segment is not a valid interwiki.
 */
private function getPrefixFromLegacyConfig( string $host ): ?string {
	if ( !isset( $this->interWikiConfigMap[$host] ) ) {
		$this->logger->debug( 'Host {host} not in FileImporterInterWikiMap, proceeding with lookup.', [
			'host' => $host ] );
		return null;
	}

	$configuredPrefix = $this->interWikiConfigMap[$host];
	[ $firstHop ] = explode( ':', $configuredPrefix, 2 );

	if ( $this->interwikiLookup->isValidInterwiki( $firstHop ) ) {
		return $configuredPrefix;
	}

	$this->logger->warning( 'Configured prefix {prefix} not valid.', [
		'host' => $host,
		'prefix' => $configuredPrefix
	] );

	return null;
}
97 | |
/**
 * Looks the host up in the (lazily built) map derived from the local interwiki table.
 *
 * @param string $host
 *
 * @return string|null Matching interwiki prefix, or null when the host is unknown locally.
 */
private function getPrefixFromInterwikiTable( string $host ): ?string {
	if ( $this->interwikiTableMap === null ) {
		// Built once on first use and reused for every later lookup.
		$this->interwikiTableMap = $this->prefetchInterwikiMap();
	}

	$prefix = $this->interwikiTableMap[$host] ?? null;

	if ( $prefix === null ) {
		$this->logger->debug(
			'Host {host} does not match any local interwiki entry.',
			[ 'host' => $host ]
		);
	}

	return $prefix;
}
115 | |
/**
 * Resolves the host via a sibling wiki that shares its parent domain.
 *
 * This is an optimization for Wikimedia projects which are split into third-level
 * subdomains by language, and often not present in the target wiki's local interwiki table.
 * The sibling's prefix is taken from the local table, then the sibling's own interwiki map
 * is queried for the final host.
 *
 * @param string $host
 *
 * @return string|null Combined "first:second" prefix, or null when any step fails.
 */
private function getTwoHopPrefixThroughIntermediary( string $host ): ?string {
	if ( $this->parentDomainToUrlMap === null ) {
		$this->parentDomainToUrlMap = $this->prefetchParentDomainToHostMap();
	}

	// TODO: The sub-domain-based intermediate host-guessing logic should be in its own
	// class, and pluggable.
	$parent = $this->getParentDomain( $host );
	if ( !$parent || !isset( $this->parentDomainToUrlMap[$parent] ) ) {
		return null;
	}

	$firstHop = $this->getPrefixFromInterwikiTable( $this->parentDomainToUrlMap[$parent] );
	if ( $firstHop === null ) {
		return null;
	}

	$secondHop = $this->fetchSecondHopPrefix( $firstHop, $host );
	if ( $secondHop === null ) {
		return null;
	}

	// TODO: It would be luxurious to find the shortest matching prefix.
	$combined = $firstHop . ':' . $secondHop;
	$this->logger->info( 'Calculated two-hop interwiki prefix {prefix} to {host}', [
		'host' => $host,
		'prefix' => $combined,
	] );

	return $combined;
}
147 | |
/**
 * Fetch the next interwiki prefix from the first hop's API.
 *
 * The intermediate wiki's interwiki map is requested through its API. When the local
 * interwiki entry carries no API URL, one is resolved from the wiki's base URL instead.
 * If several remote entries point at the requested host, the smallest prefix (shortest,
 * then alphabetically first) is returned, matching how the local table is reduced.
 *
 * @param string $intermediateWikiPrefix first hop
 * @param string $host final host
 *
 * @return string|null Prefix valid on the intermediate wiki, or null when the intermediate
 *  wiki is unknown, the request fails, or no entry matches the host.
 */
private function fetchSecondHopPrefix( string $intermediateWikiPrefix, string $host ): ?string {
	$this->logger->debug( 'Fetching second hop to {host} via {prefix}', [
		'host' => $host,
		'prefix' => $intermediateWikiPrefix ] );
	$intermediateWiki = $this->interwikiLookup->fetch( $intermediateWikiPrefix );
	if ( !$intermediateWiki ) {
		$this->logger->warning( 'Missing interwiki entry for {prefix}', [
			'prefix' => $intermediateWikiPrefix ] );
		return null;
	}
	$intermediateWikiApiUrl = $intermediateWiki->getAPI();
	if ( $intermediateWikiApiUrl === '' ) {
		$this->logger->debug( 'Missing API URL for interwiki {prefix}, scraping from mainpage.', [
			'prefix' => $intermediateWikiPrefix ] );
		$intermediateWikiUrl = $intermediateWiki->getURL( '' );
		$intermediateWikiApiUrl = $this->httpApiLookup->getApiUrl(
			new SourceUrl( $intermediateWikiUrl ) );
	}

	try {
		$this->logger->debug( 'Making API request to pull interwiki links from {api}.', [
			'api' => $intermediateWikiApiUrl ] );
		$response = $this->httpRequestExecutor->execute(
			$intermediateWikiApiUrl,
			[
				'action' => 'query',
				'errorformat' => 'plaintext',
				'format' => 'json',
				'formatversion' => 2,
				'meta' => 'siteinfo',
				'siprop' => 'interwikimap'
			]
		);

		// An undecodable body yields null here, which the ?? below turns into "no entries".
		$responseInterwikiMap = json_decode( $response->getContent(), true );

		$bestPrefix = null;
		foreach ( $responseInterwikiMap['query']['interwikimap'] ?? [] as $entry ) {
			// Skip incomplete entries instead of triggering an undefined-index warning.
			if ( !isset( $entry['url'], $entry['prefix'] ) ) {
				continue;
			}
			if ( parse_url( $entry['url'], PHP_URL_HOST ) !== $host ) {
				continue;
			}

			// Keep the smallest of all matching prefixes rather than whichever came first.
			if ( $bestPrefix === null || $this->isSmaller( $entry['prefix'], $bestPrefix ) ) {
				$bestPrefix = $entry['prefix'];
			}
		}

		if ( $bestPrefix !== null ) {
			return $bestPrefix;
		}
	} catch ( HttpRequestException $e ) {
		// Best effort: a failing intermediate wiki must not abort the whole prefix lookup.
		$this->logger->warning( 'Failed to make API request to {api}.', [
			'api' => $intermediateWikiApiUrl,
			'exception' => $e ] );
	}

	$this->logger->info(
		'Failed to find second interwiki hop from {api} to {host}.',
		[
			'api' => $intermediateWikiApiUrl,
			'host' => $host
		]
	);

	return null;
}
214 | |
/**
 * Builds the host name => interwiki prefix map from all locally known interwiki prefixes.
 *
 * When several prefixes point at the same host, the smallest one (shortest, then
 * alphabetically first) wins. Rows whose URL has no parseable host are ignored.
 *
 * @return array<string,string>
 */
private function prefetchInterwikiMap(): array {
	$map = [];

	foreach ( $this->interwikiLookup->getAllPrefixes() as $row ) {
		$host = parse_url( $row['iw_url'], PHP_URL_HOST );

		// parse_url() yields false for seriously malformed URLs and null when there is no
		// host component. Using either as an array key would silently create a bogus
		// 0 or "" entry, so such rows are skipped.
		if ( $host === false || $host === null || $host === '' ) {
			continue;
		}

		if ( !isset( $map[$host] ) || $this->isSmaller( $row['iw_prefix'], $map[$host] ) ) {
			$map[$host] = $row['iw_prefix'];
		}
	}

	return $map;
}
232 | |
/**
 * Derives a parent domain => host name map from the hosts in the interwiki table map.
 *
 * Each parent domain ends up pointing at one of the hosts below it; when several hosts
 * share a parent, the one encountered last in the table map is kept.
 *
 * @return array<string,string>
 */
private function prefetchParentDomainToHostMap(): array {
	if ( $this->interwikiTableMap === null ) {
		$this->interwikiTableMap = $this->prefetchInterwikiMap();
	}

	$hostByParent = [];
	foreach ( $this->interwikiTableMap as $host => $_ ) {
		$parent = $this->getParentDomain( $host );
		if ( !$parent ) {
			// Host has no usable parent domain.
			continue;
		}
		$hostByParent[$parent] = $host;
	}

	return $hostByParent;
}
249 | |
/**
 * Strips the leftmost label from a host name, as long as what remains still contains a dot.
 *
 * @param string $host
 *
 * @return string|null New hostname with the minor sub-*-domain removed, or null when the
 *  remainder would be a single label.
 */
private function getParentDomain( string $host ): ?string {
	$labels = explode( '.', $host, 2 );
	$remainder = $labels[1] ?? '';

	// It doesn't make sense to reduce e.g. "mediawiki.org" to "org"
	return str_contains( $remainder, '.' ) ? $remainder : null;
}
261 | |
/**
 * Orders two strings by byte length first and by strcmp() second.
 *
 * @param string $a
 * @param string $b
 *
 * @return bool true if $a is shorter or alphabetically before $b
 */
private function isSmaller( string $a, string $b ): bool {
	$lengthDifference = strlen( $a ) - strlen( $b );
	if ( $lengthDifference !== 0 ) {
		return $lengthDifference < 0;
	}

	// Same length: fall back to a binary string comparison.
	return strcmp( $a, $b ) < 0;
}
269 | |
270 | } |