Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 210 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
ImportOresTopics | |
0.00% |
0 / 204 |
|
0.00% |
0 / 13 |
3906 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
init | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
132 | |||
getPages | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
72 | |||
getTopics | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
240 | |||
getTopicsByRandom | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
getApiUrl | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
hasOresModel | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
getTopicsFromOres | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
getSiteLinks | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
12 | |||
titlesToRevisionIds | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
getJsonData | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
search | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | |
3 | namespace GrowthExperiments\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\Query\ArticleTopicFeature; |
7 | use Generator; |
8 | use GrowthExperiments\GrowthExperimentsServices; |
9 | use GrowthExperiments\Util; |
10 | use LogicException; |
11 | use Maintenance; |
12 | use MediaWiki\Cache\LinkBatchFactory; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Status\Status; |
15 | use MediaWiki\Title\Title; |
16 | use MediaWiki\Title\TitleFactory; |
17 | use MediaWiki\WikiMap\WikiMap; |
18 | use RuntimeException; |
19 | use StatusValue; |
20 | use Wikimedia\Assert\PreconditionException; |
21 | |
// Locate the MediaWiki installation: use MW_INSTALL_PATH when it is set,
// otherwise fall back to the directory three levels above this file.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;
require_once "$IP/maintenance/Maintenance.php";
27 | |
28 | /** |
29 | * Maintenance script for importing ORES topics from production to a test instance. |
30 | */ |
31 | class ImportOresTopics extends Maintenance { |
32 | |
/** Value of --topicSource: fetch ORES topics from a production wiki. */
public const TOPIC_SOURCE_PROD = 'prod';
/** Value of --topicSource: assign random topics. */
public const TOPIC_SOURCE_RANDOM = 'random';

/** @var CirrusSearch Used to write the weighted topic tags for each page. */
private $cirrusSearch;

/** @var TitleFactory Used to turn the lines of the --pageList file into titles. */
private $titleFactory;

/** @var LinkBatchFactory Used to run a link batch over each batch of titles. */
private $linkBatchFactory;

/** @var bool Whether the configured server name looks like a beta cluster host. */
private $isBeta;

/** @var string Where the topic information comes from; one of the TOPIC_SOURCE_* constants. */
private $topicSource;

/** @var bool Whether --verbose was given. */
private $verbose;

/** @var string|null MediaWiki API URL of the production wiki. */
private $apiUrl;

/** @var string|null Wiki ID of the production wiki. */
private $wikiId;

/** @var bool|null Cached answer to "does the wiki have an 'articletopic' ORES model?"; null until checked. */
private $wikiHasOresModel;
64 | |
/**
 * Declare the required extensions, the script description, the command line
 * options and the default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->requireExtension( 'GrowthExperiments' );
	$this->requireExtension( 'CirrusSearch' );

	$this->addDescription( 'Import ORES topics from a production wiki' );
	$this->addOption( 'count', 'Number of articles to fetch a topic for.', false, true );
	// This help text contains single quotes, so both pieces are double-quoted. Every
	// piece must be closed with the same quote character it was opened with, otherwise
	// the line break and the concatenation operator become part of the help output.
	$this->addOption( 'topicSource', "Topic source: 'prod' for fetching from a production wiki "
		. "(assumes a wiki with titles imported from production), 'random': random topics", false, true );
	$this->addOption( 'apiUrl', 'MediaWiki API URL of the wiki the articles are from. '
		. 'Only with --topicSource=prod. Can be auto-guessed in the beta cluster.', false, true );
	$this->addOption( 'wikiId', 'Wiki ID to use when fetching scores from the ORES API. '
		. 'Only with --topicSource=prod. Can be auto-guessed in the beta cluster.', false, true );
	$this->addOption( 'pageList', 'Name of a file containing the list of pages to import topics for, '
		. 'one title per line. When omitted, pages with no topics are selected randomly.', false, true );
	$this->addOption( 'verbose', 'Use verbose output' );
	$this->setBatchSize( 50 );
}
83 | |
/**
 * Script entry point: fetch topics for each batch of pages and store them as
 * weighted tags in the search index.
 *
 * The page generator is driven manually instead of with foreach.
 * Generator::send() already resumes the generator and runs it up to its next
 * yield; combined with the implicit next() of a foreach loop the generator
 * would be advanced twice per iteration and every second batch would be
 * silently skipped.
 */
public function execute() {
	$this->init();

	$gen = $this->getPages( $this->getBatchSize() );
	while ( $gen->valid() ) {
		$titleBatch = $gen->current();
		$topics = $this->getTopics( $titleBatch );
		foreach ( $topics as $pageName => $titleTopics ) {
			if ( $this->verbose ) {
				$topicList = urldecode( http_build_query( $titleTopics, '', ', ' ) );
				$this->output( "Adding topics for $pageName: $topicList\n" );
			}
			$title = Title::newFromText( $pageName );
			if ( !$title ) {
				// Title::newFromText() returns null for text that is not a valid title.
				$this->error( $pageName . ': invalid title' );
				continue;
			}
			try {
				$this->cirrusSearch->updateWeightedTags( $title->toPageIdentity(),
					'classification.ores.articletopic', array_keys( $titleTopics ), $titleTopics );
			} catch ( PreconditionException $e ) {
				// Page did not exist
				$this->error( $pageName . ': ' . $e->getMessage() );
			}
		}
		// Tell the generator how many pages got topics. This also moves it on to the
		// next batch, so no separate next() call is needed.
		$gen->send( count( $topics ) );
	}
}
106 | |
/**
 * Check that the script may run in this environment, set up the services it
 * needs and validate the command line options. Aborts with a fatal error when
 * the environment or the options are not acceptable.
 */
private function init() {
	$mwServices = MediaWikiServices::getInstance();
	$developerSetup = GrowthExperimentsServices::wrap( $mwServices )
		->getGrowthConfig()
		->get( 'GEDeveloperSetup' );
	if ( !$developerSetup ) {
		$this->fatalError( 'This script cannot be safely run in production. (If the current '
			. 'environment is not production, $wgGEDeveloperSetup should be set to true.)' );
	}

	$this->cirrusSearch = new CirrusSearch();
	$this->titleFactory = $mwServices->getTitleFactory();
	$this->linkBatchFactory = $mwServices->getLinkBatchFactory();
	$serverName = $this->getConfig()->get( 'Server' );
	$this->isBeta = preg_match( '/\.beta\.wmflabs\./', $serverName );

	$this->topicSource = $this->getOption( 'topicSource', self::TOPIC_SOURCE_PROD );
	$knownSources = [ self::TOPIC_SOURCE_PROD, self::TOPIC_SOURCE_RANDOM ];
	if ( !in_array( $this->topicSource, $knownSources ) ) {
		$this->fatalError( "Invalid value for --topicSource: {$this->topicSource}" );
	}

	if ( $this->topicSource == self::TOPIC_SOURCE_PROD ) {
		$this->apiUrl = $this->getOption( 'apiUrl' );
		$this->wikiId = $this->getOption( 'wikiId' );
		if ( $this->isBeta ) {
			// In the beta cluster, missing values are guessed from the current wiki.
			$this->apiUrl ??= $this->getApiUrl();
			$this->wikiId ??= WikiMap::getCurrentWikiId();
		} else {
			if ( !$this->apiUrl ) {
				$this->fatalError( '--apiUrl is required when --topicSource is prod, '
					. 'unless running in the beta cluster' );
			} elseif ( !$this->wikiId ) {
				$this->fatalError( '--wikiId is required when --topicSource is prod, '
					. 'unless running in the beta cluster' );
			}
		}
	}

	// Exactly one of --pageList and --count must be given.
	$hasPageList = $this->hasOption( 'pageList' );
	$hasCount = $this->hasOption( 'count' );
	if ( $hasPageList && $hasCount ) {
		$this->fatalError( 'It makes no sense to use --count and --pageList together' );
	} elseif ( !$hasPageList && !$hasCount ) {
		$this->fatalError( 'One of --count or --pageList is required' );
	}

	$this->verbose = $this->hasOption( 'verbose' );
}
146 | |
/**
 * Yield the pages to import topics for, in batches.
 *
 * With --pageList, the titles are read from the given file (one per line) and
 * yielded in chunks of $batchSize. Otherwise, pages without any topic are
 * searched for until topics were added to --count pages; in that mode the
 * consumer is expected to send() back, after each batch, the number of pages
 * that received topics.
 *
 * @param int $batchSize Maximum number of titles per batch.
 * @return Generator<Title[]>
 */
private function getPages( int $batchSize ) {
	$pageList = $this->getOption( 'pageList' );
	if ( $pageList ) {
		if ( $pageList[0] !== '/' ) {
			// Relative paths are resolved against the directory the script was started from.
			$pageList = ( $_SERVER['PWD'] ?? getcwd() ) . '/' . $pageList;
		}
		$pages = file_get_contents( $pageList );
		if ( $pages === false ) {
			$this->fatalError( "Could not read $pageList" );
		}
		// Accept both LF and CRLF line endings; a stray carriage return would
		// otherwise stay attached to the end of every title.
		$pages = preg_split( '/\r?\n/', $pages, -1, PREG_SPLIT_NO_EMPTY );
		foreach ( array_chunk( $pages, $batchSize ) as $pageBatch ) {
			$titleBatch = array_filter( array_map( [ $this->titleFactory, 'newFromText' ], $pageBatch ) );
			$this->linkBatchFactory->newLinkBatch( $titleBatch )->execute();
			yield $titleBatch;
		}
	} else {
		// Command line options arrive as strings; validate and convert once.
		$rawCount = (string)$this->getOption( 'count' );
		if ( !ctype_digit( $rawCount ) ) {
			$this->fatalError( "Invalid value for --count: $rawCount" );
		}
		$totalCount = (int)$rawCount;
		$offset = 0;
		while ( $totalCount > 0 ) {
			// Never request more pages than are still needed, so that the last
			// batch cannot push the number of updated pages above --count.
			$limit = min( $batchSize, $totalCount );
			// Exclude Selenium test articles. The search query regex syntax does not seem to
			// allow for \d.
			$searchTerms = [
				'-intitle:/[0-9]{10}/',
				'-articletopic:' . implode( '|', array_keys( ArticleTopicFeature::TERMS_TO_LABELS ) ),
			];
			$titleBatch = $this->search( implode( ' ', $searchTerms ), $limit, $offset );
			if ( !$titleBatch ) {
				$this->fatalError( 'No more articles found' );
			} elseif ( $this->verbose ) {
				$this->output( 'Found ' . count( $titleBatch ) . " articles\n" );
			}
			// The value passed to send() is the number of pages that received topics;
			// it is null (counted as 0) when the consumer advances with next() instead.
			$fixedCount = yield $titleBatch;
			$totalCount -= (int)$fixedCount;
			$offset += $limit;
		}
	}
}
190 | |
/**
 * Get the topic scores for a batch of pages from the configured topic source.
 *
 * With the 'prod' source, the scores are fetched from ORES for the latest
 * revision of the page with the same title on the production wiki. When the
 * wiki has no 'articletopic' model, the scores of the linked enwiki article
 * are used instead.
 *
 * @param Title[] $titles
 * @return int[][] title => topic => score, with scores scaled to 1-1000
 */
private function getTopics( array $titles ): array {
	if ( $this->topicSource === self::TOPIC_SOURCE_RANDOM ) {
		$topics = $this->getTopicsByRandom( $titles );
	} elseif ( $this->topicSource === self::TOPIC_SOURCE_PROD ) {
		$titleStrings = array_map( static function ( Title $title ) {
			return $title->getPrefixedText();
		}, $titles );
		$wikiId = $this->wikiId;
		$apiUrl = $this->apiUrl;
		$titleMap = [];
		$useEnwiki = !$this->hasOresModel( $this->wikiId );

		if ( $useEnwiki ) {
			$wikiId = 'enwiki';
			$apiUrl = 'https://en.wikipedia.org/w/api.php';
			$titleMap = $this->getSiteLinks( $titleStrings, $this->apiUrl, $missingTitles );
			$titleStrings = array_values( $titleMap );
			if ( $this->verbose && $missingTitles ) {
				$this->output( 'not found on enwiki: ' . implode( ', ', $missingTitles ) . "\n" );
			}
			if ( !$titleStrings ) {
				return [];
			}
		}

		if ( $apiUrl === null || $wikiId === null ) {
			throw new RuntimeException( "No API URL ($apiUrl) or wiki ID ($wikiId)" );
		}

		$titleToRevId = $this->titlesToRevisionIds( $titleStrings, $apiUrl, $missingTitles );
		if ( $this->verbose && $missingTitles ) {
			$this->output( 'not found on the production wiki: ' . implode( ', ', $missingTitles ) . "\n" );
		}
		if ( !$titleToRevId ) {
			return [];
		}
		if ( $useEnwiki ) {
			// Translate the enwiki titles back to the local titles they were looked up for.
			// The API may report a title in a different form than the one that was sent;
			// such titles cannot be mapped back and are skipped.
			$reverseTitleMap = array_flip( $titleMap );
			$localTitleToRevId = [];
			foreach ( $titleToRevId as $enwikiTitle => $revId ) {
				if ( isset( $reverseTitleMap[$enwikiTitle] ) ) {
					$localTitleToRevId[$reverseTitleMap[$enwikiTitle]] = $revId;
				} elseif ( $this->verbose ) {
					$this->output( "could not map enwiki title to a local title: $enwikiTitle\n" );
				}
			}
			$titleToRevId = $localTitleToRevId;
			if ( !$titleToRevId ) {
				return [];
			}
		}

		$topics = $this->getTopicsFromOres( $titleToRevId, $wikiId );
	} else {
		throw new LogicException( 'cannot get here' );
	}
	foreach ( $topics as $title => $scores ) {
		foreach ( $scores as $topic => $score ) {
			// Scale probability values to 1-1000. We avoid 0 as ElasticSearch cannot
			// represent it.
			$topics[$title][$topic] = intval( ceil( 1000 * $score ) );
		}
	}
	return $topics;
}
250 | |
/**
 * For a set of titles, make up random topic data: three randomly chosen topics
 * per title, each with a random score.
 * @param Title[] $titles
 * @return float[][] title => topic => score, with scores greater than 0 and at most 1
 */
private function getTopicsByRandom( array $titles ): array {
	$topicScores = [];
	// array_rand() picks keys, so flip the map to pick among its values.
	$topicPool = array_flip( ArticleTopicFeature::TERMS_TO_LABELS );
	foreach ( $titles as $title ) {
		$randomTopics = array_rand( $topicPool, 3 );
		$topicScores[$title->getPrefixedText()] = array_combine( $randomTopics, array_map( static function ( $_ ) {
			// Start at 1 so that the score is never exactly 0, which would also
			// be 0 after scaling.
			return mt_rand( 1, mt_getrandmax() ) / mt_getrandmax();
		}, $randomTopics ) );
	}
	return $topicScores;
}
266 | |
/**
 * Guess the API URL of the production wiki by removing the beta cluster part
 * from the host name of the current wiki. Aborts when the URL of the current
 * wiki does not contain that part.
 * @return string
 */
private function getApiUrl(): string {
	$localUrl = Title::newFromText( 'Title' )->getFullURL();
	$productionUrl = preg_replace( '/\.beta\.wmflabs\./', '.', $localUrl );
	if ( $productionUrl === $localUrl ) {
		// Nothing was replaced, so this is not a beta cluster URL. Stop rather than do
		// something unexpected, such as accidentally running in production.
		$this->fatalError( 'Could not guess production URL' );
	}
	$parts = wfParseUrl( $productionUrl );
	$parts['path'] = '/w/api.php';
	unset( $parts['query'] );
	return wfAssembleUrl( $parts );
}
283 | |
/**
 * Tell whether ORES lists an 'articletopic' model for the wiki.
 * The answer is fetched once and then served from the cached property, so it
 * reflects the wiki ID passed on the first call.
 * @param string $wikiId
 * @return bool
 */
private function hasOresModel( string $wikiId ): bool {
	if ( $this->wikiHasOresModel !== null ) {
		return $this->wikiHasOresModel;
	}
	$modelInfo = $this->getJsonData( 'https://ores.wikimedia.org/v3/scores/', [ 'model_info' => '' ] );
	$this->wikiHasOresModel = isset( $modelInfo[$wikiId]['models']['articletopic'] );
	return $this->wikiHasOresModel;
}
297 | |
/**
 * For a set of revisions, fetch the 'articletopic' scores from ORES.
 * Revisions for which ORES returns no score are reported as errors and left
 * out of the result.
 * @param int[] $revIds revision IDs (keys will be preserved and used in the return value).
 * @param string $wikiId Wiki ID to use for the ORES queries.
 * @return float[][] key => topic => probability, for the predicted topics only.
 */
private function getTopicsFromOres( array $revIds, string $wikiId ): array {
	$oresApiUrl = "https://ores.wikimedia.org/v3/scores/$wikiId";
	$data = $this->getJsonData( $oresApiUrl, [
		'models' => 'articletopic',
		'revids' => implode( '|', $revIds ),
	] );

	$topics = [];
	$revIdKeys = array_flip( $revIds );
	foreach ( $data[$wikiId]['scores'] ?? [] as $revId => $scores ) {
		$modelResult = $scores['articletopic'] ?? [];
		if ( !isset( $modelResult['score'] ) ) {
			// ORES reports per-revision failures with an 'error' entry in place of 'score'.
			$reason = $modelResult['error']['message'] ?? 'unknown error';
			$this->error( "No ORES score for revision $revId: $reason" );
			continue;
		}
		$topicScores = [];
		foreach ( $modelResult['score']['prediction'] as $topic ) {
			$topicScores[$topic] = $modelResult['score']['probability'][$topic];
		}
		$topics[$revIdKeys[$revId]] = $topicScores;
	}
	return $topics;
}
323 | |
/**
 * Gets the English-language interlanguage links (langlinks with lllang=en) for
 * a batch of pages.
 * @param string[] $titles Titles as prefixed text.
 * @param string $apiUrl MediaWiki API URL of the wiki the titles belong to.
 * @param string[]|null &$missingTitles Returns the list of titles (as prefixed text) which are not found.
 * @return string[] Title => enwiki title
 */
private function getSiteLinks( array $titles, string $apiUrl, ?array &$missingTitles = null ): array {
	// Only parameters of the langlinks module are sent; 'rvprop' belongs to the
	// revisions module and has no meaning in this query.
	$data = $this->getJsonData( $apiUrl, [
		'action' => 'query',
		'prop' => 'langlinks',
		'titles' => implode( '|', $titles ),
		'lllang' => 'en',
		'lllimit' => 'max',
	], true );
	$siteLinks = [];
	foreach ( $data['query']['pages'] as $page ) {
		if ( isset( $page['langlinks'] ) ) {
			// Only English links were requested, so the first entry is the enwiki title.
			$siteLinks[$page['title']] = $page['langlinks'][0]['title'];
		}
	}
	$missingTitles = array_diff( $titles, array_keys( $siteLinks ) );
	return $siteLinks;
}
349 | |
/**
 * Look up the revision ID the API returns for each of a batch of pages.
 * @param string[] $titles Titles as prefixed text.
 * @param string $apiUrl MediaWiki API URL of the wiki the titles belong to.
 * @param string[]|null &$missingTitles Returns the list of titles (as prefixed text) which are not found.
 * @return int[] title as prefixed text => rev ID
 */
private function titlesToRevisionIds( array $titles, string $apiUrl, ?array &$missingTitles = null ): array {
	$data = $this->getJsonData( $apiUrl, [
		'action' => 'query',
		'prop' => 'revisions',
		'rvprop' => 'ids',
		'titles' => implode( '|', $titles ),
	], true );

	$titleToRevId = [];
	foreach ( $data['query']['pages'] as $row ) {
		// Pages without a 'revisions' entry were not found and are reported as missing.
		if ( isset( $row['revisions'] ) ) {
			$titleToRevId[$row['title']] = $row['revisions'][0]['revid'];
		}
	}

	$missingTitles = array_diff( $titles, array_keys( $titleToRevId ) );

	return $titleToRevId;
}
376 | |
/**
 * Fetch a URL and return its decoded JSON content; aborts the script with the
 * error text when the request does not succeed.
 * @param string $url JSON URL
 * @param string[] $parameters Query parameters
 * @param bool $isMediaWikiApiUrl Whether the URL is a MediaWiki API endpoint
 *   (requested through Util::getApiUrl(), with English error messages).
 * @return mixed A JSON value
 */
private function getJsonData( string $url, array $parameters = [], bool $isMediaWikiApiUrl = false ) {
	$httpFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
	if ( $isMediaWikiApiUrl ) {
		$status = Util::getApiUrl( $httpFactory, $url, $parameters + [ 'errorlang' => 'en' ] );
	} else {
		$fullUrl = $parameters ? $url . '?' . wfArrayToCgi( $parameters ) : $url;
		$status = Util::getJsonUrl( $httpFactory, $fullUrl );
	}
	if ( !$status->isOK() ) {
		$errorText = Status::wrap( $status )->getWikiText( false, false, 'en' );
		$this->fatalError( $errorText );
	}
	return $status->getValue();
}
398 | |
/**
 * Run a search query in the main namespace, without suggestions and with the
 * 'none' sort order, and return the titles of the matches. Aborts the script
 * when the search fails or is disabled.
 * @param string $query Search query
 * @param int $limit
 * @param int $offset
 * @return Title[]
 */
private function search( string $query, int $limit, int $offset ): array {
	$engine = MediaWikiServices::getInstance()->newSearchEngine();
	$engine->setLimitOffset( $limit, $offset );
	$engine->setNamespaces( [ NS_MAIN ] );
	$engine->setShowSuggestion( false );
	$engine->setSort( 'none' );

	$result = $engine->searchText( $query );
	if ( $result === null ) {
		// A null result is reported like any other search failure.
		$result = StatusValue::newFatal( 'rawmessage', 'Search is disabled' );
	}
	if ( $result instanceof StatusValue ) {
		if ( !$result->isOK() ) {
			$this->fatalError( Status::wrap( $result )->getWikiText( false, false, 'en' ) );
		} else {
			$result = $result->getValue();
		}
	}
	return $result->extractTitles();
}
423 | |
424 | } |
425 | |
// Register the class above as the maintenance script of this file, then include
// the file named by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = ImportOresTopics::class;
require_once RUN_MAINTENANCE_IF_MAIN;