Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 128 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
FixLinkRecommendationData | |
0.00% |
0 / 122 |
|
0.00% |
0 / 11 |
1406 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
30 | |||
init | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
30 | |||
fixSearchIndex | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
56 | |||
fixDatabaseTable | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
42 | |||
search | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
getRandomSeed | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
titlesToPageIds | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
pageIdsToPageRecords | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
verboseOutput | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
maybeReportFixedCount | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | |
3 | namespace GrowthExperiments\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\Query\ArticleTopicFeature; |
7 | use GrowthExperiments\GrowthExperimentsServices; |
8 | use GrowthExperiments\NewcomerTasks\AddLink\LinkRecommendationStore; |
9 | use GrowthExperiments\NewcomerTasks\ConfigurationLoader\ConfigurationLoader; |
10 | use GrowthExperiments\NewcomerTasks\TaskType\LinkRecommendationTaskType; |
11 | use GrowthExperiments\NewcomerTasks\TaskType\LinkRecommendationTaskTypeHandler; |
12 | use Maintenance; |
13 | use MediaWiki\Cache\LinkBatchFactory; |
14 | use MediaWiki\MediaWikiServices; |
15 | use MediaWiki\Page\PageRecord; |
16 | use MediaWiki\Page\PageStore; |
17 | use MediaWiki\Status\Status; |
18 | use MediaWiki\Title\Title; |
19 | use MediaWiki\Title\TitleFormatter; |
20 | use StatusValue; |
21 | |
// Locate the MediaWiki installation: honour MW_INSTALL_PATH when it is set in the
// environment, otherwise assume this file lives three directories below the core root.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = $IP !== false ? $IP : __DIR__ . '/../../..';
require_once $IP . '/maintenance/Maintenance.php';
27 | |
/**
 * Aligns link recommendation data in the growthexperiments_link_recommendations table and the
 * search index. Useful for fixing test setups if the DB or the index gets messed up somehow.
 *
 * No attempt is made to handle replication lag, delayed search index updates due to job queue
 * size or batching, and similar potential race conditions. As such, this script is not appropriate
 * for production use.
 */
class FixLinkRecommendationData extends Maintenance {

	/**
	 * Largest result window (offset + limit) a single search query may address.
	 * Search offsets are limited to 10K.
	 */
	private const MAX_SEARCH_WINDOW = 10000;

	/** @var ConfigurationLoader */
	private $configurationLoader;

	/** @var LinkRecommendationStore */
	private $linkRecommendationStore;

	/** @var CirrusSearch */
	private $cirrusSearch;

	/** @var LinkBatchFactory */
	private $linkBatchFactory;

	/** @var PageStore */
	private $pageStore;

	/** @var TitleFormatter */
	private $titleFormatter;

	/** @var int|null Lazily generated seed, shared by every randomized search of one run. */
	private $randomSeed;

	/** @var LinkRecommendationTaskType */
	private $linkRecommendationTaskType;

	/**
	 * Declares the required extensions, the command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->requireExtension( 'GrowthExperiments' );
		$this->requireExtension( 'CirrusSearch' );

		$this->addDescription( 'Aligns link recommendation data in the '
			. 'growthexperiments_link_recommendations table and the search index, by deleting table rows '
			. 'without a matching search index entry and/or search index entries without a matching table row.' );
		$this->addOption( 'search-index', 'Delete search index entries which do not match the DB table. '
			. '(Note that this relies on the job queue to work.)' );
		$this->addOption( 'random', 'Sort randomly. Applies to --search-index only. '
			. 'This is mainly useful with --statsd.' );
		$this->addOption( 'db-table', 'Delete DB table entries which do not match the search index.' );
		$this->addOption( 'dry-run', 'Run without making any changes.' );
		$this->addOption( 'statsd', 'Report the number of fixes (or would-be fixes, '
			. 'when called with --dry-run) to statsd' );
		$this->addOption( 'verbose', 'Show debug output.' );
		$this->setBatchSize( 100 );
	}

	/** @inheritDoc */
	public function execute() {
		$fixSearchIndex = $this->hasOption( 'search-index' );
		$fixDatabaseTable = $this->hasOption( 'db-table' );
		// Validate the command line before touching any service, so that a usage error is
		// reported first.
		if ( !$fixSearchIndex && !$fixDatabaseTable ) {
			$this->fatalError( 'At least one of --search-index and --db-table must be specified.' );
		}

		$this->init();
		if ( $fixSearchIndex ) {
			$this->verboseOutput( "Removing search index entries not found in the database...\n" );
			$this->fixSearchIndex();
		}
		if ( $fixDatabaseTable ) {
			$this->verboseOutput( "Removing database entries not found in the search index...\n" );
			$this->fixDatabaseTable();
		}
	}

	/**
	 * Resolves the services used by the script and the link recommendation task type.
	 *
	 * Aborts with a fatal error when --db-table is requested without --dry-run outside a developer
	 * setup, or when the configured task type is not a link recommendation task type.
	 */
	public function init() {
		$services = MediaWikiServices::getInstance();
		$growthServices = GrowthExperimentsServices::wrap( $services );
		if ( $this->hasOption( 'db-table' )
			&& !$this->hasOption( 'dry-run' )
			&& !$growthServices->getGrowthConfig()->get( 'GEDeveloperSetup' )
		) {
			// Adding search index entries is batched in production, and takes hours. This script would delete
			// the associated DB records in the meantime.
			$this->fatalError( 'The --db-table option cannot be safely run in production. (If the current '
				. 'environment is not production, $wgGEDeveloperSetup should be set to true.)' );
		}
		$this->configurationLoader = $growthServices->getNewcomerTasksConfigurationLoader();
		$this->linkRecommendationStore = $growthServices->getLinkRecommendationStore();
		$this->cirrusSearch = new CirrusSearch();
		$this->linkBatchFactory = $services->getLinkBatchFactory();
		$this->pageStore = $services->getPageStore();
		$this->titleFormatter = $services->getTitleFormatter();

		$taskTypes = $this->configurationLoader->getTaskTypes();
		$linkRecommendationTaskType = $taskTypes[LinkRecommendationTaskTypeHandler::TASK_TYPE_ID] ?? null;
		if ( !$linkRecommendationTaskType instanceof LinkRecommendationTaskType ) {
			$this->fatalError( sprintf( "'%s' is not a link recommendation task type",
				LinkRecommendationTaskTypeHandler::TASK_TYPE_ID ) );
		} else {
			$this->linkRecommendationTaskType = $linkRecommendationTaskType;
		}
	}

	/**
	 * Resets the link recommendation tag of every page which the search index lists as having a
	 * recommendation while the database table has no row for it.
	 */
	private function fixSearchIndex(): void {
		$dryRun = $this->hasOption( 'dry-run' );
		$fixing = $dryRun ? 'Would fix' : 'Fixing';
		$batchSize = $this->getBatchSize();
		// --random is a flag, so its presence is the value.
		$randomize = $this->hasOption( 'random' );
		$fixedCount = 0;
		$pageIdsFixed = [];

		$oresTopics = array_keys( ArticleTopicFeature::TERMS_TO_LABELS );
		// Search offsets are limited to 10K. Search topic by topic. This is still not a 100%
		// guarantee that we'll avoid a >10K result set, but it's the best we can do.
		foreach ( $oresTopics as $oresTopic ) {
			$from = 0;
			$this->verboseOutput( " checking topic $oresTopic...\n" );
			$searchQuery = "hasrecommendation:link articletopic:$oresTopic";
			while ( $from < self::MAX_SEARCH_WINDOW ) {
				// Never request past the end of the window; the last page may be shorter than a batch.
				$limit = min( $batchSize, self::MAX_SEARCH_WINDOW - $from );
				$titles = $this->search( $searchQuery, $limit, $from, $randomize );
				if ( !$titles ) {
					break;
				}
				$this->verboseOutput( ' checking ' . count( $titles ) . " titles...\n" );
				$pageIdsToCheck = $this->titlesToPageIds( $titles );
				$pageIdsToFix = array_diff( $pageIdsToCheck,
					$this->linkRecommendationStore->filterPageIds( $pageIdsToCheck ) );
				// A page can match several topics; only handle (and count) it once.
				$pageIdsToFix = array_diff( $pageIdsToFix, $pageIdsFixed );
				$pagesToFix = $this->pageIdsToPageRecords( $pageIdsToFix );

				foreach ( $pagesToFix as $pageRecord ) {
					$this->verboseOutput(
						" $fixing " . $this->titleFormatter->getPrefixedText( $pageRecord ) . "\n"
					);
					if ( !$dryRun ) {
						$this->cirrusSearch->resetWeightedTags( $pageRecord, 'recommendation.link' );
					}
					$pageIdsFixed[] = $pageRecord->getId();
				}
				$fixedCount += count( $pagesToFix );

				$from += $limit;
				if ( $from >= self::MAX_SEARCH_WINDOW && count( $titles ) >= $limit ) {
					// The last reachable page was full, so further results may exist which
					// cannot be paged to.
					$this->error( " topic $oresTopic had more than 10K tasks" );
				}
			}
		}
		$this->maybeReportFixedCount( $fixedCount, 'search-index' );
	}

	/**
	 * Deletes every database table row whose page the search index does not list as having a
	 * link recommendation.
	 */
	private function fixDatabaseTable(): void {
		$dryRun = $this->hasOption( 'dry-run' );
		$fixing = $dryRun ? 'Would fix' : 'Fixing';
		$batchSize = $this->getBatchSize();
		$from = null;
		$fixedCount = 0;
		// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
		while ( $pageIds = $this->linkRecommendationStore->listPageIds( $batchSize, $from ) ) {
			$this->verboseOutput( ' checking ' . count( $pageIds ) . " titles...\n" );
			// Of the pages in this batch, find the ones the index does NOT flag as having a recommendation.
			$titlesToFix = $this->search( '-hasrecommendation:link pageid:' . implode( '|', $pageIds ),
				$batchSize, 0 );
			$pageIdsToFix = $this->titlesToPageIds( $titlesToFix );
			foreach ( $titlesToFix as $title ) {
				$this->verboseOutput( " $fixing " . $title->getPrefixedText() . "\n" );
			}
			if ( $pageIdsToFix && !$dryRun ) {
				$this->linkRecommendationStore->deleteByPageIds( $pageIdsToFix );
				$this->commitTransaction( $this->linkRecommendationStore->getDB( DB_PRIMARY ), __METHOD__ );
			}
			// Continue after the last page ID of this batch.
			$from = end( $pageIds );
			$fixedCount += count( $pageIdsToFix );
		}
		$this->maybeReportFixedCount( $fixedCount, 'db-table' );
	}

	/**
	 * Do a CirrusSearch query. Aborts with a fatal error when the search fails or is disabled.
	 * @param string $query Search query
	 * @param int $limit
	 * @param int $offset
	 * @param bool $randomize Use random sorting
	 * @return Title[]
	 */
	private function search( string $query, int $limit, int $offset, bool $randomize = false ): array {
		$searchEngine = MediaWikiServices::getInstance()->newSearchEngine();
		$searchEngine->setLimitOffset( $limit, $offset );
		$searchEngine->setShowSuggestion( false );
		if ( $randomize ) {
			// Reuse one seed for the whole run so that successive pages belong to the same ordering.
			$searchEngine->setFeatureData( 'random_seed', $this->getRandomSeed() );
			$searchEngine->setSort( 'random' );
		} else {
			// Sort by creation date as it's stable over time.
			$searchEngine->setSort( 'create_timestamp_asc' );
		}
		$matches = $searchEngine->searchText( $query )
			?? StatusValue::newFatal( 'rawmessage', 'Search is disabled' );
		if ( $matches instanceof StatusValue ) {
			if ( $matches->isOK() ) {
				$matches = $matches->getValue();
			} else {
				$this->fatalError( Status::wrap( $matches )->getWikiText( false, false, 'en' ) );
			}
		}
		return $matches->extractTitles();
	}

	/**
	 * Helper method for a random value that remains the same during successive calls.
	 * @return int
	 */
	private function getRandomSeed(): int {
		if ( $this->randomSeed === null ) {
			$this->randomSeed = random_int( 0, PHP_INT_MAX );
		}
		return $this->randomSeed;
	}

	/**
	 * Look up the page IDs of the given titles. Titles without a page are left out.
	 * @param Title[] $titles
	 * @return int[] List of page IDs
	 */
	private function titlesToPageIds( array $titles ): array {
		if ( !$titles ) {
			return [];
		}
		$linkBatch = $this->linkBatchFactory->newLinkBatch( $titles );
		// LinkBatch::execute() maps prefixed DB keys to page IDs and uses 0 for missing pages;
		// drop those placeholders so callers only ever see (and count) real page IDs.
		return array_values( array_filter( $linkBatch->execute() ) );
	}

	/**
	 * Load the page records for the given page IDs.
	 * @param int[] $pageIds
	 * @return PageRecord[]
	 */
	private function pageIdsToPageRecords( array $pageIds ): array {
		if ( !$pageIds ) {
			// Nothing to look up (the common case when the index is healthy); skip the query.
			return [];
		}
		$pageRecords = $this->pageStore
			->newSelectQueryBuilder()
			->wherePageIds( array_values( $pageIds ) )
			->caller( __METHOD__ )
			->fetchPageRecords();
		return iterator_to_array( $pageRecords );
	}

	/**
	 * Print the given text only when --verbose was passed.
	 * @param string $output
	 */
	private function verboseOutput( string $output ): void {
		if ( $this->hasOption( 'verbose' ) ) {
			$this->output( $output );
		}
	}

	/**
	 * Report the number of fixes to statsd when --statsd was passed; the metric is named
	 * "fixable" instead of "fixed" for dry runs.
	 * @param int $count Number of entries fixed (or that would have been fixed)
	 * @param string $type Last segment of the metric name ('search-index' or 'db-table')
	 */
	private function maybeReportFixedCount( int $count, string $type ): void {
		if ( !$this->hasOption( 'statsd' ) ) {
			return;
		}
		$fixWord = $this->hasOption( 'dry-run' ) ? 'fixable' : 'fixed';
		$dataFactory = MediaWikiServices::getInstance()->getPerDbNameStatsdDataFactory();
		$dataFactory->updateCount( "growthexperiments.$fixWord.link-recommendation.$type", $count );
	}

}
273 | |
// Register this script with the maintenance runner, which executes it when the file is the
// entry point of the process.
$maintClass = \GrowthExperiments\Maintenance\FixLinkRecommendationData::class;
require_once RUN_MAINTENANCE_IF_MAIN;