Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 128
0.00% covered (danger)
0.00%
0 / 11
CRAP
0.00% covered (danger)
0.00%
0 / 1
FixLinkRecommendationData
0.00% covered (danger)
0.00%
0 / 122
0.00% covered (danger)
0.00%
0 / 11
1406
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 16
0.00% covered (danger)
0.00%
0 / 1
2
 execute
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
30
 init
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
30
 fixSearchIndex
0.00% covered (danger)
0.00%
0 / 30
0.00% covered (danger)
0.00%
0 / 1
56
 fixDatabaseTable
0.00% covered (danger)
0.00%
0 / 16
0.00% covered (danger)
0.00%
0 / 1
42
 search
0.00% covered (danger)
0.00%
0 / 14
0.00% covered (danger)
0.00%
0 / 1
20
 getRandomSeed
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 titlesToPageIds
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 pageIdsToPageRecords
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 verboseOutput
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
6
 maybeReportFixedCount
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
1<?php
2
3namespace GrowthExperiments\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\Query\ArticleTopicFeature;
7use GrowthExperiments\GrowthExperimentsServices;
8use GrowthExperiments\NewcomerTasks\AddLink\LinkRecommendationStore;
9use GrowthExperiments\NewcomerTasks\ConfigurationLoader\ConfigurationLoader;
10use GrowthExperiments\NewcomerTasks\TaskType\LinkRecommendationTaskType;
11use GrowthExperiments\NewcomerTasks\TaskType\LinkRecommendationTaskTypeHandler;
12use Maintenance;
13use MediaWiki\Cache\LinkBatchFactory;
14use MediaWiki\MediaWikiServices;
15use MediaWiki\Page\PageRecord;
16use MediaWiki\Page\PageStore;
17use MediaWiki\Status\Status;
18use MediaWiki\Title\Title;
19use MediaWiki\Title\TitleFormatter;
20use StatusValue;
21
// Resolve the MediaWiki installation directory: use the MW_INSTALL_PATH environment
// variable when it is set, otherwise fall back to three directory levels above this file.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = $IP === false ? __DIR__ . '/../../..' : $IP;
require_once $IP . '/maintenance/Maintenance.php';
27
/**
 * Aligns link recommendation data in the growthexperiments_link_recommendations table and the
 * search index. Useful for fixing test setups if the DB or the index gets messed up somehow.
 *
 * No attempt is made to handle replication lag, delayed search index updates due to job queue
 * size or batching, and similar potential race conditions. As such, this script is not appropriate
 * for production use.
 */
class FixLinkRecommendationData extends Maintenance {

    /**
     * Upper bound for offset + limit of a single search query issued by fixSearchIndex().
     * Search offsets are limited to 10K, so paging never goes past this window.
     */
    private const MAX_SEARCH_WINDOW = 10000;

    /** @var ConfigurationLoader */
    private $configurationLoader;

    /** @var LinkRecommendationStore */
    private $linkRecommendationStore;

    /** @var CirrusSearch */
    private $cirrusSearch;

    /** @var LinkBatchFactory */
    private $linkBatchFactory;

    /** @var PageStore */
    private $pageStore;

    /** @var TitleFormatter */
    private $titleFormatter;

    /** @var int|null Lazily generated by getRandomSeed(); null until first use. */
    private $randomSeed;

    /** @var LinkRecommendationTaskType */
    private $linkRecommendationTaskType;

    /**
     * Declares the required extensions, the script description, the command line
     * options and the default batch size (100).
     */
    public function __construct() {
        parent::__construct();
        $this->requireExtension( 'GrowthExperiments' );
        $this->requireExtension( 'CirrusSearch' );

        $this->addDescription( 'Aligns link recommendation data in the '
            . 'growthexperiments_link_recommendations table and the search index, by deleting table rows '
            . 'without a matching search index entry and/or search index entries without a matching table row.' );
        $this->addOption( 'search-index', 'Delete search index entries which do not match the DB table. '
            . '(Note that this relies on the job queue to work.)' );
        $this->addOption( 'random', 'Sort randomly. Applies to --search-index only. '
            . 'This is mainly useful with --statsd.' );
        $this->addOption( 'db-table', 'Delete DB table entries which do not match the search index.' );
        $this->addOption( 'dry-run', 'Run without making any changes.' );
        $this->addOption( 'statsd', 'Report the number of fixes (or would-be fixes, '
            . 'when called with --dry-run) to statsd' );
        $this->addOption( 'verbose', 'Show debug output.' );
        $this->setBatchSize( 100 );
    }

    /** @inheritDoc */
    public function execute() {
        $fixSearchIndex = $this->hasOption( 'search-index' );
        $fixDatabaseTable = $this->hasOption( 'db-table' );
        // Validate the invocation before touching any services, so that a usage mistake is
        // reported as such instead of being masked by a configuration error from init().
        if ( !$fixSearchIndex && !$fixDatabaseTable ) {
            $this->fatalError( 'At least one of --search-index and --db-table must be specified.' );
        }
        $this->init();
        if ( $fixSearchIndex ) {
            $this->verboseOutput( "Removing search index entries not found in the database...\n" );
            $this->fixSearchIndex();
        }
        if ( $fixDatabaseTable ) {
            $this->verboseOutput( "Removing database entries not found in the search index...\n" );
            $this->fixDatabaseTable();
        }
    }

    /**
     * Fetches the services used by the script and validates the environment.
     *
     * Aborts with a fatal error when --db-table is requested without --dry-run while
     * GEDeveloperSetup is not enabled, or when the configured task type with the
     * link recommendation ID is not a LinkRecommendationTaskType.
     */
    public function init() {
        $services = MediaWikiServices::getInstance();
        $growthServices = GrowthExperimentsServices::wrap( $services );
        if ( $this->hasOption( 'db-table' )
            && !$this->hasOption( 'dry-run' )
            && !$growthServices->getGrowthConfig()->get( 'GEDeveloperSetup' )
        ) {
            // Adding search index entries is batched in production, and takes hours. This script would delete
            // the associated DB records in the meantime.
            $this->fatalError( 'The --db-table option cannot be safely run in production. (If the current '
                . 'environment is not production, $wgGEDeveloperSetup should be set to true.)' );
        }
        $this->configurationLoader = $growthServices->getNewcomerTasksConfigurationLoader();
        $this->linkRecommendationStore = $growthServices->getLinkRecommendationStore();
        $this->cirrusSearch = new CirrusSearch();
        $this->linkBatchFactory = $services->getLinkBatchFactory();
        $this->pageStore = $services->getPageStore();
        $this->titleFormatter = $services->getTitleFormatter();

        $taskTypes = $this->configurationLoader->getTaskTypes();
        $linkRecommendationTaskType = $taskTypes[LinkRecommendationTaskTypeHandler::TASK_TYPE_ID] ?? null;
        if ( !$linkRecommendationTaskType instanceof LinkRecommendationTaskType ) {
            $this->fatalError( sprintf( "'%s' is not a link recommendation task type",
                LinkRecommendationTaskTypeHandler::TASK_TYPE_ID ) );
        } else {
            $this->linkRecommendationTaskType = $linkRecommendationTaskType;
        }
    }

    /**
     * Resets the 'recommendation.link' weighted tag on pages which the search index lists
     * as having a link recommendation but which the link recommendation store does not
     * return from filterPageIds().
     *
     * With --dry-run nothing is changed; the pages are only listed (in verbose mode) and counted.
     */
    private function fixSearchIndex(): void {
        $dryRun = $this->hasOption( 'dry-run' );
        $fixing = $dryRun ? 'Would fix' : 'Fixing';
        $batchSize = $this->getBatchSize();
        // --random is a flag, so only its presence matters.
        $randomize = $this->hasOption( 'random' );
        $fixedCount = 0;
        // Page IDs handled so far; used to avoid handling (and counting) a page twice
        // when it shows up in the results of more than one topic.
        $pageIdsFixed = [];

        $oresTopics = array_keys( ArticleTopicFeature::TERMS_TO_LABELS );
        // Search offsets are limited to 10K. Search topic by topic. This is still not a 100%
        // guarantee that we'll avoid a >10K result set, but it's the best we can do.
        foreach ( $oresTopics as $oresTopic ) {
            $from = 0;
            $this->verboseOutput( "  checking topic $oresTopic...\n" );
            $searchQuery = "hasrecommendation:link articletopic:$oresTopic";
            while ( $from < self::MAX_SEARCH_WINDOW ) {
                // Never request results past the search window, whatever the batch size is.
                $limit = min( $batchSize, self::MAX_SEARCH_WINDOW - $from );
                $titles = $this->search( $searchQuery, $limit, $from, $randomize );
                if ( !$titles ) {
                    break;
                }
                $this->verboseOutput( '    checking ' . count( $titles ) . " titles...\n" );
                $pageIdsToCheck = $this->titlesToPageIds( $titles );
                $pageIdsToFix = array_diff( $pageIdsToCheck,
                    $this->linkRecommendationStore->filterPageIds( $pageIdsToCheck ) );
                $pageIdsToFix = array_diff( $pageIdsToFix, $pageIdsFixed );
                $pagesToFix = $this->pageIdsToPageRecords( $pageIdsToFix );

                foreach ( $pagesToFix as $pageRecord ) {
                    $this->verboseOutput(
                        "    $fixing " . $this->titleFormatter->getPrefixedText( $pageRecord ) . "\n"
                    );
                    if ( !$dryRun ) {
                        $this->cirrusSearch->resetWeightedTags( $pageRecord, 'recommendation.link' );
                    }
                    $pageIdsFixed[] = $pageRecord->getId();
                }
                $fixedCount += count( $pagesToFix );
                $from += $limit;
                if ( $from >= self::MAX_SEARCH_WINDOW ) {
                    // The whole window was consumed and its last batch was not empty, so
                    // there may be further results which cannot be reached.
                    $this->error( "  topic $oresTopic had more than 10K tasks" );
                }
            }
        }
        $this->maybeReportFixedCount( $fixedCount, 'search-index' );
    }

    /**
     * Deletes link recommendation store entries for pages which the search index does not
     * list as having a link recommendation.
     *
     * The store is walked in batches of page IDs. With --dry-run nothing is deleted; the
     * pages are only listed (in verbose mode) and counted.
     */
    private function fixDatabaseTable(): void {
        $dryRun = $this->hasOption( 'dry-run' );
        $fixing = $dryRun ? 'Would fix' : 'Fixing';
        $batchSize = $this->getBatchSize();
        $from = null;
        $fixedCount = 0;
        // phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
        while ( $pageIds = $this->linkRecommendationStore->listPageIds( $batchSize, $from ) ) {
            $this->verboseOutput( '  checking ' . count( $pageIds ) . " titles...\n" );
            // Of the pages in this batch, find those which lack the recommendation in the index.
            $titlesToFix = $this->search( '-hasrecommendation:link pageid:' . implode( '|', $pageIds ),
                $batchSize, 0 );
            $pageIdsToFix = $this->titlesToPageIds( $titlesToFix );
            foreach ( $titlesToFix as $title ) {
                $this->verboseOutput( "    $fixing " . $title->getPrefixedText() . "\n" );
            }
            if ( $pageIdsToFix && !$dryRun ) {
                $this->linkRecommendationStore->deleteByPageIds( $pageIdsToFix );
                $this->commitTransaction( $this->linkRecommendationStore->getDB( DB_PRIMARY ), __METHOD__ );
            }
            // Continue after the last page ID of this batch.
            $from = end( $pageIds );
            $fixedCount += count( $pageIdsToFix );
        }
        $this->maybeReportFixedCount( $fixedCount, 'db-table' );
    }

    /**
     * Do a CirrusSearch query.
     *
     * Aborts with a fatal error when search is disabled or the search fails.
     *
     * @param string $query Search query
     * @param int $limit
     * @param int $offset
     * @param bool $randomize Use random sorting
     * @return Title[]
     */
    private function search( string $query, int $limit, int $offset, bool $randomize = false ): array {
        $searchEngine = MediaWikiServices::getInstance()->newSearchEngine();
        $searchEngine->setLimitOffset( $limit, $offset );
        $searchEngine->setShowSuggestion( false );
        if ( $randomize ) {
            // Reuse one seed for all queries of this run.
            $searchEngine->setFeatureData( 'random_seed', $this->getRandomSeed() );
            $searchEngine->setSort( 'random' );
        } else {
            // Sort by creation date as it's stable over time.
            $searchEngine->setSort( 'create_timestamp_asc' );
        }
        // A null result is treated as "search is disabled" and handled like any other failure.
        $matches = $searchEngine->searchText( $query )
            ?? StatusValue::newFatal( 'rawmessage', 'Search is disabled' );
        if ( $matches instanceof StatusValue ) {
            if ( $matches->isOK() ) {
                $matches = $matches->getValue();
            } else {
                $this->fatalError( Status::wrap( $matches )->getWikiText( false, false, 'en' ) );
            }
        }
        return $matches->extractTitles();
    }

    /**
     * Helper method for a random value that remains the same during successive calls.
     * @return int
     */
    private function getRandomSeed(): int {
        if ( $this->randomSeed === null ) {
            $this->randomSeed = random_int( 0, PHP_INT_MAX );
        }
        return $this->randomSeed;
    }

    /**
     * Looks up the page IDs of the given titles with a single link batch.
     *
     * @param Title[] $titles
     * @return int[]
     */
    private function titlesToPageIds( array $titles ): array {
        $linkBatch = $this->linkBatchFactory->newLinkBatch( $titles );
        return $linkBatch->execute();
    }

    /**
     * Loads the page records for the given page IDs.
     *
     * @param int[] $pageIds
     * @return PageRecord[]
     */
    private function pageIdsToPageRecords( array $pageIds ): array {
        $pageRecords = $this->pageStore
            ->newSelectQueryBuilder()
            ->wherePageIds( $pageIds )
            ->caller( __METHOD__ )
            ->fetchPageRecords();
        return iterator_to_array( $pageRecords );
    }

    /**
     * Prints the given text only when the script was called with --verbose.
     *
     * @param string $output
     */
    private function verboseOutput( string $output ): void {
        if ( $this->hasOption( 'verbose' ) ) {
            $this->output( $output );
        }
    }

    /**
     * Reports the number of fixes to statsd when the script was called with --statsd.
     *
     * The metric is named growthexperiments.<fixed|fixable>.link-recommendation.<type>,
     * where 'fixable' is used for dry runs.
     *
     * @param int $count Number of fixed (or, with --dry-run, fixable) entries
     * @param string $type Kind of fix the count belongs to ('search-index' or 'db-table')
     */
    private function maybeReportFixedCount( int $count, string $type ): void {
        if ( !$this->hasOption( 'statsd' ) ) {
            return;
        }
        $fixWord = $this->hasOption( 'dry-run' ) ? 'fixable' : 'fixed';
        $dataFactory = MediaWikiServices::getInstance()->getPerDbNameStatsdDataFactory();
        $dataFactory->updateCount( "growthexperiments.$fixWord.link-recommendation.$type", $count );
    }

}
273
// Name the maintenance class defined in this file for the maintenance runner, then hand
// over to it. NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably
// it only runs the script when this file is the entry point - confirm against MediaWiki core.
$maintClass = FixLinkRecommendationData::class;
require_once RUN_MAINTENANCE_IF_MAIN;