MediaWiki master
CategoryMembershipChangeJob.php
Go to the documentation of this file.
1<?php
8
25use Wikimedia\Timestamp\TimestampFormat as TS;
26
/**
 * Transaction ticket obtained in run() via getEmptyTransactionTicket() and passed to
 * commitAndWaitForReplication() when notifications are flushed in batches.
 */
43 private $ticket;
44
/** Factory stored by the constructor and handed to each CategoryMembershipChange helper. */
45 private RecentChangeFactory $recentChangeFactory;
46
/**
 * Number of seconds subtracted from the job's revision timestamp in run(), so that jobs
 * enqueued out of revision order still pick up slightly older revisions.
 */
47 private const ENQUEUE_FUDGE_SEC = 60;
48
/**
 * Build the job specification for a category membership change on a page.
 *
 * Duplicate removal is enabled and ignores 'revTimestamp', so specifications that differ
 * only in revision timestamp are treated as duplicates of one another.
 *
 * @param PageIdentity $page Page whose ID is stored as the 'pageId' parameter
 * @param mixed $revisionTimestamp Stored as 'revTimestamp'; run() passes it to wfTimestamp()
 * @param bool $forImport Stored as 'forImport' and forwarded to CategoryMembershipChange
 * @return JobSpecification
 */
public static function newSpec( PageIdentity $page, $revisionTimestamp, bool $forImport ) {
	$jobParams = [
		'pageId' => $page->getId(),
		'revTimestamp' => $revisionTimestamp,
		'forImport' => $forImport,
	];
	$jobOptions = [
		'removeDuplicates' => true,
		'removeDuplicatesIgnoreParams' => [ 'revTimestamp' ]
	];

	return new JobSpecification( 'categoryMembershipChange', $jobParams, $jobOptions, $page );
}
70
/**
 * Create a 'categoryMembershipChange' job for the given page.
 *
 * @param PageIdentity $page Page the job applies to
 * @param array $params Job parameters; this class reads 'pageId', 'revTimestamp' and
 *  (optionally) 'forImport'
 * @param RecentChangeFactory $recentChangeFactory Passed on to CategoryMembershipChange
 */
public function __construct(
	PageIdentity $page,
	array $params,
	RecentChangeFactory $recentChangeFactory
) {
	parent::__construct( 'categoryMembershipChange', $page, $params );

	// A single job per page is enough. ENQUEUE_FUDGE_SEC covers the race in which a job
	// for an older revision is inserted while the job for a newer one is de-duplicated.
	$this->removeDuplicates = true;
	$this->recentChangeFactory = $recentChangeFactory;
}
89
/**
 * Run the job: emit category-membership notifications for revisions of the page that do
 * not have them yet.
 *
 * The method waits for the replica DB to catch up, takes a per-page named lock, finds the
 * newest revision at or after the cutoff that already has a SRC_CATEGORIZE recent change,
 * and then processes every later revision in ascending (rev_timestamp, rev_id) order.
 *
 * @return bool False (with the last error set) if the page cannot be found, the replica
 *  DB does not catch up in time, or the lock cannot be acquired; true otherwise.
 */
public function run() {
	// The service container must be fetched before any of the lookups below use it.
	$services = MediaWikiServices::getInstance();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$lb = $lbFactory->getMainLB();
	$dbw = $lb->getConnection( DB_PRIMARY );

	$this->ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	$page = $services->getWikiPageFactory()->newFromID( $this->params['pageId'], IDBAccessObject::READ_LATEST );
	if ( !$page ) {
		$this->setLastError( "Could not find page #{$this->params['pageId']}" );
		return false; // deleted?
	}

	// Cut down on the time spent in waitForPrimaryPos() in the critical section
	$dbr = $lb->getConnection( DB_REPLICA );
	if ( !$lb->waitForPrimaryPos( $dbr ) ) {
		$this->setLastError( "Timed out while pre-waiting for replica DB to catch up" );
		return false;
	}

	// Use a named lock so that jobs for this page see each others' changes
	$lockKey = "{$dbw->getDomainID()}:CategoryMembershipChange:{$page->getId()}"; // per-wiki
	$scopedLock = $dbw->getScopedLockAndFlush( $lockKey, __METHOD__, 1 );
	if ( !$scopedLock ) {
		$this->setLastError( "Could not acquire lock '$lockKey'" );
		return false;
	}

	// Wait till replica DB is caught up so that jobs for this page see each others' changes
	if ( !$lb->waitForPrimaryPos( $dbr ) ) {
		$this->setLastError( "Timed out while waiting for replica DB to catch up" );
		return false;
	}
	// Clear any stale REPEATABLE-READ snapshot
	$dbr->flushSnapshot( __METHOD__ );

	$cutoffUnix = wfTimestamp( TS::UNIX, $this->params['revTimestamp'] );
	// Using ENQUEUE_FUDGE_SEC handles jobs inserted out of revision order due to the delay
	// between COMMIT and actual enqueueing of the CategoryMembershipChangeJob job.
	$cutoffUnix -= self::ENQUEUE_FUDGE_SEC;

	// Get the newest page revision that has a SRC_CATEGORIZE row.
	// Assume that category changes before it were already handled.
	$subQuery = $dbr->newSelectQueryBuilder()
		->select( '1' )
		->from( 'recentchanges' )
		->where( 'rc_this_oldid = rev_id' )
		->andWhere( [ 'rc_source' => RecentChange::SRC_CATEGORIZE ] );
	$row = $dbr->newSelectQueryBuilder()
		->select( [ 'rev_timestamp', 'rev_id' ] )
		->from( 'revision' )
		->where( [ 'rev_page' => $page->getId() ] )
		->andWhere( $dbr->expr( 'rev_timestamp', '>=', $dbr->timestamp( $cutoffUnix ) ) )
		->andWhere( new RawSQLExpression( 'EXISTS (' . $subQuery->getSQL() . ')' ) )
		->orderBy( [ 'rev_timestamp', 'rev_id' ], SelectQueryBuilder::SORT_DESC )
		->caller( __METHOD__ )->fetchRow();

	// Only consider revisions newer than any such revision
	if ( $row ) {
		$cutoffUnix = wfTimestamp( TS::UNIX, $row->rev_timestamp );
		$lastRevId = (int)$row->rev_id;
	} else {
		$lastRevId = 0;
	}

	// Find revisions to this page made around and after this revision which lack category
	// notifications in recent changes. This lets jobs pick up where the last one left off.
	$revisionStore = $services->getRevisionStore();
	$res = $revisionStore->newSelectQueryBuilder( $dbr )
		->joinComment()
		->where( [
			'rev_page' => $page->getId(),
			$dbr->buildComparison( '>', [
				'rev_timestamp' => $dbr->timestamp( $cutoffUnix ),
				'rev_id' => $lastRevId,
			] )
		] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ], SelectQueryBuilder::SORT_ASC )
		->caller( __METHOD__ )->fetchResultSet();

	// Apply all category updates in revision timestamp order
	foreach ( $res as $row ) {
		$this->notifyUpdatesForRevision( $lbFactory, $page, $revisionStore->newRevisionFromRow( $row ) );
	}

	return true;
}
179
/**
 * Emit category added/removed notifications for one revision of the page.
 *
 * Nothing is emitted when the revision's text is deleted, when its parent revision is
 * missing or has deleted text, or when the category sets of the two revisions are equal.
 *
 * @param LBFactory $lbFactory Used to commit and wait for replication between batches
 * @param WikiPage $page Page the revision belongs to
 * @param RevisionRecord $newRev Revision whose category changes should be announced
 */
protected function notifyUpdatesForRevision(
	LBFactory $lbFactory, WikiPage $page, RevisionRecord $newRev
) {
	$title = $page->getTitle();

	// Get the new revision
	if ( $newRev->isDeleted( RevisionRecord::DELETED_TEXT ) ) {
		return;
	}

	$services = MediaWikiServices::getInstance();
	// Get the prior revision (the same for null edits)
	if ( $newRev->getParentId() ) {
		$oldRev = $services->getRevisionLookup()
			->getRevisionById( $newRev->getParentId(), IDBAccessObject::READ_LATEST );
		if ( !$oldRev || $oldRev->isDeleted( RevisionRecord::DELETED_TEXT ) ) {
			return;
		}
	} else {
		$oldRev = null;
	}

	// Parse the new revision and get the categories
	$categoryChanges = $this->getExplicitCategoriesChanges( $page, $newRev, $oldRev );
	[ $categoryInserts, $categoryDeletes ] = $categoryChanges;
	if ( !$categoryInserts && !$categoryDeletes ) {
		return; // nothing to do
	}

	$blc = $services->getBacklinkCacheFactory()->getBacklinkCache( $title );
	$catMembChange = new CategoryMembershipChange(
		$title,
		$blc,
		$newRev,
		$this->recentChangeFactory,
		$this->params['forImport'] ?? false
	);
	$catMembChange->checkTemplateLinks();

	$batchSize = $services->getMainConfig()->get( MainConfigNames::UpdateRowsPerQuery );
	// One counter shared by both loops, advanced exactly once per notification, so a
	// commit happens after every $batchSize notifications regardless of their kind.
	$notificationCount = 0;

	foreach ( $categoryInserts as $categoryName ) {
		$categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName );
		$catMembChange->triggerCategoryAddedNotification( $categoryTitle );
		if ( ++$notificationCount % $batchSize === 0 ) {
			$lbFactory->commitAndWaitForReplication( __METHOD__, $this->ticket );
		}
	}

	foreach ( $categoryDeletes as $categoryName ) {
		$categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName );
		$catMembChange->triggerCategoryRemovedNotification( $categoryTitle );
		if ( ++$notificationCount % $batchSize === 0 ) {
			$lbFactory->commitAndWaitForReplication( __METHOD__, $this->ticket );
		}
	}
}
243
/**
 * Work out which categories were added and which were removed between two revisions.
 *
 * Both revisions are parsed with the new revision's timestamp so that time-based parser
 * functions do not show up as category changes. Link tables are deliberately not used,
 * because these updates are not strictly FIFO and link tables may be out of date.
 *
 * @param WikiPage $page Page both revisions belong to
 * @param RevisionRecord $newRev Revision after the change
 * @param RevisionRecord|null $oldRev Revision before the change; null is treated as
 *  having no categories
 * @return array Two lists: [ category names added, category names removed ]
 */
private function getExplicitCategoriesChanges(
	WikiPage $page, RevisionRecord $newRev, ?RevisionRecord $oldRev = null
): array {
	$parseTimestamp = $newRev->getTimestamp();

	// The old revision is parsed first, then the new one.
	$categoriesBefore = [];
	if ( $oldRev ) {
		$categoriesBefore = $this->getCategoriesAtRev( $page, $oldRev, $parseTimestamp );
	}
	$categoriesAfter = $this->getCategoriesAtRev( $page, $newRev, $parseTimestamp );

	return [
		array_values( array_diff( $categoriesAfter, $categoriesBefore ) ),
		array_values( array_diff( $categoriesBefore, $categoriesAfter ) ),
	];
}
265
/**
 * Get the category names produced by rendering a revision.
 *
 * The parser cache is consulted only when the revision is a RevisionStoreRecord that is
 * current; the cached output is used only if its cached revision ID matches the
 * revision. Otherwise the revision is rendered through the revision renderer.
 *
 * @param WikiPage $page Page used to build parser options and to query the parser cache
 * @param RevisionRecord $rev Revision to inspect
 * @param mixed $parseTimestamp Timestamp set on the parser options
 * @return mixed Result of getCategoryNames() on the parser output
 */
private function getCategoriesAtRev( WikiPage $page, RevisionRecord $rev, $parseTimestamp ) {
	$services = MediaWikiServices::getInstance();

	$options = $page->makeParserOptions( 'canonical' );
	$options->setTimestamp( $parseTimestamp );
	$options->setRenderReason( 'CategoryMembershipChangeJob' );

	$output = null;
	if ( $rev instanceof RevisionStoreRecord && $rev->isCurrent() ) {
		$output = $services->getParserCache()->get( $page, $options );
	}

	$cachedOutputMatches = $output && $output->getCacheRevisionId() === $rev->getId();
	if ( !$cachedOutputMatches ) {
		$renderedRevision = $services->getRevisionRenderer()->getRenderedRevision( $rev, $options );
		$output = $renderedRevision->getRevisionParserOutput();
	}

	// Array keys would cast numeric category names to ints; getCategoryNames() takes
	// care to cast them back to strings so nothing breaks.
	return $output->getCategoryNames();
}
293
/**
 * Get the de-duplication info with the 'revTimestamp' parameter removed, so jobs that
 * differ only in revision timestamp count as duplicates (the first job wins).
 *
 * @return array Parent de-duplication info without $info['params']['revTimestamp']
 */
public function getDeduplicationInfo() {
	$dedupInfo = parent::getDeduplicationInfo();

	// first job wins
	unset( $dedupInfo['params']['revTimestamp'] );

	return $dedupInfo;
}
301}
302
// Register this class under the additional name 'CategoryMembershipChangeJob'.
// NOTE(review): presumably a backward-compatibility alias for the old un-namespaced class
// name; the namespace declaration is outside this view — confirm.
304class_alias( CategoryMembershipChangeJob::class, 'CategoryMembershipChangeJob' );
const NS_CATEGORY
Definition Defines.php:65
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28
makeTitle( $linkId)
Convert a link ID to a Title.to override Title
Job queue task description base code.
Describe and execute a background job.
Definition Job.php:28
array $params
Array of job parameters.
Definition Job.php:33
setLastError( $error)
Definition Job.php:425
Job to add recent change entries mentioning category membership changes.
static newSpec(PageIdentity $page, $revisionTimestamp, bool $forImport)
__construct(PageIdentity $page, array $params, RecentChangeFactory $recentChangeFactory)
Constructor for use by the Job Queue infrastructure.
run()
Run the job. If this method returns false or completes exceptionally, the job runner will retry execut...
notifyUpdatesForRevision(LBFactory $lbFactory, WikiPage $page, RevisionRecord $newRev)
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work. The resulting map conveys eve...
A class containing constants representing the names of configuration variables.
const UpdateRowsPerQuery
Name constant for the UpdateRowsPerQuery setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Base representation for an editable wiki page.
Definition WikiPage.php:82
getTitle()
Get the title object of the article.
Definition WikiPage.php:250
Helper class for category membership changes.
Utility class for creating and reading rows in the recentchanges table.
Page revision base class.
getParentId( $wikiId=self::LOCAL)
Get parent revision ID (the original previous page revision).
isDeleted( $field)
MCR migration note: this replaced Revision::isDeleted.
A RevisionRecord representing an existing revision persisted in the revision table.
Represents a title within MediaWiki.
Definition Title.php:69
commitAndWaitForReplication( $fname, $ticket, array $opts=[])
Commit primary DB transactions and wait for replication (if $ticket indicates it is safe)....
Raw SQL expression to be used in query builders.
Build SELECT queries with a fluent interface.
Interface for objects (potentially) representing an editable wiki page.
getId( $wikiId=self::LOCAL)
Returns the page ID.
Interface for database access objects.