MediaWiki 1.40.4
RefreshLinksJob.php
Go to the documentation of this file.
1<?php
21use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
31
57class RefreshLinksJob extends Job {
59 private const NORMAL_MAX_LAG = 10;
61 private const LAG_WAIT_TIMEOUT = 15;
62
/**
 * Create a refreshLinks job.
 *
 * @param PageIdentity $page Page the job is attached to
 * @param array $params Job parameters. Keys read here: 'pages' and 'range'.
 *   'causeAction' and 'causeAgent' are given defaults when absent.
 * @throws PageAssertionException If no 'pages' list is given and $page cannot exist
 */
public function __construct( PageIdentity $page, array $params ) {
	// Without an explicit page list the job works on $page itself, so $page has to be
	// a proper page (BC with the Title class).
	$hasPageList = !empty( $params['pages'] );
	if ( !$hasPageList && !$page->canExist() ) {
		throw new PageAssertionException(
			'The given PageIdentity {pageIdentity} does not represent a proper page',
			[ 'pageIdentity' => $page ]
		);
	}

	parent::__construct( 'refreshLinks', $page, $params );

	// Skip the overhead of de-duplication when a match would be unlikely anyway:
	// ranges rarely line up, and jobs covering several pages rarely match each other.
	$this->removeDuplicates = !(
		isset( $params['range'] ) ||
		( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
	);

	// Default the cause fields; values supplied by the caller are kept.
	$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];

	// Tell JobRunner not to wrap run() in a transaction round automatically.
	// Each runForTitle() call manages its own rounds in order to run DataUpdates
	// and to avoid contention.
	$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
}
86
/**
 * Build a job whose command is 'refreshLinksPrioritized' instead of 'refreshLinks'.
 *
 * @param PageIdentity $page
 * @param array $params Same parameters as the constructor
 * @return RefreshLinksJob
 */
public static function newPrioritized( PageIdentity $page, array $params ) {
	$prioritizedJob = new self( $page, $params );
	// Only the command name differs from a regular refreshLinks job
	$prioritizedJob->command = 'refreshLinksPrioritized';

	return $prioritizedJob;
}
98
/**
 * Build a job whose command is 'refreshLinksDynamic' instead of 'refreshLinks'.
 *
 * @param PageIdentity $page
 * @param array $params Same parameters as the constructor
 * @return RefreshLinksJob
 */
public static function newDynamic( PageIdentity $page, array $params ) {
	$dynamicJob = new self( $page, $params );
	// Only the command name differs from a regular refreshLinks job
	$dynamicJob->command = 'refreshLinksDynamic';

	return $dynamicJob;
}
110
/**
 * Run the job.
 *
 * The job parameters select one of three modes:
 *  - 'recursive': partition the backlinks of the job's page into per-title jobs
 *    (plus possibly one follow-up recursive job) and push them to the queue.
 *  - 'pages': refresh the link tables of each listed (namespace, DB key) pair.
 *  - neither: refresh the link tables of the job's own title.
 *
 * @return bool False if any page failed to refresh (or a listed title was
 *   invalid), true otherwise
 */
public function run() {
	$ok = true;

	if ( !empty( $this->params['recursive'] ) ) {
		// Job to update all (or a range of) backlink pages for a page

		// When the base job branches, wait for the replica DBs to catch up to the primary.
		// From then on, we know that any template changes at the time the base job was
		// enqueued will be reflected in backlink page parses when the leaf jobs run.
		$services = MediaWikiServices::getInstance();
		if ( !isset( $this->params['range'] ) ) {
			$lbFactory = $services->getDBLoadBalancerFactory();
			if ( !$lbFactory->waitForReplication( [
				'domain' => $lbFactory->getLocalDomainID(),
				'timeout' => self::LAG_WAIT_TIMEOUT
			] ) ) {
				// only try so hard, keep going with what we have
				$stats = $services->getStatsdDataFactory();
				$stats->increment( 'refreshlinks_warning.lag_wait_failed' );
			}
		}
		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];
		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		$services->getJobQueueGroup()->push( $jobs );

	} elseif ( isset( $this->params['pages'] ) ) {
		// Job to update link tables for a set of titles
		foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			if ( $title && $title->canExist() ) {
				// Keep processing the remaining pages even after a failure;
				// the job as a whole is still reported as failed.
				$ok = $this->runForTitle( $title ) && $ok;
			} else {
				$ok = false;
				$this->setLastError( "Invalid title ($ns,$dbKey)." );
			}
		}

	} else {
		// Job to update link tables for a given title
		$ok = $this->runForTitle( $this->title );
	}

	return $ok;
}
167
/**
 * Refresh the link tables (and other secondary data) of a single page.
 *
 * @param PageIdentity $pageIdentity Page to refresh
 * @return bool True if the page was refreshed or the job was superseded;
 *   false if the page is missing, the page lock could not be acquired, or no
 *   parser output could be obtained (the inline comments treat these as
 *   "retry later" cases)
 */
protected function runForTitle( PageIdentity $pageIdentity ) {
	$services = MediaWikiServices::getInstance();
	$stats = $services->getStatsdDataFactory();
	$renderer = $services->getRevisionRenderer();
	$parserCache = $services->getParserCache();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the primary DB
	$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
	$page->loadPageData( WikiPage::READ_LATEST );

	if ( !$page->exists() ) {
		// Probably due to concurrent deletion or renaming of the page
		$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
		$logger->warning(
			'The page does not exist. Perhaps it was deleted?',
			[
				// Log the page actually being processed. For multi-page jobs
				// $this->title is the job's own title, not the missing page.
				'page_title' => $page->getTitle()->getPrefixedDBkey(),
				'job_params' => $this->getParams(),
				'job_metadata' => $this->getMetadata()
			]
		);
		$stats->increment( 'refreshlinks_outcome.bad_page_not_found' );

		// retry later to handle unlucky race condition
		return false;
	}

	// Serialize link update jobs by page ID so they see each others' changes.
	// The page ID and latest revision ID will be queried again after the lock
	// is acquired to bail if they are changed from that of loadPageData() above.
	$dbw = $lbFactory->getPrimaryDatabase();
	// $scopedLock is never read; it is only kept in scope for the rest of this
	// method (presumably the lock is released when it goes out of scope).
	$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
	if ( $scopedLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$stats->increment( 'refreshlinks_outcome.bad_lock_failure' );

		// retry later when overlapping job for previous rev is done
		return false;
	}

	if ( $this->isAlreadyRefreshed( $page ) ) {
		// this job has been superseded, e.g. by overlapping recursive job
		// for a different template edit, or by direct edit or purge.
		$stats->increment( 'refreshlinks_outcome.good_update_superseded' );
		// treat as success
		return true;
	}

	// These can be fairly long-running jobs, while commitAndWaitForReplication
	// releases primary snapshots, let the replica release their snapshot as well
	$lbFactory->flushReplicaSnapshots( __METHOD__ );
	// Parse during a fresh transaction round for better read consistency
	$lbFactory->beginPrimaryChanges( __METHOD__ );
	$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
	$options = $this->getDataUpdateOptions();
	$lbFactory->commitPrimaryChanges( __METHOD__ );

	if ( !$output ) {
		// probably raced out.
		// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
		// FIXME: Why do we retry this? Can this be a cancellation?
		return false;
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$options['known-revision-output'] = $output;
	// Execute corresponding DataUpdates immediately
	$page->doSecondaryDataUpdates( $options );
	InfoAction::invalidateCache( $page );

	// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
	// This means the page will have to be rendered on-the-fly when it is next viewed.
	// This is to avoid spending limited ParserCache capacity on rarely visited pages.
	// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
	// for pages that are likely to benefit (T327162).

	// Commit any writes here in case this method is called in a loop.
	// In that case, the scoped lock will fail to be acquired.
	$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

	return true;
}
259
/**
 * Get the root job timestamp, padded for lag unless the job is opportunistic.
 *
 * @return string|null The 'rootJobTimestamp' parameter as given (opportunistic
 *   jobs), that timestamp plus NORMAL_MAX_LAG seconds in TS_MW format (all
 *   other jobs), or null if the parameter is not set
 */
private function getLagAwareRootTimestamp() {
	// Timestamp of the change that triggered this job
	$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;

	// No timestamp at all: nothing to adjust.
	// Opportunistic updates: neither clock skew nor DB snapshot/replica DB lag matter
	// much; focus on reusing the (often recently updated) cache.
	if ( $rootTimestamp === null || !empty( $this->params['isOpportunistic'] ) ) {
		return $rootTimestamp;
	}

	// For transclusion updates, the template changes must be reflected,
	// so push the threshold forward by the lag allowance.
	$rootUnixTime = (int)wfTimestamp( TS_UNIX, $rootTimestamp );

	return wfTimestamp( TS_MW, $rootUnixTime + self::NORMAL_MAX_LAG );
}
284
/**
 * Check whether the page's links timestamp is already newer than this job's
 * lag-aware root timestamp.
 *
 * @param WikiPage $page
 * @return bool Always false when the job has no root timestamp
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	$threshold = $this->getLagAwareRootTimestamp();
	if ( $threshold === null ) {
		// Nothing to compare against
		return false;
	}

	return $page->getLinksTimestamp() > $threshold;
}
294
/**
 * Get the parser output for the page's current revision, from the parser
 * cache when usable and otherwise by rendering the revision.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded by the caller
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null if the current revision could not be used
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$currentRev = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$currentRev ) {
		// race condition?
		return null;
	}

	// Prefer a cached rendering when one is usable for this job
	$fromCache = $this->getParserOutputFromCache( $parserCache, $page, $currentRev, $stats );
	if ( $fromCache ) {
		return $fromCache;
	}

	// Nothing usable in the cache: render the revision
	$parserOptions = $page->makeParserOptions( 'canonical' );
	$hints = [
		'audience' => $currentRev::RAW,
		'causeAction' => $this->params['causeAction'] ?? 'RefreshLinksJob'
	];
	$rendered = $renderer->getRenderedRevision( $currentRev, $parserOptions, null, $hints );

	// Timestamp that parsing started
	$parseStart = wfTimestampNow();
	$freshOutput = $rendered->getRevisionParserOutput( [ 'generate-html' => false ] );
	// notify LinksUpdate::doUpdate()
	$freshOutput->setCacheTime( $parseStart );

	return $freshOutput;
}
335
/**
 * Get the page's current revision, provided it is still the latest one and
 * matches the revision (if any) that triggered this job.
 *
 * On failure an outcome metric is incremented and the job's last error is set.
 *
 * @param WikiPage $page Page loaded with READ_LATEST by the caller
 * @param StatsdDataFactoryInterface $stats
 * @return RevisionRecord|null Null if the job is obsolete or the revision is gone
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$title = $page->getTitle();
	// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
	// This is used to detect edits/moves after loadPageData() but before the scope lock.
	// This works around the chicken/egg problem of determining the scope lock key name.
	$latestRevId = $title->getLatestRevID( Title::READ_LATEST );

	$triggerRevId = $this->params['triggeringRevisionId'] ?? null;
	if ( $triggerRevId && $triggerRevId !== $latestRevId ) {
		// This job is obsolete and one for the latest revision will handle updates
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision $triggerRevId is not current" );

		return null;
	}

	// Load the current revision. Note that $page should have loaded with READ_LATEST.
	// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
	$currentRev = $page->getRevisionRecord();
	if ( !$currentRev ) {
		// revision just got deleted?
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

		return null;
	}

	if ( $currentRev->getId() !== $latestRevId || $currentRev->getPageId() !== $page->getId() ) {
		// Do not clobber newer updates with older ones. If all jobs were FIFO and
		// serialized, it would be OK to update links based on older revisions since it
		// would eventually get to the latest. Since that is not the case (by design),
		// only update the link tables to a state matching the current revision's output.
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision {$currentRev->getId()} is not current" );

		return null;
	}

	return $currentRev;
}
383
/**
 * Look for a parser cache entry that can stand in for a fresh parse.
 *
 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
 * according to the result.
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision Current revision of $page
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null if there is no usable cache entry
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsdDataFactoryInterface $stats
) {
	$usableOutput = null;

	// If page_touched changed after this root job, then it is likely that
	// any views of the pages already resulted in re-parses which are now in
	// cache. The cache can be reused to avoid expensive parsing in some cases.
	$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
	$worthChecking = $rootTimestamp !== null && (
		$page->getTouched() >= $rootTimestamp ||
		!empty( $this->params['isOpportunistic'] )
	);

	if ( $worthChecking ) {
		// Cache is suspected to be up-to-date so it's worth the I/O of checking.
		// As long as the cache rev ID matches the current rev ID and it reflects
		// the job's triggering change, then it is usable.
		$candidate = $parserCache->getDirty( $page, $page->makeParserOptions( 'canonical' ) );
		if (
			$candidate &&
			$candidate->getCacheRevisionId() == $currentRevision->getId() &&
			$candidate->getCacheTime() >= $this->getLagAwareRootTimestamp()
		) {
			$usableOutput = $candidate;
		}
	}

	$stats->increment(
		$usableOutput ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached'
	);

	return $usableOutput;
}
430
/**
 * Build the options array handed to WikiPage::doSecondaryDataUpdates().
 *
 * @return array Has keys 'recursive', 'causeAction' and 'causeAgent', plus
 *   'triggeringUser' when the job has a non-empty 'triggeringUser' parameter
 */
private function getDataUpdateOptions() {
	$options = [
		'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
		// Carry over cause so the update can do extra logging
		'causeAction' => $this->params['causeAction'],
		'causeAgent' => $this->params['causeAgent']
	];

	if ( empty( $this->params['triggeringUser'] ) ) {
		return $options;
	}

	$userInfo = $this->params['triggeringUser'];
	// Look the user up by ID when there is one; anonymous users are identified by name
	$options['triggeringUser'] = $userInfo['userId']
		? User::newFromId( $userInfo['userId'] )
		: User::newFromName( $userInfo['userName'], false );

	return $options;
}
453
/**
 * Get the de-duplication info for this job, stripped of fields that would
 * stop otherwise equivalent jobs from matching.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$info = parent::getDeduplicationInfo();

	// NOTE(review): the constructor stores the cause fields in $this->params, yet they
	// are removed from the top level here. Confirm where the parent places them.
	unset( $info['causeAction'], $info['causeAgent'] );

	// For per-pages jobs, the job title is that of the template that changed
	// (or similar), so remove that since it ruins duplicate detection
	if ( is_array( $info['params'] ) && isset( $info['params']['pages'] ) ) {
		unset( $info['namespace'], $info['title'] );
	}

	return $info;
}
469
/**
 * Number of pages this job refreshes itself.
 *
 * @return int 0 for recursive jobs, the size of the 'pages' list for
 *   multi-page jobs, otherwise 1
 */
public function workItemCount() {
	if ( !empty( $this->params['recursive'] ) ) {
		// nothing actually refreshed
		return 0;
	}

	// Either every listed page, or just the one title
	return isset( $this->params['pages'] ) ? count( $this->params['pages'] ) : 1;
}
479}
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers the remaining backlink range (if needed).
Class to both describe a background job and handle jobs.
Definition Job.php:39
getRootJobParams()
Definition Job.php:322
setLastError( $error)
Definition Job.php:429
Class that manages updates of *_link tables as well as similar extension-managed tables.
PSR-3 logger instance factory.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
Page revision base class.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Represents a title within MediaWiki.
Definition Title.php:82
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for rerendered wiki pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
static newFromName( $name, $validate='valid')
Definition User.php:592
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition User.php:626
Base representation for an editable wiki page.
Definition WikiPage.php:75
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:744
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL)
Definition WikiPage.php:588
getTitle()
Get the title object of the article.
Definition WikiPage.php:318
doSecondaryDataUpdates(array $options=[])
Do secondary data updates (such as updating link tables).
loadPageData( $from='fromdb')
Load the object from a given source by title.
Definition WikiPage.php:474
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:820
getTouched()
Get the page_touched field.
Definition WikiPage.php:722
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editable page on the wiki.
if(count( $args)< 1) $job