MediaWiki master
RefreshLinksJob.php
Go to the documentation of this file.
1<?php
33
105class RefreshLinksJob extends Job {
107 private const NORMAL_MAX_LAG = 10;
109 private const LAG_WAIT_TIMEOUT = 15;
110
111 public function __construct( PageIdentity $page, array $params ) {
112 if ( empty( $params['pages'] ) && !$page->canExist() ) {
113 // BC with the Title class
114 throw new PageAssertionException(
115 'The given PageIdentity {pageIdentity} does not represent a proper page',
116 [ 'pageIdentity' => $page ]
117 );
118 }
119
120 parent::__construct( 'refreshLinks', $page, $params );
121 // Avoid the overhead of de-duplication when it would be pointless
122 $this->removeDuplicates = (
123 // Ranges rarely will line up
124 !isset( $params['range'] ) &&
125 // Multiple pages per job make matches unlikely
126 !( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
127 );
128 $this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
129 // Tell JobRunner to not automatically wrap run() in a transaction round.
130 // Each runForTitle() call will manage its own rounds in order to run DataUpdates
131 // and to avoid contention as well.
132 $this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
133 }
134
140 public static function newPrioritized( PageIdentity $page, array $params ) {
141 $job = new self( $page, $params );
142 $job->command = 'refreshLinksPrioritized';
143
144 return $job;
145 }
146
152 public static function newDynamic( PageIdentity $page, array $params ) {
153 $job = new self( $page, $params );
154 $job->command = 'refreshLinksDynamic';
155
156 return $job;
157 }
158
159 public function run() {
160 $ok = true;
161
162 if ( !empty( $this->params['recursive'] ) ) {
163 // Job to update all (or a range of) backlink pages for a page
164
165 // When the base job branches, wait for the replica DBs to catch up to the primary.
166 // From then on, we know that any template changes at the time the base job was
167 // enqueued will be reflected in backlink page parses when the leaf jobs run.
168 $services = MediaWikiServices::getInstance();
169 if ( !isset( $this->params['range'] ) ) {
170 $lbFactory = $services->getDBLoadBalancerFactory();
171 if ( !$lbFactory->waitForReplication( [
172 'timeout' => self::LAG_WAIT_TIMEOUT
173 ] ) ) {
174 // only try so hard, keep going with what we have
175 $stats = $services->getStatsFactory();
176 $stats->getCounter( 'refreshlinks_warnings_total' )
177 ->setLabel( 'reason', 'lag_wait_failed' )
178 ->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
179 ->increment();
180 }
181 }
182 // Carry over information for de-duplication
183 $extraParams = $this->getRootJobParams();
184 $extraParams['triggeredRecursive'] = true;
185 // Carry over cause information for logging
186 $extraParams['causeAction'] = $this->params['causeAction'];
187 $extraParams['causeAgent'] = $this->params['causeAgent'];
188 // Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
189 // jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
191 $this,
192 $services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
193 1, // job-per-title
194 [ 'params' => $extraParams ]
195 );
196 $services->getJobQueueGroup()->push( $jobs );
197
198 } elseif ( isset( $this->params['pages'] ) ) {
199 // Job to update link tables for a set of titles
200 foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
201 $title = Title::makeTitleSafe( $ns, $dbKey );
202 if ( $title && $title->canExist() ) {
203 $ok = $this->runForTitle( $title ) && $ok;
204 } else {
205 $ok = false;
206 $this->setLastError( "Invalid title ($ns,$dbKey)." );
207 }
208 }
209
210 } else {
211 // Job to update link tables for a given title
212 $ok = $this->runForTitle( $this->title );
213 }
214
215 return $ok;
216 }
217
	/**
	 * Update the link tables of a single page from the canonical parser
	 * output of its current revision.
	 *
	 * Flow: load the page from the primary DB, serialize with other link
	 * updates via a per-page-ID lock, bail out when a newer update already
	 * covered this job, then parse (or reuse a cached parse) inside its own
	 * transaction round and run the secondary data updates.
	 *
	 * @param PageIdentity $pageIdentity
	 * @return bool Success; false asks the job queue to retry later
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$this->incrementFailureCounter( $stats, 'page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update job by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		// Serialize links updates by page ID so they see each others' changes
		$dbw = $lbFactory->getPrimaryDatabase();
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$this->incrementFailureCounter( $stats, 'lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by overlapping recursive job
			// for a different template edit, or by direct edit or purge.
			$stats->getCounter( 'refreshlinks_superseded_updates_total' )
				->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
				->increment();
			// treat as success
			return true;
		}

		// These can be fairly long-running jobs, while commitAndWaitForReplication
		// releases primary snapshots, let the replica release their snapshot as well
		$lbFactory->flushReplicaSnapshots( __METHOD__ );
		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// FIXME: Why do we retry this? Can this be a cancellation?
			return false;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}
311
315 private function getLagAwareRootTimestamp() {
316 // Get the timestamp of the change that triggered this job
317 $rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
318 if ( $rootTimestamp === null ) {
319 return null;
320 }
321
322 if ( !empty( $this->params['isOpportunistic'] ) ) {
323 // Neither clock skew nor DB snapshot/replica DB lag matter much for
324 // such updates; focus on reusing the (often recently updated) cache
325 $lagAwareTimestamp = $rootTimestamp;
326 } else {
327 // For transclusion updates, the template changes must be reflected
328 $lagAwareTimestamp = wfTimestamp(
329 TS_MW,
330 (int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
331 );
332 }
333
334 return $lagAwareTimestamp;
335 }
336
341 private function isAlreadyRefreshed( WikiPage $page ) {
342 $lagAwareTimestamp = $this->getLagAwareRootTimestamp();
343
344 return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
345 }
346
352 private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
353 $services = MediaWikiServices::getInstance();
354 foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
355 $slot = $revision->getSlots()->getSlot( $role );
356 $contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
357 if ( $contentHandler->generateHTMLOnEdit() ) {
358 return true;
359 }
360 }
361 return false;
362 }
363
373 private function getParserOutput(
374 RevisionRenderer $renderer,
375 ParserCache $parserCache,
376 WikiPage $page,
377 StatsFactory $stats
378 ) {
379 $revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
380 if ( !$revision ) {
381 // race condition?
382 return null;
383 }
384
385 $cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
386 if ( $cachedOutput ) {
387 return $cachedOutput;
388 }
389
390 $causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
391 $renderedRevision = $renderer->getRenderedRevision(
392 $revision,
393 $page->makeParserOptions( 'canonical' ),
394 null,
395 [ 'audience' => $revision::RAW, 'causeAction' => $causeAction ]
396 );
397
398 $parseTimestamp = wfTimestampNow(); // timestamp that parsing started
399 $output = $renderedRevision->getRevisionParserOutput( [
400 // To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
401 'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
402 ] );
403 $output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()
404
405 return $output;
406 }
407
415 private function getCurrentRevisionIfUnchanged(
416 WikiPage $page,
417 StatsFactory $stats
418 ) {
419 $title = $page->getTitle();
420 // Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
421 // This is used to detect edits/moves after loadPageData() but before the scope lock.
422 // The works around the chicken/egg problem of determining the scope lock key name
423 $latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );
424
425 $triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
426 if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
427 // This job is obsolete and one for the latest revision will handle updates
428 $this->incrementFailureCounter( $stats, 'rev_not_current' );
429 $this->setLastError( "Revision $triggeringRevisionId is not current" );
430 return null;
431 }
432
433 // Load the current revision. Note that $page should have loaded with READ_LATEST.
434 // This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
435 $revision = $page->getRevisionRecord();
436 if ( !$revision ) {
437 // revision just got deleted?
438 $this->incrementFailureCounter( $stats, 'rev_not_found' );
439 $this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
440 return null;
441
442 } elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
443 // Do not clobber over newer updates with older ones. If all jobs where FIFO and
444 // serialized, it would be OK to update links based on older revisions since it
445 // would eventually get to the latest. Since that is not the case (by design),
446 // only update the link tables to a state matching the current revision's output.
447 $this->incrementFailureCounter( $stats, 'rev_not_current' );
448 $this->setLastError( "Revision {$revision->getId()} is not current" );
449
450 return null;
451 }
452
453 return $revision;
454 }
455
465 private function getParserOutputFromCache(
466 ParserCache $parserCache,
467 WikiPage $page,
468 RevisionRecord $currentRevision,
469 StatsFactory $stats
470 ) {
471 $cachedOutput = null;
472 // If page_touched changed after this root job, then it is likely that
473 // any views of the pages already resulted in re-parses which are now in
474 // cache. The cache can be reused to avoid expensive parsing in some cases.
475 $rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
476 if ( $rootTimestamp !== null ) {
477 $opportunistic = !empty( $this->params['isOpportunistic'] );
478 if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
479 // Cache is suspected to be up-to-date so it's worth the I/O of checking.
480 // As long as the cache rev ID matches the current rev ID and it reflects
481 // the job's triggering change, then it is usable.
482 $parserOptions = $page->makeParserOptions( 'canonical' );
483 $output = $parserCache->getDirty( $page, $parserOptions );
484 if (
485 $output &&
486 $output->getCacheRevisionId() == $currentRevision->getId() &&
487 $output->getCacheTime() >= $this->getLagAwareRootTimestamp()
488 ) {
489 $cachedOutput = $output;
490 }
491 }
492 }
493
494 if ( $cachedOutput ) {
495 $stats->getCounter( 'refreshlinks_parsercache_operations_total' )
496 ->setLabel( 'status', 'cache_hit' )
497 ->copyToStatsdAt( 'refreshlinks.parser_cached' )
498 ->increment();
499 } else {
500 $stats->getCounter( 'refreshlinks_parsercache_operations_total' )
501 ->setLabel( 'status', 'cache_miss' )
502 ->copyToStatsdAt( 'refreshlinks.parser_uncached' )
503 ->increment();
504 }
505
506 return $cachedOutput;
507 }
508
516 private function incrementFailureCounter( StatsFactory $stats, $reason ): void {
517 $stats->getCounter( 'refreshlinks_failures_total' )
518 ->setLabel( 'reason', $reason )
519 ->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" )
520 ->increment();
521 }
522
526 private function getDataUpdateOptions() {
527 $options = [
528 'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
529 // Carry over cause so the update can do extra logging
530 'causeAction' => $this->params['causeAction'],
531 'causeAgent' => $this->params['causeAgent']
532 ];
533 if ( !empty( $this->params['triggeringUser'] ) ) {
534 $userInfo = $this->params['triggeringUser'];
535 if ( $userInfo['userId'] ) {
536 $options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
537 } else {
538 // Anonymous, use the username
539 $options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
540 }
541 }
542
543 return $options;
544 }
545
546 public function getDeduplicationInfo() {
547 $info = parent::getDeduplicationInfo();
548 unset( $info['causeAction'] );
549 unset( $info['causeAgent'] );
550 if ( is_array( $info['params'] ) ) {
551 // For per-pages jobs, the job title is that of the template that changed
552 // (or similar), so remove that since it ruins duplicate detection
553 if ( isset( $info['params']['pages'] ) ) {
554 unset( $info['namespace'] );
555 unset( $info['title'] );
556 }
557 }
558
559 return $info;
560 }
561
562 public function workItemCount() {
563 if ( !empty( $this->params['recursive'] ) ) {
564 return 0; // nothing actually refreshed
565 } elseif ( isset( $this->params['pages'] ) ) {
566 return count( $this->params['pages'] );
567 }
568
569 return 1; // one title
570 }
571}
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
array $params
The job parameters.
setLastError( $error)
This is actually implemented in the Job class.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
Describe and execute a background job.
Definition Job.php:40
Title $title
Definition Job.php:51
getRootJobParams()
Definition Job.php:323
Class the manages updates of *_link tables as well as similar extension-managed tables.
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
ParserOutput is a rendering of a Content object or a message.
Page revision base class.
getPageId( $wikiId=self::LOCAL)
Get the page ID.
getSlots()
Returns the slots defined for this revision.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Represents a title within MediaWiki.
Definition Title.php:78
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1212
internal since 1.36
Definition User.php:93
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for rerendered wiki pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
Base representation for an editable wiki page.
Definition WikiPage.php:79
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:673
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL)
Definition WikiPage.php:537
getTitle()
Get the title object of the article.
Definition WikiPage.php:260
doSecondaryDataUpdates(array $options=[])
Do secondary data updates (such as updating link tables).
loadPageData( $from='fromdb')
Load the object from a given source by title.
Definition WikiPage.php:421
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:749
getTouched()
Get the page_touched field.
Definition WikiPage.php:651
StatsFactory Implementation.
getCounter(string $name)
Makes a new CounterMetric or fetches one from cache.
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editab...
if(count( $args)< 1) $job