MediaWiki master
RefreshLinksJob.php
<?php

class RefreshLinksJob extends Job {
	private const NORMAL_MAX_LAG = 10;
	private const LAG_WAIT_TIMEOUT = 15;

	public function __construct( PageIdentity $page, array $params ) {
		if ( empty( $params['pages'] ) && !$page->canExist() ) {
			// BC with the Title class
			throw new PageAssertionException(
				'The given PageIdentity {pageIdentity} does not represent a proper page',
				[ 'pageIdentity' => $page ]
			);
		}

		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}
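
	// A minimal usage sketch (not part of this file): callers typically obtain a
	// job via one of the factory methods above and push it to the queue. The
	// $page variable and the parameter values below are illustrative assumptions.
	//
	//     $job = RefreshLinksJob::newPrioritized( $page, [
	//         'causeAction' => 'page-edit',
	//         'causeAgent' => 'ExampleUser',
	//     ] );
	//     MediaWikiServices::getInstance()->getJobQueueGroup()->push( $job );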

	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsFactory();
					$stats->getCounter( 'refreshlinks_warnings_total' )
						->setLabel( 'reason', 'lag_wait_failed' )
						->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
						->increment();
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}
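
	// Sketch of the parameter shapes that run() distinguishes, inferred from the
	// branches above (the concrete values are illustrative assumptions, not real data):
	//
	//     [ 'recursive' => true, ... ]         // fan out over backlinks, optionally carrying 'range'
	//     [ 'pages' => [ [ 0, 'Some_page' ], [ 10, 'Some_template' ] ] ]  // explicit [ namespace, dbKey ] batch
	//     [ /* neither 'recursive' nor 'pages' */ ]  // refresh link tables for $this->title only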

	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$this->incrementFailureCounter( $stats, 'page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired, to bail if they changed from those seen by loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$this->incrementFailureCounter( $stats, 'lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by an overlapping recursive job
			// for a different template edit, or by a direct edit or purge.
			$stats->getCounter( 'refreshlinks_superseded_updates_total' )
				->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
				->increment();
			// treat as success
			return true;
		}

		// These can be fairly long-running jobs; while commitAndWaitForReplication
		// releases primary snapshots, let the replicas release their snapshots as well
		$lbFactory->flushReplicaSnapshots( __METHOD__ );
		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// FIXME: Why do we retry this? Can this be a cancellation?
			return false;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}
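
	// Outcome contract of runForTitle(), as implemented above: false means
	// "retry later" (missing page, busy page lock, or a failed revision check in
	// getCurrentRevisionIfUnchanged()/getParserOutput()), while true covers both
	// a completed update and a superseded, already-refreshed page.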

	private function getLagAwareRootTimestamp() {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return null;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			$lagAwareTimestamp = $rootTimestamp;
		} else {
			// For transclusion updates, the template changes must be reflected
			$lagAwareTimestamp = wfTimestamp(
				TS_MW,
				(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);
		}

		return $lagAwareTimestamp;
	}
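
	// Worked example with illustrative timestamps (not real data): for a
	// non-opportunistic job whose rootJobTimestamp is '20240101000000' (TS_MW),
	// this returns the root timestamp padded by NORMAL_MAX_LAG to absorb
	// replica lag and clock skew:
	//
	//     wfTimestamp( TS_MW, (int)wfTimestamp( TS_UNIX, '20240101000000' ) + 10 ) === '20240101000010'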

	private function isAlreadyRefreshed( WikiPage $page ) {
		$lagAwareTimestamp = $this->getLagAwareRootTimestamp();

		return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
	}

	private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
		$services = MediaWikiServices::getInstance();
		foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
			$slot = $revision->getSlots()->getSlot( $role );
			$contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
			if ( $contentHandler->generateHTMLOnEdit() ) {
				return true;
			}
		}
		return false;
	}

	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsFactory $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition?
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );

		if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
			$stats->getCounter( 'refreshlinks_parsercache_operations_total' )
				->setLabel( 'status', 'cache_hit' )
				->copyToStatsdAt( 'refreshlinks.parser_cached' )
				->increment();

			return $cachedOutput;
		}

		$statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' )
			->setLabel( 'status', 'cache_miss' )
			->copyToStatsdAt( 'refreshlinks.parser_uncached' );

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW, 'causeAction' => $causeAction ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		// Collect stats on parses that don't actually change the page content.
		// In that case, we could abort here, and perhaps we could also avoid
		// triggering CDN purges (T369898).
		if ( !$cachedOutput ) {
			// There was no cached output
			$statsCounter->setLabel( 'html_changed', 'unknown' );
		} elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
			// We have cached output, but we couldn't be sure that it was still good.
			// So we parsed again, but the result turned out to be the same HTML as
			// before.
			$statsCounter->setLabel( 'html_changed', 'no' );
		} else {
			// Re-parsing yielded HTML different from the cached output.
			$statsCounter->setLabel( 'html_changed', 'yes' );
		}

		$statsCounter->increment();

		return $output;
	}
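
	// Sketch of the counters emitted above, shown in a Prometheus-style notation
	// purely for illustration (label names and values are taken from this method;
	// the statsd copies are refreshlinks.parser_cached / refreshlinks.parser_uncached):
	//
	//     refreshlinks_parsercache_operations_total{status="cache_hit"}
	//     refreshlinks_parsercache_operations_total{status="cache_miss", html_changed="unknown|no|yes"}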

	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsFactory $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scoped lock.
		// This works around the chicken/egg problem of determining the scoped lock key name.
		$latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );
			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			// revision just got deleted?
			$this->incrementFailureCounter( $stats, 'rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return null;

		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsFactory $stats
	) {
		// If page_touched changed after this root job, then it is likely that
		// any views of the page already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date, so it's worth the I/O of checking.
				// We call canUseParserOutputFromCache() later to check if it's usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId()
				) {
					return $output;
				}
			}
		}

		return null;
	}

	private function canUseParserOutputFromCache(
		ParserOutput $cachedOutput,
		RevisionRecord $currentRevision
	) {
		// As long as the cached rev ID matches the current rev ID and the output
		// reflects the job's triggering change, it is usable.
		return $cachedOutput->getCacheRevisionId() == $currentRevision->getId()
			&& $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp();
	}
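
	// Illustrative reuse check with assumed values (not real data): a cached
	// entry with getCacheRevisionId() === 12345 and getCacheTime() === '20240101000030'
	// is reusable for current revision 12345 when the lag-aware root timestamp is
	// '20240101000010', because the cached parse postdates the triggering change.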

	private function incrementFailureCounter( StatsFactory $stats, $reason ): void {
		$stats->getCounter( 'refreshlinks_failures_total' )
			->setLabel( 'reason', $reason )
			->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" )
			->increment();
	}

	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}
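
	// Shape of the 'triggeringUser' parameter consumed above (illustrative values):
	// a registered user carries a non-zero userId, while an anonymous user carries
	// userId 0 plus an IP-style userName.
	//
	//     [ 'userId' => 12345, 'userName' => 'ExampleUser' ]
	//     [ 'userId' => 0, 'userName' => '192.0.2.1' ]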

	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-page jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
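
The following is an illustrative sketch (not part of the file above) of how workItemCount()
relates to the parameter shapes handled by run(); $page is assumed to be a PageIdentity for
a proper, existing page, and the titles are made up:

	$batch = new RefreshLinksJob( $page, [ 'pages' => [ [ 0, 'Foo' ], [ 0, 'Bar' ] ] ] );
	$single = new RefreshLinksJob( $page, [] );
	$batch->workItemCount();  // 2, one per [ namespace, dbKey ] pair
	$single->workItemCount(); // 1, the job's own title
	// A recursive base job would report 0, since it only partitions work for leaf jobs.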

Referenced symbols:

wfTimestampNow() - Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0 ) - Get a timestamp string in one of various formats.
array $params - The job parameters.
setLastError( $error ) - This is actually implemented in the Job class.
static partitionBacklinkJob( Job $job, $bSize, $cSize, $opts=[] ) - Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
getCacheRevisionId()
Describe and execute a background job. (Definition: Job.php:41)
Title $title (Definition: Job.php:52)
getRootJobParams() (Definition: Job.php:324)
Class that manages updates of *_link tables as well as similar extension-managed tables.
Update object handling the cleanup of secondary data after a page was edited.
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
ParserOutput is a rendering of a Content object or a message.
getRawText() - Get the cacheable text with <mw:editsection> markers still in it.
Page revision base class.
getPageId( $wikiId=self::LOCAL ) - Get the page ID.
getSlots() - Returns the slots defined for this revision.
getId( $wikiId=self::LOCAL ) - Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision( RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[] )
Represents a title within MediaWiki. (Definition: Title.php:79)
canExist() - Can this title represent a page in the wiki's database? (Definition: Title.php:1213)
internal since 1.36 (Definition: User.php:93)
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty( PageRecord $page, $popts ) - Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for rerendered wiki pages.
getDeduplicationInfo() - Subclasses may need to override this to make duplication detection work.
run() - Run the job.
runForTitle( PageIdentity $pageIdentity )
static newDynamic( PageIdentity $page, array $params )
static newPrioritized( PageIdentity $page, array $params )
__construct( PageIdentity $page, array $params )
Base representation for an editable wiki page. (Definition: WikiPage.php:81)
getLinksTimestamp() - Get the page_links_updated field. (Definition: WikiPage.php:673)
makeParserOptions( $context ) - Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL ) (Definition: WikiPage.php:531)
getTitle() - Get the title object of the article. (Definition: WikiPage.php:255)
doSecondaryDataUpdates( array $options=[] ) - Do secondary data updates (such as updating link tables).
loadPageData( $from='fromdb' ) - Load the object from a given source by title. (Definition: WikiPage.php:415)
getRevisionRecord() - Get the latest revision. (Definition: WikiPage.php:749)
getTouched() - Get the page_touched field. (Definition: WikiPage.php:651)
StatsFactory Implementation.
getCounter( string $name ) - Makes a new CounterMetric or fetches one from cache.
Interface for objects (potentially) representing an editable wiki page.
canExist() - Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editab...