MediaWiki master
RefreshLinksJob.php
Go to the documentation of this file.
1<?php
38
110class RefreshLinksJob extends Job {
112 private const NORMAL_MAX_LAG = 10;
114 private const LAG_WAIT_TIMEOUT = 15;
115
116 public function __construct( PageIdentity $page, array $params ) {
117 if ( empty( $params['pages'] ) && !$page->canExist() ) {
118 // BC with the Title class
119 throw new PageAssertionException(
120 'The given PageIdentity {pageIdentity} does not represent a proper page',
121 [ 'pageIdentity' => $page ]
122 );
123 }
124
125 parent::__construct( 'refreshLinks', $page, $params );
126 // Avoid the overhead of de-duplication when it would be pointless
127 $this->removeDuplicates = (
128 // Ranges rarely will line up
129 !isset( $params['range'] ) &&
130 // Multiple pages per job make matches unlikely
131 !( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
132 );
133 $this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
134 // Tell JobRunner to not automatically wrap run() in a transaction round.
135 // Each runForTitle() call will manage its own rounds in order to run DataUpdates
136 // and to avoid contention as well.
137 $this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
138 }
139
145 public static function newPrioritized( PageIdentity $page, array $params ) {
146 $job = new self( $page, $params );
147 $job->command = 'refreshLinksPrioritized';
148
149 return $job;
150 }
151
157 public static function newDynamic( PageIdentity $page, array $params ) {
158 $job = new self( $page, $params );
159 $job->command = 'refreshLinksDynamic';
160
161 return $job;
162 }
163
164 public function run() {
165 $ok = true;
166
167 if ( !empty( $this->params['recursive'] ) ) {
168 // Job to update all (or a range of) backlink pages for a page
169
170 // When the base job branches, wait for the replica DBs to catch up to the primary.
171 // From then on, we know that any template changes at the time the base job was
172 // enqueued will be reflected in backlink page parses when the leaf jobs run.
173 $services = MediaWikiServices::getInstance();
174 if ( !isset( $this->params['range'] ) ) {
175 $lbFactory = $services->getDBLoadBalancerFactory();
176 if ( !$lbFactory->waitForReplication( [
177 'timeout' => self::LAG_WAIT_TIMEOUT
178 ] ) ) {
179 // only try so hard, keep going with what we have
180 $stats = $services->getStatsFactory();
181 $stats->getCounter( 'refreshlinks_warnings_total' )
182 ->setLabel( 'reason', 'lag_wait_failed' )
183 ->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
184 ->increment();
185 }
186 }
187 // Carry over information for de-duplication
188 $extraParams = $this->getRootJobParams();
189 $extraParams['triggeredRecursive'] = true;
190 // Carry over cause information for logging
191 $extraParams['causeAction'] = $this->params['causeAction'];
192 $extraParams['causeAgent'] = $this->params['causeAgent'];
193 // Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
194 // jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
196 $this,
197 $services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
198 1, // job-per-title
199 [ 'params' => $extraParams ]
200 );
201 $services->getJobQueueGroup()->push( $jobs );
202
203 } elseif ( isset( $this->params['pages'] ) ) {
204 // Job to update link tables for a set of titles
205 foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
206 $title = Title::makeTitleSafe( $ns, $dbKey );
207 if ( $title && $title->canExist() ) {
208 $ok = $this->runForTitle( $title ) && $ok;
209 } else {
210 $ok = false;
211 $this->setLastError( "Invalid title ($ns,$dbKey)." );
212 }
213 }
214
215 } else {
216 // Job to update link tables for a given title
217 $ok = $this->runForTitle( $this->title );
218 }
219
220 return $ok;
221 }
222
227 protected function runForTitle( PageIdentity $pageIdentity ) {
228 $services = MediaWikiServices::getInstance();
229 $stats = $services->getStatsFactory();
230 $renderer = $services->getRevisionRenderer();
231 $parserCache = $services->getParserCache();
232 $lbFactory = $services->getDBLoadBalancerFactory();
233 $ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );
234
235 // Load the page from the primary DB
236 $page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
237 $page->loadPageData( IDBAccessObject::READ_LATEST );
238
239 if ( !$page->exists() ) {
240 // Probably due to concurrent deletion or renaming of the page
241 $logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
242 $logger->warning(
243 'The page does not exist. Perhaps it was deleted?',
244 [
245 'page_title' => $this->title->getPrefixedDBkey(),
246 'job_params' => $this->getParams(),
247 'job_metadata' => $this->getMetadata()
248 ]
249 );
250 $this->incrementFailureCounter( $stats, 'page_not_found' );
251
252 // retry later to handle unlucky race condition
253 return false;
254 }
255
256 // Serialize link update job by page ID so they see each others' changes.
257 // The page ID and latest revision ID will be queried again after the lock
258 // is acquired to bail if they are changed from that of loadPageData() above.
259 // Serialize links updates by page ID so they see each others' changes
260 $dbw = $lbFactory->getPrimaryDatabase();
262 $scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
263 if ( $scopedLock === null ) {
264 // Another job is already updating the page, likely for a prior revision (T170596)
265 $this->setLastError( 'LinksUpdate already running for this page, try again later.' );
266 $this->incrementFailureCounter( $stats, 'lock_failure' );
267
268 // retry later when overlapping job for previous rev is done
269 return false;
270 }
271
272 if ( $this->isAlreadyRefreshed( $page ) ) {
273 // this job has been superseded, e.g. by overlapping recursive job
274 // for a different template edit, or by direct edit or purge.
275 $stats->getCounter( 'refreshlinks_superseded_updates_total' )
276 ->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
277 ->increment();
278 // treat as success
279 return true;
280 }
281
282 // Parse during a fresh transaction round for better read consistency
283 $lbFactory->beginPrimaryChanges( __METHOD__ );
284 $output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
285 $options = $this->getDataUpdateOptions();
286 $lbFactory->commitPrimaryChanges( __METHOD__ );
287
288 if ( !$output ) {
289 // probably raced out.
290 // Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
291 // Don't retry job.
292 return true;
293 }
294
295 // Tell DerivedPageDataUpdater to use this parser output
296 $options['known-revision-output'] = $output;
297 // Execute corresponding DataUpdates immediately
298 $page->doSecondaryDataUpdates( $options );
299 InfoAction::invalidateCache( $page );
300
301 // NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
302 // This means the page will have to be rendered on-the-fly when it is next viewed.
303 // This is to avoid spending limited ParserCache capacity on rarely visited pages.
304 // TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
305 // for pages that are likely to benefit (T327162).
306
307 // Commit any writes here in case this method is called in a loop.
308 // In that case, the scoped lock will fail to be acquired.
309 $lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );
310
311 return true;
312 }
313
317 private function getLagAwareRootTimestamp() {
318 // Get the timestamp of the change that triggered this job
319 $rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
320 if ( $rootTimestamp === null ) {
321 return null;
322 }
323
324 if ( !empty( $this->params['isOpportunistic'] ) ) {
325 // Neither clock skew nor DB snapshot/replica DB lag matter much for
326 // such updates; focus on reusing the (often recently updated) cache
327 $lagAwareTimestamp = $rootTimestamp;
328 } else {
329 // For transclusion updates, the template changes must be reflected
330 $lagAwareTimestamp = wfTimestamp(
331 TS_MW,
332 (int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
333 );
334 }
335
336 return $lagAwareTimestamp;
337 }
338
343 private function isAlreadyRefreshed( WikiPage $page ) {
344 $lagAwareTimestamp = $this->getLagAwareRootTimestamp();
345
346 return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
347 }
348
354 private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
355 $services = MediaWikiServices::getInstance();
356 foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
357 $slot = $revision->getSlots()->getSlot( $role );
358 $contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
359 if ( $contentHandler->generateHTMLOnEdit() ) {
360 return true;
361 }
362 }
363 return false;
364 }
365
375 private function getParserOutput(
376 RevisionRenderer $renderer,
377 ParserCache $parserCache,
378 WikiPage $page,
379 StatsFactory $stats
380 ) {
381 $revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
382 if ( !$revision ) {
383 // race condition?
384 return null;
385 }
386
387 $cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
388 $statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' );
389
390 if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
391 $statsCounter
392 ->setLabel( 'status', 'cache_hit' )
393 ->setLabel( 'html_changed', 'n/a' )
394 ->copyToStatsdAt( 'refreshlinks.parser_cached' )
395 ->increment();
396
397 return $cachedOutput;
398 }
399
400 $causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
401 $parserOptions = $page->makeParserOptions( 'canonical' );
402
403 // T371713: Temporary statistics collection code to determine
404 // feasibility of Parsoid selective update
405 $sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
406 MainConfigNames::ParsoidSelectiveUpdateSampleRate
407 );
408 $doSample = $sampleRate && mt_rand( 1, $sampleRate ) === 1;
409 if ( $doSample && $cachedOutput === null ) {
410 // In order to collect accurate statistics, check for
411 // a dirty copy in the cache even if we wouldn't have
412 // to otherwise.
413 $cachedOutput = $parserCache->getDirty( $page, $parserOptions ) ?: null;
414 }
415
416 $renderedRevision = $renderer->getRenderedRevision(
417 $revision,
418 $parserOptions,
419 null,
420 [
421 'audience' => $revision::RAW,
422 'causeAction' => $causeAction,
423 // Providing a previous parse potentially allows for
424 // selective updates
425 'previous-output' => $cachedOutput,
426 ]
427 );
428
429 $parseTimestamp = wfTimestampNow(); // timestamp that parsing started
430 $output = $renderedRevision->getRevisionParserOutput( [
431 // To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
432 'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
433 ] );
434 $output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()
435 // T371713: Temporary statistics collection code to determine
436 // feasibility of Parsoid selective update
437 if ( $doSample ) {
438 $content = $revision->getContent( SlotRecord::MAIN );
439 $labels = [
440 'source' => 'RefreshLinksJob',
441 'type' => $cachedOutput === null ? 'full' : 'selective',
442 'reason' => $causeAction,
443 'parser' => $parserOptions->getUseParsoid() ? 'parsoid' : 'legacy',
444 'opportunistic' => empty( $this->params['isOpportunistic'] ) ? 'false' : 'true',
445 'wiki' => WikiMap::getCurrentWikiId(),
446 'model' => $content ? $content->getModel() : 'unknown',
447 ];
448 $stats
449 ->getCounter( 'ParserCache_selective_total' )
450 ->setLabels( $labels )
451 ->increment();
452 $stats
453 ->getCounter( 'ParserCache_selective_cpu_seconds' )
454 ->setLabels( $labels )
455 ->incrementBy( $output->getTimeProfile( 'cpu' ) );
456 }
457
458 // Collect stats on parses that don't actually change the page content.
459 // In that case, we could abort here, and perhaps we could also avoid
460 // triggering CDN purges (T369898).
461 if ( !$cachedOutput ) {
462 // There was no cached output
463 $htmlChanged = 'unknown';
464 } elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
465 // We have cached output, but we couldn't be sure that it was still good.
466 // So we parsed again, but the result turned out to be the same HTML as
467 // before.
468 $htmlChanged = 'no';
469 } else {
470 // Re-parsing yielded HTML different from the cached output.
471 $htmlChanged = 'yes';
472 }
473
474 $statsCounter
475 ->setLabel( 'status', 'cache_miss' )
476 ->setLabel( 'html_changed', $htmlChanged )
477 ->copyToStatsdAt( 'refreshlinks.parser_uncached' )
478 ->increment();
479
480 return $output;
481 }
482
490 private function getCurrentRevisionIfUnchanged(
491 WikiPage $page,
492 StatsFactory $stats
493 ) {
494 $title = $page->getTitle();
495 // Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
496 // This is used to detect edits/moves after loadPageData() but before the scope lock.
497 // The works around the chicken/egg problem of determining the scope lock key name
498 $latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );
499
500 $triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
501 if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
502 // This job is obsolete and one for the latest revision will handle updates
503 $this->incrementFailureCounter( $stats, 'rev_not_current' );
504 $this->setLastError( "Revision $triggeringRevisionId is not current" );
505 return null;
506 }
507
508 // Load the current revision. Note that $page should have loaded with READ_LATEST.
509 // This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
510 $revision = $page->getRevisionRecord();
511 if ( !$revision ) {
512 // revision just got deleted?
513 $this->incrementFailureCounter( $stats, 'rev_not_found' );
514 $this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
515 return null;
516
517 } elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
518 // Do not clobber over newer updates with older ones. If all jobs where FIFO and
519 // serialized, it would be OK to update links based on older revisions since it
520 // would eventually get to the latest. Since that is not the case (by design),
521 // only update the link tables to a state matching the current revision's output.
522 $this->incrementFailureCounter( $stats, 'rev_not_current' );
523 $this->setLastError( "Revision {$revision->getId()} is not current" );
524
525 return null;
526 }
527
528 return $revision;
529 }
530
540 private function getParserOutputFromCache(
541 ParserCache $parserCache,
542 WikiPage $page,
543 RevisionRecord $currentRevision,
544 StatsFactory $stats
545 ): ?ParserOutput {
546 // Parsoid can do selective updates, so it is always worth the I/O
547 // to check for a previous parse.
548 $parserOptions = $page->makeParserOptions( 'canonical' );
549 if ( $parserOptions->getUseParsoid() ) {
550 return $parserCache->getDirty( $page, $parserOptions ) ?: null;
551 }
552 // If page_touched changed after this root job, then it is likely that
553 // any views of the pages already resulted in re-parses which are now in
554 // cache. The cache can be reused to avoid expensive parsing in some cases.
555 $rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
556 if ( $rootTimestamp !== null ) {
557 $opportunistic = !empty( $this->params['isOpportunistic'] );
558 if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
559 // Cache is suspected to be up-to-date so it's worth the I/O of checking.
560 // We call canUseParserOutputFromCache() later to check if it's usable.
561 return $parserCache->getDirty( $page, $parserOptions ) ?: null;
562 }
563 }
564
565 return null;
566 }
567
568 private function canUseParserOutputFromCache(
569 ParserOutput $cachedOutput,
570 RevisionRecord $currentRevision
571 ) {
572 // As long as the cache rev ID matches the current rev ID and it reflects
573 // the job's triggering change, then it is usable.
574 return $cachedOutput->getCacheRevisionId() == $currentRevision->getId()
575 && $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp();
576 }
577
585 private function incrementFailureCounter( StatsFactory $stats, $reason ): void {
586 $stats->getCounter( 'refreshlinks_failures_total' )
587 ->setLabel( 'reason', $reason )
588 ->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" )
589 ->increment();
590 }
591
595 private function getDataUpdateOptions() {
596 $options = [
597 'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
598 // Carry over cause so the update can do extra logging
599 'causeAction' => $this->params['causeAction'],
600 'causeAgent' => $this->params['causeAgent']
601 ];
602 if ( !empty( $this->params['triggeringUser'] ) ) {
603 $userInfo = $this->params['triggeringUser'];
604 if ( $userInfo['userId'] ) {
605 $options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
606 } else {
607 // Anonymous, use the username
608 $options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
609 }
610 }
611
612 return $options;
613 }
614
615 public function getDeduplicationInfo() {
616 $info = parent::getDeduplicationInfo();
617 unset( $info['causeAction'] );
618 unset( $info['causeAgent'] );
619 if ( is_array( $info['params'] ) ) {
620 // For per-pages jobs, the job title is that of the template that changed
621 // (or similar), so remove that since it ruins duplicate detection
622 if ( isset( $info['params']['pages'] ) ) {
623 unset( $info['namespace'] );
624 unset( $info['title'] );
625 }
626 }
627
628 return $info;
629 }
630
631 public function workItemCount() {
632 if ( !empty( $this->params['recursive'] ) ) {
633 return 0; // nothing actually refreshed
634 } elseif ( isset( $this->params['pages'] ) ) {
635 return count( $this->params['pages'] );
636 }
637
638 return 1; // one title
639 }
640}
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
array $params
The job parameters.
setLastError( $error)
This is actually implemented in the Job class.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers the remainder of the backlinks.
Describe and execute a background job.
Definition Job.php:38
Title $title
Definition Job.php:49
getRootJobParams()
Definition Job.php:320
Class the manages updates of *_link tables as well as similar extension-managed tables.
Update object handling the cleanup of secondary data after a page was edited.
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
ParserOutput is a rendering of a Content object or a message.
getRawText()
Get the cacheable text with <mw:editsection> markers still in it.
Page revision base class.
getContent( $role, $audience=self::FOR_PUBLIC, ?Authority $performer=null)
Returns the Content of the given slot of this revision.
getPageId( $wikiId=self::LOCAL)
Get the page ID.
getSlots()
Returns the slots defined for this revision.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ?ParserOptions $options=null, ?Authority $forPerformer=null, array $hints=[])
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:78
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1211
internal since 1.36
Definition User.php:93
Tools for dealing with other locally-hosted wikis.
Definition WikiMap.php:31
Job to update link tables for rerendered wiki pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
Base representation for an editable wiki page.
Definition WikiPage.php:85
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:677
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL)
Definition WikiPage.php:535
getTitle()
Get the title object of the article.
Definition WikiPage.php:259
doSecondaryDataUpdates(array $options=[])
Do secondary data updates (such as updating link tables).
loadPageData( $from='fromdb')
Load the object from a given source by title.
Definition WikiPage.php:419
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:753
getTouched()
Get the page_touched field.
Definition WikiPage.php:655
This is the primary interface for validating metrics definitions, caching defined metrics,...
getCounter(string $name)
Makes a new CounterMetric or fetches one from cache.
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editable page in the wiki's database.
Interface for database access objects.
setLabel(string $key, string $value)
copyToStatsdAt( $statsdNamespaces)
if(count( $args)< 1) $job