MediaWiki master
RefreshLinksJob.php
Go to the documentation of this file.
1<?php
22
44
/**
 * Job to update link tables for rerendered wiki pages.
 *
 * Based on the branches in run(), this job comes in three modes:
 *  - recursive: update backlink pages of the job title (optionally a 'range'
 *    of them), by partitioning into per-title leaf jobs;
 *  - pages: update link tables for an explicit list of (namespace, dbkey) pairs;
 *  - default: update link tables for the single job title.
 */
class RefreshLinksJob extends Job {
	/** @var int Safety margin (seconds) added to root job timestamps to absorb clock skew/lag */
	private const NORMAL_MAX_LAG = 10;
	/** @var int Maximum seconds to wait for replica DBs before proceeding anyway */
	private const LAG_WAIT_TIMEOUT = 15;

	/**
	 * @param PageIdentity $page Page to refresh links for
	 * @param array $params Job parameters
	 */
	public function __construct( PageIdentity $page, array $params ) {
		if ( empty( $params['pages'] ) && !$page->canExist() ) {
			// BC with the Title class
			throw new PageAssertionException(
				'The given PageIdentity {pageIdentity} does not represent a proper page',
				[ 'pageIdentity' => $page ]
			);
		}

		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	/**
	 * Construct a job routed to the prioritized 'refreshLinksPrioritized' queue.
	 *
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * Construct a job routed to the 'refreshLinksDynamic' queue.
	 *
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	/**
	 * @return bool Whether all requested link updates succeeded
	 */
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsFactory();
					$stats->getCounter( 'refreshlinks_warnings_total' )
						->setLabel( 'reason', 'lag_wait_failed' )
						->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
						->increment();
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}

	/**
	 * Update the link tables of one page.
	 *
	 * @param PageIdentity $pageIdentity
	 * @return bool True on success or when the update was superseded/obsolete;
	 *  false when the job should be retried (missing page, lock contention)
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$this->incrementFailureCounter( $stats, 'page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$this->incrementFailureCounter( $stats, 'lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by overlapping recursive job
			// for a different template edit, or by direct edit or purge.
			$stats->getCounter( 'refreshlinks_superseded_updates_total' )
				->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
				->increment();
			// treat as success
			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// Don't retry job.
			return true;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * Root job timestamp, padded by NORMAL_MAX_LAG for non-opportunistic updates.
	 *
	 * @return string|null TS_MW timestamp, or null when no root job timestamp is set
	 */
	private function getLagAwareRootTimestamp() {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return null;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			$lagAwareTimestamp = $rootTimestamp;
		} else {
			// For transclusion updates, the template changes must be reflected
			$lagAwareTimestamp = wfTimestamp(
				TS_MW,
				(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);
		}

		return $lagAwareTimestamp;
	}

	/**
	 * @param WikiPage $page
	 * @return bool Whether the page's link tables were already refreshed after
	 *  the (lag-aware) root job timestamp
	 */
	private function isAlreadyRefreshed( WikiPage $page ) {
		$lagAwareTimestamp = $this->getLagAwareRootTimestamp();

		return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
	}

	/**
	 * Whether any slot of the revision has a content handler that wants HTML
	 * generated on edit.
	 *
	 * @param RevisionRecord $revision
	 * @return bool
	 */
	private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
		$services = MediaWikiServices::getInstance();
		foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
			$slot = $revision->getSlots()->getSlot( $role );
			$contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
			if ( $contentHandler->generateHTMLOnEdit() ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Get the parser output to base the link updates on, reusing a cached
	 * rendering when it is still applicable.
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page identity and latest revision (READ_LATEST)
	 * @param StatsFactory $stats
	 * @return ParserOutput|null Combined output for all slots; null if the
	 *  revision changed underneath us (metric sent by getCurrentRevisionIfUnchanged())
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsFactory $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition?
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		$statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' );

		if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
			$statsCounter
				->setLabel( 'status', 'cache_hit' )
				->setLabel( 'html_changed', 'n/a' )
				->copyToStatsdAt( 'refreshlinks.parser_cached' )
				->increment();

			return $cachedOutput;
		}

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$parserOptions = $page->makeParserOptions( 'canonical' );

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::ParsoidSelectiveUpdateSampleRate
		);
		$doSample = $sampleRate && mt_rand( 1, $sampleRate ) === 1;
		if ( $doSample && $cachedOutput === null ) {
			// In order to collect accurate statistics, check for
			// a dirty copy in the cache even if we wouldn't have
			// to otherwise.
			$cachedOutput = $parserCache->getDirty( $page, $parserOptions ) ?: null;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$parserOptions,
			null,
			[
				'audience' => $revision::RAW,
				'causeAction' => $causeAction,
				// Providing a previous parse potentially allows for
				// selective updates
				'previous-output' => $cachedOutput,
			]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()
		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		if ( $doSample ) {
			$content = $revision->getContent( SlotRecord::MAIN );
			$labels = [
				'source' => 'RefreshLinksJob',
				'type' => $cachedOutput === null ? 'full' : 'selective',
				'reason' => $causeAction,
				'parser' => $parserOptions->getUseParsoid() ? 'parsoid' : 'legacy',
				'opportunistic' => empty( $this->params['isOpportunistic'] ) ? 'false' : 'true',
				'wiki' => WikiMap::getCurrentWikiId(),
				'model' => $content ? $content->getModel() : 'unknown',
			];
			$stats
				->getCounter( 'ParserCache_selective_total' )
				->setLabels( $labels )
				->increment();
			$stats
				->getCounter( 'ParserCache_selective_cpu_seconds' )
				->setLabels( $labels )
				->incrementBy( $output->getTimeProfile( 'cpu' ) );
		}

		// Collect stats on parses that don't actually change the page content.
		// In that case, we could abort here, and perhaps we could also avoid
		// triggering CDN purges (T369898).
		if ( !$cachedOutput ) {
			// There was no cached output
			$htmlChanged = 'unknown';
		} elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
			// We have cached output, but we couldn't be sure that it was still good.
			// So we parsed again, but the result turned out to be the same HTML as
			// before.
			$htmlChanged = 'no';
		} else {
			// Re-parsing yielded HTML different from the cached output.
			$htmlChanged = 'yes';
		}

		$statsCounter
			->setLabel( 'status', 'cache_miss' )
			->setLabel( 'html_changed', $htmlChanged )
			->copyToStatsdAt( 'refreshlinks.parser_uncached' )
			->increment();

		return $output;
	}

	/**
	 * Get the current revision record if it is unchanged from what the job
	 * was enqueued for (and from what loadPageData() saw before the page lock).
	 *
	 * @param WikiPage $page
	 * @param StatsFactory $stats
	 * @return RevisionRecord|null The same instance that WikiPage::getRevisionRecord()
	 *  returned, or null if the job is obsolete (failure metric already sent)
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsFactory $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key name
		$latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );
			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			// revision just got deleted?
			$this->incrementFailureCounter( $stats, 'rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return null;

		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs where FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	/**
	 * Fetch a possibly-stale ParserOutput from the cache when it is worth the I/O.
	 *
	 * @param ParserCache $parserCache
	 * @param WikiPage $page
	 * @param RevisionRecord $currentRevision
	 * @param StatsFactory $stats
	 * @return ParserOutput|null Cached output (possibly dirty/expired), or null
	 */
	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsFactory $stats
	): ?ParserOutput {
		// Parsoid can do selective updates, so it is always worth the I/O
		// to check for a previous parse.
		$parserOptions = $page->makeParserOptions( 'canonical' );
		if ( $parserOptions->getUseParsoid() ) {
			return $parserCache->getDirty( $page, $parserOptions ) ?: null;
		}
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// We call canUseParserOutputFromCache() later to check if it's usable.
				return $parserCache->getDirty( $page, $parserOptions ) ?: null;
			}
		}

		return null;
	}

	/**
	 * @param ParserOutput $cachedOutput
	 * @param RevisionRecord $currentRevision
	 * @return bool Whether the cached output may be used instead of re-parsing
	 */
	private function canUseParserOutputFromCache(
		ParserOutput $cachedOutput,
		RevisionRecord $currentRevision
	) {
		// As long as the cache rev ID matches the current rev ID and it reflects
		// the job's triggering change, then it is usable.
		return $cachedOutput->getCacheRevisionId() == $currentRevision->getId()
			&& $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp();
	}

	/**
	 * Increment the failure counter metric, labelled by reason.
	 *
	 * @param StatsFactory $stats
	 * @param string $reason
	 */
	private function incrementFailureCounter( StatsFactory $stats, $reason ): void {
		$stats->getCounter( 'refreshlinks_failures_total' )
			->setLabel( 'reason', $reason )
			->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" )
			->increment();
	}

	/**
	 * @return array Options for WikiPage::doSecondaryDataUpdates()
	 */
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}

	/**
	 * @return array Deduplication info, with cause fields and (for multi-page
	 *  jobs) the job title stripped out
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	/**
	 * @return int Number of pages this job will actually refresh
	 */
	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
647
649class_alias( RefreshLinksJob::class, 'RefreshLinksJob' );
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Displays information about a page.
static invalidateCache(PageIdentity $page, $revid=null)
Clear the info cache for a given Title.
Class that manages updates of *_link tables as well as similar extension-managed tables.
Update object handling the cleanup of secondary data after a page was edited.
Describe and execute a background job.
Definition Job.php:41
array $params
Array of job parameters.
Definition Job.php:46
setLastError( $error)
Definition Job.php:435
Job to update link tables for rerendered wiki pages.
__construct(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
Helper for a Job that updates links to a given page title.
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
const UpdateRowsPerJob
Name constant for the UpdateRowsPerJob setting, for use with Config::get()
const ParsoidSelectiveUpdateSampleRate
Name constant for the ParsoidSelectiveUpdateSampleRate setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Exception if a PageIdentity is an invalid argument.
Base representation for an editable wiki page.
Definition WikiPage.php:92
Cache for ParserOutput objects corresponding to the latest page revisions.
ParserOutput is a rendering of a Content object or a message.
Page revision base class.
The RevisionRenderer service provides access to rendered output for revisions.
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:78
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1208
User class for the MediaWiki software.
Definition User.php:120
Tools for dealing with other locally-hosted wikis.
Definition WikiMap.php:31
This is the primary interface for validating metrics definitions, caching defined metrics,...
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editable page in the wiki.
Interface for database access objects.
setLastError( $error)
This is actually implemented in the Job class.
setLabel(string $key, string $value)
copyToStatsdAt( $statsdNamespaces)
if(count( $args)< 1) $job