MediaWiki master
RefreshLinksJob.php
Go to the documentation of this file.
1<?php
21use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
33
/**
 * Job to update link tables for rerendered wiki pages.
 *
 * Depending on its parameters, a job instance works in one of three modes (see run()):
 *   - 'recursive' set: partition the backlinks of the job title into per-title leaf jobs
 *     (plus possibly one more recursive job for the remainder) and enqueue them.
 *   - 'pages' set: refresh the link tables of each listed (namespace, DB key) pair.
 *   - otherwise: refresh the link tables of the job title itself.
 */
class RefreshLinksJob extends Job {
	/**
	 * Number of seconds added to the root job timestamp when deciding whether existing
	 * link data or cached parser output is recent enough to reflect the triggering change
	 */
	private const NORMAL_MAX_LAG = 10;
	/** Value passed as 'timeout' to waitForReplication() (presumably seconds) */
	private const LAG_WAIT_TIMEOUT = 15;

	/**
	 * @param PageIdentity $page The page this job is about; it must be able to exist
	 *   unless the 'pages' parameter is given
	 * @param array $params Job parameters
	 * @throws PageAssertionException If no 'pages' are given and $page cannot exist
	 */
	public function __construct( PageIdentity $page, array $params ) {
		if ( empty( $params['pages'] ) && !$page->canExist() ) {
			// BC with the Title class
			throw new PageAssertionException(
				'The given PageIdentity {pageIdentity} does not represent a proper page',
				[ 'pageIdentity' => $page ]
			);
		}

		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		// Default the cause information; values supplied by the caller win
		$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	/**
	 * Create a job whose command is 'refreshLinksPrioritized' instead of 'refreshLinks'.
	 *
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * Create a job whose command is 'refreshLinksDynamic' instead of 'refreshLinks'.
	 *
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	/**
	 * Run the job.
	 *
	 * @return bool False if any page could not be refreshed; true otherwise
	 */
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsdDataFactory();
					$stats->increment( 'refreshlinks_warning.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					// Keep going after a failure, but remember that one occurred
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}

	/**
	 * Refresh the link tables (via the secondary data updates) of a single page.
	 *
	 * @param PageIdentity $pageIdentity
	 * @return bool True on success or when the update was superseded; false when the
	 *   page does not exist, the page lock could not be acquired, or no parser output
	 *   could be obtained for the current revision
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					// Log the page being processed; for multi-page jobs this differs
					// from the job title
					'page_title' => $page->getTitle()->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$stats->increment( 'refreshlinks_outcome.bad_page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update job by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		/** @noinspection PhpUnusedLocalVariableInspection */
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$stats->increment( 'refreshlinks_outcome.bad_lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by overlapping recursive job
			// for a different template edit, or by direct edit or purge.
			$stats->increment( 'refreshlinks_outcome.good_update_superseded' );
			// treat as success
			return true;
		}

		// These can be fairly long-running jobs, while commitAndWaitForReplication
		// releases primary snapshots, let the replica release their snapshot as well
		$lbFactory->flushReplicaSnapshots( __METHOD__ );
		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// FIXME: Why do we retry this? Can this be a cancellation?
			return false;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * Get the timestamp that page data must be newer than in order to be considered
	 * as already reflecting the change that triggered this job.
	 *
	 * @return string|null The 'rootJobTimestamp' parameter as is for opportunistic jobs,
	 *   that timestamp plus NORMAL_MAX_LAG seconds (TS_MW format) otherwise, or null if
	 *   the job has no 'rootJobTimestamp' parameter
	 */
	private function getLagAwareRootTimestamp() {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return null;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			$lagAwareTimestamp = $rootTimestamp;
		} else {
			// For transclusion updates, the template changes must be reflected
			$lagAwareTimestamp = wfTimestamp(
				TS_MW,
				(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);
		}

		return $lagAwareTimestamp;
	}

	/**
	 * Check whether the links of the page were updated after the lag-aware root timestamp.
	 *
	 * @param WikiPage $page
	 * @return bool Always false if the job has no root timestamp
	 */
	private function isAlreadyRefreshed( WikiPage $page ) {
		$lagAwareTimestamp = $this->getLagAwareRootTimestamp();

		return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
	}

	/**
	 * Check whether the content handler of any slot of the revision asks for HTML
	 * to be generated on edit.
	 *
	 * @param RevisionRecord $revision
	 * @return bool
	 */
	private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
		$services = MediaWikiServices::getInstance();
		foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
			$slot = $revision->getSlots()->getSlot( $role );
			$contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
			if ( $contentHandler->generateHTMLOnEdit() ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Get the parser output of the current revision of the page, reusing the parser
	 * cache entry when it is usable and rendering the revision otherwise.
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null if the current revision could not be determined,
	 *   e.g. because the page changed in the meantime
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition?
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		if ( $cachedOutput ) {
			return $cachedOutput;
		}

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW, 'causeAction' => $causeAction ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		return $output;
	}

	/**
	 * Get the current revision of the page, unless the page changed since it was loaded
	 * or the job was triggered for a revision that is no longer the latest one.
	 *
	 * On failure, a refreshlinks_outcome metric is incremented and the last error is set.
	 *
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return RevisionRecord|null
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key name
		$latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );
			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			// revision just got deleted?
			$stats->increment( 'refreshlinks_outcome.bad_rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return null;

		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	/**
	 * Get the parser output of the current revision from the parser cache, if the entry
	 * is for that revision and is recent enough to reflect the job's triggering change.
	 *
	 * Increments either the 'refreshlinks.parser_cached' or the
	 * 'refreshlinks.parser_uncached' metric.
	 *
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param RevisionRecord $currentRevision Current revision of the page
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null if there is no usable cache entry
	 */
	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsdDataFactoryInterface $stats
	) {
		$cachedOutput = null;
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// As long as the cache rev ID matches the current rev ID and it reflects
				// the job's triggering change, then it is usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId() &&
					$output->getCacheTime() >= $this->getLagAwareRootTimestamp()
				) {
					$cachedOutput = $output;
				}
			}
		}

		if ( $cachedOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		return $cachedOutput;
	}

	/**
	 * Build the options passed to WikiPage::doSecondaryDataUpdates() from the job parameters.
	 *
	 * @return array Has the keys 'recursive', 'causeAction' and 'causeAgent', plus
	 *   'triggeringUser' if the job has a non-empty parameter of that name
	 */
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}

	/**
	 * Get the information used to detect duplicate jobs, ignoring the cause of the job
	 * and, for per-pages jobs, the job title.
	 *
	 * @return array
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		// Kept from the original code; harmless if these keys are absent at the top level
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// The cause is a job parameter (defaulted in the constructor), so it lives
			// under 'params'. Drop it there so that jobs which differ only in what
			// caused them are still detected as duplicates.
			unset( $info['params']['causeAction'] );
			unset( $info['params']['causeAgent'] );
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	/**
	 * Get the number of pages whose links this job refreshes by itself.
	 *
	 * @return int
	 */
	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
Class to both describe a background job and handle jobs.
Definition Job.php:40
Title $title
Definition Job.php:51
getRootJobParams()
Definition Job.php:323
setLastError( $error)
Definition Job.php:434
Class that manages updates of *_link tables as well as similar extension-managed tables.
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
Rendered output of a wiki page, as parsed from wikitext.
Page revision base class.
getPageId( $wikiId=self::LOCAL)
Get the page ID.
getSlots()
Returns the slots defined for this revision.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Represents a title within MediaWiki.
Definition Title.php:78
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1212
internal since 1.36
Definition User.php:93
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for rerendered wiki pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
Base representation for an editable wiki page.
Definition WikiPage.php:79
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:666
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL)
Definition WikiPage.php:530
getTitle()
Get the title object of the article.
Definition WikiPage.php:260
doSecondaryDataUpdates(array $options=[])
Do secondary data updates (such as updating link tables).
loadPageData( $from='fromdb')
Load the object from a given source by title.
Definition WikiPage.php:417
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:742
getTouched()
Get the page_touched field.
Definition WikiPage.php:644
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editab...
if(count( $args)< 1) $job