RefreshLinksJob.php
<?php

use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
use MediaWiki\Deferred\LinksUpdate\LinksUpdate;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageAssertionException;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionRenderer;

class RefreshLinksJob extends Job {
	private const NORMAL_MAX_LAG = 10;
	private const LAG_WAIT_TIMEOUT = 15;

	public function __construct( PageIdentity $page, array $params ) {
		if ( empty( $params['pages'] ) && !$page->canExist() ) {
			// BC with the Title class
			throw new PageAssertionException(
				'The given PageIdentity {pageIdentity} does not represent a proper page',
				[ 'pageIdentity' => $page ]
			);
		}

		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

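	// Illustrative usage sketch (not part of this file): a caller reacting to, say, a
	// template edit could enqueue a prioritized refresh roughly like this. The cause
	// values below are placeholders; only keys that run() and getDataUpdateOptions()
	// actually read are used.
	//
	//     $job = RefreshLinksJob::newPrioritized(
	//         $title,
	//         [
	//             'causeAction' => 'example-template-edit', // hypothetical cause
	//             'causeAgent' => 'ExampleAgent',           // hypothetical agent
	//         ]
	//     );
	//     MediaWikiServices::getInstance()->getJobQueueGroup()->push( $job );
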
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsdDataFactory();
					$stats->increment( 'refreshlinks_warning.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}

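	// For orientation (illustrative, not from this file): a leaf job produced by the
	// partitioning above typically carries a batch of title references plus the root
	// job bookkeeping, roughly of this shape; the concrete values are made up.
	//
	//     [
	//         'pages' => [ [ 0, 'Some_page' ], [ 10, 'Some_template' ] ], // [ namespace, dbKey ] pairs
	//         'rootJobTimestamp' => '20240101000000',
	//         'causeAction' => 'RefreshLinksJob',
	//         'causeAgent' => 'unknown',
	//     ]
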
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( WikiPage::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$stats->increment( 'refreshlinks_outcome.bad_page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$stats->increment( 'refreshlinks_outcome.bad_lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by overlapping recursive job
			// for a different template edit, or by direct edit or purge.
			$stats->increment( 'refreshlinks_outcome.good_update_superseded' );
			// treat as success
			return true;
		}

		// These can be fairly long-running jobs; while commitAndWaitForReplication()
		// releases primary snapshots, also let the replicas release their snapshots here
		$lbFactory->flushReplicaSnapshots( __METHOD__ );
		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// FIXME: Why do we retry this? Can this be a cancellation?
			return false;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	private function getLagAwareRootTimestamp() {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return null;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			$lagAwareTimestamp = $rootTimestamp;
		} else {
			// For transclusion updates, the template changes must be reflected
			$lagAwareTimestamp = wfTimestamp(
				TS_MW,
				(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);
		}

		return $lagAwareTimestamp;
	}

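	// Worked example (illustrative): for a non-opportunistic job with
	// rootJobTimestamp '20240101000000' (TS_MW), the lag-aware timestamp becomes
	// '20240101000010', i.e. the root time pushed forward by NORMAL_MAX_LAG (10)
	// seconds to absorb clock skew and replication lag before a refresh is
	// considered "already done" by isAlreadyRefreshed() below.
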
	private function isAlreadyRefreshed( WikiPage $page ) {
		$lagAwareTimestamp = $this->getLagAwareRootTimestamp();

		return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
	}

	private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
		$services = MediaWikiServices::getInstance();
		foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
			$slot = $revision->getSlots()->getSlot( $role );
			$contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
			if ( $contentHandler->generateHTMLOnEdit() ) {
				return true;
			}
		}
		return false;
	}

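	// For example (illustrative): if every slot of the revision uses a content
	// handler whose generateHTMLOnEdit() returns false, getParserOutput() below
	// passes 'generate-html' => false, so the parse only has to produce the link
	// and metadata information that the link table updates need, not full HTML.
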
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition?
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		if ( $cachedOutput ) {
			return $cachedOutput;
		}

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW, 'causeAction' => $causeAction ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		return $output;
	}

	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scoped lock.
		// This works around the chicken/egg problem of determining the scoped lock key name.
		$latest = $title->getLatestRevID( Title::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );
			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			// revision just got deleted?
			$stats->increment( 'refreshlinks_outcome.bad_rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return null;

		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsdDataFactoryInterface $stats
	) {
		$cachedOutput = null;
		// If page_touched changed after this root job was enqueued, then it is likely
		// that any views of the page have already resulted in re-parses which are now
		// in the cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// The cache is suspected to be up-to-date, so it is worth the I/O of checking.
				// As long as the cache rev ID matches the current rev ID and it reflects
				// the job's triggering change, then it is usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId() &&
					$output->getCacheTime() >= $this->getLagAwareRootTimestamp()
				) {
					$cachedOutput = $output;
				}
			}
		}

		if ( $cachedOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		return $cachedOutput;
	}

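	// Worked example (illustrative, values made up): suppose rootJobTimestamp is
	// '20240101000000', so the lag-aware root timestamp is '20240101000010'. A dirty
	// ParserCache entry with getCacheRevisionId() == 12345 (the current revision ID)
	// and getCacheTime() '20240101000200' passes both checks above and is reused;
	// an entry cached at '20240101000005', or for an older revision, is discarded
	// and getParserOutput() falls back to a fresh parse.
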
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}

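	// For orientation (illustrative): by the time runForTitle() calls
	// doSecondaryDataUpdates(), the options array built here has roughly this shape,
	// with 'known-revision-output' added afterwards:
	//
	//     [
	//         'recursive' => false,
	//         'causeAction' => 'RefreshLinksJob',
	//         'causeAgent' => 'unknown',
	//         'triggeringUser' => $user,            // only if the job carried one
	//         'known-revision-output' => $output,   // set in runForTitle()
	//     ]
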
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For jobs that carry a 'pages' list, the job title is that of the template
			// that changed (or similar), so remove it since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}