MediaWiki  master
RefreshLinksJob.php
Go to the documentation of this file.
1 <?php
21 use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
30 
56 class RefreshLinksJob extends Job {
58  private const NORMAL_MAX_LAG = 10;
60  private const LAG_WAIT_TIMEOUT = 15;
61 
/**
 * @param PageIdentity $page Page the job is attached to
 * @param array $params Job parameters; 'pages' and 'range' are inspected here
 * @throws PageAssertionException If no 'pages' list is given and $page cannot exist
 */
public function __construct( PageIdentity $page, array $params ) {
	// BC with the Title class: without an explicit page list, the job page must be a proper page
	$hasPageList = !empty( $params['pages'] );
	if ( !$hasPageList && !$page->canExist() ) {
		throw new PageAssertionException(
			'The given PageIdentity {pageIdentity} does not represent a proper page',
			[ 'pageIdentity' => $page ]
		);
	}

	parent::__construct( 'refreshLinks', $page, $params );

	// Avoid the overhead of de-duplication when it would be pointless:
	// ranges rarely will line up, and multiple pages per job make matches unlikely.
	$this->removeDuplicates = !(
		isset( $params['range'] ) ||
		( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
	);

	// Default cause information; values already present in the params win
	$this->params = $this->params + [
		'causeAction' => 'RefreshLinksJob',
		'causeAgent' => 'unknown'
	];

	// Tell JobRunner to not automatically wrap run() in a transaction round.
	// Each runForTitle() call will manage its own rounds in order to run DataUpdates
	// and to avoid contention as well.
	$this->executionFlags = $this->executionFlags | self::JOB_NO_EXPLICIT_TRX_ROUND;
}
85 
/**
 * Build a job whose command is 'refreshLinksPrioritized' instead of the default.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newPrioritized( PageIdentity $page, array $params ) {
	$prioritized = new self( $page, $params );
	// Only the command name differs from a plain job
	$prioritized->command = 'refreshLinksPrioritized';

	return $prioritized;
}
97 
/**
 * Build a job whose command is 'refreshLinksDynamic' instead of the default.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newDynamic( PageIdentity $page, array $params ) {
	$dynamic = new self( $page, $params );
	// Only the command name differs from a plain job
	$dynamic->command = 'refreshLinksDynamic';

	return $dynamic;
}
109 
/**
 * Run the job.
 *
 * The job parameters select one of three modes:
 *  - 'recursive': split the backlinks of the job title into per-title jobs (and possibly
 *    one more recursive job for the remainder) and push them to the job queue;
 *  - 'pages': refresh links for every [ namespace, DB key ] pair in the list;
 *  - otherwise: refresh links for the job title itself.
 *
 * @return bool False if any title was invalid or runForTitle() reported failure
 */
public function run() {
	$ok = true;

	if ( !empty( $this->params['recursive'] ) ) {
		// Job to update all (or a range of) backlink pages for a page

		// When the base job branches, wait for the replica DBs to catch up to the primary.
		// From then on, we know that any template changes at the time the base job was
		// enqueued will be reflected in backlink page parses when the leaf jobs run.
		$services = MediaWikiServices::getInstance();
		if ( !isset( $this->params['range'] ) ) {
			$lbFactory = $services->getDBLoadBalancerFactory();
			if ( !$lbFactory->waitForReplication( [
				'domain' => $lbFactory->getLocalDomainID(),
				'timeout' => self::LAG_WAIT_TIMEOUT
			] ) ) {
				// only try so hard, keep going with what we have
				$stats = $services->getStatsdDataFactory();
				$stats->increment( 'refreshlinks_warning.lag_wait_failed' );
			}
		}
		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];
		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		$services->getJobQueueGroup()->push( $jobs );

	} elseif ( isset( $this->params['pages'] ) ) {
		// Job to update link tables for a set of titles
		foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			if ( $title && $title->canExist() ) {
				// Keep processing the remaining titles even after a failure
				$ok = $this->runForTitle( $title ) && $ok;
			} else {
				$ok = false;
				$this->setLastError( "Invalid title ($ns,$dbKey)." );
			}
		}

	} else {
		// Job to update link tables for a given title
		$ok = $this->runForTitle( $this->title );
	}

	return $ok;
}
166 
/**
 * Refresh the link tables (and other secondary data) of a single page.
 *
 * @param PageIdentity $pageIdentity Page to refresh
 * @return bool True on success or when the update was superseded; false when the page is
 *  missing, the page lock could not be acquired, or no parser output could be obtained
 */
protected function runForTitle( PageIdentity $pageIdentity ) {
	$services = MediaWikiServices::getInstance();
	$stats = $services->getStatsdDataFactory();
	$renderer = $services->getRevisionRenderer();
	$parserCache = $services->getParserCache();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the primary DB
	$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
	$page->loadPageData( WikiPage::READ_LATEST );

	if ( !$page->exists() ) {
		// Probably due to concurrent deletion or renaming of the page
		$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
		$logger->notice(
			'The page does not exist. Perhaps it was deleted?',
			[
				// Report the page actually being processed; for 'pages' jobs the job
				// title is a different page, so it is logged separately.
				'page_title' => $page->getTitle()->getPrefixedDBkey(),
				'job_title' => $this->title->getPrefixedDBkey(),
				'job_params' => $this->getParams(),
				'job_metadata' => $this->getMetadata()
			]
		);
		$stats->increment( 'refreshlinks_outcome.bad_page_not_found' );

		// retry later to handle unlucky race condition
		return false;
	}

	// Serialize link update job by page ID so they see each others' changes.
	// The page ID and latest revision ID will be queried again after the lock
	// is acquired to bail if they are changed from that of loadPageData() above.
	$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_PRIMARY );
	// $scopedLock is not read again; NOTE(review): presumably holding the reference keeps
	// the lock until this method returns — confirm against LinksUpdate::acquirePageLock().
	$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
	if ( $scopedLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$stats->increment( 'refreshlinks_outcome.bad_lock_failure' );

		// retry later when overlapping job for previous rev is done
		return false;
	}

	if ( $this->isAlreadyRefreshed( $page ) ) {
		// this job has been superseded, e.g. by overlapping recursive job
		// for a different template edit, or by direct edit or purge.
		$stats->increment( 'refreshlinks_outcome.good_update_superseded' );
		// treat as success
		return true;
	}

	// These can be fairly long-running jobs, while commitAndWaitForReplication
	// releases primary snapshots, let the replica release their snapshot as well
	$lbFactory->flushReplicaSnapshots( __METHOD__ );
	// Parse during a fresh transaction round for better read consistency
	$lbFactory->beginPrimaryChanges( __METHOD__ );
	$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
	$options = $this->getDataUpdateOptions();
	$lbFactory->commitPrimaryChanges( __METHOD__ );

	if ( !$output ) {
		// probably raced out.
		// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
		// FIXME: Why do we retry this? Can this be a cancellation?
		return false;
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$options['known-revision-output'] = $output;
	// Execute corresponding DataUpdates immediately
	$page->doSecondaryDataUpdates( $options );
	// Clear the info cache for the page now that its secondary data changed
	InfoAction::invalidateCache( $page );

	// Commit any writes here in case this method is called in a loop.
	// In that case, the scoped lock will fail to be acquired.
	$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

	return true;
}
252 
/**
 * Get the root job timestamp, padded by NORMAL_MAX_LAG unless the job is opportunistic.
 *
 * @return string|null Timestamp of the triggering change (TS_MW when padded), or null
 *  if the job has no 'rootJobTimestamp' parameter
 */
private function getLagAwareRootTimestamp() {
	// Timestamp of the change that triggered this job, if any
	$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;

	// No timestamp at all, or an opportunistic update: neither clock skew nor DB
	// snapshot/replica DB lag matter much for such updates, so the value is used as-is
	// to favor reusing the (often recently updated) cache.
	if ( $rootTimestamp === null || !empty( $this->params['isOpportunistic'] ) ) {
		return $rootTimestamp;
	}

	// For transclusion updates, the template changes must be reflected,
	// so move the cut-off forward by the tolerated lag.
	$paddedUnixTime = (int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG;

	return wfTimestamp( TS_MW, $paddedUnixTime );
}
277 
/**
 * Check whether the page's links were already updated after the lag-aware root timestamp.
 *
 * @param WikiPage $page
 * @return bool Always false when the job has no root timestamp
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	$cutoff = $this->getLagAwareRootTimestamp();
	if ( $cutoff === null ) {
		// Nothing to compare against
		return false;
	}

	return $page->getLinksTimestamp() > $cutoff;
}
287 
/**
 * Get the parser output for the current revision, from the parser cache when usable
 * and from a fresh render otherwise.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded by the caller
 * @param StatsdDataFactoryInterface $stats
 * @return mixed The parser output object, or null if the current revision is unusable
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$currentRev = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$currentRev ) {
		// race condition?
		return null;
	}

	// Prefer a usable cache entry over an expensive parse
	$fromCache = $this->getParserOutputFromCache( $parserCache, $page, $currentRev, $stats );
	if ( $fromCache ) {
		return $fromCache;
	}

	$hints = [
		'audience' => $currentRev::RAW,
		'causeAction' => $this->params['causeAction'] ?? 'RefreshLinksJob'
	];
	$rendered = $renderer->getRenderedRevision(
		$currentRev,
		$page->makeParserOptions( 'canonical' ),
		null,
		$hints
	);

	// Record when parsing started, before the output is actually generated
	$parseStart = wfTimestampNow();
	$freshOutput = $rendered->getRevisionParserOutput( [ 'generate-html' => false ] );
	// notify LinksUpdate::doUpdate()
	$freshOutput->setCacheTime( $parseStart );

	return $freshOutput;
}
328 
/**
 * Get the current revision of the page, unless it no longer matches the state the job
 * was created for or the state loaded by the caller.
 *
 * Each failure path increments a refreshlinks_outcome metric and sets the job's last error.
 *
 * @param WikiPage $page Page that should have been loaded with READ_LATEST
 * @param StatsdDataFactoryInterface $stats
 * @return mixed The current revision record, or null if it is missing or not current
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$title = $page->getTitle();
	// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
	// This is used to detect edits/moves after loadPageData() but before the scope lock.
	// This works around the chicken/egg problem of determining the scope lock key name.
	$latest = $title->getLatestRevID( Title::READ_LATEST );

	$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
	$isObsolete = $triggeringRevisionId && $triggeringRevisionId !== $latest;
	if ( $isObsolete ) {
		// This job is obsolete and one for the latest revision will handle updates
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision $triggeringRevisionId is not current" );

		return null;
	}

	// Load the current revision. Note that $page should have loaded with READ_LATEST.
	// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
	$revision = $page->getRevisionRecord();
	if ( !$revision ) {
		// revision just got deleted?
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

		return null;
	}

	if ( !( $revision->getId() === $latest && $revision->getPageId() === $page->getId() ) ) {
		// Do not clobber over newer updates with older ones. If all jobs were FIFO and
		// serialized, it would be OK to update links based on older revisions since it
		// would eventually get to the latest. Since that is not the case (by design),
		// only update the link tables to a state matching the current revision's output.
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision {$revision->getId()} is not current" );

		return null;
	}

	return $revision;
}
376 
/**
 * Try to reuse a parser cache entry instead of re-parsing the page.
 *
 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
 * depending on the outcome.
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision
 * @param StatsdDataFactoryInterface $stats
 * @return mixed The cached parser output, or null if there is no usable entry
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsdDataFactoryInterface $stats
) {
	$usableOutput = null;

	// If page_touched changed after this root job, then it is likely that
	// any views of the pages already resulted in re-parses which are now in
	// cache. The cache can be reused to avoid expensive parsing in some cases.
	$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
	$worthChecking = $rootTimestamp !== null && (
		$page->getTouched() >= $rootTimestamp ||
		!empty( $this->params['isOpportunistic'] )
	);

	if ( $worthChecking ) {
		// Cache is suspected to be up-to-date so it's worth the I/O of checking.
		// As long as the cache rev ID matches the current rev ID and it reflects
		// the job's triggering change, then it is usable.
		$candidate = $parserCache->getDirty( $page, $page->makeParserOptions( 'canonical' ) );
		$matchesRevision = $candidate &&
			$candidate->getCacheRevisionId() == $currentRevision->getId();
		if ( $matchesRevision && $candidate->getCacheTime() >= $this->getLagAwareRootTimestamp() ) {
			$usableOutput = $candidate;
		}
	}

	$stats->increment(
		$usableOutput ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached'
	);

	return $usableOutput;
}
423 
/**
 * Build the options array handed to WikiPage::doSecondaryDataUpdates().
 *
 * @return array Holds 'recursive', 'causeAction' and 'causeAgent', plus 'triggeringUser'
 *  when the job parameters carry a non-empty 'triggeringUser' entry
 */
private function getDataUpdateOptions() {
	$options = [
		'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
		// Carry over cause so the update can do extra logging
		'causeAction' => $this->params['causeAction'],
		'causeAgent' => $this->params['causeAgent']
	];

	if ( empty( $this->params['triggeringUser'] ) ) {
		return $options;
	}

	$userInfo = $this->params['triggeringUser'];
	// Without a user ID the user is anonymous, so fall back to the username
	$options['triggeringUser'] = $userInfo['userId']
		? User::newFromId( $userInfo['userId'] )
		: User::newFromName( $userInfo['userName'], false );

	return $options;
}
446 
/**
 * Get the de-duplication info, with cause information stripped so that jobs which
 * differ only in who or what triggered them are treated as duplicates.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$info = parent::getDeduplicationInfo();
	// Kept in case the parent ever exposes these keys at the top level
	unset( $info['causeAction'], $info['causeAgent'] );
	if ( is_array( $info['params'] ) ) {
		// The constructor stores the cause in the job params, so that is where it has
		// to be removed from for the unset to have any effect.
		// NOTE(review): assumes $info['params'] mirrors the job params — confirm against
		// Job::getDeduplicationInfo().
		unset( $info['params']['causeAction'], $info['params']['causeAgent'] );
		// For per-pages jobs, the job title is that of the template that changed
		// (or similar), so remove that since it ruins duplicate detection
		// NOTE(review): if the parent keeps namespace/title inside 'params' as well,
		// these top-level unsets do nothing — verify.
		if ( isset( $info['params']['pages'] ) ) {
			unset( $info['namespace'], $info['title'] );
		}
	}

	return $info;
}
462 
/**
 * Count the pages this job refreshes itself.
 *
 * @return int 0 for recursive jobs, the size of the 'pages' list when set, otherwise 1
 */
public function workItemCount() {
	if ( !empty( $this->params['recursive'] ) ) {
		// nothing actually refreshed
		return 0;
	}

	// Either an explicit list of pages, or just the one job title
	return isset( $this->params['pages'] ) ? count( $this->params['pages'] ) : 1;
}
472 }
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
static invalidateCache(PageIdentity $page, $revid=null)
Clear the info cache for a given Title.
Definition: InfoAction.php:182
Class to both describe a background job and handle jobs.
Definition: Job.php:39
Title $title
Definition: Job.php:50
getRootJobParams()
Definition: Job.php:362
setLastError( $error)
Definition: Job.php:469
array $params
Array of job parameters.
Definition: Job.php:44
Class that manages updates of *_link tables as well as similar extension-managed tables.
Definition: LinksUpdate.php:55
PSR-3 logger instance factory.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
Page revision base class.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Cache for ParserOutput objects corresponding to the latest page revisions.
Definition: ParserCache.php:64
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for rerendered wiki pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
canExist()
Can this title represent a page in the wiki's database?
Definition: Title.php:1235
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition: Title.php:2886
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:667
static newFromName( $name, $validate='valid')
Definition: User.php:587
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition: User.php:628
Base representation for an editable wiki page.
Definition: WikiPage.php:69
getLinksTimestamp()
Get the page_links_updated field.
Definition: WikiPage.php:736
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
Definition: WikiPage.php:1942
getId( $wikiId=self::LOCAL)
Definition: WikiPage.php:580
getTitle()
Get the title object of the article.
Definition: WikiPage.php:310
doSecondaryDataUpdates(array $options=[])
Do secondary data updates (such as updating link tables).
Definition: WikiPage.php:2130
loadPageData( $from='fromdb')
Load the object from a given source by title.
Definition: WikiPage.php:466
getRevisionRecord()
Get the latest revision.
Definition: WikiPage.php:812
getTouched()
Get the page_touched field.
Definition: WikiPage.php:714
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editab...
const DB_PRIMARY
Definition: defines.php:28
if(count( $args)< 1) $job