MediaWiki  master
RefreshLinksJob.php
Go to the documentation of this file.
1 <?php
23 use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
32 
58 class RefreshLinksJob extends Job {
60  private const NORMAL_MAX_LAG = 10;
62  private const LAG_WAIT_TIMEOUT = 15;
63 
/**
 * @param PageIdentity $page Page the job is attached to. It must be able to exist
 *   unless an explicit, non-empty 'pages' list is supplied in $params.
 * @param array $params Job parameters. 'pages' and 'range' are inspected here to
 *   decide whether de-duplication is worthwhile; 'causeAction' and 'causeAgent'
 *   default to 'unknown' when absent.
 * @throws PageAssertionException If no 'pages' list is given and $page cannot exist
 */
public function __construct( PageIdentity $page, array $params ) {
	$hasPageList = !empty( $params['pages'] );
	if ( !$hasPageList && !$page->canExist() ) {
		// BC with the Title class
		throw new PageAssertionException(
			'The given PageIdentity {pageIdentity} does not represent a proper page',
			[ 'pageIdentity' => $page ]
		);
	}

	parent::__construct( 'refreshLinks', $page, $params );

	// De-duplication has a cost, so only enable it when a match is plausible
	if ( isset( $params['range'] ) ) {
		// Ranges rarely will line up
		$this->removeDuplicates = false;
	} elseif ( isset( $params['pages'] ) && count( $params['pages'] ) != 1 ) {
		// Multiple pages per job make matches unlikely
		$this->removeDuplicates = false;
	} else {
		$this->removeDuplicates = true;
	}

	// Fill in cause information without overriding what the caller provided
	$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];

	// JobRunner must not wrap run() in a transaction round of its own: each
	// runForTitle() call manages its own rounds in order to run DataUpdates
	// and to avoid contention as well.
	$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
}
87 
/**
 * Create a job that is queued under the 'refreshLinksPrioritized' command.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newPrioritized( PageIdentity $page, array $params ) {
	$prioritizedJob = new self( $page, $params );
	// Same job class, different command name
	$prioritizedJob->command = 'refreshLinksPrioritized';

	return $prioritizedJob;
}
99 
/**
 * Create a job that is queued under the 'refreshLinksDynamic' command.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newDynamic( PageIdentity $page, array $params ) {
	$dynamicJob = new self( $page, $params );
	// Same job class, different command name
	$dynamicJob->command = 'refreshLinksDynamic';

	return $dynamicJob;
}
111 
/**
 * Run the job.
 *
 * Depending on the job parameters this does one of three things:
 *  - 'recursive': partition the backlinks of the job title into per-title jobs
 *    (plus possibly one more recursive job for the remainder) and enqueue them;
 *  - 'pages': refresh every listed (namespace, DB key) pair;
 *  - otherwise: refresh the job title itself.
 *
 * @return bool False if at least one title could not be refreshed
 */
public function run() {
	$ok = true;

	if ( !empty( $this->params['recursive'] ) ) {
		// Job to update all (or a range of) backlink pages for a page

		// When the base job branches, wait for the replica DBs to catch up to the primary.
		// From then on, we know that any template changes at the time the base job was
		// enqueued will be reflected in backlink page parses when the leaf jobs run.
		$services = MediaWikiServices::getInstance();
		if ( !isset( $this->params['range'] ) ) {
			$lbFactory = $services->getDBLoadBalancerFactory();
			$caughtUp = $lbFactory->waitForReplication( [
				'domain' => $lbFactory->getLocalDomainID(),
				'timeout' => self::LAG_WAIT_TIMEOUT
			] );
			if ( !$caughtUp ) {
				// only try so hard, keep going with what we have
				$stats = $services->getStatsdDataFactory();
				$stats->increment( 'refreshlinks_warning.lag_wait_failed' );
			}
		}
		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];
		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		$services->getJobQueueGroup()->push( $jobs );

	} elseif ( isset( $this->params['pages'] ) ) {
		// Job to update link tables for a set of titles
		foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			if ( $title && $title->canExist() ) {
				// Keep processing the remaining pages even after a failure
				$ok = $this->runForTitle( $title ) && $ok;
			} else {
				$ok = false;
				$this->setLastError( "Invalid title ($ns,$dbKey)." );
			}
		}

	} else {
		// Job to update link tables for a given title
		$ok = $this->runForTitle( $this->title );
	}

	return $ok;
}
168 
/**
 * Refresh the link tables (and other secondary data) of a single page.
 *
 * @param PageIdentity $pageIdentity Page to refresh
 * @return bool True on success or when the update was superseded; false when the
 *   page is missing, the page lock could not be acquired, or the current revision
 *   changed underneath the job
 */
protected function runForTitle( PageIdentity $pageIdentity ) {
	$services = MediaWikiServices::getInstance();
	$stats = $services->getStatsdDataFactory();
	$renderer = $services->getRevisionRenderer();
	$parserCache = $services->getParserCache();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the primary DB
	$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
	$page->loadPageData( WikiPage::READ_LATEST );

	if ( !$page->exists() ) {
		// Probably due to concurrent deletion or renaming of the page
		$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
		$logger->notice(
			'The page does not exist. Perhaps it was deleted?',
			[
				// Report the page being processed; for 'pages' jobs the job title
				// is a different page than the one that went missing.
				'page_title' => $page->getTitle()->getPrefixedDBkey(),
				'job_params' => $this->getParams(),
				'job_metadata' => $this->getMetadata()
			]
		);
		$stats->increment( 'refreshlinks_outcome.bad_page_not_found' );

		// retry later to handle unlucky race condition
		return false;
	}

	// Serialize link update job by page ID so they see each others' changes.
	// The page ID and latest revision ID will be queried again after the lock
	// is acquired to bail if they are changed from that of loadPageData() above.
	$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_PRIMARY );
	// The return value is only checked for null here, but it must stay assigned:
	// presumably the lock is held for as long as this variable is in scope.
	$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
	if ( $scopedLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$stats->increment( 'refreshlinks_outcome.bad_lock_failure' );

		// retry later when overlapping job for previous rev is done
		return false;
	}

	if ( $this->isAlreadyRefreshed( $page ) ) {
		// this job has been superseded, e.g. by overlapping recursive job
		// for a different template edit, or by direct edit or purge.
		$stats->increment( 'refreshlinks_outcome.good_update_superseded' );
		// treat as success
		return true;
	}

	// Parse during a fresh transaction round for better read consistency
	$lbFactory->beginPrimaryChanges( __METHOD__ );
	$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
	$options = $this->getDataUpdateOptions();
	$lbFactory->commitPrimaryChanges( __METHOD__ );

	if ( !$output ) {
		// probably raced out.
		// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
		// FIXME: Why do we retry this? Can this be a cancellation?
		return false;
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$options['known-revision-output'] = $output;
	// Execute corresponding DataUpdates immediately
	$page->doSecondaryDataUpdates( $options );
	// Clear the info cache for the page now that its secondary data changed
	InfoAction::invalidateCache( $page->getTitle() );

	// Commit any writes here in case this method is called in a loop.
	// In that case, the scoped lock will fail to be acquired.
	$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

	return true;
}
251 
/**
 * Get the root job timestamp, padded for lag unless the job is opportunistic.
 *
 * @return string|null Null when the job has no 'rootJobTimestamp' parameter;
 *   the unmodified root timestamp for opportunistic jobs; otherwise a TS_MW
 *   timestamp NORMAL_MAX_LAG seconds after the root timestamp
 */
private function getLagAwareRootTimestamp() {
	// Timestamp of the change that triggered this job, if known
	$rootTs = $this->params['rootJobTimestamp'] ?? null;
	if ( $rootTs === null ) {
		return null;
	}

	if ( !empty( $this->params['isOpportunistic'] ) ) {
		// Neither clock skew nor DB snapshot/replica DB lag matter much for
		// such updates; focus on reusing the (often recently updated) cache
		return $rootTs;
	}

	// For transclusion updates, the template changes must be reflected,
	// so push the cutoff forward by the tolerated lag.
	$paddedUnixTime = (int)wfTimestamp( TS_UNIX, $rootTs ) + self::NORMAL_MAX_LAG;

	return wfTimestamp( TS_MW, $paddedUnixTime );
}
276 
/**
 * Check whether the page's links were already updated after the (lag-aware)
 * time of the change that triggered this job.
 *
 * @param WikiPage $page
 * @return bool Always false when the job carries no root timestamp
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	$cutoff = $this->getLagAwareRootTimestamp();
	if ( $cutoff === null ) {
		// Nothing to compare against
		return false;
	}

	return $page->getLinksTimestamp() > $cutoff;
}
286 
/**
 * Get the parser output if the page is unchanged from what was loaded in $page.
 *
 * A suitable parser cache entry is preferred; otherwise the current revision is
 * rendered without generating HTML.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded by the caller
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null if the current revision could not be confirmed
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$currentRev = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$currentRev ) {
		// race condition?
		return null;
	}

	$fromCache = $this->getParserOutputFromCache( $parserCache, $page, $currentRev, $stats );
	if ( $fromCache ) {
		return $fromCache;
	}

	// Nothing reusable in the cache; render the revision ourselves
	$canonicalOptions = $page->makeParserOptions( 'canonical' );
	$rendered = $renderer->getRenderedRevision(
		$currentRev,
		$canonicalOptions,
		null,
		[ 'audience' => $currentRev::RAW ]
	);

	// Record when parsing started and stamp the output with it
	// so that LinksUpdate::doUpdate() is notified
	$parseStart = wfTimestampNow();
	$freshOutput = $rendered->getRevisionParserOutput( [ 'generate-html' => false ] );
	$freshOutput->setCacheTime( $parseStart );

	return $freshOutput;
}
326 
/**
 * Get the current revision record if it is unchanged from what was loaded in $page.
 *
 * On every failure path a 'refreshlinks_outcome.*' metric is incremented and
 * the job's last error is set.
 *
 * @param WikiPage $page Page already loaded by the caller
 * @param StatsdDataFactoryInterface $stats
 * @return RevisionRecord|null Null if the triggering revision is no longer current,
 *   the revision is gone, or the loaded revision is not the latest one of this page
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$title = $page->getTitle();
	// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
	// This is used to detect edits/moves after loadPageData() but before the scope lock.
	// This works around the chicken/egg problem of determining the scope lock key name.
	$latest = $title->getLatestRevID( Title::READ_LATEST );

	$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
	if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
		// This job is obsolete and one for the latest revision will handle updates
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision $triggeringRevisionId is not current" );
		return null;
	}

	// Load the current revision. Note that $page should have loaded with READ_LATEST.
	// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
	$revision = $page->getRevisionRecord();
	if ( !$revision ) {
		// revision just got deleted?
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
		return null;

	} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
		// Do not clobber over newer updates with older ones. If all jobs were FIFO and
		// serialized, it would be OK to update links based on older revisions since it
		// would eventually get to the latest. Since that is not the case (by design),
		// only update the link tables to a state matching the current revision's output.
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision {$revision->getId()} is not current" );

		return null;
	}

	return $revision;
}
374 
/**
 * Get the parser output from cache if it reflects the change that triggered this job.
 *
 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
 * depending on the outcome.
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null when no usable cache entry was found
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsdDataFactoryInterface $stats
) {
	$reusable = null;

	// If page_touched changed after this root job, then it is likely that
	// any views of the pages already resulted in re-parses which are now in
	// cache. The cache can be reused to avoid expensive parsing in some cases.
	$rootTs = $this->params['rootJobTimestamp'] ?? null;
	$worthChecking = $rootTs !== null && (
		$page->getTouched() >= $rootTs ||
		!empty( $this->params['isOpportunistic'] )
	);

	if ( $worthChecking ) {
		// Cache is suspected to be up-to-date so it's worth the I/O of checking.
		$canonicalOptions = $page->makeParserOptions( 'canonical' );
		$candidate = $parserCache->getDirty( $page, $canonicalOptions );
		// As long as the cache rev ID matches the current rev ID and it reflects
		// the job's triggering change, then it is usable.
		$usable = $candidate &&
			$candidate->getCacheRevisionId() == $currentRevision->getId() &&
			$candidate->getCacheTime() >= $this->getLagAwareRootTimestamp();
		if ( $usable ) {
			$reusable = $candidate;
		}
	}

	$metric = $reusable ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached';
	$stats->increment( $metric );

	return $reusable;
}
421 
/**
 * Build the options array handed to the secondary data updates.
 *
 * @return array Has keys 'recursive', 'causeAction' and 'causeAgent', plus
 *   'triggeringUser' when the job carries a non-empty 'triggeringUser' parameter
 */
private function getDataUpdateOptions() {
	$updateOptions = [];
	$updateOptions['recursive'] = !empty( $this->params['useRecursiveLinksUpdate'] );
	// Carry over cause so the update can do extra logging
	$updateOptions['causeAction'] = $this->params['causeAction'];
	$updateOptions['causeAgent'] = $this->params['causeAgent'];

	if ( empty( $this->params['triggeringUser'] ) ) {
		return $updateOptions;
	}

	$userInfo = $this->params['triggeringUser'];
	// A falsy user ID means an anonymous user, who is looked up by name instead
	$updateOptions['triggeringUser'] = $userInfo['userId']
		? User::newFromId( $userInfo['userId'] )
		: User::newFromName( $userInfo['userName'], false );

	return $updateOptions;
}
444 
/**
 * Get the de-duplication signature of this job, ignoring fields that should not
 * keep two otherwise identical refresh jobs from being treated as duplicates.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$info = parent::getDeduplicationInfo();

	// Historic top-level location of these fields; harmless if absent
	unset( $info['causeAction'], $info['causeAgent'] );

	if ( is_array( $info['params'] ) ) {
		// The constructor stores the cause in the job parameters, so this is
		// where it has to be removed for jobs with different causes to match.
		unset( $info['params']['causeAction'], $info['params']['causeAgent'] );

		// For per-pages jobs, the job title is that of the template that changed
		// (or similar), so remove that since it ruins duplicate detection
		if ( isset( $info['params']['pages'] ) ) {
			unset( $info['namespace'], $info['title'] );
			// NOTE(review): the parent presumably carries the title inside the
			// parameters as well - confirm against Job::getDeduplicationInfo().
			unset( $info['params']['namespace'], $info['params']['title'] );
		}
	}

	return $info;
}
460 
/**
 * Count the pages this job refreshes itself.
 *
 * @return int 0 for recursive jobs, the size of the 'pages' list for
 *   per-pages jobs, and 1 otherwise
 */
public function workItemCount() {
	if ( !empty( $this->params['recursive'] ) ) {
		// nothing actually refreshed
		return 0;
	}

	if ( isset( $this->params['pages'] ) ) {
		return count( $this->params['pages'] );
	}

	// one title
	return 1;
}
470 }
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers the remaining backlink range (if needed).
static invalidateCache(PageIdentity $page, $revid=null)
Clear the info cache for a given Title.
Definition: InfoAction.php:190
Class to both describe a background job and handle jobs.
Definition: Job.php:39
Title $title
Definition: Job.php:50
getRootJobParams()
Definition: Job.php:362
setLastError( $error)
Definition: Job.php:469
array $params
Array of job parameters.
Definition: Job.php:44
Class that manages updates of *_link tables as well as similar extension-managed tables.
Definition: LinksUpdate.php:55
PSR-3 logger instance factory.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
Page revision base class.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Cache for ParserOutput objects corresponding to the latest page revisions.
Definition: ParserCache.php:63
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page.
isAlreadyRefreshed(WikiPage $page)
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.
canExist()
Can this title represent a page in the wiki's database?
Definition: Title.php:1232
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition: Title.php:2885
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:664
static newFromName( $name, $validate='valid')
Definition: User.php:597
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition: User.php:638
Base representation for an editable wiki page.
Definition: WikiPage.php:62
getLinksTimestamp()
Get the page_links_updated field.
Definition: WikiPage.php:728
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
Definition: WikiPage.php:1934
getId( $wikiId=self::LOCAL)
Definition: WikiPage.php:572
getTitle()
Get the title object of the article.
Definition: WikiPage.php:303
getRevisionRecord()
Get the latest revision.
Definition: WikiPage.php:804
getTouched()
Get the page_touched field.
Definition: WikiPage.php:706
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editable page on the wiki.
const DB_PRIMARY
Definition: defines.php:28
if(count( $args)< 1) $job