MediaWiki  master
RefreshLinksJob.php
Go to the documentation of this file.
1 <?php
26 use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
27 
41 class RefreshLinksJob extends Job {
43  const NORMAL_MAX_LAG = 10;
45  const LAG_WAIT_TIMEOUT = 15;
46 
/**
 * Set up a 'refreshLinks' job for the given title.
 *
 * @param Title $title
 * @param array $params Job parameters; 'causeAction' and 'causeAgent'
 *   default to 'unknown' when absent.
 */
public function __construct( Title $title, array $params ) {
	parent::__construct( 'refreshLinks', $title, $params );

	// De-duplication has a cost, so skip it when a match is implausible:
	//  - range-based jobs rarely have ranges that line up;
	//  - jobs listing anything other than exactly one page are unlikely to match.
	$dedupIsPointless = (
		isset( $params['range'] ) ||
		( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
	);
	$this->removeDuplicates = !$dedupIsPointless;

	// Fill in cause information only where the caller supplied none
	$this->params = $this->params + [
		'causeAction' => 'unknown',
		'causeAgent' => 'unknown'
	];

	// Tell JobRunner to not automatically wrap run() in a transaction round.
	// Each runForTitle() call manages its own rounds in order to run DataUpdates
	// and to avoid contention as well.
	$this->executionFlags = $this->executionFlags | self::JOB_NO_EXPLICIT_TRX_ROUND;
}
62 
/**
 * Build a job whose command is 'refreshLinksPrioritized' instead of the default.
 *
 * @param Title $title
 * @param array $params Same parameters as the constructor
 * @return RefreshLinksJob
 */
public static function newPrioritized( Title $title, array $params ) {
	$prioritizedJob = new self( $title, $params );
	// Only the command name differs from a plain instance
	$prioritizedJob->command = 'refreshLinksPrioritized';

	return $prioritizedJob;
}
74 
/**
 * Build a job whose command is 'refreshLinksDynamic' instead of the default.
 *
 * @param Title $title
 * @param array $params Same parameters as the constructor
 * @return RefreshLinksJob
 */
public static function newDynamic( Title $title, array $params ) {
	$dynamicJob = new self( $title, $params );
	// Only the command name differs from a plain instance
	$dynamicJob->command = 'refreshLinksDynamic';

	return $dynamicJob;
}
86 
/**
 * Run the job.
 *
 * Depending on the parameters this either:
 *  - ('recursive') partitions the backlinks of the job title into per-title
 *    jobs plus possibly a remainder job, and pushes them to the queue;
 *  - ('pages') refreshes the links of each listed [namespace, DB key] pair;
 *  - (otherwise) refreshes the links of the job title itself.
 *
 * @return bool False if any title was invalid or failed to refresh
 */
public function run() {
	$ok = true;

	// Job to update all (or a range of) backlink pages for a page
	if ( !empty( $this->params['recursive'] ) ) {
		$services = MediaWikiServices::getInstance();
		// When the base job branches, wait for the replica DBs to catch up to the master.
		// From then on, we know that any template changes at the time the base job was
		// enqueued will be reflected in backlink page parses when the leaf jobs run.
		if ( !isset( $this->params['range'] ) ) {
			$lbFactory = $services->getDBLoadBalancerFactory();
			if ( !$lbFactory->waitForReplication( [
				'domain' => $lbFactory->getLocalDomainID(),
				'timeout' => self::LAG_WAIT_TIMEOUT
			] ) ) { // only try so hard
				// Not fatal: record the failure and carry on with the partitioning
				$stats = $services->getStatsdDataFactory();
				$stats->increment( 'refreshlinks.lag_wait_failed' );
			}
		}
		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];
		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$services->getMainConfig()->get( 'UpdateRowsPerJob' ),
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		JobQueueGroup::singleton()->push( $jobs );
	// Job to update link tables for a set of titles
	} elseif ( isset( $this->params['pages'] ) ) {
		foreach ( $this->params['pages'] as list( $ns, $dbKey ) ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			if ( $title ) {
				// Keep going after a failure, but remember that one happened
				$ok = $this->runForTitle( $title ) && $ok;
			} else {
				$ok = false;
				$this->setLastError( "Invalid title ($ns,$dbKey)." );
			}
		}
	// Job to update link tables for a given title
	} else {
		$ok = $this->runForTitle( $this->title );
	}

	return $ok;
}
139 
/**
 * Refresh the link tables (via the secondary data updates) of a single page.
 *
 * @param Title $title
 * @return bool False if the page lock could not be acquired or no parser
 *   output could be obtained; true if the update ran or was skipped because
 *   the page was already refreshed.
 */
protected function runForTitle( Title $title ) {
	$services = MediaWikiServices::getInstance();
	$stats = $services->getStatsdDataFactory();
	$renderer = $services->getRevisionRenderer();
	$parserCache = $services->getParserCache();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the master DB
	$page = WikiPage::factory( $title );
	$page->loadPageData( WikiPage::READ_LATEST );

	// Serialize link update jobs by page ID so they see each others' changes.
	// The page ID and latest revision ID will be queried again after the lock
	// is acquired to bail if they are changed from that of loadPageData() above.
	$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_MASTER );
	// NOTE(review): $scopedLock is only checked against null below; it is
	// presumably kept so the lock is held until it goes out of scope — confirm.
	$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
	if ( $scopedLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$stats->increment( 'refreshlinks.lock_failure' );

		return false;
	}

	if ( $this->isAlreadyRefreshed( $page ) ) {
		$stats->increment( 'refreshlinks.update_skipped' );

		return true;
	}

	// Parse during a fresh transaction round for better read consistency
	$lbFactory->beginMasterChanges( __METHOD__ );
	$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
	$options = $this->getDataUpdateOptions();
	$lbFactory->commitMasterChanges( __METHOD__ );

	if ( !$output ) {
		return false; // raced out?
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$options['known-revision-output'] = $output;
	// Execute corresponding DataUpdates immediately
	$page->doSecondaryDataUpdates( $options );
	// Clear the info cache for this title
	InfoAction::invalidateCache( $title );

	// Commit any writes here in case this method is called in a loop.
	// In that case, the scoped lock will fail to be acquired.
	$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

	return true;
}
199 
/**
 * Check whether the page's links timestamp is newer than the change that
 * triggered this job, allowing for lag unless the job is opportunistic.
 *
 * @param WikiPage $page
 * @return bool Always false when the job carries no 'rootJobTimestamp'
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	// Without the timestamp of the triggering change there is nothing to compare with
	if ( !isset( $this->params['rootJobTimestamp'] ) ) {
		return false;
	}
	$triggerTimestamp = $this->params['rootJobTimestamp'];

	$isOpportunistic = !empty( $this->params['isOpportunistic'] );
	// Opportunistic: neither clock skew nor DB snapshot/replica DB lag matter much
	// for such updates; focus on reusing the (often recently updated) cache.
	// Otherwise (transclusion updates) the template changes must be reflected, so
	// pad the trigger time by NORMAL_MAX_LAG seconds.
	$threshold = $isOpportunistic
		? $triggerTimestamp
		: wfTimestamp(
			TS_MW,
			wfTimestamp( TS_UNIX, $triggerTimestamp ) + self::NORMAL_MAX_LAG
		);

	return $page->getLinksTimestamp() > $threshold;
}
225 
/**
 * Get the parser output if the page is unchanged from what was loaded in $page.
 *
 * A usable parser cache entry is preferred; otherwise the current revision
 * is rendered without generating HTML.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded with READ_LATEST
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null if the current revision could not be confirmed
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$currentRev = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$currentRev ) {
		return null; // race condition?
	}

	$output = $this->getParserOutputFromCache( $parserCache, $page, $currentRev, $stats );
	if ( !$output ) {
		// Nothing reusable in the cache, so render the current revision
		$parserOptions = $page->makeParserOptions( 'canonical' );
		$rendered = $renderer->getRenderedRevision(
			$currentRev,
			$parserOptions,
			null,
			[ 'audience' => $currentRev::RAW ]
		);

		$parseStarted = wfTimestampNow(); // timestamp that parsing started
		$output = $rendered->getRevisionParserOutput( [ 'generate-html' => false ] );
		$output->setCacheTime( $parseStarted ); // notify LinksUpdate::doUpdate()
	}

	return $output;
}
264 
/**
 * Get the current revision record if it is unchanged from what was loaded in $page.
 *
 * Sets the job's last error and bumps a stats counter whenever null is returned.
 *
 * @param WikiPage $page Page already loaded with READ_LATEST
 * @param StatsdDataFactoryInterface $stats
 * @return RevisionRecord|null The same instance that would get loaded by $page
 *   or null if the page changed, was deleted, or the job is obsolete
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$title = $page->getTitle();
	// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
	// This is used to detect edits/moves after loadPageData() but before the scope lock.
	// This works around the chicken/egg problem of determining the scope lock key name.
	$latest = $title->getLatestRevID( Title::READ_LATEST );

	$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
	if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
		// This job is obsolete and one for the latest revision will handle updates
		$stats->increment( 'refreshlinks.rev_not_current' );
		$this->setLastError( "Revision $triggeringRevisionId is not current" );

		return null;
	}

	// Load the current revision. Note that $page should have loaded with READ_LATEST.
	// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
	$revision = $page->getRevisionRecord();
	if ( !$revision ) {
		$stats->increment( 'refreshlinks.rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

		return null; // just deleted?
	} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
		// Do not clobber over newer updates with older ones. If all jobs were FIFO and
		// serialized, it would be OK to update links based on older revisions since it
		// would eventually get to the latest. Since that is not the case (by design),
		// only update the link tables to a state matching the current revision's output.
		$stats->increment( 'refreshlinks.rev_not_current' );
		$this->setLastError( "Revision {$revision->getId()} is not current" );

		return null;
	}

	return $revision;
}
312 
/**
 * Get the parser output from cache if it reflects the change that triggered this job.
 *
 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
 * depending on whether a usable entry was found.
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null when no usable cache entry exists
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsdDataFactoryInterface $stats
) {
	$usableOutput = null;

	// If page_touched changed after this root job, then it is likely that
	// any views of the pages already resulted in re-parses which are now in
	// cache. The cache can be reused to avoid expensive parsing in some cases.
	$triggerTimestamp = $this->params['rootJobTimestamp'] ?? null;
	if ( $triggerTimestamp !== null ) {
		$isOpportunistic = !empty( $this->params['isOpportunistic'] );
		// Opportunistic: neither clock skew nor DB snapshot/replica DB lag matter
		// much for such updates; focus on reusing the (often recently updated) cache.
		// Otherwise (transclusion updates) the template changes must be reflected,
		// so pad the trigger time by NORMAL_MAX_LAG seconds.
		$minCacheTime = $isOpportunistic
			? $triggerTimestamp
			: wfTimestamp(
				TS_MW,
				wfTimestamp( TS_UNIX, $triggerTimestamp ) + self::NORMAL_MAX_LAG
			);

		if ( $page->getTouched() >= $triggerTimestamp || $isOpportunistic ) {
			// Cache is suspected to be up-to-date so it's worth the I/O of checking.
			// As long as the cache rev ID matches the current rev ID and it reflects
			// the job's triggering change, then it is usable.
			$candidate = $parserCache->getDirty(
				$page,
				$page->makeParserOptions( 'canonical' )
			);
			$isUsable = $candidate &&
				$candidate->getCacheRevisionId() == $currentRevision->getId() &&
				$candidate->getCacheTime() >= $minCacheTime;
			if ( $isUsable ) {
				$usableOutput = $candidate;
			}
		}
	}

	// Record whether the parse could be avoided
	$stats->increment(
		$usableOutput ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached'
	);

	return $usableOutput;
}
371 
/**
 * Build the options array passed to WikiPage::doSecondaryDataUpdates().
 *
 * @return array Keys 'recursive', 'causeAction', 'causeAgent' and, when the
 *   job carries 'triggeringUser' info, 'triggeringUser'
 */
private function getDataUpdateOptions() {
	$options = [];
	$options['recursive'] = !empty( $this->params['useRecursiveLinksUpdate'] );
	// Carry over cause so the update can do extra logging
	$options['causeAction'] = $this->params['causeAction'];
	$options['causeAgent'] = $this->params['causeAgent'];

	if ( empty( $this->params['triggeringUser'] ) ) {
		return $options;
	}

	$triggeringUser = $this->params['triggeringUser'];
	// A falsy user ID means an anonymous user, who is looked up by name instead
	$options['triggeringUser'] = $triggeringUser['userId']
		? User::newFromId( $triggeringUser['userId'] )
		: User::newFromName( $triggeringUser['userName'], false );

	return $options;
}
394 
/**
 * Get the de-duplication info with fields removed that should not stop two
 * otherwise equivalent jobs from counting as duplicates.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$info = parent::getDeduplicationInfo();
	// Kept from the original code; a no-op when the keys are absent at this level
	unset( $info['causeAction'] );
	unset( $info['causeAgent'] );
	if ( is_array( $info['params'] ) ) {
		// The constructor stores the cause fields in the job parameters, so they
		// have to be removed there for the cause to be ignored in duplicate detection
		unset( $info['params']['causeAction'] );
		unset( $info['params']['causeAgent'] );
		// For per-pages jobs, the job title is that of the template that changed
		// (or similar), so remove that since it ruins duplicate detection
		if ( isset( $info['params']['pages'] ) ) {
			unset( $info['namespace'] );
			unset( $info['title'] );
		}
	}

	return $info;
}
410 
/**
 * Count the titles this job refreshes directly.
 *
 * @return int 0 for recursive jobs, the number of listed pages for
 *   per-pages jobs, otherwise 1
 */
public function workItemCount() {
	if ( !empty( $this->params['recursive'] ) ) {
		// nothing actually refreshed
		return 0;
	}

	// Either every listed page, or just the one job title
	return isset( $this->params['pages'] )
		? count( $this->params['pages'] )
		: 1;
}
420 }
Job\getRootJobParams
getRootJobParams()
Definition: Job.php:321
User\newFromId
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition: User.php:560
Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:46
WikiPage\getRevisionRecord
getRevisionRecord()
Get the latest revision.
Definition: WikiPage.php:797
LinksUpdate\acquirePageLock
static acquirePageLock(IDatabase $dbw, $pageId, $why='atomicity')
Acquire a session-level lock for performing link table updates for a page on a DB.
Definition: LinksUpdate.php:218
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:129
WikiPage\getTouched
getTouched()
Get the page_touched field.
Definition: WikiPage.php:690
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1871
Job\$title
Title $title
Definition: Job.php:41
RefreshLinksJob\__construct
__construct(Title $title, array $params)
Definition: RefreshLinksJob.php:47
RefreshLinksJob\getDeduplicationInfo
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
Definition: RefreshLinksJob.php:395
WikiPage
Class representing a MediaWiki article and history.
Definition: WikiPage.php:45
WikiPage\makeParserOptions
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
Definition: WikiPage.php:1959
User\newFromName
static newFromName( $name, $validate='valid')
Static factory method for creation from username.
Definition: User.php:536
RefreshLinksJob\newDynamic
static newDynamic(Title $title, array $params)
Definition: RefreshLinksJob.php:80
RefreshLinksJob\newPrioritized
static newPrioritized(Title $title, array $params)
Definition: RefreshLinksJob.php:68
RefreshLinksJob\getParserOutput
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
Definition: RefreshLinksJob.php:235
Job\$params
array $params
Array of job parameters.
Definition: Job.php:35
Job\setLastError
setLastError( $error)
Definition: Job.php:418
BacklinkJobUtils\partitionBacklinkJob
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
Definition: BacklinkJobUtils.php:87
RefreshLinksJob\getDataUpdateOptions
getDataUpdateOptions()
Definition: RefreshLinksJob.php:375
Job
Class to both describe a background job and handle jobs.
Definition: Job.php:30
Revision\RevisionRenderer\getRenderedRevision
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, User $forUser=null, array $hints=[])
Definition: RevisionRenderer.php:102
RefreshLinksJob\isAlreadyRefreshed
isAlreadyRefreshed(WikiPage $page)
Definition: RefreshLinksJob.php:204
WikiPage\factory
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:140
RefreshLinksJob\run
run()
Run the job.
Definition: RefreshLinksJob.php:87
ParserCache\getDirty
getDirty( $article, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Definition: ParserCache.php:144
WikiPage\getId
getId()
Definition: WikiPage.php:598
WikiPage\getTitle
getTitle()
Get the title object of the article.
Definition: WikiPage.php:296
WikiPage\getLinksTimestamp
getLinksTimestamp()
Get the page_links_updated field.
Definition: WikiPage.php:701
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:1900
DB_MASTER
const DB_MASTER
Definition: defines.php:26
Revision\RevisionRecord\getId
getId()
Get revision ID.
Definition: RevisionRecord.php:279
Revision\RevisionRenderer
The RevisionRenderer service provides access to rendered output for revisions.
Definition: RevisionRenderer.php:45
Title\makeTitleSafe
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:610
RefreshLinksJob
Job to update link tables for pages.
Definition: RefreshLinksJob.php:41
RefreshLinksJob\workItemCount
workItemCount()
Definition: RefreshLinksJob.php:411
Title\getLatestRevID
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition: Title.php:3242
RefreshLinksJob\getCurrentRevisionIfUnchanged
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page.
Definition: RefreshLinksJob.php:272
Title
Represents a title within MediaWiki.
Definition: Title.php:42
JobQueueGroup\singleton
static singleton( $domain=false)
Definition: JobQueueGroup.php:70
$job
if(count( $args)< 1) $job
Definition: recompressTracked.php:50
ParserCache
Definition: ParserCache.php:30
InfoAction\invalidateCache
static invalidateCache(Title $title, $revid=null)
Clear the info cache for a given Title.
Definition: InfoAction.php:71
RefreshLinksJob\runForTitle
runForTitle(Title $title)
Definition: RefreshLinksJob.php:144
RefreshLinksJob\getParserOutputFromCache
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.
Definition: RefreshLinksJob.php:322