MediaWiki  master
RefreshLinksJob.php
Go to the documentation of this file.
1 <?php
use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionRenderer;
29 
/**
 * Job to update link tables for pages.
 *
 * run() handles three shapes of job, selected by the job parameters:
 *   - 'recursive' set: partition the backlinks of the job title into per-title
 *     leaf jobs (plus possibly a remnant recursive job) and push them.
 *   - 'pages' set: refresh each listed (namespace, DB key) pair.
 *   - otherwise: refresh the job title itself.
 *
 * Other parameters read by this class: 'range', 'rootJobTimestamp',
 * 'isOpportunistic', 'triggeringRevisionId', 'triggeringUser',
 * 'useRecursiveLinksUpdate', 'causeAction' and 'causeAgent'.
 */
class RefreshLinksJob extends Job {
	/** Seconds added to the root job timestamp to allow for lag/clock skew */
	private const NORMAL_MAX_LAG = 10;
	/** 'timeout' value given to waitForReplication() (presumably seconds; confirm) */
	private const LAG_WAIT_TIMEOUT = 15;

	/**
	 * @param PageIdentity $page Page the job is attached to
	 * @param array $params Job parameters
	 */
	public function __construct( PageIdentity $page, array $params ) {
		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) !== 1 )
		);
		// Default the cause fields without overriding caller-supplied values
		$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	/**
	 * Create a job whose command is 'refreshLinksPrioritized'.
	 *
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * Create a job whose command is 'refreshLinksDynamic'.
	 *
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	/**
	 * Run the job.
	 *
	 * @return bool False if any title could not be refreshed; true otherwise
	 *  (the recursive/partitioning mode always returns true)
	 */
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page
			$services = MediaWikiServices::getInstance();
			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				$caughtUp = $lbFactory->waitForReplication( [
					'domain' => $lbFactory->getLocalDomainID(),
					'timeout' => self::LAG_WAIT_TIMEOUT
				] );
				if ( !$caughtUp ) {
					// Only try so hard; record the failure and carry on
					$stats = $services->getStatsdDataFactory();
					$stats->increment( 'refreshlinks.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( 'UpdateRowsPerJob' ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			JobQueueGroup::singleton()->push( $jobs );
		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title ) {
					// Keep processing the remaining pages even after a failure
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}
		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}

	/**
	 * Refresh the secondary data (link tables etc.) of a single page.
	 *
	 * @param PageIdentity $pageIdentity
	 * @return bool True if the page was updated or was already up to date;
	 *  false if it does not exist, the page lock is held elsewhere, or no
	 *  parser output for the current revision could be obtained
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( WikiPage::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->notice(
				'The page does not exist. Perhaps it was deleted?',
				[
					// NOTE(review): this is the job's own title, which for 'pages' jobs
					// is not necessarily the page being processed here; confirm intent.
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);

			// nothing to do
			$stats->increment( 'refreshlinks.rev_not_found' );
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_PRIMARY );
		// The variable only keeps the lock alive; it is not read again
		// (presumably released when it goes out of scope; confirm in LinksUpdate).
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$stats->increment( 'refreshlinks.lock_failure' );

			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			$stats->increment( 'refreshlinks.update_skipped' );

			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			return false; // raced out?
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * Pad the root job timestamp to account for lag, unless the job is opportunistic.
	 *
	 * @param string $rootTimestamp Value of the 'rootJobTimestamp' parameter
	 *  (presumably a TS_MW timestamp; confirm against the enqueuing code)
	 * @return string $rootTimestamp unchanged for opportunistic jobs, otherwise a
	 *  TS_MW timestamp NORMAL_MAX_LAG seconds later
	 */
	private function getLagAwareTimestamp( $rootTimestamp ) {
		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			return $rootTimestamp;
		}

		// For transclusion updates, the template changes must be reflected
		return wfTimestamp(
			TS_MW,
			wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
		);
	}

	/**
	 * Check whether the page's links were already updated after the triggering change.
	 *
	 * @param WikiPage $page
	 * @return bool False when the job has no 'rootJobTimestamp'; otherwise whether
	 *  page_links_updated is later than the lag-aware root timestamp
	 */
	private function isAlreadyRefreshed( WikiPage $page ) {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return false;
		}

		return ( $page->getLinksTimestamp() > $this->getLagAwareTimestamp( $rootTimestamp ) );
	}

	/**
	 * Get the parser output if the page is unchanged from what was loaded in $page.
	 *
	 * A usable cached output is preferred; otherwise the current revision is
	 * rendered without generating HTML.
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null if the current revision could not be confirmed
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			return null; // race condition?
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		if ( $cachedOutput ) {
			return $cachedOutput;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [ 'generate-html' => false ] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		return $output;
	}

	/**
	 * Get the current revision record if it is unchanged from what was loaded in $page.
	 *
	 * On failure the reason is recorded via setLastError() and a stats counter.
	 *
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return RevisionRecord|null The current revision, or null if the job is
	 *  obsolete, the revision is missing, or the page changed in the meantime
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key name.
		$latest = $title->getLatestRevID( Title::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );

			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			$stats->increment( 'refreshlinks.rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

			return null; // just deleted?
		}

		if ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	/**
	 * Get the parser output from cache if it reflects the change that triggered this job.
	 *
	 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
	 * depending on the outcome.
	 *
	 * @param ParserCache $parserCache
	 * @param WikiPage $page
	 * @param RevisionRecord $currentRevision
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null when there is no usable cache entry
	 */
	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsdDataFactoryInterface $stats
	) {
		$cachedOutput = null;
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			$lagAwareTimestamp = $this->getLagAwareTimestamp( $rootTimestamp );

			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// As long as the cache rev ID matches the current rev ID and it reflects
				// the job's triggering change, then it is usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId() &&
					$output->getCacheTime() >= $lagAwareTimestamp
				) {
					$cachedOutput = $output;
				}
			}
		}

		if ( $cachedOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		return $cachedOutput;
	}

	/**
	 * Build the options array handed to WikiPage::doSecondaryDataUpdates().
	 *
	 * @return array Contains 'recursive', 'causeAction' and 'causeAgent', plus
	 *  'triggeringUser' when the job carries a non-empty 'triggeringUser' parameter
	 */
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}

	/**
	 * Adjust the parent's de-duplication info for this job type.
	 *
	 * @return array
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		// NOTE(review): the constructor stores the cause fields in the job params, so
		// they presumably live under $info['params'] and these two top-level unsets may
		// be no-ops; confirm against Job::getDeduplicationInfo() before changing.
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	/**
	 * @return int 0 for recursive (partitioning) jobs, the number of entries in
	 *  'pages' for multi-page jobs, and 1 for single-title jobs
	 */
	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
Job\getRootJobParams
getRootJobParams()
Definition: Job.php:359
Page\PageIdentity
Interface for objects (potentially) representing an editable wiki page.
Definition: PageIdentity.php:64
User\newFromId
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition: User.php:647
MediaWiki\Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:47
WikiPage\getRevisionRecord
getRevisionRecord()
Get the latest revision.
Definition: WikiPage.php:816
LinksUpdate\acquirePageLock
static acquirePageLock(IDatabase $dbw, $pageId, $why='atomicity')
Acquire a session-level lock for performing link table updates for a page on a DB.
Definition: LinksUpdate.php:229
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:193
WikiPage\getTouched
getTouched()
Get the page_touched field.
Definition: WikiPage.php:718
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1668
Job\$title
Title $title
Definition: Job.php:48
RefreshLinksJob\getDeduplicationInfo
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
Definition: RefreshLinksJob.php:414
WikiPage
Class representing a MediaWiki article and history.
Definition: WikiPage.php:60
WikiPage\makeParserOptions
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
Definition: WikiPage.php:1996
User\newFromName
static newFromName( $name, $validate='valid')
Definition: User.php:606
RefreshLinksJob\getParserOutput
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
Definition: RefreshLinksJob.php:254
Job\$params
array $params
Array of job parameters.
Definition: Job.php:42
RefreshLinksJob\newPrioritized
static newPrioritized(PageIdentity $page, array $params)
Definition: RefreshLinksJob.php:70
Job\setLastError
setLastError( $error)
Definition: Job.php:466
BacklinkJobUtils\partitionBacklinkJob
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
Definition: BacklinkJobUtils.php:89
RefreshLinksJob\getDataUpdateOptions
getDataUpdateOptions()
Definition: RefreshLinksJob.php:394
RefreshLinksJob\runForTitle
runForTitle(PageIdentity $pageIdentity)
Definition: RefreshLinksJob.php:146
Job
Class to both describe a background job and handle jobs.
Definition: Job.php:37
RefreshLinksJob\isAlreadyRefreshed
isAlreadyRefreshed(WikiPage $page)
Definition: RefreshLinksJob.php:223
RefreshLinksJob\run
run()
Run the job.
Definition: RefreshLinksJob.php:89
MediaWiki\Logger\LoggerFactory
PSR-3 logger instance factory.
Definition: LoggerFactory.php:45
MediaWiki\Revision\RevisionRenderer\getRenderedRevision
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Definition: RevisionRenderer.php:102
WikiPage\getTitle
getTitle()
Get the title object of the article.
Definition: WikiPage.php:311
WikiPage\getLinksTimestamp
getLinksTimestamp()
Get the page_links_updated field.
Definition: WikiPage.php:740
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:1697
MediaWiki\Revision\RevisionRenderer
The RevisionRenderer service provides access to rendered output for revisions.
Definition: RevisionRenderer.php:45
Title\makeTitleSafe
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:676
RefreshLinksJob
Job to update link tables for pages.
Definition: RefreshLinksJob.php:43
WikiPage\getId
getId( $wikiId=self::LOCAL)
Definition: WikiPage.php:584
RefreshLinksJob\workItemCount
workItemCount()
Definition: RefreshLinksJob.php:430
DB_PRIMARY
const DB_PRIMARY
Definition: defines.php:27
Title\getLatestRevID
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition: Title.php:2969
ParserCache\getDirty
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Definition: ParserCache.php:196
RefreshLinksJob\newDynamic
static newDynamic(PageIdentity $page, array $params)
Definition: RefreshLinksJob.php:82
RefreshLinksJob\getCurrentRevisionIfUnchanged
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page.
Definition: RefreshLinksJob.php:291
InfoAction\invalidateCache
static invalidateCache(PageIdentity $page, $revid=null)
Clear the info cache for a given Title.
Definition: InfoAction.php:168
JobQueueGroup\singleton
static singleton( $domain=false)
Definition: JobQueueGroup.php:114
MediaWiki\Revision\RevisionRecord\getId
getId( $wikiId=self::LOCAL)
Get revision ID.
Definition: RevisionRecord.php:279
RefreshLinksJob\__construct
__construct(PageIdentity $page, array $params)
Definition: RefreshLinksJob.php:49
$job
if(count( $args)< 1) $job
Definition: recompressTracked.php:49
ParserCache
Cache for ParserOutput objects corresponding to the latest page revisions.
Definition: ParserCache.php:63
RefreshLinksJob\getParserOutputFromCache
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.
Definition: RefreshLinksJob.php:341