MediaWiki  master
RefreshLinksJob.php
Go to the documentation of this file.
1 <?php
23 use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
30 
44 class RefreshLinksJob extends Job {
/** Seconds added to the root job timestamp (for non-opportunistic jobs) before comparing it against refresh/cache times */
46  private const NORMAL_MAX_LAG = 10;
/** Value passed as the 'timeout' option to waitForReplication() when a recursive base job branches */
48  private const LAG_WAIT_TIMEOUT = 15;
49 
/**
 * @param PageIdentity $page Page handed to the parent Job constructor
 * @param array $params Job parameters; 'causeAction' and 'causeAgent' default to 'unknown'
 */
public function __construct( PageIdentity $page, array $params ) {
	parent::__construct( 'refreshLinks', $page, $params );

	// De-duplication is only worth its overhead when a match is plausible:
	// range jobs rarely line up, and batches of several pages rarely match each other.
	$this->removeDuplicates = !(
		isset( $params['range'] ) ||
		( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
	);

	// Fill in cause information only where the caller did not provide it
	$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];

	// JobRunner must not wrap run() in a transaction round automatically;
	// runForTitle() manages its own rounds so that it can run DataUpdates
	// and keep contention low.
	$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
}
65 
/**
 * Build a job through the regular constructor, then set its command
 * to 'refreshLinksPrioritized'.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newPrioritized( PageIdentity $page, array $params ) {
	$prioritized = new self( $page, $params );
	$prioritized->command = 'refreshLinksPrioritized';

	return $prioritized;
}
77 
/**
 * Build a job through the regular constructor, then set its command
 * to 'refreshLinksDynamic'.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newDynamic( PageIdentity $page, array $params ) {
	$dynamic = new self( $page, $params );
	$dynamic->command = 'refreshLinksDynamic';

	return $dynamic;
}
89 
/**
 * Run the job in one of three modes, selected by the job parameters:
 *  - 'recursive': partition the backlinks of the job title into per-title leaf jobs
 *    (plus possibly a remnant recursive job) and push them onto the queue;
 *  - 'pages': refresh the links of each listed (namespace, DB key) pair;
 *  - otherwise: refresh the links of the job title itself.
 *
 * @return bool False if any title was invalid or failed to refresh, true otherwise
 */
public function run() {
	$ok = true;

	if ( !empty( $this->params['recursive'] ) ) {
		// Job to update all (or a range of) backlink pages for a page
		$services = MediaWikiServices::getInstance();
		// When the base job branches, wait for the replica DBs to catch up to the primary.
		// From then on, we know that any template changes at the time the base job was
		// enqueued will be reflected in backlink page parses when the leaf jobs run.
		if ( !isset( $this->params['range'] ) ) {
			$lbFactory = $services->getDBLoadBalancerFactory();
			$caughtUp = $lbFactory->waitForReplication( [
				'domain' => $lbFactory->getLocalDomainID(),
				'timeout' => self::LAG_WAIT_TIMEOUT
			] );
			if ( !$caughtUp ) {
				// Only try so hard; record the failure and carry on
				$stats = $services->getStatsdDataFactory();
				$stats->increment( 'refreshlinks.lag_wait_failed' );
			}
		}
		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];
		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$services->getMainConfig()->get( 'UpdateRowsPerJob' ),
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		JobQueueGroup::singleton()->push( $jobs );
	} elseif ( isset( $this->params['pages'] ) ) {
		// Job to update link tables for a set of titles
		foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			if ( $title ) {
				// Always attempt the refresh; a single failure makes the whole job fail
				$ok = $this->runForTitle( $title ) && $ok;
			} else {
				$ok = false;
				$this->setLastError( "Invalid title ($ns,$dbKey)." );
			}
		}
	} else {
		// Job to update link tables for a given title
		$ok = $this->runForTitle( $this->title );
	}

	return $ok;
}
142 
/**
 * Refresh the secondary data (link tables etc.) of a single page.
 *
 * @param PageIdentity $pageIdentity Page to refresh
 * @return bool True if the updates ran or were found to be unnecessary; false if the
 *  page does not exist, the per-page lock could not be acquired, or no parser output
 *  could be obtained for the current revision
 */
protected function runForTitle( PageIdentity $pageIdentity ) {
	$services = MediaWikiServices::getInstance();
	$stats = $services->getStatsdDataFactory();
	$renderer = $services->getRevisionRenderer();
	$parserCache = $services->getParserCache();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the primary DB
	$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
	$page->loadPageData( WikiPage::READ_LATEST );

	if ( !$page->exists() ) {
		// Probably due to concurrent deletion or renaming of the page
		$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
		$logger->notice(
			'The page does not exist. Perhaps it was deleted?',
			[
				// Report the page actually being processed; for per-pages jobs
				// $this->title is the job's own title, not the missing page.
				'page_title' => $page->getTitle()->getPrefixedDBkey(),
				'job_params' => $this->getParams(),
				'job_metadata' => $this->getMetadata()
			]
		);

		// nothing to do
		$stats->increment( 'refreshlinks.rev_not_found' );
		return false;
	}

	// Serialize link update jobs by page ID so they see each others' changes.
	// The page ID and latest revision ID will be queried again after the lock
	// is acquired to bail if they are changed from that of loadPageData() above.
	$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_PRIMARY );
	// The lock handle is intentionally kept in an otherwise unused local for the rest
	// of this method (presumably it is released when the variable goes out of scope).
	$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
	if ( $scopedLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$stats->increment( 'refreshlinks.lock_failure' );

		return false;
	}

	if ( $this->isAlreadyRefreshed( $page ) ) {
		$stats->increment( 'refreshlinks.update_skipped' );

		return true;
	}

	// Parse during a fresh transaction round for better read consistency
	$lbFactory->beginPrimaryChanges( __METHOD__ );
	$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
	$options = $this->getDataUpdateOptions();
	$lbFactory->commitPrimaryChanges( __METHOD__ );

	if ( !$output ) {
		return false; // raced out?
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$options['known-revision-output'] = $output;
	// Execute corresponding DataUpdates immediately
	$page->doSecondaryDataUpdates( $options );
	// Clear the info cache for the page now that its derived data changed
	InfoAction::invalidateCache( $page->getTitle() );

	// Commit any writes here in case this method is called in a loop.
	// In that case, the scoped lock will fail to be acquired.
	$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

	return true;
}
219 
/**
 * Check whether the page's links were already refreshed after the change
 * that triggered this job.
 *
 * @param WikiPage $page
 * @return bool Always false when the job has no 'rootJobTimestamp' parameter
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	// Without the timestamp of the triggering change there is nothing to compare against
	if ( !isset( $this->params['rootJobTimestamp'] ) ) {
		return false;
	}
	$rootTimestamp = $this->params['rootJobTimestamp'];

	// Opportunistic updates use the root timestamp as-is, since neither clock skew nor
	// DB snapshot/replica DB lag matter much for them. Transclusion updates must reflect
	// the template change, so the threshold is pushed forward by NORMAL_MAX_LAG seconds.
	$threshold = empty( $this->params['isOpportunistic'] )
		? wfTimestamp(
			TS_MW,
			(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
		)
		: $rootTimestamp;

	return $page->getLinksTimestamp() > $threshold;
}
245 
/**
 * Get the parser output if the page is unchanged from what was loaded in $page.
 *
 * A usable parser cache entry is preferred; otherwise the current revision is
 * rendered without generating HTML.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded with READ_LATEST
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null if the current revision could not be confirmed
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$currentRev = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$currentRev ) {
		// race condition?
		return null;
	}

	$fromCache = $this->getParserOutputFromCache( $parserCache, $page, $currentRev, $stats );
	if ( $fromCache ) {
		return $fromCache;
	}

	$canonicalOptions = $page->makeParserOptions( 'canonical' );
	$rendered = $renderer->getRenderedRevision(
		$currentRev,
		$canonicalOptions,
		null,
		[ 'audience' => $currentRev::RAW ]
	);

	// Record when parsing started and stamp it on the output, which
	// notifies LinksUpdate::doUpdate()
	$parseStartedAt = wfTimestampNow();
	$freshOutput = $rendered->getRevisionParserOutput( [ 'generate-html' => false ] );
	$freshOutput->setCacheTime( $parseStartedAt );

	return $freshOutput;
}
284 
/**
 * Get the current revision record if it is unchanged from what was loaded in $page.
 *
 * On failure, the reason is recorded via setLastError() (and a stats counter).
 *
 * @param WikiPage $page Page already loaded with READ_LATEST
 * @param StatsdDataFactoryInterface $stats
 * @return RevisionRecord|null The current revision; null if the job's triggering
 *  revision is no longer current, the revision is missing, or the page changed
 *  since $page was loaded
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$title = $page->getTitle();
	// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
	// This is used to detect edits/moves after loadPageData() but before the scope lock.
	// This works around the chicken/egg problem of determining the scope lock key name.
	$latest = $title->getLatestRevID( Title::READ_LATEST );

	$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
	if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
		// This job is obsolete and one for the latest revision will handle updates
		$stats->increment( 'refreshlinks.rev_not_current' );
		$this->setLastError( "Revision $triggeringRevisionId is not current" );

		return null;
	}

	// Load the current revision. Note that $page should have loaded with READ_LATEST.
	// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
	$revision = $page->getRevisionRecord();
	if ( !$revision ) {
		$stats->increment( 'refreshlinks.rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

		return null; // just deleted?
	} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
		// Do not clobber over newer updates with older ones. If all jobs were FIFO and
		// serialized, it would be OK to update links based on older revisions since it
		// would eventually get to the latest. Since that is not the case (by design),
		// only update the link tables to a state matching the current revision's output.
		$stats->increment( 'refreshlinks.rev_not_current' );
		$this->setLastError( "Revision {$revision->getId()} is not current" );

		return null;
	}

	return $revision;
}
332 
/**
 * Get the parser output from cache if it reflects the change that triggered this job.
 *
 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
 * depending on the outcome.
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null if no usable cache entry was found
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsdDataFactoryInterface $stats
) {
	$usableOutput = null;

	// If page_touched changed after this root job, then views of the page have likely
	// already triggered re-parses that now sit in the cache. Reusing such an entry
	// avoids an expensive parse.
	$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
	if ( $rootTimestamp !== null ) {
		$isOpportunistic = !empty( $this->params['isOpportunistic'] );

		// Opportunistic updates care little about clock skew or DB snapshot/replica lag,
		// so the root timestamp is used directly. Transclusion updates must reflect the
		// template change, hence the NORMAL_MAX_LAG safety margin.
		$minCacheTime = $isOpportunistic
			? $rootTimestamp
			: wfTimestamp(
				TS_MW,
				(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);

		// Only pay for the cache I/O when the entry is suspected to be up-to-date
		$worthChecking = $page->getTouched() >= $rootTimestamp || $isOpportunistic;
		$candidate = $worthChecking
			? $parserCache->getDirty( $page, $page->makeParserOptions( 'canonical' ) )
			: null;

		// Usable only if it is for the current revision and new enough to
		// reflect the job's triggering change
		if (
			$candidate &&
			$candidate->getCacheRevisionId() == $currentRevision->getId() &&
			$candidate->getCacheTime() >= $minCacheTime
		) {
			$usableOutput = $candidate;
		}
	}

	$stats->increment(
		$usableOutput ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached'
	);

	return $usableOutput;
}
391 
/**
 * Build the options array handed to WikiPage::doSecondaryDataUpdates().
 *
 * @return array Contains 'recursive', 'causeAction' and 'causeAgent', plus
 *  'triggeringUser' when the job carries a non-empty 'triggeringUser' parameter
 */
private function getDataUpdateOptions() {
	$options = [
		'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
		// Carry over cause so the update can do extra logging
		'causeAction' => $this->params['causeAction'],
		'causeAgent' => $this->params['causeAgent']
	];

	if ( empty( $this->params['triggeringUser'] ) ) {
		return $options;
	}

	$userInfo = $this->params['triggeringUser'];
	// A falsy user ID means an anonymous user, who is looked up by name instead
	$options['triggeringUser'] = $userInfo['userId']
		? User::newFromId( $userInfo['userId'] )
		: User::newFromName( $userInfo['userName'], false );

	return $options;
}
414 
/**
 * Get the de-duplication info, ignoring fields that should not make otherwise
 * identical jobs count as distinct.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$info = parent::getDeduplicationInfo();
	// Kept from the previous implementation for backward compatibility
	unset( $info['causeAction'], $info['causeAgent'] );
	if ( is_array( $info['params'] ) ) {
		// The constructor stores the cause fields in the job parameters, so this is
		// where they have to be removed to stop them from defeating duplicate
		// detection. NOTE(review): assumes $info['params'] mirrors the job params,
		// as the 'pages' check below already does; confirm against
		// Job::getDeduplicationInfo().
		unset( $info['params']['causeAction'], $info['params']['causeAgent'] );
		// For per-pages jobs, the job title is that of the template that changed
		// (or similar), so remove that since it ruins duplicate detection
		if ( isset( $info['params']['pages'] ) ) {
			unset( $info['namespace'], $info['title'] );
		}
	}

	return $info;
}
430 
/**
 * @return int Number of titles this job refreshes itself: 0 for recursive jobs,
 *  the size of the 'pages' list for per-pages jobs, and 1 otherwise
 */
public function workItemCount() {
	if ( !empty( $this->params['recursive'] ) ) {
		// nothing actually refreshed
		return 0;
	}

	// Either every listed page, or just the one job title
	return isset( $this->params['pages'] ) ? count( $this->params['pages'] ) : 1;
}
440 }
Job\getRootJobParams
getRootJobParams()
Definition: Job.php:360
Page\PageIdentity
Interface for objects (potentially) representing an editable wiki page.
Definition: PageIdentity.php:64
User\newFromId
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition: User.php:636
MediaWiki\Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:47
WikiPage\getRevisionRecord
getRevisionRecord()
Get the latest revision.
Definition: WikiPage.php:819
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:203
WikiPage\getTouched
getTouched()
Get the page_touched field.
Definition: WikiPage.php:721
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1649
Job\$title
Title $title
Definition: Job.php:49
RefreshLinksJob\getDeduplicationInfo
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
Definition: RefreshLinksJob.php:415
WikiPage
Class representing a MediaWiki article and history.
Definition: WikiPage.php:63
WikiPage\makeParserOptions
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
Definition: WikiPage.php:2011
User\newFromName
static newFromName( $name, $validate='valid')
Definition: User.php:595
RefreshLinksJob\getParserOutput
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
Definition: RefreshLinksJob.php:255
Job\$params
array $params
Array of job parameters.
Definition: Job.php:43
RefreshLinksJob\newPrioritized
static newPrioritized(PageIdentity $page, array $params)
Definition: RefreshLinksJob.php:71
Job\setLastError
setLastError( $error)
Definition: Job.php:467
BacklinkJobUtils\partitionBacklinkJob
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
Definition: BacklinkJobUtils.php:90
RefreshLinksJob\getDataUpdateOptions
getDataUpdateOptions()
Definition: RefreshLinksJob.php:395
RefreshLinksJob\runForTitle
runForTitle(PageIdentity $pageIdentity)
Definition: RefreshLinksJob.php:147
Job
Class to both describe a background job and handle jobs.
Definition: Job.php:38
RefreshLinksJob\isAlreadyRefreshed
isAlreadyRefreshed(WikiPage $page)
Definition: RefreshLinksJob.php:224
RefreshLinksJob\run
run()
Run the job.
Definition: RefreshLinksJob.php:90
MediaWiki\Logger\LoggerFactory
PSR-3 logger instance factory.
Definition: LoggerFactory.php:45
MediaWiki\Revision\RevisionRenderer\getRenderedRevision
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Definition: RevisionRenderer.php:108
WikiPage\getTitle
getTitle()
Get the title object of the article.
Definition: WikiPage.php:314
WikiPage\getLinksTimestamp
getLinksTimestamp()
Get the page_links_updated field.
Definition: WikiPage.php:743
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:1678
MediaWiki\Revision\RevisionRenderer
The RevisionRenderer service provides access to rendered output for revisions.
Definition: RevisionRenderer.php:45
Title\makeTitleSafe
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:674
RefreshLinksJob
Job to update link tables for pages.
Definition: RefreshLinksJob.php:44
WikiPage\getId
getId( $wikiId=self::LOCAL)
Definition: WikiPage.php:587
RefreshLinksJob\workItemCount
workItemCount()
Definition: RefreshLinksJob.php:431
DB_PRIMARY
const DB_PRIMARY
Definition: defines.php:27
Title\getLatestRevID
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition: Title.php:2921
ParserCache\getDirty
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Definition: ParserCache.php:196
RefreshLinksJob\newDynamic
static newDynamic(PageIdentity $page, array $params)
Definition: RefreshLinksJob.php:83
RefreshLinksJob\getCurrentRevisionIfUnchanged
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page.
Definition: RefreshLinksJob.php:292
MediaWiki\Deferred\LinksUpdate\LinksUpdate
Class that manages updates of *_link tables as well as similar extension-managed tables.
Definition: LinksUpdate.php:55
InfoAction\invalidateCache
static invalidateCache(PageIdentity $page, $revid=null)
Clear the info cache for a given Title.
Definition: InfoAction.php:181
JobQueueGroup\singleton
static singleton( $domain=false)
Definition: JobQueueGroup.php:114
MediaWiki\Revision\RevisionRecord\getId
getId( $wikiId=self::LOCAL)
Get revision ID.
Definition: RevisionRecord.php:279
RefreshLinksJob\__construct
__construct(PageIdentity $page, array $params)
Definition: RefreshLinksJob.php:50
$job
if(count( $args)< 1) $job
Definition: recompressTracked.php:49
ParserCache
Cache for ParserOutput objects corresponding to the latest page revisions.
Definition: ParserCache.php:63
RefreshLinksJob\getParserOutputFromCache
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.
Definition: RefreshLinksJob.php:342