MediaWiki  master
RefreshLinksJob.php
Go to the documentation of this file.
1 <?php
27 
/**
 * Job to update link tables for pages.
 *
 * A job takes one of three shapes depending on its parameters (see run()):
 *   - 'recursive': partition the backlinks of the job title into per-title leaf
 *     jobs plus, possibly, a follow-up recursive job for the rest;
 *   - 'pages': refresh the links of each listed (namespace, DB key) pair;
 *   - otherwise: refresh the links of the job title itself.
 *
 * Other parameters read by this class: 'range', 'rootJobTimestamp',
 * 'isOpportunistic', 'triggeringRevisionId', 'triggeringUser',
 * 'useRecursiveLinksUpdate', 'causeAction' and 'causeAgent'.
 */
class RefreshLinksJob extends Job {
	/** Safety margin, in seconds, added to the root job timestamp for non-opportunistic jobs */
	const NORMAL_MAX_LAG = 10;
	/** Value passed as 'timeout' to waitForReplication() (presumably seconds) */
	const LAG_WAIT_TIMEOUT = 15;

	/**
	 * @param Title $title Page the job is about (or the changed template for 'pages' jobs)
	 * @param array $params Job parameters
	 */
	public function __construct( Title $title, array $params ) {
		parent::__construct( 'refreshLinks', $title, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		// Default the cause information so later reads never hit a missing key
		$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	/**
	 * Create a job that uses the 'refreshLinksPrioritized' command name.
	 *
	 * @param Title $title
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( Title $title, array $params ) {
		$job = new self( $title, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * Create a job that uses the 'refreshLinksDynamic' command name.
	 *
	 * @param Title $title
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( Title $title, array $params ) {
		$job = new self( $title, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	/**
	 * Run the job.
	 *
	 * @return bool False if any title was invalid or its refresh failed, true otherwise
	 */
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page
			$services = MediaWikiServices::getInstance();
			// When the base job branches, wait for the replica DBs to catch up to the master.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'domain' => $lbFactory->getLocalDomainID(),
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) { // only try so hard
					$stats = $services->getStatsdDataFactory();
					$stats->increment( 'refreshlinks.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( 'UpdateRowsPerJob' ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			JobQueueGroup::singleton()->push( $jobs );
		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as list( $ns, $dbKey ) ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title ) {
					// Keep going after a failure so the remaining titles still get refreshed
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}
		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}

	/**
	 * Refresh the link tables (and other secondary data) of a single page.
	 *
	 * @param Title $title
	 * @return bool True if the page was refreshed or needed no refresh; false if the
	 *  page lock could not be acquired or no usable parser output could be obtained
	 */
	protected function runForTitle( Title $title ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the master DB
		$page = WikiPage::factory( $title );
		$page->loadPageData( WikiPage::READ_LATEST );

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_MASTER );
		// The lock handle is only held, never read; it must stay referenced while the
		// update runs (presumably released when it goes out of scope)
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$stats->increment( 'refreshlinks.lock_failure' );

			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			$stats->increment( 'refreshlinks.update_skipped' );

			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginMasterChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitMasterChanges( __METHOD__ );

		if ( !$output ) {
			return false; // raced out?
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $title );

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * Get the timestamp that cached or already-stored data must reach in order to
	 * be considered as reflecting the change that triggered this job.
	 *
	 * @return string|null The 'rootJobTimestamp' parameter as-is for opportunistic
	 *  jobs, that time plus NORMAL_MAX_LAG seconds in TS_MW format otherwise, or
	 *  null if the job has no 'rootJobTimestamp' parameter
	 */
	private function getLagAwareRootTimestamp() {
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return null;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			return $rootTimestamp;
		}

		// For transclusion updates, the template changes must be reflected
		return wfTimestamp(
			TS_MW,
			wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
		);
	}

	/**
	 * Check whether the links of a page were already updated after the change
	 * that triggered this job.
	 *
	 * @param WikiPage $page
	 * @return bool Always false when the job has no 'rootJobTimestamp' parameter
	 */
	private function isAlreadyRefreshed( WikiPage $page ) {
		$lagAwareTimestamp = $this->getLagAwareRootTimestamp();
		if ( $lagAwareTimestamp === null ) {
			// Unknown triggering time; nothing to compare against
			return false;
		}

		return ( $page->getLinksTimestamp() > $lagAwareTimestamp );
	}

	/**
	 * Get the parser output if the page is unchanged from what was loaded in $page.
	 *
	 * A suitable parser cache entry is preferred; otherwise the current revision
	 * is rendered without generating HTML.
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null if the current revision could not be used
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			return null; // race condition?
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		if ( $cachedOutput ) {
			return $cachedOutput;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [ 'generate-html' => false ] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		return $output;
	}

	/**
	 * Get the current revision record if it is unchanged from what was loaded in $page.
	 *
	 * On failure, a statsd counter is incremented and the job's last error is set.
	 *
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return RevisionRecord|null Null if the triggering or loaded revision is not
	 *  the latest one, or if no revision was found
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key name.
		$latest = $title->getLatestRevID( Title::GAID_FOR_UPDATE );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );

			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			$stats->increment( 'refreshlinks.rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

			return null; // just deleted?
		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	/**
	 * Get the parser output from cache if it reflects the change that triggered this job.
	 *
	 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
	 * depending on the outcome.
	 *
	 * @param ParserCache $parserCache
	 * @param WikiPage $page
	 * @param RevisionRecord $currentRevision
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null if no usable cache entry exists
	 */
	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsdDataFactoryInterface $stats
	) {
		$cachedOutput = null;
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			$lagAwareTimestamp = $this->getLagAwareRootTimestamp();

			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// As long as the cache rev ID matches the current rev ID and it reflects
				// the job's triggering change, then it is usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId() &&
					$output->getCacheTime() >= $lagAwareTimestamp
				) {
					$cachedOutput = $output;
				}
			}
		}

		if ( $cachedOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		return $cachedOutput;
	}

	/**
	 * Build the options array passed to WikiPage::doSecondaryDataUpdates().
	 *
	 * @return array Has the keys 'recursive', 'causeAction' and 'causeAgent', plus
	 *  'triggeringUser' when the job carries a non-empty 'triggeringUser' parameter
	 */
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}

	/**
	 * Get the job description used for duplicate detection.
	 *
	 * Cause information only exists for logging, so it is stripped in order for
	 * otherwise identical jobs to match each other.
	 *
	 * @return array
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'], $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// The constructor stores the cause fields among the job parameters, so they
			// have to be removed from there as well for the stripping to take effect
			unset( $info['params']['causeAction'], $info['params']['causeAgent'] );
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	/**
	 * @return int Number of pages this job refreshes by itself
	 */
	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
getLinksTimestamp()
Get the page_links_updated field.
Definition: WikiPage.php:699
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:138
__construct(Title $title, array $params)
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
getRevisionRecord()
Get the latest revision.
Definition: WikiPage.php:795
run()
Run the job.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
The RevisionRenderer service provides access to rendered output for revisions.
Class to both describe a background job and handle jobs.
Definition: Job.php:30
isAlreadyRefreshed(WikiPage $page)
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
getTouched()
Get the page_touched field.
Definition: WikiPage.php:688
title
const DB_MASTER
Definition: defines.php:26
getDirty( $article, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
setLastError( $error)
Definition: Job.php:418
static configuration should be added through ResourceLoaderGetConfigVars instead can be used to get the real title e g db for database replication lag or jobqueue for job queue size converted to pseudo seconds It is possible to add more fields and they will be returned to the user in the API response after the basic globals have been set but before ordinary actions take place $output
Definition: hooks.txt:2216
static acquirePageLock(IDatabase $dbw, $pageId, $why='atomicity')
Acquire a session-level lock for performing link table updates for a page on a DB.
runForTitle(Title $title)
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static invalidateCache(Title $title, $revid=null)
Clear the info cache for a given Title.
Definition: InfoAction.php:71
static configuration should be added through ResourceLoaderGetConfigVars instead can be used to get the real title e g db for database replication lag or jobqueue for job queue size converted to pseudo seconds It is possible to add more fields and they will be returned to the user in the API response after the basic globals have been set but before ordinary actions take place or wrap services the preferred way to define a new service is the $wgServiceWiringFiles array $services
Definition: hooks.txt:2216
getRootJobParams()
Definition: Job.php:321
const GAID_FOR_UPDATE
Used to be GAID_FOR_UPDATE define.
Definition: Title.php:57
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
getTitle()
Get the title object of the article.
Definition: WikiPage.php:294
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition: hooks.txt:1983
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:773
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page. ...
Job to update link tables for pages.
static makeTitleSafe( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:620
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
Definition: WikiPage.php:1957
getId()
Get revision ID.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, User $forUser=null, array $hints=[])
static newDynamic(Title $title, array $params)
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition: User.php:559
if(count( $args)< 1) $job
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
static newPrioritized(Title $title, array $params)
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition: Title.php:3115
static singleton( $domain=false)
array $params
Array of job parameters.
Definition: Job.php:35
Page revision base class.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers the remaining backlink range (if any).
static newFromName( $name, $validate='valid')
Static factory method for creation from username.
Definition: User.php:535
Title $title
Definition: Job.php:41
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.