MediaWiki REL1_39
RefreshLinksJob.php
Go to the documentation of this file.
1<?php
23use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
32
58class RefreshLinksJob extends Job {
/** Seconds added to a root job timestamp before it is compared with refresh/cache times */
60 private const NORMAL_MAX_LAG = 10;
/** Value passed as 'timeout' to waitForReplication() in run(); presumably seconds - confirm */
62 private const LAG_WAIT_TIMEOUT = 15;
63
/**
 * Build a 'refreshLinks' job for the given page.
 *
 * @param PageIdentity $page Page the job is attached to
 * @param array $params Job parameters; 'pages' and 'range' are inspected here, and
 *   'causeAction' / 'causeAgent' default to 'unknown' when absent
 * @throws PageAssertionException If no 'pages' list is supplied and $page cannot exist
 */
public function __construct( PageIdentity $page, array $params ) {
	$hasPageList = !empty( $params['pages'] );
	if ( !$hasPageList && !$page->canExist() ) {
		// BC with the Title class
		throw new PageAssertionException(
			'The given PageIdentity {pageIdentity} does not represent a proper page',
			[ 'pageIdentity' => $page ]
		);
	}

	parent::__construct( 'refreshLinks', $page, $params );

	// De-duplication is only worth its overhead when a match is plausible:
	// range jobs rarely line up, and jobs whose page list does not hold
	// exactly one entry are unlikely to match each other either.
	$this->removeDuplicates = !(
		isset( $params['range'] ) ||
		( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
	);

	// Fill in cause information only where the caller did not provide it
	$this->params = $this->params + [
		'causeAction' => 'unknown',
		'causeAgent' => 'unknown'
	];

	// JobRunner must not wrap run() in a transaction round of its own; every
	// runForTitle() call manages its rounds itself so that DataUpdates can run
	// and contention stays low.
	$this->executionFlags = $this->executionFlags | self::JOB_NO_EXPLICIT_TRX_ROUND;
}
87
/**
 * Create a job whose command is 'refreshLinksPrioritized' instead of 'refreshLinks'.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newPrioritized( PageIdentity $page, array $params ) {
	$prioritizedJob = new self( $page, $params );
	// Only the command name differs from a plain refreshLinks job
	$prioritizedJob->command = 'refreshLinksPrioritized';

	return $prioritizedJob;
}
99
/**
 * Create a job whose command is 'refreshLinksDynamic' instead of 'refreshLinks'.
 *
 * @param PageIdentity $page
 * @param array $params
 * @return RefreshLinksJob
 */
public static function newDynamic( PageIdentity $page, array $params ) {
	$dynamicJob = new self( $page, $params );
	// Only the command name differs from a plain refreshLinks job
	$dynamicJob->command = 'refreshLinksDynamic';

	return $dynamicJob;
}
111
/**
 * Run the job in one of three modes, chosen from the job parameters:
 *  - 'recursive': partition the backlinks of the job's page into per-title leaf
 *    jobs (plus possibly one follow-up recursive job) and enqueue them;
 *  - 'pages': refresh the link tables of every listed (namespace, dbKey) pair;
 *  - otherwise: refresh the link tables of the job's own title.
 *
 * @return bool True if everything succeeded, false if any page failed
 */
public function run() {
	$ok = true;

	if ( !empty( $this->params['recursive'] ) ) {
		// Job to update all (or a range of) backlink pages for a page

		// When the base job branches, wait for the replica DBs to catch up to the primary.
		// From then on, any template changes made at the time the base job was enqueued
		// will be reflected in backlink page parses when the leaf jobs run.
		$services = MediaWikiServices::getInstance();
		if ( !isset( $this->params['range'] ) ) {
			$lbFactory = $services->getDBLoadBalancerFactory();
			$caughtUp = $lbFactory->waitForReplication( [
				'domain' => $lbFactory->getLocalDomainID(),
				'timeout' => self::LAG_WAIT_TIMEOUT
			] );
			if ( !$caughtUp ) {
				// Only try so hard; keep going with what we have
				$services->getStatsdDataFactory()
					->increment( 'refreshlinks_warning.lag_wait_failed' );
			}
		}

		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];

		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		$services->getJobQueueGroup()->push( $jobs );

	} elseif ( isset( $this->params['pages'] ) ) {
		// Job to update link tables for a set of titles
		foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			if ( $title && $title->canExist() ) {
				// Always process the page; a failure only taints the overall result
				$ok = $this->runForTitle( $title ) && $ok;
			} else {
				$ok = false;
				$this->setLastError( "Invalid title ($ns,$dbKey)." );
			}
		}

	} else {
		// Job to update link tables for a given title
		$ok = $this->runForTitle( $this->title );
	}

	return $ok;
}
168
/**
 * Refresh the link tables (and other secondary data) of a single page.
 *
 * The page is loaded from the primary DB, a per-page lock is taken, and the
 * secondary data updates are run with either a reusable cached parser output
 * or a fresh parse of the current revision.
 *
 * @param PageIdentity $pageIdentity
 * @return bool True when the update ran or was superseded; false when the page is
 *   missing, the lock could not be acquired, or no parser output could be obtained
 */
protected function runForTitle( PageIdentity $pageIdentity ) {
	$services = MediaWikiServices::getInstance();
	$statsd = $services->getStatsdDataFactory();
	$revRenderer = $services->getRevisionRenderer();
	$pCache = $services->getParserCache();
	$dbFactory = $services->getDBLoadBalancerFactory();
	$trxTicket = $dbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the primary DB
	$wikiPage = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
	$wikiPage->loadPageData( WikiPage::READ_LATEST );

	if ( !$wikiPage->exists() ) {
		// Probably due to concurrent deletion or renaming of the page.
		// NOTE(review): 'page_title' is taken from the job's own title, which may
		// differ from $pageIdentity for jobs carrying a 'pages' list - confirm intent.
		LoggerFactory::getInstance( 'RefreshLinksJob' )->notice(
			'The page does not exist. Perhaps it was deleted?',
			[
				'page_title' => $this->title->getPrefixedDBkey(),
				'job_params' => $this->getParams(),
				'job_metadata' => $this->getMetadata()
			]
		);
		$statsd->increment( 'refreshlinks_outcome.bad_page_not_found' );

		// Retry later to handle an unlucky race condition
		return false;
	}

	// Link update jobs are serialized per page ID so that they see each other's
	// changes. The page ID and latest revision ID are queried again once the lock
	// is held, to bail out if they changed since loadPageData() above.
	$primaryDb = $dbFactory->getMainLB()->getConnectionRef( DB_PRIMARY );
	// The lock handle is never read, but it must stay referenced until this method returns
	$pageLock = LinksUpdate::acquirePageLock( $primaryDb, $wikiPage->getId(), 'job' );
	if ( $pageLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$statsd->increment( 'refreshlinks_outcome.bad_lock_failure' );

		// Retry later, once the overlapping job for the previous revision is done
		return false;
	}

	if ( $this->isAlreadyRefreshed( $wikiPage ) ) {
		// Superseded, e.g. by an overlapping recursive job for a different template
		// edit, or by a direct edit or purge. This counts as success.
		$statsd->increment( 'refreshlinks_outcome.good_update_superseded' );

		return true;
	}

	// These jobs can run for a long time. commitAndWaitForReplication releases the
	// primary snapshots; let the replicas release theirs as well.
	$dbFactory->flushReplicaSnapshots( __METHOD__ );
	// Parse inside a fresh transaction round for better read consistency
	$dbFactory->beginPrimaryChanges( __METHOD__ );
	$parserOutput = $this->getParserOutput( $revRenderer, $pCache, $wikiPage, $statsd );
	$updateOptions = $this->getDataUpdateOptions();
	$dbFactory->commitPrimaryChanges( __METHOD__ );

	if ( !$parserOutput ) {
		// Probably raced out. The specific refreshlinks_outcome metric was already
		// sent by getCurrentRevisionIfUnchanged().
		// FIXME: Why do we retry this? Can this be a cancellation?
		return false;
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$updateOptions['known-revision-output'] = $parserOutput;
	// Execute the corresponding DataUpdates immediately
	$wikiPage->doSecondaryDataUpdates( $updateOptions );
	InfoAction::invalidateCache( $wikiPage );

	// Commit any writes here in case this method is called in a loop;
	// otherwise the scoped lock would fail to be acquired on the next iteration.
	$dbFactory->commitAndWaitForReplication( __METHOD__, $trxTicket );

	return true;
}
254
/**
 * Get the root job timestamp, padded by NORMAL_MAX_LAG seconds unless the job
 * is flagged as opportunistic.
 *
 * @return string|null TS_MW-style timestamp, or null if the job has no 'rootJobTimestamp'
 */
private function getLagAwareRootTimestamp() {
	// Timestamp of the change that triggered this job
	$rootTs = $this->params['rootJobTimestamp'] ?? null;
	if ( $rootTs === null ) {
		return null;
	}

	if ( !empty( $this->params['isOpportunistic'] ) ) {
		// Neither clock skew nor DB snapshot/replica lag matter much for such
		// updates; favour reuse of the (often recently updated) cache.
		return $rootTs;
	}

	// For transclusion updates the template changes must be reflected, so push
	// the threshold forward by the lag safety margin.
	$rootUnixTime = (int)wfTimestamp( TS_UNIX, $rootTs );

	return wfTimestamp( TS_MW, $rootUnixTime + self::NORMAL_MAX_LAG );
}
279
/**
 * Check whether the page's links were already refreshed after the lag-aware
 * root job timestamp.
 *
 * @param WikiPage $page
 * @return bool Always false when the job carries no root timestamp
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	$threshold = $this->getLagAwareRootTimestamp();
	if ( $threshold === null ) {
		// Nothing to compare against
		return false;
	}

	return $page->getLinksTimestamp() > $threshold;
}
289
/**
 * Obtain parser output for the page's current revision, from the parser cache
 * when a usable entry exists and from a fresh render otherwise.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded by the caller
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null when the current revision could not be confirmed
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$currentRev = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$currentRev ) {
		// Race condition?
		return null;
	}

	$fromCache = $this->getParserOutputFromCache( $parserCache, $page, $currentRev, $stats );
	if ( $fromCache ) {
		return $fromCache;
	}

	// No usable cache entry; render the revision without audience restrictions
	$rendered = $renderer->getRenderedRevision(
		$currentRev,
		$page->makeParserOptions( 'canonical' ),
		null,
		[ 'audience' => $currentRev::RAW ]
	);

	// Record when parsing started, before the parse itself runs
	$parseStartedAt = wfTimestampNow();
	$freshOutput = $rendered->getRevisionParserOutput( [ 'generate-html' => false ] );
	// Notify LinksUpdate::doUpdate() of the parse time
	$freshOutput->setCacheTime( $parseStartedAt );

	return $freshOutput;
}
329
/**
 * Get the page's current revision, provided it is still the latest revision
 * and still belongs to this page.
 *
 * On every failure a 'refreshlinks_outcome.*' metric is incremented and the
 * job's last error is set.
 *
 * @param WikiPage $page Page that should have been loaded with READ_LATEST
 * @param StatsdDataFactoryInterface $stats
 * @return RevisionRecord|null Null if the job is obsolete or the revision is gone
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsdDataFactoryInterface $stats
) {
	$title = $page->getTitle();
	// Re-read the latest ID, since acquirePageLock() in runForTitle() flushed the
	// transaction. This detects edits/moves that happened after loadPageData() but
	// before the scoped lock was taken, which works around the chicken/egg problem
	// of determining the scoped lock key name.
	$latestRevId = $title->getLatestRevID( Title::READ_LATEST );

	$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
	if ( $triggeringRevisionId && $triggeringRevisionId !== $latestRevId ) {
		// This job is obsolete; the one for the latest revision will handle updates
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision $triggeringRevisionId is not current" );

		return null;
	}

	// Load the current revision. This same instance is reused later on by
	// WikiPage::doSecondaryDataUpdates().
	$revision = $page->getRevisionRecord();
	if ( !$revision ) {
		// Revision just got deleted?
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

		return null;
	}

	if ( !( $revision->getId() === $latestRevId && $revision->getPageId() === $page->getId() ) ) {
		// Never clobber newer updates with older ones. If all jobs were FIFO and
		// serialized, updating links from an older revision would be harmless since
		// the latest would eventually be reached. That is not the case (by design),
		// so the link tables are only updated to match the current revision's output.
		$stats->increment( 'refreshlinks_outcome.bad_rev_not_current' );
		$this->setLastError( "Revision {$revision->getId()} is not current" );

		return null;
	}

	return $revision;
}
377
/**
 * Look for a parser cache entry that can stand in for a fresh parse.
 *
 * An entry is used only if the job has a root timestamp, the page was touched
 * at or after it (or the job is opportunistic), the entry is for the current
 * revision, and its cache time is not older than the lag-aware root timestamp.
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision
 * @param StatsdDataFactoryInterface $stats
 * @return ParserOutput|null Null when no usable entry was found
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsdDataFactoryInterface $stats
) {
	$usableOutput = null;

	// If page_touched changed after this root job, views of the page have likely
	// already caused re-parses that now sit in the cache. Reusing them avoids
	// expensive parsing in some cases.
	$rootTs = $this->params['rootJobTimestamp'] ?? null;
	if ( $rootTs !== null ) {
		$isOpportunistic = !empty( $this->params['isOpportunistic'] );
		if ( $page->getTouched() >= $rootTs || $isOpportunistic ) {
			// The cache is suspected to be current, so checking it is worth the I/O.
			// It is usable as long as its revision ID matches the current one and it
			// reflects the change that triggered this job.
			$candidate = $parserCache->getDirty(
				$page,
				$page->makeParserOptions( 'canonical' )
			);
			if (
				$candidate &&
				$candidate->getCacheRevisionId() == $currentRevision->getId() &&
				$candidate->getCacheTime() >= $this->getLagAwareRootTimestamp()
			) {
				$usableOutput = $candidate;
			}
		}
	}

	// Record whether the parser cache spared us a parse
	$stats->increment(
		$usableOutput ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached'
	);

	return $usableOutput;
}
424
/**
 * Build the options array handed to WikiPage::doSecondaryDataUpdates().
 *
 * @return array Contains 'recursive', 'causeAction' and 'causeAgent', plus
 *   'triggeringUser' when the job parameters name one
 */
private function getDataUpdateOptions() {
	$updateOptions = [
		'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
		// Carry over the cause so the update can do extra logging
		'causeAction' => $this->params['causeAction'],
		'causeAgent' => $this->params['causeAgent']
	];

	$userInfo = $this->params['triggeringUser'] ?? null;
	if ( !$userInfo ) {
		return $updateOptions;
	}

	// Registered users are looked up by ID; anonymous ones only have a name
	$updateOptions['triggeringUser'] = $userInfo['userId']
		? User::newFromId( $userInfo['userId'] )
		: User::newFromName( $userInfo['userName'], false );

	return $updateOptions;
}
447
/**
 * Get the de-duplication info of the parent class, minus the fields that
 * would keep otherwise equivalent jobs from matching.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$dedupInfo = parent::getDeduplicationInfo();

	// NOTE(review): these are removed from the top level of the info array,
	// while the constructor stores them in the job params - confirm against
	// the parent implementation that this has the intended effect.
	unset( $dedupInfo['causeAction'], $dedupInfo['causeAgent'] );

	if ( is_array( $dedupInfo['params'] ) && isset( $dedupInfo['params']['pages'] ) ) {
		// For per-pages jobs, the job title is that of the template that changed
		// (or similar); drop it, since it ruins duplicate detection.
		unset( $dedupInfo['namespace'], $dedupInfo['title'] );
	}

	return $dedupInfo;
}
463
/**
 * Count the pages this job refreshes itself.
 *
 * @return int 0 for recursive jobs, the size of the 'pages' list when present, else 1
 */
public function workItemCount() {
	if ( !empty( $this->params['recursive'] ) ) {
		// Nothing is actually refreshed by the job itself
		return 0;
	}

	// Either an explicit list of pages, or just the job's one title
	return isset( $this->params['pages'] ) ? count( $this->params['pages'] ) : 1;
}
473}
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers the remaining backlink range (if any).
Class to both describe a background job and handle jobs.
Definition Job.php:39
getRootJobParams()
Definition Job.php:362
setLastError( $error)
Definition Job.php:469
Class that manages updates of *_link tables as well as similar extension-managed tables.
PSR-3 logger instance factory.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception if a PageIdentity is an invalid argument.
Page revision base class.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
static newFromName( $name, $validate='valid')
Definition User.php:598
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition User.php:639
Base representation for an editable wiki page.
Definition WikiPage.php:62
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:729
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL)
Definition WikiPage.php:573
getTitle()
Get the title object of the article.
Definition WikiPage.php:303
doSecondaryDataUpdates(array $options=[])
Do secondary data updates (such as updating link tables).
loadPageData( $from='fromdb')
Load the object from a given source by title.
Definition WikiPage.php:459
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:805
getTouched()
Get the page_touched field.
Definition WikiPage.php:707
Interface for objects (potentially) representing an editable wiki page.
canExist()
Checks whether this PageIdentity represents a "proper" page, meaning that it could exist as an editab...
const DB_PRIMARY
Definition defines.php:28
if(count( $args)< 1) $job