MediaWiki REL1_37
RefreshLinksJob.php
<?php

use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionRenderer;

class RefreshLinksJob extends Job {
	private const NORMAL_MAX_LAG = 10;

	private const LAG_WAIT_TIMEOUT = 15;

	public function __construct( PageIdentity $page, array $params ) {
		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges will rarely line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];
		// Tell JobRunner not to automatically wrap run() in a transaction round.
		// Each runForTitle() call manages its own rounds in order to run DataUpdates
		// and to avoid contention.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}
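	// Illustrative sketch (not part of the original file): how the constructor's
	// de-duplication rule plays out for a hypothetical $page. A plain single-title
	// job keeps removeDuplicates enabled; a 'range' param or a multi-entry 'pages'
	// param disables it, since such jobs are unlikely to collide.
	//
	//   new RefreshLinksJob( $page, [ 'causeAction' => 'edit' ] );                  // deduplicated
	//   new RefreshLinksJob( $page, [ 'pages' => [ [ 0, 'A' ], [ 0, 'B' ] ] ] );    // not deduplicated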

	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	public function run() {
		$ok = true;

		// Job to update all (or a range of) backlink pages for a page
		if ( !empty( $this->params['recursive'] ) ) {
			$services = MediaWikiServices::getInstance();
			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'domain' => $lbFactory->getLocalDomainID(),
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) { // only try so hard
					$stats = $services->getStatsdDataFactory();
					$stats->increment( 'refreshlinks.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( 'UpdateRowsPerJob' ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			JobQueueGroup::singleton()->push( $jobs );
			// Job to update link tables for a set of titles
		} elseif ( isset( $this->params['pages'] ) ) {
			foreach ( $this->params['pages'] as list( $ns, $dbKey ) ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title ) {
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}
			// Job to update link tables for a given title
		} else {
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}
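	// Illustrative sketch (not part of the original file): the three shapes of
	// $this->params that run() dispatches on, shown with hypothetical values.
	//
	//   [ 'recursive' => true, ... ]                          // partition into per-title leaf jobs
	//   [ 'pages' => [ [ 0, 'Foo' ], [ 10, 'Bar' ] ], ... ]   // update the listed titles
	//   [ 'causeAction' => 'edit', ... ]                      // update $this->title only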

	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( WikiPage::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->notice(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);

			// nothing to do
			$stats->increment( 'refreshlinks.rev_not_found' );
			return false;
		}

		// Serialize link update jobs by page ID so they see each other's changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired, to bail if they changed from those of loadPageData() above.
		$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_PRIMARY );
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$stats->increment( 'refreshlinks.lock_failure' );

			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			$stats->increment( 'refreshlinks.update_skipped' );

			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			return false; // raced out?
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}
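	// Illustrative note (not part of the original file): acquirePageLock() is what
	// serializes concurrent updates. If two refreshLinks jobs target the same page
	// ID at once, the second acquirePageLock() call returns null, that job records
	// 'refreshlinks.lock_failure' and bails, and the job queue may retry it later.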

	private function isAlreadyRefreshed( WikiPage $page ) {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return false;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			$lagAwareTimestamp = $rootTimestamp;
		} else {
			// For transclusion updates, the template changes must be reflected
			$lagAwareTimestamp = wfTimestamp(
				TS_MW,
				wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);
		}

		return ( $page->getLinksTimestamp() > $lagAwareTimestamp );
	}
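	// Worked example (illustrative, not in the original file): assume a hypothetical
	// root job timestamp of 20210101000000 (TS_MW). The non-opportunistic branch adds
	// NORMAL_MAX_LAG (10 s), giving a lag-aware cutoff of 20210101000010. A
	// page_links_updated value of 20210101000030 is newer than the cutoff, so the
	// update is skipped; a value of 20210101000005 is not, so the job proceeds.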

	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			return null; // race condition?
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		if ( $cachedOutput ) {
			return $cachedOutput;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [ 'generate-html' => false ] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		return $output;
	}

	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scoped lock.
		// This works around the chicken/egg problem of determining the scoped lock key name.
		$latest = $title->getLatestRevID( Title::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );

			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			$stats->increment( 'refreshlinks.rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

			return null; // just deleted?
		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsdDataFactoryInterface $stats
	) {
		$cachedOutput = null;
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			if ( $opportunistic ) {
				// Neither clock skew nor DB snapshot/replica DB lag matter much for
				// such updates; focus on reusing the (often recently updated) cache
				$lagAwareTimestamp = $rootTimestamp;
			} else {
				// For transclusion updates, the template changes must be reflected
				$lagAwareTimestamp = wfTimestamp(
					TS_MW,
					wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
				);
			}

			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// As long as the cache rev ID matches the current rev ID and it reflects
				// the job's triggering change, then it is usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId() &&
					$output->getCacheTime() >= $lagAwareTimestamp
				) {
					$cachedOutput = $output;
				}
			}
		}

		if ( $cachedOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		return $cachedOutput;
	}
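	// Illustrative sketch (not in the original file), using hypothetical values:
	// a cached entry is reused only when both checks hold. If the current revision
	// ID is 12345 and the lag-aware cutoff is 20210101000010, an entry with
	// getCacheRevisionId() == 12345 and getCacheTime() == 20210101000042 is accepted,
	// while an entry parsed for revision 12344, or parsed before the cutoff, falls
	// through to a fresh parse.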

	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}
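	// Illustrative sketch (not in the original file): the 'triggeringUser' job
	// parameter is a plain array rather than a User object, e.g. (hypothetical values)
	//
	//   [ 'userId' => 42, 'userName' => 'ExampleUser' ]    // registered user
	//   [ 'userId' => 0, 'userName' => '203.0.113.7' ]     // anonymous (IP) user
	//
	// getDataUpdateOptions() rebuilds a User from the ID when present, otherwise
	// from the name without validation.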

	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-page jobs, the job title is that of the template that changed
			// (or similar), so remove it since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
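A minimal usage sketch (not part of this file): callers normally construct one of these jobs and hand it to the job queue rather than running it directly. The title and cause values below are hypothetical; in MediaWiki 1.37 the push typically goes through JobQueueGroup::singleton(), as run() itself does for leaf jobs.

$page = Title::makeTitle( NS_TEMPLATE, 'Example' );
$job = RefreshLinksJob::newPrioritized( $page, [
	'causeAction' => 'edit',
	'causeAgent' => 'ExampleUser'
] );
JobQueueGroup::singleton()->push( $job );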
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers t...
Class to both describe a background job and handle jobs.
Definition Job.php:37
Title $title
Definition Job.php:48
getRootJobParams()
Definition Job.php:359
setLastError( $error)
Definition Job.php:466
array $params
Array of job parameters.
Definition Job.php:42
PSR-3 logger instance factory.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Page revision base class.
getId( $wikiId=self::LOCAL)
Get revision ID.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, Authority $forPerformer=null, array $hints=[])
Cache for ParserOutput objects corresponding to the latest page revisions.
getDirty(PageRecord $page, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
runForTitle(PageIdentity $pageIdentity)
static newDynamic(PageIdentity $page, array $params)
static newPrioritized(PageIdentity $page, array $params)
__construct(PageIdentity $page, array $params)
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page.
isAlreadyRefreshed(WikiPage $page)
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition Title.php:2993
static newFromName( $name, $validate='valid')
Definition User.php:607
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition User.php:648
Class representing a MediaWiki article and history.
Definition WikiPage.php:60
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:740
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getId( $wikiId=self::LOCAL)
Definition WikiPage.php:584
getTitle()
Get the title object of the article.
Definition WikiPage.php:311
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:816
getTouched()
Get the page_touched field.
Definition WikiPage.php:718
Interface for objects (potentially) representing an editable wiki page.
const DB_PRIMARY
Definition defines.php:27