MediaWiki REL1_35
RefreshLinksJob.php
Go to the documentation of this file.
<?php

use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionRenderer;

/**
 * Job to update link tables for pages.
 *
 * run() handles three variants, selected by the job parameters:
 *   - 'recursive' is set: the backlinks of the job title are partitioned into
 *     per-title leaf jobs (and possibly a remnant recursive job), which are pushed
 *     back onto the job queue. Nothing is refreshed by the base job itself.
 *   - 'pages' is set: links are refreshed for every (namespace, DB key) pair listed.
 *   - otherwise: links are refreshed for the job title itself.
 */
class RefreshLinksJob extends Job {
	/** @var int Seconds added to the root job timestamp as a safety margin before comparing */
	private const NORMAL_MAX_LAG = 10;
	/** @var int Timeout passed to waitForReplication() (presumably seconds; confirm against LBFactory) */
	private const LAG_WAIT_TIMEOUT = 15;

	/**
	 * @param Title $title Title the job is bound to
	 * @param array $params Job parameters; 'causeAction' and 'causeAgent' default to 'unknown'
	 */
	public function __construct( Title $title, array $params ) {
		parent::__construct( 'refreshLinks', $title, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) !== 1 )
		);
		$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}

	/**
	 * Create a job that is queued under the 'refreshLinksPrioritized' command.
	 *
	 * @param Title $title
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( Title $title, array $params ) {
		$job = new self( $title, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * Create a job that is queued under the 'refreshLinksDynamic' command.
	 *
	 * @param Title $title
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( Title $title, array $params ) {
		$job = new self( $title, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	/**
	 * Run the job.
	 *
	 * @return bool False if any title could not be refreshed (see getLastError())
	 */
	public function run() {
		$ok = true;

		// Job to update all (or a range of) backlink pages for a page
		if ( !empty( $this->params['recursive'] ) ) {
			$services = MediaWikiServices::getInstance();
			// When the base job branches, wait for the replica DBs to catch up to the master.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				$caughtUp = $lbFactory->waitForReplication( [
					'domain' => $lbFactory->getLocalDomainID(),
					'timeout' => self::LAG_WAIT_TIMEOUT
				] );
				if ( !$caughtUp ) {
					// Only try so hard; record the failure and carry on
					$services->getStatsdDataFactory()->increment( 'refreshlinks.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( 'UpdateRowsPerJob' ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			JobQueueGroup::singleton()->push( $jobs );
		// Job to update link tables for a set of titles
		} elseif ( isset( $this->params['pages'] ) ) {
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title ) {
					// Keep going after a failure so the remaining titles still get refreshed
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}
		// Job to update link tables for a given title
		} else {
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}

	/**
	 * Refresh the link tables of a single page.
	 *
	 * @param Title $title
	 * @return bool False if the page lock could not be acquired or no parser output
	 *  could be obtained; true if the update ran or was skipped as already done
	 */
	protected function runForTitle( Title $title ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the master DB
		$page = WikiPage::factory( $title );
		$page->loadPageData( WikiPage::READ_LATEST );

		// Serialize link update job by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getMainLB()->getConnectionRef( DB_MASTER );
		// The lock object is never read; it is only held in a local so that the lock
		// lives until this method returns.
		/** @noinspection PhpUnusedLocalVariableInspection */
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$stats->increment( 'refreshlinks.lock_failure' );

			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			$stats->increment( 'refreshlinks.update_skipped' );

			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginMasterChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitMasterChanges( __METHOD__ );

		if ( !$output ) {
			return false; // raced out?
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		// Clear the info cache for the title now that its derived data changed
		InfoAction::invalidateCache( $title );

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * Compute the timestamp that cached/derived data must reach to reflect the root job.
	 *
	 * @param string $rootTimestamp The job's 'rootJobTimestamp' parameter
	 * @return string|false $rootTimestamp itself for opportunistic jobs; otherwise the
	 *  TS_MW form of $rootTimestamp plus NORMAL_MAX_LAG seconds (whatever wfTimestamp() yields)
	 */
	private function getLagAwareTimestamp( $rootTimestamp ) {
		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			return $rootTimestamp;
		}

		// For transclusion updates, the template changes must be reflected
		return wfTimestamp(
			TS_MW,
			wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
		);
	}

	/**
	 * Check whether the page's links were already updated after the triggering change.
	 *
	 * @param WikiPage $page
	 * @return bool Always false when the job has no 'rootJobTimestamp' parameter
	 */
	private function isAlreadyRefreshed( WikiPage $page ) {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return false;
		}

		return ( $page->getLinksTimestamp() > $this->getLagAwareTimestamp( $rootTimestamp ) );
	}

	/**
	 * Get the parser output if the page is unchanged from what was loaded in $page.
	 *
	 * A usable parser cache entry is preferred; otherwise the current revision is
	 * rendered without generating HTML.
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null if the current revision check failed
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			return null; // race condition?
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		if ( $cachedOutput ) {
			return $cachedOutput;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$page->makeParserOptions( 'canonical' ),
			null,
			[ 'audience' => $revision::RAW ]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [ 'generate-html' => false ] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		return $output;
	}

	/**
	 * Get the current revision record if it is unchanged from what was loaded in $page.
	 *
	 * On failure a 'refreshlinks.rev_not_current' or 'refreshlinks.rev_not_found'
	 * counter is incremented and the job's last error is set.
	 *
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsdDataFactoryInterface $stats
	 * @return RevisionRecord|null The same instance that would be used by WikiPage
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsdDataFactoryInterface $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key name.
		$latest = $title->getLatestRevID( Title::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		// Normalize to int before the strict comparison: a parameter that arrives as a
		// numeric string would otherwise never match and the job would always be dropped.
		if ( $triggeringRevisionId && (int)$triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );

			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			$stats->increment( 'refreshlinks.rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

			return null; // just deleted?
		}

		if ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	/**
	 * Get the parser output from cache if it reflects the change that triggered this job.
	 *
	 * Increments 'refreshlinks.parser_cached' or 'refreshlinks.parser_uncached'
	 * depending on the outcome.
	 *
	 * @param ParserCache $parserCache
	 * @param WikiPage $page
	 * @param RevisionRecord $currentRevision
	 * @param StatsdDataFactoryInterface $stats
	 * @return ParserOutput|null Null when the job has no 'rootJobTimestamp' or the
	 *  cached entry is missing, for another revision, or too old
	 */
	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsdDataFactoryInterface $stats
	) {
		$cachedOutput = null;
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			$lagAwareTimestamp = $this->getLagAwareTimestamp( $rootTimestamp );

			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// As long as the cache rev ID matches the current rev ID and it reflects
				// the job's triggering change, then it is usable.
				$parserOptions = $page->makeParserOptions( 'canonical' );
				$output = $parserCache->getDirty( $page, $parserOptions );
				if (
					$output &&
					$output->getCacheRevisionId() == $currentRevision->getId() &&
					$output->getCacheTime() >= $lagAwareTimestamp
				) {
					$cachedOutput = $output;
				}
			}
		}

		$stats->increment(
			$cachedOutput ? 'refreshlinks.parser_cached' : 'refreshlinks.parser_uncached'
		);

		return $cachedOutput;
	}

	/**
	 * Build the options array handed to WikiPage::doSecondaryDataUpdates().
	 *
	 * @return array Has 'recursive', 'causeAction' and 'causeAgent'; also has
	 *  'triggeringUser' when the job carries a non-empty 'triggeringUser' parameter
	 */
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}

	/**
	 * Get the job description used for duplicate detection.
	 *
	 * @return array
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		// NOTE(review): these remove top-level keys, while the job parameters appear to
		// live under $info['params'] (see below). If so, these two lines are no-ops and
		// the cause fields still take part in de-duplication. Confirm against
		// Job::getDeduplicationInfo() before changing.
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	/**
	 * Get the number of pages this job refreshes itself.
	 *
	 * @return int 0 for recursive jobs, the number of listed pages for 'pages' jobs, else 1
	 */
	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
static partitionBacklinkJob(Job $job, $bSize, $cSize, $opts=[])
Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition job that covers the remaining backlink range (if needed).
static invalidateCache(Title $title, $revid=null)
Clear the info cache for a given Title.
Class to both describe a background job and handle jobs.
Definition Job.php:32
Title $title
Definition Job.php:43
getRootJobParams()
Stable to override.
Definition Job.php:354
setLastError( $error)
Definition Job.php:461
array $params
Array of job parameters.
Definition Job.php:37
MediaWikiServices is the service locator for the application scope of MediaWiki.
Page revision base class.
The RevisionRenderer service provides access to rendered output for revisions.
getRenderedRevision(RevisionRecord $rev, ParserOptions $options=null, User $forUser=null, array $hints=[])
getDirty(WikiPage $wikiPage, $popts)
Retrieve the ParserOutput from ParserCache, even if it's outdated.
Job to update link tables for pages.
getDeduplicationInfo()
Subclasses may need to override this to make duplication detection work.
run()
Run the job.
getParserOutput(RevisionRenderer $renderer, ParserCache $parserCache, WikiPage $page, StatsdDataFactoryInterface $stats)
Get the parser output if the page is unchanged from what was loaded in $page.
workItemCount()
Stable to override.
static newPrioritized(Title $title, array $params)
static newDynamic(Title $title, array $params)
getCurrentRevisionIfUnchanged(WikiPage $page, StatsdDataFactoryInterface $stats)
Get the current revision record if it is unchanged from what was loaded in $page.
isAlreadyRefreshed(WikiPage $page)
runForTitle(Title $title)
__construct(Title $title, array $params)
getParserOutputFromCache(ParserCache $parserCache, WikiPage $page, RevisionRecord $currentRevision, StatsdDataFactoryInterface $stats)
Get the parser output from cache if it reflects the change that triggered this job.
Represents a title within MediaWiki.
Definition Title.php:42
getLatestRevID( $flags=0)
What is the page_latest field for this page?
Definition Title.php:3311
static newFromName( $name, $validate='valid')
Static factory method for creation from username.
Definition User.php:541
static newFromId( $id)
Static factory method for creation from a given user ID.
Definition User.php:565
Class representing a MediaWiki article and history.
Definition WikiPage.php:51
getLinksTimestamp()
Get the page_links_updated field.
Definition WikiPage.php:685
makeParserOptions( $context)
Get parser options suitable for rendering the primary article wikitext.
getTitle()
Get the title object of the article.
Definition WikiPage.php:318
getRevisionRecord()
Get the latest revision.
Definition WikiPage.php:781
getTouched()
Get the page_touched field.
Definition WikiPage.php:674
const DB_MASTER
Definition defines.php:29
if(count( $args)< 1) $job