Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
60.92% |
145 / 238 |
|
18.75% |
3 / 16 |
CRAP | |
0.00% |
0 / 1 |
RefreshLinksJob | |
61.18% |
145 / 237 |
|
18.75% |
3 / 16 |
365.87 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
5 | |||
newPrioritized | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
newDynamic | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
run | |
27.27% |
9 / 33 |
|
0.00% |
0 / 1 |
40.16 | |||
runForTitle | |
92.86% |
39 / 42 |
|
0.00% |
0 / 1 |
5.01 | |||
getLagAwareRootTimestamp | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
3.01 | |||
isAlreadyRefreshed | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
shouldGenerateHTMLOnEdit | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
getParserOutput | |
59.42% |
41 / 69 |
|
0.00% |
0 / 1 |
36.31 | |||
getCurrentRevisionIfUnchanged | |
64.71% |
11 / 17 |
|
0.00% |
0 / 1 |
7.58 | |||
getParserOutputFromCache | |
55.56% |
5 / 9 |
|
0.00% |
0 / 1 |
11.30 | |||
canUseParserOutputFromCache | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
incrementFailureCounter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getDataUpdateOptions | |
63.64% |
7 / 11 |
|
0.00% |
0 / 1 |
3.43 | |||
getDeduplicationInfo | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
workItemCount | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | |
21 | namespace MediaWiki\JobQueue\Jobs; |
22 | |
23 | use MediaWiki\Actions\InfoAction; |
24 | use MediaWiki\Deferred\LinksUpdate\LinksUpdate; |
25 | use MediaWiki\Deferred\RefreshSecondaryDataUpdate; |
26 | use MediaWiki\JobQueue\Job; |
27 | use MediaWiki\JobQueue\Utils\BacklinkJobUtils; |
28 | use MediaWiki\Logger\LoggerFactory; |
29 | use MediaWiki\MainConfigNames; |
30 | use MediaWiki\MediaWikiServices; |
31 | use MediaWiki\Page\PageAssertionException; |
32 | use MediaWiki\Page\PageIdentity; |
33 | use MediaWiki\Page\WikiPage; |
34 | use MediaWiki\Parser\ParserCache; |
35 | use MediaWiki\Parser\ParserOutput; |
36 | use MediaWiki\Parser\ParserOutputFlags; |
37 | use MediaWiki\Revision\RevisionRecord; |
38 | use MediaWiki\Revision\RevisionRenderer; |
39 | use MediaWiki\Revision\SlotRecord; |
40 | use MediaWiki\Title\Title; |
41 | use MediaWiki\User\User; |
42 | use MediaWiki\WikiMap\WikiMap; |
43 | use Wikimedia\Rdbms\IDBAccessObject; |
44 | use Wikimedia\Stats\StatsFactory; |
45 | |
46 | /** |
47 | * Job to update link tables for rerendered wiki pages. |
48 | * |
49 | * This job comes in a few variants: |
50 | * |
51 | * - a) Recursive jobs to update links for backlink pages for a given title. |
52 | * Scheduled by {@see LinksUpdate::queueRecursiveJobsForTable()}; used to |
53 | * refresh pages which link/transclude a given title. |
54 | * These jobs have (recursive:true,table:<table>) set. They just look up |
55 | * which pages link to the job title and schedule them as a set of non-recursive |
56 | * RefreshLinksJob jobs (and possibly one new recursive job as a way of |
57 | * continuation). |
58 | * - b) Jobs to update links for a set of pages (the job title is ignored). |
59 | * These jobs have (pages:(<page ID>:(<namespace>,<title>),...)) set. |
60 | * - c) Jobs to update links for a single page (the job title). |
61 | * These jobs need no extra fields set. |
62 | * |
63 | * Job parameters for all jobs: |
64 | * - recursive (bool): When false, updates the current page. When true, updates |
65 | * the pages which link/transclude the current page. |
66 | * - triggeringRevisionId (int): The revision of the edit which caused the link |
67 | * refresh. For manually triggered updates, the last revision of the page (at the |
68 | * time of scheduling). |
69 | * - triggeringUser (array): The user who triggered the refresh, in the form of a |
70 | * [ 'userId' => int, 'userName' => string ] array. This is not necessarily the user |
71 | * who created the revision. |
72 | * - triggeredRecursive (bool): Set on all jobs which were partitioned from another, |
73 | * recursive job. For debugging. |
74 | * - Standard deduplication params (see {@see JobQueue::deduplicateRootJob()}). |
75 | * For recursive jobs: |
76 | * - table (string): Which table to use (imagelinks or templatelinks) when searching for |
77 | * affected pages. |
78 | * - range (array): Used for recursive jobs when some pages have already been partitioned |
79 | * into separate jobs. Contains the list of ranges that still need to be partitioned. |
80 | * See {@see BacklinkJobUtils::partitionBacklinkJob()}. |
81 | * - division: Number of times the job was partitioned already (for debugging). |
82 | * For non-recursive jobs: |
83 | * - pages (array): Associative array of [ <page ID> => [ <namespace>, <dbkey> ] ]. |
84 | * Might be omitted, then the job title will be used. |
85 | * - isOpportunistic (bool): Set for opportunistic single-page updates. These are "free" |
86 | * updates that are queued when most of the work needed to be performed anyway for |
87 | * non-linkrefresh-related reasons, and can be more easily discarded if they don't seem |
88 | * useful. See {@see WikiPage::triggerOpportunisticLinksUpdate()}. |
89 | * - useRecursiveLinksUpdate (bool): When true, triggers recursive jobs for each page. |
90 | * |
91 | * Metrics: |
92 | * - `refreshlinks_superseded_updates_total`: The number of times the job was cancelled |
93 | * because the target page had already been refreshed by a different edit or job. |
94 | * The job is considered to have succeeded in this case. |
95 | * |
96 | * - `refreshlinks_warnings_total`: The number of times the job failed due to a recoverable issue. |
97 | * Possible `reason` label values include: |
98 | * - `lag_wait_failed`: The job timed out while waiting for replication. |
99 | * |
100 | * - `refreshlinks_failures_total`: The number of times the job failed. |
101 | * The `reason` label may be: |
102 | * - `page_not_found`: The target page did not exist. |
103 | * - `rev_not_current`: The target revision was no longer the latest revision for the target page. |
104 | * - `rev_not_found`: The target revision was not found. |
105 | * - `lock_failure`: The job failed to acquire an exclusive lock to refresh the target page. |
106 | * |
107 | * - `refreshlinks_parsercache_operations_total`: The number of times the job attempted |
108 | * to fetch parser output from the parser cache. |
109 | * Possible `status` label values include: |
110 | * - `cache_hit`: The parser output was found in the cache. |
111 | * - `cache_miss`: The parser output was not found in the cache. |
112 | * |
113 | * @ingroup JobQueue |
114 | * @see RefreshSecondaryDataUpdate |
115 | * @see WikiPage::doSecondaryDataUpdates() |
116 | */ |
117 | class RefreshLinksJob extends Job { |
118 | /** @var int Lag safety margin when comparing root job times to last-refresh times */ |
119 | private const NORMAL_MAX_LAG = 10; |
120 | /** @var int How many seconds to wait for replica DBs to catch up */ |
121 | private const LAG_WAIT_TIMEOUT = 15; |
122 | |
/**
 * @param PageIdentity $page Job title. Must be able to exist unless a non-empty
 *  'pages' parameter is given.
 * @param array $params Job parameters, see the class documentation.
 * @throws PageAssertionException If no pages are listed and $page cannot exist.
 */
public function __construct( PageIdentity $page, array $params ) {
	$hasPageList = !empty( $params['pages'] );
	if ( !$hasPageList && !$page->canExist() ) {
		// BC with the Title class
		throw new PageAssertionException(
			'The given PageIdentity {pageIdentity} does not represent a proper page',
			[ 'pageIdentity' => $page ]
		);
	}

	parent::__construct( 'refreshLinks', $page, $params );

	// Skip the overhead of de-duplication in the cases where it would be pointless.
	if ( isset( $params['range'] ) ) {
		// Ranges rarely will line up
		$worthDeduplicating = false;
	} elseif ( isset( $params['pages'] ) && count( $params['pages'] ) != 1 ) {
		// Multiple pages per job make matches unlikely
		$worthDeduplicating = false;
	} else {
		$worthDeduplicating = true;
	}
	$this->removeDuplicates = $worthDeduplicating;

	// Fill in default cause information without overriding values supplied by the caller.
	$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];

	// Tell JobRunner to not automatically wrap run() in a transaction round.
	// Each runForTitle() call will manage its own rounds in order to run DataUpdates
	// and to avoid contention as well.
	$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
}
146 | |
/**
 * Create a job whose command is set to 'refreshLinksPrioritized'.
 *
 * @param PageIdentity $page
 * @param array $params Same parameters as accepted by the constructor
 * @return RefreshLinksJob
 */
public static function newPrioritized( PageIdentity $page, array $params ) {
	$prioritizedJob = new self( $page, $params );
	$prioritizedJob->command = 'refreshLinksPrioritized';
	return $prioritizedJob;
}
158 | |
/**
 * Create a job whose command is set to 'refreshLinksDynamic'.
 *
 * @param PageIdentity $page
 * @param array $params Same parameters as accepted by the constructor
 * @return RefreshLinksJob
 */
public static function newDynamic( PageIdentity $page, array $params ) {
	$dynamicJob = new self( $page, $params );
	$dynamicJob->command = 'refreshLinksDynamic';
	return $dynamicJob;
}
170 | |
/**
 * Execute the job according to its variant: partition a recursive job into
 * per-title jobs, refresh every page of a 'pages' list, or refresh the job title.
 *
 * @return bool True on success. For a 'pages' list, false if any listed title was
 *  invalid or any runForTitle() call returned false.
 */
public function run() {
	if ( !empty( $this->params['recursive'] ) ) {
		// Job to update all (or a range of) backlink pages for a page

		// When the base job branches, wait for the replica DBs to catch up to the primary.
		// From then on, we know that any template changes at the time the base job was
		// enqueued will be reflected in backlink page parses when the leaf jobs run.
		$services = MediaWikiServices::getInstance();
		if ( !isset( $this->params['range'] ) ) {
			$caughtUp = $services->getDBLoadBalancerFactory()->waitForReplication( [
				'timeout' => self::LAG_WAIT_TIMEOUT
			] );
			if ( !$caughtUp ) {
				// only try so hard, keep going with what we have
				$services->getStatsFactory()
					->getCounter( 'refreshlinks_warnings_total' )
					->setLabel( 'reason', 'lag_wait_failed' )
					->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
					->increment();
			}
		}

		// Carry over information for de-duplication
		$extraParams = $this->getRootJobParams();
		$extraParams['triggeredRecursive'] = true;
		// Carry over cause information for logging
		$extraParams['causeAction'] = $this->params['causeAction'];
		$extraParams['causeAgent'] = $this->params['causeAgent'];

		// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
		// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
		$rowsPerJob = $services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob );
		$jobs = BacklinkJobUtils::partitionBacklinkJob(
			$this,
			$rowsPerJob,
			1, // job-per-title
			[ 'params' => $extraParams ]
		);
		$services->getJobQueueGroup()->push( $jobs );

		return true;
	}

	if ( !isset( $this->params['pages'] ) ) {
		// Job to update link tables for a given title
		return $this->runForTitle( $this->title );
	}

	// Job to update link tables for a set of titles
	$allSucceeded = true;
	foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
		$title = Title::makeTitleSafe( $ns, $dbKey );
		if ( !$title || !$title->canExist() ) {
			$allSucceeded = false;
			$this->setLastError( "Invalid title ($ns,$dbKey)." );
			continue;
		}
		// Keep processing the remaining pages even after a failure
		if ( !$this->runForTitle( $title ) ) {
			$allSucceeded = false;
		}
	}

	return $allSucceeded;
}
229 | |
/**
 * Refresh the secondary data (link tables etc.) of a single page.
 *
 * @param PageIdentity $pageIdentity Page to refresh. For jobs carrying a 'pages'
 *  list this is one of the listed pages, not the job title.
 * @return bool False when the page does not exist or its page lock could not be
 *  acquired (both are meant to be retried later); true otherwise, including when
 *  the update was skipped as superseded or raced out.
 */
protected function runForTitle( PageIdentity $pageIdentity ) {
	$services = MediaWikiServices::getInstance();
	$stats = $services->getStatsFactory();
	$renderer = $services->getRevisionRenderer();
	$parserCache = $services->getParserCache();
	$lbFactory = $services->getDBLoadBalancerFactory();
	$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

	// Load the page from the primary DB
	$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
	$page->loadPageData( IDBAccessObject::READ_LATEST );

	if ( !$page->exists() ) {
		// Probably due to concurrent deletion or renaming of the page
		$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
		$logger->warning(
			'The page does not exist. Perhaps it was deleted?',
			[
				// Report the page that is actually missing. $this->title is only the
				// job title, which differs from the processed page when the job
				// carries a 'pages' list.
				'page_title' => $page->getTitle()->getPrefixedDBkey(),
				'job_params' => $this->getParams(),
				'job_metadata' => $this->getMetadata()
			]
		);
		$this->incrementFailureCounter( $stats, 'page_not_found' );

		// retry later to handle unlucky race condition
		return false;
	}

	// Serialize link update jobs by page ID so they see each others' changes.
	// The page ID and latest revision ID will be queried again after the lock
	// is acquired to bail if they are changed from that of loadPageData() above.
	$dbw = $lbFactory->getPrimaryDatabase();
	/** @noinspection PhpUnusedLocalVariableInspection */
	$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
	if ( $scopedLock === null ) {
		// Another job is already updating the page, likely for a prior revision (T170596)
		$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
		$this->incrementFailureCounter( $stats, 'lock_failure' );

		// retry later when overlapping job for previous rev is done
		return false;
	}

	if ( $this->isAlreadyRefreshed( $page ) ) {
		// this job has been superseded, e.g. by overlapping recursive job
		// for a different template edit, or by direct edit or purge.
		$stats->getCounter( 'refreshlinks_superseded_updates_total' )
			->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
			->increment();
		// treat as success
		return true;
	}

	// Parse during a fresh transaction round for better read consistency
	$lbFactory->beginPrimaryChanges( __METHOD__ );
	$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
	$options = $this->getDataUpdateOptions();
	$lbFactory->commitPrimaryChanges( __METHOD__ );

	if ( !$output ) {
		// probably raced out.
		// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
		// Don't retry job.
		return true;
	}

	// Tell DerivedPageDataUpdater to use this parser output
	$options['known-revision-output'] = $output;
	// Execute corresponding DataUpdates immediately
	$page->doSecondaryDataUpdates( $options );
	InfoAction::invalidateCache( $page );

	// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
	// This means the page will have to be rendered on-the-fly when it is next viewed.
	// This is to avoid spending limited ParserCache capacity on rarely visited pages.
	// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
	// for pages that are likely to benefit (T327162).

	// Commit any writes here in case this method is called in a loop.
	// In that case, the scoped lock will fail to be acquired.
	$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

	return true;
}
320 | |
/**
 * Compute the timestamp against which "already refreshed" and "cache is fresh
 * enough" checks are made.
 *
 * @return string|null Minimum lag-safe TS_MW timestamp with regard to root job creation,
 *  or null if the job has no 'rootJobTimestamp' parameter
 */
private function getLagAwareRootTimestamp() {
	// Timestamp of the change that triggered this job
	$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
	if ( $rootTimestamp === null ) {
		return null;
	}

	if ( !empty( $this->params['isOpportunistic'] ) ) {
		// Neither clock skew nor DB snapshot/replica DB lag matter much for
		// such updates; focus on reusing the (often recently updated) cache
		return $rootTimestamp;
	}

	// For transclusion updates, the template changes must be reflected,
	// so pad the root timestamp with the lag safety margin.
	$rootUnixTime = (int)wfTimestamp( TS_UNIX, $rootTimestamp );

	return wfTimestamp( TS_MW, $rootUnixTime + self::NORMAL_MAX_LAG );
}
345 | |
/**
 * @param WikiPage $page
 * @return bool Whether something updated the backlinks with data newer than this job.
 *  Always false when the job has no root timestamp to compare against.
 */
private function isAlreadyRefreshed( WikiPage $page ) {
	$threshold = $this->getLagAwareRootTimestamp();
	if ( $threshold === null ) {
		return false;
	}

	return $page->getLinksTimestamp() > $threshold;
}
355 | |
/**
 * Check whether the content handler of any slot of the revision asks for HTML
 * to be generated on edit.
 *
 * @see DerivedPageDataUpdater::shouldGenerateHTMLOnEdit
 * @param RevisionRecord $revision
 * @return bool true if at least one slot requires rendering HTML on edit, false otherwise.
 *  This is needed for example in populating ParserCache.
 */
private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
	$services = MediaWikiServices::getInstance();
	$slots = $revision->getSlots();
	foreach ( $slots->getSlotRoles() as $role ) {
		$model = $slots->getSlot( $role )->getModel();
		$handler = $services->getContentHandlerFactory()->getContentHandler( $model );
		if ( $handler->generateHTMLOnEdit() ) {
			return true;
		}
	}

	return false;
}
372 | |
/**
 * Get the parser output if the page is unchanged from what was loaded in $page
 *
 * A cached output is returned as-is when canUseParserOutputFromCache() accepts it;
 * otherwise the revision is rendered again, passing any cached output along as
 * 'previous-output'. Either way the `refreshlinks_parsercache_operations_total`
 * counter is incremented, with `status` set to `cache_hit` or `cache_miss`.
 *
 * @param RevisionRenderer $renderer
 * @param ParserCache $parserCache
 * @param WikiPage $page Page already loaded with READ_LATEST
 * @param StatsFactory $stats
 * @return ParserOutput|null Combined output for all slots; might only contain metadata.
 *  Null if getCurrentRevisionIfUnchanged() did not return a revision.
 */
private function getParserOutput(
	RevisionRenderer $renderer,
	ParserCache $parserCache,
	WikiPage $page,
	StatsFactory $stats
) {
	$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
	if ( !$revision ) {
		// race condition?
		return null;
	}

	$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
	$statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' );

	if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
		// Cache hit: nothing was re-parsed, so whether the HTML changed is not applicable.
		$statsCounter
			->setLabel( 'status', 'cache_hit' )
			->setLabel( 'html_changed', 'n/a' )
			->copyToStatsdAt( 'refreshlinks.parser_cached' )
			->increment();

		return $cachedOutput;
	}

	// Same fallback value that the constructor fills in for 'causeAction'.
	$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
	$parserOptions = $page->makeParserOptions( 'canonical' );

	// T371713: Temporary statistics collection code to determine
	// feasibility of Parsoid selective update
	$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::ParsoidSelectiveUpdateSampleRate
	);
	// A falsy sample rate disables sampling; a rate of N samples about one in N parses.
	$doSample = $sampleRate && mt_rand( 1, $sampleRate ) === 1;
	if ( $doSample && $cachedOutput === null ) {
		// In order to collect accurate statistics, check for
		// a dirty copy in the cache even if we wouldn't have
		// to otherwise.
		$cachedOutput = $parserCache->getDirty( $page, $parserOptions ) ?: null;
	}

	$renderedRevision = $renderer->getRenderedRevision(
		$revision,
		$parserOptions,
		null,
		[
			'audience' => $revision::RAW,
			'causeAction' => $causeAction,
			// Providing a previous parse potentially allows for
			// selective updates
			'previous-output' => $cachedOutput,
		]
	);

	$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
	$output = $renderedRevision->getRevisionParserOutput( [
		// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
		'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
	] );
	$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()
	// T371713: Temporary statistics collection code to determine
	// feasibility of Parsoid selective update
	if ( $doSample ) {
		$content = $revision->getContent( SlotRecord::MAIN );
		// 'type' is 'selective' whenever a previous output was handed to the renderer.
		$labels = [
			'source' => 'RefreshLinksJob',
			'type' => $cachedOutput === null ? 'full' : 'selective',
			'reason' => $causeAction,
			'parser' => $parserOptions->getUseParsoid() ? 'parsoid' : 'legacy',
			'opportunistic' => empty( $this->params['isOpportunistic'] ) ? 'false' : 'true',
			'wiki' => WikiMap::getCurrentWikiId(),
			'model' => $content ? $content->getModel() : 'unknown',
		];
		$stats
			->getCounter( 'ParserCache_selective_total' )
			->setLabels( $labels )
			->increment();
		$stats
			->getCounter( 'ParserCache_selective_cpu_seconds' )
			->setLabels( $labels )
			->incrementBy( $output->getTimeProfile( 'cpu' ) );
	}

	// Collect stats on parses that don't actually change the page content.
	// In that case, we could abort here, and perhaps we could also avoid
	// triggering CDN purges (T369898).
	if ( !$cachedOutput ) {
		// There was no cached output
		$htmlChanged = 'unknown';
	} elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
		// We have cached output, but we couldn't be sure that it was still good.
		// So we parsed again, but the result turned out to be the same HTML as
		// before.
		$htmlChanged = 'no';
	} else {
		// Re-parsing yielded HTML different from the cached output.
		$htmlChanged = 'yes';
	}

	// Cache miss: record whether the re-parse changed the HTML and the async-content flags.
	$statsCounter
		->setLabel( 'status', 'cache_miss' )
		->setLabel( 'html_changed', $htmlChanged )
		->setLabel( 'has_async_content',
			$output->getOutputFlag( ParserOutputFlags::HAS_ASYNC_CONTENT ) ? 'true' : 'false' )
		->setLabel( 'async_not_ready',
			$output->getOutputFlag( ParserOutputFlags::ASYNC_NOT_READY ) ? 'true' : 'false' )
		->copyToStatsdAt( 'refreshlinks.parser_uncached' )
		->increment();

	return $output;
}
493 | |
/**
 * Get the current revision record if it is unchanged from what was loaded in $page
 *
 * Each rejection increments the failure counter with a matching reason and
 * records a last-error message on the job.
 *
 * @param WikiPage $page Page already loaded with READ_LATEST
 * @param StatsFactory $stats
 * @return RevisionRecord|null The same instance that $page->getRevisionRecord() uses,
 *  or null if the job is obsolete or the revision is missing or no longer current
 */
private function getCurrentRevisionIfUnchanged(
	WikiPage $page,
	StatsFactory $stats
) {
	$title = $page->getTitle();
	// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
	// This is used to detect edits/moves after loadPageData() but before the scope lock.
	// This works around the chicken/egg problem of determining the scope lock key name
	$latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );

	$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
	if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
		// This job is obsolete and one for the latest revision will handle updates
		$this->incrementFailureCounter( $stats, 'rev_not_current' );
		$this->setLastError( "Revision $triggeringRevisionId is not current" );

		return null;
	}

	// Load the current revision. Note that $page should have loaded with READ_LATEST.
	// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
	$revision = $page->getRevisionRecord();
	if ( !$revision ) {
		// revision just got deleted?
		$this->incrementFailureCounter( $stats, 'rev_not_found' );
		$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

		return null;
	}

	$isCurrent = $revision->getId() === $latest
		&& $revision->getPageId() === $page->getId();
	if ( !$isCurrent ) {
		// Do not clobber over newer updates with older ones. If all jobs were FIFO and
		// serialized, it would be OK to update links based on older revisions since it
		// would eventually get to the latest. Since that is not the case (by design),
		// only update the link tables to a state matching the current revision's output.
		$this->incrementFailureCounter( $stats, 'rev_not_current' );
		$this->setLastError( "Revision {$revision->getId()} is not current" );

		return null;
	}

	return $revision;
}
541 | |
/**
 * Fetch a (possibly dirty) parser cache entry when it is worth the I/O of checking
 *
 * Whether the returned entry may actually be used is decided separately by
 * canUseParserOutputFromCache().
 *
 * @param ParserCache $parserCache
 * @param WikiPage $page
 * @param RevisionRecord $currentRevision Not used by this method
 * @param StatsFactory $stats Not used by this method
 * @return ParserOutput|null Null if the cache was not consulted or had no entry
 */
private function getParserOutputFromCache(
	ParserCache $parserCache,
	WikiPage $page,
	RevisionRecord $currentRevision,
	StatsFactory $stats
): ?ParserOutput {
	$parserOptions = $page->makeParserOptions( 'canonical' );

	// Parsoid can do selective updates, so it is always worth the I/O
	// to check for a previous parse.
	$worthChecking = $parserOptions->getUseParsoid();

	if ( !$worthChecking ) {
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			// Cache is suspected to be up-to-date in either of these cases.
			$worthChecking = $page->getTouched() >= $rootTimestamp || $opportunistic;
		}
	}

	if ( !$worthChecking ) {
		return null;
	}

	return $parserCache->getDirty( $page, $parserOptions ) ?: null;
}
578 | |
/**
 * Decide whether a cached parser output may stand in for a fresh parse
 *
 * The output is usable when its cache revision ID matches the current revision
 * and its cache time is not older than the lag-aware root timestamp.
 *
 * NOTE(review): getLagAwareRootTimestamp() returns null when the job has no
 * 'rootJobTimestamp'; the >= comparison is then made against null using PHP's
 * loose comparison rules. Confirm that accepting the cached output in that
 * case is intended.
 *
 * @param ParserOutput $cachedOutput
 * @param RevisionRecord $currentRevision
 * @return bool
 */
private function canUseParserOutputFromCache(
	ParserOutput $cachedOutput,
	RevisionRecord $currentRevision
): bool {
	if ( $cachedOutput->getCacheRevisionId() != $currentRevision->getId() ) {
		// Cached output belongs to a different revision
		return false;
	}

	// Usable only if it reflects the job's triggering change
	return $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp();
}
588 | |
/**
 * Bump the `refreshlinks_failures_total` counter, labelled with the given reason.
 *
 * @param StatsFactory $stats
 * @param string $reason Well-known failure reason string; also used as the suffix
 *  of the statsd key the counter is copied to
 * @return void
 */
private function incrementFailureCounter( StatsFactory $stats, $reason ): void {
	$failureCounter = $stats->getCounter( 'refreshlinks_failures_total' )
		->setLabel( 'reason', $reason );

	$failureCounter
		->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" )
		->increment();
}
602 | |
/**
 * Build the options array passed to WikiPage::doSecondaryDataUpdates().
 *
 * @return array With 'recursive', 'causeAction' and 'causeAgent' keys, plus
 *  'triggeringUser' when the job carries a non-empty 'triggeringUser' parameter
 */
private function getDataUpdateOptions() {
	$options = [
		'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
		// Carry over cause so the update can do extra logging
		'causeAction' => $this->params['causeAction'],
		'causeAgent' => $this->params['causeAgent']
	];

	$userInfo = $this->params['triggeringUser'] ?? null;
	if ( empty( $userInfo ) ) {
		return $options;
	}

	// Registered users are looked up by ID; anonymous ones only have a username
	$options['triggeringUser'] = $userInfo['userId']
		? User::newFromId( $userInfo['userId'] )
		: User::newFromName( $userInfo['userName'], false );

	return $options;
}
625 | |
/**
 * Get the de-duplication info of this job, ignoring details that should not
 * make two otherwise identical refreshes count as different jobs.
 *
 * @return array
 */
public function getDeduplicationInfo() {
	$info = parent::getDeduplicationInfo();
	// Retained from the previous implementation in case these keys ever
	// appear at the top level.
	unset( $info['causeAction'], $info['causeAgent'] );

	if ( is_array( $info['params'] ) ) {
		// The constructor stores the cause information in the job parameters, so
		// that is where it has to be removed for it to not affect duplicate detection.
		// NOTE(review): assumes parent::getDeduplicationInfo() exposes the job
		// parameters under 'params', as the 'pages' check below already does.
		unset( $info['params']['causeAction'], $info['params']['causeAgent'] );

		// For per-pages jobs, the job title is that of the template that changed
		// (or similar), so remove that since it ruins duplicate detection
		if ( isset( $info['params']['pages'] ) ) {
			unset( $info['namespace'], $info['title'] );
		}
	}

	return $info;
}
641 | |
/**
 * @return int Number of pages this job refreshes: 0 for recursive jobs, the size
 *  of the 'pages' list when given, and 1 otherwise
 */
public function workItemCount() {
	$isRecursive = !empty( $this->params['recursive'] );
	if ( $isRecursive ) {
		// nothing actually refreshed
		return 0;
	}

	// Without a 'pages' list the job covers exactly one title
	return isset( $this->params['pages'] ) ? count( $this->params['pages'] ) : 1;
}
651 | } |
652 | |
/**
 * Register the global name 'RefreshLinksJob' as an alias of the namespaced class.
 *
 * @deprecated class alias since 1.44
 */
class_alias( RefreshLinksJob::class, 'RefreshLinksJob' );