Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
60.52% |
141 / 233 |
|
18.75% |
3 / 16 |
CRAP | |
0.00% |
0 / 1 |
RefreshLinksJob | |
60.52% |
141 / 233 |
|
18.75% |
3 / 16 |
362.09 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
5 | |||
newPrioritized | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
newDynamic | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
run | |
27.27% |
9 / 33 |
|
0.00% |
0 / 1 |
40.16 | |||
runForTitle | |
92.86% |
39 / 42 |
|
0.00% |
0 / 1 |
5.01 | |||
getLagAwareRootTimestamp | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
3.01 | |||
isAlreadyRefreshed | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
shouldGenerateHTMLOnEdit | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
getParserOutput | |
56.92% |
37 / 65 |
|
0.00% |
0 / 1 |
32.99 | |||
getCurrentRevisionIfUnchanged | |
64.71% |
11 / 17 |
|
0.00% |
0 / 1 |
7.58 | |||
getParserOutputFromCache | |
55.56% |
5 / 9 |
|
0.00% |
0 / 1 |
11.30 | |||
canUseParserOutputFromCache | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
incrementFailureCounter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getDataUpdateOptions | |
63.64% |
7 / 11 |
|
0.00% |
0 / 1 |
3.43 | |||
getDeduplicationInfo | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
workItemCount | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | |
21 | use MediaWiki\Deferred\LinksUpdate\LinksUpdate; |
22 | use MediaWiki\Deferred\RefreshSecondaryDataUpdate; |
23 | use MediaWiki\Logger\LoggerFactory; |
24 | use MediaWiki\MainConfigNames; |
25 | use MediaWiki\MediaWikiServices; |
26 | use MediaWiki\Page\PageAssertionException; |
27 | use MediaWiki\Page\PageIdentity; |
28 | use MediaWiki\Parser\ParserCache; |
29 | use MediaWiki\Parser\ParserOutput; |
30 | use MediaWiki\Revision\RevisionRecord; |
31 | use MediaWiki\Revision\RevisionRenderer; |
32 | use MediaWiki\Revision\SlotRecord; |
33 | use MediaWiki\Title\Title; |
34 | use MediaWiki\User\User; |
35 | use MediaWiki\WikiMap\WikiMap; |
36 | use Wikimedia\Rdbms\IDBAccessObject; |
37 | use Wikimedia\Stats\StatsFactory; |
38 | |
39 | /** |
40 | * Job to update link tables for rerendered wiki pages. |
41 | * |
42 | * This job comes in a few variants: |
43 | * |
44 | * - a) Recursive jobs to update links for backlink pages for a given title. |
45 | * Scheduled by {@see LinksUpdate::queueRecursiveJobsForTable()}; used to |
46 | * refresh pages which link/transclude a given title. |
47 | * These jobs have (recursive:true,table:<table>) set. They just look up |
48 | * which pages link to the job title and schedule them as a set of non-recursive |
49 | * RefreshLinksJob jobs (and possible one new recursive job as a way of |
50 | * continuation). |
51 | * - b) Jobs to update links for a set of pages (the job title is ignored). |
52 | * These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set. |
53 | * - c) Jobs to update links for a single page (the job title). |
54 | * These jobs need no extra fields set. |
55 | * |
56 | * Job parameters for all jobs: |
57 | * - recursive (bool): When false, updates the current page. When true, updates |
58 | * the pages which link/transclude the current page. |
59 | * - triggeringRevisionId (int): The revision of the edit which caused the link |
60 | * refresh. For manually triggered updates, the last revision of the page (at the |
61 | * time of scheduling). |
62 | * - triggeringUser (array): The user who triggered the refresh, in the form of a |
63 | * [ 'userId' => int, 'userName' => string ] array. This is not necessarily the user |
64 | * who created the revision. |
65 | * - triggeredRecursive (bool): Set on all jobs which were partitioned from another, |
66 | * recursive job. For debugging. |
67 | * - Standard deduplication params (see {@see JobQueue::deduplicateRootJob()}). |
68 | * For recursive jobs: |
69 | * - table (string): Which table to use (imagelinks or templatelinks) when searching for |
70 | * affected pages. |
71 | * - range (array): Used for recursive jobs when some pages have already been partitioned |
72 | * into separate jobs. Contains the list of ranges that still need to be partitioned. |
73 | * See {@see BacklinkJobUtils::partitionBacklinkJob()}. |
74 | * - division: Number of times the job was partitioned already (for debugging). |
75 | * For non-recursive jobs: |
76 | * - pages (array): Associative array of [ <page ID> => [ <namespace>, <dbkey> ] ]. |
77 | * Might be omitted, then the job title will be used. |
78 | * - isOpportunistic (bool): Set for opportunistic single-page updates. These are "free" |
79 | * updates that are queued when most of the work needed to be performed anyway for |
80 | * non-linkrefresh-related reasons, and can be more easily discarded if they don't seem |
81 | * useful. See {@see WikiPage::triggerOpportunisticLinksUpdate()}. |
82 | * - useRecursiveLinksUpdate (bool): When true, triggers recursive jobs for each page. |
83 | * |
84 | * Metrics: |
85 | * - `refreshlinks_superseded_updates_total`: The number of times the job was cancelled |
86 | * because the target page had already been refreshed by a different edit or job. |
87 | * The job is considered to have succeeded in this case. |
88 | * |
89 | * - `refreshlinks_warnings_total`: The number of times the job failed due to a recoverable issue. |
90 | * Possible `reason` label values include: |
91 | * - `lag_wait_failed`: The job timed out while waiting for replication. |
92 | * |
93 | * - `refreshlinks_failures_total`: The number of times the job failed. |
94 | * The `reason` label may be: |
95 | * - `page_not_found`: The target page did not exist. |
96 | * - `rev_not_current`: The target revision was no longer the latest revision for the target page. |
97 | * - `rev_not_found`: The target revision was not found. |
98 | * - `lock_failure`: The job failed to acquire an exclusive lock to refresh the target page. |
99 | * |
100 | * - `refreshlinks_parsercache_operations_total`: The number of times the job attempted |
101 | * to fetch parser output from the parser cache. |
102 | * Possible `status` label values include: |
103 | * - `cache_hit`: The parser output was found in the cache. |
104 | * - `cache_miss`: The parser output was not found in the cache. |
105 | * |
106 | * @ingroup JobQueue |
107 | * @see RefreshSecondaryDataUpdate |
108 | * @see WikiPage::doSecondaryDataUpdates() |
109 | */ |
110 | class RefreshLinksJob extends Job { |
111 | /** @var int Lag safety margin when comparing root job times to last-refresh times */ |
112 | private const NORMAL_MAX_LAG = 10; |
113 | /** @var int How many seconds to wait for replica DBs to catch up */ |
114 | private const LAG_WAIT_TIMEOUT = 15; |
115 | |
116 | public function __construct( PageIdentity $page, array $params ) { |
117 | if ( empty( $params['pages'] ) && !$page->canExist() ) { |
118 | // BC with the Title class |
119 | throw new PageAssertionException( |
120 | 'The given PageIdentity {pageIdentity} does not represent a proper page', |
121 | [ 'pageIdentity' => $page ] |
122 | ); |
123 | } |
124 | |
125 | parent::__construct( 'refreshLinks', $page, $params ); |
126 | // Avoid the overhead of de-duplication when it would be pointless |
127 | $this->removeDuplicates = ( |
128 | // Ranges rarely will line up |
129 | !isset( $params['range'] ) && |
130 | // Multiple pages per job make matches unlikely |
131 | !( isset( $params['pages'] ) && count( $params['pages'] ) != 1 ) |
132 | ); |
133 | $this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ]; |
134 | // Tell JobRunner to not automatically wrap run() in a transaction round. |
135 | // Each runForTitle() call will manage its own rounds in order to run DataUpdates |
136 | // and to avoid contention as well. |
137 | $this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND; |
138 | } |
139 | |
140 | /** |
141 | * @param PageIdentity $page |
142 | * @param array $params |
143 | * @return RefreshLinksJob |
144 | */ |
145 | public static function newPrioritized( PageIdentity $page, array $params ) { |
146 | $job = new self( $page, $params ); |
147 | $job->command = 'refreshLinksPrioritized'; |
148 | |
149 | return $job; |
150 | } |
151 | |
152 | /** |
153 | * @param PageIdentity $page |
154 | * @param array $params |
155 | * @return RefreshLinksJob |
156 | */ |
157 | public static function newDynamic( PageIdentity $page, array $params ) { |
158 | $job = new self( $page, $params ); |
159 | $job->command = 'refreshLinksDynamic'; |
160 | |
161 | return $job; |
162 | } |
163 | |
	/**
	 * Entry point: dispatch to one of the three job variants.
	 *
	 * Recursive jobs are partitioned into per-title leaf jobs (plus possibly
	 * one continuation job); 'pages' jobs refresh each listed title; plain
	 * jobs refresh the job title itself.
	 *
	 * @return bool Success; false causes the job to be retried
	 */
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsFactory();
					$stats->getCounter( 'refreshlinks_warnings_total' )
						->setLabel( 'reason', 'lag_wait_failed' )
						->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
						->increment();
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles;
			// one invalid title marks the whole job failed, but the
			// remaining titles are still processed.
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}
222 | |
	/**
	 * Refresh the link tables for a single page.
	 *
	 * Loads the page from the primary DB, takes a per-page lock, bails out if
	 * the update was superseded or raced out, then regenerates (or reuses)
	 * parser output and runs the secondary data updates against it.
	 *
	 * @param PageIdentity $pageIdentity
	 * @return bool Success; false requests a retry of the job
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$this->incrementFailureCounter( $stats, 'page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		/** @noinspection PhpUnusedLocalVariableInspection */
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$this->incrementFailureCounter( $stats, 'lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by overlapping recursive job
			// for a different template edit, or by direct edit or purge.
			$stats->getCounter( 'refreshlinks_superseded_updates_total' )
				->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
				->increment();
			// treat as success
			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// Don't retry job.
			return true;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}
313 | |
314 | /** |
315 | * @return string|null Minimum lag-safe TS_MW timestamp with regard to root job creation |
316 | */ |
317 | private function getLagAwareRootTimestamp() { |
318 | // Get the timestamp of the change that triggered this job |
319 | $rootTimestamp = $this->params['rootJobTimestamp'] ?? null; |
320 | if ( $rootTimestamp === null ) { |
321 | return null; |
322 | } |
323 | |
324 | if ( !empty( $this->params['isOpportunistic'] ) ) { |
325 | // Neither clock skew nor DB snapshot/replica DB lag matter much for |
326 | // such updates; focus on reusing the (often recently updated) cache |
327 | $lagAwareTimestamp = $rootTimestamp; |
328 | } else { |
329 | // For transclusion updates, the template changes must be reflected |
330 | $lagAwareTimestamp = wfTimestamp( |
331 | TS_MW, |
332 | (int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG |
333 | ); |
334 | } |
335 | |
336 | return $lagAwareTimestamp; |
337 | } |
338 | |
339 | /** |
340 | * @param WikiPage $page |
341 | * @return bool Whether something updated the backlinks with data newer than this job |
342 | */ |
343 | private function isAlreadyRefreshed( WikiPage $page ) { |
344 | $lagAwareTimestamp = $this->getLagAwareRootTimestamp(); |
345 | |
346 | return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp ); |
347 | } |
348 | |
349 | /** |
350 | * @see DerivedPageDataUpdater::shouldGenerateHTMLOnEdit |
351 | * @return bool true if at least one of slots require rendering HTML on edit, false otherwise. |
352 | * This is needed for example in populating ParserCache. |
353 | */ |
354 | private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool { |
355 | $services = MediaWikiServices::getInstance(); |
356 | foreach ( $revision->getSlots()->getSlotRoles() as $role ) { |
357 | $slot = $revision->getSlots()->getSlot( $role ); |
358 | $contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() ); |
359 | if ( $contentHandler->generateHTMLOnEdit() ) { |
360 | return true; |
361 | } |
362 | } |
363 | return false; |
364 | } |
365 | |
	/**
	 * Get the parser output if the page is unchanged from what was loaded in $page
	 *
	 * Tries the ParserCache first; on a miss (or unusable entry) re-renders the
	 * current revision, optionally feeding the stale cached output back in to
	 * enable selective (incremental) updates.
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsFactory $stats
	 * @return ParserOutput|null Combined output for all slots; might only contain
	 *  metadata. Null when the page/revision changed underneath the job.
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsFactory $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition? Failure metric already recorded by
			// getCurrentRevisionIfUnchanged().
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		$statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' );

		if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
			$statsCounter
				->setLabel( 'status', 'cache_hit' )
				->setLabel( 'html_changed', 'n/a' )
				->copyToStatsdAt( 'refreshlinks.parser_cached' )
				->increment();

			return $cachedOutput;
		}

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$parserOptions = $page->makeParserOptions( 'canonical' );

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::ParsoidSelectiveUpdateSampleRate
		);
		$doSample = $sampleRate && mt_rand( 1, $sampleRate ) === 1;
		if ( $doSample && $cachedOutput === null ) {
			// In order to collect accurate statistics, check for
			// a dirty copy in the cache even if we wouldn't have
			// to otherwise.
			$cachedOutput = $parserCache->getDirty( $page, $parserOptions ) ?: null;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$parserOptions,
			null,
			[
				'audience' => $revision::RAW,
				'causeAction' => $causeAction,
				// Providing a previous parse potentially allows for
				// selective updates
				'previous-output' => $cachedOutput,
			]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()
		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		if ( $doSample ) {
			$content = $revision->getContent( SlotRecord::MAIN );
			$labels = [
				'source' => 'RefreshLinksJob',
				'type' => $cachedOutput === null ? 'full' : 'selective',
				'reason' => $causeAction,
				'parser' => $parserOptions->getUseParsoid() ? 'parsoid' : 'legacy',
				'opportunistic' => empty( $this->params['isOpportunistic'] ) ? 'false' : 'true',
				'wiki' => WikiMap::getCurrentWikiId(),
				'model' => $content ? $content->getModel() : 'unknown',
			];
			$stats
				->getCounter( 'ParserCache_selective_total' )
				->setLabels( $labels )
				->increment();
			$stats
				->getCounter( 'ParserCache_selective_cpu_seconds' )
				->setLabels( $labels )
				->incrementBy( $output->getTimeProfile( 'cpu' ) );
		}

		// Collect stats on parses that don't actually change the page content.
		// In that case, we could abort here, and perhaps we could also avoid
		// triggering CDN purges (T369898).
		if ( !$cachedOutput ) {
			// There was no cached output
			$htmlChanged = 'unknown';
		} elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
			// We have cached output, but we couldn't be sure that it was still good.
			// So we parsed again, but the result turned out to be the same HTML as
			// before.
			$htmlChanged = 'no';
		} else {
			// Re-parsing yielded HTML different from the cached output.
			$htmlChanged = 'yes';
		}

		$statsCounter
			->setLabel( 'status', 'cache_miss' )
			->setLabel( 'html_changed', $htmlChanged )
			->copyToStatsdAt( 'refreshlinks.parser_uncached' )
			->increment();

		return $output;
	}
482 | |
	/**
	 * Get the current revision record if it is unchanged from what was loaded in $page
	 *
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsFactory $stats
	 * @return RevisionRecord|null The same instance that $page->getRevisionRecord() uses;
	 *  null (with a failure metric and last-error set) when the job is obsolete
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsFactory $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key name
		$latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );
			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			// revision just got deleted?
			$this->incrementFailureCounter( $stats, 'rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return null;

		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs where FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}
530 | |
531 | /** |
532 | * Get the parser output from cache if it reflects the change that triggered this job |
533 | * |
534 | * @param ParserCache $parserCache |
535 | * @param WikiPage $page |
536 | * @param RevisionRecord $currentRevision |
537 | * @param StatsFactory $stats |
538 | * @return ParserOutput|null |
539 | */ |
540 | private function getParserOutputFromCache( |
541 | ParserCache $parserCache, |
542 | WikiPage $page, |
543 | RevisionRecord $currentRevision, |
544 | StatsFactory $stats |
545 | ): ?ParserOutput { |
546 | // Parsoid can do selective updates, so it is always worth the I/O |
547 | // to check for a previous parse. |
548 | $parserOptions = $page->makeParserOptions( 'canonical' ); |
549 | if ( $parserOptions->getUseParsoid() ) { |
550 | return $parserCache->getDirty( $page, $parserOptions ) ?: null; |
551 | } |
552 | // If page_touched changed after this root job, then it is likely that |
553 | // any views of the pages already resulted in re-parses which are now in |
554 | // cache. The cache can be reused to avoid expensive parsing in some cases. |
555 | $rootTimestamp = $this->params['rootJobTimestamp'] ?? null; |
556 | if ( $rootTimestamp !== null ) { |
557 | $opportunistic = !empty( $this->params['isOpportunistic'] ); |
558 | if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) { |
559 | // Cache is suspected to be up-to-date so it's worth the I/O of checking. |
560 | // We call canUseParserOutputFromCache() later to check if it's usable. |
561 | return $parserCache->getDirty( $page, $parserOptions ) ?: null; |
562 | } |
563 | } |
564 | |
565 | return null; |
566 | } |
567 | |
568 | private function canUseParserOutputFromCache( |
569 | ParserOutput $cachedOutput, |
570 | RevisionRecord $currentRevision |
571 | ) { |
572 | // As long as the cache rev ID matches the current rev ID and it reflects |
573 | // the job's triggering change, then it is usable. |
574 | return $cachedOutput->getCacheRevisionId() == $currentRevision->getId() |
575 | && $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp(); |
576 | } |
577 | |
578 | /** |
579 | * Increment the RefreshLinks failure counter metric with the given reason. |
580 | * |
581 | * @param StatsFactory $stats |
582 | * @param string $reason Well-known failure reason string |
583 | * @return void |
584 | */ |
585 | private function incrementFailureCounter( StatsFactory $stats, $reason ): void { |
586 | $stats->getCounter( 'refreshlinks_failures_total' ) |
587 | ->setLabel( 'reason', $reason ) |
588 | ->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" ) |
589 | ->increment(); |
590 | } |
591 | |
592 | /** |
593 | * @return array |
594 | */ |
595 | private function getDataUpdateOptions() { |
596 | $options = [ |
597 | 'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ), |
598 | // Carry over cause so the update can do extra logging |
599 | 'causeAction' => $this->params['causeAction'], |
600 | 'causeAgent' => $this->params['causeAgent'] |
601 | ]; |
602 | if ( !empty( $this->params['triggeringUser'] ) ) { |
603 | $userInfo = $this->params['triggeringUser']; |
604 | if ( $userInfo['userId'] ) { |
605 | $options['triggeringUser'] = User::newFromId( $userInfo['userId'] ); |
606 | } else { |
607 | // Anonymous, use the username |
608 | $options['triggeringUser'] = User::newFromName( $userInfo['userName'], false ); |
609 | } |
610 | } |
611 | |
612 | return $options; |
613 | } |
614 | |
615 | public function getDeduplicationInfo() { |
616 | $info = parent::getDeduplicationInfo(); |
617 | unset( $info['causeAction'] ); |
618 | unset( $info['causeAgent'] ); |
619 | if ( is_array( $info['params'] ) ) { |
620 | // For per-pages jobs, the job title is that of the template that changed |
621 | // (or similar), so remove that since it ruins duplicate detection |
622 | if ( isset( $info['params']['pages'] ) ) { |
623 | unset( $info['namespace'] ); |
624 | unset( $info['title'] ); |
625 | } |
626 | } |
627 | |
628 | return $info; |
629 | } |
630 | |
631 | public function workItemCount() { |
632 | if ( !empty( $this->params['recursive'] ) ) { |
633 | return 0; // nothing actually refreshed |
634 | } elseif ( isset( $this->params['pages'] ) ) { |
635 | return count( $this->params['pages'] ); |
636 | } |
637 | |
638 | return 1; // one title |
639 | } |
640 | } |