Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
60.94% |
142 / 233 |
|
18.75% |
3 / 16 |
CRAP | |
0.00% |
0 / 1 |
| RefreshLinksJob | |
61.21% |
142 / 232 |
|
18.75% |
3 / 16 |
374.64 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
5 | |||
| newPrioritized | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| newDynamic | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| run | |
28.12% |
9 / 32 |
|
0.00% |
0 / 1 |
39.08 | |||
| runForTitle | |
92.68% |
38 / 41 |
|
0.00% |
0 / 1 |
5.01 | |||
| getLagAwareRootTimestamp | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
3.01 | |||
| isAlreadyRefreshed | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| shouldGenerateHTMLOnEdit | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
| getParserOutput | |
59.70% |
40 / 67 |
|
0.00% |
0 / 1 |
39.20 | |||
| getCurrentRevisionIfUnchanged | |
64.71% |
11 / 17 |
|
0.00% |
0 / 1 |
7.58 | |||
| getParserOutputFromCache | |
55.56% |
5 / 9 |
|
0.00% |
0 / 1 |
11.30 | |||
| canUseParserOutputFromCache | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| incrementFailureCounter | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| getDataUpdateOptions | |
63.64% |
7 / 11 |
|
0.00% |
0 / 1 |
3.43 | |||
| getDeduplicationInfo | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| workItemCount | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | * @file |
| 5 | */ |
| 6 | |
| 7 | namespace MediaWiki\JobQueue\Jobs; |
| 8 | |
| 9 | use MediaWiki\Actions\InfoAction; |
| 10 | use MediaWiki\Deferred\LinksUpdate\LinksUpdate; |
| 11 | use MediaWiki\Deferred\RefreshSecondaryDataUpdate; |
| 12 | use MediaWiki\JobQueue\Job; |
| 13 | use MediaWiki\JobQueue\Utils\BacklinkJobUtils; |
| 14 | use MediaWiki\Logger\LoggerFactory; |
| 15 | use MediaWiki\MainConfigNames; |
| 16 | use MediaWiki\MediaWikiServices; |
| 17 | use MediaWiki\Page\PageAssertionException; |
| 18 | use MediaWiki\Page\PageIdentity; |
| 19 | use MediaWiki\Page\WikiPage; |
| 20 | use MediaWiki\Parser\ParserCache; |
| 21 | use MediaWiki\Parser\ParserOutput; |
| 22 | use MediaWiki\Parser\ParserOutputFlags; |
| 23 | use MediaWiki\Revision\RevisionRecord; |
| 24 | use MediaWiki\Revision\RevisionRenderer; |
| 25 | use MediaWiki\Revision\SlotRecord; |
| 26 | use MediaWiki\Title\Title; |
| 27 | use MediaWiki\User\User; |
| 28 | use MediaWiki\WikiMap\WikiMap; |
| 29 | use Wikimedia\Rdbms\IDBAccessObject; |
| 30 | use Wikimedia\Stats\StatsFactory; |
| 31 | use Wikimedia\Timestamp\TimestampFormat as TS; |
| 32 | |
| 33 | /** |
| 34 | * Job to update link tables for rerendered wiki pages. |
| 35 | * |
| 36 | * This job comes in a few variants: |
| 37 | * |
| 38 | * - a) Recursive jobs to update links for backlink pages for a given title. |
| 39 | * Scheduled by {@see LinksUpdate::queueRecursiveJobsForTable()}; used to |
| 40 | * refresh pages which link/transclude a given title. |
| 41 | * These jobs have (recursive:true,table:<table>) set. They just look up |
| 42 | * which pages link to the job title and schedule them as a set of non-recursive |
| 43 | * RefreshLinksJob jobs (and possible one new recursive job as a way of |
| 44 | * continuation). |
| 45 | * - b) Jobs to update links for a set of pages (the job title is ignored). |
| 46 | * These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set. |
| 47 | * - c) Jobs to update links for a single page (the job title). |
| 48 | * These jobs need no extra fields set. |
| 49 | * |
| 50 | * Job parameters for all jobs: |
| 51 | * - recursive (bool): When false, updates the current page. When true, updates |
| 52 | * the pages which link/transclude the current page. |
| 53 | * - triggeringRevisionId (int): The revision of the edit which caused the link |
| 54 | * refresh. For manually triggered updates, the last revision of the page (at the |
| 55 | * time of scheduling). |
| 56 | * - triggeringUser (array): The user who triggered the refresh, in the form of a |
| 57 | * [ 'userId' => int, 'userName' => string ] array. This is not necessarily the user |
| 58 | * who created the revision. |
| 59 | * - triggeredRecursive (bool): Set on all jobs which were partitioned from another, |
| 60 | * recursive job. For debugging. |
| 61 | * - Standard deduplication params (see {@see JobQueue::deduplicateRootJob()}). |
| 62 | * For recursive jobs: |
| 63 | * - table (string): Which table to use (imagelinks or templatelinks) when searching for |
| 64 | * affected pages. |
| 65 | * - range (array): Used for recursive jobs when some pages have already been partitioned |
| 66 | * into separate jobs. Contains the list of ranges that still need to be partitioned. |
| 67 | * See {@see BacklinkJobUtils::partitionBacklinkJob()}. |
| 68 | * - division: Number of times the job was partitioned already (for debugging). |
| 69 | * For non-recursive jobs: |
| 70 | * - pages (array): Associative array of [ <page ID> => [ <namespace>, <dbkey> ] ]. |
| 71 | * Might be omitted, then the job title will be used. |
| 72 | * - isOpportunistic (bool): Set for opportunistic single-page updates. These are "free" |
| 73 | * updates that are queued when most of the work needed to be performed anyway for |
| 74 | * non-linkrefresh-related reasons, and can be more easily discarded if they don't seem |
| 75 | * useful. See {@see WikiPage::triggerOpportunisticLinksUpdate()}. |
| 76 | * - useRecursiveLinksUpdate (bool): When true, triggers recursive jobs for each page. |
| 77 | * |
| 78 | * Metrics: |
| 79 | * - `refreshlinks_superseded_updates_total`: The number of times the job was cancelled |
| 80 | * because the target page had already been refreshed by a different edit or job. |
| 81 | * The job is considered to have succeeded in this case. |
| 82 | * |
| 83 | * - `refreshlinks_warnings_total`: The number of times the job failed due to a recoverable issue. |
| 84 | * Possible `reason` label values include: |
| 85 | * - `lag_wait_failed`: The job timed out while waiting for replication. |
| 86 | * |
| 87 | * - `refreshlinks_failures_total`: The number of times the job failed. |
| 88 | * The `reason` label may be: |
| 89 | * - `page_not_found`: The target page did not exist. |
| 90 | * - `rev_not_current`: The target revision was no longer the latest revision for the target page. |
| 91 | * - `rev_not_found`: The target revision was not found. |
| 92 | * - `lock_failure`: The job failed to acquire an exclusive lock to refresh the target page. |
| 93 | * |
| 94 | * - `refreshlinks_parsercache_operations_total`: The number of times the job attempted |
| 95 | * to fetch parser output from the parser cache. |
| 96 | * Possible `status` label values include: |
| 97 | * - `cache_hit`: The parser output was found in the cache. |
| 98 | * - `cache_miss`: The parser output was not found in the cache. |
| 99 | * |
| 100 | * @ingroup JobQueue |
| 101 | * @see RefreshSecondaryDataUpdate |
| 102 | * @see WikiPage::doSecondaryDataUpdates() |
| 103 | */ |
| 104 | class RefreshLinksJob extends Job { |
	/** @var int Lag safety margin, in seconds, when comparing root job times to last-refresh times */
	private const NORMAL_MAX_LAG = 10;
	/** @var int How many seconds to wait for replica DBs to catch up before giving up */
	private const LAG_WAIT_TIMEOUT = 15;
| 109 | |
| 110 | public function __construct( PageIdentity $page, array $params ) { |
| 111 | if ( empty( $params['pages'] ) && !$page->canExist() ) { |
| 112 | // BC with the Title class |
| 113 | throw new PageAssertionException( |
| 114 | 'The given PageIdentity {pageIdentity} does not represent a proper page', |
| 115 | [ 'pageIdentity' => $page ] |
| 116 | ); |
| 117 | } |
| 118 | |
| 119 | parent::__construct( 'refreshLinks', $page, $params ); |
| 120 | // Avoid the overhead of de-duplication when it would be pointless |
| 121 | $this->removeDuplicates = ( |
| 122 | // Ranges rarely will line up |
| 123 | !isset( $params['range'] ) && |
| 124 | // Multiple pages per job make matches unlikely |
| 125 | !( isset( $params['pages'] ) && count( $params['pages'] ) != 1 ) |
| 126 | ); |
| 127 | $this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ]; |
| 128 | // Tell JobRunner to not automatically wrap run() in a transaction round. |
| 129 | // Each runForTitle() call will manage its own rounds in order to run DataUpdates |
| 130 | // and to avoid contention as well. |
| 131 | $this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND; |
| 132 | } |
| 133 | |
| 134 | /** |
| 135 | * @param PageIdentity $page |
| 136 | * @param array $params |
| 137 | * @return RefreshLinksJob |
| 138 | */ |
| 139 | public static function newPrioritized( PageIdentity $page, array $params ) { |
| 140 | $job = new self( $page, $params ); |
| 141 | $job->command = 'refreshLinksPrioritized'; |
| 142 | |
| 143 | return $job; |
| 144 | } |
| 145 | |
| 146 | /** |
| 147 | * @param PageIdentity $page |
| 148 | * @param array $params |
| 149 | * @return RefreshLinksJob |
| 150 | */ |
| 151 | public static function newDynamic( PageIdentity $page, array $params ) { |
| 152 | $job = new self( $page, $params ); |
| 153 | $job->command = 'refreshLinksDynamic'; |
| 154 | |
| 155 | return $job; |
| 156 | } |
| 157 | |
	/**
	 * Execute the job: either partition a recursive job into per-title jobs, or
	 * refresh the link tables for the listed pages (or the job title itself).
	 *
	 * @inheritDoc
	 */
	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsFactory();
					$stats->getCounter( 'refreshlinks_warnings_total' )
						->setLabel( 'reason', 'lag_wait_failed' )
						->increment();
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					// A single failed title makes the whole job fail (and retry later)
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}
| 216 | |
	/**
	 * Refresh the link tables for one page: acquire the per-page lock, bail if
	 * superseded, render (or reuse) the parser output, and run secondary data updates.
	 *
	 * @param PageIdentity $pageIdentity
	 * @return bool Success; false makes the job queue retry the job later
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$this->incrementFailureCounter( $stats, 'page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired to bail if they are changed from that of loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		/** @noinspection PhpUnusedLocalVariableInspection */
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$this->incrementFailureCounter( $stats, 'lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// this job has been superseded, e.g. by overlapping recursive job
			// for a different template edit, or by direct edit or purge.
			$stats->getCounter( 'refreshlinks_superseded_updates_total' )
				->increment();
			// treat as success
			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			// Don't retry job.
			return true;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}
| 306 | |
| 307 | /** |
| 308 | * @return string|null Minimum lag-safe TS::MW timestamp with regard to root job creation |
| 309 | */ |
| 310 | private function getLagAwareRootTimestamp() { |
| 311 | // Get the timestamp of the change that triggered this job |
| 312 | $rootTimestamp = $this->params['rootJobTimestamp'] ?? null; |
| 313 | if ( $rootTimestamp === null ) { |
| 314 | return null; |
| 315 | } |
| 316 | |
| 317 | if ( !empty( $this->params['isOpportunistic'] ) ) { |
| 318 | // Neither clock skew nor DB snapshot/replica DB lag matter much for |
| 319 | // such updates; focus on reusing the (often recently updated) cache |
| 320 | $lagAwareTimestamp = $rootTimestamp; |
| 321 | } else { |
| 322 | // For transclusion updates, the template changes must be reflected |
| 323 | $lagAwareTimestamp = wfTimestamp( |
| 324 | TS::MW, |
| 325 | (int)wfTimestamp( TS::UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG |
| 326 | ); |
| 327 | } |
| 328 | |
| 329 | return $lagAwareTimestamp; |
| 330 | } |
| 331 | |
| 332 | /** |
| 333 | * @param WikiPage $page |
| 334 | * @return bool Whether something updated the backlinks with data newer than this job |
| 335 | */ |
| 336 | private function isAlreadyRefreshed( WikiPage $page ) { |
| 337 | $lagAwareTimestamp = $this->getLagAwareRootTimestamp(); |
| 338 | |
| 339 | return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp ); |
| 340 | } |
| 341 | |
| 342 | /** |
| 343 | * @see DerivedPageDataUpdater::shouldGenerateHTMLOnEdit |
| 344 | * @return bool true if at least one of slots require rendering HTML on edit, false otherwise. |
| 345 | * This is needed for example in populating ParserCache. |
| 346 | */ |
| 347 | private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool { |
| 348 | $services = MediaWikiServices::getInstance(); |
| 349 | foreach ( $revision->getSlots()->getSlotRoles() as $role ) { |
| 350 | $slot = $revision->getSlots()->getSlot( $role ); |
| 351 | $contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() ); |
| 352 | if ( $contentHandler->generateHTMLOnEdit() ) { |
| 353 | return true; |
| 354 | } |
| 355 | } |
| 356 | return false; |
| 357 | } |
| 358 | |
	/**
	 * Get the parser output if the page is unchanged from what was loaded in $page
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsFactory $stats
	 * @return ParserOutput|null Combined output for all slots; might only contain metadata.
	 *  Null when the page/revision changed concurrently (caller should not retry).
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsFactory $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition?
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		$statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' );

		// Fast path: a still-valid cached parse spares us a full render
		if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
			$statsCounter
				->setLabel( 'status', 'cache_hit' )
				->setLabel( 'html_changed', 'n/a' )
				->increment();

			return $cachedOutput;
		}

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$parserOptions = $page->makeParserOptions( 'canonical' );

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::ParsoidSelectiveUpdateSampleRate
		);
		$doSample = $sampleRate && mt_rand( 1, $sampleRate ) === 1;
		if ( $doSample && $cachedOutput === null ) {
			// In order to collect accurate statistics, check for
			// a dirty copy in the cache even if we wouldn't have
			// to otherwise.
			$cachedOutput = $parserCache->getDirty( $page, $parserOptions ) ?: null;
		}

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$parserOptions,
			null,
			[
				'audience' => $revision::RAW,
				'causeAction' => $causeAction,
				// Providing a previous parse potentially allows for
				// selective updates
				'previous-output' => $cachedOutput,
			]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		if ( $doSample ) {
			$content = $revision->getContent( SlotRecord::MAIN );
			$labels = [
				'source' => 'RefreshLinksJob',
				'type' => $cachedOutput === null ? 'full' : 'selective',
				'reason' => $causeAction,
				'parser' => $parserOptions->getUseParsoid() ? 'parsoid' : 'legacy',
				'opportunistic' => empty( $this->params['isOpportunistic'] ) ? 'false' : 'true',
				'wiki' => WikiMap::getCurrentWikiId(),
				'model' => $content ? $content->getModel() : 'unknown',
			];
			$stats
				->getCounter( 'ParserCache_selective_total' )
				->setLabels( $labels )
				->increment();
			$stats
				->getCounter( 'ParserCache_selective_cpu_seconds' )
				->setLabels( $labels )
				->incrementBy( $output->getTimeProfile( 'cpu' ) );
		}

		// Collect stats on parses that don't actually change the page content.
		// In that case, we could abort here, and perhaps we could also avoid
		// triggering CDN purges (T369898).
		if ( !$cachedOutput || !$output->hasText() ) {
			// There was no cached output, or no HTML was generated because
			// shouldGenerateHTMLOnEdit returned false.
			$htmlChanged = 'unknown';
		} elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
			// We have cached output, but we couldn't be sure that it was still good.
			// So we parsed again, but the result turned out to be the same HTML as
			// before.
			$htmlChanged = 'no';
		} else {
			// Re-parsing yielded HTML different from the cached output.
			$htmlChanged = 'yes';
		}

		$statsCounter
			->setLabel( 'status', 'cache_miss' )
			->setLabel( 'html_changed', $htmlChanged )
			->setLabel( 'has_async_content',
				$output->getOutputFlag( ParserOutputFlags::HAS_ASYNC_CONTENT ) ? 'true' : 'false' )
			->setLabel( 'async_not_ready',
				$output->getOutputFlag( ParserOutputFlags::ASYNC_NOT_READY ) ? 'true' : 'false' )
			->increment();

		return $output;
	}
| 478 | |
| 479 | /** |
| 480 | * Get the current revision record if it is unchanged from what was loaded in $page |
| 481 | * |
| 482 | * @param WikiPage $page Page already loaded with READ_LATEST |
| 483 | * @param StatsFactory $stats |
| 484 | * @return RevisionRecord|null The same instance that $page->getRevisionRecord() uses |
| 485 | */ |
| 486 | private function getCurrentRevisionIfUnchanged( |
| 487 | WikiPage $page, |
| 488 | StatsFactory $stats |
| 489 | ) { |
| 490 | $title = $page->getTitle(); |
| 491 | // Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction. |
| 492 | // This is used to detect edits/moves after loadPageData() but before the scope lock. |
| 493 | // The works around the chicken/egg problem of determining the scope lock key name |
| 494 | $latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST ); |
| 495 | |
| 496 | $triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null; |
| 497 | if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) { |
| 498 | // This job is obsolete and one for the latest revision will handle updates |
| 499 | $this->incrementFailureCounter( $stats, 'rev_not_current' ); |
| 500 | $this->setLastError( "Revision $triggeringRevisionId is not current" ); |
| 501 | return null; |
| 502 | } |
| 503 | |
| 504 | // Load the current revision. Note that $page should have loaded with READ_LATEST. |
| 505 | // This instance will be reused in WikiPage::doSecondaryDataUpdates() later on. |
| 506 | $revision = $page->getRevisionRecord(); |
| 507 | if ( !$revision ) { |
| 508 | // revision just got deleted? |
| 509 | $this->incrementFailureCounter( $stats, 'rev_not_found' ); |
| 510 | $this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" ); |
| 511 | return null; |
| 512 | |
| 513 | } elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) { |
| 514 | // Do not clobber over newer updates with older ones. If all jobs where FIFO and |
| 515 | // serialized, it would be OK to update links based on older revisions since it |
| 516 | // would eventually get to the latest. Since that is not the case (by design), |
| 517 | // only update the link tables to a state matching the current revision's output. |
| 518 | $this->incrementFailureCounter( $stats, 'rev_not_current' ); |
| 519 | $this->setLastError( "Revision {$revision->getId()} is not current" ); |
| 520 | |
| 521 | return null; |
| 522 | } |
| 523 | |
| 524 | return $revision; |
| 525 | } |
| 526 | |
| 527 | /** |
| 528 | * Get the parser output from cache if it reflects the change that triggered this job |
| 529 | * |
| 530 | * @param ParserCache $parserCache |
| 531 | * @param WikiPage $page |
| 532 | * @param RevisionRecord $currentRevision |
| 533 | * @param StatsFactory $stats |
| 534 | * @return ParserOutput|null |
| 535 | */ |
| 536 | private function getParserOutputFromCache( |
| 537 | ParserCache $parserCache, |
| 538 | WikiPage $page, |
| 539 | RevisionRecord $currentRevision, |
| 540 | StatsFactory $stats |
| 541 | ): ?ParserOutput { |
| 542 | // Parsoid can do selective updates, so it is always worth the I/O |
| 543 | // to check for a previous parse. |
| 544 | $parserOptions = $page->makeParserOptions( 'canonical' ); |
| 545 | if ( $parserOptions->getUseParsoid() ) { |
| 546 | return $parserCache->getDirty( $page, $parserOptions ) ?: null; |
| 547 | } |
| 548 | // If page_touched changed after this root job, then it is likely that |
| 549 | // any views of the pages already resulted in re-parses which are now in |
| 550 | // cache. The cache can be reused to avoid expensive parsing in some cases. |
| 551 | $rootTimestamp = $this->params['rootJobTimestamp'] ?? null; |
| 552 | if ( $rootTimestamp !== null ) { |
| 553 | $opportunistic = !empty( $this->params['isOpportunistic'] ); |
| 554 | if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) { |
| 555 | // Cache is suspected to be up-to-date so it's worth the I/O of checking. |
| 556 | // We call canUseParserOutputFromCache() later to check if it's usable. |
| 557 | return $parserCache->getDirty( $page, $parserOptions ) ?: null; |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | return null; |
| 562 | } |
| 563 | |
| 564 | private function canUseParserOutputFromCache( |
| 565 | ParserOutput $cachedOutput, |
| 566 | RevisionRecord $currentRevision |
| 567 | ): bool { |
| 568 | // As long as the cache rev ID matches the current rev ID and it reflects |
| 569 | // the job's triggering change, then it is usable. |
| 570 | return $cachedOutput->getCacheRevisionId() == $currentRevision->getId() |
| 571 | && $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp(); |
| 572 | } |
| 573 | |
| 574 | /** |
| 575 | * Increment the RefreshLinks failure counter metric with the given reason. |
| 576 | * |
| 577 | * @param StatsFactory $stats |
| 578 | * @param string $reason Well-known failure reason string |
| 579 | * @return void |
| 580 | */ |
| 581 | private function incrementFailureCounter( StatsFactory $stats, $reason ): void { |
| 582 | $stats->getCounter( 'refreshlinks_failures_total' ) |
| 583 | ->setLabel( 'reason', $reason ) |
| 584 | ->increment(); |
| 585 | } |
| 586 | |
| 587 | /** |
| 588 | * @return array |
| 589 | */ |
| 590 | private function getDataUpdateOptions() { |
| 591 | $options = [ |
| 592 | 'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ), |
| 593 | // Carry over cause so the update can do extra logging |
| 594 | 'causeAction' => $this->params['causeAction'], |
| 595 | 'causeAgent' => $this->params['causeAgent'] |
| 596 | ]; |
| 597 | if ( !empty( $this->params['triggeringUser'] ) ) { |
| 598 | $userInfo = $this->params['triggeringUser']; |
| 599 | '@phan-var array{userId:int,userName:string} $userInfo'; |
| 600 | if ( $userInfo['userId'] ) { |
| 601 | $options['triggeringUser'] = User::newFromId( $userInfo['userId'] ); |
| 602 | } else { |
| 603 | // Anonymous, use the username |
| 604 | $options['triggeringUser'] = User::newFromName( $userInfo['userName'], false ); |
| 605 | } |
| 606 | } |
| 607 | |
| 608 | return $options; |
| 609 | } |
| 610 | |
	/** @inheritDoc */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		// Jobs with different causes should still count as duplicates.
		// NOTE(review): these unset top-level keys; if causeAction/causeAgent live
		// under $info['params'] (as job parameters typically do), these may be
		// no-ops — confirm against Job::getDeduplicationInfo().
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}
| 627 | |
| 628 | /** @inheritDoc */ |
| 629 | public function workItemCount() { |
| 630 | if ( !empty( $this->params['recursive'] ) ) { |
| 631 | return 0; // nothing actually refreshed |
| 632 | } elseif ( isset( $this->params['pages'] ) ) { |
| 633 | return count( $this->params['pages'] ); |
| 634 | } |
| 635 | |
| 636 | return 1; // one title |
| 637 | } |
| 638 | } |
| 639 | |
/** @deprecated class alias since 1.44; use MediaWiki\JobQueue\Jobs\RefreshLinksJob instead */
class_alias( RefreshLinksJob::class, 'RefreshLinksJob' );