Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 163 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
| ArticleCompileProcessor | |
0.00% |
0 / 163 |
|
0.00% |
0 / 11 |
2970 | |
0.00% |
0 / 1 |
| getSafeComponentDbConfigForCompilation | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
| __construct | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
6 | |||
| newFromPageId | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| registerLinksUpdate | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| registerComponent | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| configComponentDb | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
20 | |||
| getLastEditTimestamp | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
| compileMetadata | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
72 | |||
| prepare | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
30 | |||
| process | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
110 | |||
| save | |
0.00% |
0 / 46 |
|
0.00% |
0 / 1 |
156 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Extension\PageTriage\ArticleCompile; |
| 4 | |
| 5 | use MediaWiki\Context\RequestContext; |
| 6 | use MediaWiki\Deferred\DeferredUpdates; |
| 7 | use MediaWiki\Deferred\LinksUpdate\LinksUpdate; |
| 8 | use MediaWiki\Extension\PageTriage\ArticleMetadata; |
| 9 | use MediaWiki\Extension\PageTriage\CompileArticleMetadataJob; |
| 10 | use MediaWiki\Extension\PageTriage\PageTriage; |
| 11 | use MediaWiki\Extension\PageTriage\PageTriageUtil; |
| 12 | use MediaWiki\Logger\LoggerFactory; |
| 13 | use MediaWiki\MediaWikiServices; |
| 14 | use MediaWiki\Page\WikiPage; |
| 15 | use MediaWiki\Title\Title; |
| 16 | use RuntimeException; |
| 17 | use Wikimedia\Rdbms\IDBAccessObject; |
| 18 | use Wikimedia\Stats\StatsFactory; |
| 19 | |
| 20 | /** |
| 21 | * Compiling metadata for articles |
| 22 | */ |
| 23 | class ArticleCompileProcessor { |
/** @var string[] Map of component name to 'on' or 'off' */
protected $component = [];

/** @var int[] Map of component name to the database to read from, DB_PRIMARY or DB_REPLICA */
protected $componentDb = [];

/** @var int[] List of page IDs */
protected $pageIds = [];

/** @var array Compiled metadata rows, keyed by page ID */
protected $metadata = [];

/**
 * @var bool True until a component is registered explicitly; while true, prepare()
 *  switches every component on.
 */
protected $defaultMode;

/** @var WikiPage[] */
protected $articles = [];

/** @var LinksUpdate[] Registered LinksUpdate objects, keyed by page ID */
protected $linksUpdates = [];

/** Stats factory scoped to the 'PageTriage' component */
private StatsFactory $statsFactory;

/** Compile and write the metadata to the database within the current call */
public const SAVE_IMMEDIATE = 0;
/** Compile from the replica and write the metadata in a deferred update */
public const SAVE_DEFERRED = 1;
/** Compile from the replica and write the metadata via the job queue */
public const SAVE_JOB = 2;
| 51 | |
/**
 * Component => database map that can be handed to self::configComponentDb() for
 * metadata compilation.
 *
 * Every component listed here is pointed at DB_REPLICA. BasicData is left out on purpose:
 * it accesses the `pagetriage_page` table, which may not necessarily be up to date in a
 * replica.
 *
 * @return array
 */
public static function getSafeComponentDbConfigForCompilation() {
	$replicaSafeComponents = [
		'LinkCount',
		'CategoryCount',
		'Snippet',
		'UserData',
		'DeletionTag',
		'AfcTag',
		'Recreated',
	];

	return array_fill_keys( $replicaSafeComponents, DB_REPLICA );
}
| 71 | |
/**
 * Use self::newFromPageId() to obtain an instance.
 *
 * @param int[] $pageIds List of page IDs.
 * @param StatsFactory $statsFactory
 */
private function __construct( $pageIds, StatsFactory $statsFactory ) {
	$componentNames = [
		'BasicData',
		'LinkCount',
		'CategoryCount',
		'Snippet',
		'UserData',
		'DeletionTag',
		'AfcTag',
		'Recreated',
	];

	$this->pageIds = $pageIds;
	// Every component starts switched off ...
	$this->component = array_fill_keys( $componentNames, 'off' );
	// ... and reads from the primary database unless configured otherwise
	$this->componentDb = array_fill_keys( $componentNames, DB_PRIMARY );
	// One empty metadata row per requested page
	$this->metadata = array_fill_keys( $pageIds, [] );
	$this->defaultMode = true;
	$this->statsFactory = $statsFactory->withComponent( 'PageTriage' );
}
| 98 | |
/**
 * Build a processor for the given pages.
 *
 * @param int[] $pageIds
 * @param bool $validated Whether the page IDs have already been validated; when false
 *  they are passed through ArticleMetadata::validatePageIds() first
 * @param int $validateDb const DB_PRIMARY/DB_REPLICA, the database used for validation
 * @return ArticleCompileProcessor|false False when no page IDs are left to process
 */
public static function newFromPageId(
	array $pageIds, $validated = true, $validateDb = DB_PRIMARY
) {
	$ids = $validated
		? $pageIds
		: ArticleMetadata::validatePageIds( $pageIds, $validateDb );

	if ( !$ids ) {
		return false;
	}

	return new self(
		$ids,
		MediaWikiServices::getInstance()->getStatsFactory()
	);
}
| 121 | |
/**
 * Remember a LinksUpdate so it is available when the page is compiled later.
 * Updates for pages this processor does not handle are ignored.
 *
 * @param LinksUpdate $linksUpdate
 */
public function registerLinksUpdate( LinksUpdate $linksUpdate ) {
	$pageId = $linksUpdate->getTitle()->getArticleID();
	if ( !in_array( $pageId, $this->pageIds ) ) {
		return;
	}

	$this->linksUpdates[$pageId] = $linksUpdate;
}
| 132 | |
/**
 * Switch on a single component for compilation. Unknown component names are ignored.
 *
 * Registering any known component leaves default mode, so only explicitly registered
 * components (plus the ones prepare() adds) get compiled.
 *
 * @param string $component
 */
public function registerComponent( $component ) {
	if ( !isset( $this->component[$component] ) ) {
		return;
	}

	$this->defaultMode = false;
	$this->component[$component] = 'on';
}
| 143 | |
/**
 * Config what db to use for each component
 *
 * Entries for unknown components, and entries whose value is not exactly DB_PRIMARY or
 * DB_REPLICA, are ignored and leave the current setting untouched.
 *
 * @param array $config
 * example: array( 'BasicData' => DB_REPLICA, 'UserData' => DB_PRIMARY )
 */
public function configComponentDb( $config ) {
	$dbMode = [ DB_PRIMARY, DB_REPLICA ];
	foreach ( array_keys( $this->componentDb ) as $key ) {
		// Strict comparison: a loose in_array() would also accept e.g. boolean true
		// (true == any non-zero int), and then store that value as the DB index.
		if ( isset( $config[$key] ) && in_array( $config[$key], $dbMode, true ) ) {
			$this->componentDb[$key] = $config[$key];
		}
	}
}
| 157 | |
/**
 * Get the timestamp of the last edit to a page
 *
 * Sources are tried in order: the registered LinksUpdate's revision record, the already
 * loaded WikiPage in $this->articles, then a WikiPage loaded by ID. Each source is skipped
 * when it cannot supply a timestamp.
 *
 * @param int $pageId Page ID
 * @return string Timestamp of last update, or current timestamp if not found
 */
protected function getLastEditTimestamp( $pageId ) {
	if ( isset( $this->linksUpdates[$pageId] ) ) {
		// MediaWiki core documents LinksUpdate::getRevisionRecord() as nullable, and a
		// revision record may not know its timestamp; fall through instead of failing
		// on null or returning a non-string.
		$revisionRecord = $this->linksUpdates[$pageId]->getRevisionRecord();
		$timestamp = $revisionRecord ? $revisionRecord->getTimestamp() : null;
		if ( $timestamp ) {
			return $timestamp;
		}
	}
	if ( isset( $this->articles[$pageId] ) ) {
		return $this->articles[$pageId]->getTimestamp();
	}
	// TODO deduplicate with ArticleCompileInterface::getArticleByPageId(), maybe move to this class
	// Read the latest data only when BasicData is configured to use the primary database
	$fromdb = $this->componentDb['BasicData'] === DB_PRIMARY ?
		IDBAccessObject::READ_LATEST : IDBAccessObject::READ_NORMAL;
	$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromID( $pageId, $fromdb );
	if ( $page ) {
		return $page->getTimestamp();
	}
	// Give up and return the current time
	return wfTimestampNow();
}
| 180 | |
/**
 * Compile the metadata for all pages of this processor and persist it according to $mode.
 *
 * @param int $mode Class SAVE_* constant
 *   - SAVE_IMMEDIATE = Reads and writes use the primary DB unless overridden through
 *     self::configComponentDb(). Callers should use self::configComponentDb() to move
 *     as many compilation components as possible to the replica.
 *   - SAVE_DEFERRED = Reads use the replica. The metadata is written to the database
 *     at the end of the request in a deferred update.
 *   - SAVE_JOB = Reads use the replica. The metadata is written to the database via
 *     the job queue. This mode should be the exception, not the norm – it is a
 *     safeguard that ensures metadata gets compiled in cases where the hook
 *     implementations missed generating the data.
 * @return array
 *   The compiled metadata.
 */
public function compileMetadata( $mode = self::SAVE_IMMEDIATE ) {
	$startedAt = microtime( true );

	// Deferred and job-queue saves read everything from the replica.
	if ( $mode == self::SAVE_DEFERRED || $mode == self::SAVE_JOB ) {
		foreach ( array_keys( $this->component ) as $name ) {
			$this->componentDb[$name] = DB_REPLICA;
		}
	}

	// Decide which components of metadata get compiled.
	$this->prepare();

	// Run the dedicated class of each component and collect the results in
	// $this->metadata, which is what gets saved and returned below.
	$this->process();

	if ( $mode == self::SAVE_JOB ) {
		// ArticleMetadata::getMetadata() uses this mode when article metadata is missing
		// and the request context is a GET.
		// The metadata compiled above (from a replica) is returned to the caller, but it
		// is not written to the database in this request; one job per page is queued to
		// do that later. Additionally, the metadata will be cached in memcache for 24 hours.
		// The log entry below can alert us to errors in our hook implementation.
		$jobs = array_map(
			static fn ( $pageId ) => new CompileArticleMetadataJob(
				Title::newMainPage(),
				[ 'pageId' => (int)$pageId ]
			),
			array_values( $this->pageIds )
		);
		MediaWikiServices::getInstance()->getJobQueueGroup()->push( $jobs );

		$logContext = [
			'exception' => new RuntimeException(),
			'articles_without_metadata' => implode( ',', $this->pageIds ),
			'raw_query_string' => RequestContext::getMain()->getRequest()
				->getRawQueryString(),
		];
		LoggerFactory::getInstance( 'PageTriage' )->debug(
			'Article metadata not found in DB, will attempt to save to DB via the job queue.',
			$logContext
		);
	} elseif ( $mode == self::SAVE_DEFERRED ) {
		// T152847
		DeferredUpdates::addCallableUpdate( function () {
			$this->save();
		} );
	} elseif ( $mode == self::SAVE_IMMEDIATE ) {
		$this->save();
	}

	// Only immediate saves are timed; the measurement spans compilation and the save.
	if ( $mode === self::SAVE_IMMEDIATE ) {
		$elapsed = microtime( true ) - $startedAt;
		$this->statsFactory->getTiming( 'articleCompileProcessor_compileMetadata_saveImmediate_seconds' )
			->copyToStatsdAt( 'timing.pageTriage.articleCompileProcessor.compileMetadata.saveImmediate' )
			->observe( $elapsed );
	}

	return $this->metadata;
}
| 260 | |
/**
 * Finalise the set of components to compile.
 *
 * In default mode (no component registered explicitly) every component is switched on.
 * Otherwise CategoryCount and DeletionTag are treated as a pair: if either one is on,
 * both are compiled.
 */
protected function prepare() {
	if ( $this->defaultMode ) {
		$this->component = array_fill_keys( array_keys( $this->component ), 'on' );
		return;
	}

	// These two sets of data are related
	$needsCategoryCount = $this->component['CategoryCount'] == 'on';
	$needsDeletionTag = $this->component['DeletionTag'] == 'on';
	if ( $needsCategoryCount || $needsDeletionTag ) {
		$this->component['CategoryCount'] = 'on';
		$this->component['DeletionTag'] = 'on';
	}
}
| 277 | |
/**
 * Compile all the registered components in order
 *
 * Each switched-on component is instantiated, compiled and merged into $this->metadata.
 * The first component whose compile() fails stops the loop; results of the components
 * completed before it are kept. If CategoryCount completed, each set deletion tag is
 * subtracted from the page's category count, which is floored at '0'.
 */
protected function process() {
	$completed = [];

	foreach ( $this->component as $key => $val ) {
		if ( $val !== 'on' ) {
			continue;
		}

		$startTime = microtime( true );
		$compClass = 'MediaWiki\Extension\PageTriage\ArticleCompile\ArticleCompile' . $key;
		/** @var ArticleCompile $comp */
		$comp = new $compClass( $this->pageIds, $this->componentDb[$key], $this->articles,
			$this->linksUpdates
		);
		if ( !$comp->compile() ) {
			break;
		}
		// Timing is only recorded for components that compiled successfully
		$this->statsFactory->getTiming( 'articleCompileProcessor_process_seconds' )
			->setLabel( 'key', $key )
			->copyToStatsdAt( 'timing.pageTriage.articleCompileProcessor.process.' . $key )
			->observe( microtime( true ) - $startTime );
		foreach ( $comp->getMetadata() as $pageId => $row ) {
			// Union keeps values set by earlier components. The ?? [] guards against a
			// component reporting a page ID that has no row yet, where a plain += would
			// attempt null + array.
			$this->metadata[$pageId] = ( $this->metadata[$pageId] ?? [] ) + $row;
		}
		$completed[] = $key;
	}

	if ( !in_array( 'CategoryCount', $completed, true ) ) {
		return;
	}

	// Subtract deletion tags from category count
	$deletionTags = ArticleCompileDeletionTag::getDeletionTags();
	foreach ( $this->metadata as $pageId => $row ) {
		// Pages without a compiled category count have nothing to adjust
		if ( !isset( $row['category_count'] ) ) {
			continue;
		}

		foreach ( $deletionTags as $tag ) {
			// The tag may be missing when DeletionTag did not complete (compile() failed)
			if ( !empty( $row[$tag] ) ) {
				$this->metadata[$pageId]['category_count']--;
			}
		}

		if ( $this->metadata[$pageId]['category_count'] < 0 ) {
			$this->metadata[$pageId]['category_count'] = '0';
		}
	}
}
| 322 | |
/**
 * Write the compiled metadata to the database and flush the per-page metadata cache.
 *
 * Tag values that already match what is stored (read from the replica) are not rewritten.
 * For every page the tag rows and the page row update happen in one atomic section.
 */
protected function save() {
	$primaryDb = PageTriageUtil::getPrimaryConnection();
	$replicaDb = PageTriageUtil::getReplicaConnection();

	if ( !$this->pageIds ) {
		return;
	}

	$validTags = ArticleMetadata::getValidTags();

	// Metadata currently stored for these pages
	$storedRows = $replicaDb->newSelectQueryBuilder()
		->select( [ 'ptrpt_page_id', 'ptrt_tag_name', 'ptrpt_value' ] )
		->from( 'pagetriage_page_tags' )
		->join( 'pagetriage_tags', null, 'ptrpt_tag_id = ptrt_tag_id' )
		->where( [ 'ptrpt_page_id' => $this->pageIds ] )
		->caller( __METHOD__ )
		->fetchResultSet();

	// Start from everything that was compiled, then drop each value that is
	// unchanged compared to the stored one; what remains needs writing.
	$pending = $this->metadata;
	foreach ( $storedRows as $stored ) {
		$storedPageId = $stored->ptrpt_page_id;
		$storedTag = $stored->ptrt_tag_name;
		if ( !isset( $pending[$storedPageId][$storedTag] ) ) {
			continue;
		}
		if ( $pending[$storedPageId][$storedTag] == $stored->ptrpt_value ) {
			unset( $pending[$storedPageId][$storedTag] );
		}
	}

	foreach ( $pending as $pageId => $values ) {
		// Drop the cached copy so a fresh one gets generated; refreshing is safe even
		// if only data other than metadata was updated
		( new ArticleMetadata( [ $pageId ] ) )->flushMetadataFromCache();

		// All metadata of one page is written together or not at all
		$primaryDb->startAtomic( __METHOD__ );

		// update_reviewed_timestamp is a flag rather than a tag: it requests that
		// ptrp_reviewed_updated be refreshed after processing (e.g. submission date of
		// AfC drafts), so it is removed before the tag rows are written.
		$touchReviewedTimestamp = array_key_exists( 'update_reviewed_timestamp', $values );
		if ( $touchReviewedTimestamp ) {
			unset( $values['update_reviewed_timestamp'] );
		}

		foreach ( $values as $tagName => $tagValue ) {
			if ( !isset( $validTags[$tagName] ) ) {
				continue;
			}
			$primaryDb->newReplaceQueryBuilder()
				->replaceInto( 'pagetriage_page_tags' )
				->uniqueIndexFields( [ 'ptrpt_page_id', 'ptrpt_tag_id' ] )
				->row( [
					'ptrpt_page_id' => $pageId,
					'ptrpt_tag_id' => $validTags[$tagName],
					'ptrpt_value' => (string)$tagValue
				] )
				->caller( __METHOD__ )
				->execute();
		}

		$triagePage = new PageTriage( $pageId );
		$pageRow = [ 'ptrp_tags_updated' => $primaryDb->timestamp( wfTimestampNow() ) ];
		if ( $touchReviewedTimestamp ) {
			$pageRow['ptrp_reviewed_updated'] = $primaryDb->timestamp( $this->getLastEditTimestamp( $pageId ) );
		}
		if ( isset( $values['deleted'] ) ) {
			$pageRow['ptrp_deleted'] = $values['deleted'] ? '1' : '0';
		}
		$triagePage->update( $pageRow );

		$primaryDb->endAtomic( __METHOD__ );
	}
}
| 402 | |
| 403 | } |