Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 167 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
ArticleCompileProcessor | |
0.00% |
0 / 167 |
|
0.00% |
0 / 11 |
2970 | |
0.00% |
0 / 1 |
getSafeComponentDbConfigForCompilation | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
__construct | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
6 | |||
newFromPageId | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
registerLinksUpdate | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
registerComponent | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
configComponentDb | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
20 | |||
getLastEditTimestamp | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
compileMetadata | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
72 | |||
prepare | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
30 | |||
process | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
110 | |||
save | |
0.00% |
0 / 46 |
|
0.00% |
0 / 1 |
156 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\PageTriage\ArticleCompile; |
4 | |
5 | use MediaWiki\Context\RequestContext; |
6 | use MediaWiki\Deferred\DeferredUpdates; |
7 | use MediaWiki\Deferred\LinksUpdate\LinksUpdate; |
8 | use MediaWiki\Extension\PageTriage\ArticleMetadata; |
9 | use MediaWiki\Extension\PageTriage\CompileArticleMetadataJob; |
10 | use MediaWiki\Extension\PageTriage\PageTriage; |
11 | use MediaWiki\Extension\PageTriage\PageTriageUtil; |
12 | use MediaWiki\Logger\LoggerFactory; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Title\Title; |
15 | use RuntimeException; |
16 | use Wikimedia\Rdbms\IDBAccessObject; |
17 | use Wikimedia\Stats\IBufferingStatsdDataFactory; |
18 | use WikiPage; |
19 | |
/**
 * Compiles PageTriage metadata for a set of pages.
 *
 * A processor is created for a list of page IDs via self::newFromPageId(). Callers may
 * narrow the set of components to compile (self::registerComponent()), choose which
 * database each component reads from (self::configComponentDb()), and finally call
 * self::compileMetadata() to run the components and persist the result.
 */
class ArticleCompileProcessor {
	/** @var string[] Map of component name => 'on' or 'off' */
	protected $component;

	/** @var int[] Map of component name => either DB_PRIMARY or DB_REPLICA */
	protected $componentDb;

	/** @var int[] List of page IDs */
	protected $pageIds;

	/** @var array Compiled metadata, keyed by page ID */
	protected $metadata;

	/** @var bool True until a component has been explicitly registered */
	protected $defaultMode;

	/** @var WikiPage[] */
	protected $articles = [];

	/** @var LinksUpdate[] Registered links updates, keyed by page ID */
	protected $linksUpdates = [];

	/** @var IBufferingStatsdDataFactory */
	private IBufferingStatsdDataFactory $statsdDataFactory;

	public const SAVE_IMMEDIATE = 0;
	public const SAVE_DEFERRED = 1;
	public const SAVE_JOB = 2;

	/**
	 * Array of configuration options to pass to self::configComponentDb() for metadata compilation.
	 *
	 * BasicData accesses the `pagetriage_page` table and this may not necessarily be up to
	 * date in a replica, so it is excluded from this list.
	 *
	 * @return array Map of component name => DB_REPLICA
	 */
	public static function getSafeComponentDbConfigForCompilation() {
		return [
			'LinkCount' => DB_REPLICA,
			'CategoryCount' => DB_REPLICA,
			'Snippet' => DB_REPLICA,
			'UserData' => DB_REPLICA,
			'DeletionTag' => DB_REPLICA,
			'AfcTag' => DB_REPLICA,
			'Recreated' => DB_REPLICA,
		];
	}

	/**
	 * Private: use self::newFromPageId() to obtain an instance.
	 *
	 * @param int[] $pageIds List of page IDs.
	 * @param IBufferingStatsdDataFactory $statsdDataFactory
	 */
	private function __construct( $pageIds, IBufferingStatsdDataFactory $statsdDataFactory ) {
		$this->pageIds = $pageIds;

		// Every known component starts disabled; see prepare() for how the final set is chosen.
		$this->component = [
			'BasicData' => 'off',
			'LinkCount' => 'off',
			'CategoryCount' => 'off',
			'Snippet' => 'off',
			'UserData' => 'off',
			'DeletionTag' => 'off',
			'AfcTag' => 'off',
			'Recreated' => 'off',
		];
		// Default to the primary database for data compilation.
		$this->componentDb = array_fill_keys( array_keys( $this->component ), DB_PRIMARY );

		$this->metadata = array_fill_keys( $this->pageIds, [] );
		$this->defaultMode = true;
		$this->statsdDataFactory = $statsdDataFactory;
	}

	/**
	 * Factory for creating an instance
	 *
	 * @param int[] $pageIds
	 * @param bool $validated Whether the page IDs are already validated. When false they are
	 *  passed through ArticleMetadata::validatePageIds() first.
	 * @param int $validateDb const DB_PRIMARY/DB_REPLICA, used for validation only
	 * @return ArticleCompileProcessor|false False when there are no page IDs to process
	 */
	public static function newFromPageId(
		array $pageIds, $validated = true, $validateDb = DB_PRIMARY
	) {
		if ( !$validated ) {
			$pageIds = ArticleMetadata::validatePageIds( $pageIds, $validateDb );
		}
		if ( !$pageIds ) {
			return false;
		}

		return new ArticleCompileProcessor(
			$pageIds,
			MediaWikiServices::getInstance()->getStatsdDataFactory()
		);
	}

	/**
	 * Register a linksUpdate to the processor for future compiling
	 *
	 * Updates for pages that are not handled by this processor are ignored.
	 *
	 * @param LinksUpdate $linksUpdate
	 */
	public function registerLinksUpdate( LinksUpdate $linksUpdate ) {
		$id = $linksUpdate->getTitle()->getArticleID();
		if ( in_array( $id, $this->pageIds ) ) {
			$this->linksUpdates[$id] = $linksUpdate;
		}
	}

	/**
	 * Register a component to the processor for compiling
	 *
	 * Registering any known component switches the processor out of default mode, so that
	 * only registered components are compiled. Unknown names are ignored.
	 *
	 * @param string $component
	 */
	public function registerComponent( $component ) {
		if ( isset( $this->component[$component] ) ) {
			$this->component[$component] = 'on';
			$this->defaultMode = false;
		}
	}

	/**
	 * Config what db to use for each component
	 *
	 * Entries for unknown components, or with a value other than DB_PRIMARY / DB_REPLICA,
	 * are ignored.
	 *
	 * @param array $config
	 * example: array( 'BasicData' => DB_REPLICA, 'UserData' => DB_PRIMARY )
	 */
	public function configComponentDb( $config ) {
		$dbMode = [ DB_PRIMARY, DB_REPLICA ];
		foreach ( $this->componentDb as $key => $value ) {
			if ( isset( $config[$key] ) && in_array( $config[$key], $dbMode ) ) {
				$this->componentDb[$key] = $config[$key];
			}
		}
	}

	/**
	 * Get the timestamp of the last edit to a page
	 *
	 * Sources are tried in order: a registered LinksUpdate, an already loaded article,
	 * and finally a fresh page lookup.
	 *
	 * @param int $pageId Page ID
	 * @return string Timestamp of last update, or current timestamp if not found
	 */
	protected function getLastEditTimestamp( $pageId ) {
		if ( isset( $this->linksUpdates[$pageId] ) ) {
			// A LinksUpdate is not guaranteed to carry a revision; fall through to the
			// other sources instead of dereferencing null.
			$revision = $this->linksUpdates[$pageId]->getRevisionRecord();
			if ( $revision ) {
				return $revision->getTimestamp();
			}
		}
		if ( isset( $this->articles[$pageId] ) ) {
			return $this->articles[$pageId]->getTimestamp();
		}
		// TODO deduplicate with ArticleCompileInterface::getArticleByPageId(), maybe move to this class
		$fromdb = $this->componentDb['BasicData'] === DB_PRIMARY ?
			IDBAccessObject::READ_LATEST : IDBAccessObject::READ_NORMAL;
		$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromID( $pageId, $fromdb );
		if ( $page ) {
			return $page->getTimestamp();
		}
		// Give up and return the current time
		return wfTimestampNow();
	}

	/**
	 * Wrapper function for compiling metadata.
	 *
	 * @param int $mode Class SAVE_* constant
	 * - SAVE_IMMEDIATE = Unless overridden with self::configComponentDb(), uses
	 *   primary DB for reads and writes. The caller should use self::configComponentDB()
	 *   to use the replica for as many compilation components as possible.
	 * - SAVE_DEFERRED = The replica is used for reads. The metadata will be written
	 *   to the database at the end of the request in a deferred update.
	 * - SAVE_JOB = The replica is used for reads. The metadata will be written to
	 *   the database via the job queue. Usage of this mode should be the exception, not
	 *   the norm – it exists as a safeguard to ensure metadata is compiled for any cases
	 *   where the hook implementations missed generating the data.
	 * @return array
	 *   The compiled metadata.
	 */
	public function compileMetadata( $mode = self::SAVE_IMMEDIATE ) {
		$startTime = microtime( true );

		// For deferred / job saves, use the replica for reading data.
		if ( in_array( $mode, [ self::SAVE_DEFERRED, self::SAVE_JOB ] ) ) {
			foreach ( $this->component as $key => $value ) {
				$this->componentDb[$key] = DB_REPLICA;
			}
		}

		// Set up which components of metadata to compile.
		$this->prepare();

		// Instantiate the dedicated class for each component, compile the metadata associated
		// with the class, then store the metadata in $this->metadata for use below.
		$this->process();

		switch ( $mode ) {
			case self::SAVE_JOB:
				$this->queueCompileJobs();
				break;
			case self::SAVE_DEFERRED:
				DeferredUpdates::addCallableUpdate( function () {
					// T152847
					$this->save();
				} );
				break;
			case self::SAVE_IMMEDIATE:
				$this->save();
		}

		// Only the immediate mode does all of its work inside this call, so only it is timed.
		if ( $mode === self::SAVE_IMMEDIATE ) {
			$this->statsdDataFactory->timing(
				'timing.pageTriage.articleCompileProcessor.compileMetadata.saveImmediate',
				microtime( true ) - $startTime
			);
		}

		return $this->metadata;
	}

	/**
	 * Queue one CompileArticleMetadataJob per page and log that the fallback path was taken.
	 *
	 * This flag is used in ArticleMetadata::getMetadata() when article metadata
	 * is missing and the request context is a GET.
	 * We will return the already compiled metadata, which was generated by querying
	 * a replica, but we will not save the results to the database in this request,
	 * instead it will get added to the job queue for later processing.
	 * Additionally, the metadata will be cached in memcache for 24 hours.
	 * The logging statement below can alert us to errors in our hook implementation.
	 */
	private function queueCompileJobs() {
		// Queue a job for each page that doesn't have metadata.
		$jobs = [];
		foreach ( $this->pageIds as $pageId ) {
			$jobs[] = new CompileArticleMetadataJob(
				Title::newMainPage(),
				[ 'pageId' => (int)$pageId ]
			);
		}
		MediaWikiServices::getInstance()->getJobQueueGroup()->push( $jobs );
		LoggerFactory::getInstance( 'PageTriage' )->debug(
			'Article metadata not found in DB, will attempt to save to DB via the job queue.',
			[
				'exception' => new RuntimeException(),
				'articles_without_metadata' => implode( ',', $this->pageIds ),
				'raw_query_string' => RequestContext::getMain()->getRequest()
					->getRawQueryString(),
			]
		);
	}

	/**
	 * Set up the data before compiling
	 *
	 * In default mode every component is enabled. Otherwise only the registered components
	 * run, except that CategoryCount and DeletionTag are always enabled together.
	 */
	protected function prepare() {
		if ( $this->defaultMode ) {
			foreach ( $this->component as $key => $val ) {
				$this->component[$key] = 'on';
			}
			return;
		}

		// These two set of data are related
		if ( $this->component['CategoryCount'] == 'on' || $this->component['DeletionTag'] == 'on' ) {
			$this->component['CategoryCount'] = 'on';
			$this->component['DeletionTag'] = 'on';
		}
	}

	/**
	 * Compile all the registered components in order
	 *
	 * Compilation stops at the first component whose compile() reports failure; metadata
	 * from the components completed before it is kept.
	 */
	protected function process() {
		$completed = [];

		foreach ( $this->component as $key => $val ) {
			if ( $val !== 'on' ) {
				continue;
			}

			$startTime = microtime( true );
			$compClass = 'MediaWiki\Extension\PageTriage\ArticleCompile\ArticleCompile' . $key;
			// The properties are passed directly (not copies) in case the component
			// constructor takes them by reference - presumably to share loaded articles.
			/** @var ArticleCompile $comp */
			$comp = new $compClass( $this->pageIds, $this->componentDb[$key], $this->articles,
				$this->linksUpdates
			);
			if ( !$comp->compile() ) {
				break;
			}
			$this->statsdDataFactory->timing(
				'timing.pageTriage.articleCompileProcessor.process.' . $key,
				microtime( true ) - $startTime
			);
			foreach ( $comp->getMetadata() as $pageId => $row ) {
				$this->metadata[$pageId] += $row;
			}
			$completed[] = $key;
		}

		// Subtract deletion tags from category count
		if ( in_array( 'CategoryCount', $completed, true ) ) {
			$deletionTags = ArticleCompileDeletionTag::getDeletionTags();
			foreach ( $this->metadata as $pageId => $row ) {
				// A page may have no category count at all; do not invent one for it.
				if ( !isset( $row['category_count'] ) ) {
					continue;
				}

				$count = $row['category_count'];
				foreach ( $deletionTags as $tag ) {
					// The deletion tag keys are absent when the DeletionTag component did not
					// complete (it runs after CategoryCount), hence the existence-safe check.
					if ( !empty( $row[$tag] ) ) {
						$count--;
					}
				}

				$this->metadata[$pageId]['category_count'] = $count < 0 ? '0' : $count;
			}
		}
	}

	/**
	 * Save the compiling result to database as well as cache
	 *
	 * Only tag values that differ from what is already stored are written. Each page is
	 * written inside its own atomic section.
	 */
	protected function save() {
		// Nothing to do; bail out before touching any database connection.
		if ( !$this->pageIds ) {
			return;
		}

		$dbw = PageTriageUtil::getPrimaryConnection();
		$dbr = PageTriageUtil::getReplicaConnection();

		$tags = ArticleMetadata::getValidTags();

		// Grab existing old metadata
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'ptrpt_page_id', 'ptrt_tag_name', 'ptrpt_value' ] )
			->from( 'pagetriage_page_tags' )
			->join( 'pagetriage_tags', null, 'ptrpt_tag_id = ptrt_tag_id' )
			->where( [ 'ptrpt_page_id' => $this->pageIds ] )
			->caller( __METHOD__ )
			->fetchResultSet();
		// data in $newData is used for update, initialize it with new metadata
		$newData = $this->metadata;
		// Loop through old metadata value and compare them with the new one,
		// if they are the same, remove them from $newData.
		// The comparison is loose: values are stored as strings (see the cast below)
		// while freshly compiled values are not necessarily strings.
		foreach ( $res as $row ) {
			if ( isset( $newData[$row->ptrpt_page_id][$row->ptrt_tag_name] )
				&& $newData[$row->ptrpt_page_id][$row->ptrt_tag_name] == $row->ptrpt_value
			) {
				unset( $newData[$row->ptrpt_page_id][$row->ptrt_tag_name] );
			}
		}

		foreach ( $newData as $pageId => $data ) {
			// Flush cache so a new copy of cache will be generated, it's safe to
			// refresh in case some data other than metadata gets updated
			$articleMetadata = new ArticleMetadata( [ $pageId ] );
			$articleMetadata->flushMetadataFromCache();
			// Make sure either all or none metadata for a single page_id
			$dbw->startAtomic( __METHOD__ );

			// Check for the update_reviewed_timestamp flag, which means we should update the
			// ptrp_reviewed_updated field after processing (e.g. submission date of AfC drafts).
			// The flag itself is not a tag and must not be written as one.
			$updateReviewedTimestamp = array_key_exists( 'update_reviewed_timestamp', $data );
			unset( $data['update_reviewed_timestamp'] );

			foreach ( $data as $key => $val ) {
				if ( !isset( $tags[$key] ) ) {
					continue;
				}
				$dbw->newReplaceQueryBuilder()
					->replaceInto( 'pagetriage_page_tags' )
					->uniqueIndexFields( [ 'ptrpt_page_id', 'ptrpt_tag_id' ] )
					->row( [
						'ptrpt_page_id' => $pageId,
						'ptrpt_tag_id' => $tags[$key],
						'ptrpt_value' => (string)$val
					] )
					->caller( __METHOD__ )
					->execute();
			}

			$pt = new PageTriage( $pageId );
			$row = [ 'ptrp_tags_updated' => $dbw->timestamp( wfTimestampNow() ) ];

			if ( $updateReviewedTimestamp ) {
				$row['ptrp_reviewed_updated'] = $dbw->timestamp( $this->getLastEditTimestamp( $pageId ) );
			}

			if ( isset( $data['deleted'] ) ) {
				$row['ptrp_deleted'] = $data['deleted'] ? '1' : '0';
			}
			$pt->update( $row );
			$dbw->endAtomic( __METHOD__ );
		}
	}

}