Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
70.46% |
198 / 281 |
|
62.50% |
15 / 24 |
CRAP | |
0.00% |
0 / 1 |
CategoryChangesAsRdf | |
70.46% |
198 / 281 |
|
62.50% |
15 / 24 |
152.77 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
20 | |||
getInsertRdf | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getCategoriesUpdate | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
writeParentCategories | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
updateTS | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
setupChangesIterator | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
1 | |||
getNewCatsIterator | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getMovedCatsIterator | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
2 | |||
getDeletedCatsIterator | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
2 | |||
getRestoredCatsIterator | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
getChangedCatsIterator | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
addTimestampConditions | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
addIndex | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryLinksIterator | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
getRdf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
handleDeletes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
writeCategoryData | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
handleMoves | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
handleRestores | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
handleAdds | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
handleEdits | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 | |||
handleCategorization | |
96.61% |
57 / 59 |
|
0.00% |
0 / 1 |
13 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use MediaWiki\Maintenance\Maintenance; |
23 | use MediaWiki\Utils\MWTimestamp; |
24 | use Wikimedia\Purtle\RdfWriter; |
25 | use Wikimedia\Purtle\TurtleRdfWriter; |
26 | use Wikimedia\Rdbms\IReadableDatabase; |
27 | |
28 | // @codeCoverageIgnoreStart |
29 | require_once __DIR__ . '/Maintenance.php'; |
30 | // @codeCoverageIgnoreEnd |
31 | |
/**
 * Maintenance script to provide RDF representation of the recent changes in category tree.
 *
 * The script reads the recentchanges table for the [start, end) timestamp window and
 * writes a stream of SPARQL Update statements: the RDF prefixes first, then one
 * section per kind of change (deletes, moves, restores, additions, edits,
 * categorization changes), and finally an update of the dump timestamp.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class CategoryChangesAsRdf extends Maintenance {
	/**
	 * Insert query.
	 * sprintf() template; %s receives the RDF text to insert.
	 */
	private const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

	/**
	 * Delete query.
	 * sprintf() template; %s receives a space-separated list of category URIs
	 * (each wrapped in <>) whose triples are to be removed.
	 */
	private const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
?category ?x ?y
VALUES ?category {
%s
}
};

SPARQLD;

	/**
	 * Writer that accumulates the RDF output; drained through getRdf().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Start of the processed time window (inclusive), set by execute().
	 * @var string
	 */
	private $startTS;
	/**
	 * End of the processed time window (exclusive), set by execute().
	 * @var string
	 */
	private $endTS;

	/**
	 * List of processed page IDs,
	 * so we don't try to process same thing twice
	 * @var true[]
	 */
	protected $processed = [];

	/**
	 * Declare the script description, batch size and command line options.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of category changes in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
			true, 'o' );
		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or MediaWiki format.',
			true, true, 's' );
		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or MediaWiki format.', true,
			true, 'e' );
	}

	/**
	 * Initialize external service classes.
	 */
	public function initialize() {
		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
		$this->rdfWriter = new TurtleRdfWriter();
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
	}

	/**
	 * Script entry point: validate the time window, open the output and write
	 * the SPARQL Update statements for every kind of category change.
	 */
	public function execute() {
		$this->initialize();
		$startTS = new MWTimestamp( $this->getOption( "start" ) );

		$endTS = new MWTimestamp( $this->getOption( "end" ) );
		$now = new MWTimestamp();
		$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

		// Changes older than the maximum RC age can not be served from recentchanges.
		// NOTE(review): error() presumably only reports the problem and processing
		// continues - confirm this is intended rather than aborting.
		$nowUnix = (int)$now->getTimestamp( TS_UNIX );
		if ( $nowUnix - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
			$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
		}
		if ( $nowUnix - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
			$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
		}

		$this->startTS = $startTS->getTimestamp();
		$this->endTS = $endTS->getTimestamp();

		$outFile = $this->getOption( 'output', 'php://stdout' );
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'wb' );
		if ( $output === false ) {
			// Without a valid handle every fwrite() below would fail.
			$this->fatalError( "Could not open output file '$outFile' for writing!" );
		}

		try {
			$this->categoriesRdf->setupPrefixes();
			$this->rdfWriter->start();

			$prefixes = $this->getRdf();
			// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
			// Also strip dot at the end.
			$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
			fwrite( $output, $prefixes );

			$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

			// Deletes go first because if the page was deleted, other changes
			// do not matter. This only gets true deletes, i.e. not pages that were restored.
			$this->handleDeletes( $dbr, $output );
			// Moves go before additions because if category is moved, we should not process creation
			// as it would produce wrong data - because create row has old title
			$this->handleMoves( $dbr, $output );
			// We need to handle restores too since delete may have happened in previous update.
			$this->handleRestores( $dbr, $output );
			// Process newly added pages
			$this->handleAdds( $dbr, $output );
			// Process page edits
			$this->handleEdits( $dbr, $output );
			// Process categorization changes
			$this->handleCategorization( $dbr, $output );

			// Update timestamp
			fwrite( $output, $this->updateTS( $this->endTS ) );
		} finally {
			// Release the handle even if one of the handlers throws.
			fclose( $output );
		}
	}

	/**
	 * Get the text of SPARQL INSERT DATA clause
	 *
	 * Drains the RDF accumulated so far and wraps it into the insert template.
	 *
	 * @return string The INSERT DATA statement, or empty string if no RDF was accumulated
	 */
	private function getInsertRdf(): string {
		$rdfText = $this->getRdf();
		if ( !$rdfText ) {
			return "";
		}
		return sprintf( self::SPARQL_INSERT, $rdfText );
	}

	/**
	 * Get SPARQL for updating set of categories
	 *
	 * The result is a comment line with the mark, a DELETE statement for the given
	 * URIs and an INSERT DATA statement with the RDF accumulated in the writer.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
	 * @param string[] $pages List of categories: id => title
	 * @param string $mark Marks which operation requests the query
	 * @return string SPARQL query, or empty string if there is nothing to delete
	 */
	private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ): string {
		if ( !$deleteUrls ) {
			return "";
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
			$this->getInsertRdf();
	}

	/**
	 * Write parent data for a set of categories.
	 * The list has the child categories.
	 * @param IReadableDatabase $dbr
	 * @param string[] $pages List of child categories: id => title
	 */
	private function writeParentCategories( IReadableDatabase $dbr, $pages ): void {
		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
	}

	/**
	 * Generate SPARQL Update code for updating dump timestamp
	 *
	 * Replaces any existing schema:dateModified value of the dump URI with the
	 * given timestamp in ISO 8601 format.
	 *
	 * @param string|int $timestamp Timestamp for last change
	 * @return string SPARQL Update query for timestamp.
	 */
	public function updateTS( $timestamp ) {
		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		$tsQuery = <<<SPARQL
DELETE {
$dumpUrl schema:dateModified ?o .
}
WHERE {
$dumpUrl schema:dateModified ?o .
};
INSERT DATA {
$dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
		return $tsQuery;
	}

	/**
	 * Set up standard iterator for retrieving category changes.
	 *
	 * The query reads recentchanges, left-joined with the 'hiddencat' page property
	 * and with the category table, batched by rc_timestamp and limited to the
	 * configured time window.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string[] $columns List of additional fields to get
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	private function setupChangesIterator(
		IReadableDatabase $dbr,
		array $columns,
		string $fname
	): BatchRowIterator {
		$it = new BatchRowIterator( $dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'recentchanges' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = rc_title' ] )
				->select( array_merge( $columns, [
					'rc_title',
					'rc_cur_id',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files'
				] ) )
				->caller( $fname ),
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		return $it;
	}

	/**
	 * Fetch newly created categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], $fname );
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 1,
		] );
		return $it;
	}

	/**
	 * Fetch moved categories
	 *
	 * Rows additionally carry page_title and page_namespace of the page the
	 * change points to, taken from the joined page table.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator(
			$dbr,
			[ 'page_title', 'page_namespace' ],
			$fname
		);
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'move',
			'rc_type' => RC_LOG,
		] );
		$it->sqb->join( 'page', null, 'rc_cur_id = page_id' );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch deleted categories
	 *
	 * Unlike the other change iterators this one selects only rc_cur_id and
	 * rc_title and does not join page properties or category counts.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator( $dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'recentchanges' )
				->select( [ 'rc_cur_id', 'rc_title' ] )
				->where( [
					'rc_namespace' => NS_CATEGORY,
					'rc_new' => 0,
					'rc_log_type' => 'delete',
					'rc_log_action' => 'delete',
					'rc_type' => RC_LOG,
					// We will fetch ones that do not have page record. If they do,
					// this means they were restored, thus restoring handler will pick it up.
					'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
				] )
				->caller( $fname ),
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch restored categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], $fname );
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'restore',
			'rc_type' => RC_LOG,
			// We will only fetch ones that have page record
			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch categorization changes or edits
	 * @param IReadableDatabase $dbr
	 * @param int $type Value to match against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getChangedCatsIterator( IReadableDatabase $dbr, $type, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], $fname );
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_type' => $type,
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Add timestamp limits to iterator
	 *
	 * Restricts rc_timestamp to the window startTS <= rc_timestamp < endTS.
	 *
	 * @param BatchRowIterator $it Iterator
	 * @param IReadableDatabase $dbr
	 */
	private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ): void {
		$it->sqb->conds( [
			$dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) ),
			$dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) ),
		] );
	}

	/**
	 * Need to force index, somehow on terbium the optimizer chooses wrong one
	 *
	 * @param BatchRowIterator $it Iterator whose query gets the USE INDEX option
	 */
	private function addIndex( BatchRowIterator $it ): void {
		$it->sqb->options( [
			'USE INDEX' => [ 'recentchanges' => 'rc_new_name_timestamp' ]
		] );
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only 'subcat' links are returned. The batches are flattened, so iterating
	 * the result yields single rows with cl_from and cl_to.
	 *
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'categorylinks' )
				->select( [ 'cl_from', 'cl_to' ] )
				->where( [
					'cl_type' => 'subcat',
					'cl_from' => $ids
				] )
				->caller( $fname ),
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Get accumulated RDF.
	 * @return string
	 */
	public function getRdf() {
		return $this->rdfWriter->drain();
	}

	/**
	 * Handle category deletes.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleDeletes( IReadableDatabase $dbr, $output ) {
		// This only does "true" deletes - i.e. those that the page stays deleted

		foreach ( $this->getDeletedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// This can produce duplicates, we don't care
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->rc_cur_id] = true;
			}
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
		}
	}

	/**
	 * Write category data to RDF.
	 *
	 * The page count passed on excludes subcategories and files.
	 *
	 * @param stdClass $row Database row with rc_title, pp_propname, cat_pages,
	 *  cat_subcats and cat_files
	 */
	private function writeCategoryData( $row ): void {
		$this->categoriesRdf->writeCategoryData(
			$row->rc_title,
			$row->pp_propname === 'hiddencat',
			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
			(int)$row->cat_subcats
		);
	}

	/**
	 * Handle category moves.
	 *
	 * The title recorded in the change is always deleted; the data for the
	 * current title is written unless the page has left the category namespace
	 * or was already processed.
	 *
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleMoves( IReadableDatabase $dbr, $output ) {
		foreach ( $this->getMovedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}

				if ( $row->page_namespace != NS_CATEGORY ) {
					// If page was moved out of Category:, we'll just delete
					continue;
				}
				// Describe the category under its current title, not the one from the log row
				$row->rc_title = $row->page_title;
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->page_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
		}
	}

	/**
	 * Handle category restores.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleRestores( IReadableDatabase $dbr, $output ) {
		fwrite( $output, "# Restores\n" );

		// This will only find those restores that were not deleted later.
		foreach ( $this->getRestoredCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( !$pages ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );

			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle newly created categories.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleAdds( IReadableDatabase $dbr, $output ) {
		fwrite( $output, "# Additions\n" );

		foreach ( $this->getNewCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( !$pages ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle edits for category texts
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleEdits( IReadableDatabase $dbr, $output ) {
		// Editing category can change hidden flag and add new parents.
		// TODO: it's pretty expensive to update all edited categories, and most edits
		// aren't actually interesting for us. Some way to know which are interesting?
		// We can capture recategorization on the next step, but not change in hidden status.

		foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
		}
	}

	/**
	 * Handles categorization changes
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleCategorization( IReadableDatabase $dbr, $output ) {
		// Parent titles already written by this method, across all batches
		$processedTitle = [];

		// Categorization change can add new parents and change counts
		// for the parent category.

		foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ ) as $batch ) {
			/*
			 * Note that on categorization event, cur_id points to
			 * the child page, not the parent category!
			 * So we need to have a two-stage process, since we have ID from one
			 * category and title from another, and we need both for proper updates.
			 * TODO: For now, we do full update even though some data hasn't changed,
			 * e.g. parents for parent cat and counts for child cat.
			 */
			$childPages = [];
			$parentCats = [];
			foreach ( $batch as $row ) {
				$childPages[$row->rc_cur_id] = true;
				$parentCats[$row->rc_title] = true;
			}

			$pages = [];
			$deleteUrls = [];

			if ( $childPages ) {
				// Load child rows by ID
				$childRows = $dbr->newSelectQueryBuilder()
					->select( [
						'page_id',
						'rc_title' => 'page_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					] )
					->from( 'page' )
					->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
					->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
					->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $childRows as $row ) {
					if ( isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
						$this->processed[$row->page_id] = true;
					}
				}
			}

			if ( $parentCats ) {
				// Load parent rows by title
				// Array keys that look like integers are turned into ints by PHP,
				// so cast them back to strings for the title comparison.
				$parentRows = $dbr->newSelectQueryBuilder()
					->select( [
						'page_id',
						'rc_title' => 'cat_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					] )
					->from( 'category' )
					->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
					->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
					->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentCats ) ) ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $parentRows as $row ) {
					if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					if ( isset( $processedTitle[$row->rc_title] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					// The page join is a LEFT JOIN, so a category row may have no page record
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
						$this->processed[$row->page_id] = true;
					}
					$processedTitle[$row->rc_title] = true;
				}
			}

			// NOTE(review): parent rows without a page record are written to the RDF
			// writer but add no delete URL. When a batch yields no delete URLs at all,
			// getCategoriesUpdate() returns early without draining the writer, so that
			// RDF is presumably carried over to the next drained INSERT - confirm this
			// is intended.
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
		}
	}
}

// @codeCoverageIgnoreStart
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd