Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
70.55% |
206 / 292 |
|
58.33% |
14 / 24 |
CRAP | |
0.00% |
0 / 1 |
CategoryChangesAsRdf | |
70.55% |
206 / 292 |
|
58.33% |
14 / 24 |
156.06 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
20 | |||
getInsertRdf | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getCategoriesUpdate | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
writeParentCategories | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
updateTS | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
setupChangesIterator | |
96.55% |
28 / 29 |
|
0.00% |
0 / 1 |
2 | |||
getNewCatsIterator | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getMovedCatsIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
getDeletedCatsIterator | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
2 | |||
getRestoredCatsIterator | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
getChangedCatsIterator | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
addTimestampConditions | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
addIndex | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryLinksIterator | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
getRdf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
handleDeletes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
writeCategoryData | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
handleMoves | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
handleRestores | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
handleAdds | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
handleEdits | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 | |||
handleCategorization | |
96.61% |
57 / 59 |
|
0.00% |
0 / 1 |
13 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use MediaWiki\Utils\MWTimestamp; |
23 | use Wikimedia\Purtle\RdfWriter; |
24 | use Wikimedia\Purtle\TurtleRdfWriter; |
25 | use Wikimedia\Rdbms\IReadableDatabase; |
26 | |
27 | // @codeCoverageIgnoreStart |
28 | require_once __DIR__ . '/Maintenance.php'; |
29 | // @codeCoverageIgnoreEnd |
30 | |
31 | /** |
32 | * Maintenance script to provide RDF representation of the recent changes in category tree. |
33 | * |
34 | * @ingroup Maintenance |
35 | * @since 1.30 |
36 | */ |
37 | class CategoryChangesAsRdf extends Maintenance { |
38 | /** |
39 | * Insert query |
40 | */ |
41 | private const SPARQL_INSERT = <<<SPARQL |
42 | INSERT DATA { |
43 | %s |
44 | }; |
45 | |
46 | SPARQL; |
47 | |
48 | /** |
49 | * Delete query |
50 | */ |
51 | private const SPARQL_DELETE = <<<SPARQLD |
52 | DELETE { |
53 | ?category ?x ?y |
54 | } WHERE { |
55 | ?category ?x ?y |
56 | VALUES ?category { |
57 | %s |
58 | } |
59 | }; |
60 | |
61 | SPARQLD; |
62 | |
63 | /** |
64 | * @var RdfWriter |
65 | */ |
66 | private $rdfWriter; |
67 | /** |
68 | * Categories RDF helper. |
69 | * @var CategoriesRdf |
70 | */ |
71 | private $categoriesRdf; |
72 | |
73 | /** @var string */ |
74 | private $startTS; |
75 | /** @var string */ |
76 | private $endTS; |
77 | |
78 | /** |
79 | * List of processed page IDs, |
80 | * so we don't try to process same thing twice |
81 | * @var true[] |
82 | */ |
83 | protected $processed = []; |
84 | |
/**
 * Set up the script: description, batch size, and the
 * 'output', 'start' and 'end' command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );

	$this->setBatchSize( 200 );

	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or MediaWiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or MediaWiki format.',
		true,
		true,
		'e'
	);
}
98 | |
/**
 * Create the RDF writer and the categories RDF helper built on top of it.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle format, so the Turtle writer can be used.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
107 | |
/**
 * Write SPARQL Update statements for the category changes between the
 * 'start' and 'end' options to the file given by 'output' (stdout by default).
 *
 * Statements are emitted in this order: prefixes, deletes, moves, restores,
 * additions, edits, categorization changes, and finally the dump timestamp.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// "Now" is the same for both age checks, so convert it only once.
	$nowUnix = (int)$now->getTimestamp( TS_UNIX );

	// NOTE(review): these checks only report through error() and processing goes on
	// afterwards, as in the original code - confirm whether they should abort instead.
	if ( $nowUnix - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $nowUnix - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// fopen() returns false on failure; every fwrite() below needs a real stream.
		$this->fatalError( "Unable to open '$outFile' for writing." );
	}

	try {
		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );
	} finally {
		// Release the stream whether or not processing succeeded.
		fclose( $output );
	}
}
162 | |
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA clause.
 * @return string The INSERT clause, or an empty string when the drained RDF is empty
 */
private function getInsertRdf() {
	$accumulated = $this->getRdf();

	return $accumulated ? sprintf( self::SPARQL_INSERT, $accumulated ) : "";
}
174 | |
/**
 * Build the SPARQL that replaces the data of a set of categories:
 * a marker comment, a DELETE for the given URIs, then an INSERT of the accumulated RDF.
 * @param IReadableDatabase $dbr
 * @param string[] $deleteUrls List of URIs to be deleted, with <>
 * @param string[] $pages List of categories: id => title
 * @param string $mark Marks which operation requests the query
 * @return string SPARQL query, or an empty string when there is nothing to delete
 */
private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( $deleteUrls ) {
		// Parent links must be written before the RDF buffer is drained for the INSERT.
		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		$header = "# $mark\n";
		$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );

		return $header . $deleteClause . $this->getInsertRdf();
	}

	return "";
}
195 | |
/**
 * Write the parent links of a set of child categories to the RDF helper.
 * @param IReadableDatabase $dbr
 * @param string[] $pages List of child categories: id => title
 */
private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
	$childIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $childIds, __METHOD__ );

	foreach ( $links as $link ) {
		// cl_from is the child page ID, cl_to the title it links to.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
207 | |
/**
 * Build the SPARQL Update that replaces the dump's schema:dateModified value.
 * @param string|int $timestamp Timestamp for last change
 * @return string SPARQL Update query for timestamp.
 */
public function updateTS( $timestamp ) {
	$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
	// The value is written as an xsd:dateTime literal, hence the ISO 8601 format.
	$ts = wfTimestamp( TS_ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
$dumpUrl schema:dateModified ?o .
}
WHERE {
$dumpUrl schema:dateModified ?o .
};
INSERT DATA {
$dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
}
230 | |
/**
 * Create the common batch iterator over recent changes.
 *
 * Rows come from recentchanges, left-joined with page_props (the 'hiddencat'
 * property of the changed page) and category (matched by title), limited to the
 * script's time window.
 * @param IReadableDatabase $dbr
 * @param string[] $columns List of additional fields to get
 * @param string[] $extra_tables List of additional tables to join
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IReadableDatabase $dbr,
	array $columns,
	array $extra_tables,
	string $fname
) {
	// Merging an empty list leaves the three base tables untouched.
	$tables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );

	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = rc_title' ]
		]
	];

	$fields = array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] );

	$it = new BatchRowIterator( $dbr, $tables, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $it, $dbr );
	$it->addJoinConditions( $joins );
	$it->setFetchColumns( $fields );
	$it->setCaller( $fname );

	return $it;
}
276 | |
/**
 * Get an iterator over recent changes that created a page in the Category namespace.
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
	$creationConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$it = $this->setupChangesIterator( $dbr, [], [], $fname );
	$it->addConditions( $creationConds );

	return $it;
}
291 | |
/**
 * Get an iterator over move log entries for pages in the Category namespace.
 *
 * The page table is joined in so that each row also carries the page's
 * current title and namespace.
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$it = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables, $fname );

	$moveConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$it->addConditions( $moveConds );

	$pageJoin = [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	];
	$it->addJoinConditions( $pageJoin );

	$this->addIndex( $it );

	return $it;
}
317 | |
/**
 * Get an iterator over delete log entries for categories whose page no longer exists.
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$deleteConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Only rows without a page record are wanted. A page record means the
		// category was restored, and the restore handler picks those up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];

	$it = new BatchRowIterator( $dbr, 'recentchanges', [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $it, $dbr );
	$it->addConditions( $deleteConds );
	$this->addIndex( $it );
	$it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
	$it->setCaller( $fname );

	return $it;
}
346 | |
/**
 * Get an iterator over restore log entries for categories whose page currently exists.
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
	$restoreConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Only rows that still have a page record are wanted.
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$it = $this->setupChangesIterator( $dbr, [], [], $fname );
	$it->addConditions( $restoreConds );
	$this->addIndex( $it );

	return $it;
}
367 | |
/**
 * Get an iterator over non-creation recent changes of the given type
 * in the Category namespace (used for both edits and categorization changes).
 * @param IReadableDatabase $dbr
 * @param int $type Value matched against rc_type
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IReadableDatabase $dbr, $type, $fname ) {
	$changeConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$it = $this->setupChangesIterator( $dbr, [], [], $fname );
	$it->addConditions( $changeConds );
	$this->addIndex( $it );

	return $it;
}
385 | |
/**
 * Restrict an iterator to rc_timestamp values in [startTS, endTS).
 * @param BatchRowIterator $it Iterator
 * @param IReadableDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
	// Lower bound is inclusive, upper bound is exclusive.
	$notBefore = $dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) );
	$before = $dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [ $notBefore, $before ] );
}
397 | |
/**
 * Make the iterator use the rc_new_name_timestamp index on recentchanges.
 *
 * The index has to be forced because the optimizer was seen choosing
 * the wrong one on terbium.
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'rc_new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $indexHint ] );
}
407 | |
/**
 * Get an iterator over the 'subcat' category links that start at the given pages.
 * @param IReadableDatabase $dbr
 * @param int[] $ids List of page IDs
 * @param string $fname Name of the calling function
 * @return Traversable
 */
protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$batches->setCaller( $fname );

	// Callers loop over single rows, so the batch iterator is wrapped.
	return new RecursiveIteratorIterator( $batches );
}
430 | |
/**
 * Drain the RDF writer and return what it had accumulated.
 * @return string
 */
public function getRdf() {
	$drained = $this->rdfWriter->drain();

	return $drained;
}
438 | |
/**
 * Handle category deletes.
 *
 * Covers only "true" deletes, i.e. those where the page stays deleted.
 * Every handled page ID is marked as processed.
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IReadableDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $batch ) {
		$deleteUrls = [];
		foreach ( $batch as $deletedRow ) {
			// This can produce duplicates, which is harmless here.
			$uri = $this->categoriesRdf->labelToUrl( $deletedRow->rc_title );
			$deleteUrls[] = '<' . $uri . '>';
			$this->processed[$deletedRow->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
457 | |
/**
 * Write the data of one category row to the RDF helper.
 *
 * The row must provide rc_title, pp_propname, cat_pages, cat_subcats and cat_files.
 * @param stdClass $row Database row
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// Subcategories and files are subtracted from the cat_pages total.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
470 | |
/**
 * Handle category moves.
 *
 * The old title of every moved category is deleted. Fresh data under the current
 * title is written only when the page is still in the Category namespace and was
 * not handled earlier.
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleMoves( IReadableDatabase $dbr, $output ) {
	foreach ( $this->getMovedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $batch as $moved ) {
			// The old title goes away even for the rows skipped below.
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $moved->rc_title ) . '>';

			$pageId = $moved->rc_cur_id;
			// Skip what was already captured before. A page that was moved
			// out of Category: is only deleted.
			if ( isset( $this->processed[$pageId] ) || $moved->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// Describe the category under its current title.
			$currentTitle = $moved->page_title;
			$moved->rc_title = $currentTitle;
			$this->writeCategoryData( $moved );
			$pages[$pageId] = $currentTitle;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
		fwrite( $output, $sparql );
	}
}
500 | |
/**
 * Handle category restores: write an INSERT for each batch of restored
 * categories that were not handled earlier.
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleRestores( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $batch ) {
		$pages = [];
		foreach ( $batch as $restored ) {
			$pageId = $restored->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$pages[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		// Nothing new in this batch means nothing to insert.
		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
530 | |
/**
 * Handle newly created categories: write an INSERT for each batch of new
 * categories that were not handled earlier.
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleAdds( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $batch ) {
		$pages = [];
		foreach ( $batch as $created ) {
			$pageId = $created->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $created );
				$pages[$pageId] = $created->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		// Nothing new in this batch means nothing to insert.
		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
558 | |
/**
 * Handle edits of category pages by replacing the data of every edited
 * category that was not handled earlier.
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleEdits( IReadableDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.

	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );
	foreach ( $batches as $batch ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $batch as $edited ) {
			$pageId = $edited->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$title = $edited->rc_title;
				$this->writeCategoryData( $edited );
				$pages[$pageId] = $title;
				$this->processed[$pageId] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" );
		fwrite( $output, $sparql );
	}
}
589 | |
/**
 * Handle categorization changes.
 *
 * For each batch of categorization rows, both sides are refreshed: the categorized
 * (child) categories, looked up by page ID, and the categories they were put into
 * (parents), looked up by title.
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleCategorization( IReadableDatabase $dbr, $output ) {
	// Parent titles already written during this call, with or without a page row.
	$seenTitles = [];

	// Categorization change can add new parents and change counts
	// for the parent category.

	$batches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );
	foreach ( $batches as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childIds = [];
		$parentTitles = [];
		foreach ( $batch as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$deleteUrls = [];
		$pages = [];

		if ( $childIds ) {
			// Stage one: load child rows by ID.
			$childRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'page' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
				->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ] )
				->caller( __METHOD__ )->fetchResultSet();

			foreach ( $childRows as $child ) {
				$childId = $child->page_id;
				if ( isset( $this->processed[$childId] ) ) {
					// We already captured this one before
					continue;
				}

				$this->writeCategoryData( $child );
				if ( $childId ) {
					$title = $child->rc_title;
					$pages[$childId] = $title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
					$this->processed[$childId] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Stage two: load parent rows by title. The titles were used as array keys,
			// so they are cast back to strings before going into the query.
			$parentRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'category' )
				->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ] )
				->caller( __METHOD__ )->fetchResultSet();

			foreach ( $parentRows as $parent ) {
				$parentId = $parent->page_id;
				$title = $parent->rc_title;

				// Skip anything already captured, either by page ID or by title.
				$capturedById = $parentId && isset( $this->processed[$parentId] );
				if ( $capturedById || isset( $seenTitles[$title] ) ) {
					continue;
				}

				$this->writeCategoryData( $parent );
				// Because of the LEFT JOIN on page, a row may have no page_id.
				if ( $parentId ) {
					$pages[$parentId] = $title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
					$this->processed[$parentId] = true;
				}
				$seenTitles[$title] = true;
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $sparql );
	}
}
688 | } |
689 | |
690 | // @codeCoverageIgnoreStart |
691 | $maintClass = CategoryChangesAsRdf::class; |
692 | require_once RUN_MAINTENANCE_IF_MAIN; |
693 | // @codeCoverageIgnoreEnd |