Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
69.83% |
206 / 295 |
|
58.33% |
14 / 24 |
CRAP | |
0.00% |
0 / 1 |
CategoryChangesAsRdf | |
70.55% |
206 / 292 |
|
58.33% |
14 / 24 |
156.06 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
20 | |||
getInsertRdf | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getCategoriesUpdate | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
writeParentCategories | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
updateTS | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
setupChangesIterator | |
96.55% |
28 / 29 |
|
0.00% |
0 / 1 |
2 | |||
getNewCatsIterator | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getMovedCatsIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
getDeletedCatsIterator | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
2 | |||
getRestoredCatsIterator | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
getChangedCatsIterator | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
addTimestampConditions | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
addIndex | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryLinksIterator | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
getRdf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
handleDeletes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
writeCategoryData | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
handleMoves | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
handleRestores | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
handleAdds | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
handleEdits | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 | |||
handleCategorization | |
96.61% |
57 / 59 |
|
0.00% |
0 / 1 |
13 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use MediaWiki\Utils\MWTimestamp; |
23 | use Wikimedia\Purtle\RdfWriter; |
24 | use Wikimedia\Purtle\TurtleRdfWriter; |
25 | use Wikimedia\Rdbms\IReadableDatabase; |
26 | |
27 | require_once __DIR__ . '/Maintenance.php'; |
28 | |
/**
 * Maintenance script to provide RDF representation of the recent changes in category tree.
 *
 * The script reads category-related rows from the recentchanges table that fall between
 * the 'start' (inclusive) and 'end' (exclusive) timestamps and writes a sequence of
 * SPARQL Update statements (DELETE ... WHERE / INSERT DATA) to the output file.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class CategoryChangesAsRdf extends Maintenance {
	/**
	 * Insert query. The single %s placeholder receives the serialized RDF.
	 */
	private const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

	/**
	 * Delete query. The single %s placeholder receives a space-separated list
	 * of category URIs, each wrapped in <>.
	 */
	private const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
?category ?x ?y
VALUES ?category {
%s
}
};

SPARQLD;

	/**
	 * Writer accumulating the RDF; it is drained by getRdf().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Lower bound (inclusive) for rc_timestamp, set in execute().
	 * @var string|int|null
	 */
	private $startTS;
	/**
	 * Upper bound (exclusive) for rc_timestamp, set in execute().
	 * @var string|int|null
	 */
	private $endTS;

	/**
	 * List of processed page IDs,
	 * so we don't try to process same thing twice
	 * @var true[]
	 */
	protected $processed = [];

	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of category changes in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
			true, 'o' );
		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or MediaWiki format.',
			true, true, 's' );
		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or MediaWiki format.', true,
			true, 'e' );
	}

	/**
	 * Initialize external service classes.
	 */
	public function initialize() {
		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
		$this->rdfWriter = new TurtleRdfWriter();
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
	}

	/**
	 * Run the script: validate the requested time range, open the output,
	 * write the prefixes, then emit the updates for each kind of change
	 * followed by the dump timestamp update.
	 */
	public function execute() {
		$this->initialize();
		$startTS = new MWTimestamp( $this->getOption( "start" ) );

		$endTS = new MWTimestamp( $this->getOption( "end" ) );
		$now = new MWTimestamp();
		$nowUnix = (int)$now->getTimestamp( TS_UNIX );
		$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

		// NOTE(review): error() is not followed by a return here, so processing
		// appears to continue after these messages - confirm this is intended.
		if ( $nowUnix - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
			$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
		}
		if ( $nowUnix - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
			$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
		}

		$this->startTS = $startTS->getTimestamp();
		$this->endTS = $endTS->getTimestamp();

		$outFile = $this->getOption( 'output', 'php://stdout' );
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'wb' );
		if ( $output === false ) {
			// Without a valid stream every fwrite() below would fail, so stop right away.
			$this->fatalError( "Could not open output file '$outFile' for writing." );
		}

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );

		// Release the stream opened above once everything has been written.
		fclose( $output );
	}

	/**
	 * Get the text of SPARQL INSERT DATA clause
	 *
	 * Drains the RDF accumulated so far; when nothing was accumulated,
	 * no INSERT clause is produced.
	 * @return string
	 */
	private function getInsertRdf() {
		$rdfText = $this->getRdf();
		if ( !$rdfText ) {
			return "";
		}
		return sprintf( self::SPARQL_INSERT, $rdfText );
	}

	/**
	 * Get SPARQL for updating set of categories
	 *
	 * Returns an empty string when there is nothing to delete. In that case the
	 * accumulated RDF is not drained.
	 * @param IReadableDatabase $dbr
	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
	 * @param string[] $pages List of categories: id => title
	 * @param string $mark Marks which operation requests the query
	 * @return string SPARQL query
	 */
	private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
		if ( !$deleteUrls ) {
			return "";
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
			$this->getInsertRdf();
	}

	/**
	 * Write parent data for a set of categories.
	 * The list has the child categories.
	 * @param IReadableDatabase $dbr
	 * @param string[] $pages List of child categories: id => title
	 */
	private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
	}

	/**
	 * Generate SPARQL Update code for updating dump timestamp
	 *
	 * The query removes any existing schema:dateModified value of the dump URI
	 * and inserts the given timestamp in ISO 8601 form.
	 * @param string|int $timestamp Timestamp for last change
	 * @return string SPARQL Update query for timestamp.
	 */
	public function updateTS( $timestamp ) {
		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		$tsQuery = <<<SPARQL
DELETE {
$dumpUrl schema:dateModified ?o .
}
WHERE {
$dumpUrl schema:dateModified ?o .
};
INSERT DATA {
$dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
		return $tsQuery;
	}

	/**
	 * Set up standard iterator for retrieving category changes.
	 *
	 * The iterator reads recentchanges within the configured time range,
	 * left-joined with page_props (hiddencat property only) and category.
	 * @param IReadableDatabase $dbr
	 * @param string[] $columns List of additional fields to get
	 * @param string[] $extra_tables List of additional tables to join
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	private function setupChangesIterator(
		IReadableDatabase $dbr,
		array $columns = [],
		array $extra_tables = [],
		$fname = __METHOD__
	) {
		// Merging an empty list is a no-op, so no emptiness check is needed.
		$tables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );
		$it = new BatchRowIterator( $dbr,
			$tables,
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addJoinConditions(
			[
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
				],
				'category' => [
					'LEFT JOIN', [ 'cat_title = rc_title' ]
				]
			]
		);
		$it->setFetchColumns( array_merge( $columns, [
			'rc_title',
			'rc_cur_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] ) );
		$it->setCaller( $fname );
		return $it;
	}

	/**
	 * Fetch newly created categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], [], $fname );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 1,
		] );
		return $it;
	}

	/**
	 * Fetch moved categories
	 *
	 * Rows also carry the current page_title and page_namespace of the moved page.
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator(
			$dbr,
			[ 'page_title', 'page_namespace' ],
			[ 'page' ],
			$fname
		);
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'move',
			'rc_type' => RC_LOG,
		] );
		$it->addJoinConditions( [
			'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch deleted categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator( $dbr,
			'recentchanges',
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'delete',
			'rc_type' => RC_LOG,
			// We will fetch ones that do not have page record. If they do,
			// this means they were restored, thus restoring handler will pick it up.
			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		$it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
		$it->setCaller( $fname );
		return $it;
	}

	/**
	 * Fetch restored categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], [], $fname );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'restore',
			'rc_type' => RC_LOG,
			// We will only fetch ones that have page record
			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch categorization changes or edits
	 * @param IReadableDatabase $dbr
	 * @param int $type Value to match against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getChangedCatsIterator( IReadableDatabase $dbr, $type, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], [], $fname );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_type' => $type,
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Add timestamp limits to iterator
	 *
	 * The range is start-inclusive and end-exclusive.
	 * @param BatchRowIterator $it Iterator
	 * @param IReadableDatabase $dbr
	 */
	private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
		$it->addConditions( [
			$dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) ),
			$dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) ),
		] );
	}

	/**
	 * Need to force index, somehow on terbium the optimizer chooses wrong one
	 * @param BatchRowIterator $it
	 */
	private function addIndex( BatchRowIterator $it ) {
		$it->addOptions( [
			'USE INDEX' => [ 'recentchanges' => 'rc_new_name_timestamp' ]
		] );
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only 'subcat' links originating from the given page IDs are returned.
	 * The batches are flattened, so iteration yields individual rows.
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		$it->setCaller( $fname );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Get accumulated RDF.
	 *
	 * This drains the writer.
	 * @return string
	 */
	public function getRdf() {
		return $this->rdfWriter->drain();
	}

	/**
	 * Handle category deletes.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleDeletes( IReadableDatabase $dbr, $output ) {
		// This only does "true" deletes - i.e. those that the page stays deleted

		foreach ( $this->getDeletedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// This can produce duplicates, we don't care
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->rc_cur_id] = true;
			}
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
		}
	}

	/**
	 * Write category data to RDF.
	 *
	 * The page count passed on excludes subcategories and files.
	 * @param stdClass $row Database row with rc_title, pp_propname, cat_pages,
	 *  cat_subcats and cat_files fields
	 */
	private function writeCategoryData( $row ) {
		$this->categoriesRdf->writeCategoryData(
			$row->rc_title,
			$row->pp_propname === 'hiddencat',
			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
			(int)$row->cat_subcats
		);
	}

	/**
	 * Handle category moves.
	 *
	 * The title recorded in the change row is always scheduled for deletion.
	 * Fresh data is written under the current page title, unless the page was
	 * already processed or is no longer in the Category namespace.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleMoves( IReadableDatabase $dbr, $output ) {
		foreach ( $this->getMovedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}

				if ( $row->page_namespace != NS_CATEGORY ) {
					// If page was moved out of Category:, we'll just delete
					continue;
				}
				// From here on the row describes the category under its current title.
				$row->rc_title = $row->page_title;
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->page_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
		}
	}

	/**
	 * Handle category restores (undeletions).
	 *
	 * Only inserts are produced for restored categories, no deletes.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleRestores( IReadableDatabase $dbr, $output ) {
		fwrite( $output, "# Restores\n" );

		// This will only find those restores that were not deleted later.
		foreach ( $this->getRestoredCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( !$pages ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );

			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle newly created categories.
	 *
	 * Only inserts are produced for new categories, no deletes.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleAdds( IReadableDatabase $dbr, $output ) {
		fwrite( $output, "# Additions\n" );

		foreach ( $this->getNewCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( !$pages ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle edits for category texts
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleEdits( IReadableDatabase $dbr, $output ) {
		// Editing category can change hidden flag and add new parents.
		// TODO: it's pretty expensive to update all edited categories, and most edits
		// aren't actually interesting for us. Some way to know which are interesting?
		// We can capture recategorization on the next step, but not change in hidden status.

		foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// Note that on categorization event, cur_id points to
				// the child page, not the parent category!
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
		}
	}

	/**
	 * Handles categorization changes
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleCategorization( IReadableDatabase $dbr, $output ) {
		// Titles of parent categories already written during this call
		$processedTitle = [];

		// Categorization change can add new parents and change counts
		// for the parent category.

		foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ ) as $batch ) {
			/*
			 * Note that on categorization event, cur_id points to
			 * the child page, not the parent category!
			 * So we need to have a two-stage process, since we have ID from one
			 * category and title from another, and we need both for proper updates.
			 * TODO: For now, we do full update even though some data hasn't changed,
			 * e.g. parents for parent cat and counts for child cat.
			 */
			$childPages = [];
			$parentCats = [];
			foreach ( $batch as $row ) {
				$childPages[$row->rc_cur_id] = true;
				$parentCats[$row->rc_title] = true;
			}

			$pages = [];
			$deleteUrls = [];

			if ( $childPages ) {
				// Load child rows by ID
				$childRows = $dbr->newSelectQueryBuilder()
					->select( [
						'page_id',
						'rc_title' => 'page_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					] )
					->from( 'page' )
					->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
					->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
					->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $childRows as $row ) {
					if ( isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
						$this->processed[$row->page_id] = true;
					}
				}
			}

			if ( $parentCats ) {
				// Load parent rows by title
				$parentRows = $dbr->newSelectQueryBuilder()
					->select( [
						'page_id',
						'rc_title' => 'cat_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					] )
					->from( 'category' )
					->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
					->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
					// Array keys that look like integers are converted to int by PHP; cast back to string.
					->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentCats ) ) ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $parentRows as $row ) {
					if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					if ( isset( $processedTitle[$row->rc_title] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					// page_id comes from a LEFT JOIN, so it can be empty when the
					// category row has no matching page.
					// NOTE(review): such rows are written to RDF without a delete URL; if the
					// batch ends up with no delete URLs at all, getCategoriesUpdate() returns ""
					// without draining the writer, so this RDF stays buffered - confirm intended.
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
						$this->processed[$row->page_id] = true;
					}
					$processedTitle[$row->rc_title] = true;
				}
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
		}
	}
}
685 | |
// Name the class defined in this file as the maintenance class, then include the
// file named by RUN_MAINTENANCE_IF_MAIN (presumably the runner that executes it - confirm).
686 | $maintClass = CategoryChangesAsRdf::class; |
687 | require_once RUN_MAINTENANCE_IF_MAIN; |