MediaWiki master
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
23use Wikimedia\Purtle\RdfWriter;
24use Wikimedia\Purtle\TurtleRdfWriter;
26
27require_once __DIR__ . '/Maintenance.php';
28
39 private const SPARQL_INSERT = <<<SPARQL
40INSERT DATA {
41%s
42};
43
44SPARQL;
45
49 private const SPARQL_DELETE = <<<SPARQLD
50DELETE {
51?category ?x ?y
52} WHERE {
53 ?category ?x ?y
54 VALUES ?category {
55 %s
56 }
57};
58
59SPARQLD;
60
64 private $rdfWriter;
69 private $categoriesRdf;
70
71 private $startTS;
72 private $endTS;
73
79 protected $processed = [];
80
/**
 * Set up the script: description, default batch size and command line options.
 *
 * 'output' is optional; 'start' and 'end' are both required. All three
 * options take an argument.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or MediaWiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or MediaWiki format.',
		true,
		true,
		'e'
	);
}
94
/**
 * Initialize external service classes.
 *
 * Creates the Turtle writer and the CategoriesRdf helper that writes into it.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle format, so the Turtle writer is reused.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
103
/**
 * Do the actual work: emit SPARQL Update statements describing category
 * changes recorded in recentchanges between the 'start' and 'end' options.
 *
 * Output order: prefixes, deletes, moves, restores, additions, edits,
 * categorization changes, and finally the dump timestamp update.
 *
 * A timestamp older than the configured maximum RC age is only reported via
 * error(); processing continues afterwards. Failure to open the output
 * destination aborts the script.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	if ( (int)$now->getTimestamp( TS_UNIX ) - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( (int)$now->getTimestamp( TS_UNIX ) - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// fopen() returns false on failure; passing that to fwrite() would
		// raise a TypeError further down instead of a readable message.
		$this->fatalError( "Unable to open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle opened above; for php://stdout this closes only
	// our own copy of the descriptor.
	fclose( $output );
}
158
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA clause.
 *
 * @return string The INSERT statement, or an empty string when nothing was accumulated.
 */
private function getInsertRdf() {
	$rdfText = $this->getRdf();

	return $rdfText ? sprintf( self::SPARQL_INSERT, $rdfText ) : "";
}
170
/**
 * Build a DELETE + INSERT update for a set of categories.
 *
 * @param IReadableDatabase $dbr Database used to look up parent categories
 * @param string[] $deleteUrls Category URIs (already wrapped in <>) to delete
 * @param string[] $pages Map of page ID => title whose parent links get written
 * @param string $mark Label placed in a leading "# ..." comment line
 * @return string SPARQL text, or an empty string when $deleteUrls is empty
 */
private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( $deleteUrls ) {
		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );

		return "# $mark\n" . $deleteClause . $this->getInsertRdf();
	}

	return "";
}
191
/**
 * Write category link data for every parent link of the given pages.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $pages Map of page ID => title
 */
private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
	$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );

	foreach ( $links as $link ) {
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
203
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * Replaces any existing schema:dateModified value of the dump URI with the
 * given timestamp rendered as ISO 8601.
 *
 * NOTE(review): leading whitespace inside the heredoc was collapsed in the
 * text this was reviewed from; confirm it against expected-output fixtures.
 *
 * @param string|int $timestamp Timestamp in any format wfTimestamp() accepts
 * @return string SPARQL Update query
 */
public function updateTS( $timestamp ) {
	$subject = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
 $subject schema:dateModified ?o .
}
WHERE {
 $subject schema:dateModified ?o .
};
INSERT DATA {
 $subject schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
226
/**
 * Set up a batched iterator over recentchanges rows in the configured time
 * window, left-joined with page_props (hiddencat only) and category.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $columns Extra columns to fetch in addition to the defaults
 * @param string[] $extra_tables Extra tables to add to the query
 * @param string $fname Name of the calling method, passed to setCaller()
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IReadableDatabase $dbr,
	array $columns = [],
	array $extra_tables = [],
	$fname = __METHOD__
) {
	$tables = [ 'recentchanges', 'page_props', 'category' ];
	if ( $extra_tables ) {
		$tables = array_merge( $tables, $extra_tables );
	}
	$it = new BatchRowIterator( $dbr,
		$tables,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	// Both joins are LEFT JOINs, so pp_propname and the cat_* columns may be
	// NULL in the fetched rows.
	$it->addJoinConditions(
		[
			'page_props' => [
				'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
			],
			'category' => [
				'LEFT JOIN', [ 'cat_title = rc_title' ]
			]
		]
	);
	$it->setFetchColumns( array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] ) );
	$it->setCaller( $fname );
	return $it;
}
272
/**
 * Fetch newly created categories.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $conditions );

	return $iterator;
}
287
/**
 * Fetch moved categories.
 *
 * Rows additionally carry page_title and page_namespace of the page the
 * change row points at through rc_cur_id.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$iterator = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables, $fname );

	$moveConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$iterator->addConditions( $moveConditions );

	$pageJoin = [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	];
	$iterator->addJoinConditions( $pageJoin );

	$this->addIndex( $iterator );

	return $iterator;
}
313
/**
 * Fetch deleted categories.
 *
 * Only delete log entries whose page record no longer exists are returned.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$iterator = new BatchRowIterator( $dbr, 'recentchanges', [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $iterator, $dbr );

	$deleteConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Pages that still have a page record were restored afterwards;
		// those are left to the restore handler.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $deleteConditions );

	$this->addIndex( $iterator );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
	$iterator->setCaller( $fname );

	return $iterator;
}
342
/**
 * Fetch restored categories.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
	$restoreConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Only restores whose page record currently exists are of interest.
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $restoreConditions );
	$this->addIndex( $iterator );

	return $iterator;
}
363
/**
 * Fetch categorization changes or edits.
 *
 * @param IReadableDatabase $dbr
 * @param int $type Value matched against rc_type (callers here pass RC_EDIT or RC_CATEGORIZE)
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IReadableDatabase $dbr, $type, $fname ) {
	$changeConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $changeConditions );
	$this->addIndex( $iterator );

	return $iterator;
}
381
/**
 * Restrict the iterator to the window [startTS, endTS) on rc_timestamp.
 *
 * @param BatchRowIterator $it
 * @param IReadableDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
	// Lower bound is inclusive, upper bound is exclusive.
	$notBeforeStart = $dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) );
	$beforeEnd = $dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [ $notBeforeStart, $beforeEnd ] );
}
393
/**
 * Ask the query to use the rc_new_name_timestamp index on recentchanges.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'rc_new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $indexHint ] );
}
403
/**
 * Get iterator for links for categories.
 *
 * The batch iterator is wrapped in a RecursiveIteratorIterator before it is
 * returned.
 *
 * @param IReadableDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @param string $fname Name of the calling method
 * @return Traversable
 */
protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$batches->setCaller( $fname );

	return new RecursiveIteratorIterator( $batches );
}
426
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() returns
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
434
/**
 * Handle category deletes.
 *
 * Covers only "true" deletes, i.e. pages that stay deleted. Each batch is
 * written out as one update and its page IDs are marked as processed.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleDeletes( IReadableDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$deleteUrls = [];
		foreach ( $rows as $deleted ) {
			// Duplicate URLs are possible here and are harmless.
			$url = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			$deleteUrls[] = '<' . $url . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
		fwrite( $output, $update );
	}
}
453
/**
 * Write the RDF description of one category from a database row.
 *
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages, cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages has the subcategory and file counts subtracted before it is passed on.
	$pageCount = (int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData( $row->rc_title, $isHidden, $pageCount, $subcatCount );
}
466
/**
 * Handle category moves.
 *
 * The old title is always scheduled for deletion. New data is written only
 * when the page has not been processed yet and is still in the Category
 * namespace.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleMoves( IReadableDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$pages = [];
		$deleteUrls = [];

		foreach ( $rows as $moved ) {
			$oldUrl = $this->categoriesRdf->labelToUrl( $moved->rc_title );
			$deleteUrls[] = '<' . $oldUrl . '>';

			// Skip pages captured earlier, and pages moved out of
			// Category: (for those only the delete above applies).
			if (
				isset( $this->processed[$moved->rc_cur_id] ) ||
				$moved->page_namespace != NS_CATEGORY
			) {
				continue;
			}

			// Describe the category under its current title.
			$moved->rc_title = $moved->page_title;
			$this->writeCategoryData( $moved );
			$pages[$moved->rc_cur_id] = $moved->page_title;
			$this->processed[$moved->rc_cur_id] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
		fwrite( $output, $update );
	}
}
496
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker, then one INSERT per batch that contains at
 * least one page not processed before.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleRestores( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$pages = [];
		foreach ( $rows as $restored ) {
			if ( !isset( $this->processed[$restored->rc_cur_id] ) ) {
				$this->writeCategoryData( $restored );
				$pages[$restored->rc_cur_id] = $restored->rc_title;
				$this->processed[$restored->rc_cur_id] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
526
/**
 * Handle category additions.
 *
 * Writes a "# Additions" marker, then one INSERT per batch that contains at
 * least one page not processed before.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleAdds( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$pages = [];
		foreach ( $rows as $created ) {
			if ( !isset( $this->processed[$created->rc_cur_id] ) ) {
				$this->writeCategoryData( $created );
				$pages[$created->rc_cur_id] = $created->rc_title;
				$this->processed[$created->rc_cur_id] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
554
/**
 * Handle edits for category texts.
 *
 * Every edited category not processed before is deleted and re-inserted in
 * full, one update per batch.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleEdits( IReadableDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.
	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );

	foreach ( $batches as $rows ) {
		$pages = [];
		$deleteUrls = [];

		foreach ( $rows as $edited ) {
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			if ( !isset( $this->processed[$edited->rc_cur_id] ) ) {
				$this->writeCategoryData( $edited );
				$pages[$edited->rc_cur_id] = $edited->rc_title;
				$this->processed[$edited->rc_cur_id] = true;
				$editedUrl = $this->categoriesRdf->labelToUrl( $edited->rc_title );
				$deleteUrls[] = '<' . $editedUrl . '>';
			}
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" );
		fwrite( $output, $update );
	}
}
585
/**
 * Handles categorization changes.
 *
 * A categorization change can add new parents and change counts for the
 * parent category. In such a change row rc_cur_id is the child page while
 * rc_title is the parent category, so each batch is resolved in two stages:
 * child categories are loaded by page ID, parent categories by title.
 *
 * TODO: For now, we do full update even though some data hasn't changed,
 * e.g. parents for parent cat and counts for child cat.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleCategorization( IReadableDatabase $dbr, $output ) {
	// Parent titles already written by this call; kept across batches.
	$seenTitles = [];

	$batches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );

	foreach ( $batches as $rows ) {
		// Collect the two distinct key sets of this batch.
		$childIds = [];
		$parentTitles = [];
		foreach ( $rows as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$pages = [];
		$deleteUrls = [];

		if ( $childIds ) {
			// Stage 1: child rows, looked up by page ID.
			$childRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'page' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
				->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ] )
				->caller( __METHOD__ )->fetchResultSet();

			foreach ( $childRows as $child ) {
				if ( !isset( $this->processed[$child->page_id] ) ) {
					$this->writeCategoryData( $child );
					if ( $child->page_id ) {
						$childUrl = $this->categoriesRdf->labelToUrl( $child->rc_title );
						$pages[$child->page_id] = $child->rc_title;
						$deleteUrls[] = '<' . $childUrl . '>';
						$this->processed[$child->page_id] = true;
					}
				}
			}
		}

		if ( $parentTitles ) {
			// Stage 2: parent rows, looked up by title. page is LEFT JOINed,
			// so page_id may be empty for a category row without a page.
			$parentRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'category' )
				->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ] )
				->caller( __METHOD__ )->fetchResultSet();

			foreach ( $parentRows as $parent ) {
				$knownById = $parent->page_id && isset( $this->processed[$parent->page_id] );
				if ( $knownById || isset( $seenTitles[$parent->rc_title] ) ) {
					// We already captured this one before
					continue;
				}

				$this->writeCategoryData( $parent );
				if ( $parent->page_id ) {
					$parentUrl = $this->categoriesRdf->labelToUrl( $parent->rc_title );
					$pages[$parent->page_id] = $parent->rc_title;
					$deleteUrls[] = '<' . $parentUrl . '>';
					$this->processed[$parent->page_id] = true;
				}
				$seenTitles[$parent->rc_title] = true;
			}
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $update );
	}
}
684}
685
686$maintClass = CategoryChangesAsRdf::class;
687require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
const RC_LOG
Definition Defines.php:118
const RC_EDIT
Definition Defines.php:116
const NS_CATEGORY
Definition Defines.php:78
const RC_CATEGORIZE
Definition Defines.php:120
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addJoinConditions(array $conditions)
setFetchColumns(array $columns)
setCaller( $caller)
Use ->setCaller( METHOD ) to indicate which code is using this class.
addOptions(array $options)
Maintenance script to provide RDF representation of the recent changes in category tree.
initialize()
Initialize external service classes.
handleRestores(IReadableDatabase $dbr, $output)
handleMoves(IReadableDatabase $dbr, $output)
getNewCatsIterator(IReadableDatabase $dbr, $fname)
Fetch newly created categories.
execute()
Do the actual work.
getDeletedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch deleted categories.
handleCategorization(IReadableDatabase $dbr, $output)
Handles categorization changes.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getRestoredCatsIterator(IReadableDatabase $dbr, $fname)
Fetch restored categories.
getMovedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch moved categories.
handleEdits(IReadableDatabase $dbr, $output)
Handle edits for category texts.
getRdf()
Get accumulated RDF.
getChangedCatsIterator(IReadableDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
handleAdds(IReadableDatabase $dbr, $output)
handleDeletes(IReadableDatabase $dbr, $output)
Handle category deletes.
__construct()
Default constructor.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Library for creating and parsing MW-style timestamps.
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.
expr(string $field, string $op, $value)
See Expression::__construct()
timestamp( $ts=0)
Convert a timestamp in one of the formats accepted by ConvertibleTimestamp to the format used for inserting into timestamp fields in this DBMS.
const DB_REPLICA
Definition defines.php:26