MediaWiki master
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
12use Wikimedia\Purtle\RdfWriter;
13use Wikimedia\Purtle\TurtleRdfWriter;
15use Wikimedia\Timestamp\TimestampFormat as TS;
16
17// @codeCoverageIgnoreStart
18require_once __DIR__ . '/Maintenance.php';
19// @codeCoverageIgnoreEnd
20
/**
 * SPARQL Update template for inserting triples. The single %s placeholder
 * is filled through sprintf() with the RDF text to insert.
 */
private const SPARQL_INSERT = <<<'SPARQL'
INSERT DATA {
%s
};

SPARQL;
37
/**
 * SPARQL Update template that removes every triple whose subject is one of
 * the listed categories. The single %s placeholder is filled through
 * sprintf() with a space-separated list of category URLs in angle brackets.
 */
private const SPARQL_DELETE = <<<'SPARQLD'
DELETE {
?category ?x ?y
} WHERE {
 ?category ?x ?y
 VALUES ?category {
 %s
 }
};

SPARQLD;
52
/**
 * RDF writer that accumulates output until it is drained by getRdf().
 * Set in initialize() to a TurtleRdfWriter.
 */
private $rdfWriter;
/**
 * Categories RDF generator, constructed in initialize() around $rdfWriter.
 */
private $categoriesRdf;

/**
 * Start of the processed range (inclusive), as returned by
 * MWTimestamp::getTimestamp(); set in execute().
 */
private $startTS;
/**
 * End of the processed range (exclusive), as returned by
 * MWTimestamp::getTimestamp(); set in execute().
 */
private $endTS;

/**
 * List of processed page IDs, so we don't try to process same thing twice.
 * Keys are page IDs, values are always true.
 */
protected $processed = [];
74
/**
 * Declare the script description, the batch size and the command-line
 * options (--output, --start, --end).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// Optional destination; takes an argument.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	// Both ends of the time range are required and take an argument.
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or MediaWiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or MediaWiki format.',
		true,
		true,
		'e'
	);
}
88
/**
 * Initialize external service classes: the RDF writer and the categories
 * RDF generator that writes into it.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle, so the Turtle writer is reused.
	$writer = new TurtleRdfWriter();
	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
97
/**
 * Do the actual work: write a SPARQL Update script covering the category
 * changes between --start (inclusive) and --end (exclusive) to the output.
 *
 * The output is opened in 'wb' mode, so an existing file is overwritten.
 * The script aborts if the output cannot be opened, and the handle is
 * closed once all sections have been written (or on failure).
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$nowUnix = (int)( new MWTimestamp() )->getTimestamp( TS::UNIX );
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// NOTE(review): error() is called without a die code, so these two checks
	// look like warnings only and the run continues - confirm this is intended.
	if ( $nowUnix - (int)$startTS->getTimestamp( TS::UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $nowUnix - (int)$endTS->getTimestamp( TS::UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without this check every later fwrite() would receive false.
		$this->fatalError( "Unable to open output file $outFile for writing!" );
	}

	try {
		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );
	} finally {
		// Release the handle even if one of the handlers throws.
		fclose( $output );
	}
}
152
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA clause.
 *
 * @return string INSERT clause, or an empty string when nothing is pending
 */
private function getInsertRdf() {
	$pendingRdf = $this->getRdf();
	return $pendingRdf ? sprintf( self::SPARQL_INSERT, $pendingRdf ) : "";
}
164
/**
 * Build one SPARQL update section: a marker comment, a DELETE for the given
 * category URLs, and an INSERT with whatever RDF has been accumulated.
 *
 * @param IReadableDatabase $dbr Database used to look up parent categories
 * @param string[] $deleteUrls Category URLs (in angle brackets) to delete
 * @param string[] $pages Map of page ID => title whose parent links are written
 * @param string $mark Label placed in the leading "# ..." comment line
 * @return string Update text, or an empty string when $deleteUrls is empty
 */
private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
	// NOTE(review): on this early return the RDF writer is not drained, so any
	// data already written stays pending for the next drain - confirm intended.
	if ( !$deleteUrls ) {
		return "";
	}

	// Parent links must be written before the writer is drained below.
	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
185
/**
 * Write the parent-category links of the given pages to the RDF writer.
 *
 * @param IReadableDatabase $dbr Database to read category links from
 * @param string[] $pages Map of page ID => category title
 */
private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
	// An empty page list would put an empty ID list into the query condition;
	// there is nothing to write in that case, so skip the query entirely.
	if ( !$pages ) {
		return;
	}

	$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );
	foreach ( $links as $link ) {
		// cl_from is the child page ID, lt_title the parent category title.
		$this->categoriesRdf->writeCategoryLinkData( $pages[$link->cl_from], $link->lt_title );
	}
}
197
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * The result first deletes any existing schema:dateModified value of the
 * dump node and then inserts the new one as an xsd:dateTime literal.
 *
 * @param mixed $timestamp Timestamp in any format accepted by wfTimestamp()
 * @return string SPARQL Update text
 */
public function updateTS( $timestamp ) {
	$dumpNode = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS::ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
 {$dumpNode} schema:dateModified ?o .
}
WHERE {
 {$dumpNode} schema:dateModified ?o .
};
INSERT DATA {
 {$dumpNode} schema:dateModified "{$isoTime}"^^xsd:dateTime .
}

SPARQL;
}
220
/**
 * Create the batched iterator over recentchanges rows shared by the
 * change handlers, restricted to the configured timestamp range.
 *
 * Each row carries rc_title, rc_cur_id, the hiddencat page property (or
 * NULL) and the counters from the category table, plus $columns.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param string[] $columns Additional columns to select
 * @param string $fname Name of the calling method, for query attribution
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IReadableDatabase $dbr,
	array $columns,
	string $fname
) {
	$commonColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];

	$query = $dbr->newSelectQueryBuilder()
		->from( 'recentchanges' )
		->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] )
		->leftJoin( 'category', null, [ 'cat_title = rc_title' ] )
		->select( array_merge( $columns, $commonColumns ) )
		->caller( $fname );

	// NOTE(review): batches are keyed on rc_timestamp alone, which is presumably
	// not unique - confirm rows sharing a timestamp cannot be lost at a batch edge.
	$it = new BatchRowIterator( $dbr, $query, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $it, $dbr );
	return $it;
}
253
/**
 * Fetch newly created categories.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
	$newCats = $this->setupChangesIterator( $dbr, [], $fname );
	$creationConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => RecentChange::SRC_NEW,
	];
	$newCats->sqb->conds( $creationConds );
	return $newCats;
}
268
/**
 * Fetch moved categories.
 *
 * Rows additionally carry page_title and page_namespace of the page the
 * change's rc_cur_id points to, via an inner join on the page table.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$pageColumns = [ 'page_title', 'page_namespace' ];
	$movedCats = $this->setupChangesIterator( $dbr, $pageColumns, $fname );

	$builder = $movedCats->sqb;
	$builder->conds( [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => RecentChange::SRC_LOG,
		'rc_log_type' => 'move',
	] );
	$builder->join( 'page', null, 'rc_cur_id = page_id' );

	$this->addIndex( $movedCats );
	return $movedCats;
}
290
/**
 * Fetch deleted categories.
 *
 * Only deletions whose page no longer exists are returned; rows yield
 * rc_cur_id and rc_title.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$query = $dbr->newSelectQueryBuilder()
		->from( 'recentchanges' )
		->select( [ 'rc_cur_id', 'rc_title' ] )
		->where( [
			'rc_namespace' => NS_CATEGORY,
			'rc_source' => RecentChange::SRC_LOG,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'delete',
			// We will fetch ones that do not have page record. If they do,
			// this means they were restored, thus restoring handler will pick it up.
			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
		] )
		->caller( $fname );

	$it = new BatchRowIterator( $dbr, $query, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $it, $dbr );
	$this->addIndex( $it );
	return $it;
}
319
/**
 * Fetch restored categories.
 *
 * Only restores whose page currently exists are returned.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
	$restoreConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => RecentChange::SRC_LOG,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		// We will only fetch ones that have page record
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$restoredCats = $this->setupChangesIterator( $dbr, [], $fname );
	$restoredCats->sqb->conds( $restoreConds );
	$this->addIndex( $restoredCats );
	return $restoredCats;
}
339
/**
 * Fetch categorization changes or edits.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param string $source Value that rc_source must match
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IReadableDatabase $dbr, $source, $fname ) {
	$changedCats = $this->setupChangesIterator( $dbr, [], $fname );
	$sourceConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => $source,
	];
	$changedCats->sqb->conds( $sourceConds );
	$this->addIndex( $changedCats );
	return $changedCats;
}
356
/**
 * Restrict the iterator's query to the half-open range
 * [startTS, endTS) on rc_timestamp.
 *
 * @param BatchRowIterator $it Iterator whose query builder is extended
 * @param IReadableDatabase $dbr Database used to build the expressions
 */
private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
	$fromInclusive = $dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) );
	$untilExclusive = $dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) );
	$it->sqb->conds( [ $fromInclusive, $untilExclusive ] );
}
368
/**
 * Make the iterator's query use the rc_source_name_timestamp index on
 * the recentchanges table.
 *
 * @param BatchRowIterator $it Iterator whose query builder is extended
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'rc_source_name_timestamp' ];
	$it->sqb->options( [ 'USE INDEX' => $indexHint ] );
}
377
/**
 * Get iterator for links for categories.
 *
 * Yields individual rows (cl_from, lt_title) of 'subcat' category links
 * whose cl_from is one of the given page IDs; batches are flattened.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param int[] $ids Page IDs to fetch links for
 * @param string $fname Name of the calling method
 * @return Traversable
 */
protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$linkConds = [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	];
	$linksQuery = $dbr->newSelectQueryBuilder()
		->select( [ 'cl_from', 'lt_title' ] )
		->from( 'categorylinks' )
		->join( 'linktarget', null, 'cl_target_id=lt_id' )
		->where( $linkConds )
		->caller( $fname );

	$batches = new BatchRowIterator(
		$dbr,
		$linksQuery,
		[ 'cl_from', 'cl_target_id' ],
		$this->mBatchSize
	);
	// Flatten the per-batch arrays into a single stream of rows.
	return new RecursiveIteratorIterator( $batches );
}
405
/**
 * Get accumulated RDF by draining the writer.
 *
 * @return string
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();
	return $accumulated;
}
413
/**
 * Handle category deletes.
 *
 * Covers only "true" deletes, i.e. pages that stay deleted. One update
 * section labelled "Deletes" is written per batch, and every deleted page
 * ID is recorded in $processed.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param resource $output File handle to write to
 */
public function handleDeletes( IReadableDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $deletedRows ) {
		$staleUrls = [];
		foreach ( $deletedRows as $deleted ) {
			// Duplicate URLs are possible here and harmless.
			$staleUrls[] = '<' . $this->categoriesRdf->labelToUrl( $deleted->rc_title ) . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$section = $this->getCategoriesUpdate( $dbr, $staleUrls, [], "Deletes" );
		fwrite( $output, $section );
	}
}
432
/**
 * Write the data of one category row to the RDF generator.
 *
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files is passed as the page count.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
445
/**
 * Handle category moves.
 *
 * The old title of every move is always scheduled for deletion. The data of
 * the current title is re-inserted only if the page was not processed
 * before and still lives in the Category namespace.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param resource $output File handle to write to
 */
public function handleMoves( IReadableDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $movedRows ) {
		$currentTitles = [];
		$oldUrls = [];

		foreach ( $movedRows as $moved ) {
			$pageId = $moved->rc_cur_id;
			$oldUrls[] = '<' . $this->categoriesRdf->labelToUrl( $moved->rc_title ) . '>';

			// Skip pages captured earlier, and pages moved out of Category:
			// (for those the deletion above is all that is needed).
			$alreadyDone = isset( $this->processed[$pageId] );
			if ( $alreadyDone || $moved->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// Describe the category under its current title.
			$moved->rc_title = $moved->page_title;
			$this->writeCategoryData( $moved );
			$currentTitles[$pageId] = $moved->page_title;
			$this->processed[$pageId] = true;
		}

		$section = $this->getCategoriesUpdate( $dbr, $oldUrls, $currentTitles, "Moves" );
		fwrite( $output, $section );
	}
}
475
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker, then one INSERT per batch holding the data
 * and parent links of restored categories not processed before.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param resource $output File handle to write to
 */
public function handleRestores( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $restoredRows ) {
		$restoredTitles = [];
		foreach ( $restoredRows as $restored ) {
			$pageId = $restored->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$restoredTitles[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $restoredTitles ) {
			$this->writeParentCategories( $dbr, $restoredTitles );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
505
/**
 * Handle newly created categories.
 *
 * Writes a "# Additions" marker, then one INSERT per batch holding the data
 * and parent links of new categories not processed before.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param resource $output File handle to write to
 */
public function handleAdds( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $createdRows ) {
		$createdTitles = [];
		foreach ( $createdRows as $created ) {
			$pageId = $created->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $created );
				$createdTitles[$pageId] = $created->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $createdTitles ) {
			$this->writeParentCategories( $dbr, $createdTitles );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
533
/**
 * Handle edits for category texts.
 *
 * Every edited category not processed before is deleted and re-inserted in
 * an update section labelled "Edits".
 *
 * @param IReadableDatabase $dbr Database to query
 * @param resource $output File handle to write to
 */
public function handleEdits( IReadableDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.

	$batches = $this->getChangedCatsIterator( $dbr, RecentChange::SRC_EDIT, __METHOD__ );
	foreach ( $batches as $editedRows ) {
		$editedTitles = [];
		$staleUrls = [];

		foreach ( $editedRows as $edited ) {
			$pageId = $edited->rc_cur_id;
			if ( isset( $this->processed[$pageId] ) ) {
				// We already captured this one before
				continue;
			}

			$this->writeCategoryData( $edited );
			$editedTitles[$pageId] = $edited->rc_title;
			$this->processed[$pageId] = true;
			$staleUrls[] = '<' . $this->categoriesRdf->labelToUrl( $edited->rc_title ) . '>';
		}

		$section = $this->getCategoriesUpdate( $dbr, $staleUrls, $editedTitles, "Edits" );
		fwrite( $output, $section );
	}
}
564
/**
 * Handles categorization changes.
 *
 * A categorization change can add new parents to the child and change the
 * counts of the parent category. On such an event rc_cur_id points to the
 * child page while rc_title names the parent category, so each batch is
 * resolved in two stages: children by page ID, parents by title. Both are
 * then deleted and re-inserted in an update section labelled "Changes".
 *
 * TODO: For now, we do full update even though some data hasn't changed,
 * e.g. parents for parent cat and counts for child cat.
 *
 * @param IReadableDatabase $dbr Database to query
 * @param resource $output File handle to write to
 */
public function handleCategorization( IReadableDatabase $dbr, $output ) {
	// Parent category titles already written during this call.
	$seenTitles = [];

	$batches = $this->getChangedCatsIterator( $dbr, RecentChange::SRC_CATEGORIZE, __METHOD__ );
	foreach ( $batches as $changeRows ) {
		// Collect the distinct child page IDs and parent titles of the batch.
		$childIds = [];
		$parentTitles = [];
		foreach ( $changeRows as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$pages = [];
		$deleteUrls = [];

		if ( $childIds ) {
			// Stage 1: load child rows by ID; only category pages qualify.
			$childRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'page' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
				->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ] )
				->caller( __METHOD__ )->fetchResultSet();

			foreach ( $childRows as $child ) {
				if ( isset( $this->processed[$child->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $child );
				if ( $child->page_id ) {
					$pages[$child->page_id] = $child->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $child->rc_title ) . '>';
					$this->processed[$child->page_id] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Stage 2: load parent rows by title. The page join is a left join,
			// so page_id may be NULL for a category row without a page.
			// Titles are cast back to strings before being used in the condition.
			$parentRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'category' )
				->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ] )
				->caller( __METHOD__ )->fetchResultSet();

			foreach ( $parentRows as $parent ) {
				$doneById = $parent->page_id && isset( $this->processed[$parent->page_id] );
				if ( $doneById || isset( $seenTitles[$parent->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $parent );
				if ( $parent->page_id ) {
					$pages[$parent->page_id] = $parent->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $parent->rc_title ) . '>';
					$this->processed[$parent->page_id] = true;
				}
				$seenTitles[$parent->rc_title] = true;
			}
		}

		$section = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $section );
	}
}
663}
664
665// @codeCoverageIgnoreStart
666$maintClass = CategoryChangesAsRdf::class;
667require_once RUN_MAINTENANCE_IF_MAIN;
668// @codeCoverageIgnoreEnd
const NS_CATEGORY
Definition Defines.php:65
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DB_REPLICA
Definition defines.php:26
Maintenance script to provide RDF representation of the recent changes in category tree.
initialize()
Initialize external service classes.
handleRestores(IReadableDatabase $dbr, $output)
handleMoves(IReadableDatabase $dbr, $output)
getNewCatsIterator(IReadableDatabase $dbr, $fname)
Fetch newly created categories.
execute()
Do the actual work.
getDeletedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch deleted categories.
getChangedCatsIterator(IReadableDatabase $dbr, $source, $fname)
Fetch categorization changes or edits.
handleCategorization(IReadableDatabase $dbr, $output)
Handles categorization changes.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getRestoredCatsIterator(IReadableDatabase $dbr, $fname)
Fetch restored categories.
getMovedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch moved categories.
handleEdits(IReadableDatabase $dbr, $output)
Handle edits for category texts.
getRdf()
Get accumulated RDF.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
handleAdds(IReadableDatabase $dbr, $output)
handleDeletes(IReadableDatabase $dbr, $output)
Handle category deletes.
__construct()
Default constructor.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
Utility class for creating and reading rows in the recentchanges table.
Allows iterating a large number of rows in batches transparently.
Library for creating and parsing MW-style timestamps.
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.
expr(string $field, string $op, $value)
See Expression::__construct()
timestamp( $ts=0)
Convert a timestamp in one of the formats accepted by ConvertibleTimestamp to the format used for ins...
$source