MediaWiki master
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
10use Wikimedia\Purtle\RdfWriter;
11use Wikimedia\Purtle\TurtleRdfWriter;
13use Wikimedia\Timestamp\TimestampFormat as TS;
14
15// @codeCoverageIgnoreStart
16require_once __DIR__ . '/Maintenance.php';
17// @codeCoverageIgnoreEnd
18
/**
 * SPARQL Update template for inserting triples.
 * The single %s placeholder is filled through sprintf() with the RDF text
 * drained from the writer (see getInsertRdf()).
 */
private const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;
35
/**
 * SPARQL Update template for deleting every triple whose subject is one of
 * the listed categories.
 * The single %s placeholder is filled through sprintf() with a
 * space-separated list of <url> terms (see getCategoriesUpdate()).
 */
private const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
 ?category ?x ?y
 VALUES ?category {
 %s
 }
};

SPARQLD;
50
/**
 * Writer that accumulates RDF text; its content is fetched with getRdf().
 * Set in initialize().
 * @var RdfWriter
 */
private $rdfWriter;
/**
 * Category data helper that writes into $rdfWriter.
 * Set in initialize().
 * @var CategoriesRdf
 */
private $categoriesRdf;

/**
 * Start of the processed interval (inclusive), as returned by
 * MWTimestamp::getTimestamp() with its default format. Set in execute().
 * @var string
 */
private $startTS;
/**
 * End of the processed interval (exclusive), as returned by
 * MWTimestamp::getTimestamp() with its default format. Set in execute().
 * @var string
 */
private $endTS;

/**
 * List of processed page IDs, so we don't try to process same thing twice.
 * Keys are page IDs, values are always true.
 * @var true[]
 */
protected $processed = [];
72
/**
 * Default constructor.
 *
 * Registers the script description, the batch size and the command line
 * options: an optional output file and the mandatory start/end timestamps.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// Optional, takes an argument.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);

	// Both interval bounds are required and take an argument.
	$intervalOptions = [
		[ 'start', 'Starting timestamp (inclusive), in ISO or MediaWiki format.', 's' ],
		[ 'end', 'Ending timestamp (exclusive), in ISO or MediaWiki format.', 'e' ],
	];
	foreach ( $intervalOptions as [ $optName, $optDescription, $optShortName ] ) {
		$this->addOption( $optName, $optDescription, true, true, $optShortName );
	}
}
86
/**
 * Initialize external service classes.
 *
 * Creates the RDF writer and the categories helper that writes into it.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
	$turtleWriter = new TurtleRdfWriter();

	$this->rdfWriter = $turtleWriter;
	$this->categoriesRdf = new CategoriesRdf( $turtleWriter );
}
95
/**
 * Do the actual work.
 *
 * Reads the "start", "end" and "output" options, writes the prefix
 * declarations, then the SPARQL Update statements for deletes, moves,
 * restores, additions, edits and categorization changes (in that order),
 * and finally the statement that updates the dump timestamp.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	// Evaluate "now" once so both age checks use the same reference point.
	$nowUnix = (int)( new MWTimestamp() )->getTimestamp( TS::UNIX );
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// error() is called without a die code here, so these two checks only
	// report the problem; processing continues afterwards.
	if ( $nowUnix - (int)$startTS->getTimestamp( TS::UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $nowUnix - (int)$endTS->getTimestamp( TS::UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without this check every fwrite() below would receive false
		// instead of a stream resource.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Flush and release the handle explicitly instead of relying on shutdown.
	fclose( $output );
}
150
/**
 * Wrap the RDF accumulated so far into a SPARQL INSERT DATA statement.
 *
 * Fetching the RDF goes through getRdf(), which drains the writer.
 *
 * @return string SPARQL statement, or an empty string when there is no RDF
 */
private function getInsertRdf() {
	$pendingRdf = $this->getRdf();

	return $pendingRdf ? sprintf( self::SPARQL_INSERT, $pendingRdf ) : "";
}
162
/**
 * Build the SPARQL for one batch of category updates: a comment line with
 * the batch label, a DELETE for the given URLs, and an INSERT with the
 * accumulated RDF.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $deleteUrls Category URL terms whose triples are deleted
 * @param string[] $pages Map of page ID => category title whose parent links are written
 * @param string $mark Label placed in the leading "# ..." comment
 * @return string SPARQL text, or an empty string when $deleteUrls is empty
 */
private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		// Nothing to delete means nothing is emitted for this batch.
		return "";
	}

	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteStatement = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertStatement = $this->getInsertRdf();

	return "# $mark\n" . $deleteStatement . $insertStatement;
}
183
/**
 * Write the parent-category links of the given categories to the RDF writer.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $pages Map of page ID => category title
 */
private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds, __METHOD__ );

	foreach ( $links as $link ) {
		// cl_from is the ID of the child page; look its title up in $pages.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->lt_title );
	}
}
195
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * The statement deletes any existing schema:dateModified value of the dump
 * URI and inserts the given timestamp as an xsd:dateTime literal.
 *
 * @param string|int $timestamp Timestamp in any format accepted by wfTimestamp()
 * @return string SPARQL Update text
 */
public function updateTS( $timestamp ) {
	$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$ts = wfTimestamp( TS::ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
 $dumpUrl schema:dateModified ?o .
}
WHERE {
 $dumpUrl schema:dateModified ?o .
};
INSERT DATA {
 $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
}
218
/**
 * Set up a batched iterator over recentchanges rows inside the processed
 * time interval, joined with the hidden-category page property and the
 * category counters.
 *
 * Callers add their own conditions through the iterator's public query
 * builder ($it->sqb).
 *
 * @param IReadableDatabase $dbr
 * @param string[] $columns Extra columns to select in addition to the default ones
 * @param string $fname Name of the calling method, recorded on the query
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IReadableDatabase $dbr,
	array $columns,
	string $fname
) {
	$sqb = $dbr->newSelectQueryBuilder()
		->from( 'recentchanges' )
		->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] )
		->leftJoin( 'category', null, [ 'cat_title = rc_title' ] )
		->select( array_merge( $columns, [
			'rc_title',
			'rc_cur_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] ) )
		->caller( $fname );

	// NOTE(review): batching is keyed on rc_timestamp only, which is not
	// necessarily unique - confirm how BatchRowIterator treats rows that
	// share a timestamp across a batch boundary.
	$it = new BatchRowIterator(
		$dbr,
		$sqb,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	return $it;
}
251
/**
 * Fetch newly created categories.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
	$creationConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => RecentChange::SRC_NEW,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], $fname );
	$iterator->sqb->conds( $creationConds );

	return $iterator;
}
266
/**
 * Fetch moved categories.
 *
 * Rows additionally carry page_title and page_namespace of the page that
 * rc_cur_id currently points to.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$currentPageColumns = [ 'page_title', 'page_namespace' ];
	$moveLogConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => RecentChange::SRC_LOG,
		'rc_log_type' => 'move',
	];

	$iterator = $this->setupChangesIterator( $dbr, $currentPageColumns, $fname );
	$iterator->sqb->conds( $moveLogConds );
	// Inner join: only moves whose page row still exists are returned.
	$iterator->sqb->join( 'page', null, 'rc_cur_id = page_id' );
	$this->addIndex( $iterator );

	return $iterator;
}
288
/**
 * Fetch deleted categories.
 *
 * Only deletions whose page row no longer exists are returned.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$sqb = $dbr->newSelectQueryBuilder()
		->from( 'recentchanges' )
		->select( [ 'rc_cur_id', 'rc_title' ] )
		->where( [
			'rc_namespace' => NS_CATEGORY,
			'rc_source' => RecentChange::SRC_LOG,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'delete',
			// We will fetch ones that do not have page record. If they do,
			// this means they were restored, thus restoring handler will pick it up.
			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
		] )
		->caller( $fname );

	$it = new BatchRowIterator(
		$dbr,
		$sqb,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$this->addIndex( $it );
	return $it;
}
317
/**
 * Fetch restored categories.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
	$restoreLogConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => RecentChange::SRC_LOG,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		// We will only fetch ones that have page record
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr, [], $fname );
	$iterator->sqb->conds( $restoreLogConds );
	$this->addIndex( $iterator );

	return $iterator;
}
337
/**
 * Fetch categorization changes or edits.
 *
 * @param IReadableDatabase $dbr
 * @param string $source Value matched against rc_source (a RecentChange::SRC_* constant)
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IReadableDatabase $dbr, $source, $fname ) {
	$sourceConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_source' => $source,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], $fname );
	$iterator->sqb->conds( $sourceConds );
	$this->addIndex( $iterator );

	return $iterator;
}
354
/**
 * Restrict the iterator's query to the processed interval:
 * start timestamp inclusive, end timestamp exclusive.
 *
 * @param BatchRowIterator $it
 * @param IReadableDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
	$notBeforeStart = $dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) );
	$beforeEnd = $dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) );

	$it->sqb->conds( [ $notBeforeStart, $beforeEnd ] );
}
366
/**
 * Request the rc_source_name_timestamp index on recentchanges for the
 * iterator's query.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'rc_source_name_timestamp' ];

	$it->sqb->options( [ 'USE INDEX' => $indexHint ] );
}
375
/**
 * Get iterator for links for categories.
 *
 * Only links of type "subcat" are returned. Each row carries cl_from
 * (the linking page ID) and lt_title (the title of the link target).
 *
 * @param IReadableDatabase $dbr
 * @param int[] $ids List of page IDs
 * @param string $fname Name of the calling method
 * @return Traversable Flattened iterator over the individual rows of all batches
 */
protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$subcatLinkConds = [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	];

	$linksQuery = $dbr->newSelectQueryBuilder()
		->select( [ 'cl_from', 'lt_title' ] )
		->from( 'categorylinks' )
		->join( 'linktarget', null, 'cl_target_id=lt_id' )
		->where( $subcatLinkConds )
		->caller( $fname );

	$batches = new BatchRowIterator(
		$dbr,
		$linksQuery,
		[ 'cl_from', 'cl_target_id' ],
		$this->mBatchSize
	);

	// The recursive wrapper flattens the batches into a single stream of rows.
	return new RecursiveIteratorIterator( $batches );
}
403
/**
 * Get accumulated RDF.
 *
 * The text is obtained by draining the RDF writer.
 *
 * @return string
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
411
/**
 * Handle category deletes.
 *
 * Writes one DELETE statement per batch and marks every deleted page ID as
 * processed so later handlers skip it.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IReadableDatabase $dbr, $output ) {
	// This only does "true" deletes - i.e. those that the page stays deleted
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $deletedRows ) {
		$staleUrls = [];
		foreach ( $deletedRows as $deleted ) {
			$categoryUrl = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			// This can produce duplicates, we don't care
			$staleUrls[] = '<' . $categoryUrl . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $staleUrls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
430
/**
 * Write the data of one category row to the RDF writer.
 *
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$remainingPageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$remainingPageCount,
		$subcatCount
	);
}
443
/**
 * Handle category moves.
 *
 * The old title is always deleted. Data for the new title is written only
 * when the page has not been processed yet and is still in the category
 * namespace.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleMoves( IReadableDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $movedRows ) {
		$titlesById = [];
		$staleUrls = [];

		foreach ( $movedRows as $moved ) {
			$pageId = $moved->rc_cur_id;
			$staleUrls[] = '<' . $this->categoriesRdf->labelToUrl( $moved->rc_title ) . '>';

			// Skip rows we already captured before, and pages moved out of
			// Category: - for those we'll just delete.
			$deleteOnly = isset( $this->processed[$pageId] )
				|| $moved->page_namespace != NS_CATEGORY;
			if ( $deleteOnly ) {
				continue;
			}

			// The RC row has the old title; write data under the current one.
			$moved->rc_title = $moved->page_title;
			$this->writeCategoryData( $moved );
			$titlesById[$pageId] = $moved->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $staleUrls, $titlesById, "Moves" );
		fwrite( $output, $sparql );
	}
}
473
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker, then one INSERT per batch that contains at
 * least one category not processed before.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleRestores( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $restoredRows ) {
		$titlesById = [];

		foreach ( $restoredRows as $restored ) {
			$pageId = $restored->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$titlesById[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $titlesById ) {
			$this->writeParentCategories( $dbr, $titlesById );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
503
/**
 * Handle category additions.
 *
 * Writes a "# Additions" marker, then one INSERT per batch that contains at
 * least one category not processed before.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleAdds( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $createdRows ) {
		$titlesById = [];

		foreach ( $createdRows as $created ) {
			$pageId = $created->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $created );
				$titlesById[$pageId] = $created->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $titlesById ) {
			$this->writeParentCategories( $dbr, $titlesById );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
531
/**
 * Handle edits for category texts.
 *
 * Every edited category that has not been processed before gets its triples
 * deleted and rewritten.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleEdits( IReadableDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.
	$batches = $this->getChangedCatsIterator( $dbr, RecentChange::SRC_EDIT, __METHOD__ );

	foreach ( $batches as $editedRows ) {
		$titlesById = [];
		$staleUrls = [];

		foreach ( $editedRows as $edited ) {
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			$pageId = $edited->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $edited );
				$titlesById[$pageId] = $edited->rc_title;
				$this->processed[$pageId] = true;
				$staleUrls[] = '<' . $this->categoriesRdf->labelToUrl( $edited->rc_title ) . '>';
			}
			// Otherwise we already captured this one before.
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $staleUrls, $titlesById, "Edits" );
		fwrite( $output, $sparql );
	}
}
562
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child categories
 * (looked up by page ID) and the parent categories (looked up by title)
 * are rewritten, unless they were already processed.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleCategorization( IReadableDatabase $dbr, $output ) {
	// Parent titles already written during this call. Needed in addition to
	// $this->processed because parent rows may have no page ID.
	$seenParentTitles = [];

	// Categorization change can add new parents and change counts
	// for the parent category.
	$batches = $this->getChangedCatsIterator( $dbr, RecentChange::SRC_CATEGORIZE, __METHOD__ );

	foreach ( $batches as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childIds = [];
		$parentTitles = [];
		foreach ( $batch as $event ) {
			$childIds[$event->rc_cur_id] = true;
			$parentTitles[$event->rc_title] = true;
		}

		$titlesById = [];
		$staleUrls = [];

		if ( $childIds ) {
			// Load child rows by ID
			$childRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'page' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
				->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ] )
				->caller( __METHOD__ )
				->fetchResultSet();

			foreach ( $childRows as $child ) {
				$childId = $child->page_id;
				if ( isset( $this->processed[$childId] ) ) {
					// We already captured this one before
					continue;
				}

				$this->writeCategoryData( $child );
				if ( $childId ) {
					$titlesById[$childId] = $child->rc_title;
					$staleUrls[] = '<' . $this->categoriesRdf->labelToUrl( $child->rc_title ) . '>';
					$this->processed[$childId] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Load parent rows by title
			$parentRows = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'category' )
				->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				// Array keys that look like integers come back as ints, so
				// they are cast to strings again for the title condition.
				->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ] )
				->caller( __METHOD__ )
				->fetchResultSet();

			foreach ( $parentRows as $parent ) {
				$parentId = $parent->page_id;
				$parentTitle = $parent->rc_title;

				// We already captured this one before, either by page ID or by title
				$alreadyCaptured = ( $parentId && isset( $this->processed[$parentId] ) )
					|| isset( $seenParentTitles[$parentTitle] );
				if ( $alreadyCaptured ) {
					continue;
				}

				$this->writeCategoryData( $parent );
				if ( $parentId ) {
					$titlesById[$parentId] = $parentTitle;
					$staleUrls[] = '<' . $this->categoriesRdf->labelToUrl( $parentTitle ) . '>';
					$this->processed[$parentId] = true;
				}
				$seenParentTitles[$parentTitle] = true;
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $staleUrls, $titlesById, "Changes" );
		fwrite( $output, $sparql );
	}
}
661}
662
// @codeCoverageIgnoreStart
// Name the maintenance class defined in this file, then include the
// RUN_MAINTENANCE_IF_MAIN entry point.
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd
const NS_CATEGORY
Definition Defines.php:65
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DB_REPLICA
Definition defines.php:26
Allows iterating a large number of rows in batches transparently.
Maintenance script to provide RDF representation of the recent changes in category tree.
initialize()
Initialize external service classes.
handleRestores(IReadableDatabase $dbr, $output)
handleMoves(IReadableDatabase $dbr, $output)
getNewCatsIterator(IReadableDatabase $dbr, $fname)
Fetch newly created categories.
execute()
Do the actual work.
getDeletedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch deleted categories.
getChangedCatsIterator(IReadableDatabase $dbr, $source, $fname)
Fetch categorization changes or edits.
handleCategorization(IReadableDatabase $dbr, $output)
Handles categorization changes.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getRestoredCatsIterator(IReadableDatabase $dbr, $fname)
Fetch restored categories.
getMovedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch moved categories.
handleEdits(IReadableDatabase $dbr, $output)
Handle edits for category texts.
getRdf()
Get accumulated RDF.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
handleAdds(IReadableDatabase $dbr, $output)
handleDeletes(IReadableDatabase $dbr, $output)
Handle category deletes.
__construct()
Default constructor.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
Library for creating and parsing MW-style timestamps.
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.
expr(string $field, string $op, $value)
See Expression::__construct()
timestamp( $ts=0)
Convert a timestamp in one of the formats accepted by ConvertibleTimestamp to the format used for ins...
$source