MediaWiki master
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
24use Wikimedia\Purtle\RdfWriter;
25use Wikimedia\Purtle\TurtleRdfWriter;
27
28// @codeCoverageIgnoreStart
29require_once __DIR__ . '/Maintenance.php';
30// @codeCoverageIgnoreEnd
31
42 private const SPARQL_INSERT = <<<SPARQL
43INSERT DATA {
44%s
45};
46
47SPARQL;
48
52 private const SPARQL_DELETE = <<<SPARQLD
53DELETE {
54?category ?x ?y
55} WHERE {
56 ?category ?x ?y
57 VALUES ?category {
58 %s
59 }
60};
61
62SPARQLD;
63
67 private $rdfWriter;
72 private $categoriesRdf;
73
75 private $startTS;
77 private $endTS;
78
84 protected $processed = [];
85
/**
 * Set up the script: description, batch size and command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );

	$this->setBatchSize( 200 );

	// Arguments after the description: required, withArg, shortName.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or MediaWiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or MediaWiki format.',
		true,
		true,
		'e'
	);
}
99
/**
 * Create the RDF writer and the categories RDF helper that shares it.
 */
public function initialize() {
	// A Turtle writer is sufficient because SPARQL Update syntax is close to Turtle.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
108
/**
 * Produce the SPARQL Update script for all category changes between the
 * 'start' (inclusive) and 'end' (exclusive) timestamps.
 *
 * Writes, in order: prefixes, deletes, moves, restores, additions, edits,
 * categorization changes and finally the dump timestamp update.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// Both checks only report the problem; processing continues afterwards.
	$nowUnix = (int)$now->getTimestamp( TS_UNIX );
	if ( $nowUnix - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $nowUnix - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without this guard every fwrite() below would be handed `false`.
		$this->fatalError( "Unable to open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle opened above so buffered data is flushed deterministically.
	fclose( $output );
}
163
/**
 * Drain the accumulated RDF and wrap it in a SPARQL INSERT DATA clause.
 *
 * @return string The insert clause, or an empty string when nothing was accumulated
 */
private function getInsertRdf() {
	$triples = $this->getRdf();

	return $triples ? sprintf( self::SPARQL_INSERT, $triples ) : "";
}
175
/**
 * Build a "delete then insert" SPARQL update for a set of categories.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $deleteUrls Bracketed category URLs whose triples get deleted
 * @param string[] $pages Page ID => title map used to write parent category links
 * @param string $mark Label emitted as a leading "# ..." comment line
 * @return string SPARQL text, or an empty string when there is nothing to delete
 */
private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		// Early exit: note that the RDF writer is not drained on this path.
		return "";
	}

	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
196
/**
 * Write the parent-category link triples for the given pages into the RDF writer.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $pages Page ID => title map
 */
private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds, __METHOD__ );

	foreach ( $links as $link ) {
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
208
/**
 * Generate the SPARQL Update statements that replace the dump's
 * schema:dateModified value with the given timestamp.
 *
 * @param string|int $timestamp Timestamp in any format accepted by wfTimestamp()
 * @return string SPARQL Update query
 */
public function updateTS( $timestamp ) {
	$subject = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );

	// NOTE(review): leading whitespace inside the heredoc is reproduced as it
	// appears in the extracted listing — confirm against the repository copy.
	return <<<SPARQL
DELETE {
 $subject schema:dateModified ?o .
}
WHERE {
 $subject schema:dateModified ?o .
};
INSERT DATA {
 $subject schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
231
/**
 * Build the batch iterator shared by the recentchanges-based queries.
 *
 * Selects the requested extra columns plus the title, page ID, hidden-category
 * property and category counters, restricted to the configured time window.
 *
 * @param IReadableDatabase $dbr
 * @param string[] $columns Extra columns to select
 * @param string $fname Name of the calling method, recorded on the query
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IReadableDatabase $dbr,
	array $columns,
	string $fname
) {
	$it = new BatchRowIterator( $dbr,
		$dbr->newSelectQueryBuilder()
			->from( 'recentchanges' )
			->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] )
			->leftJoin( 'category', null, [ 'cat_title = rc_title' ] )
			->select( array_merge( $columns, [
				'rc_title',
				'rc_cur_id',
				'pp_propname',
				'cat_pages',
				'cat_subcats',
				'cat_files'
			] ) )
			->caller( $fname ),
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	return $it;
}
264
/**
 * Fetch newly created categories.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
	$iterator = $this->setupChangesIterator( $dbr, [], $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];
	$iterator->sqb->conds( $conditions );

	return $iterator;
}
279
/**
 * Fetch moved categories, joined with the current page row so the
 * present title and namespace are available.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$pageColumns = [ 'page_title', 'page_namespace' ];
	$iterator = $this->setupChangesIterator( $dbr, $pageColumns, $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$iterator->sqb->conds( $conditions );
	$iterator->sqb->join( 'page', null, 'rc_cur_id = page_id' );
	$this->addIndex( $iterator );

	return $iterator;
}
302
/**
 * Fetch deleted categories that no longer have a page record.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method, recorded on the query
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
	$it = new BatchRowIterator( $dbr,
		$dbr->newSelectQueryBuilder()
			->from( 'recentchanges' )
			->select( [ 'rc_cur_id', 'rc_title' ] )
			->where( [
				'rc_namespace' => NS_CATEGORY,
				'rc_new' => 0,
				'rc_log_type' => 'delete',
				'rc_log_action' => 'delete',
				'rc_type' => RC_LOG,
				// We will fetch ones that do not have page record. If they do,
				// this means they were restored, thus restoring handler will pick it up.
				'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
			] )
			->caller( $fname ),
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$this->addIndex( $it );
	return $it;
}
332
/**
 * Fetch restored categories that currently have a page record.
 *
 * @param IReadableDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
	$iterator = $this->setupChangesIterator( $dbr, [], $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to rows whose page record exists.
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->sqb->conds( $conditions );
	$this->addIndex( $iterator );

	return $iterator;
}
353
/**
 * Fetch categorization changes or edits, depending on $type.
 *
 * @param IReadableDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IReadableDatabase $dbr, $type, $fname ) {
	$iterator = $this->setupChangesIterator( $dbr, [], $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];
	$iterator->sqb->conds( $conditions );
	$this->addIndex( $iterator );

	return $iterator;
}
371
/**
 * Restrict the iterator's query to the [startTS, endTS) time window.
 *
 * @param BatchRowIterator $it
 * @param IReadableDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
	// Lower bound is inclusive, upper bound is exclusive.
	$notBefore = $dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) );
	$before = $dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) );

	$it->sqb->conds( [ $notBefore, $before ] );
}
383
/**
 * Ask the query to use the rc_new_name_timestamp index on recentchanges.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'rc_new_name_timestamp' ];

	$it->sqb->options( [ 'USE INDEX' => $indexHint ] );
}
392
/**
 * Get an iterator over the subcategory links of the given pages.
 *
 * The batch iterator is wrapped in a RecursiveIteratorIterator so that
 * callers receive individual rows instead of batches.
 *
 * @param IReadableDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @param string $fname Name of the calling method, recorded on the query
 * @return Traversable Rows exposing cl_from and cl_to
 */
protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$it = new BatchRowIterator(
		$dbr,
		$dbr->newSelectQueryBuilder()
			->from( 'categorylinks' )
			->select( [ 'cl_from', 'cl_to' ] )
			->where( [
				'cl_type' => 'subcat',
				'cl_from' => $ids
			] )
			->caller( $fname ),
		[ 'cl_from', 'cl_to' ],
		$this->mBatchSize
	);
	return new RecursiveIteratorIterator( $it );
}
416
/**
 * Get the accumulated RDF by draining the writer.
 *
 * @return string
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
424
/**
 * Handle category deletes: emit a delete statement per batch and mark
 * the affected page IDs as processed.
 *
 * Only covers "true" deletes, i.e. pages that stayed deleted.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleDeletes( IReadableDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$urls = [];
		foreach ( $rows as $deleted ) {
			// Duplicate URLs are possible here and are harmless.
			$url = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			$urls[] = '<' . $url . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $update );
	}
}
443
/**
 * Write the category data for one result row into the RDF writer.
 *
 * @param stdClass $row Row exposing rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// Total members minus subcategories and files.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
456
/**
 * Handle category moves: delete the data under the old title and, when the
 * page is still a category, write its data under the current title.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleMoves( IReadableDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$movedPages = [];
		$oldUrls = [];

		foreach ( $rows as $row ) {
			// The old title is always scheduled for deletion.
			$oldUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

			$pageId = $row->rc_cur_id;
			// Skip pages captured earlier, and pages moved out of Category:
			// (for those the delete above is all that is needed).
			if ( isset( $this->processed[$pageId] ) || $row->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// Describe the category under its current title.
			$row->rc_title = $row->page_title;
			$this->writeCategoryData( $row );
			$movedPages[$pageId] = $row->page_title;
			$this->processed[$pageId] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $oldUrls, $movedPages, "Moves" );
		fwrite( $output, $update );
	}
}
486
/**
 * Handle category restores: insert data for restored categories that
 * were not already processed.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleRestores( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// Only restores that were not followed by another delete are returned.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$restored = [];

		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$restored[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $restored ) {
			$this->writeParentCategories( $dbr, $restored );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
516
/**
 * Handle newly created categories: insert data for those that
 * were not already processed.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleAdds( IReadableDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$added = [];

		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$added[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $added ) {
			$this->writeParentCategories( $dbr, $added );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
544
/**
 * Handle edits of category pages by fully re-writing each edited category.
 *
 * Editing a category can change its hidden flag and add new parents.
 * TODO: updating every edited category is expensive and most edits are not
 * interesting here. Recategorization is captured by the categorization
 * step, but a change in hidden status is not.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleEdits( IReadableDatabase $dbr, $output ) {
	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );

	foreach ( $batches as $rows ) {
		$editedPages = [];
		$urls = [];

		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( isset( $this->processed[$pageId] ) ) {
				// Captured by an earlier step.
				continue;
			}

			$this->writeCategoryData( $row );
			$editedPages[$pageId] = $row->rc_title;
			$this->processed[$pageId] = true;
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, $editedPages, "Edits" );
		fwrite( $output, $update );
	}
}
575
/**
 * Handle categorization changes, which can add new parents and change
 * the counts of the parent category.
 *
 * On a categorization event rc_cur_id points to the child page while
 * rc_title names the parent category, so each batch is resolved in two
 * stages: children are loaded by page ID, parents by title.
 * TODO: a full update is done even though some data has not changed,
 * e.g. parents for the parent category and counts for the child category.
 *
 * @param IReadableDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleCategorization( IReadableDatabase $dbr, $output ) {
	// Parent titles already written, tracked across all batches.
	$seenTitles = [];

	$batches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );

	foreach ( $batches as $rows ) {
		$childIds = [];
		$parentTitles = [];
		foreach ( $rows as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$pages = [];
		$deleteUrls = [];

		if ( $childIds ) {
			// Stage 1: child pages, looked up by ID and limited to categories.
			$childQuery = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'page' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
				->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ] )
				->caller( __METHOD__ );

			foreach ( $childQuery->fetchResultSet() as $child ) {
				if ( isset( $this->processed[$child->page_id] ) ) {
					// Captured by an earlier step.
					continue;
				}
				$this->writeCategoryData( $child );
				if ( $child->page_id ) {
					$pages[$child->page_id] = $child->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $child->rc_title ) . '>';
					$this->processed[$child->page_id] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Stage 2: parent categories, looked up by title.
			// Array keys are cast back to strings before being used as titles.
			$titles = array_map( 'strval', array_keys( $parentTitles ) );
			$parentQuery = $dbr->newSelectQueryBuilder()
				->select( [
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				] )
				->from( 'category' )
				->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->where( [ 'cat_title' => $titles ] )
				->caller( __METHOD__ );

			foreach ( $parentQuery->fetchResultSet() as $parent ) {
				$pageDone = $parent->page_id && isset( $this->processed[$parent->page_id] );
				if ( $pageDone || isset( $seenTitles[$parent->rc_title] ) ) {
					// Captured earlier, either by page ID or by title.
					continue;
				}

				// NOTE(review): a parent without a page_id still gets its data
				// written to the RDF writer but adds no delete URL; if the whole
				// batch yields no delete URLs, getCategoriesUpdate() returns early
				// and that RDF stays buffered until the next drain — confirm intended.
				$this->writeCategoryData( $parent );
				if ( $parent->page_id ) {
					$pages[$parent->page_id] = $parent->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $parent->rc_title ) . '>';
					$this->processed[$parent->page_id] = true;
				}
				$seenTitles[$parent->rc_title] = true;
			}
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $update );
	}
}
674}
675
676// @codeCoverageIgnoreStart
677$maintClass = CategoryChangesAsRdf::class;
678require_once RUN_MAINTENANCE_IF_MAIN;
679// @codeCoverageIgnoreEnd
const RC_LOG
Definition Defines.php:119
const RC_EDIT
Definition Defines.php:117
const NS_CATEGORY
Definition Defines.php:79
const RC_CATEGORIZE
Definition Defines.php:121
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
Maintenance script to provide RDF representation of the recent changes in category tree.
initialize()
Initialize external service classes.
handleRestores(IReadableDatabase $dbr, $output)
handleMoves(IReadableDatabase $dbr, $output)
getNewCatsIterator(IReadableDatabase $dbr, $fname)
Fetch newly created categories.
execute()
Do the actual work.
getDeletedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch deleted categories.
handleCategorization(IReadableDatabase $dbr, $output)
Handles categorization changes.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getRestoredCatsIterator(IReadableDatabase $dbr, $fname)
Fetch restored categories.
getMovedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch moved categories.
handleEdits(IReadableDatabase $dbr, $output)
Handle edits for category texts.
getRdf()
Get accumulated RDF.
getChangedCatsIterator(IReadableDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
handleAdds(IReadableDatabase $dbr, $output)
handleDeletes(IReadableDatabase $dbr, $output)
Handle category deletes.
__construct()
Default constructor.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
Library for creating and parsing MW-style timestamps.
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.
expr(string $field, string $op, $value)
See Expression::__construct()
timestamp( $ts=0)
Convert a timestamp in one of the formats accepted by ConvertibleTimestamp to the format used for inserting into timestamp fields in this DBMS.
const DB_REPLICA
Definition defines.php:26