MediaWiki REL1_39
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
20use Wikimedia\Purtle\RdfWriter;
21use Wikimedia\Purtle\TurtleRdfWriter;
23
24require_once __DIR__ . '/Maintenance.php';
25
/**
 * sprintf() template for a SPARQL Update "INSERT DATA" statement.
 * The single %s placeholder receives the RDF text to insert.
 */
36 private const SPARQL_INSERT = <<<SPARQL
37INSERT DATA {
38%s
39};
40
41SPARQL;
42
/**
 * sprintf() template for a SPARQL Update statement that deletes every
 * triple whose subject is one of the listed categories.
 * The single %s placeholder receives the category URLs bound to ?category
 * through the VALUES clause.
 */
46 private const SPARQL_DELETE = <<<SPARQLD
47DELETE {
48?category ?x ?y
49} WHERE {
50 ?category ?x ?y
51 VALUES ?category {
52 %s
53 }
54};
55
56SPARQLD;
57
/** RDF writer that accumulates output; set to a TurtleRdfWriter in initialize(). */
61 private $rdfWriter;
/** CategoriesRdf helper built on top of $rdfWriter; set in initialize(). */
66 private $categoriesRdf;
67
/** Lower bound (inclusive) of the processed time window, as returned by MWTimestamp::getTimestamp(). */
68 private $startTS;
/** Upper bound (exclusive) of the processed time window, as returned by MWTimestamp::getTimestamp(). */
69 private $endTS;
70
/**
 * List of processed page IDs, so we don't try to process same thing twice.
 * Keys are page IDs, values are always true.
 */
76 protected $processed = [];
77
/**
 * Default constructor: registers the script description, the batch size
 * and the command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );

	$this->setBatchSize( 200 );

	// Optional destination file; takes an argument.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	// Both bounds of the time window are required and take an argument.
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
91
/**
 * Initialize external service classes.
 */
public function initialize() {
	// A Turtle writer is sufficient here because SPARQL Update syntax
	// is close to the Turtle format.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
100
/**
 * Do the actual work: write a SPARQL Update script describing the category
 * changes recorded between the --start and --end timestamps.
 *
 * The script starts with the prefix declarations, continues with one section
 * per kind of change (deletes, moves, restores, additions, edits,
 * categorization changes) and ends with the dump timestamp update.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// Both bounds are compared against the maximum recent changes age.
	// Violations are reported through error() only; the run is not aborted here.
	$nowUnix = (int)$now->getTimestamp( TS_UNIX );
	if ( $nowUnix - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $nowUnix - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	// "-" is accepted as an alias for standard output.
	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a valid handle every fwrite() below would fail, so stop right away.
		$this->fatalError( "Unable to open output file $outFile for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle opened above so the data is flushed deterministically.
	fclose( $output );
}
155
/**
 * Wrap the RDF accumulated so far in an INSERT DATA statement.
 * @return string SPARQL INSERT statement, or empty string when no RDF is pending
 */
private function getInsertRdf() {
	$pendingRdf = $this->getRdf();

	return $pendingRdf ? sprintf( self::SPARQL_INSERT, $pendingRdf ) : "";
}
167
/**
 * Build the SPARQL for one batch of updated categories: a comment line,
 * a DELETE for the given URLs and an INSERT with the accumulated RDF.
 * @param IDatabase $dbr
 * @param string[] $deleteUrls List of category URLs to delete
 * @param string[] $pages Map of page ID => title; parent links are written for these
 * @param string $mark Label placed in the leading "# ..." comment
 * @return string SPARQL text, or empty string when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	// Parent links have to be in the writer before the INSERT is drained below.
	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteStatement = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );

	return "# $mark\n" . $deleteStatement . $this->getInsertRdf();
}
188
/**
 * Write the parent category links of the given pages into the RDF writer.
 * @param IDatabase $dbr
 * @param string[] $pages Map of page ID => category title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );

	foreach ( $links as $link ) {
		// cl_from is the child page ID, cl_to the parent category.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
200
/**
 * Generate SPARQL Update code for updating dump timestamp.
 * @param string|int $timestamp Timestamp in any format wfTimestamp() accepts
 * @return string SPARQL that drops the old schema:dateModified value of the
 *  dump node and inserts the new one
 */
public function updateTS( $timestamp ) {
	$dumpNode = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
 $dumpNode schema:dateModified ?o .
}
WHERE {
 $dumpNode schema:dateModified ?o .
};
INSERT DATA {
 $dumpNode schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
223
/**
 * Set up the standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges within the configured time window,
 * batched by rc_timestamp, with page_props (hiddencat only) and category
 * left-joined in.
 *
 * @param IDatabase $dbr
 * @param string[] $columns Additional columns to fetch
 * @param string[] $extra_tables Additional tables to include; the caller
 *  adds the matching join conditions
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = [],
	$fname = __METHOD__
) {
	$tables = [ 'recentchanges', 'page_props', 'category' ];
	if ( $extra_tables ) {
		$tables = array_merge( $tables, $extra_tables );
	}
	$it = new BatchRowIterator( $dbr,
		$tables,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$it->addJoinConditions(
		[
			'page_props' => [
				'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
			],
			'category' => [
				'LEFT JOIN', [ 'cat_title = rc_title' ]
			]
		]
	);
	$it->setFetchColumns( array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] ) );
	$it->setCaller( $fname );
	return $it;
}
269
/**
 * Fetch newly created categories.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr, $fname ) {
	// Only page-creation rows in the Category namespace.
	$creationFilter = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $creationFilter );

	return $iterator;
}
284
/**
 * Fetch moved categories.
 *
 * Each row also carries page_title and page_namespace of the page the
 * change row points to through rc_cur_id.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr, $fname ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$iterator = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables, $fname );

	// Move log entries recorded in the Category namespace.
	$iterator->addConditions( [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	] );
	$iterator->addJoinConditions( [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	] );
	$this->addIndex( $iterator, $dbr );

	return $iterator;
}
310
/**
 * Fetch deleted categories.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr, $fname ) {
	$iterator = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );

	$deletionFilter = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Keep only entries without a page record. A page record means
		// the page was restored, and the restore handler deals with that.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $deletionFilter );

	$this->addIndex( $iterator, $dbr );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
	$iterator->setCaller( $fname );

	return $iterator;
}
339
/**
 * Fetch restored categories.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr, $fname ) {
	$restoreFilter = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to entries whose page record exists.
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $restoreFilter );
	$this->addIndex( $iterator, $dbr );

	return $iterator;
}
360
/**
 * Fetch categorization changes or edits.
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type, $fname ) {
	$changeFilter = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $changeFilter );
	$this->addIndex( $iterator, $dbr );

	return $iterator;
}
378
/**
 * Restrict the iterator to the [startTS, endTS) time window.
 * @param BatchRowIterator $it Iterator to add the conditions to
 * @param IDatabase $dbr Used to format and quote the timestamps
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$lowerBound = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$upperBound = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	// The start is inclusive, the end is exclusive.
	$it->addConditions( [
		'rc_timestamp >= ' . $lowerBound,
		'rc_timestamp < ' . $upperBound,
	] );
}
390
/**
 * Make the query use the rc_new_name_timestamp index on recentchanges.
 * @param BatchRowIterator $it Iterator to add the option to
 * @param IDatabase $dbr Not used by this method
 */
private function addIndex( BatchRowIterator $it, IDatabase $dbr ) {
	$indexHint = [ 'recentchanges' => 'rc_new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $indexHint ] );
}
401
/**
 * Get iterator for links for categories.
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @param string $fname Name of the calling function
 * @return Traversable Yields individual rows with cl_from and cl_to
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	// Only subcategory links of the requested pages.
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$batches->setCaller( $fname );

	// Wrapped so that the caller iterates over rows rather than batches.
	return new RecursiveIteratorIterator( $batches );
}
424
/**
 * Get accumulated RDF.
 * @return string Whatever the RDF writer's drain() call returns
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
432
/**
 * Handle category deletes.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	// Covers "true" deletes only, i.e. pages that stay deleted.
	$deletedBatches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $deletedBatches as $batch ) {
		$deleteUrls = [];
		foreach ( $batch as $row ) {
			$this->processed[$row->rc_cur_id] = true;
			// Duplicate URLs may end up in the list; that is harmless.
			$categoryUrl = $this->categoriesRdf->labelToUrl( $row->rc_title );
			$deleteUrls[] = '<' . $categoryUrl . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
451
/**
 * Write the data of one category row into the RDF writer.
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages with subcategories and files subtracted.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
464
/**
 * Handle category moves.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$movedBatches = $this->getMovedCatsIterator( $dbr, __METHOD__ );

	foreach ( $movedBatches as $batch ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $batch as $row ) {
			// The title from the change row is always scheduled for deletion.
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

			$pageId = $row->rc_cur_id;
			// Nothing is re-created when the page was captured before, or
			// when it was moved out of Category: (then we only delete).
			if ( isset( $this->processed[$pageId] ) || $row->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// Describe the category under the title from the page record.
			$row->rc_title = $row->page_title;
			$this->writeCategoryData( $row );
			$pages[$pageId] = $row->page_title;
			$this->processed[$pageId] = true;
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
	}
}
494
/**
 * Handle category restores.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// Only restores that were not followed by another delete are found here.
	$restoredBatches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );

	foreach ( $restoredBatches as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise the page was already captured by an earlier step.
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
524
/**
 * Handle newly added categories.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$newBatches = $this->getNewCatsIterator( $dbr, __METHOD__ );

	foreach ( $newBatches as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise the page was already captured by an earlier step.
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
552
/**
 * Handle edits for category texts.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.
	$editedBatches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );

	foreach ( $editedBatches as $batch ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			if ( isset( $this->processed[$pageId] ) ) {
				// Captured by an earlier step already.
				continue;
			}

			$this->writeCategoryData( $row );
			$pages[$pageId] = $row->rc_title;
			$this->processed[$pageId] = true;

			$categoryUrl = $this->categoriesRdf->labelToUrl( $row->rc_title );
			$deleteUrls[] = '<' . $categoryUrl . '>';
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
	}
}
583
/**
 * Handles categorization changes.
 *
 * A categorization change can add new parents and change counts for the
 * parent category.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Parent category titles already written, tracked across all batches.
	$seenTitles = [];

	// Columns shared by both lookups. Each lookup aliases its own title
	// column to rc_title so that writeCategoryData() can consume the rows.
	$commonColumns = [
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files',
	];
	$hiddenCatJoin = [
		'LEFT JOIN',
		[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
	];

	$changeBatches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );

	foreach ( $changeBatches as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childIds = [];
		$parentTitles = [];
		foreach ( $batch as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$pages = [];
		$deleteUrls = [];

		if ( $childIds ) {
			// Stage one: child rows, looked up by page ID.
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				array_merge(
					[ 'page_id', 'rc_title' => 'page_title' ],
					$commonColumns
				),
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ],
				__METHOD__,
				[],
				[
					'page_props' => $hiddenCatJoin,
					'category' => [
						'LEFT JOIN',
						[ 'cat_title = page_title' ],
					],
				]
			);

			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// Captured by an earlier step already.
					continue;
				}

				$this->writeCategoryData( $row );
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					$this->processed[$row->page_id] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Stage two: parent rows, looked up by title.
			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				array_merge(
					[ 'page_id', 'rc_title' => 'cat_title' ],
					$commonColumns
				),
				// Array keys that look like integers were cast by PHP; turn them back into strings.
				[ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ],
				__METHOD__,
				[],
				[
					'page' => [
						'LEFT JOIN',
						[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
					],
					'page_props' => $hiddenCatJoin,
				]
			);

			foreach ( $parentRows as $row ) {
				$knownById = $row->page_id && isset( $this->processed[$row->page_id] );
				if ( $knownById || isset( $seenTitles[$row->rc_title] ) ) {
					// Captured before, either by page ID or by title.
					continue;
				}

				$this->writeCategoryData( $row );
				// page_id comes from a LEFT JOIN; the bookkeeping below only runs when it is set.
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					$this->processed[$row->page_id] = true;
				}
				$seenTitles[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
706}
707
// Name the maintenance class defined in this file, then include the runner.
// NOTE(review): presumably the file behind RUN_MAINTENANCE_IF_MAIN reads $maintClass
// and runs the script when this file is the entry point — confirm against Maintenance.php.
708$maintClass = CategoryChangesAsRdf::class;
709require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
const RC_LOG
Definition Defines.php:118
const RC_EDIT
Definition Defines.php:116
const NS_CATEGORY
Definition Defines.php:78
const RC_CATEGORIZE
Definition Defines.php:120
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addJoinConditions(array $conditions)
setFetchColumns(array $columns)
setCaller( $caller)
Use ->setCaller( METHOD ) to indicate which code is using this class.
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide RDF representation of the recent changes in category tree.
getChangedCatsIterator(IDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
initialize()
Initialize external service classes.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
execute()
Do the actual work.
getDeletedCatsIterator(IDatabase $dbr, $fname)
Fetch deleted categories.
getNewCatsIterator(IDatabase $dbr, $fname)
Fetch newly created categories.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getMovedCatsIterator(IDatabase $dbr, $fname)
Fetch moved categories.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getRestoredCatsIterator(IDatabase $dbr, $fname)
Fetch restored categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:39
const DB_REPLICA
Definition defines.php:26