MediaWiki REL1_37
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\TurtleRdfWriter;
22
23require_once __DIR__ . '/Maintenance.php';
24
/**
 * Insert query.
 * sprintf() template with a single %s placeholder; getInsertRdf() fills it
 * with the RDF text drained from the writer.
 */
35 private const SPARQL_INSERT = <<<SPARQL
36INSERT DATA {
37%s
38};
39
40SPARQL;
41
/**
 * Delete query.
 * sprintf() template with a single %s placeholder; getCategoriesUpdate()
 * fills it with a space-separated list of <url> terms, and the query removes
 * every triple that has one of them as its subject.
 */
45 private const SPARQL_DELETE = <<<SPARQLD
46DELETE {
47?category ?x ?y
48} WHERE {
49 ?category ?x ?y
50 VALUES ?category {
51 %s
52 }
53};
54
55SPARQLD;
56
/**
 * Writer that accumulates RDF until getRdf() drains it.
 * @var RdfWriter
 */
private $rdfWriter;

/**
 * Categories RDF helper, created in initialize().
 * Declared explicitly so that the assignment in initialize() does not create
 * a dynamic property.
 * @var CategoriesRdf
 */
private $categoriesRdf;

/**
 * Lower bound (inclusive) of the processed time range, as returned by
 * MWTimestamp::getTimestamp(). Set in execute().
 */
private $startTS;

/**
 * Upper bound (exclusive) of the processed time range, as returned by
 * MWTimestamp::getTimestamp(). Set in execute().
 */
private $endTS;

/**
 * Processed page IDs, so we don't try to process the same thing twice.
 * @var true[] Map of page ID => true
 */
protected $processed = [];
76
/**
 * Default constructor.
 * Registers the script description, the batch size and the command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// Optional; execute() falls back to php://stdout when it is not given.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);

	// Both bounds of the time range are required and take an argument.
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
90
/**
 * Initialize external service classes.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle format, so the Turtle writer can be reused.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	// The helper writes into the same writer that getRdf() later drains.
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
99
/**
 * Do the actual work.
 *
 * Writes the SPARQL prefixes, then the updates for deletes, moves, restores,
 * additions, edits and categorization changes (in that order) found in
 * recentchanges between the 'start' and 'end' options, and finally the
 * dump timestamp update.
 */
public function execute() {
	$this->initialize();

	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

	// Bounds older than the RC retention window are reported via error()
	// without a die code; the run still goes on with the given range.
	if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a valid handle every fwrite() below would fail, so stop here
		// with a clear message instead.
		$this->fatalError( "Unable to open output file $outFile for writing." );
	}

	try {
		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );
	} finally {
		// Release the handle even when one of the handlers throws.
		fclose( $output );
	}
}
154
/**
 * Get the text of SPARQL INSERT DATA clause.
 * Drains the RDF accumulated so far.
 * @return string The filled-in SPARQL_INSERT template, or "" when nothing was accumulated.
 */
private function getInsertRdf() {
	$rdfText = $this->getRdf();

	return $rdfText ? sprintf( self::SPARQL_INSERT, $rdfText ) : "";
}
166
/**
 * Get SPARQL for updating set of categories.
 * @param IDatabase $dbr
 * @param string[] $deleteUrls List of <url> terms whose triples are to be deleted
 * @param string[] $pages Map of page ID => category title, used to write parent links
 * @param string $mark Label emitted as a "# ..." comment line before the update
 * @return string SPARQL text, or "" when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	// NOTE(review): this early return does not drain the RDF writer, so data
	// already written by the caller stays buffered for a later call - confirm
	// this is intended.
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
187
/**
 * Write parent data for a set of categories.
 * The data is written to the RDF writer, not returned.
 * @param IDatabase $dbr
 * @param string[] $pages Map of page ID => category title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds, __METHOD__ );

	foreach ( $links as $link ) {
		// cl_from is the child page ID, cl_to the parent category title.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
199
/**
 * Generate SPARQL Update code for updating dump timestamp.
 * The generated text deletes any existing schema:dateModified value of the
 * dump URI and inserts the given timestamp in its place.
 * @param mixed $timestamp Timestamp in a format wfTimestamp() accepts
 * @return string SPARQL Update text
 */
205 public function updateTS( $timestamp ) {
// Wrap the dump URI in angle brackets so it can be interpolated as a SPARQL term.
206 $dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
// The timestamp is emitted as an xsd:dateTime literal, hence the ISO 8601 format.
207 $ts = wfTimestamp( TS_ISO_8601, $timestamp );
208 $tsQuery = <<<SPARQL
209DELETE {
210 $dumpUrl schema:dateModified ?o .
211}
212WHERE {
213 $dumpUrl schema:dateModified ?o .
214};
215INSERT DATA {
216 $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
217}
218
219SPARQL;
220 return $tsQuery;
221 }
222
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges in batches of $this->mBatchSize rows keyed
 * by rc_timestamp, limited to the configured time range, and LEFT JOINs
 * page_props (hiddencat) and category so that each row carries the hidden
 * flag and the member counts.
 *
 * @param IDatabase $dbr
 * @param string[] $columns Columns to fetch in addition to the standard ones
 * @param string[] $extra_tables Tables to query in addition to the standard ones
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = [],
	$fname = __METHOD__
) {
	$tables = [ 'recentchanges', 'page_props', 'category' ];
	if ( $extra_tables ) {
		$tables = array_merge( $tables, $extra_tables );
	}

	$it = new BatchRowIterator(
		$dbr,
		$tables,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$it->addJoinConditions(
		[
			'page_props' => [
				'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
			],
			'category' => [
				'LEFT JOIN', [ 'cat_title = rc_title' ]
			]
		]
	);
	// Caller-specific columns come first, followed by the standard set.
	$it->setFetchColumns( array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] ) );
	$it->setCaller( $fname );

	return $it;
}
268
/**
 * Fetch newly created categories.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr, $fname ) {
	$newCats = $this->setupChangesIterator( $dbr, [], [], $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];
	$newCats->addConditions( $conditions );

	return $newCats;
}
283
/**
 * Fetch moved categories.
 * Each row additionally carries page_title and page_namespace of the page
 * the change currently points at.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr, $fname ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$movedCats = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables, $fname );

	// Only move log entries recorded against the Category namespace.
	$movedCats->addConditions( [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	] );
	$movedCats->addJoinConditions( [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	] );
	$this->addIndex( $movedCats, $dbr );

	return $movedCats;
}
309
/**
 * Fetch deleted categories.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator Batches of rows with rc_cur_id and rc_title
 */
protected function getDeletedCatsIterator( IDatabase $dbr, $fname ) {
	$deletedCats = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$deletedCats->setCaller( $fname );
	$deletedCats->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	$this->addTimestampConditions( $deletedCats, $dbr );
	$deletedCats->addConditions( [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// We will fetch ones that do not have page record. If they do,
		// this means they were restored, thus restoring handler will pick it up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	] );
	$this->addIndex( $deletedCats, $dbr );

	return $deletedCats;
}
338
/**
 * Fetch restored categories.
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr, $fname ) {
	$restoredCats = $this->setupChangesIterator( $dbr, [], [], $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// We will only fetch ones that have page record
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];
	$restoredCats->addConditions( $conditions );
	$this->addIndex( $restoredCats, $dbr );

	return $restoredCats;
}
359
/**
 * Fetch categorization changes or edits.
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type; callers pass RC_EDIT or RC_CATEGORIZE
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type, $fname ) {
	$changedCats = $this->setupChangesIterator( $dbr, [], [], $fname );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];
	$changedCats->addConditions( $conditions );
	$this->addIndex( $changedCats, $dbr );

	return $changedCats;
}
377
/**
 * Add timestamp limits to iterator.
 * Restricts rc_timestamp to the half-open range [startTS, endTS).
 * @param BatchRowIterator $it Iterator to restrict
 * @param IDatabase $dbr Used to format and quote the timestamps
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$it->addConditions( [
		'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
		'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
	] );
}
389
/**
 * Force the rc_new_name_timestamp index on recentchanges for the iterator's query.
 * @param BatchRowIterator $it
 * @param IDatabase $dbr Not used by this method
 */
private function addIndex( BatchRowIterator $it, IDatabase $dbr ) {
	$indexHint = [ 'recentchanges' => 'rc_new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $indexHint ] );
}
400
/**
 * Get iterator for links for categories.
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @param string $fname Name of the calling function
 * @return Traversable Rows with cl_from and cl_to, restricted to 'subcat' links
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $linkColumns );
	$batches->setCaller( $fname );

	// Wrapped so that callers iterate over single rows instead of batches.
	return new RecursiveIteratorIterator( $batches );
}
423
/**
 * Get accumulated RDF.
 * @return string Whatever the RDF writer's drain() returns
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
431
/**
 * Handle category deletes.
 * Only covers "true" deletes, i.e. those where the page stays deleted.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$deletedBatches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $deletedBatches as $rows ) {
		$urls = [];
		foreach ( $rows as $deleted ) {
			// Mark the page so that later handlers skip it.
			$this->processed[$deleted->rc_cur_id] = true;
			// This can produce duplicates, we don't care
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $deleted->rc_title ) . '>';
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $update );
	}
}
450
/**
 * Write category data to RDF.
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages, cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;
	$isHidden = $row->pp_propname === 'hiddencat';

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
463
/**
 * Handle category moves.
 * The old title is always deleted; the new title is written only when the
 * page is still in the Category namespace and was not handled before.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$movedBatches = $this->getMovedCatsIterator( $dbr, __METHOD__ );

	foreach ( $movedBatches as $rows ) {
		$newTitles = [];
		$urls = [];

		foreach ( $rows as $moved ) {
			$pageId = $moved->rc_cur_id;
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $moved->rc_title ) . '>';

			// Skip pages we already captured, and pages moved out of
			// Category: (for those we'll just delete).
			if ( isset( $this->processed[$pageId] ) || $moved->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// From here on the row describes the category under its current title.
			$moved->rc_title = $moved->page_title;
			$this->writeCategoryData( $moved );
			$newTitles[$pageId] = $moved->page_title;
			$this->processed[$pageId] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, $newTitles, "Moves" );
		fwrite( $output, $update );
	}
}
493
/**
 * Handle category restores.
 * The "# Restores" marker line is written even when nothing was restored.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$restoredBatches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );
	foreach ( $restoredBatches as $rows ) {
		$restored = [];
		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$restored[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		// Batches that contained only already-processed pages produce no output.
		if ( $restored ) {
			$this->writeParentCategories( $dbr, $restored );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
523
/**
 * Handle category additions.
 * The "# Additions" marker line is written even when nothing was added.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$newBatches = $this->getNewCatsIterator( $dbr, __METHOD__ );
	foreach ( $newBatches as $rows ) {
		$added = [];
		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$added[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		// Batches that contained only already-processed pages produce no output.
		if ( $added ) {
			$this->writeParentCategories( $dbr, $added );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
551
/**
 * Handle edits for category texts.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.

	$editedBatches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );
	foreach ( $editedBatches as $rows ) {
		$edited = [];
		$urls = [];

		foreach ( $rows as $row ) {
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$edited[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
				$urls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, $edited, "Edits" );
		fwrite( $output, $update );
	}
}
582
/**
 * Handles categorization changes.
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Parent titles already written; tracked across batches because parent
	// categories without a page record cannot be tracked by page ID.
	$seenTitles = [];

	// Categorization change can add new parents and change counts
	// for the parent category.
	$changeBatches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );

	foreach ( $changeBatches as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childIds = [];
		$parentTitles = [];
		foreach ( $batch as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$pages = [];
		$deleteUrls = [];

		if ( $childIds ) {
			// Load child rows by ID
			$childJoins = [
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
				'category' => [
					'LEFT JOIN',
					[ 'cat_title = page_title' ],
				],
			];
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					// Aliased so that writeCategoryData() can read the title as rc_title.
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ],
				__METHOD__,
				[],
				$childJoins
			);

			foreach ( $childRows as $child ) {
				if ( isset( $this->processed[$child->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $child );
				if ( $child->page_id ) {
					$pages[$child->page_id] = $child->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $child->rc_title ) . '>';
					$this->processed[$child->page_id] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Load parent rows by title
			$parentJoins = [
				'page' => [
					'LEFT JOIN',
					[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
				],
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
			];
			// Array keys that look like integers come back as int, so cast
			// the titles back to strings before using them as a condition.
			$titleList = array_map( 'strval', array_keys( $parentTitles ) );
			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'cat_title' => $titleList ],
				__METHOD__,
				[],
				$parentJoins
			);

			foreach ( $parentRows as $parent ) {
				$doneById = $parent->page_id && isset( $this->processed[$parent->page_id] );
				if ( $doneById || isset( $seenTitles[$parent->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $parent );
				// page_id is empty when the category row has no matching page
				// (LEFT JOIN); such rows get no delete URL and no parent lookup.
				if ( $parent->page_id ) {
					$pages[$parent->page_id] = $parent->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $parent->rc_title ) . '>';
					$this->processed[$parent->page_id] = true;
				}
				$seenTitles[$parent->rc_title] = true;
			}
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $update );
	}
}
705}
706
// Name the class implemented by this script, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (presumably the runner that instantiates
// $maintClass when the script is invoked directly - confirm in Maintenance.php).
707$maintClass = CategoryChangesAsRdf::class;
708require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
const RC_LOG
Definition Defines.php:117
const RC_EDIT
Definition Defines.php:115
const NS_CATEGORY
Definition Defines.php:78
const RC_CATEGORIZE
Definition Defines.php:119
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide RDF representation of the recent changes in category tree.
getChangedCatsIterator(IDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
writeCategoryData( $row)
Write category data to RDF.
initialize()
Initialize external service classes.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
execute()
Do the actual work.
getDeletedCatsIterator(IDatabase $dbr, $fname)
Fetch deleted categories.
addIndex(BatchRowIterator $it, IDatabase $dbr)
Need to force index, somehow on terbium the optimizer chooses wrong one.
getNewCatsIterator(IDatabase $dbr, $fname)
Fetch newly created categories.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
const SPARQL_DELETE
Delete query.
getMovedCatsIterator(IDatabase $dbr, $fname)
Fetch moved categories.
const SPARQL_INSERT
Insert query.
int[] $processed
List of processed page IDs, so we don't try to process same thing twice.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
CategoriesRdf $categoriesRdf
Categories RDF helper.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[], $fname=__METHOD__)
Set up standard iterator for retrieving category changes.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
getRestoredCatsIterator(IDatabase $dbr, $fname)
Fetch restored categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
const DB_REPLICA
Definition defines.php:25