MediaWiki REL1_35
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\TurtleRdfWriter;
22
23require_once __DIR__ . '/Maintenance.php';
24
/**
 * Insert query.
 *
 * sprintf() template with a single %s placeholder: getInsertRdf() fills it
 * with the accumulated RDF text, which ends up inside INSERT DATA { ... }.
 * The statement is terminated with ';'.
 */
35 private const SPARQL_INSERT = <<<SPARQL
36INSERT DATA {
37%s
38};
39
40SPARQL;
41
/**
 * Delete query.
 *
 * sprintf() template with a single %s placeholder: getCategoriesUpdate()
 * fills it with the space-separated list of bracketed category URLs, which
 * becomes the VALUES list for ?category. Every triple having one of those
 * values as its subject is matched by the WHERE clause and deleted.
 */
45 private const SPARQL_DELETE = <<<SPARQLD
46DELETE {
47?category ?x ?y
48} WHERE {
49 ?category ?x ?y
50 VALUES ?category {
51 %s
52 }
53};
54
55SPARQLD;
56
/**
 * Delete/Insert query.
 *
 * sprintf() template with two %s placeholders, in this order:
 *  1. the body of the INSERT { ... } clause,
 *  2. the VALUES list for ?category.
 *
 * NOTE(review): no use of this constant is visible in this listing -
 * confirm whether it is still needed.
 */
60 private const SPARQL_DELETE_INSERT = <<<SPARQLDI
61DELETE {
62?category ?x ?y
63} INSERT {
64%s
65} WHERE {
66 ?category ?x ?y
67 VALUES ?category {
68 %s
69 }
70};
71
72SPARQLDI;
73
/**
 * RDF writer used to accumulate output; initialize() sets it to a
 * TurtleRdfWriter and getRdf() drains it.
 *
 * NOTE(review): initialize() also assigns $this->categoriesRdf, but no
 * declaration of that property is visible in this listing - confirm that
 * it is declared.
 */
77 private $rdfWriter;
83
/**
 * Start of the processed range (inclusive), as returned by
 * MWTimestamp::getTimestamp() for the 'start' option in execute().
 */
84 private $startTS;
/**
 * End of the processed range (exclusive), as returned by
 * MWTimestamp::getTimestamp() for the 'end' option in execute().
 */
85 private $endTS;
86
/**
 * List of processed page IDs, so we don't try to process same thing twice.
 * Keys are page IDs, values are always true; the handlers test it with isset().
 */
92 protected $processed = [];
93
/**
 * Default constructor.
 *
 * Sets the script description, a batch size of 200, and declares the
 * 'output', 'start' and 'end' command line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// --output / -o: optional, takes a value.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	// --start / -s: required, takes a value.
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	// --end / -e: required, takes a value.
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
107
/**
 * Initialize external service classes.
 *
 * Creates the RDF writer and the CategoriesRdf helper that writes into it.
 */
public function initialize() {
	// The Turtle writer is reused because SPARQL Update syntax is close to Turtle.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
116
/**
 * Do the actual work.
 *
 * Reads the 'start', 'end' and 'output' options, writes the prefix
 * declarations, then runs every change handler in a fixed order and finally
 * appends the dump timestamp update.
 *
 * The output stream is verified after opening and is always closed before
 * returning, including when a handler throws.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

	// Both ends of the range are compared against the configured maximum
	// recent changes age; violations are reported through error().
	if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	// '-' is accepted as an alias for standard output.
	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without this check every fwrite() below would receive false.
		$this->fatalError( "Could not open $outFile for writing!" );
	}

	try {
		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );
	} finally {
		fclose( $output );
	}
}
171
/**
 * Get the text of SPARQL INSERT DATA clause.
 *
 * Drains the RDF accumulated so far (via getRdf()) and wraps it in the
 * SPARQL_INSERT template.
 *
 * @return string The INSERT DATA statement, or "" when the drained text is empty.
 */
private function getInsertRdf() {
	$accumulated = $this->getRdf();

	return $accumulated ? sprintf( self::SPARQL_INSERT, $accumulated ) : "";
}
183
/**
 * Get SPARQL for updating set of categories.
 *
 * @param IDatabase $dbr Database handle used to look up parent categories
 * @param string[] $deleteUrls Bracketed category URLs placed in the DELETE's VALUES list
 * @param string[] $pages Map of page ID => title whose parent links are written first
 * @param string $mark Label emitted as a "# ..." comment line before the statements
 * @return string "" when $deleteUrls is empty; otherwise the comment line,
 *  the DELETE statement and the INSERT DATA statement for the accumulated RDF
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		// Parent links go into the RDF buffer that getInsertRdf() drains below.
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
204
/**
 * Write parent data for a set of categories.
 *
 * @param IDatabase $dbr
 * @param string[] $pages Map of page ID => category title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) );

	foreach ( $links as $link ) {
		// cl_from is the child's page ID; translate it back to the child title.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
216
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * @param mixed $timestamp Timestamp handed to wfTimestamp() for conversion
 *  to TS_ISO_8601
 * @return string Two SPARQL statements: a DELETE/WHERE removing every
 *  existing schema:dateModified value of the dump URI, followed by an
 *  INSERT DATA that sets the new value as an xsd:dateTime literal
 */
222 public function updateTS( $timestamp ) {
// Dump URI as reported by the CategoriesRdf helper, wrapped in angle brackets.
223 $dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
224 $ts = wfTimestamp( TS_ISO_8601, $timestamp );
// $dumpUrl and $ts are interpolated into the heredoc below.
225 $tsQuery = <<<SPARQL
226DELETE {
227 $dumpUrl schema:dateModified ?o .
228}
229WHERE {
230 $dumpUrl schema:dateModified ?o .
231};
232INSERT DATA {
233 $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
234}
235
236SPARQL;
237 return $tsQuery;
238 }
239
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges in batches of $this->mBatchSize keyed on
 * rc_timestamp, limited to the configured time range, and LEFT JOINs
 * page_props (hiddencat rows only) and category so each row also carries the
 * hidden flag and the member counts.
 *
 * @param IDatabase $dbr Database handle the iterator reads from
 * @param string[] $columns Extra columns to fetch, placed before the standard ones
 * @param string[] $extra_tables Extra tables appended to the standard table list
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	// Merging an empty list is a no-op, so no emptiness check is needed.
	$tables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );

	$it = new BatchRowIterator(
		$dbr,
		$tables,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$it->addJoinConditions(
		[
			'page_props' => [
				'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
			],
			'category' => [
				'LEFT JOIN', [ 'cat_title = rc_title' ]
			]
		]
	);
	$it->setFetchColumns( array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] ) );
	return $it;
}
282
/**
 * Fetch newly created categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator Standard changes iterator restricted to
 *  rc_new = 1 rows in the category namespace
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$newCats = $this->setupChangesIterator( $dbr );

	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];
	$newCats->addConditions( $conds );

	return $newCats;
}
296
/**
 * Fetch moved categories.
 *
 * Besides the standard columns, each row carries page_title and
 * page_namespace of the page joined on rc_cur_id = page_id.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$movedCats = $this->setupChangesIterator( $dbr, $extraColumns, [ 'page' ] );

	// Only 'move' log entries recorded against the category namespace.
	$movedCats->addConditions( [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	] );
	$movedCats->addJoinConditions( [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	] );
	$this->addIndex( $movedCats );

	return $movedCats;
}
316
/**
 * Fetch deleted categories.
 *
 * Reads recentchanges directly (no page_props/category joins) and returns
 * only rc_cur_id and rc_title.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$deletedCats = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $deletedCats, $dbr );

	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Keep only entries whose page record is gone. A page that still
		// exists was restored, and the restore handler picks it up instead.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$deletedCats->addConditions( $conds );

	$this->addIndex( $deletedCats );
	$deletedCats->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	return $deletedCats;
}
343
/**
 * Fetch restored categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator Standard changes iterator restricted to
 *  delete/restore log entries whose page record currently exists
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$restoredCats = $this->setupChangesIterator( $dbr );

	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to entries that still have a page record.
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];
	$restoredCats->addConditions( $conds );
	$this->addIndex( $restoredCats );

	return $restoredCats;
}
363
371 $it =
372 $this->setupChangesIterator( $dbr );
373 $it->addConditions( [
374 'rc_namespace' => NS_CATEGORY,
375 'rc_new' => 0,
376 'rc_type' => $type,
377 ] );
378 $this->addIndex( $it );
379 return $it;
380 }
381
388 $it->addConditions( [
389 'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
390 'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
391 ] );
392 }
393
/**
 * Force the recentchanges lookup of the given iterator onto the
 * new_name_timestamp index.
 *
 * @param BatchRowIterator $it Iterator to add the option to
 */
private function addIndex( BatchRowIterator $it ) {
	$forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
403
/**
 * Get iterator for links for categories.
 *
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @return Traversable Yields categorylinks rows (cl_from, cl_to) of type
 *  'subcat'; the batch iterator is wrapped so callers iterate single rows
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator(
		$dbr,
		'categorylinks',
		$linkColumns,
		$this->mBatchSize
	);
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $linkColumns );

	return new RecursiveIteratorIterator( $batches );
}
424
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() returns
 */
public function getRdf() {
	$drained = $this->rdfWriter->drain();

	return $drained;
}
432
/**
 * Handle category deletes.
 *
 * Writes one "Deletes" update per batch and marks every deleted page ID as
 * processed so later handlers skip it.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	// Only "true" deletes are returned here, i.e. pages that stayed deleted.
	$deletedBatches = $this->getDeletedCatsIterator( $dbr );

	foreach ( $deletedBatches as $batch ) {
		$deleteUrls = [];
		foreach ( $batch as $row ) {
			// Duplicate URLs are possible and harmless.
			$categoryUrl = $this->categoriesRdf->labelToUrl( $row->rc_title );
			$deleteUrls[] = '<' . $categoryUrl . '>';
			$this->processed[$row->rc_cur_id] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
		fwrite( $output, $update );
	}
}
450
/**
 * Write category data to RDF.
 *
 * @param stdClass $row Row providing rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
463
/**
 * Handle category moves.
 *
 * The old title is always scheduled for deletion. The new title is written
 * only when the page has not been processed yet and still lives in the
 * category namespace.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleMoves( IDatabase $dbr, $output ) {
	foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

			// Skip pages captured earlier, and pages moved out of
			// Category: - for the latter the delete above is all we need.
			if ( isset( $this->processed[$pageId] ) || $row->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// writeCategoryData() reads rc_title, so point it at the new title.
			$row->rc_title = $row->page_title;
			$this->writeCategoryData( $row );
			$pages[$pageId] = $row->page_title;
			$this->processed[$pageId] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
		fwrite( $output, $update );
	}
}
493
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker, then one INSERT DATA statement per batch
 * that contains at least one not yet processed category.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// Only restores that were not followed by another delete are returned.
	foreach ( $this->getRestoredCatsIterator( $dbr ) as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
522
/**
 * Handle category additions.
 *
 * Writes a "# Additions" marker, then one INSERT DATA statement per batch
 * that contains at least one not yet processed category.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	foreach ( $this->getNewCatsIterator( $dbr ) as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
549
/**
 * Handle edits for category texts.
 *
 * Every edited category that was not processed before gets a full
 * delete-and-reinsert update, including its parent links.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.
	$editedBatches = $this->getChangedCatsIterator( $dbr, RC_EDIT );

	foreach ( $editedBatches as $batch ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( isset( $this->processed[$pageId] ) ) {
				// Already captured by an earlier handler.
				continue;
			}

			$title = $row->rc_title;
			$this->writeCategoryData( $row );
			$pages[$pageId] = $title;
			$this->processed[$pageId] = true;
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
		}

		$update = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" );
		fwrite( $output, $update );
	}
}
579
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child pages (known by
 * page ID) and the parent categories (known by title) get a full update:
 * all their triples are deleted and their category data is re-inserted.
 *
 * Because the delete removes every triple whose subject is the category,
 * parent links are removed as well. Each updated category that has a page
 * ID is therefore recorded in $pages, so that getCategoriesUpdate() writes
 * its parent links back as part of the insert.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Titles of parent categories already written; this catches parents
	// that have no page ID and so cannot be tracked in $this->processed.
	$processedTitle = [];
	// Categorization change can add new parents and change counts
	// for the parent category.
	foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childPages = [];
		$parentCats = [];
		foreach ( $batch as $row ) {
			$childPages[$row->rc_cur_id] = true;
			$parentCats[$row->rc_title] = true;
		}

		$joinConditions = [
			'page_props' => [
				'LEFT JOIN',
				[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
			],
			'category' => [
				'LEFT JOIN',
				[ 'cat_title = page_title' ],
			],
		];

		// Page ID => title of every category updated in this batch.
		$pages = [];
		$deleteUrls = [];

		if ( $childPages ) {
			// Load child rows by ID. page_title is aliased to rc_title so
			// the rows can be passed to writeCategoryData().
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$pages[$row->page_id] = $row->rc_title;
				$this->processed[$row->page_id] = true;
			}
		}

		if ( $parentCats ) {
			// Load parent rows by title
			$joinConditions = [
				'page' => [
					'LEFT JOIN',
					[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
				],
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
			];

			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				// PHP turns numeric-looking array keys into integers, so
				// cast the titles back to strings for the query.
				[ 'cat_title' => array_map( 'strval', array_keys( $parentCats ) ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $parentRows as $row ) {
				if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				if ( isset( $processedTitle[$row->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				if ( $row->page_id ) {
					// page is LEFT JOINed, so a category row may have no
					// page; parent links can only be looked up by page ID.
					$pages[$row->page_id] = $row->rc_title;
					$this->processed[$row->page_id] = true;
				}
				$processedTitle[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
696}
697
// Store this script's class name in $maintClass, then include the file
// named by the RUN_MAINTENANCE_IF_MAIN constant.
698$maintClass = CategoryChangesAsRdf::class;
699require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
const RUN_MAINTENANCE_IF_MAIN
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide RDF representation of the recent changes in category tree.
writeCategoryData( $row)
Write category data to RDF.
initialize()
Initialize external service classes.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getMovedCatsIterator(IDatabase $dbr)
Fetch moved categories.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
execute()
Do the actual work.
addIndex(BatchRowIterator $it)
Need to force index, somehow on terbium the optimizer chooses wrong one.
getNewCatsIterator(IDatabase $dbr)
Fetch newly created categories.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
const SPARQL_DELETE
Delete query.
getDeletedCatsIterator(IDatabase $dbr)
Fetch deleted categories.
getChangedCatsIterator(IDatabase $dbr, $type)
Fetch categorization changes or edits.
const SPARQL_INSERT
Insert query.
const SPARQL_DELETE_INSERT
Delete/Insert query.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[])
Set up standard iterator for retrieving category changes.
int[] $processed
List of processed page IDs, so we don't try to process same thing twice.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
getRestoredCatsIterator(IDatabase $dbr)
Fetch restored categories.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
CategoriesRdf $categoriesRdf
Categories RDF helper.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Set the batch size.
const RC_LOG
Definition Defines.php:134
const RC_EDIT
Definition Defines.php:132
const NS_CATEGORY
Definition Defines.php:84
const RC_CATEGORIZE
Definition Defines.php:136
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
const DB_REPLICA
Definition defines.php:25