MediaWiki REL1_32
categoryChangesAsRdf.php
Go to the documentation of this file.
1<?php
19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\TurtleRdfWriter;
22
23require_once __DIR__ . '/Maintenance.php';
24
/**
 * Insert query.
 *
 * sprintf() template: the single %s placeholder receives the RDF statements
 * to be added (see getInsertRdf()).
 */
35 const SPARQL_INSERT = <<<SPARQL
36INSERT DATA {
37%s
38};
39
40SPARQL;
41
/**
 * Delete query.
 *
 * sprintf() template: the single %s placeholder receives the list of
 * category URLs for the VALUES clause (getCategoriesUpdate() passes them
 * joined by spaces). Every triple whose subject is one of those categories
 * matches the DELETE pattern.
 */
45 const SPARQL_DELETE = <<<SPARQLD
46DELETE {
47?category ?x ?y
48} WHERE {
49 VALUES ?category {
50 %s
51 }
52};
53
54SPARQLD;
55
/**
 * Delete/Insert query.
 *
 * sprintf() template with two %s placeholders: the first receives the
 * statements to insert, the second the category URL list for the VALUES
 * clause. No method in the visible part of this file references it.
 */
59 const SPARQL_DELETE_INSERT = <<<SPARQLDI
60DELETE {
61?category ?x ?y
62} INSERT {
63%s
64} WHERE {
65 VALUES ?category {
66 %s
67 }
68};
69
70SPARQLDI;
71
/**
 * Writer that accumulates the RDF output; created in initialize().
 * @var RdfWriter
 */
private $rdfWriter;

/**
 * Categories RDF helper; created in initialize().
 *
 * Declared explicitly so it is not created as a dynamic property when
 * initialize() assigns it.
 * @var CategoriesRdf
 */
private $categoriesRdf;

/**
 * Start of the processed range (inclusive), as returned by
 * MWTimestamp::getTimestamp() in execute().
 */
private $startTS;

/**
 * End of the processed range (exclusive), as returned by
 * MWTimestamp::getTimestamp() in execute().
 */
private $endTS;

/**
 * List of processed page IDs, so we don't try to process same thing twice.
 * Keys are page IDs, values are always true.
 * @var bool[]
 */
protected $processed = [];
91
/**
 * Default constructor.
 *
 * Sets the script description, a batch size of 200, and registers the
 * 'output', 'start' and 'end' command-line options (all of which take an
 * argument).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );

	$this->setBatchSize( 200 );

	// name, description, required, short name
	$optionSpecs = [
		[ 'output', "Output file (default is stdout). Will be overwritten.", false, 'o' ],
		[ 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.', true, 's' ],
		[ 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true, 'e' ],
	];
	foreach ( $optionSpecs as list( $optName, $optHelp, $optRequired, $optShort ) ) {
		$this->addOption( $optName, $optHelp, $optRequired, true, $optShort );
	}
}
105
/**
 * Initialize external service classes.
 *
 * Creates the RDF writer and the categories RDF helper that writes
 * through it.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
	$turtleWriter = new TurtleRdfWriter();

	$this->rdfWriter = $turtleWriter;
	$this->categoriesRdf = new CategoriesRdf( $turtleWriter );
}
114
/**
 * Do the actual work.
 *
 * Writes to the output file (or stdout), in order: the SPARQL prefixes, the
 * updates for deleted, moved, restored, newly added, edited and
 * re-categorized categories, and finally the statement updating the dump
 * timestamp to the 'end' option.
 *
 * A start or end timestamp older than $wgRCMaxAge is reported through
 * error(); the visible code does not stop processing after reporting it.
 */
public function execute() {
	global $wgRCMaxAge;

	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();

	if ( $now->getTimestamp() - $startTS->getTimestamp() > $wgRCMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $wgRCMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a stream every fwrite() below would fail, so do not
		// run the (slow) queries at all.
		$this->error( "Could not open $outFile for writing!" );
		return;
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Flush and release our handle; for php://stdout this closes only
	// our own copy of the descriptor.
	fclose( $output );
}
170
/**
 * Get the text of SPARQL INSERT DATA clause.
 *
 * Drains the RDF accumulated so far and wraps it in the SPARQL_INSERT
 * template.
 *
 * @return string The INSERT DATA clause, or "" when the drained RDF is empty
 */
private function getInsertRdf() {
	$accumulated = $this->getRdf();

	return $accumulated ? sprintf( self::SPARQL_INSERT, $accumulated ) : "";
}
182
/**
 * Get SPARQL for updating set of categories.
 *
 * @param IDatabase $dbr Database used to look up parent categories
 * @param string[] $deleteUrls Category URLs (already wrapped in <>) to delete
 * @param string[] $pages Category titles keyed by page ID whose parent
 *  links should be written before the insert clause is produced
 * @param string $mark Label emitted as a "# ..." comment line before the update
 * @return string SPARQL text, or "" when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		// Adds the parent links to the pending RDF that getInsertRdf() drains below.
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
203
/**
 * Write parent data for a set of categories.
 *
 * The data is passed to the categories RDF helper and is not emitted here.
 *
 * @param IDatabase $dbr
 * @param string[] $pages Category titles keyed by page ID
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds );

	foreach ( $links as $link ) {
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
215
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * The generated text first deletes any existing schema:dateModified value
 * of the dump URI, then inserts the new one as an xsd:dateTime literal.
 *
 * @param mixed $timestamp Timestamp to set, in any format wfTimestamp() accepts
 * @return string SPARQL Update text
 */
221 public function updateTS( $timestamp ) {
// The dump URI wrapped in angle brackets, as interpolated into the query below.
222 $dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
// Convert to ISO 8601 for the xsd:dateTime literal.
223 $ts = wfTimestamp( TS_ISO_8601, $timestamp );
224 $tsQuery = <<<SPARQL
225DELETE {
226 $dumpUrl schema:dateModified ?o .
227}
228WHERE {
229 $dumpUrl schema:dateModified ?o .
230};
231INSERT DATA {
232 $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
233}
234
235SPARQL;
236 return $tsQuery;
237 }
238
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges in batches ordered by rc_timestamp,
 * limited to the configured start/end range, and LEFT JOINs page_props
 * (for the 'hiddencat' property) and category (for the member counts).
 *
 * @param IDatabase $dbr Database the iterator reads from
 * @param string[] $columns Columns to fetch in addition to the standard ones
 * @param string[] $extra_tables Tables to add to the standard ones; the caller
 *  supplies their join conditions
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	$tables = [ 'recentchanges', 'page_props', 'category' ];
	if ( $extra_tables ) {
		$tables = array_merge( $tables, $extra_tables );
	}
	$it = new BatchRowIterator( $dbr,
		$tables,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$it->addJoinConditions(
		[
			'page_props' => [
				'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
			],
			'category' => [
				'LEFT JOIN', [ 'cat_title = rc_title' ]
			]
		]
	);
	$it->setFetchColumns( array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] ) );
	return $it;
}
281
/**
 * Fetch newly created categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator Recent changes in the category namespace with rc_new set
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$newCats = $this->setupChangesIterator( $dbr );

	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];
	$newCats->addConditions( $conds );

	return $newCats;
}
295
/**
 * Fetch moved categories.
 *
 * Rows additionally carry page_title and page_namespace of the page that
 * rc_cur_id points to.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$pageColumns = [ 'page_title', 'page_namespace' ];
	$movedCats = $this->setupChangesIterator( $dbr, $pageColumns, [ 'page' ] );

	// Only 'move' log entries recorded in the category namespace.
	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$movedCats->addConditions( $conds );

	$pageJoin = [
		'page' => [ 'INNER JOIN', 'rc_cur_id = page_id' ],
	];
	$movedCats->addJoinConditions( $pageJoin );

	$this->addIndex( $movedCats );

	return $movedCats;
}
315
/**
 * Fetch deleted categories.
 *
 * Only rc_cur_id and rc_title are fetched.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$orderBy = [ 'rc_timestamp' ];
	$deletedCats = new BatchRowIterator( $dbr, 'recentchanges', $orderBy, $this->mBatchSize );
	$this->addTimestampConditions( $deletedCats, $dbr );

	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// We will fetch ones that do not have page record. If they do,
		// this means they were restored, thus restoring handler will pick it up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$deletedCats->addConditions( $conds );

	$this->addIndex( $deletedCats );
	$deletedCats->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	return $deletedCats;
}
342
/**
 * Fetch restored categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$restoredCats = $this->setupChangesIterator( $dbr );

	$conds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// We will only fetch ones that have page record
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];
	$restoredCats->addConditions( $conds );

	$this->addIndex( $restoredCats );

	return $restoredCats;
}
362
/**
 * Fetch categorization changes or edits.
 *
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type; callers in this file pass
 *  RC_EDIT or RC_CATEGORIZE
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
	$it =
		$this->setupChangesIterator( $dbr );
	$it->addConditions( [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	] );
	$this->addIndex( $it );
	return $it;
}
379
/**
 * Add timestamp limits to iterator.
 *
 * Restricts rc_timestamp to the half-open range from the start timestamp
 * (inclusive) to the end timestamp (exclusive) stored by execute().
 *
 * @param BatchRowIterator $it Iterator to restrict
 * @param IDatabase $dbr Database used to format and quote the timestamps
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$it->addConditions( [
		'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
		'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
	] );
}
391
/**
 * Force the 'new_name_timestamp' index for the recentchanges table on the
 * given iterator.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
401
/**
 * Get iterator for links for categories.
 *
 * Yields the 'subcat' rows of categorylinks (cl_from, cl_to) whose cl_from
 * is one of the given IDs. The batch iterator is wrapped in a
 * RecursiveIteratorIterator, so callers iterate over single rows.
 *
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs to match against cl_from
 * @return Traversable
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );

	return new RecursiveIteratorIterator( $batches );
}
422
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() returns
 */
public function getRdf() {
	$drained = $this->rdfWriter->drain();

	return $drained;
}
430
/**
 * Handle category deletes.
 *
 * Writes one "Deletes" update per batch and marks every deleted page ID as
 * processed.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	// This only does "true" deletes - i.e. those that the page stays deleted
	$deletedBatches = $this->getDeletedCatsIterator( $dbr );

	foreach ( $deletedBatches as $rows ) {
		$urls = [];
		foreach ( $rows as $deleted ) {
			$this->processed[$deleted->rc_cur_id] = true;
			// This can produce duplicates, we don't care
			$categoryUrl = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			$urls[] = '<' . $categoryUrl . '>';
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $update );
	}
}
448
/**
 * Write category data to RDF.
 *
 * Passes the title, the hidden flag, the page count and the subcategory
 * count to the categories RDF helper. The page count is cat_pages minus
 * cat_subcats minus cat_files.
 *
 * @param stdClass $row Database row with rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$pageCount = (int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files;
	$subcatCount = (int)$row->cat_subcats;

	$this->categoriesRdf->writeCategoryData( $row->rc_title, $isHidden, $pageCount, $subcatCount );
}
461
/**
 * Handle category moves.
 *
 * The old title of every moved category is deleted. Data under the new
 * title is written only for pages that are still in the category namespace
 * and were not processed before.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$movedBatches = $this->getMovedCatsIterator( $dbr );

	foreach ( $movedBatches as $rows ) {
		$oldUrls = [];
		$movedPages = [];

		foreach ( $rows as $moved ) {
			$pageId = $moved->rc_cur_id;
			$oldUrls[] = '<' . $this->categoriesRdf->labelToUrl( $moved->rc_title ) . '>';

			// Skip pages we already captured before, and pages moved out
			// of Category: (for those we'll just delete).
			if ( isset( $this->processed[$pageId] ) || $moved->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// From here on the row describes the category under its new title.
			$moved->rc_title = $moved->page_title;
			$this->writeCategoryData( $moved );
			$movedPages[$pageId] = $moved->page_title;
			$this->processed[$pageId] = true;
		}

		$update = $this->getCategoriesUpdate( $dbr, $oldUrls, $movedPages, "Moves" );
		fwrite( $output, $update );
	}
}
491
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker and then, for every batch with unprocessed
 * pages, an insert clause with the category data and parent links.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$restoredBatches = $this->getRestoredCatsIterator( $dbr );

	foreach ( $restoredBatches as $rows ) {
		$restoredPages = [];

		foreach ( $rows as $restored ) {
			$pageId = $restored->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$restoredPages[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $restoredPages ) {
			$this->writeParentCategories( $dbr, $restoredPages );

			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
520
/**
 * Handle category additions.
 *
 * Writes a "# Additions" marker and then, for every batch with unprocessed
 * pages, an insert clause with the category data and parent links.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$newBatches = $this->getNewCatsIterator( $dbr );

	foreach ( $newBatches as $rows ) {
		$addedPages = [];

		foreach ( $rows as $added ) {
			$pageId = $added->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $added );
				$addedPages[$pageId] = $added->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $addedPages ) {
			$this->writeParentCategories( $dbr, $addedPages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
547
/**
 * Handle edits for category texts.
 *
 * Every edited category that was not processed before is deleted and
 * re-written with its category data and parent links.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.
	$editedBatches = $this->getChangedCatsIterator( $dbr, RC_EDIT );

	foreach ( $editedBatches as $rows ) {
		$editedPages = [];
		$urls = [];

		foreach ( $rows as $edited ) {
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			$pageId = $edited->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $edited );
				$editedPages[$pageId] = $edited->rc_title;
				$this->processed[$pageId] = true;
				$urls[] = '<' . $this->categoriesRdf->labelToUrl( $edited->rc_title ) . '>';
			}
			// Otherwise we already captured this one before.
		}

		$update = $this->getCategoriesUpdate( $dbr, $urls, $editedPages, "Edits" );
		fwrite( $output, $update );
	}
}
577
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child categories
 * (looked up by page ID) and the parent categories (looked up by title)
 * are deleted and re-written with their category data and parent links.
 *
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	$processedTitle = [];
	// Categorization change can add new parents and change counts
	// for the parent category.
	foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		// Start every batch with empty sets so the lookups below do not
		// keep re-querying everything seen in earlier batches.
		$childPages = [];
		$parentCats = [];
		foreach ( $batch as $row ) {
			$childPages[$row->rc_cur_id] = true;
			$parentCats[$row->rc_title] = true;
		}

		$joinConditions = [
			'page_props' => [
				'LEFT JOIN',
				[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
			],
			'category' => [
				'LEFT JOIN',
				[ 'cat_title = page_title' ],
			],
		];

		// Titles keyed by page ID of every category written in this batch.
		// The DELETE clause removes all triples of these categories, so
		// their parent links have to be written again.
		$pages = [];
		$deleteUrls = [];

		if ( !empty( $childPages ) ) {
			// Load child rows by ID
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->page_id] = $row->rc_title;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->page_id] = true;
			}
		}

		if ( !empty( $parentCats ) ) {
			// Load parent rows by title
			$joinConditions = [
				'page' => [
					'LEFT JOIN',
					[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
				],
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
			];

			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'cat_title' => array_keys( $parentCats ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $parentRows as $row ) {
				if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				if ( isset( $processedTitle[$row->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				// page is LEFT JOINed, so a category row without a page
				// has no ID; it is tracked by title only.
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->rc_title;
					$this->processed[$row->page_id] = true;
				}
				$processedTitle[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
692}
693
// Name the maintenance class defined in this file, then include the file
// that RUN_MAINTENANCE_IF_MAIN points to.
// NOTE(review): presumably that file reads $maintClass and runs the script
// when this file is the entry point - confirm against Maintenance.php.
694$maintClass = CategoryChangesAsRdf::class;
695require_once RUN_MAINTENANCE_IF_MAIN;
$wgRCMaxAge
Recentchanges items are periodically purged; entries older than this many seconds will go.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Helper class to produce RDF representation of categories.
Maintenance script to provide RDF representation of the recent changes in category tree.
writeCategoryData( $row)
Write category data to RDF.
initialize()
Initialize external service classes.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getMovedCatsIterator(IDatabase $dbr)
Fetch moved categories.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
execute()
Do the actual work.
addIndex(BatchRowIterator $it)
Need to force the index: on terbium the optimizer somehow chooses the wrong one.
getNewCatsIterator(IDatabase $dbr)
Fetch newly created categories.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
const SPARQL_DELETE
Delete query.
getDeletedCatsIterator(IDatabase $dbr)
Fetch deleted categories.
getChangedCatsIterator(IDatabase $dbr, $type)
Fetch categorization changes or edits.
const SPARQL_INSERT
Insert query.
const SPARQL_DELETE_INSERT
Delete/Insert query.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[])
Set up standard iterator for retrieving category changes.
int[] $processed
List of processed page IDs, so we don't try to process same thing twice.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
getRestoredCatsIterator(IDatabase $dbr)
Fetch restored categories.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
CategoriesRdf $categoriesRdf
Categories RDF helper.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getDB( $db, $groups=[], $wiki=false)
Returns a database to be used by current maintenance script.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Set the batch size.
do that in ParserLimitReportFormat instead use this to modify the parameters of the image all existing parser cache entries will be invalid To avoid you ll need to handle that somehow(e.g. with the RejectParserCacheValue hook) because MediaWiki won 't do it for you. & $defaults error
Definition hooks.txt:2683
this hook is for auditing only RecentChangesLinked and Watchlist Do not use this to implement individual filters if they are compatible with the ChangesListFilter and ChangesListFilterGroup structure use sub classes of those in conjunction with the ChangesListSpecialPageStructuredFilters hook This hook can be used to implement filters that do not implement that or custom behavior that is not an individual filter e g Watchlist & $tables
Definition hooks.txt:1035
static configuration should be added through ResourceLoaderGetConfigVars instead can be used to get the real title e g db for database replication lag or jobqueue for job queue size converted to pseudo seconds It is possible to add more fields and they will be returned to the user in the API response after the basic globals have been set but before ordinary actions take place $output
Definition hooks.txt:2317
const RC_LOG
Definition Defines.php:144
const RC_EDIT
Definition Defines.php:142
const NS_CATEGORY
Definition Defines.php:78
const RC_CATEGORIZE
Definition Defines.php:146
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
$batch
Definition linkcache.txt:23
require_once RUN_MAINTENANCE_IF_MAIN
The wiki should then use memcached to cache various data To use multiple just add more items to the array To increase the weight of a make its entry a array("192.168.0.1:11211", 2))
const DB_REPLICA
Definition defines.php:25