10use Wikimedia\Purtle\RdfWriter;
11use Wikimedia\Purtle\TurtleRdfWriter;
13use Wikimedia\Timestamp\TimestampFormat as TS;
16require_once __DIR__ .
'/Maintenance.php';
29 private const SPARQL_INSERT = <<<SPARQL
39 private const SPARQL_DELETE = <<<SPARQLD
59 private $categoriesRdf;
74 parent::__construct();
76 $this->
addDescription(
"Generate RDF dump of category changes in a wiki." );
79 $this->
addOption(
'output',
"Output file (default is stdout). Will be overwritten.",
false,
81 $this->
addOption(
'start',
'Starting timestamp (inclusive), in ISO or MediaWiki format.',
83 $this->
addOption(
'end',
'Ending timestamp (exclusive), in ISO or MediaWiki format.',
true,
92 $this->rdfWriter =
new TurtleRdfWriter();
102 $rcMaxAge = $this->
getConfig()->get( MainConfigNames::RCMaxAge );
104 if ( (
int)$now->getTimestamp( TS::UNIX ) - (
int)$startTS->getTimestamp( TS::UNIX ) > $rcMaxAge ) {
105 $this->
error(
"Start timestamp too old, maximum RC age is $rcMaxAge!" );
107 if ( (
int)$now->getTimestamp( TS::UNIX ) - (
int)$endTS->getTimestamp( TS::UNIX ) > $rcMaxAge ) {
108 $this->
error(
"End timestamp too old, maximum RC age is $rcMaxAge!" );
111 $this->startTS = $startTS->getTimestamp();
112 $this->endTS = $endTS->getTimestamp();
114 $outFile = $this->
getOption(
'output',
'php://stdout' );
115 if ( $outFile ===
'-' ) {
116 $outFile =
'php://stdout';
119 $output = fopen( $outFile,
'wb' );
121 $this->categoriesRdf->setupPrefixes();
122 $this->rdfWriter->start();
124 $prefixes = $this->
getRdf();
127 $prefixes = preg_replace( [
'/^@/m',
'/\s*[.]$/m' ],
'', $prefixes );
128 fwrite( $output, $prefixes );
148 fwrite( $output, $this->
updateTS( $this->endTS ) );
155 private function getInsertRdf() {
156 $rdfText = $this->
getRdf();
160 return sprintf( self::SPARQL_INSERT, $rdfText );
171 private function getCategoriesUpdate(
IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
172 if ( !$deleteUrls ) {
177 $this->writeParentCategories( $dbr, $pages );
180 return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode(
' ', $deleteUrls ) ) .
181 $this->getInsertRdf();
192 $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->lt_title );
202 $dumpUrl =
'<' . $this->categoriesRdf->getDumpURI() .
'>';
206 $dumpUrl schema:dateModified ?o .
209 $dumpUrl schema:dateModified ?o .
212 $dumpUrl schema:dateModified
"$ts"^^xsd:dateTime .
226 private function setupChangesIterator(
233 ->from(
'recentchanges' )
234 ->leftJoin(
'page_props',
null, [
'pp_propname' =>
'hiddencat',
'pp_page = rc_cur_id' ] )
235 ->leftJoin(
'category',
null, [
'cat_title = rc_title' ] )
236 ->select( array_merge( $columns, [
248 $this->addTimestampConditions( $it, $dbr );
259 $it = $this->setupChangesIterator( $dbr, [], $fname );
262 'rc_source' => RecentChange::SRC_NEW,
274 $it = $this->setupChangesIterator(
276 [
'page_title',
'page_namespace' ],
281 'rc_source' => RecentChange::SRC_LOG,
282 'rc_log_type' =>
'move',
284 $it->sqb->join(
'page',
null,
'rc_cur_id = page_id' );
285 $this->addIndex( $it );
298 ->from(
'recentchanges' )
299 ->select( [
'rc_cur_id',
'rc_title' ] )
302 'rc_source' => RecentChange::SRC_LOG,
303 'rc_log_type' =>
'delete',
304 'rc_log_action' =>
'delete',
307 'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
313 $this->addTimestampConditions( $it, $dbr );
314 $this->addIndex( $it );
325 $it = $this->setupChangesIterator( $dbr, [], $fname );
328 'rc_source' => RecentChange::SRC_LOG,
329 'rc_log_type' =>
'delete',
330 'rc_log_action' =>
'restore',
332 'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
334 $this->addIndex( $it );
346 $it = $this->setupChangesIterator( $dbr, [], $fname );
351 $this->addIndex( $it );
362 $dbr->
expr(
'rc_timestamp',
'>=', $dbr->
timestamp( $this->startTS ) ),
363 $dbr->
expr(
'rc_timestamp',
'<', $dbr->
timestamp( $this->endTS ) ),
372 'USE INDEX' => [
'recentchanges' =>
'rc_source_name_timestamp' ]
385 ->select( [
'cl_from',
'lt_title' ] )
386 ->from(
'categorylinks' )
387 ->join(
'linktarget',
null,
'cl_target_id=lt_id' )
389 'cl_type' =>
'subcat',
393 $primaryKey = [
'cl_from',
'cl_target_id' ];
401 return new RecursiveIteratorIterator( $it );
409 return $this->rdfWriter->drain();
422 foreach ( $batch as $row ) {
424 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
425 $this->processed[$row->rc_cur_id] =
true;
427 fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [],
"Deletes" ) );
435 private function writeCategoryData( $row ) {
436 $this->categoriesRdf->writeCategoryData(
438 $row->pp_propname ===
'hiddencat',
439 (
int)$row->cat_pages - (
int)$row->cat_subcats - (
int)$row->cat_files,
440 (
int)$row->cat_subcats
452 foreach ( $batch as $row ) {
453 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
455 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
464 $row->rc_title = $row->page_title;
465 $this->writeCategoryData( $row );
466 $pages[$row->rc_cur_id] = $row->page_title;
467 $this->processed[$row->rc_cur_id] =
true;
470 fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages,
"Moves" ) );
479 fwrite( $output,
"# Restores\n" );
484 foreach ( $batch as $row ) {
485 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
489 $this->writeCategoryData( $row );
490 $pages[$row->rc_cur_id] = $row->rc_title;
491 $this->processed[$row->rc_cur_id] =
true;
498 $this->writeParentCategories( $dbr, $pages );
500 fwrite( $output, $this->getInsertRdf() );
509 fwrite( $output,
"# Additions\n" );
513 foreach ( $batch as $row ) {
514 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
518 $this->writeCategoryData( $row );
519 $pages[$row->rc_cur_id] = $row->rc_title;
520 $this->processed[$row->rc_cur_id] =
true;
527 $this->writeParentCategories( $dbr, $pages );
528 fwrite( $output, $this->getInsertRdf() );
546 foreach ( $batch as $row ) {
549 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
553 $this->writeCategoryData( $row );
554 $pages[$row->rc_cur_id] = $row->rc_title;
555 $this->processed[$row->rc_cur_id] =
true;
556 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
559 fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages,
"Edits" ) );
569 $processedTitle = [];
585 foreach ( $batch as $row ) {
586 $childPages[$row->rc_cur_id] =
true;
587 $parentCats[$row->rc_title] =
true;
598 'rc_title' =>
'page_title',
605 ->leftJoin(
'page_props',
null, [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ] )
606 ->leftJoin(
'category',
null, [
'cat_title = page_title' ] )
607 ->where( [
'page_namespace' =>
NS_CATEGORY,
'page_id' => array_keys( $childPages ) ] )
608 ->caller( __METHOD__ )->fetchResultSet();
609 foreach ( $childRows as $row ) {
610 if ( isset( $this->processed[$row->page_id] ) ) {
614 $this->writeCategoryData( $row );
615 if ( $row->page_id ) {
616 $pages[$row->page_id] = $row->rc_title;
617 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
618 $this->processed[$row->page_id] =
true;
628 'rc_title' =>
'cat_title',
635 ->leftJoin(
'page',
null, [
'page_title = cat_title',
'page_namespace' =>
NS_CATEGORY ] )
636 ->leftJoin(
'page_props',
null, [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ] )
637 ->where( [
'cat_title' => array_map(
'strval', array_keys( $parentCats ) ) ] )
638 ->caller( __METHOD__ )->fetchResultSet();
639 foreach ( $parentRows as $row ) {
640 if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
644 if ( isset( $processedTitle[$row->rc_title] ) ) {
648 $this->writeCategoryData( $row );
649 if ( $row->page_id ) {
650 $pages[$row->page_id] = $row->rc_title;
651 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
652 $this->processed[$row->page_id] =
true;
654 $processedTitle[$row->rc_title] =
true;
658 fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages,
"Changes" ) );
665require_once RUN_MAINTENANCE_IF_MAIN;
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
Maintenance script to provide an RDF representation of the recent changes in the category tree.
initialize()
Initialize external service classes.
handleRestores(IReadableDatabase $dbr, $output)
handleMoves(IReadableDatabase $dbr, $output)
getNewCatsIterator(IReadableDatabase $dbr, $fname)
Fetch newly created categories.
execute()
Do the actual work.
getDeletedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch deleted categories.
getChangedCatsIterator(IReadableDatabase $dbr, $source, $fname)
Fetch categorization changes or edits.
handleCategorization(IReadableDatabase $dbr, $output)
Handles categorization changes.
true[] $processed
List of processed page IDs, so we don't try to process the same thing twice.
getRestoredCatsIterator(IReadableDatabase $dbr, $fname)
Fetch restored categories.
getMovedCatsIterator(IReadableDatabase $dbr, $fname)
Fetch moved categories.
handleEdits(IReadableDatabase $dbr, $output)
Handle edits for category texts.
getRdf()
Get accumulated RDF.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
handleAdds(IReadableDatabase $dbr, $output)
handleDeletes(IReadableDatabase $dbr, $output)
Handle category deletes.
__construct()
Default constructor.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.