20use Wikimedia\Purtle\RdfWriter;
21use Wikimedia\Purtle\TurtleRdfWriter;
24require_once __DIR__ .
'/Maintenance.php';
36 private const SPARQL_INSERT = <<<SPARQL
46 private const SPARQL_DELETE = <<<SPARQLD
66 private $categoriesRdf;
79 parent::__construct();
81 $this->
addDescription(
"Generate RDF dump of category changes in a wiki." );
84 $this->
addOption(
'output',
"Output file (default is stdout). Will be overwritten.",
false,
86 $this->
addOption(
'start',
'Starting timestamp (inclusive), in ISO or Mediawiki format.',
88 $this->
addOption(
'end',
'Ending timestamp (exclusive), in ISO or Mediawiki format.',
true,
97 $this->rdfWriter =
new TurtleRdfWriter();
107 $rcMaxAge = $this->
getConfig()->get( MainConfigNames::RCMaxAge );
109 if ( (
int)$now->getTimestamp( TS_UNIX ) - (
int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
110 $this->
error(
"Start timestamp too old, maximum RC age is $rcMaxAge!" );
112 if ( (
int)$now->getTimestamp( TS_UNIX ) - (
int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
113 $this->
error(
"End timestamp too old, maximum RC age is $rcMaxAge!" );
116 $this->startTS = $startTS->getTimestamp();
117 $this->endTS = $endTS->getTimestamp();
119 $outFile = $this->
getOption(
'output',
'php://stdout' );
120 if ( $outFile ===
'-' ) {
121 $outFile =
'php://stdout';
124 $output = fopen( $outFile,
'wb' );
126 $this->categoriesRdf->setupPrefixes();
127 $this->rdfWriter->start();
129 $prefixes = $this->
getRdf();
132 $prefixes = preg_replace( [
'/^@/m',
'/\s*[.]$/m' ],
'', $prefixes );
133 fwrite( $output, $prefixes );
153 fwrite( $output, $this->
updateTS( $this->endTS ) );
160 private function getInsertRdf() {
161 $rdfText = $this->
getRdf();
165 return sprintf( self::SPARQL_INSERT, $rdfText );
176 private function getCategoriesUpdate(
IDatabase $dbr, $deleteUrls, $pages, $mark ) {
177 if ( empty( $deleteUrls ) ) {
181 if ( !empty( $pages ) ) {
182 $this->writeParentCategories(
$dbr, $pages );
185 return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode(
' ', $deleteUrls ) ) .
186 $this->getInsertRdf();
195 private function writeParentCategories(
IDatabase $dbr, $pages ) {
197 $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
207 $dumpUrl =
'<' . $this->categoriesRdf->getDumpURI() .
'>';
211 $dumpUrl schema:dateModified ?o .
214 $dumpUrl schema:dateModified ?o .
217 $dumpUrl schema:dateModified
"$ts"^^xsd:dateTime .
232 private function setupChangesIterator(
235 array $extra_tables = [],
238 $tables = [
'recentchanges',
'page_props',
'category' ];
239 if ( $extra_tables ) {
240 $tables = array_merge( $tables, $extra_tables );
247 $this->addTimestampConditions( $it,
$dbr );
251 'LEFT JOIN', [
'pp_propname' =>
'hiddencat',
'pp_page = rc_cur_id' ]
254 'LEFT JOIN', [
'cat_title = rc_title' ]
277 $it = $this->setupChangesIterator(
$dbr, [], [], $fname );
292 $it = $this->setupChangesIterator(
294 [
'page_title',
'page_namespace' ],
301 'rc_log_type' =>
'move',
305 'page' => [
'JOIN',
'rc_cur_id = page_id' ],
307 $this->addIndex( $it,
$dbr );
323 $this->addTimestampConditions( $it,
$dbr );
327 'rc_log_type' =>
'delete',
328 'rc_log_action' =>
'delete',
332 'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
334 $this->addIndex( $it,
$dbr );
347 $it = $this->setupChangesIterator(
$dbr, [], [], $fname );
351 'rc_log_type' =>
'delete',
352 'rc_log_action' =>
'restore',
355 'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
357 $this->addIndex( $it,
$dbr );
369 $it = $this->setupChangesIterator(
$dbr, [], [], $fname );
375 $this->addIndex( $it,
$dbr );
386 'rc_timestamp >= ' .
$dbr->addQuotes(
$dbr->timestamp( $this->startTS ) ),
387 'rc_timestamp < ' .
$dbr->addQuotes(
$dbr->timestamp( $this->endTS ) ),
398 'USE INDEX' => [
'recentchanges' =>
'rc_new_name_timestamp' ]
413 [
'cl_from',
'cl_to' ],
417 'cl_type' =>
'subcat',
422 return new RecursiveIteratorIterator( $it );
430 return $this->rdfWriter->drain();
443 foreach ( $batch as $row ) {
445 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
446 $this->processed[$row->rc_cur_id] =
true;
448 fwrite( $output, $this->getCategoriesUpdate(
$dbr, $deleteUrls, [],
"Deletes" ) );
456 private function writeCategoryData( $row ) {
457 $this->categoriesRdf->writeCategoryData(
459 $row->pp_propname ===
'hiddencat',
460 (
int)$row->cat_pages - (
int)$row->cat_subcats - (
int)$row->cat_files,
461 (
int)$row->cat_subcats
473 foreach ( $batch as $row ) {
474 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
476 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
485 $row->rc_title = $row->page_title;
486 $this->writeCategoryData( $row );
487 $pages[$row->rc_cur_id] = $row->page_title;
488 $this->processed[$row->rc_cur_id] =
true;
491 fwrite( $output, $this->getCategoriesUpdate(
$dbr, $deleteUrls, $pages,
"Moves" ) );
500 fwrite( $output,
"# Restores\n" );
505 foreach ( $batch as $row ) {
506 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
510 $this->writeCategoryData( $row );
511 $pages[$row->rc_cur_id] = $row->rc_title;
512 $this->processed[$row->rc_cur_id] =
true;
515 if ( empty( $pages ) ) {
519 $this->writeParentCategories(
$dbr, $pages );
521 fwrite( $output, $this->getInsertRdf() );
530 fwrite( $output,
"# Additions\n" );
534 foreach ( $batch as $row ) {
535 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
539 $this->writeCategoryData( $row );
540 $pages[$row->rc_cur_id] = $row->rc_title;
541 $this->processed[$row->rc_cur_id] =
true;
544 if ( empty( $pages ) ) {
548 $this->writeParentCategories(
$dbr, $pages );
549 fwrite( $output, $this->getInsertRdf() );
567 foreach ( $batch as $row ) {
570 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
574 $this->writeCategoryData( $row );
575 $pages[$row->rc_cur_id] = $row->rc_title;
576 $this->processed[$row->rc_cur_id] =
true;
577 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
580 fwrite( $output, $this->getCategoriesUpdate(
$dbr, $deleteUrls, $pages,
"Edits" ) );
590 $processedTitle = [];
606 foreach ( $batch as $row ) {
607 $childPages[$row->rc_cur_id] =
true;
608 $parentCats[$row->rc_title] =
true;
614 [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ],
618 [
'cat_title = page_title' ],
627 $childRows =
$dbr->select(
628 [
'page',
'page_props',
'category' ],
631 'rc_title' =>
'page_title',
637 [
'page_namespace' =>
NS_CATEGORY,
'page_id' => array_keys( $childPages ) ],
642 foreach ( $childRows as $row ) {
643 if ( isset( $this->processed[$row->page_id] ) ) {
647 $this->writeCategoryData( $row );
648 if ( $row->page_id ) {
649 $pages[$row->page_id] = $row->rc_title;
650 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
651 $this->processed[$row->page_id] =
true;
661 [
'page_title = cat_title',
'page_namespace' =>
NS_CATEGORY ],
665 [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ],
669 $parentRows =
$dbr->select(
670 [
'category',
'page',
'page_props' ],
673 'rc_title' =>
'cat_title',
679 [
'cat_title' => array_map(
'strval', array_keys( $parentCats ) ) ],
684 foreach ( $parentRows as $row ) {
685 if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
689 if ( isset( $processedTitle[$row->rc_title] ) ) {
693 $this->writeCategoryData( $row );
694 if ( $row->page_id ) {
695 $pages[$row->page_id] = $row->rc_title;
696 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
697 $this->processed[$row->page_id] =
true;
699 $processedTitle[$row->rc_title] =
true;
703 fwrite( $output, $this->getCategoriesUpdate(
$dbr, $deleteUrls, $pages,
"Changes" ) );
709require_once RUN_MAINTENANCE_IF_MAIN;
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addJoinConditions(array $conditions)
setFetchColumns(array $columns)
setCaller( $caller)
Use ->setCaller( __METHOD__ ) to indicate which code is using this class.
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide RDF representation of the recent changes in category tree.
getChangedCatsIterator(IDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
initialize()
Initialize external service classes.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
execute()
Do the actual work.
getDeletedCatsIterator(IDatabase $dbr, $fname)
Fetch deleted categories.
getNewCatsIterator(IDatabase $dbr, $fname)
Fetch newly created categories.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getMovedCatsIterator(IDatabase $dbr, $fname)
Fetch moved categories.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getRestoredCatsIterator(IDatabase $dbr, $fname)
Fetch restored categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
A class containing constants representing the names of configuration variables.