19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\TurtleRdfWriter;
23require_once __DIR__ .
'/Maintenance.php';
78 parent::__construct();
80 $this->
addDescription(
"Generate RDF dump of category changes in a wiki." );
83 $this->
addOption(
'output',
"Output file (default is stdout). Will be overwritten.",
false,
85 $this->
addOption(
'start',
'Starting timestamp (inclusive), in ISO or Mediawiki format.',
87 $this->
addOption(
'end',
'Ending timestamp (exclusive), in ISO or Mediawiki format.',
true,
96 $this->rdfWriter =
new TurtleRdfWriter();
106 $rcMaxAge = $this->
getConfig()->get(
'RCMaxAge' );
108 if ( $now->getTimestamp() -
$startTS->getTimestamp() > $rcMaxAge ) {
109 $this->
error(
"Start timestamp too old, maximum RC age is $rcMaxAge!" );
111 if ( $now->getTimestamp() -
$endTS->getTimestamp() > $rcMaxAge ) {
112 $this->
error(
"End timestamp too old, maximum RC age is $rcMaxAge!" );
115 $this->startTS =
$startTS->getTimestamp();
116 $this->endTS =
$endTS->getTimestamp();
118 $outFile = $this->
getOption(
'output',
'php://stdout' );
119 if ( $outFile ===
'-' ) {
120 $outFile =
'php://stdout';
123 $output = fopen( $outFile,
'wb' );
125 $this->categoriesRdf->setupPrefixes();
126 $this->rdfWriter->start();
128 $prefixes = $this->
getRdf();
131 $prefixes = preg_replace( [
'/^@/m',
'/\s*[.]$/m' ],
'', $prefixes );
132 fwrite( $output, $prefixes );
152 fwrite( $output, $this->
updateTS( $this->endTS ) );
160 $rdfText = $this->
getRdf();
164 return sprintf( self::SPARQL_INSERT, $rdfText );
176 if ( empty( $deleteUrls ) ) {
180 if ( !empty( $pages ) ) {
184 return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode(
' ', $deleteUrls ) ) .
196 $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
206 $dumpUrl =
'<' . $this->categoriesRdf->getDumpURI() .
'>';
210 $dumpUrl schema:dateModified ?o .
213 $dumpUrl schema:dateModified ?o .
216 $dumpUrl schema:dateModified
"$ts"^^xsd:dateTime .
234 array $extra_tables = [],
237 $tables = [
'recentchanges',
'page_props',
'category' ];
238 if ( $extra_tables ) {
239 $tables = array_merge( $tables, $extra_tables );
247 $it->addJoinConditions(
250 'LEFT JOIN', [
'pp_propname' =>
'hiddencat',
'pp_page = rc_cur_id' ]
253 'LEFT JOIN', [
'cat_title = rc_title' ]
257 $it->setFetchColumns( array_merge( $columns, [
265 $it->setCaller( $fname );
277 $it->addConditions( [
293 [
'page_title',
'page_namespace' ],
297 $it->addConditions( [
300 'rc_log_type' =>
'move',
303 $it->addJoinConditions( [
304 'page' => [
'JOIN',
'rc_cur_id = page_id' ],
323 $it->addConditions( [
326 'rc_log_type' =>
'delete',
327 'rc_log_action' =>
'delete',
331 'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
334 $it->setFetchColumns( [
'rc_cur_id',
'rc_title' ] );
335 $it->setCaller( $fname );
347 $it->addConditions( [
350 'rc_log_type' =>
'delete',
351 'rc_log_action' =>
'restore',
354 'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
369 $it->addConditions( [
385 'rc_timestamp >= ' .
$dbr->addQuotes(
$dbr->timestamp( $this->startTS ) ),
386 'rc_timestamp < ' .
$dbr->addQuotes(
$dbr->timestamp( $this->endTS ) ),
397 'USE INDEX' => [
'recentchanges' =>
'rc_new_name_timestamp' ]
412 [
'cl_from',
'cl_to' ],
415 $it->addConditions( [
416 'cl_type' =>
'subcat',
419 $it->setFetchColumns( [
'cl_from',
'cl_to' ] );
420 $it->setCaller( $fname );
421 return new RecursiveIteratorIterator( $it );
429 return $this->rdfWriter->drain();
442 foreach ( $batch as $row ) {
444 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
445 $this->processed[$row->rc_cur_id] =
true;
456 $this->categoriesRdf->writeCategoryData(
458 $row->pp_propname ===
'hiddencat',
459 (
int)$row->cat_pages - (
int)$row->cat_subcats - (
int)$row->cat_files,
460 (
int)$row->cat_subcats
472 foreach ( $batch as $row ) {
473 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
475 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
484 $row->rc_title = $row->page_title;
486 $pages[$row->rc_cur_id] = $row->page_title;
487 $this->processed[$row->rc_cur_id] =
true;
499 fwrite( $output,
"# Restores\n" );
504 foreach ( $batch as $row ) {
505 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
510 $pages[$row->rc_cur_id] = $row->rc_title;
511 $this->processed[$row->rc_cur_id] =
true;
514 if ( empty( $pages ) ) {
529 fwrite( $output,
"# Additions\n" );
533 foreach ( $batch as $row ) {
534 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
539 $pages[$row->rc_cur_id] = $row->rc_title;
540 $this->processed[$row->rc_cur_id] =
true;
543 if ( empty( $pages ) ) {
566 foreach ( $batch as $row ) {
569 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
574 $pages[$row->rc_cur_id] = $row->rc_title;
575 $this->processed[$row->rc_cur_id] =
true;
576 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
589 $processedTitle = [];
605 foreach ( $batch as $row ) {
606 $childPages[$row->rc_cur_id] =
true;
607 $parentCats[$row->rc_title] =
true;
613 [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ],
617 [
'cat_title = page_title' ],
626 $childRows =
$dbr->select(
627 [
'page',
'page_props',
'category' ],
630 'rc_title' =>
'page_title',
636 [
'page_namespace' =>
NS_CATEGORY,
'page_id' => array_keys( $childPages ) ],
641 foreach ( $childRows as $row ) {
642 if ( isset( $this->processed[$row->page_id] ) ) {
647 if ( $row->page_id ) {
648 $pages[$row->page_id] = $row->rc_title;
649 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
650 $this->processed[$row->page_id] =
true;
660 [
'page_title = cat_title',
'page_namespace' =>
NS_CATEGORY ],
664 [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ],
668 $parentRows =
$dbr->select(
669 [
'category',
'page',
'page_props' ],
672 'rc_title' =>
'cat_title',
678 [
'cat_title' => array_map(
'strval', array_keys( $parentCats ) ) ],
683 foreach ( $parentRows as $row ) {
684 if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
688 if ( isset( $processedTitle[$row->rc_title] ) ) {
693 if ( $row->page_id ) {
694 $pages[$row->page_id] = $row->rc_title;
695 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
696 $this->processed[$row->page_id] =
true;
698 $processedTitle[$row->rc_title] =
true;
708require_once RUN_MAINTENANCE_IF_MAIN;
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide an RDF representation of the recent changes in the category tree.
getChangedCatsIterator(IDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
writeCategoryData( $row)
Write category data to RDF.
initialize()
Initialize external service classes.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
execute()
Do the actual work.
getDeletedCatsIterator(IDatabase $dbr, $fname)
Fetch deleted categories.
addIndex(BatchRowIterator $it, IDatabase $dbr)
Need to force the index; on terbium the optimizer somehow chooses the wrong one.
getNewCatsIterator(IDatabase $dbr, $fname)
Fetch newly created categories.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
const SPARQL_DELETE
Delete query.
getMovedCatsIterator(IDatabase $dbr, $fname)
Fetch moved categories.
const SPARQL_INSERT
Insert query.
int[] $processed
List of processed page IDs, so we don't try to process the same thing twice.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
CategoriesRdf $categoriesRdf
Categories RDF helper.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[], $fname=__METHOD__)
Set up standard iterator for retrieving category changes.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
getRestoredCatsIterator(IDatabase $dbr, $fname)
Fetch restored categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.