19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\TurtleRdfWriter;
23require_once __DIR__ .
'/Maintenance.php';
95 parent::__construct();
97 $this->
addDescription(
"Generate RDF dump of category changes in a wiki." );
100 $this->
addOption(
'output',
"Output file (default is stdout). Will be overwritten.",
false,
102 $this->
addOption(
'start',
'Starting timestamp (inclusive), in ISO or Mediawiki format.',
104 $this->
addOption(
'end',
'Ending timestamp (exclusive), in ISO or Mediawiki format.',
true,
113 $this->rdfWriter =
new TurtleRdfWriter();
114 $this->categoriesRdf =
new CategoriesRdf( $this->rdfWriter );
123 $rcMaxAge = $this->
getConfig()->get(
'RCMaxAge' );
125 if ( $now->getTimestamp() -
$startTS->getTimestamp() > $rcMaxAge ) {
126 $this->
error(
"Start timestamp too old, maximum RC age is $rcMaxAge!" );
128 if ( $now->getTimestamp() -
$endTS->getTimestamp() > $rcMaxAge ) {
129 $this->
error(
"End timestamp too old, maximum RC age is $rcMaxAge!" );
132 $this->startTS =
$startTS->getTimestamp();
133 $this->endTS =
$endTS->getTimestamp();
135 $outFile = $this->
getOption(
'output',
'php://stdout' );
136 if ( $outFile ===
'-' ) {
137 $outFile =
'php://stdout';
140 $output = fopen( $outFile,
'wb' );
142 $this->categoriesRdf->setupPrefixes();
143 $this->rdfWriter->start();
145 $prefixes = $this->
getRdf();
148 $prefixes = preg_replace( [
'/^@/m',
'/\s*[.]$/m' ],
'', $prefixes );
149 fwrite( $output, $prefixes );
169 fwrite( $output, $this->
updateTS( $this->endTS ) );
177 $rdfText = $this->
getRdf();
181 return sprintf( self::SPARQL_INSERT, $rdfText );
193 if ( empty( $deleteUrls ) ) {
197 if ( !empty( $pages ) ) {
201 return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode(
' ', $deleteUrls ) ) .
213 $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
223 $dumpUrl =
'<' . $this->categoriesRdf->getDumpURI() .
'>';
227 $dumpUrl schema:dateModified ?o .
230 $dumpUrl schema:dateModified ?o .
233 $dumpUrl schema:dateModified
"$ts"^^xsd:dateTime .
250 array $extra_tables = []
252 $tables = [
'recentchanges',
'page_props',
'category' ];
253 if ( $extra_tables ) {
254 $tables = array_merge( $tables, $extra_tables );
262 $it->addJoinConditions(
265 'LEFT JOIN', [
'pp_propname' =>
'hiddencat',
'pp_page = rc_cur_id' ]
268 'LEFT JOIN', [
'cat_title = rc_title' ]
272 $it->setFetchColumns( array_merge( $columns, [
290 $it->addConditions( [
304 $it->addConditions( [
307 'rc_log_type' =>
'move',
310 $it->addJoinConditions( [
311 'page' => [
'JOIN',
'rc_cur_id = page_id' ],
329 $it->addConditions( [
332 'rc_log_type' =>
'delete',
333 'rc_log_action' =>
'delete',
337 'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
340 $it->setFetchColumns( [
'rc_cur_id',
'rc_title' ] );
351 $it->addConditions( [
354 'rc_log_type' =>
'delete',
355 'rc_log_action' =>
'restore',
358 'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
373 $it->addConditions( [
389 'rc_timestamp >= ' .
$dbr->addQuotes(
$dbr->timestamp( $this->startTS ) ),
390 'rc_timestamp < ' .
$dbr->addQuotes(
$dbr->timestamp( $this->endTS ) ),
400 'USE INDEX' => [
'recentchanges' =>
'new_name_timestamp' ]
414 [
'cl_from',
'cl_to' ],
417 $it->addConditions( [
418 'cl_type' =>
'subcat',
421 $it->setFetchColumns( [
'cl_from',
'cl_to' ] );
422 return new RecursiveIteratorIterator( $it );
430 return $this->rdfWriter->drain();
442 foreach ( $batch as $row ) {
444 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
445 $this->processed[$row->rc_cur_id] =
true;
456 $this->categoriesRdf->writeCategoryData(
458 $row->pp_propname ===
'hiddencat',
459 (
int)$row->cat_pages - (
int)$row->cat_subcats - (
int)$row->cat_files,
460 (
int)$row->cat_subcats
472 foreach ( $batch as $row ) {
473 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
475 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
484 $row->rc_title = $row->page_title;
486 $pages[$row->rc_cur_id] = $row->page_title;
487 $this->processed[$row->rc_cur_id] =
true;
499 fwrite( $output,
"# Restores\n" );
503 foreach ( $batch as $row ) {
504 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
509 $pages[$row->rc_cur_id] = $row->rc_title;
510 $this->processed[$row->rc_cur_id] =
true;
513 if ( empty( $pages ) ) {
528 fwrite( $output,
"# Additions\n" );
531 foreach ( $batch as $row ) {
532 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
537 $pages[$row->rc_cur_id] = $row->rc_title;
538 $this->processed[$row->rc_cur_id] =
true;
541 if ( empty( $pages ) ) {
563 foreach ( $batch as $row ) {
566 if ( isset( $this->processed[$row->rc_cur_id] ) ) {
571 $pages[$row->rc_cur_id] = $row->rc_title;
572 $this->processed[$row->rc_cur_id] =
true;
573 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
586 $processedTitle = [];
600 foreach ( $batch as $row ) {
601 $childPages[$row->rc_cur_id] =
true;
602 $parentCats[$row->rc_title] =
true;
608 [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ],
612 [
'cat_title = page_title' ],
621 $childRows =
$dbr->select(
622 [
'page',
'page_props',
'category' ],
625 'rc_title' =>
'page_title',
631 [
'page_namespace' =>
NS_CATEGORY,
'page_id' => array_keys( $childPages ) ],
636 foreach ( $childRows as $row ) {
637 if ( isset( $this->processed[$row->page_id] ) ) {
642 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
643 $this->processed[$row->page_id] =
true;
652 [
'page_title = cat_title',
'page_namespace' =>
NS_CATEGORY ],
656 [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ],
660 $parentRows =
$dbr->select(
661 [
'category',
'page',
'page_props' ],
664 'rc_title' =>
'cat_title',
670 [
'cat_title' => array_map(
'strval', array_keys( $parentCats ) ) ],
675 foreach ( $parentRows as $row ) {
676 if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
680 if ( isset( $processedTitle[$row->rc_title] ) ) {
685 $deleteUrls[] =
'<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) .
'>';
686 if ( $row->page_id ) {
687 $this->processed[$row->page_id] =
true;
689 $processedTitle[$row->rc_title] =
true;
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
const RUN_MAINTENANCE_IF_MAIN
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide an RDF representation of the recent changes in the category tree.
writeCategoryData( $row)
Write category data to RDF.
initialize()
Initialize external service classes.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getMovedCatsIterator(IDatabase $dbr)
Fetch moved categories.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
execute()
Do the actual work.
addIndex(BatchRowIterator $it)
Need to force the index; for some reason the optimizer on terbium chooses the wrong one.
getNewCatsIterator(IDatabase $dbr)
Fetch newly created categories.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
const SPARQL_DELETE
Delete query.
getDeletedCatsIterator(IDatabase $dbr)
Fetch deleted categories.
getChangedCatsIterator(IDatabase $dbr, $type)
Fetch categorization changes or edits.
const SPARQL_INSERT
Insert query.
const SPARQL_DELETE_INSERT
Delete/Insert query.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[])
Set up standard iterator for retrieving category changes.
int[] $processed
List of processed page IDs, so we don't try to process same thing twice.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
getRestoredCatsIterator(IDatabase $dbr)
Fetch restored categories.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
CategoriesRdf $categoriesRdf
Categories RDF helper.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
Library for creating and parsing MW-style timestamps.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Set the batch size.