MediaWiki  master
categoryChangesAsRdf.php
Go to the documentation of this file.
1 <?php
22 use Wikimedia\Purtle\RdfWriter;
23 use Wikimedia\Purtle\TurtleRdfWriter;
26 
27 require_once __DIR__ . '/Maintenance.php';
28 
39  private const SPARQL_INSERT = <<<SPARQL
40 INSERT DATA {
41 %s
42 };
43 
44 SPARQL;
45 
49  private const SPARQL_DELETE = <<<SPARQLD
50 DELETE {
51 ?category ?x ?y
52 } WHERE {
53  ?category ?x ?y
54  VALUES ?category {
55  %s
56  }
57 };
58 
59 SPARQLD;
60 
64  private $rdfWriter;
69  private $categoriesRdf;
70 
71  private $startTS;
72  private $endTS;
73 
79  protected $processed = [];
80 
/**
 * Default constructor.
 *
 * Sets the script description, a batch size of 200 and the three
 * command line options (output, start, end).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// name => [ description, required, short name ]; every option takes an argument.
	$optionSpecs = [
		'output' => [ "Output file (default is stdout). Will be overwritten.", false, 'o' ],
		'start' => [ 'Starting timestamp (inclusive), in ISO or Mediawiki format.', true, 's' ],
		'end' => [ 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true, 'e' ],
	];
	foreach ( $optionSpecs as $optionName => [ $description, $isRequired, $shortName ] ) {
		$this->addOption( $optionName, $description, $isRequired, true, $shortName );
	}
}
94 
/**
 * Initialize external service classes.
 *
 * Creates the RDF writer and the CategoriesRdf helper that writes into it.
 */
public function initialize() {
	// A Turtle writer is sufficient: SPARQL Update syntax is close to Turtle format.
	$writer = new TurtleRdfWriter();
	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
103 
/**
 * Do the actual work.
 *
 * Writes a SPARQL Update script to the 'output' option (stdout by default)
 * covering the category changes recorded in recentchanges between the
 * 'start' (inclusive) and 'end' (exclusive) timestamps. Sections are
 * emitted in this order: prefixes, deletes, moves, restores, additions,
 * edits, categorization changes, and finally the dump timestamp update.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// Too-old timestamps are reported through error(); the run is not aborted here.
	if ( (int)$now->getTimestamp( TS_UNIX ) - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( (int)$now->getTimestamp( TS_UNIX ) - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// fopen() returns false on failure; passing that on to fwrite() would
		// only fail later with a far less helpful error.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle so buffered data is flushed to the target file.
	fclose( $output );
}
158 
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA statement.
 *
 * @return string The INSERT statement, or "" when no RDF was accumulated
 */
private function getInsertRdf() {
	$accumulated = $this->getRdf();
	return $accumulated ? sprintf( self::SPARQL_INSERT, $accumulated ) : "";
}
170 
/**
 * Build the SPARQL for one batch of updated categories: a DELETE of all
 * triples of the given category URLs followed by an INSERT of the RDF
 * accumulated for them.
 *
 * @param IDatabase $dbr
 * @param string[] $deleteUrls Category URLs (already wrapped in <>) to delete
 * @param string[] $pages Map of page ID => title whose parent links get written
 * @param string $mark Label emitted as a "# ..." comment line before the statements
 * @return string SPARQL text, or "" when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !empty( $deleteUrls ) ) {
		if ( !empty( $pages ) ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
		return "# $mark\n" . $deleteClause . $this->getInsertRdf();
	}

	return "";
}
191 
/**
 * Write category link data for every subcategory link of the given pages.
 *
 * @param IDatabase $dbr
 * @param string[] $pages Map of page ID => page title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds, __METHOD__ );
	foreach ( $links as $link ) {
		// cl_from is the page ID of the child; look its title up in the map.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
203 
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * The result deletes any existing schema:dateModified value of the dump
 * node and inserts the given timestamp as an xsd:dateTime literal.
 *
 * @param string|int $timestamp Timestamp in any format accepted by wfTimestamp()
 * @return string SPARQL Update text
 */
public function updateTS( $timestamp ) {
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );
	$dumpNode = '<' . $this->categoriesRdf->getDumpURI() . '>';

	return <<<SPARQL
DELETE {
 $dumpNode schema:dateModified ?o .
}
WHERE {
 $dumpNode schema:dateModified ?o .
};
INSERT DATA {
 $dumpNode schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
226 
/**
 * Create the base batch iterator over recentchanges that all change
 * handlers share. Rows are restricted to the configured time window and
 * left-joined with the hiddencat page property and the category counters.
 *
 * @param IDatabase $dbr
 * @param string[] $columns Extra columns to fetch, placed before the standard ones
 * @param string[] $extra_tables Extra tables to add to the query
 * @param string $fname Name of the calling method, passed to setCaller()
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = [],
	$fname = __METHOD__
) {
	$tables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );
	$iterator = new BatchRowIterator( $dbr, $tables, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $iterator, $dbr );

	$hiddenCatJoin = [ 'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] ];
	$categoryJoin = [ 'LEFT JOIN', [ 'cat_title = rc_title' ] ];
	$iterator->addJoinConditions( [
		'page_props' => $hiddenCatJoin,
		'category' => $categoryJoin,
	] );

	$standardColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$iterator->setFetchColumns( array_merge( $columns, $standardColumns ) );
	$iterator->setCaller( $fname );

	return $iterator;
}
272 
/**
 * Fetch newly created categories.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr, $fname ) {
	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );

	$newCategoryConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];
	$iterator->addConditions( $newCategoryConditions );

	return $iterator;
}
287 
/**
 * Fetch moved categories.
 *
 * Each row also carries the current page_title and page_namespace of the
 * moved page, obtained by joining the page table on rc_cur_id.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr, $fname ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$iterator = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables, $fname );

	$moveLogConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$iterator->addConditions( $moveLogConditions );
	$iterator->addJoinConditions( [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	] );
	$this->addIndex( $iterator );

	return $iterator;
}
313 
/**
 * Fetch deleted categories.
 *
 * Only delete log entries whose page no longer has a page record are
 * returned; each row carries rc_cur_id and rc_title.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr, $fname ) {
	$iterator = new BatchRowIterator( $dbr, 'recentchanges', [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $iterator, $dbr );

	$deleteLogConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// We will fetch ones that do not have page record. If they do,
		// this means they were restored, thus restoring handler will pick it up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $deleteLogConditions );
	$this->addIndex( $iterator );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
	$iterator->setCaller( $fname );

	return $iterator;
}
342 
/**
 * Fetch restored categories.
 *
 * Only restore log entries whose page currently has a page record are returned.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr, $fname ) {
	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );

	$restoreLogConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// We will only fetch ones that have page record
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $restoreLogConditions );
	$this->addIndex( $iterator );

	return $iterator;
}
363 
/**
 * Fetch categorization changes or edits.
 *
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
 * @param string $fname Name of the calling method
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type, $fname ) {
	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );

	$changeConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];
	$iterator->addConditions( $changeConditions );
	$this->addIndex( $iterator );

	return $iterator;
}
381 
/**
 * Restrict the iterator to rows with startTS <= rc_timestamp < endTS.
 *
 * @param BatchRowIterator $it Iterator to add the conditions to
 * @param IReadableDatabase $dbr Used to format and quote the timestamps
 */
private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
	$quotedStart = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$quotedEnd = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [
		'rc_timestamp >= ' . $quotedStart,
		'rc_timestamp < ' . $quotedEnd,
	] );
}
393 
/**
 * Make the iterator's query use the rc_new_name_timestamp index on recentchanges.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'rc_new_name_timestamp' ];
	$it->addOptions( [ 'USE INDEX' => $indexHint ] );
}
403 
/**
 * Get iterator for links for categories.
 *
 * Selects the subcategory links (cl_type = 'subcat') whose cl_from is one of
 * the given page IDs. The batch iterator is wrapped in a
 * RecursiveIteratorIterator, so callers iterate over rows rather than batches.
 *
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs to match against cl_from
 * @param string $fname Name of the calling method
 * @return Traversable Rows with cl_from and cl_to
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$batches->setCaller( $fname );

	return new RecursiveIteratorIterator( $batches );
}
426 
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() returns
 */
public function getRdf() {
	$drained = $this->rdfWriter->drain();
	return $drained;
}
434 
/**
 * Handle category deletes.
 *
 * Covers only "true" deletes, i.e. those where the page stays deleted.
 * Every deleted page ID is recorded in $this->processed.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $batch ) {
		$deleteUrls = [];
		foreach ( $batch as $row ) {
			$this->processed[$row->rc_cur_id] = true;
			// This can produce duplicates, we don't care
			$categoryUrl = $this->categoriesRdf->labelToUrl( $row->rc_title );
			$deleteUrls[] = '<' . $categoryUrl . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
453 
/**
 * Write the RDF data of a single category row.
 *
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages, cat_subcats, cat_files
 */
private function writeCategoryData( $row ) {
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;
	$isHidden = $row->pp_propname === 'hiddencat';

	$this->categoriesRdf->writeCategoryData( $row->rc_title, $isHidden, $pageCount, $subcatCount );
}
466 
/**
 * Handle category moves.
 *
 * The old title is always scheduled for deletion. Data under the new title
 * is written only when the page is still in the Category namespace and has
 * not been processed before.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $batch ) {
		$deleteUrls = [];
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

			// Skip pages we already captured before, and pages moved out of
			// Category: (for those we just delete).
			if ( isset( $this->processed[$pageId] ) || $row->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// From here on the row describes the category under its current title.
			$row->rc_title = $row->page_title;
			$this->writeCategoryData( $row );
			$pages[$pageId] = $row->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
		fwrite( $output, $sparql );
	}
}
496 
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker followed by one INSERT statement per batch
 * that contains at least one category not processed before.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
526 
/**
 * Handle category additions.
 *
 * Writes a "# Additions" marker followed by one INSERT statement per batch
 * that contains at least one category not processed before.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
554 
/**
 * Handle edits for category texts.
 *
 * Each edited category not processed before is fully rewritten: its
 * triples are deleted and fresh data is inserted.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.

	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );
	foreach ( $batches as $batch ) {
		$deleteUrls = [];
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			if ( isset( $this->processed[$pageId] ) ) {
				// We already captured this one before
				continue;
			}

			$title = $row->rc_title;
			$this->writeCategoryData( $row );
			$pages[$pageId] = $title;
			$this->processed[$pageId] = true;
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" );
		fwrite( $output, $sparql );
	}
}
585 
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child pages (looked up
 * by page ID) and the parent categories (looked up by title) are rewritten.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Parent titles already written, kept across all batches.
	$seenTitles = [];

	// Columns fetched by both lookups, after page_id and the rc_title alias.
	$statColumns = [ 'pp_propname', 'cat_pages', 'cat_subcats', 'cat_files' ];
	$hiddenCatJoin = [
		'LEFT JOIN',
		[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
	];

	// Categorization change can add new parents and change counts
	// for the parent category.

	$batches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );
	foreach ( $batches as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childIds = [];
		$parentTitles = [];
		foreach ( $batch as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$pages = [];
		$deleteUrls = [];

		if ( $childIds ) {
			// Load child rows by ID
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				array_merge( [ 'page_id', 'rc_title' => 'page_title' ], $statColumns ),
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ],
				__METHOD__,
				[],
				[
					'page_props' => $hiddenCatJoin,
					'category' => [
						'LEFT JOIN',
						[ 'cat_title = page_title' ],
					],
				]
			);
			foreach ( $childRows as $row ) {
				$pageId = $row->page_id;
				if ( isset( $this->processed[$pageId] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				if ( $pageId ) {
					$pages[$pageId] = $row->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					$this->processed[$pageId] = true;
				}
			}
		}

		if ( $parentTitles ) {
			// Load parent rows by title
			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				array_merge( [ 'page_id', 'rc_title' => 'cat_title' ], $statColumns ),
				// Numeric-looking titles become integer array keys, so cast them back.
				[ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ],
				__METHOD__,
				[],
				[
					'page' => [
						'LEFT JOIN',
						[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
					],
					'page_props' => $hiddenCatJoin,
				]
			);
			foreach ( $parentRows as $row ) {
				$pageId = $row->page_id;
				$title = $row->rc_title;
				// Skip anything captured before, whether known by page ID or by title.
				if (
					( $pageId && isset( $this->processed[$pageId] ) ) ||
					isset( $seenTitles[$title] )
				) {
					continue;
				}
				$this->writeCategoryData( $row );
				if ( $pageId ) {
					$pages[$pageId] = $title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
					$this->processed[$pageId] = true;
				}
				$seenTitles[$title] = true;
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $sparql );
	}
}
708 }
709 
710 $maintClass = CategoryChangesAsRdf::class;
711 require_once RUN_MAINTENANCE_IF_MAIN;
const RC_LOG
Definition: Defines.php:118
const RC_EDIT
Definition: Defines.php:116
const NS_CATEGORY
Definition: Defines.php:78
const RC_CATEGORIZE
Definition: Defines.php:120
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addJoinConditions(array $conditions)
setFetchColumns(array $columns)
setCaller( $caller)
Use ->setCaller( METHOD ) to indicate which code is using this class.
addOptions(array $options)
Maintenance script to provide RDF representation of the recent changes in category tree.
getChangedCatsIterator(IDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
initialize()
Initialize external service classes.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
execute()
Do the actual work.
getDeletedCatsIterator(IDatabase $dbr, $fname)
Fetch deleted categories.
getNewCatsIterator(IDatabase $dbr, $fname)
Fetch newly created categories.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
getMovedCatsIterator(IDatabase $dbr, $fname)
Fetch moved categories.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getRestoredCatsIterator(IDatabase $dbr, $fname)
Fetch restored categories.
Library for creating and parsing MW-style timestamps.
Definition: MWTimestamp.php:40
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:66
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:36
A database connection without write operations.
const DB_REPLICA
Definition: defines.php:26