MediaWiki  master
categoryChangesAsRdf.php
Go to the documentation of this file.
1 <?php
20 use Wikimedia\Purtle\RdfWriter;
21 use Wikimedia\Purtle\TurtleRdfWriter;
23 
24 require_once __DIR__ . '/Maintenance.php';
25 
/**
 * Insert query.
 *
 * sprintf() template: the single %s placeholder receives the RDF text
 * that getInsertRdf() drains from the writer.
 */
36  private const SPARQL_INSERT = <<<SPARQL
37 INSERT DATA {
38 %s
39 };
40 
41 SPARQL;
42 
/**
 * Delete query.
 *
 * sprintf() template: the single %s placeholder receives the space-separated
 * list of bracketed category URLs built in getCategoriesUpdate(). Every triple
 * whose subject is one of those categories is removed.
 */
46  private const SPARQL_DELETE = <<<SPARQLD
47 DELETE {
48 ?category ?x ?y
49 } WHERE {
50  ?category ?x ?y
51  VALUES ?category {
52  %s
53  }
54 };
55 
56 SPARQLD;
57 
/** @var RdfWriter RDF writer that accumulates output; a TurtleRdfWriter is assigned in initialize(). */
61  private $rdfWriter;
/** @var CategoriesRdf Categories RDF helper, constructed around $rdfWriter in initialize(). */
66  private $categoriesRdf;
67 
// Lower bound (inclusive) of the processed time range; set in execute() from the 'start' option.
68  private $startTS;
// Upper bound (exclusive) of the processed time range; set in execute() from the 'end' option.
69  private $endTS;
70 
/**
 * List of processed page IDs, so we don't try to process same thing twice.
 * @var true[] keyed by page ID
 */
76  protected $processed = [];
77 
/**
 * Default constructor.
 *
 * Registers the script description, the default batch size and the
 * command line options (output file, start and end timestamps).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// Optional: where to write the result.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	// Required: lower bound of the time range.
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	// Required: upper bound of the time range.
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
91 
/**
 * Initialize external service classes.
 *
 * A Turtle writer is used because SPARQL Update syntax is close to the
 * Turtle format; the categories helper shares that same writer instance.
 */
public function initialize() {
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
100 
/**
 * Do the actual work.
 *
 * Reads the 'start'/'end' options, opens the output stream, writes the
 * SPARQL prefixes, then emits the update statements for deletes, moves,
 * restores, additions, edits and categorization changes (in that order),
 * followed by the dump timestamp update.
 *
 * Compared with the previous revision, a failure to open the output
 * target is now reported instead of being passed on to fwrite(), and the
 * stream is closed once processing ends.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	// "Now" is evaluated once and reused for both age checks.
	$nowUnix = (int)( new MWTimestamp() )->getTimestamp( TS_UNIX );
	$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

	// Reported via error() exactly as before; no additional abort is introduced here.
	if ( $nowUnix - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $nowUnix - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// fopen() returns false on failure; every later fwrite() would fail on it.
		// NOTE(review): fatalError() is expected to terminate the script - confirm
		// against the Maintenance base class.
		$this->fatalError( "Unable to open output file '$outFile' for writing!" );
	}

	try {
		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );
	} finally {
		// Release the stream even when one of the handlers throws.
		fclose( $output );
	}
}
155 
/**
 * Get the text of SPARQL INSERT DATA clause.
 *
 * Drains whatever RDF has accumulated in the writer and wraps it in the
 * SPARQL_INSERT template.
 *
 * @return string The INSERT DATA statement, or an empty string when no RDF was pending
 */
private function getInsertRdf() {
	$pendingRdf = $this->getRdf();

	return $pendingRdf ? sprintf( self::SPARQL_INSERT, $pendingRdf ) : "";
}
167 
/**
 * Get SPARQL for updating set of categories.
 *
 * @param IDatabase $dbr Database used to look up parent categories
 * @param string[] $deleteUrls Bracketed category URLs whose triples are to be deleted
 * @param string[] $pages Category titles keyed by page ID, whose parent links are written
 * @param string $mark Label emitted as a "# ..." comment line in front of the statements
 * @return string SPARQL text, or an empty string when $deleteUrls is empty
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	// The DELETE clause is built first, then the pending RDF is drained into the INSERT.
	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
188 
/**
 * Write parent data for a set of categories.
 *
 * The data is written into the shared RDF writer through the categories helper.
 *
 * @param IDatabase $dbr
 * @param string[] $pages Category titles keyed by page ID
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds, __METHOD__ );

	foreach ( $links as $link ) {
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
200 
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * The returned text first deletes any existing schema:dateModified triple
 * of the dump URI and then inserts one carrying the given timestamp.
 *
 * @param mixed $timestamp Timestamp in a form accepted by wfTimestamp()
 * @return string SPARQL Update query
 */
206  public function updateTS( $timestamp ) {
// The dump URI comes from the categories helper and is wrapped in angle brackets.
207  $dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
// Converted to ISO 8601 because the value is emitted as an xsd:dateTime literal.
208  $ts = wfTimestamp( TS_ISO_8601, $timestamp );
209  $tsQuery = <<<SPARQL
210 DELETE {
211  $dumpUrl schema:dateModified ?o .
212 }
213 WHERE {
214  $dumpUrl schema:dateModified ?o .
215 };
216 INSERT DATA {
217  $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
218 }
219 
220 SPARQL;
221  return $tsQuery;
222  }
223 
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges in batches ordered by rc_timestamp,
 * restricted to the configured time range, with LEFT JOINs to page_props
 * (hiddencat flag) and category (member counts).
 *
 * @param IDatabase $dbr
 * @param string[] $columns Additional columns to fetch, placed before the standard ones
 * @param string[] $extra_tables Additional tables to select from
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = [],
	$fname = __METHOD__
) {
	$standardTables = [ 'recentchanges', 'page_props', 'category' ];
	$standardColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = rc_title' ]
		]
	];

	// Merging an empty list leaves the standard tables untouched.
	$allTables = array_merge( $standardTables, $extra_tables );

	$iterator = new BatchRowIterator( $dbr, $allTables, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $iterator, $dbr );
	$iterator->addJoinConditions( $joins );
	$iterator->setFetchColumns( array_merge( $columns, $standardColumns ) );
	$iterator->setCaller( $fname );

	return $iterator;
}
269 
/**
 * Fetch newly created categories.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator Iterator over recentchanges rows flagged as new in the category namespace
 */
protected function getNewCatsIterator( IDatabase $dbr, $fname ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $conditions );

	return $iterator;
}
284 
/**
 * Fetch moved categories.
 *
 * Joins the page table so that each row also carries the current title and
 * namespace of the moved page.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr, $fname ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$iterator = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables, $fname );

	// Only move log entries in the category namespace.
	$iterator->addConditions( [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	] );
	$iterator->addJoinConditions( [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	] );
	$this->addIndex( $iterator, $dbr );

	return $iterator;
}
310 
/**
 * Fetch deleted categories.
 *
 * Only deletions whose page no longer has a page record are returned.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator Iterator yielding rows with rc_cur_id and rc_title
 */
protected function getDeletedCatsIterator( IDatabase $dbr, $fname ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// We will fetch ones that do not have page record. If they do,
		// this means they were restored, thus restoring handler will pick it up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator, $dbr );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
	$iterator->setCaller( $fname );

	return $iterator;
}
339 
/**
 * Fetch restored categories.
 *
 * Only restores whose page currently has a page record are returned.
 *
 * @param IDatabase $dbr
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr, $fname ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// We will only fetch ones that have page record
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator, $dbr );

	return $iterator;
}
360 
/**
 * Fetch categorization changes or edits.
 *
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
 * @param string $fname Name of the calling function
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type, $fname ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr, [], [], $fname );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator, $dbr );

	return $iterator;
}
378 
/**
 * Add timestamp limits to iterator.
 *
 * Restricts the iterator to rows with startTS <= rc_timestamp < endTS.
 *
 * NOTE(review): the declaration line was missing from this listing; the
 * parameter list follows the member index entry for this method, while the
 * "private" visibility is an assumption (it mirrors addIndex()) - confirm
 * against the canonical file.
 *
 * @param BatchRowIterator $it Iterator to constrain
 * @param IDatabase $dbr Database used for timestamp formatting and quoting
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$it->addConditions( [
		'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
		'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
	] );
}
390 
/**
 * Need to force index, somehow on terbium the optimizer chooses wrong one.
 *
 * @param BatchRowIterator $it Iterator that receives the USE INDEX option
 * @param IDatabase $dbr Not used by this method
 */
private function addIndex( BatchRowIterator $it, IDatabase $dbr ) {
	$forcedIndex = [ 'recentchanges' => 'rc_new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
401 
/**
 * Get iterator for links for categories.
 *
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @param string $fname Name of the calling function
 * @return Traversable Flattened iterator over rows carrying cl_from and cl_to
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$batches->setCaller( $fname );

	// Wrapped so that callers iterate over single rows rather than batches.
	return new RecursiveIteratorIterator( $batches );
}
424 
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() call returns
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
432 
/**
 * Handle category deletes.
 *
 * Covers only "true" deletes, i.e. pages that stay deleted. Each batch is
 * written to $output as one "Deletes" update.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$urlsToDelete = [];
		foreach ( $rows as $deleted ) {
			// This can produce duplicates, we don't care
			$categoryUrl = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			$urlsToDelete[] = '<' . $categoryUrl . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urlsToDelete, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
451 
/**
 * Write category data to RDF.
 *
 * @param stdClass $row Row carrying rc_title, pp_propname, cat_pages, cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$remainingCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$remainingCount,
		$subcatCount
	);
}
464 
/**
 * Handle category moves.
 *
 * The old title is always scheduled for deletion. Fresh data under the
 * current title is written only when the page still lives in the category
 * namespace and has not been handled yet.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr, __METHOD__ );

	foreach ( $batches as $rows ) {
		$urlsToDelete = [];
		$movedPages = [];

		foreach ( $rows as $move ) {
			$pageId = $move->rc_cur_id;
			$urlsToDelete[] = '<' . $this->categoriesRdf->labelToUrl( $move->rc_title ) . '>';

			// Skip when we already captured this one before, or when the page
			// was moved out of Category: (then we'll just delete).
			if ( isset( $this->processed[$pageId] ) || $move->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// Describe the category under its current title.
			$move->rc_title = $move->page_title;
			$this->writeCategoryData( $move );
			$movedPages[$pageId] = $move->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urlsToDelete, $movedPages, "Moves" );
		fwrite( $output, $sparql );
	}
}
494 
/**
 * Handle category restores.
 *
 * Writes a "# Restores" marker, then one INSERT per batch containing the
 * data and parent links of restored categories not handled earlier.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// This will only find those restores that were not deleted later.
	$batches = $this->getRestoredCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $rows ) {
		$restoredPages = [];

		foreach ( $rows as $restored ) {
			$pageId = $restored->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$restoredPages[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $restoredPages ) {
			$this->writeParentCategories( $dbr, $restoredPages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
524 
/**
 * Handle category additions.
 *
 * Writes a "# Additions" marker, then one INSERT per batch containing the
 * data and parent links of new categories not handled earlier.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr, __METHOD__ );
	foreach ( $batches as $rows ) {
		$addedPages = [];

		foreach ( $rows as $added ) {
			$pageId = $added->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $added );
				$addedPages[$pageId] = $added->rc_title;
				$this->processed[$pageId] = true;
			}
			// Otherwise we already captured this one before.
		}

		if ( $addedPages ) {
			$this->writeParentCategories( $dbr, $addedPages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
552 
/**
 * Handle edits for category texts.
 *
 * Each edited category that was not handled earlier is fully rewritten:
 * its existing triples are deleted and fresh data plus parent links are
 * inserted.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.

	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ );
	foreach ( $batches as $rows ) {
		$editedPages = [];
		$urlsToDelete = [];

		foreach ( $rows as $edit ) {
			$pageId = $edit->rc_cur_id;
			// Note that on categorization event, cur_id points to
			// the child page, not the parent category!
			if ( isset( $this->processed[$pageId] ) ) {
				// We already captured this one before
				continue;
			}

			$this->writeCategoryData( $edit );
			$editedPages[$pageId] = $edit->rc_title;
			$this->processed[$pageId] = true;
			$urlsToDelete[] = '<' . $this->categoriesRdf->labelToUrl( $edit->rc_title ) . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urlsToDelete, $editedPages, "Edits" );
		fwrite( $output, $sparql );
	}
}
583 
/**
 * Handles categorization changes.
 *
 * Categorization change can add new parents and change counts for the
 * parent category. On a categorization event, rc_cur_id points to the
 * child page while rc_title names the parent category, so every batch is
 * resolved in two stages: child categories are loaded by page ID, parent
 * categories by title.
 *
 * TODO: For now, we do full update even though some data hasn't changed,
 * e.g. parents for parent cat and counts for child cat.
 *
 * @param IDatabase $dbr
 * @param resource $output File handle to write to
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Titles of parent categories already written, across all batches.
	$seenTitles = [];

	// The join definitions do not depend on the batch, so they are built once.
	$childJoins = [
		'page_props' => [
			'LEFT JOIN',
			[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
		],
		'category' => [
			'LEFT JOIN',
			[ 'cat_title = page_title' ],
		],
	];
	$parentJoins = [
		'page' => [
			'LEFT JOIN',
			[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
		],
		'page_props' => [
			'LEFT JOIN',
			[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
		],
	];

	$batches = $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ );
	foreach ( $batches as $rows ) {
		// Stage 0: collect the distinct child IDs and parent titles of this batch.
		$childIds = [];
		$parentTitles = [];
		foreach ( $rows as $change ) {
			$childIds[$change->rc_cur_id] = true;
			$parentTitles[$change->rc_title] = true;
		}

		$updatedPages = [];
		$urlsToDelete = [];

		// Stage 1: load child rows by ID
		if ( $childIds ) {
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childIds ) ],
				__METHOD__,
				[],
				$childJoins
			);
			foreach ( $childRows as $child ) {
				if ( isset( $this->processed[$child->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $child );
				if ( $child->page_id ) {
					$updatedPages[$child->page_id] = $child->rc_title;
					$urlsToDelete[] = '<' . $this->categoriesRdf->labelToUrl( $child->rc_title ) . '>';
					$this->processed[$child->page_id] = true;
				}
			}
		}

		// Stage 2: load parent rows by title
		if ( $parentTitles ) {
			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				// Array keys that look numeric come back as integers, hence strval.
				[ 'cat_title' => array_map( 'strval', array_keys( $parentTitles ) ) ],
				__METHOD__,
				[],
				$parentJoins
			);
			foreach ( $parentRows as $parent ) {
				$alreadyById = $parent->page_id && isset( $this->processed[$parent->page_id] );
				if ( $alreadyById || isset( $seenTitles[$parent->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $parent );
				if ( $parent->page_id ) {
					$updatedPages[$parent->page_id] = $parent->rc_title;
					$urlsToDelete[] = '<' . $this->categoriesRdf->labelToUrl( $parent->rc_title ) . '>';
					$this->processed[$parent->page_id] = true;
				}
				$seenTitles[$parent->rc_title] = true;
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urlsToDelete, $updatedPages, "Changes" );
		fwrite( $output, $sparql );
	}
}
706 }
707 
// Name the class implemented by this script, then include the file referenced
// by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably that file reads $maintClass and runs the script when
// this file is the entry point - confirm against Maintenance.php.
708 $maintClass = CategoryChangesAsRdf::class;
709 require_once RUN_MAINTENANCE_IF_MAIN;
const RC_LOG
Definition: Defines.php:117
const RC_EDIT
Definition: Defines.php:115
const NS_CATEGORY
Definition: Defines.php:78
const RC_CATEGORIZE
Definition: Defines.php:119
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
addConditions(array $conditions)
addOptions(array $options)
Helper class to produce RDF representation of categories.
Maintenance script to provide RDF representation of the recent changes in category tree.
getChangedCatsIterator(IDatabase $dbr, $type, $fname)
Fetch categorization changes or edits.
writeCategoryData( $row)
Write category data to RDF.
initialize()
Initialize external service classes.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
execute()
Do the actual work.
getDeletedCatsIterator(IDatabase $dbr, $fname)
Fetch deleted categories.
addIndex(BatchRowIterator $it, IDatabase $dbr)
Need to force index, somehow on terbium the optimizer chooses wrong one.
getNewCatsIterator(IDatabase $dbr, $fname)
Fetch newly created categories.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
true[] $processed
List of processed page IDs, so we don't try to process same thing twice.
const SPARQL_DELETE
Delete query.
getMovedCatsIterator(IDatabase $dbr, $fname)
Fetch moved categories.
const SPARQL_INSERT
Insert query.
handleMoves(IDatabase $dbr, $output)
getRdf()
Get accumulated RDF.
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
CategoriesRdf $categoriesRdf
Categories RDF helper.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[], $fname=__METHOD__)
Set up standard iterator for retrieving category changes.
__construct()
Default constructor.
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleRestores(IDatabase $dbr, $output)
handleAdds(IDatabase $dbr, $output)
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
getRestoredCatsIterator(IDatabase $dbr, $fname)
Fetch restored categories.
Library for creating and parsing MW-style timestamps.
Definition: MWTimestamp.php:39
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:61
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
error( $err, $die=0)
Throw an error to the user.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:40
const DB_REPLICA
Definition: defines.php:25