MediaWiki  master
categoryChangesAsRdf.php
Go to the documentation of this file.
1 <?php
22 
23 require_once __DIR__ . '/Maintenance.php';
24 
/**
 * Insert query.
 *
 * sprintf() template; the single %s receives the triples to insert.
 */
const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

/**
 * Delete query.
 *
 * sprintf() template; the single %s receives the space-separated list of
 * category URLs whose triples (with the category as subject) are removed.
 */
const SPARQL_DELETE = <<<SPARQL
DELETE {
?category ?x ?y
} WHERE {
 ?category ?x ?y
 VALUES ?category {
 %s
 }
};

SPARQL;

/**
 * Delete/Insert query.
 *
 * sprintf() template; the first %s receives the triples to insert, the
 * second %s the list of category URLs whose existing triples are removed.
 */
const SPARQL_DELETE_INSERT = <<<SPARQL
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
 ?category ?x ?y
 VALUES ?category {
 %s
 }
};

SPARQL;
73 
/**
 * @var TurtleRdfWriter RDF writer created in initialize(); its accumulated
 * output is fetched with drain() in getRdf().
 */
77  private $rdfWriter;
/**
 * @var CategoriesRdf Categories RDF helper, constructed around $rdfWriter
 * in initialize().
 */
82  private $categoriesRdf;
83 
/** @var mixed Start of the processed interval, as returned by MWTimestamp::getTimestamp() in execute() */
84  private $startTS;
/** @var mixed End of the processed interval, as returned by MWTimestamp::getTimestamp() in execute() */
85  private $endTS;
86 
/**
 * @var bool[] Page IDs (rc_cur_id / page_id) that an earlier handler has
 * already written, mapped to true, so later handlers skip them.
 */
92  protected $processed = [];
93 
/**
 * Register the script description, batch size and command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// addOption() arguments: name, description, required, withArg, shortName.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
107 
/**
 * Initialize external service classes.
 *
 * Creates the RDF writer and the categories helper that writes through it.
 */
public function initialize() {
	// SPARQL Update syntax is close to Turtle format, so a Turtle writer is used.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
116 
/**
 * Produce the SPARQL Update script for category changes in [start, end).
 *
 * Writes the prefix declarations, then the deletes, moves, restores,
 * additions, edits and categorization changes (in that order), and finally
 * the dump timestamp update, to the selected output.
 *
 * @return bool|null False if the output could not be opened, null otherwise
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

	// NOTE(review): error() is called without a die code here, so these two
	// checks only report the problem and processing continues - confirm intended.
	if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a valid handle every fwrite() below would fail, so stop here.
		$this->error( "Unable to open output file $outFile for writing." );
		return false;
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Flush and release our handle; for php://stdout this closes only our copy.
	fclose( $output );
}
171 
/**
 * Get the text of SPARQL INSERT DATA clause.
 *
 * Drains the RDF accumulated so far and wraps it in the insert template.
 *
 * @return string The INSERT DATA statement, or "" when nothing was accumulated
 */
private function getInsertRdf() {
	$triples = $this->getRdf();

	return $triples ? sprintf( self::SPARQL_INSERT, $triples ) : "";
}
183 
/**
 * Get SPARQL for updating set of categories.
 *
 * @param IDatabase $dbr
 * @param string[] $deleteUrls Category URLs whose existing triples get deleted
 * @param string[] $pages Map of page ID => title for which parent links are written
 * @param string $mark Label placed in a leading "#" comment line
 * @return string SPARQL text, or "" when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( empty( $deleteUrls ) ) {
		return "";
	}

	if ( !empty( $pages ) ) {
		// Parent links go into the RDF writer and come out with the insert below.
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
204 
/**
 * Write parent data for a set of categories.
 *
 * @param IDatabase $dbr
 * @param string[] $pages Map of page ID => category title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) );

	foreach ( $links as $link ) {
		// cl_from is the child's page ID; look its title up in the supplied map.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
216 
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * @param mixed $timestamp Timestamp in any format accepted by wfTimestamp()
 * @return string SPARQL that replaces schema:dateModified on the dump URI
 */
public function updateTS( $timestamp ) {
	$subject = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
 $subject schema:dateModified ?o .
}
WHERE {
 $subject schema:dateModified ?o .
};
INSERT DATA {
 $subject schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
239 
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator batches over recentchanges by rc_timestamp, limited to the
 * configured time window, and left-joins page_props (hiddencat) and category.
 *
 * @param IDatabase $dbr
 * @param string[] $columns Additional columns to fetch, placed before the standard ones
 * @param string[] $extra_tables Additional tables to select from
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	$baseTables = [ 'recentchanges', 'page_props', 'category' ];
	$tables = $extra_tables ? array_merge( $baseTables, $extra_tables ) : $baseTables;

	$iterator = new BatchRowIterator( $dbr, $tables, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $iterator, $dbr );

	$hiddenCatJoin = [ 'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] ];
	$categoryJoin = [ 'LEFT JOIN', [ 'cat_title = rc_title' ] ];
	$iterator->addJoinConditions( [
		'page_props' => $hiddenCatJoin,
		'category' => $categoryJoin,
	] );

	$standardColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$iterator->setFetchColumns( array_merge( $columns, $standardColumns ) );

	return $iterator;
}
282 
/**
 * Fetch newly created categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );

	return $iterator;
}
296 
/**
 * Fetch moved categories.
 *
 * Joins the page table so each row also carries the page's current
 * title and namespace.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$pageColumns = [ 'page_title', 'page_namespace' ];
	$iterator = $this->setupChangesIterator( $dbr, $pageColumns, [ 'page' ] );

	$moveConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$iterator->addConditions( $moveConditions );

	$iterator->addJoinConditions( [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	] );
	$this->addIndex( $iterator );

	return $iterator;
}
316 
/**
 * Fetch deleted categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$iterator = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );

	$deleteConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Only rows without a page record are fetched. A page record means the
		// category was restored, and the restore handler picks those up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $deleteConditions );

	$this->addIndex( $iterator );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	return $iterator;
}
343 
/**
 * Fetch restored categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$restoreConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to restores whose page record currently exists
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $restoreConditions );
	$this->addIndex( $iterator );

	return $iterator;
}
363 
/**
 * Fetch categorization changes or edits.
 *
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
	$changeConditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $changeConditions );
	$this->addIndex( $iterator );

	return $iterator;
}
381 
/**
 * Add timestamp limits to iterator.
 *
 * The start bound is inclusive and the end bound exclusive.
 *
 * @param BatchRowIterator $it
 * @param IDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$lowerBound = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$upperBound = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [
		'rc_timestamp >= ' . $lowerBound,
		'rc_timestamp < ' . $upperBound,
	] );
}
393 
/**
 * Force the new_name_timestamp index on recentchanges for this iterator.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
403 
/**
 * Get iterator for links for categories.
 *
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @return Traversable Yields individual rows with cl_from and cl_to
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );

	// Flatten the batches so callers iterate over single rows.
	return new RecursiveIteratorIterator( $batches );
}
424 
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() returns
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
432 
/**
 * Handle category deletes.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	// Only "true" deletes are covered, i.e. pages that stay deleted.
	foreach ( $this->getDeletedCatsIterator( $dbr ) as $deletedRows ) {
		$urls = [];
		foreach ( $deletedRows as $deleted ) {
			// Duplicate URLs may end up in the list; that is harmless.
			$categoryUrl = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			$urls[] = '<' . $categoryUrl . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
450 
/**
 * Write category data to RDF.
 *
 * @param stdClass $row Row with rc_title, pp_propname, cat_pages, cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$title = $row->rc_title;
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData( $title, $isHidden, $pageCount, $subcatCount );
}
463 
/**
 * Handle category moves.
 *
 * The old title is always scheduled for deletion. Data for the new title is
 * written only when the page is still in the Category namespace and was not
 * handled earlier.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleMoves( IDatabase $dbr, $output ) {
	foreach ( $this->getMovedCatsIterator( $dbr ) as $movedRows ) {
		$urlsToDelete = [];
		$movedPages = [];

		foreach ( $movedRows as $moved ) {
			$pageId = $moved->rc_cur_id;
			$urlsToDelete[] = '<' . $this->categoriesRdf->labelToUrl( $moved->rc_title ) . '>';

			// Skip pages captured before, and pages moved out of Category:
			// (for those the delete above is all that is needed).
			if ( isset( $this->processed[$pageId] ) || $moved->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// From here on the row describes the category under its current title.
			$moved->rc_title = $moved->page_title;
			$this->writeCategoryData( $moved );
			$movedPages[$pageId] = $moved->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urlsToDelete, $movedPages, "Moves" );
		fwrite( $output, $sparql );
	}
}
493 
/**
 * Handle category restores.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// Only restores that were not followed by another delete are found here.
	foreach ( $this->getRestoredCatsIterator( $dbr ) as $restoredRows ) {
		$restoredPages = [];
		foreach ( $restoredRows as $restored ) {
			$pageId = $restored->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$restoredPages[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $restoredPages ) {
			$this->writeParentCategories( $dbr, $restoredPages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
522 
/**
 * Handle newly created categories.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	foreach ( $this->getNewCatsIterator( $dbr ) as $newRows ) {
		$addedPages = [];
		foreach ( $newRows as $added ) {
			$pageId = $added->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $added );
				$addedPages[$pageId] = $added->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $addedPages ) {
			$this->writeParentCategories( $dbr, $addedPages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
549 
/**
 * Handle edits for category texts.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// An edit can flip the hidden flag and add new parents.
	// TODO: refreshing every edited category is expensive and most edits are
	// not interesting for us. Is there a way to tell which ones are?
	// Recategorization is caught by the next step, a hidden-status change is not.
	foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT ) as $editedRows ) {
		$urlsToDelete = [];
		$editedPages = [];

		foreach ( $editedRows as $edited ) {
			$pageId = $edited->rc_cur_id;
			// For categorization events rc_cur_id is the child page, not the parent category.
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $edited );
				$editedPages[$pageId] = $edited->rc_title;
				$this->processed[$pageId] = true;
				$urlsToDelete[] = '<' . $this->categoriesRdf->labelToUrl( $edited->rc_title ) . '>';
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urlsToDelete, $editedPages, "Edits" );
		fwrite( $output, $sparql );
	}
}
579 
/**
 * Handles categorization changes.
 *
 * Each refreshed category has all its existing triples deleted (see
 * SPARQL_DELETE) and its data rewritten. Categories that have a page record
 * are also put into $pages, so getCategoriesUpdate() re-emits their parent
 * links via writeParentCategories(); with $pages left empty that step is
 * skipped and the links are not written again after the delete.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL is written to
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Titles of parent categories already written; persists across batches.
	$processedTitle = [];
	// Categorization change can add new parents and change counts
	// for the parent category.
	foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childPages = [];
		$parentCats = [];
		foreach ( $batch as $row ) {
			$childPages[$row->rc_cur_id] = true;
			$parentCats[$row->rc_title] = true;
		}

		// Page ID => title of every category refreshed in this batch.
		$pages = [];
		$deleteUrls = [];

		if ( $childPages ) {
			// Load child rows by ID
			$childJoinConditions = [
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
				'category' => [
					'LEFT JOIN',
					[ 'cat_title = page_title' ],
				],
			];

			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
				__METHOD__,
				[],
				$childJoinConditions
			);
			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				// Remember the page so its parent links are re-emitted after the delete.
				$pages[$row->page_id] = $row->rc_title;
				$this->processed[$row->page_id] = true;
			}
		}

		if ( $parentCats ) {
			// Load parent rows by title
			$parentJoinConditions = [
				'page' => [
					'LEFT JOIN',
					[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
				],
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
			];

			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'cat_title' => array_keys( $parentCats ) ],
				__METHOD__,
				[],
				$parentJoinConditions
			);
			foreach ( $parentRows as $row ) {
				if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				if ( isset( $processedTitle[$row->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				// page_id is empty when the LEFT JOIN found no page record for the
				// category; such rows have no ID to look up links with.
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->rc_title;
					$this->processed[$row->page_id] = true;
				}
				$processedTitle[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
696 }
697 
// Name the maintenance class defined in this file, then include the
// RUN_MAINTENANCE_IF_MAIN bootstrap (presumably runs the class when this
// file is the script entry point - confirm against Maintenance.php).
698 $maintClass = CategoryChangesAsRdf::class;
699 require_once RUN_MAINTENANCE_IF_MAIN;
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
getDeletedCatsIterator(IDatabase $dbr)
Fetch deleted categories.
const RC_CATEGORIZE
Definition: Defines.php:126
Maintenance script to provide RDF representation of the recent changes in category tree...
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
getRdf()
Get accumulated RDF.
error( $err, $die=0)
Throw an error to the user.
addIndex(BatchRowIterator $it)
Need to force index, somehow on terbium the optimizer chooses wrong one.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
getMovedCatsIterator(IDatabase $dbr)
Fetch moved categories.
getOption( $name, $default=null)
Get an option, or return the default.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:86
Helper class to produce RDF representation of categories.
const SPARQL_DELETE_INSERT
Delete/Insert query.
setBatchSize( $s=0)
Set the batch size.
timestamp( $ts=0)
Convert a timestamp in one of the formats accepted by ConvertibleTimestamp to the format used for ins...
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getRestoredCatsIterator(IDatabase $dbr)
Fetch restored categories.
const SPARQL_INSERT
Insert query.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
handleRestores(IDatabase $dbr, $output)
writeCategoryData( $row)
Write category data to RDF.
addDescription( $text)
Set the description text.
const NS_CATEGORY
Definition: Defines.php:74
handleMoves(IDatabase $dbr, $output)
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
const SPARQL_DELETE
Delete query.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleAdds(IDatabase $dbr, $output)
select( $table, $vars, $conds='', $fname=__METHOD__, $options=[], $join_conds=[])
Execute a SELECT query constructed using the various parameters provided.
getChangedCatsIterator(IDatabase $dbr, $type)
Fetch categorization changes or edits.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
CategoriesRdf $categoriesRdf
Categories RDF helper.
initialize()
Initialize external service classes.
const DB_REPLICA
Definition: defines.php:25
addOptions(array $options)
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[])
Set up standard iterator for retrieving category changes.
getNewCatsIterator(IDatabase $dbr)
Fetch newly created categories.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU Ge...
const RC_EDIT
Definition: Defines.php:122
const RC_LOG
Definition: Defines.php:124
addConditions(array $conditions)