MediaWiki  master
categoryChangesAsRdf.php
Go to the documentation of this file.
1 <?php
22 
23 require_once __DIR__ . '/Maintenance.php';
24 
/**
 * Insert query.
 *
 * sprintf() template; the single %s receives the RDF text to insert.
 */
const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

/**
 * Delete query.
 *
 * sprintf() template; the single %s receives a whitespace-separated list of
 * category URLs (each wrapped in angle brackets). Every triple whose subject
 * is one of those categories gets removed.
 */
const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
 ?category ?x ?y
 VALUES ?category {
 %s
 }
};

SPARQLD;

/**
 * Delete/Insert query.
 *
 * sprintf() template with two placeholders: first the RDF text to insert,
 * then the whitespace-separated list of category URLs whose triples are
 * deleted.
 */
const SPARQL_DELETE_INSERT = <<<SPARQLDI
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
 ?category ?x ?y
 VALUES ?category {
 %s
 }
};

SPARQLDI;
73 
/**
 * Writer that accumulates the RDF output; created in initialize().
 * @var TurtleRdfWriter
 */
private $rdfWriter;

/**
 * Categories RDF helper.
 * @var CategoriesRdf
 */
private $categoriesRdf;

/**
 * Start of the processed interval (inclusive), as returned by
 * MWTimestamp::getTimestamp() for the 'start' option.
 */
private $startTS;

/**
 * End of the processed interval (exclusive), as returned by
 * MWTimestamp::getTimestamp() for the 'end' option.
 */
private $endTS;

/**
 * Page IDs that have already been handled during this run, stored as
 * page ID => true so later handlers can skip them.
 * @var bool[]
 */
protected $processed = [];
93 
/**
 * Register the script description, the batch size and the command line
 * options (output / start / end).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	// Argument order: name, description, required, withArg, shortName.
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
107 
/**
 * Initialize external service classes.
 *
 * A Turtle writer is used because SPARQL Update syntax is close to the
 * Turtle format; the same writer instance backs the CategoriesRdf helper.
 */
public function initialize() {
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
116 
/**
 * Script entry point.
 *
 * Reads the 'start' and 'end' options, reports (through error()) when either
 * is older than the configured RCMaxAge, opens the output stream, writes the
 * SPARQL prefixes, runs every change handler in a fixed order and finally
 * appends the dump timestamp update.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

	if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// fopen() returns false on failure; writing to it would only produce
		// a cascade of secondary errors, so stop here.
		$this->error( "Unable to open $outFile for writing.", 1 );
		return;
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the stream explicitly so buffered data reaches the file.
	fclose( $output );
}
171 
/**
 * Get the text of SPARQL INSERT DATA clause.
 *
 * Drains the RDF accumulated so far and wraps it in the SPARQL_INSERT
 * template.
 *
 * @return string The INSERT DATA statement, or an empty string when there
 *  is no accumulated RDF.
 */
private function getInsertRdf() {
	$accumulated = $this->getRdf();

	return $accumulated ? sprintf( self::SPARQL_INSERT, $accumulated ) : "";
}
183 
/**
 * Get SPARQL for updating set of categories.
 *
 * @param IDatabase $dbr
 * @param string[] $deleteUrls Bracketed category URLs whose triples are deleted
 * @param string[] $pages Page ID => title map; when non-empty, parent links
 *  for these pages are written before the INSERT clause is produced
 * @param string $mark Label emitted as a leading "# ..." comment line
 * @return string SPARQL text, or an empty string when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
204 
/**
 * Write parent data for a set of categories.
 *
 * @param IDatabase $dbr
 * @param string[] $pages Page ID => title map of the child categories
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) );

	foreach ( $links as $link ) {
		// cl_from is the child page ID, cl_to the parent category title.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
216 
/**
 * Generate SPARQL Update code for updating dump timestamp.
 *
 * The result first deletes any existing schema:dateModified triple of the
 * dump URI and then inserts one carrying the given timestamp.
 *
 * @param string|int $timestamp Timestamp accepted by wfTimestamp()
 * @return string SPARQL Update text
 */
public function updateTS( $timestamp ) {
	$subject = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );

	$lines = [
		'DELETE {',
		' ' . $subject . ' schema:dateModified ?o .',
		'}',
		'WHERE {',
		' ' . $subject . ' schema:dateModified ?o .',
		'};',
		'INSERT DATA {',
		' ' . $subject . ' schema:dateModified "' . $isoTime . '"^^xsd:dateTime .',
		'}',
	];

	return implode( "\n", $lines ) . "\n";
}
239 
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges batched by rc_timestamp, restricted to
 * the configured time window, and LEFT JOINs page_props (hiddencat flag)
 * and category (member counts).
 *
 * @param IDatabase $dbr
 * @param string[] $columns Extra columns to fetch, placed before the standard ones
 * @param string[] $extra_tables Extra tables to add to the query
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	$baseTables = [ 'recentchanges', 'page_props', 'category' ];
	$baseColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = rc_title' ]
		]
	];

	$iterator = new BatchRowIterator(
		$dbr,
		array_merge( $baseTables, $extra_tables ),
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );
	$iterator->addJoinConditions( $joins );
	$iterator->setFetchColumns( array_merge( $columns, $baseColumns ) );

	return $iterator;
}
282 
/**
 * Fetch newly created categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator Changes in the category namespace flagged rc_new
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );

	return $iterator;
}
296 
/**
 * Fetch moved categories.
 *
 * Joins the page table on rc_cur_id so each row also carries the page's
 * current title and namespace.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$extraTables = [ 'page' ];
	$iterator = $this->setupChangesIterator( $dbr, $extraColumns, $extraTables );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$iterator->addConditions( $conditions );

	$pageJoin = [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	];
	$iterator->addJoinConditions( $pageJoin );

	$this->addIndex( $iterator );

	return $iterator;
}
316 
/**
 * Fetch deleted categories.
 *
 * Only delete log entries whose page no longer has a page record are
 * returned; pages that exist again are left to the restore handler.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator Rows with rc_cur_id and rc_title
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$iterator = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Keep only rows without a page record. A page record means the
		// page was restored, and the restore handler will pick it up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $conditions );

	$this->addIndex( $iterator );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	return $iterator;
}
343 
/**
 * Fetch restored categories.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator Restore log entries whose page currently exists
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to entries that still have a page record.
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );

	return $iterator;
}
363 
/**
 * Fetch categorization changes or edits.
 *
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or
 *  RC_CATEGORIZE)
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );

	return $iterator;
}
380 
/**
 * Add timestamp limits to iterator.
 *
 * The window is start-inclusive and end-exclusive.
 *
 * @param BatchRowIterator $it Iterator to restrict
 * @param IDatabase $dbr Used to format and quote the timestamps
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$lowerBound = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$upperBound = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [
		'rc_timestamp >= ' . $lowerBound,
		'rc_timestamp < ' . $upperBound,
	] );
}
392 
/**
 * Force the new_name_timestamp index on recentchanges for this iterator.
 *
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
402 
/**
 * Get iterator for links for categories.
 *
 * @param IDatabase $dbr
 * @param int[] $ids Page IDs matched against cl_from
 * @return Traversable Flattened rows with cl_from and cl_to, subcat links only
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator(
		$dbr,
		'categorylinks',
		$linkColumns,
		$this->mBatchSize
	);
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $linkColumns );

	// Wrap the batch iterator so callers receive individual rows.
	return new RecursiveIteratorIterator( $batches );
}
423 
/**
 * Get accumulated RDF.
 *
 * @return string Whatever the RDF writer's drain() call returns
 */
public function getRdf() {
	$drained = $this->rdfWriter->drain();

	return $drained;
}
431 
/**
 * Handle category deletes.
 *
 * Covers only "true" deletes, i.e. pages that stayed deleted. Each affected
 * page ID is marked as processed so later handlers skip it.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$urls = [];
		foreach ( $rows as $deleted ) {
			// Duplicate URLs may occur here; that is harmless.
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $deleted->rc_title ) . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
449 
/**
 * Write category data to RDF.
 *
 * @param stdClass $row Row carrying rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$subcatCount = (int)$row->cat_subcats;
	// cat_pages minus subcategories and files.
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;
	$isHidden = $row->pp_propname === 'hiddencat';

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
462 
/**
 * Handle category moves.
 *
 * The old title is always scheduled for deletion. Data for the current title
 * is written only when the page was not handled before and is still in the
 * category namespace.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$movedPages = [];
		$oldUrls = [];

		foreach ( $rows as $move ) {
			$pageId = $move->rc_cur_id;
			$oldUrls[] = '<' . $this->categoriesRdf->labelToUrl( $move->rc_title ) . '>';

			// Skip rows captured by an earlier step, and pages moved out of
			// Category: (for those the delete alone is enough).
			$alreadyDone = isset( $this->processed[$pageId] );
			if ( $alreadyDone || $move->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// writeCategoryData() reads rc_title, so point it at the new title.
			$move->rc_title = $move->page_title;
			$this->writeCategoryData( $move );
			$movedPages[$pageId] = $move->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $oldUrls, $movedPages, "Moves" );
		fwrite( $output, $sparql );
	}
}
492 
/**
 * Handle category restores.
 *
 * Only finds restores that were not followed by another delete. Emits a
 * "# Restores" marker line, then one INSERT statement per batch that has
 * unprocessed pages.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	$batches = $this->getRestoredCatsIterator( $dbr );
	foreach ( $batches as $rows ) {
		$restored = [];

		foreach ( $rows as $entry ) {
			$pageId = $entry->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $entry );
				$restored[$pageId] = $entry->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $restored ) {
			$this->writeParentCategories( $dbr, $restored );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
521 
/**
 * Handle category additions.
 *
 * Emits a "# Additions" marker line, then one INSERT statement per batch
 * that has unprocessed pages.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr );
	foreach ( $batches as $rows ) {
		$created = [];

		foreach ( $rows as $entry ) {
			$pageId = $entry->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $entry );
				$created[$pageId] = $entry->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $created ) {
			$this->writeParentCategories( $dbr, $created );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
548 
/**
 * Handle edits for category texts.
 *
 * Editing a category can change its hidden flag and add new parents, so each
 * edited category not handled earlier is deleted and rewritten in full.
 * TODO: updating every edited category is expensive and most edits are not
 * interesting here. Recategorization is caught by the categorization step,
 * but a change of hidden status is not.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleEdits( IDatabase $dbr, $output ) {
	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT );

	foreach ( $batches as $rows ) {
		$edited = [];
		$urls = [];

		foreach ( $rows as $entry ) {
			$pageId = $entry->rc_cur_id;
			if ( isset( $this->processed[$pageId] ) ) {
				// Captured by an earlier step.
				continue;
			}

			$this->writeCategoryData( $entry );
			$edited[$pageId] = $entry->rc_title;
			$this->processed[$pageId] = true;
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $entry->rc_title ) . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urls, $edited, "Edits" );
		fwrite( $output, $sparql );
	}
}
578 
/**
 * Handles categorization changes.
 *
 * A categorization change can add new parents and change the counts of the
 * parent category. On a categorization event rc_cur_id points to the child
 * page while rc_title names the parent category, so each batch is resolved
 * in two stages: child rows are loaded by page ID, parent rows by title.
 *
 * Every category rewritten here has all of its triples deleted first, so its
 * page ID is recorded in $pages to have the parent links re-inserted by
 * getCategoriesUpdate(). Parent categories without a page record have no
 * page ID and therefore no links to restore.
 *
 * TODO: For now, we do full update even though some data hasn't changed,
 * e.g. parents for parent cat and counts for child cat.
 *
 * @param IDatabase $dbr
 * @param resource $output Stream the SPARQL text is written to
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Parent titles already written; needed because a parent category may
	// have no page ID to register in $this->processed.
	$processedTitle = [];

	foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
		$childPages = [];
		$parentCats = [];
		foreach ( $batch as $row ) {
			$childPages[$row->rc_cur_id] = true;
			$parentCats[$row->rc_title] = true;
		}

		// Page ID => title of every category rewritten in this batch.
		$pages = [];
		$deleteUrls = [];

		if ( $childPages ) {
			// Load child rows by ID
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
				__METHOD__,
				[],
				[
					'page_props' => [
						'LEFT JOIN',
						[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
					],
					'category' => [
						'LEFT JOIN',
						[ 'cat_title = page_title' ],
					],
				]
			);
			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->page_id] = $row->rc_title;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->page_id] = true;
			}
		}

		if ( $parentCats ) {
			// Load parent rows by title
			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'cat_title' => array_keys( $parentCats ) ],
				__METHOD__,
				[],
				[
					'page' => [
						'LEFT JOIN',
						[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
					],
					'page_props' => [
						'LEFT JOIN',
						[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
					],
				]
			);
			foreach ( $parentRows as $row ) {
				if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				if ( isset( $processedTitle[$row->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->rc_title;
					$this->processed[$row->page_id] = true;
				}
				$processedTitle[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
695 }
696 
// Name the maintenance class defined in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN.
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
handleDeletes(IDatabase $dbr, $output)
Handle category deletes.
getDeletedCatsIterator(IDatabase $dbr)
Fetch deleted categories.
const RC_CATEGORIZE
Definition: Defines.php:126
Maintenance script to provide RDF representation of the recent changes in category tree...
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
getRdf()
Get accumulated RDF.
error( $err, $die=0)
Throw an error to the user.
addIndex(BatchRowIterator $it)
Need to force index, somehow on terbium the optimizer chooses wrong one.
addTimestampConditions(BatchRowIterator $it, IDatabase $dbr)
Add timestamp limits to iterator.
getMovedCatsIterator(IDatabase $dbr)
Fetch moved categories.
getOption( $name, $default=null)
Get an option, or return the default.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:82
Helper class to produce RDF representation of categories.
const SPARQL_DELETE_INSERT
Delete/Insert query.
setBatchSize( $s=0)
Set the batch size.
timestamp( $ts=0)
Convert a timestamp in one of the formats accepted by ConvertibleTimestamp to the format used for inserting into timestamp fields in this DBMS.
updateTS( $timestamp)
Generate SPARQL Update code for updating dump timestamp.
writeParentCategories(IDatabase $dbr, $pages)
Write parent data for a set of categories.
getRestoredCatsIterator(IDatabase $dbr)
Fetch restored categories.
const SPARQL_INSERT
Insert query.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
getInsertRdf()
Get the text of SPARQL INSERT DATA clause.
handleRestores(IDatabase $dbr, $output)
writeCategoryData( $row)
Write category data to RDF.
addDescription( $text)
Set the description text.
const NS_CATEGORY
Definition: Defines.php:74
handleMoves(IDatabase $dbr, $output)
handleCategorization(IDatabase $dbr, $output)
Handles categorization changes.
getCategoriesUpdate(IDatabase $dbr, $deleteUrls, $pages, $mark)
Get SPARQL for updating set of categories.
const SPARQL_DELETE
Delete query.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
handleEdits(IDatabase $dbr, $output)
Handle edits for category texts.
handleAdds(IDatabase $dbr, $output)
select( $table, $vars, $conds='', $fname=__METHOD__, $options=[], $join_conds=[])
Execute a SELECT query constructed using the various parameters provided.
getChangedCatsIterator(IDatabase $dbr, $type)
Fetch categorization changes or edits.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
CategoriesRdf $categoriesRdf
Categories RDF helper.
initialize()
Initialize external service classes.
const DB_REPLICA
Definition: defines.php:25
addOptions(array $options)
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
addQuotes( $s)
Escape and quote a raw value string for use in a SQL query.
setupChangesIterator(IDatabase $dbr, array $columns=[], array $extra_tables=[])
Set up standard iterator for retrieving category changes.
getNewCatsIterator(IDatabase $dbr)
Fetch newly created categories.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License.
const RC_EDIT
Definition: Defines.php:122
const RC_LOG
Definition: Defines.php:124
addConditions(array $conditions)