MediaWiki  master
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1 <?php
21 use Wikimedia\Purtle\RdfWriter;
22 use Wikimedia\Purtle\RdfWriterFactory;
24 
25 require_once __DIR__ . '/Maintenance.php';
26 
/** RDF writer created in execute() via createRdfWriter(); its drained output is written to the dump. */
37  private $rdfWriter;
/** CategoriesRdf helper constructed in execute() around $rdfWriter; used to emit category data. */
42  private $categoriesRdf;
43 
/**
 * Register the script description, the default batch size (200) and the
 * two command-line options this script understands: 'output' and 'format'.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of categories in a wiki." );
	$this->setBatchSize( 200 );

	// Both options are optional and each one takes an argument.
	$required = false;
	$withArg = true;

	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		$required,
		$withArg
	);
	$this->addOption( 'format', "Set the dump format.", $required, $withArg );
}
54 
/**
 * Build the batched row iterator over all category pages.
 *
 * Rows come from `page` restricted to the category namespace, left-joined
 * with `page_props` (only the 'hiddencat' property) and with `category`
 * (matched on title), so pages lacking either still yield a row.
 *
 * @param IDatabase $dbr Database handle to read from
 * @param string $fname Caller name forwarded to the iterator
 * @return BatchRowIterator
 */
public function getCategoryIterator( IDatabase $dbr, $fname ) {
	$tables = [ 'page', 'page_props', 'category' ];
	$batchColumns = [ 'page_title' ];

	$conditions = [
		'page_namespace' => NS_CATEGORY,
	];

	$columns = [
		'page_title',
		'page_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];

	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = page_title' ]
		]
	];

	$categoryRows = new BatchRowIterator( $dbr, $tables, $batchColumns, $this->getBatchSize() );
	$categoryRows->addConditions( $conditions );
	$categoryRows->setFetchColumns( $columns );
	$categoryRows->addJoinConditions( $joins );
	$categoryRows->setCaller( $fname );

	return $categoryRows;
}
93 
/**
 * Build an iterator over the sub-category links of the given pages.
 *
 * The underlying batches are flattened, so iterating the result yields
 * individual rows carrying `cl_from` and `cl_to`.
 *
 * @param IDatabase $dbr Database handle to read from
 * @param int[] $ids Page IDs whose 'subcat' links should be fetched
 * @param string $fname Caller name forwarded to the iterator
 * @return RecursiveIteratorIterator Yields nothing when $ids is empty
 */
public function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	if ( !$ids ) {
		// An empty ID list cannot be expressed as a valid `cl_from IN (...)`
		// condition; no pages means no links, so skip the query entirely.
		return new RecursiveIteratorIterator( new RecursiveArrayIterator( [] ) );
	}

	$it = new BatchRowIterator(
		$dbr,
		'categorylinks',
		[ 'cl_from', 'cl_to' ],
		$this->getBatchSize()
	);
	$it->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$it->setCaller( $fname );

	// Flatten the per-batch arrays into a single stream of rows.
	return new RecursiveIteratorIterator( $it );
}
116 
/**
 * Write the dataset header (type, license, format version, modification
 * date, owning site and imported ontology) about the dump URI.
 *
 * @param int $timestamp Value passed to wfTimestamp() for schema:dateModified
 */
public function addDumpHeader( $timestamp ) {
	$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
	// A protocol-relative license URL is pinned to https.
	if ( substr( $licenseUrl, 0, 2 ) === '//' ) {
		$licenseUrl = 'https:' . $licenseUrl;
	}

	$dumpUri = $this->categoriesRdf->getDumpURI();
	$modified = wfTimestamp( TS_ISO_8601, $timestamp );
	$siteUrl = wfExpandUrl( '/', PROTO_CANONICAL );

	$this->rdfWriter->about( $dumpUri )
		->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )->value( $modified, 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )->is( $siteUrl )
		->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
}
135 
/**
 * Do the actual work: stream the RDF dump of all categories and their
 * sub-category links to the chosen output.
 *
 * The output target comes from the 'output' option ('-' or absent means
 * stdout) and the serialization from the 'format' option (default 'ttl').
 * The writer is drained to the output after the header and after every
 * batch of categories.
 */
public function execute() {
	$outFile = $this->getOption( 'output', 'php://stdout' );

	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Without this check every later fwrite()/fflush() would be handed `false`.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
		// page_id => page_title for the categories of this batch
		$pages = [];
		foreach ( $batch as $row ) {
			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				$row->pp_propname === 'hiddencat',
				// cat_pages minus sub-categories and files
				(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
				(int)$row->cat_subcats
			);
			if ( $row->page_id ) {
				$pages[$row->page_id] = $row->page_title;
			}
		}

		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// '-' was rewritten to 'php://stdout' above, so the former `!== '-'` guard
	// could never be false; the handle opened here is always closed.
	fclose( $output );
}
179 
/**
 * Create an RDF writer for the requested output format.
 *
 * @param string $format Format identifier as given on the command line
 * @return RdfWriter
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	$formatName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $formatName );
}
188 }
189 
// Name the script class, then include the runner.
// NOTE(review): presumably RUN_MAINTENANCE_IF_MAIN instantiates $maintClass only when this
// file is the entry script — confirm against Maintenance.php.
190 $maintClass = DumpCategoriesAsRdf::class;
191 require_once RUN_MAINTENANCE_IF_MAIN;
const PROTO_CANONICAL
Definition: Defines.php:199
const NS_CATEGORY
Definition: Defines.php:78
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
Helper class to produce RDF representation of categories.
const FORMAT_VERSION
Current version of the dump format.
const OWL_URL
OWL description of the ontology.
Maintenance script to provide RDF representation of the category tree.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
getCategoryIterator(IDatabase $dbr, $fname)
Produce row iterator for categories.
execute()
Do the actual work.
__construct()
Default constructor.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:66
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:39
const DB_REPLICA
Definition: defines.php:26