MediaWiki 1.39.10
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1<?php
21use Wikimedia\Purtle\RdfWriter;
22use Wikimedia\Purtle\RdfWriterFactory;
24
25require_once __DIR__ . '/Maintenance.php';
26
/** @var RdfWriter Writer assigned in execute() from createRdfWriter() */
37 private $rdfWriter;
/** @var CategoriesRdf Helper constructed in execute() around $rdfWriter */
42 private $categoriesRdf;
43
/**
 * Set up the script: description, batch size and command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of categories in a wiki." );
	$this->setBatchSize( 200 );

	// Both options are optional (third argument) and take a value (fourth argument).
	$required = false;
	$withArg = true;
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		$required,
		$withArg
	);
	$this->addOption( 'format', "Set the dump format.", $required, $withArg );
}
54
/**
 * Produce a batched row iterator over the pages in the category namespace.
 *
 * Each row carries the page title and ID, the 'hiddencat' page property
 * (when present) and the member counters from the category table.
 *
 * @param IDatabase $dbr Database handle to read from
 * @param string $fname Caller name passed on to the iterator
 * @return BatchRowIterator
 */
public function getCategoryIterator( IDatabase $dbr, $fname ) {
	$tables = [ 'page', 'page_props', 'category' ];
	$fetchColumns = [
		'page_title',
		'page_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	// Both joins are LEFT JOINs, so a category page is still returned when it
	// has no 'hiddencat' property or no matching row in the category table.
	$joinConditions = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = page_title' ]
		]
	];

	$iterator = new BatchRowIterator( $dbr, $tables, [ 'page_title' ], $this->getBatchSize() );
	$iterator->addConditions( [ 'page_namespace' => NS_CATEGORY ] );
	$iterator->setFetchColumns( $fetchColumns );
	$iterator->addJoinConditions( $joinConditions );
	$iterator->setCaller( $fname );

	return $iterator;
}
93
/**
 * Get an iterator over the subcategory links of the given pages.
 *
 * @param IDatabase $dbr Database handle to read from
 * @param int[] $ids Page IDs matched against cl_from
 * @param string $fname Caller name passed on to the iterator
 * @return Traversable Yields rows with cl_from and cl_to
 */
public function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	// The same two columns serve as the batching key and as the fetched columns.
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->getBatchSize() );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $linkColumns );
	$batches->setCaller( $fname );

	// Wrapped so that callers iterate individual rows rather than batches.
	return new RecursiveIteratorIterator( $batches );
}
116
/**
 * Write the dataset-level header triples for the dump to the RDF writer.
 *
 * @param int $timestamp Timestamp emitted as schema:dateModified
 */
public function addDumpHeader( $timestamp ) {
	$configuredLicense = $this->getConfig()->get( MainConfigNames::RightsUrl );
	// A protocol-relative license URL ("//host/...") is emitted with an https: scheme.
	$licenseUrl = substr( $configuredLicense, 0, 2 ) == '//'
		? 'https:' . $configuredLicense
		: $configuredLicense;

	$dateModified = wfTimestamp( TS_ISO_8601, $timestamp );
	$siteUrl = wfExpandUrl( '/', PROTO_CANONICAL );

	$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
		->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )->value( $dateModified, 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )->is( $siteUrl )
		->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
}
135
/**
 * Do the actual work: write the RDF dump of all categories to the output.
 *
 * Opens the target given by --output (stdout by default, or when '-' is
 * given), writes the dump header, then walks the categories in batches.
 * For every batch the category data and its subcategory links are written
 * and the writer buffer is drained to the output.
 */
public function execute() {
	$outFile = $this->getOption( 'output', 'php://stdout' );

	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Without this check the failure would only surface later, when
		// fwrite() is handed `false` instead of a stream.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
		// Maps page ID => category title for the link lookup below.
		$pages = [];
		foreach ( $batch as $row ) {
			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				$row->pp_propname === 'hiddencat',
				// cat_pages minus the subcategory and file counters.
				(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
				(int)$row->cat_subcats
			);
			if ( $row->page_id ) {
				$pages[$row->page_id] = $row->page_title;
			}
		}

		// Skip the link query when the batch produced no page IDs, so that no
		// query with an empty cl_from list is issued.
		if ( $pages ) {
			$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );
			foreach ( $links as $row ) {
				$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
			}
		}
		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// The previous `$outFile !== '-'` guard was always true, because '-' has
	// already been replaced by 'php://stdout' above. The handle was opened by
	// this method, so it is closed unconditionally.
	fclose( $output );
}
179
/**
 * Create the RDF writer for the requested dump format.
 *
 * The format is first resolved through RdfWriterFactory::getFormatName()
 * and the result is used to obtain the writer.
 *
 * @param string $format Format as given on the command line
 * @return RdfWriter
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	$formatName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $formatName );
}
188}
189
// Name the maintenance class defined in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably that include runs $maintClass when this script is
// invoked directly — confirm against Maintenance.php.
190$maintClass = DumpCategoriesAsRdf::class;
191require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
const PROTO_CANONICAL
Definition Defines.php:199
const NS_CATEGORY
Definition Defines.php:78
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
Helper class to produce RDF representation of categories.
const FORMAT_VERSION
Current version of the dump format.
const OWL_URL
OWL description of the ontology.
Maintenance script to provide RDF representation of the category tree.
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
getCategoryIterator(IDatabase $dbr, $fname)
Produce row iterator for categories.
execute()
Do the actual work.
__construct()
Default constructor.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
A class containing constants representing the names of configuration variables.
Basic database interface for live and lazy-loaded relational database handles.
Definition IDatabase.php:39
const DB_REPLICA
Definition defines.php:26