MediaWiki master
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1<?php
9use Wikimedia\Purtle\RdfWriter;
10use Wikimedia\Purtle\RdfWriterFactory;
12use Wikimedia\Timestamp\TimestampFormat as TS;
13
14// @codeCoverageIgnoreStart
15require_once __DIR__ . '/Maintenance.php';
16// @codeCoverageIgnoreEnd
17
/**
 * Maintenance script to provide RDF representation of the category tree.
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer that buffers the RDF output; assigned in execute().
	 * @var RdfWriter
	 */
	private $rdfWriter;

	/**
	 * Helper that emits the category triples through $rdfWriter; assigned in execute().
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;
34
/**
 * Set up the script: description, batch size and command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of categories in a wiki." );
	$this->setBatchSize( 200 );

	// Both options are optional and take an argument.
	$valueOptions = [
		'output' => "Output file (default is stdout). Will be overwritten.",
		'format' => "Set the dump format.",
	];
	foreach ( $valueOptions as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}
}
45
/**
 * Produce row iterator for categories.
 *
 * Each iteration step yields one batch of rows (execute() loops over the
 * batch and then over the rows inside it). Every row describes a page in the
 * category namespace, left-joined with its 'hiddencat' page property and its
 * row in the category table, so the pp_* and cat_* columns may be NULL.
 *
 * @param IReadableDatabase $dbr Database connection to read from
 * @param string $fname Caller name attached to the query via caller()
 * @return BatchRowIterator
 */
public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
	$it = new BatchRowIterator(
		$dbr,
		// The builder has to be created before it can be configured.
		$dbr->newSelectQueryBuilder()
			->from( 'page' )
			->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
			->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
			->select( [
				'page_title',
				'page_id',
				'pp_propname',
				'cat_pages',
				'cat_subcats',
				'cat_files'
			] )
			->where( [
				'page_namespace' => NS_CATEGORY,
			] )
			->caller( $fname ),
		// Column(s) the iterator uses to page through the result set.
		[ 'page_title' ],
		$this->getBatchSize()
	);
	return $it;
}
76
/**
 * Get iterator for links for categories.
 *
 * Selects the subcategory links (cl_type = 'subcat') whose source page is one
 * of $ids, joined to linktarget so each row carries the target title. The
 * batched iterator is wrapped in a RecursiveIteratorIterator so callers
 * receive individual rows rather than batches.
 *
 * @param IReadableDatabase $dbr Database connection to read from
 * @param int[] $ids Page IDs of the categories whose links are wanted
 * @param string $fname Caller name attached to the query via caller()
 * @return Traversable Rows with cl_from and lt_title
 */
public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$linksQuery = $dbr->newSelectQueryBuilder()
		->select( [ 'cl_from', 'lt_title' ] )
		->from( 'categorylinks' )
		->join( 'linktarget', null, 'cl_target_id=lt_id' )
		->where( [ 'cl_type' => 'subcat', 'cl_from' => $ids ] )
		->caller( $fname );

	return new RecursiveIteratorIterator(
		new BatchRowIterator(
			$dbr,
			$linksQuery,
			[ 'cl_from', 'cl_target_id' ],
			$this->getBatchSize()
		)
	);
}
104
/**
 * Write the dump-level metadata (dataset/ontology node) to the RDF writer.
 *
 * Requires $this->rdfWriter and $this->categoriesRdf to be set, which
 * execute() does before calling this method.
 *
 * @param int $timestamp Value stored as the dump's schema:dateModified;
 *  execute() passes time()
 */
public function addDumpHeader( $timestamp ) {
	$rightsUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
	// A protocol-relative license URL is written out with an explicit https scheme.
	$licenseUrl = str_starts_with( $rightsUrl, '//' ) ? 'https:' . $rightsUrl : $rightsUrl;

	$urlUtils = $this->getServiceContainer()->getUrlUtils();
	$siteUrl = (string)$urlUtils->expand( '/', PROTO_CANONICAL );
	$dateModified = wfTimestamp( TS::ISO_8601, $timestamp );

	$dumpNode = $this->rdfWriter->about( $this->categoriesRdf->getDumpURI() );
	$dumpNode
		->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )->value( $dateModified, 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )->is( $siteUrl )
		->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
}
124
/**
 * Do the actual work.
 *
 * Opens the output stream, writes the dump header, then walks all category
 * pages in batches. For each batch it writes the per-category data followed
 * by that batch's subcategory links, and flushes the writer's buffer to the
 * output after every batch.
 */
public function execute() {
	$outFile = $this->getOption( 'output', 'php://stdout' );

	// '-' is accepted as an alias for standard output.
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Without this check the failure would only surface later, in fwrite().
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
		// page_id => page_title for this batch, used to resolve cl_from below.
		$pages = [];
		foreach ( $batch as $row ) {
			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				// NULL when the left join found no 'hiddencat' property.
				$row->pp_propname === 'hiddencat',
				// Members that are neither subcategories nor files.
				(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
				(int)$row->cat_subcats
			);
			if ( $row->page_id ) {
				$pages[$row->page_id] = $row->page_title;
			}
		}

		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->lt_title );
		}
		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// $outFile can no longer be '-' here (it was rewritten above), so the old
	// guard on '-' was always true. Closing is safe for php://stdout as well:
	// the handle refers to a duplicate descriptor, not the process's own STDOUT.
	fclose( $output );
}
168
/**
 * Build the RDF writer for the requested dump format.
 *
 * @param string $format Format name as given on the command line
 *  (execute() defaults it to 'ttl')
 * @return RdfWriter
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	// The factory resolves the supplied name before the writer is requested.
	$formatName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $formatName );
}
177}
178
179// @codeCoverageIgnoreStart
180$maintClass = DumpCategoriesAsRdf::class;
181require_once RUN_MAINTENANCE_IF_MAIN;
182// @codeCoverageIgnoreEnd
const PROTO_CANONICAL
Definition Defines.php:223
const NS_CATEGORY
Definition Defines.php:65
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DB_REPLICA
Definition defines.php:26
Allows iterating a large number of rows in batches transparently.
Maintenance script to provide RDF representation of the category tree.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
execute()
Do the actual work.
getCategoryIterator(IReadableDatabase $dbr, $fname)
Produce row iterator for categories.
__construct()
Default constructor.
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.