MediaWiki master
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1<?php
10use Wikimedia\Purtle\RdfWriter;
11use Wikimedia\Purtle\RdfWriterFactory;
13use Wikimedia\Timestamp\TimestampFormat as TS;
14
15// @codeCoverageIgnoreStart
16require_once __DIR__ . '/Maintenance.php';
17// @codeCoverageIgnoreEnd
18
/**
 * Maintenance script to provide RDF representation of the category tree.
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer the RDF output is produced with; assigned in execute().
	 *
	 * @var RdfWriter
	 */
	private $rdfWriter;

	/**
	 * Category RDF helper wrapped around $rdfWriter; assigned in execute().
	 *
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;
35
/**
 * Default constructor: sets the description, the default batch size and the
 * command-line options of the script.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of categories in a wiki." );
	$this->setBatchSize( 200 );

	// Both options are optional and take an argument.
	$required = false;
	$withArg = true;
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		$required,
		$withArg
	);
	$this->addOption( 'format', "Set the dump format.", $required, $withArg );
}
46
/**
 * Produce row iterator for categories.
 *
 * Rows are pages in the category namespace, fetched in batches of
 * getBatchSize() and paged by page_title.
 *
 * @param IReadableDatabase $dbr Database connection the query is built on
 * @param string $fname Name of the calling function, recorded via caller()
 * @return BatchRowIterator Each iteration yields one batch of rows carrying
 *  page_title, page_id, pp_propname, cat_pages, cat_subcats and cat_files
 */
public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
	// The builder has to be obtained from the connection before any clause can
	// be chained onto it (same pattern as getCategoryLinksIterator()).
	$queryBuilder = $dbr->newSelectQueryBuilder()
		->from( 'page' )
		// Left join restricted to the 'hiddencat' property: pp_propname is
		// only non-null for pages that carry it.
		->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
		// Left join: the cat_* counters are null when no category row matches.
		->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
		->select( [
			'page_title',
			'page_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] )
		->where( [
			'page_namespace' => NS_CATEGORY,
		] )
		->caller( $fname );

	return new BatchRowIterator(
		$dbr,
		$queryBuilder,
		[ 'page_title' ],
		$this->getBatchSize()
	);
}
77
/**
 * Get iterator for links for categories.
 *
 * Only links of type 'subcat' whose cl_from is one of the given IDs are
 * selected.
 *
 * @param IReadableDatabase $dbr Database connection the query is built on
 * @param array $ids Values cl_from is restricted to (execute() passes page IDs)
 * @param string $fname Name of the calling function, recorded via caller()
 * @return RecursiveIteratorIterator Iterates individual rows carrying
 *  cl_from and lt_title
 */
public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
	$conditions = [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	];

	$query = $dbr->newSelectQueryBuilder()
		->select( [ 'cl_from', 'lt_title' ] )
		->from( 'categorylinks' )
		->join( 'linktarget', null, 'cl_target_id=lt_id' )
		->where( $conditions )
		->caller( $fname );

	$batches = new BatchRowIterator(
		$dbr,
		$query,
		[ 'cl_from', 'cl_target_id' ],
		$this->getBatchSize()
	);

	// Wrapped so that callers walk single rows instead of whole batches.
	return new RecursiveIteratorIterator( $batches );
}
105
/**
 * Describe the dump itself (type, license, format version, modification
 * date, owning site and imported ontology) on the RDF writer.
 *
 * @param int|string $timestamp Timestamp handed to wfTimestamp(); execute()
 *  passes time()
 */
public function addDumpHeader( $timestamp ) {
	$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
	// Give protocol-relative license URLs an explicit https: scheme.
	if ( str_starts_with( $licenseUrl, '//' ) ) {
		$licenseUrl = 'https:' . $licenseUrl;
	}

	$dumpUri = $this->categoriesRdf->getDumpURI();
	$dateModified = wfTimestamp( TS::ISO_8601, $timestamp );
	$siteUrl = (string)$this->getServiceContainer()
		->getUrlUtils()
		->expand( '/', PROTO_CANONICAL );

	$this->rdfWriter->about( $dumpUri )
		->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )->value( $dateModified, 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )->is( $siteUrl )
		->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
}
125
/**
 * Do the actual work: write the RDF dump of all categories to the output.
 *
 * The dump header is written first. Category pages are then read in batches;
 * for every batch the category data and the sub-category links of its pages
 * are handed to the RDF helper, and the writer is drained to the output.
 */
public function execute() {
	$outFile = $this->getOption( 'output', 'php://stdout' );
	// "-" is accepted as an alias for standard output.
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	// Build the writer before opening the output: mode 'w' truncates the
	// target, so a failure while creating the writer must not leave an
	// emptied file behind.
	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Without this check the false handle would reach fwrite() below.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
		// page_id => page_title for the pages of this batch
		$pages = [];
		foreach ( $batch as $row ) {
			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				$row->pp_propname === 'hiddencat',
				// cat_pages minus sub-categories and files
				(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
				(int)$row->cat_subcats
			);
			if ( $row->page_id ) {
				$pages[$row->page_id] = $row->page_title;
			}
		}

		$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );
		foreach ( $links as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->lt_title );
		}
		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// The former `$outFile !== '-'` guard could never be false here, because
	// '-' has already been rewritten above; the handle was therefore always
	// closed. Keep doing so, unconditionally. A php://stdout handle obtained
	// through fopen() is documented as a duplicate descriptor, so only this
	// copy is closed.
	fclose( $output );
}
169
/**
 * Create the RDF writer for the requested output format.
 *
 * @param string $format Format as given on the command line; resolved
 *  through RdfWriterFactory::getFormatName() first
 * @return RdfWriter
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	$formatName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $formatName );
}
178}
179
// @codeCoverageIgnoreStart
// Name the script class, then include the file RUN_MAINTENANCE_IF_MAIN points to.
$maintClass = DumpCategoriesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd
const PROTO_CANONICAL
Definition Defines.php:223
const NS_CATEGORY
Definition Defines.php:65
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DB_REPLICA
Definition defines.php:26
Maintenance script to provide RDF representation of the category tree.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
execute()
Do the actual work.
getCategoryIterator(IReadableDatabase $dbr, $fname)
Produce row iterator for categories.
__construct()
Default constructor.
Helper class to produce RDF representation of categories.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Allows iterating a large number of rows in batches transparently.
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.