MediaWiki REL1_31
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1<?php
19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\RdfWriterFactory;
22
23require_once __DIR__ . '/Maintenance.php';
24
/**
 * RDF writer that accumulates the dump; execute() creates it and
 * periodically drains its buffered output into the output stream.
 * @var RdfWriter
 */
private $rdfWriter;
41
/**
 * Default constructor.
 *
 * Registers the script description, sets the batch size to 200 and
 * declares the 'output' and 'format' command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of categories in a wiki." );
	$this->setBatchSize( 200 );

	// Both options are optional (third argument) and take a value (fourth argument).
	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true
	);
	$this->addOption(
		'format',
		"Set the dump format.",
		false,
		true
	);
}
52
/**
 * Produce row iterator for categories.
 *
 * Iterates over pages in the category namespace in batches keyed on
 * page_title, left-joining the 'hiddencat' page property and the
 * category table so each row also carries pp_propname and the
 * cat_pages / cat_subcats / cat_files counters (NULL when no match).
 *
 * @param IDatabase $dbr Database connection to read from
 * @return BatchRowIterator Iterator yielding batches of rows
 */
public function getCategoryIterator( IDatabase $dbr ) {
	$tables = [ 'page', 'page_props', 'category' ];
	$batchKey = [ 'page_title' ];
	$columns = [
		'page_title',
		'page_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$joins = [
		// Only the 'hiddencat' property row (if any) is joined to each page.
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = page_title' ]
		]
	];

	$iterator = new BatchRowIterator( $dbr, $tables, $batchKey, $this->getBatchSize() );
	$iterator->addConditions( [ 'page_namespace' => NS_CATEGORY ] );
	$iterator->setFetchColumns( $columns );
	$iterator->addJoinConditions( $joins );

	return $iterator;
}
89
/**
 * Get iterator for links for categories.
 *
 * Selects the 'subcat' rows of categorylinks whose cl_from is one of
 * the given page IDs, fetched in batches keyed on (cl_from, cl_to).
 *
 * @param IDatabase $dbr Database connection to read from
 * @param array $ids Page IDs to match against cl_from
 * @return Traversable Iterator over individual rows (cl_from, cl_to)
 */
public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	// The same two columns serve as both the batching key and the fetched fields.
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator(
		$dbr,
		'categorylinks',
		$linkColumns,
		$this->getBatchSize()
	);
	$batches->addConditions( [ 'cl_type' => 'subcat', 'cl_from' => $ids ] );
	$batches->setFetchColumns( $linkColumns );

	// Wrapped so that callers can loop over rows directly instead of over batches.
	return new RecursiveIteratorIterator( $batches );
}
110
/**
 * Write the dataset header triples describing the dump itself.
 *
 * The header is attached to the dump URI and records the license,
 * format version, modification date, owning site and imported ontology.
 *
 * @param int|string $timestamp Timestamp of the dump, in any form accepted by wfTimestamp()
 */
public function addDumpHeader( $timestamp ) {
	global $wgRightsUrl;

	// A protocol-relative license URL ("//host/...") is given an explicit https scheme.
	$licenseUrl = substr( $wgRightsUrl, 0, 2 ) === '//'
		? 'https:' . $wgRightsUrl
		: $wgRightsUrl;

	$header = $this->rdfWriter->about( $this->categoriesRdf->getDumpURI() );
	$header->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )
			->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )
			->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )
			->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )
			->is( wfExpandUrl( '/', PROTO_CANONICAL ) )
		->say( 'owl', 'imports' )
			->is( CategoriesRdf::OWL_URL );
}
130
/**
 * Do the actual work.
 *
 * Opens the output stream, writes the RDF prefixes and dump header, then
 * walks all categories batch by batch, emitting each category's data and
 * its sub-category links and flushing the writer after every batch.
 */
public function execute() {
	$outFile = $this->getOption( 'output', 'php://stdout' );

	// '-' is accepted as an alias for standard output.
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Stop here instead of passing false to every later fwrite()/fflush() call.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
		// Map of page ID => category title for the categories in this batch.
		$pages = [];
		foreach ( $batch as $row ) {
			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				$row->pp_propname === 'hiddencat',
				// cat_pages minus sub-categories and files.
				(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
				(int)$row->cat_subcats
			);
			if ( $row->page_id ) {
				$pages[$row->page_id] = $row->page_title;
			}
		}

		// Skip the links query when no page IDs were collected, rather than
		// querying with an empty ID list.
		if ( $pages ) {
			foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
				$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
			}
		}

		// Drain once per batch so the writer's buffer does not grow with the whole dump.
		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// The previous "$outFile !== '-'" guard was always true at this point
	// ('-' has already been replaced above), so the stream was always closed.
	fclose( $output );
}
174
/**
 * Create the RDF writer for the requested output format.
 *
 * @param string $format Format name as given by the 'format' option
 * @return RdfWriter Writer obtained from the RdfWriterFactory
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	// The factory turns the user-supplied name into the name getWriter() expects.
	$formatName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $formatName );
}
183}
184
// Name of the maintenance class defined in this file; presumably read by
// the code pulled in through RUN_MAINTENANCE_IF_MAIN below (verify there).
$maintClass = DumpCategoriesAsRdf::class;

require_once RUN_MAINTENANCE_IF_MAIN;
$wgRightsUrl
Set this to specify an external URL containing details about the content license used on your wiki.
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Helper class to produce RDF representation of categories.
const FORMAT_VERSION
Current version of the dump format.
const OWL_URL
OWL description of the ontology.
Maintenance script to provide RDF representation of the category tree.
getCategoryIterator(IDatabase $dbr)
Produce row iterator for categories.
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
execute()
Do the actual work.
CategoriesRdf $categoriesRdf
Categories RDF helper.
__construct()
Default constructor.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getDB( $db, $groups=[], $wiki=false)
Returns a database to be used by current maintenance script.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
Set the batch size.
static configuration should be added through ResourceLoaderGetConfigVars instead can be used to get the real title after the basic globals have been set but before ordinary actions take place $output
Definition hooks.txt:2255
const PROTO_CANONICAL
Definition Defines.php:233
const NS_CATEGORY
Definition Defines.php:88
Basic database interface for live and lazy-loaded relational database handles.
Definition IDatabase.php:38
$batch
Definition linkcache.txt:23
require_once RUN_MAINTENANCE_IF_MAIN
const DB_REPLICA
Definition defines.php:25