23use Wikimedia\Purtle\RdfWriter;
24use Wikimedia\Purtle\RdfWriterFactory;
28require_once __DIR__ .
'/Maintenance.php';
46 private $categoriesRdf;
49 parent::__construct();
51 $this->
addDescription(
"Generate RDF dump of categories in a wiki." );
54 $this->
addOption(
'output',
"Output file (default is stdout). Will be overwritten.",
56 $this->
addOption(
'format',
"Set the dump format.",
false,
true );
70 ->leftJoin(
'page_props',
null, [
'pp_propname' =>
'hiddencat',
'pp_page = page_id' ] )
71 ->leftJoin(
'category',
null, [
'cat_title = page_title' ] )
101 ->from(
'categorylinks' )
102 ->select( [
'cl_from',
'cl_to' ] )
104 'cl_type' =>
'subcat',
108 [
'cl_from',
'cl_to' ],
109 $this->getBatchSize()
111 return new RecursiveIteratorIterator( $it );
118 $licenseUrl = $this->
getConfig()->get( MainConfigNames::RightsUrl );
119 if ( str_starts_with( $licenseUrl,
'//' ) ) {
120 $licenseUrl =
'https:' . $licenseUrl;
123 $this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
124 ->a(
'schema',
'Dataset' )
125 ->a(
'owl',
'Ontology' )
126 ->say(
'cc',
'license' )->is( $licenseUrl )
127 ->say(
'schema',
'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
128 ->say(
'schema',
'dateModified' )
129 ->value(
wfTimestamp( TS_ISO_8601, $timestamp ),
'xsd',
'dateTime' )
130 ->say(
'schema',
'isPartOf' )->is( (
string)$urlUtils->expand(
'/',
PROTO_CANONICAL ) )
131 ->say(
'owl',
'imports' )->is( CategoriesRdf::OWL_URL );
135 $outFile = $this->
getOption(
'output',
'php://stdout' );
137 if ( $outFile ===
'-' ) {
138 $outFile =
'php://stdout';
141 $output = fopen( $outFile,
'w' );
142 $this->rdfWriter = $this->createRdfWriter( $this->
getOption(
'format',
'ttl' ) );
143 $this->categoriesRdf =
new CategoriesRdf( $this->rdfWriter );
145 $this->categoriesRdf->setupPrefixes();
146 $this->rdfWriter->start();
149 fwrite( $output, $this->rdfWriter->drain() );
155 foreach ( $batch as $row ) {
156 $this->categoriesRdf->writeCategoryData(
158 $row->pp_propname ===
'hiddencat',
159 (
int)$row->cat_pages - (
int)$row->cat_subcats - (
int)$row->cat_files,
160 (
int)$row->cat_subcats
162 if ( $row->page_id ) {
163 $pages[$row->page_id] = $row->page_title;
168 $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
170 fwrite( $output, $this->rdfWriter->drain() );
173 if ( $outFile !==
'-' ) {
182 private function createRdfWriter( $format ) {
183 $factory =
new RdfWriterFactory();
184 return $factory->getWriter( $factory->getFormatName( $format ) );
190require_once RUN_MAINTENANCE_IF_MAIN;
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Allows iterating a large number of rows in batches transparently.
Maintenance script to provide RDF representation of the category tree.
getCategoryLinksIterator(IReadableDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
execute()
Do the actual work.
addDumpHeader( $timestamp)
getCategoryIterator(IReadableDatabase $dbr, $fname)
Produce row iterator for categories.
__construct()
Default constructor.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
getOption( $name, $default=null)
Get an option, or return the default.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.