MediaWiki  1.34.0
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1 <?php
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\RdfWriterFactory;
22 
23 require_once __DIR__ . '/Maintenance.php';
24 
/** @var RdfWriter RDF writer used to produce the dump; created in execute(). */
35  private $rdfWriter;
/** @var CategoriesRdf Categories RDF helper; created in execute() around $rdfWriter. */
40  private $categoriesRdf;
41 
/**
 * Default constructor.
 *
 * Registers the script description, a batch size of 200, and the two
 * optional command-line options ('output' and 'format').
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Generate RDF dump of categories in a wiki.' );
	$this->setBatchSize( 200 );

	// Both options are optional and each one takes an argument.
	$required = false;
	$withArg = true;

	$this->addOption(
		'output',
		'Output file (default is stdout). Will be overwritten.',
		$required,
		$withArg
	);
	$this->addOption( 'format', 'Set the dump format.', $required, $withArg );
}
52 
/**
 * Produce row iterator for categories.
 *
 * Selects pages in the category namespace, LEFT JOINed with the 'hiddencat'
 * page property and with the category table, so pages lacking either still
 * yield a row (with NULL in the joined columns).
 *
 * @param IDatabase $dbr Database connection to read from
 * @return BatchRowIterator Iterator whose elements are batches of rows
 */
public function getCategoryIterator( IDatabase $dbr ) {
	$tables = [ 'page', 'page_props', 'category' ];
	$keyColumns = [ 'page_title' ];

	$fields = [
		'page_title',
		'page_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];

	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = page_title' ]
		]
	];

	$iterator = new BatchRowIterator( $dbr, $tables, $keyColumns, $this->getBatchSize() );
	$iterator->addConditions( [ 'page_namespace' => NS_CATEGORY ] );
	$iterator->setFetchColumns( $fields );
	$iterator->addJoinConditions( $joins );

	return $iterator;
}
89 
/**
 * Get iterator for links for categories.
 *
 * Only 'subcat' links whose source page ID is in $ids are returned. The
 * batch iterator is wrapped in a RecursiveIteratorIterator so that callers
 * iterate over individual rows rather than over batches.
 *
 * @param IDatabase $dbr Database connection to read from
 * @param int[] $ids Page IDs of the categories whose links are wanted
 * @return Traversable Rows exposing cl_from and cl_to
 */
public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$columns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $columns, $this->getBatchSize() );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $columns );

	return new RecursiveIteratorIterator( $batches );
}
110 
/**
 * Write the dataset header triples (type, license, format version,
 * modification date, parent site and imported ontology) to the RDF writer.
 *
 * @param int|string $timestamp Dump timestamp in any form wfTimestamp()
 *  accepts; execute() passes time()
 */
public function addDumpHeader( $timestamp ) {
	global $wgRightsUrl;

	// A protocol-relative license URL is made absolute by assuming HTTPS.
	$licenseUrl = substr( $wgRightsUrl, 0, 2 ) === '//'
		? 'https:' . $wgRightsUrl
		: $wgRightsUrl;

	$dumpUri = $this->categoriesRdf->getDumpURI();
	$modified = wfTimestamp( TS_ISO_8601, $timestamp );
	$siteUrl = wfExpandUrl( '/', PROTO_CANONICAL );

	$this->rdfWriter->about( $dumpUri )
		->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )->value( $modified, 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )->is( $siteUrl )
		->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
}
130 
/**
 * Do the actual work: stream an RDF dump of all categories and their
 * subcategory links to the chosen output.
 *
 * The output target comes from the 'output' option; '-' or no option means
 * stdout. Data is written batch by batch so the writer's buffer is drained
 * after every batch instead of growing with the size of the wiki.
 */
public function execute() {
	$stdout = 'php://stdout';

	$outFile = $this->getOption( 'output', $stdout );
	if ( $outFile === '-' ) {
		$outFile = $stdout;
	}

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Without this check every later fwrite() would be handed `false`.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
		// Map of page ID => title for this batch, used to resolve cl_from below.
		$pages = [];
		foreach ( $batch as $row ) {
			$subcats = (int)$row->cat_subcats;
			// cat_pages counts every member; subtract subcategories and files
			// to get the number of plain pages.
			$plainPages = (int)$row->cat_pages - $subcats - (int)$row->cat_files;

			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				$row->pp_propname === 'hiddencat',
				$plainPages,
				$subcats
			);
			$pages[$row->page_id] = $row->page_title;
		}

		$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) );
		foreach ( $links as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}

		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// Compare against the normalised name: '-' was already rewritten to
	// php://stdout above, so testing for '-' here could never skip the close.
	if ( $outFile !== $stdout ) {
		fclose( $output );
	}
}
172 
/**
 * Build an RDF writer for the requested output format.
 *
 * @param string $format Format as given on the command line (e.g. 'ttl');
 *  it is resolved to a canonical name via RdfWriterFactory::getFormatName()
 * @return RdfWriter
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	$canonicalName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $canonicalName );
}
181 }
182 
// Name the class implementing this script, then include the file whose path
// is held in RUN_MAINTENANCE_IF_MAIN (a constant defined in Maintenance.php).
// NOTE(review): presumably that file reads $maintClass and runs the script
// when this file is the entry point - confirm against Maintenance.php.
183 $maintClass = DumpCategoriesAsRdf::class;
184 require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
PROTO_CANONICAL
const PROTO_CANONICAL
Definition: Defines.php:203
CategoriesRdf
Helper class to produce RDF representation of categories.
Definition: CategoriesRdf.php:24
CategoriesRdf\OWL_URL
const OWL_URL
OWL description of the ontology.
Definition: CategoriesRdf.php:36
DumpCategoriesAsRdf\$categoriesRdf
CategoriesRdf $categoriesRdf
Categories RDF helper.
Definition: dumpCategoriesAsRdf.php:40
DumpCategoriesAsRdf\getCategoryIterator
getCategoryIterator(IDatabase $dbr)
Produce row iterator for categories.
Definition: dumpCategoriesAsRdf.php:58
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:348
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1869
BatchRowIterator
Definition: BatchRowIterator.php:29
$maintClass
$maintClass
Definition: dumpCategoriesAsRdf.php:183
DumpCategoriesAsRdf\getCategoryLinksIterator
getCategoryLinksIterator(IDatabase $dbr, array $ids)
Get iterator for links for categories.
Definition: dumpCategoriesAsRdf.php:96
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:82
DumpCategoriesAsRdf\__construct
__construct()
Default constructor.
Definition: dumpCategoriesAsRdf.php:42
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
$dbr
$dbr
Definition: testCompression.php:50
DumpCategoriesAsRdf\$rdfWriter
RdfWriter $rdfWriter
Definition: dumpCategoriesAsRdf.php:35
DumpCategoriesAsRdf\createRdfWriter
createRdfWriter( $format)
Definition: dumpCategoriesAsRdf.php:177
CategoriesRdf\FORMAT_VERSION
const FORMAT_VERSION
Current version of the dump format.
Definition: CategoriesRdf.php:40
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:267
$output
$output
Definition: SyntaxHighlight.php:335
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
NS_CATEGORY
const NS_CATEGORY
Definition: Defines.php:74
DumpCategoriesAsRdf\addDumpHeader
addDumpHeader( $timestamp)
Definition: dumpCategoriesAsRdf.php:114
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1396
DumpCategoriesAsRdf\execute
execute()
Do the actual work.
Definition: dumpCategoriesAsRdf.php:131
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:302
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:386
DumpCategoriesAsRdf
Maintenance script to provide RDF representation of the category tree.
Definition: dumpCategoriesAsRdf.php:31
wfExpandUrl
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
Definition: GlobalFunctions.php:491
Maintenance\setBatchSize
setBatchSize( $s=0)
Set the batch size.
Definition: Maintenance.php:394
$wgRightsUrl
$wgRightsUrl
Set this to specify an external URL containing details about the content license used on your wiki.
Definition: DefaultSettings.php:7156