MediaWiki  master
dumpCategoriesAsRdf.php
Go to the documentation of this file.
1 <?php
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\RdfWriterFactory;
22 
23 require_once __DIR__ . '/Maintenance.php';
24 
/** @var RdfWriter Writer that buffers the RDF output; created in execute() via createRdfWriter(). */
35  private $rdfWriter;
/** @var CategoriesRdf Categories RDF helper; constructed in execute() around $rdfWriter. */
40  private $categoriesRdf;
41 
/**
 * Set up the script: description text, default batch size and command-line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Generate RDF dump of categories in a wiki.' );

	// Default number of rows the iterators below fetch per batch.
	$this->setBatchSize( 200 );

	// Both options are optional (3rd argument) and take a value (4th argument).
	$isRequired = false;
	$takesValue = true;
	$this->addOption(
		'output',
		'Output file (default is stdout). Will be overwritten.',
		$isRequired,
		$takesValue
	);
	$this->addOption( 'format', 'Set the dump format.', $isRequired, $takesValue );
}
52 
/**
 * Produce row iterator for categories.
 *
 * Selects every page in the category namespace, LEFT JOINed with its
 * 'hiddencat' page property and with its row in the category table, so both
 * joined parts may be absent for a given page.
 *
 * @param IDatabase $dbr Database connection to read from
 * @param string $fname Name of the calling method, handed to setCaller()
 * @return BatchRowIterator Iterator yielding batches of getBatchSize() rows
 */
public function getCategoryIterator( IDatabase $dbr, $fname ) {
	$tables = [ 'page', 'page_props', 'category' ];
	$columns = [
		'page_title',
		'page_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$joins = [
		// Only the 'hiddencat' property row is joined in, if the page has one.
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = page_title' ]
		]
	];

	$iterator = new BatchRowIterator( $dbr, $tables, [ 'page_title' ], $this->getBatchSize() );
	$iterator->addConditions( [ 'page_namespace' => NS_CATEGORY ] );
	$iterator->setFetchColumns( $columns );
	$iterator->addJoinConditions( $joins );
	$iterator->setCaller( $fname );

	return $iterator;
}
91 
/**
 * Get iterator for links for categories.
 *
 * @param IDatabase $dbr Database connection to read from
 * @param int[] $ids Page ids matched against cl_from; only 'subcat' links are returned
 * @param string $fname Name of the calling method, handed to setCaller()
 * @return RecursiveIteratorIterator Wrapper around the batch iterator; execute()
 *  consumes its items as individual rows with cl_from and cl_to
 */
public function getCategoryLinksIterator( IDatabase $dbr, array $ids, $fname ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->getBatchSize() );
	$batches->addConditions( [ 'cl_type' => 'subcat', 'cl_from' => $ids ] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
	$batches->setCaller( $fname );

	return new RecursiveIteratorIterator( $batches );
}
114 
/**
 * Write the triples that describe the dump itself: its types, license,
 * format version, modification time, owning site and imported ontology.
 *
 * @param int $timestamp Time to report as schema:dateModified (execute() passes time())
 */
public function addDumpHeader( $timestamp ) {
	global $wgRightsUrl;

	// A protocol-relative license URL ("//host/path") is emitted with an explicit https: scheme.
	$isProtocolRelative = substr( $wgRightsUrl, 0, 2 ) === '//';
	$licenseUrl = $isProtocolRelative ? 'https:' . $wgRightsUrl : $wgRightsUrl;

	$dateModified = wfTimestamp( TS_ISO_8601, $timestamp );
	$siteUrl = wfExpandUrl( '/', PROTO_CANONICAL );

	$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
		->a( 'schema', 'Dataset' )
		->a( 'owl', 'Ontology' )
		->say( 'cc', 'license' )->is( $licenseUrl )
		->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
		->say( 'schema', 'dateModified' )->value( $dateModified, 'xsd', 'dateTime' )
		->say( 'schema', 'isPartOf' )->is( $siteUrl )
		->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
}
134 
/**
 * Do the actual work: stream an RDF dump of all categories to the output.
 *
 * The dump header is written first. Categories are then read in batches;
 * for each batch the category data is written, followed by the sub-category
 * links of the pages in that batch, and the writer's buffer is drained to the
 * output before the next batch is fetched.
 */
public function execute() {
	$outFile = $this->getOption( 'output', 'php://stdout' );

	// "-" is accepted as an alias for standard output.
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'w' );
	if ( $output === false ) {
		// Abort instead of passing an invalid handle to every fwrite() below.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
	$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$this->addDumpHeader( time() );
	fwrite( $output, $this->rdfWriter->drain() );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
		// page_id => page_title for this batch, used to resolve cl_from below.
		$pages = [];
		foreach ( $batch as $row ) {
			$this->categoriesRdf->writeCategoryData(
				$row->page_title,
				// NULL (no joined page_props row) means the category is not hidden.
				$row->pp_propname === 'hiddencat',
				// NOTE(review): presumably cat_pages counts all members, so this leaves
				// the plain pages — confirm against the category table schema.
				(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
				(int)$row->cat_subcats
			);
			if ( $row->page_id ) {
				$pages[$row->page_id] = $row->page_title;
			}
		}

		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
		// Flush once per batch so the writer's buffer does not grow with the whole dump.
		fwrite( $output, $this->rdfWriter->drain() );
	}

	fflush( $output );
	// The handle is always one this method opened ("-" was mapped to php://stdout
	// above), so it is closed unconditionally; the former `!== '-'` guard was always true.
	fclose( $output );
}
178 
/**
 * Build the RDF writer for the requested dump format.
 *
 * @param string $format Format identifier from the 'format' option (execute() defaults to 'ttl')
 * @return RdfWriter Writer obtained from RdfWriterFactory for that format
 */
private function createRdfWriter( $format ) {
	$writerFactory = new RdfWriterFactory();
	$formatName = $writerFactory->getFormatName( $format );

	return $writerFactory->getWriter( $formatName );
}
187 }
188 
// Name the script class, then include the file named by RUN_MAINTENANCE_IF_MAIN
// (a constant defined in Maintenance.php).
// NOTE(review): presumably that file reads $maintClass and runs the script only
// when this file is the entry point — confirm against Maintenance.php.
189 $maintClass = DumpCategoriesAsRdf::class;
190 require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:38
PROTO_CANONICAL
const PROTO_CANONICAL
Definition: Defines.php:212
CategoriesRdf
Helper class to produce RDF representation of categories.
Definition: CategoriesRdf.php:24
DumpCategoriesAsRdf\getCategoryIterator
getCategoryIterator(IDatabase $dbr, $fname)
Produce row iterator for categories.
Definition: dumpCategoriesAsRdf.php:59
CategoriesRdf\OWL_URL
const OWL_URL
OWL description of the ontology.
Definition: CategoriesRdf.php:36
DumpCategoriesAsRdf\$categoriesRdf
CategoriesRdf $categoriesRdf
Categories RDF helper.
Definition: dumpCategoriesAsRdf.php:40
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:327
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1818
BatchRowIterator
Allows iterating a large number of rows in batches transparently.
Definition: BatchRowIterator.php:33
$maintClass
$maintClass
Definition: dumpCategoriesAsRdf.php:189
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:55
DumpCategoriesAsRdf\__construct
__construct()
Default constructor.
Definition: dumpCategoriesAsRdf.php:42
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
$dbr
$dbr
Definition: testCompression.php:54
DumpCategoriesAsRdf\$rdfWriter
RdfWriter $rdfWriter
Definition: dumpCategoriesAsRdf.php:35
DumpCategoriesAsRdf\createRdfWriter
createRdfWriter( $format)
Definition: dumpCategoriesAsRdf.php:183
CategoriesRdf\FORMAT_VERSION
const FORMAT_VERSION
Current version of the dump format.
Definition: CategoriesRdf.php:40
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:245
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
NS_CATEGORY
const NS_CATEGORY
Definition: Defines.php:83
DumpCategoriesAsRdf\addDumpHeader
addDumpHeader( $timestamp)
Definition: dumpCategoriesAsRdf.php:118
DumpCategoriesAsRdf\getCategoryLinksIterator
getCategoryLinksIterator(IDatabase $dbr, array $ids, $fname)
Get iterator for links for categories.
Definition: dumpCategoriesAsRdf.php:99
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1369
DumpCategoriesAsRdf\execute
execute()
Do the actual work.
Definition: dumpCategoriesAsRdf.php:135
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:281
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:366
DumpCategoriesAsRdf
Maintenance script to provide RDF representation of the category tree.
Definition: dumpCategoriesAsRdf.php:31
wfExpandUrl
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
Definition: GlobalFunctions.php:493
Maintenance\setBatchSize
setBatchSize( $s=0)
Definition: Maintenance.php:373
$wgRightsUrl
$wgRightsUrl
Set this to specify an external URL containing details about the content license used on your wiki.
Definition: DefaultSettings.php:7588