Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
53.41% covered (warning)
53.41%
47 / 88
33.33% covered (danger)
33.33%
2 / 6
CRAP
0.00% covered (danger)
0.00%
0 / 1
DumpCategoriesAsRdf
53.41% covered (warning)
53.41%
47 / 88
33.33% covered (danger)
33.33%
2 / 6
30.09
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 getCategoryIterator
0.00% covered (danger)
0.00%
0 / 22
0.00% covered (danger)
0.00%
0 / 1
2
 getCategoryLinksIterator
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
2
 addDumpHeader
92.31% covered (success)
92.31%
12 / 13
0.00% covered (danger)
0.00%
0 / 1
2.00
 execute
96.43% covered (success)
96.43%
27 / 28
0.00% covered (danger)
0.00%
0 / 1
7
 createRdfWriter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2/**
3 * @license GPL-2.0-or-later
4 */
5
6use MediaWiki\Category\CategoriesRdf;
7use MediaWiki\MainConfigNames;
8use MediaWiki\Maintenance\Maintenance;
9use MediaWiki\Utils\BatchRowIterator;
10use Wikimedia\Purtle\RdfWriter;
11use Wikimedia\Purtle\RdfWriterFactory;
12use Wikimedia\Rdbms\IReadableDatabase;
13use Wikimedia\Timestamp\TimestampFormat as TS;
14
15// @codeCoverageIgnoreStart
16require_once __DIR__ . '/Maintenance.php';
17// @codeCoverageIgnoreEnd
18
19/**
20 * Maintenance script to provide RDF representation of the category tree.
21 *
22 * @ingroup Maintenance
23 * @since 1.30
24 */
class DumpCategoriesAsRdf extends Maintenance {
    /**
     * Writer that accumulates RDF output; created in execute().
     * @var RdfWriter
     */
    private $rdfWriter;
    /**
     * Categories RDF helper.
     * @var CategoriesRdf
     */
    private $categoriesRdf;

    /**
     * Register the script description, the default batch size and the
     * `output` / `format` command line options.
     */
    public function __construct() {
        parent::__construct();

        $this->addDescription( "Generate RDF dump of categories in a wiki." );

        $this->setBatchSize( 200 );
        $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
            false, true );
        $this->addOption( 'format', "Set the dump format.", false, true );
    }

    /**
     * Produce row iterator for categories.
     *
     * Selects every page in the category namespace, left-joined with its
     * 'hiddencat' page property and its row in the category table, and
     * batches the result by page_title using the configured batch size.
     *
     * @param IReadableDatabase $dbr
     * @param string $fname Name of the calling function
     * @return RecursiveIterator
     */
    public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
        $it = new BatchRowIterator(
            $dbr,
            $dbr->newSelectQueryBuilder()
                ->from( 'page' )
                ->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
                ->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
                ->select( [
                    'page_title',
                    'page_id',
                    'pp_propname',
                    'cat_pages',
                    'cat_subcats',
                    'cat_files'
                ] )
                ->where( [
                    'page_namespace' => NS_CATEGORY,
                ] )
                ->caller( $fname ),
            [ 'page_title' ],
            $this->getBatchSize()
        );
        return $it;
    }

    /**
     * Get iterator for links for categories.
     *
     * Yields individual rows (cl_from, lt_title) of the 'subcat' category
     * links whose source page is one of the given IDs.
     *
     * @param IReadableDatabase $dbr
     * @param int[] $ids List of page IDs
     * @param string $fname Name of the calling function
     * @return Traversable
     */
    public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
        if ( $ids === [] ) {
            // Nothing to look up; avoid building a condition on an empty ID list.
            return new EmptyIterator();
        }

        $qb = $dbr->newSelectQueryBuilder()
            ->select( [ 'cl_from', 'lt_title' ] )
            ->from( 'categorylinks' )
            ->join( 'linktarget', null, 'cl_target_id=lt_id' )
            ->where( [
                'cl_type' => 'subcat',
                'cl_from' => $ids
            ] )
            ->caller( $fname );
        $primaryKey = [ 'cl_from', 'cl_target_id' ];

        $it = new BatchRowIterator(
            $dbr,
            $qb,
            $primaryKey,
            $this->getBatchSize()
        );
        // Flatten the batches so that callers iterate over single rows.
        return new RecursiveIteratorIterator( $it );
    }

    /**
     * Write the dataset header (type, license, format version, modification
     * date, owning site and imported ontology) to the RDF writer.
     *
     * @param int $timestamp
     */
    public function addDumpHeader( $timestamp ) {
        $licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
        if ( str_starts_with( $licenseUrl, '//' ) ) {
            // Protocol-relative URL: give it an explicit scheme.
            $licenseUrl = 'https:' . $licenseUrl;
        }
        $urlUtils = $this->getServiceContainer()->getUrlUtils();
        $this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
            ->a( 'schema', 'Dataset' )
            ->a( 'owl', 'Ontology' )
            ->say( 'cc', 'license' )->is( $licenseUrl )
            ->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
            ->say( 'schema', 'dateModified' )
            ->value( wfTimestamp( TS::ISO_8601, $timestamp ), 'xsd', 'dateTime' )
            ->say( 'schema', 'isPartOf' )->is( (string)$urlUtils->expand( '/', PROTO_CANONICAL ) )
            ->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
    }

    /**
     * Dump all categories and their subcategory links as RDF.
     *
     * Output goes to the file named by the `output` option; when the option
     * is absent or is '-', it goes to stdout. The writer is drained to the
     * output after the header and after every batch of categories.
     */
    public function execute() {
        $outFile = $this->getOption( 'output', 'php://stdout' );

        if ( $outFile === '-' ) {
            $outFile = 'php://stdout';
        }

        $output = fopen( $outFile, 'w' );
        if ( $output === false ) {
            // Without this check the first fwrite() below would fail with a
            // TypeError instead of a usable error message.
            $this->fatalError( "Could not open output file '$outFile' for writing" );
        }

        $this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
        $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

        $this->categoriesRdf->setupPrefixes();
        $this->rdfWriter->start();

        $this->addDumpHeader( time() );
        fwrite( $output, $this->rdfWriter->drain() );

        $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

        foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
            // Map of page ID => category title for the current batch.
            $pages = [];
            foreach ( $batch as $row ) {
                $this->categoriesRdf->writeCategoryData(
                    $row->page_title,
                    $row->pp_propname === 'hiddencat',
                    // Pass cat_pages with the subcategory and file counts taken out.
                    (int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
                    (int)$row->cat_subcats
                );
                if ( $row->page_id ) {
                    $pages[$row->page_id] = $row->page_title;
                }
            }

            foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
                $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->lt_title );
            }
            fwrite( $output, $this->rdfWriter->drain() );
        }
        fflush( $output );
        // '-' has already been replaced by 'php://stdout' above, so the handle
        // is always one opened here and is always closed here.
        fclose( $output );
    }

    /**
     * Create an RDF writer for the requested output format.
     *
     * @param string $format Writer format
     * @return RdfWriter
     */
    private function createRdfWriter( $format ) {
        $factory = new RdfWriterFactory();
        return $factory->getWriter( $factory->getFormatName( $format ) );
    }
}
179
180// @codeCoverageIgnoreStart
181$maintClass = DumpCategoriesAsRdf::class;
182require_once RUN_MAINTENANCE_IF_MAIN;
183// @codeCoverageIgnoreEnd