Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
55.29% |
47 / 85 |
|
33.33% |
2 / 6 |
CRAP | |
0.00% |
0 / 1 |
DumpCategoriesAsRdf | |
55.29% |
47 / 85 |
|
33.33% |
2 / 6 |
28.10 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryIterator | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
2 | |||
getCategoryLinksIterator | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
2 | |||
addDumpHeader | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
execute | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
7 | |||
createRdfWriter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use MediaWiki\Maintenance\Maintenance; |
23 | use Wikimedia\Purtle\RdfWriter; |
24 | use Wikimedia\Purtle\RdfWriterFactory; |
25 | use Wikimedia\Rdbms\IReadableDatabase; |
26 | |
27 | // @codeCoverageIgnoreStart |
28 | require_once __DIR__ . '/Maintenance.php'; |
29 | // @codeCoverageIgnoreEnd |
30 | |
/**
 * Maintenance script to provide RDF representation of the category tree.
 *
 * The dump consists of a header describing the dataset, followed by one
 * record per page in the category namespace and one record per
 * subcategory link between those pages. Output is written incrementally,
 * one database batch at a time.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer that accumulates RDF output; created in execute().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper, bound to $rdfWriter; created in execute().
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of categories in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
			false, true );
		$this->addOption( 'format', "Set the dump format.", false, true );
	}

	/**
	 * Produce row iterator for categories.
	 *
	 * Selects every page in the category namespace, left-joined with its
	 * 'hiddencat' page property and its row in the category table, so the
	 * joined columns are NULL when no matching row exists. Iterating the
	 * result yields batches of rows, each batch holding at most
	 * getBatchSize() rows.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return RecursiveIterator
	 */
	public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'page' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
				->select( [
					'page_title',
					'page_id',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files'
				] )
				->where( [
					'page_namespace' => NS_CATEGORY,
				] )
				->caller( $fname ),
			[ 'page_title' ],
			$this->getBatchSize()
		);
		return $it;
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only links of type 'subcat' originating from the given pages are
	 * selected. The batch iterator is flattened, so iterating the result
	 * yields individual rows rather than batches.
	 *
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'categorylinks' )
				->select( [ 'cl_from', 'cl_to' ] )
				->where( [
					'cl_type' => 'subcat',
					'cl_from' => $ids
				] )
				->caller( $fname ),
			[ 'cl_from', 'cl_to' ],
			$this->getBatchSize()
		);
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Write the dataset header (license, format version, modification
	 * date, owning site and imported ontology) to the RDF writer.
	 *
	 * Requires $rdfWriter and $categoriesRdf to have been set up already.
	 *
	 * @param int $timestamp
	 */
	public function addDumpHeader( $timestamp ) {
		$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
		// The setting may not be a string (e.g. when left unconfigured); only
		// inspect the prefix when it is, so str_starts_with() never receives
		// a non-string. The value itself is passed on unchanged.
		if ( is_string( $licenseUrl ) && str_starts_with( $licenseUrl, '//' ) ) {
			// Protocol-relative URL: give it an explicit scheme.
			$licenseUrl = 'https:' . $licenseUrl;
		}
		$urlUtils = $this->getServiceContainer()->getUrlUtils();
		$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
			->a( 'schema', 'Dataset' )
			->a( 'owl', 'Ontology' )
			->say( 'cc', 'license' )->is( $licenseUrl )
			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
			->say( 'schema', 'dateModified' )
			->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
			->say( 'schema', 'isPartOf' )->is( (string)$urlUtils->expand( '/', PROTO_CANONICAL ) )
			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
	}

	/**
	 * Dump all categories and their subcategory links as RDF to the
	 * requested output (a file, or stdout by default or when '-' is given).
	 */
	public function execute() {
		$stdout = 'php://stdout';
		$outFile = $this->getOption( 'output', $stdout );

		if ( $outFile === '-' ) {
			$outFile = $stdout;
		}

		$output = fopen( $outFile, 'w' );
		if ( $output === false ) {
			// Without this check the first fwrite() below would fail with a
			// TypeError that does not mention the offending path.
			$this->fatalError( "Could not open output file '$outFile' for writing." );
		}

		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$this->addDumpHeader( time() );
		fwrite( $output, $this->rdfWriter->drain() );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
			// Page ID => title for this batch; used to resolve cl_from below.
			$pages = [];
			foreach ( $batch as $row ) {
				$this->categoriesRdf->writeCategoryData(
					$row->page_title,
					$row->pp_propname === 'hiddencat',
					// cat_pages includes subcategories and files; subtract them
					// to get the count of the remaining member pages.
					(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
					(int)$row->cat_subcats
				);
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->page_title;
				}
			}

			foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
				$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
			}
			// Flush the writer's buffer once per batch to keep memory bounded.
			fwrite( $output, $this->rdfWriter->drain() );
		}
		fflush( $output );
		// Compare against the resolved stream name: '-' has already been
		// rewritten above, so testing for '-' here would always be true.
		if ( $outFile !== $stdout ) {
			fclose( $output );
		}
	}

	/**
	 * Create an RDF writer for the given output format.
	 *
	 * @param string $format Writer format
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		return $factory->getWriter( $factory->getFormatName( $format ) );
	}
}
187 | |
188 | // @codeCoverageIgnoreStart |
189 | $maintClass = DumpCategoriesAsRdf::class; |
190 | require_once RUN_MAINTENANCE_IF_MAIN; |
191 | // @codeCoverageIgnoreEnd |