Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
50.00% |
47 / 94 |
|
33.33% |
2 / 6 |
CRAP | |
0.00% |
0 / 1 |
DumpCategoriesAsRdf | |
51.65% |
47 / 91 |
|
33.33% |
2 / 6 |
32.10 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryIterator | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
2 | |||
getCategoryLinksIterator | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
addDumpHeader | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
execute | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
7 | |||
createRdfWriter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use Wikimedia\Purtle\RdfWriter; |
23 | use Wikimedia\Purtle\RdfWriterFactory; |
24 | use Wikimedia\Rdbms\IReadableDatabase; |
25 | |
26 | require_once __DIR__ . '/Maintenance.php'; |
27 | |
/**
 * Maintenance script to provide RDF representation of the category tree.
 *
 * The dump consists of a header describing the dataset, followed by one
 * record per page in the category namespace and one record per
 * subcategory link. Output is written batch by batch so that the RDF
 * writer's buffer is drained regularly instead of growing with the wiki.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer that buffers the generated RDF; created in execute().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper; created in execute() on top of $rdfWriter.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Register the script description, default batch size and options.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of categories in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
			false, true );
		$this->addOption( 'format', "Set the dump format.", false, true );
	}

	/**
	 * Produce row iterator for categories.
	 *
	 * Iterates over pages in the category namespace in batches of
	 * getBatchSize() rows keyed by page_title. Each row carries the page
	 * title and ID, the 'hiddencat' page property (if any) and the member
	 * counters from the category table.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return RecursiveIterator
	 */
	public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			[ 'page', 'page_props', 'category' ],
			[ 'page_title' ],
			$this->getBatchSize()
		);
		$it->addConditions( [
			'page_namespace' => NS_CATEGORY,
		] );
		$it->setFetchColumns( [
			'page_title',
			'page_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] );
		// Both joins are LEFT JOINs, so a category page is still returned
		// when it has no 'hiddencat' property or no category table row.
		$it->addJoinConditions(
			[
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
				],
				'category' => [
					'LEFT JOIN', [ 'cat_title = page_title' ]
				]
			]
		);
		$it->setCaller( $fname );
		return $it;
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Selects the 'subcat' rows of categorylinks whose cl_from is one of
	 * the given page IDs. The batch iterator is wrapped in a
	 * RecursiveIteratorIterator, so the caller iterates over individual
	 * rows rather than over batches.
	 *
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->getBatchSize()
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		$it->setCaller( $fname );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Write the dataset description (license, format version, modification
	 * date, owning site and imported ontology) to the RDF writer.
	 *
	 * Requires $rdfWriter and $categoriesRdf to be initialised, which
	 * execute() does before calling this method.
	 *
	 * @param int $timestamp Modification time to record for the dump
	 */
	public function addDumpHeader( $timestamp ) {
		$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
		// A protocol-relative URL is made absolute by assuming HTTPS.
		if ( str_starts_with( $licenseUrl, '//' ) ) {
			$licenseUrl = 'https:' . $licenseUrl;
		}
		$urlUtils = $this->getServiceContainer()->getUrlUtils();
		$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
			->a( 'schema', 'Dataset' )
			->a( 'owl', 'Ontology' )
			->say( 'cc', 'license' )->is( $licenseUrl )
			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
			->say( 'schema', 'dateModified' )
			->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
			->say( 'schema', 'isPartOf' )->is( (string)$urlUtils->expand( '/', PROTO_CANONICAL ) )
			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
	}

	/**
	 * Generate the dump and write it to the file given by --output
	 * (standard output by default, or when "-" is given).
	 *
	 * Aborts with a fatal error if the output cannot be opened for writing.
	 */
	public function execute() {
		$outFile = $this->getOption( 'output', 'php://stdout' );

		// "-" is accepted as shorthand for standard output.
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'w' );
		if ( $output === false ) {
			// Without this check every later fwrite() would receive false
			// instead of a stream and fail far from the real cause.
			$this->fatalError( "Could not open output file '$outFile' for writing." );
		}

		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$this->addDumpHeader( time() );
		fwrite( $output, $this->rdfWriter->drain() );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
			// Page ID => title for this batch, used to resolve cl_from below.
			$pages = [];
			foreach ( $batch as $row ) {
				$this->categoriesRdf->writeCategoryData(
					$row->page_title,
					$row->pp_propname === 'hiddencat',
					// Plain pages: total members minus subcategories and files.
					(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
					(int)$row->cat_subcats
				);
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->page_title;
				}
			}

			// Only look up links when the batch produced page IDs, so the
			// query is never built with an empty cl_from list.
			if ( $pages ) {
				$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );
				foreach ( $links as $row ) {
					$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
				}
			}

			// Flush the writer's buffer after every batch.
			fwrite( $output, $this->rdfWriter->drain() );
		}

		fflush( $output );
		// The former "$outFile !== '-'" test here was always true because
		// "-" has already been replaced above. Closing unconditionally is
		// safe: a php://stdout handle is our own duplicate descriptor.
		fclose( $output );
	}

	/**
	 * Create an RDF writer for the requested output format.
	 *
	 * @param string $format Writer format
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		return $factory->getWriter( $factory->getFormatName( $format ) );
	}
}
191 | |
192 | $maintClass = DumpCategoriesAsRdf::class; |
193 | require_once RUN_MAINTENANCE_IF_MAIN; |