Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
51.65% |
47 / 91 |
|
33.33% |
2 / 6 |
CRAP | |
0.00% |
0 / 1 |
DumpCategoriesAsRdf | |
51.65% |
47 / 91 |
|
33.33% |
2 / 6 |
32.10 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryIterator | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
2 | |||
getCategoryLinksIterator | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
addDumpHeader | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
execute | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
7 | |||
createRdfWriter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use MediaWiki\Maintenance\Maintenance; |
23 | use Wikimedia\Purtle\RdfWriter; |
24 | use Wikimedia\Purtle\RdfWriterFactory; |
25 | use Wikimedia\Rdbms\IReadableDatabase; |
26 | |
27 | // @codeCoverageIgnoreStart |
28 | require_once __DIR__ . '/Maintenance.php'; |
29 | // @codeCoverageIgnoreEnd |
30 | |
/**
 * Maintenance script to provide RDF representation of the category tree.
 *
 * The script streams its output: the dump header is written first, then
 * categories are read from the database in batches and each batch is
 * serialized and flushed to the output before the next one is fetched.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer that accumulates RDF output; created in execute().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Register the script description, the default batch size and the
	 * 'output' and 'format' command line options.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of categories in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
			false, true );
		$this->addOption( 'format', "Set the dump format.", false, true );
	}

	/**
	 * Produce row iterator for categories.
	 *
	 * Iterates over pages in the category namespace, ordered in batches by
	 * page_title, left-joined with the 'hiddencat' page property and with
	 * the matching row of the category table. Each iteration step yields one
	 * batch of rows (see execute()).
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return RecursiveIterator
	 */
	public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			[ 'page', 'page_props', 'category' ],
			[ 'page_title' ],
			$this->getBatchSize()
		);
		$it->addConditions( [
			'page_namespace' => NS_CATEGORY,
		] );
		$it->setFetchColumns( [
			'page_title',
			'page_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] );
		$it->addJoinConditions(
			[
				// LEFT JOIN: pp_propname is only set for categories flagged 'hiddencat'.
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
				],
				// LEFT JOIN: a category page may have no row in the category table.
				'category' => [
					'LEFT JOIN', [ 'cat_title = page_title' ]
				]
			]
		);
		$it->setCaller( $fname );
		return $it;
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Selects the 'subcat' category links whose source page is one of $ids.
	 * The batch iterator is wrapped in a RecursiveIteratorIterator so that
	 * callers receive individual rows rather than batches.
	 *
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->getBatchSize()
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		$it->setCaller( $fname );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Write the dataset-level header triples (license, format version,
	 * modification date, owning site and ontology import) to the RDF writer.
	 *
	 * Requires $this->rdfWriter and $this->categoriesRdf to be initialized,
	 * which execute() does before calling this method.
	 *
	 * @param int $timestamp
	 */
	public function addDumpHeader( $timestamp ) {
		$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
		// Protocol-relative license URLs are made absolute. The is_string() guard
		// avoids handing a non-string to str_starts_with().
		// NOTE(review): presumably RightsUrl may be unset (null) - confirm.
		if ( is_string( $licenseUrl ) && str_starts_with( $licenseUrl, '//' ) ) {
			$licenseUrl = 'https:' . $licenseUrl;
		}
		$urlUtils = $this->getServiceContainer()->getUrlUtils();
		$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
			->a( 'schema', 'Dataset' )
			->a( 'owl', 'Ontology' )
			->say( 'cc', 'license' )->is( $licenseUrl )
			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
			->say( 'schema', 'dateModified' )
			->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
			->say( 'schema', 'isPartOf' )->is( (string)$urlUtils->expand( '/', PROTO_CANONICAL ) )
			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
	}

	/**
	 * Generate the dump: open the output, write the header, then write the
	 * data and subcategory links of every category, batch by batch.
	 */
	public function execute() {
		$outFile = $this->getOption( 'output', 'php://stdout' );

		// '-' is accepted as an alias for standard output.
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'w' );
		if ( $output === false ) {
			// Fail with a clear message instead of passing false to fwrite() below.
			$this->fatalError( "Unable to open output file '$outFile' for writing." );
		}

		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$this->addDumpHeader( time() );
		fwrite( $output, $this->rdfWriter->drain() );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
			// Map of page ID => title for this batch, used to resolve link sources.
			$pages = [];
			foreach ( $batch as $row ) {
				$this->categoriesRdf->writeCategoryData(
					$row->page_title,
					$row->pp_propname === 'hiddencat',
					// cat_pages minus the subcategory and file counts.
					(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
					(int)$row->cat_subcats
				);
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->page_title;
				}
			}

			// Skip the link lookup when the batch produced no page IDs, so that no
			// query is built with an empty ID list.
			if ( $pages ) {
				foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
					$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
				}
			}
			// Flush the serialized batch so output is streamed rather than buffered whole.
			fwrite( $output, $this->rdfWriter->drain() );
		}
		fflush( $output );
		// $outFile can no longer be '-' here (it was normalized above), so the
		// stream is always closed.
		fclose( $output );
	}

	/**
	 * Create an RDF writer for the given format, resolving the format via
	 * RdfWriterFactory::getFormatName().
	 *
	 * @param string $format Writer format
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		return $factory->getWriter( $factory->getFormatName( $format ) );
	}
}
194 | |
195 | // @codeCoverageIgnoreStart |
196 | $maintClass = DumpCategoriesAsRdf::class; |
197 | require_once RUN_MAINTENANCE_IF_MAIN; |
198 | // @codeCoverageIgnoreEnd |