Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
51.65% |
47 / 91 |
|
33.33% |
2 / 6 |
CRAP | |
0.00% |
0 / 1 |
DumpCategoriesAsRdf | |
51.65% |
47 / 91 |
|
33.33% |
2 / 6 |
32.10 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getCategoryIterator | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
2 | |||
getCategoryLinksIterator | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
addDumpHeader | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
execute | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
7 | |||
createRdfWriter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | */ |
19 | |
20 | use MediaWiki\Category\CategoriesRdf; |
21 | use MediaWiki\MainConfigNames; |
22 | use Wikimedia\Purtle\RdfWriter; |
23 | use Wikimedia\Purtle\RdfWriterFactory; |
24 | use Wikimedia\Rdbms\IReadableDatabase; |
25 | |
26 | // @codeCoverageIgnoreStart |
27 | require_once __DIR__ . '/Maintenance.php'; |
28 | // @codeCoverageIgnoreEnd |
29 | |
/**
 * Maintenance script to provide RDF representation of the category tree.
 *
 * The dump is written incrementally: a header describing the dataset first,
 * then, per batch of category pages, the category data followed by the
 * sub-category links of the pages in that batch.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer that buffers the RDF output; created in execute().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper, bound to $rdfWriter; created in execute().
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Register the script description, the batch size and the
	 * command line options ('output' and 'format').
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of categories in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
			false, true );
		$this->addOption( 'format', "Set the dump format.", false, true );
	}

	/**
	 * Produce row iterator for categories.
	 *
	 * Selects pages in the category namespace, left-joined with their
	 * 'hiddencat' page property and their row in the category table.
	 * execute() consumes the result as a sequence of batches of rows.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return RecursiveIterator
	 */
	public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			[ 'page', 'page_props', 'category' ],
			[ 'page_title' ],
			$this->getBatchSize()
		);
		$it->addConditions( [
			'page_namespace' => NS_CATEGORY,
		] );
		$it->setFetchColumns( [
			'page_title',
			'page_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] );
		$it->addJoinConditions(
			[
				// LEFT JOIN: pp_propname is only set for pages that have the
				// 'hiddencat' property.
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
				],
				// LEFT JOIN: keeps pages without a matching category row.
				'category' => [
					'LEFT JOIN', [ 'cat_title = page_title' ]
				]
			]
		);
		$it->setCaller( $fname );
		return $it;
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only links of type 'subcat' originating from the given pages are
	 * selected. The batch iterator is wrapped in a RecursiveIteratorIterator
	 * so that callers iterate over individual rows rather than batches.
	 *
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->getBatchSize()
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		$it->setCaller( $fname );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Write the dataset header (type, license, format version, modification
	 * date, owning site and imported ontology) to the RDF writer.
	 *
	 * @param int $timestamp Modification time of the dump; execute() passes time()
	 */
	public function addDumpHeader( $timestamp ) {
		$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
		// Turn a protocol-relative license URL into an absolute one.
		// NOTE(review): the is_string() guard assumes the setting may be unset
		// (non-string); it only avoids handing such a value to
		// str_starts_with(), the value itself is passed on unchanged.
		if ( is_string( $licenseUrl ) && str_starts_with( $licenseUrl, '//' ) ) {
			$licenseUrl = 'https:' . $licenseUrl;
		}
		$urlUtils = $this->getServiceContainer()->getUrlUtils();
		$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
			->a( 'schema', 'Dataset' )
			->a( 'owl', 'Ontology' )
			->say( 'cc', 'license' )->is( $licenseUrl )
			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
			->say( 'schema', 'dateModified' )
			->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
			->say( 'schema', 'isPartOf' )->is( (string)$urlUtils->expand( '/', PROTO_CANONICAL ) )
			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
	}

	/**
	 * Run the dump: open the output, write the header, then write category
	 * data and sub-category links batch by batch.
	 */
	public function execute() {
		$outFile = $this->getOption( 'output', 'php://stdout' );

		// '-' is accepted as an alias for standard output.
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'w' );
		if ( $output === false ) {
			// Without this check the false handle would reach fwrite() below.
			$this->fatalError( "Could not open output file: $outFile" );
		}

		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$this->addDumpHeader( time() );
		fwrite( $output, $this->rdfWriter->drain() );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
			// Page ID => title for this batch, used to resolve cl_from below.
			$pages = [];
			foreach ( $batch as $row ) {
				$this->categoriesRdf->writeCategoryData(
					$row->page_title,
					$row->pp_propname === 'hiddencat',
					// cat_pages includes subcategories and files; subtract them
					// to get the count of plain pages.
					(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
					(int)$row->cat_subcats
				);
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->page_title;
				}
			}

			// An empty ID list cannot match any link row, so skip the query.
			if ( $pages ) {
				foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
					$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
				}
			}
			// Flush what the writer buffered for this batch to the output.
			fwrite( $output, $this->rdfWriter->drain() );
		}
		fflush( $output );
		// '-' was already mapped to php://stdout above, so the former
		// `$outFile !== '-'` check here was always true. Close unconditionally:
		// this is a handle obtained from fopen() in this method.
		fclose( $output );
	}

	/**
	 * Create an RDF writer for the requested output format.
	 *
	 * @param string $format Writer format
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		return $factory->getWriter( $factory->getFormatName( $format ) );
	}
}
193 | |
// Script entry point: $maintClass names this file's maintenance class, then the
// file referenced by RUN_MAINTENANCE_IF_MAIN is included once.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably
// it runs the class named in $maintClass when this script is invoked directly.
194 | // @codeCoverageIgnoreStart |
195 | $maintClass = DumpCategoriesAsRdf::class; |
196 | require_once RUN_MAINTENANCE_IF_MAIN; |
197 | // @codeCoverageIgnoreEnd |