Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
53.41% |
47 / 88 |
|
33.33% |
2 / 6 |
CRAP | |
0.00% |
0 / 1 |
| DumpCategoriesAsRdf | |
53.41% |
47 / 88 |
|
33.33% |
2 / 6 |
30.09 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| getCategoryIterator | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
2 | |||
| getCategoryLinksIterator | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
| addDumpHeader | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
| execute | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
7 | |||
| createRdfWriter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | */ |
| 5 | |
| 6 | use MediaWiki\Category\CategoriesRdf; |
| 7 | use MediaWiki\MainConfigNames; |
| 8 | use MediaWiki\Maintenance\Maintenance; |
| 9 | use MediaWiki\Utils\BatchRowIterator; |
| 10 | use Wikimedia\Purtle\RdfWriter; |
| 11 | use Wikimedia\Purtle\RdfWriterFactory; |
| 12 | use Wikimedia\Rdbms\IReadableDatabase; |
| 13 | use Wikimedia\Timestamp\TimestampFormat as TS; |
| 14 | |
| 15 | // @codeCoverageIgnoreStart |
| 16 | require_once __DIR__ . '/Maintenance.php'; |
| 17 | // @codeCoverageIgnoreEnd |
| 18 | |
/**
 * Maintenance script to provide RDF representation of the category tree.
 *
 * The dump is produced in batches: a header describing the dump itself is
 * written first, then for every batch of category pages the category data and
 * the sub-category links of those pages are written.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer all RDF statements are sent to; created in execute().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of categories in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
			false, true );
		$this->addOption( 'format', "Set the dump format.", false, true );
	}

	/**
	 * Produce row iterator for categories.
	 *
	 * Selects every page in the category namespace, left-joined with its
	 * 'hiddencat' page property and its row in the category table, batched by
	 * page title using the configured batch size.
	 *
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return RecursiveIterator
	 */
	public function getCategoryIterator( IReadableDatabase $dbr, $fname ) {
		$queryBuilder = $dbr->newSelectQueryBuilder()
			->from( 'page' )
			->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
			->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
			->select( [
				'page_title',
				'page_id',
				'pp_propname',
				'cat_pages',
				'cat_subcats',
				'cat_files'
			] )
			->where( [
				'page_namespace' => NS_CATEGORY,
			] )
			->caller( $fname );

		return new BatchRowIterator(
			$dbr,
			$queryBuilder,
			[ 'page_title' ],
			$this->getBatchSize()
		);
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only links of type 'subcat' originating from the given pages are
	 * selected. The batched result is wrapped in a RecursiveIteratorIterator,
	 * so callers can loop over the returned value row by row.
	 *
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	public function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$queryBuilder = $dbr->newSelectQueryBuilder()
			->select( [ 'cl_from', 'lt_title' ] )
			->from( 'categorylinks' )
			->join( 'linktarget', null, 'cl_target_id=lt_id' )
			->where( [
				'cl_type' => 'subcat',
				'cl_from' => $ids
			] )
			->caller( $fname );

		$batches = new BatchRowIterator(
			$dbr,
			$queryBuilder,
			[ 'cl_from', 'cl_target_id' ],
			$this->getBatchSize()
		);
		return new RecursiveIteratorIterator( $batches );
	}

	/**
	 * Write the statements describing the dump itself (license, format
	 * version, modification date, originating site and imported ontology).
	 *
	 * Requires $this->rdfWriter and $this->categoriesRdf to be set, which
	 * execute() does before calling this method.
	 *
	 * @param int $timestamp
	 */
	public function addDumpHeader( $timestamp ) {
		$licenseUrl = $this->getConfig()->get( MainConfigNames::RightsUrl );
		// A protocol-relative license URL is given an explicit https scheme.
		if ( str_starts_with( $licenseUrl, '//' ) ) {
			$licenseUrl = 'https:' . $licenseUrl;
		}
		$urlUtils = $this->getServiceContainer()->getUrlUtils();
		$this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
			->a( 'schema', 'Dataset' )
			->a( 'owl', 'Ontology' )
			->say( 'cc', 'license' )->is( $licenseUrl )
			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
			->say( 'schema', 'dateModified' )
			->value( wfTimestamp( TS::ISO_8601, $timestamp ), 'xsd', 'dateTime' )
			->say( 'schema', 'isPartOf' )->is( (string)$urlUtils->expand( '/', PROTO_CANONICAL ) )
			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
	}

	/**
	 * Generate the dump and write it to the file given by --output
	 * (stdout when the option is absent or is '-').
	 */
	public function execute() {
		$outFile = $this->getOption( 'output', 'php://stdout' );

		// '-' is accepted as an alias for standard output.
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'w' );
		if ( $output === false ) {
			// Without a usable stream every later fwrite() would fail, so stop here.
			$this->fatalError( "Unable to open output file '$outFile' for writing." );
		}

		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$this->addDumpHeader( time() );
		fwrite( $output, $this->rdfWriter->drain() );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		foreach ( $this->getCategoryIterator( $dbr, __METHOD__ ) as $batch ) {
			// Page ID => title for the categories of this batch, used to map
			// link rows (keyed by cl_from) back to the category title.
			$pages = [];
			foreach ( $batch as $row ) {
				$this->categoriesRdf->writeCategoryData(
					$row->page_title,
					$row->pp_propname === 'hiddencat',
					(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
					(int)$row->cat_subcats
				);
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->page_title;
				}
			}

			// Skip the links query when no page IDs were collected: there is
			// nothing to look up and the ID list would be empty.
			if ( $pages ) {
				$links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ );
				foreach ( $links as $row ) {
					$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->lt_title );
				}
			}

			// Write out what the writer has produced for this batch.
			fwrite( $output, $this->rdfWriter->drain() );
		}

		fflush( $output );
		// The handle was opened by this method, so it is always closed here.
		// (At this point $outFile can no longer be '-', see above.)
		fclose( $output );
	}

	/**
	 * Create the RDF writer for the requested output format.
	 *
	 * @param string $format Writer format
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		$formatName = $factory->getFormatName( $format );
		return $factory->getWriter( $formatName );
	}
}
| 179 | |
// @codeCoverageIgnoreStart
// Name the maintenance class defined in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN (defined outside this file;
// presumably it runs the script when this file is the entry point — confirm).
$maintClass = DumpCategoriesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd