Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
100.00% |
101 / 101 |
|
100.00% |
12 / 12 |
CRAP | |
100.00% |
1 / 1 |
| SitemapGenerator | |
100.00% |
101 / 101 |
|
100.00% |
12 / 12 |
41 | |
100.00% |
1 / 1 |
| getVariants | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| __construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| namespaces | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| additionalNamespaces | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| namespacesFromConfig | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
8 | |||
| isNoIndex | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| idRange | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| skipRedirects | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| limit | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| nextBatch | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| getXml | |
100.00% |
48 / 48 |
|
100.00% |
1 / 1 |
15 | |||
| getSelectedAndAllowedNamespaces | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Page; |
| 4 | |
| 5 | use MediaWiki\Cache\GenderCache; |
| 6 | use MediaWiki\Config\Config; |
| 7 | use MediaWiki\Language\Language; |
| 8 | use MediaWiki\Languages\LanguageConverterFactory; |
| 9 | use MediaWiki\MainConfigNames; |
| 10 | use MediaWiki\Title\Title; |
| 11 | use MediaWiki\Xml\Xml; |
| 12 | use Wikimedia\Rdbms\IReadableDatabase; |
| 13 | use Wikimedia\Timestamp\TimestampFormat as TS; |
| 14 | |
| 15 | /** |
| 16 | * Utility for generating a sitemap |
| 17 | * |
| 18 | * @internal |
| 19 | */ |
class SitemapGenerator {
	/** @var int[]|null Explicitly selected namespaces, or null for "all namespaces" */
	private ?array $selectedNamespaces = null;
	/** @var int[]|null Namespaces allowed by robot policy, or null for no allow-list */
	private ?array $allowedNamespaces = null;
	/** @var int[]|null Namespaces excluded by robot policy, or null for no deny-list */
	private ?array $excludedNamespaces = null;
	/** @var int|null Inclusive lower bound on page_id, or null for unlimited */
	private ?int $startId = null;
	/** @var int|null Exclusive upper bound on page_id, or null for unlimited */
	private ?int $endId = null;
	/** @var string[] Non-default variant codes of the content language */
	private array $variants = [];
	private bool $skipRedirects = false;
	/** @var int|null Maximum number of URLs per getXml() call, or null for unlimited */
	private ?int $limit = null;
	/** @var int|null page_id at which the next batch starts, or null if there is none */
	private ?int $nextBatchStart = null;

	/**
	 * The gender cache, or null if the content language has no gendered namespaces
	 * @var GenderCache|null
	 */
	private ?GenderCache $genderCache;

	/**
	 * Get the variant codes of the content language, excluding the code of
	 * the content language itself.
	 *
	 * @param Language $contLang
	 * @param LanguageConverterFactory $languageConverterFactory
	 * @return string[]
	 */
	public static function getVariants(
		Language $contLang,
		LanguageConverterFactory $languageConverterFactory,
	) {
		$converter = $languageConverterFactory->getLanguageConverter( $contLang );
		$defaultCode = $contLang->getCode();
		$variants = [];
		foreach ( $converter->getVariants() as $vCode ) {
			// We don't want the default variant
			if ( $vCode !== $defaultCode ) {
				$variants[] = $vCode;
			}
		}
		return $variants;
	}

	/**
	 * @param Language $contLang The content language
	 * @param LanguageConverterFactory $languageConverterFactory
	 * @param GenderCache $genderCache Only retained if the content language
	 *   needs gender distinction
	 */
	public function __construct(
		Language $contLang,
		LanguageConverterFactory $languageConverterFactory,
		GenderCache $genderCache,
	) {
		$this->variants = self::getVariants( $contLang, $languageConverterFactory );
		$this->genderCache = $contLang->needsGenderDistinction() ? $genderCache : null;
	}

	/**
	 * Set the selected namespaces
	 *
	 * @param int[]|null $namespaces The namespaces, or null to select all
	 * @return $this
	 */
	public function namespaces( ?array $namespaces ) {
		$this->selectedNamespaces = $namespaces;
		return $this;
	}

	/**
	 * Add namespaces to the selected namespace list. If namespacesFromConfig()
	 * was already called, the selected namespace list will include namespaces
	 * from both $wgSitemapNamespaces and $namespaces.
	 *
	 * If no namespace list is currently selected (i.e. all namespaces are
	 * selected), or if $namespaces is empty, this does nothing.
	 *
	 * @param int[]|null $namespaces
	 * @return $this
	 */
	public function additionalNamespaces( ?array $namespaces ) {
		if ( $namespaces && $this->selectedNamespaces !== null ) {
			// array_unique() preserves keys; reindex so the list stays a plain list
			$this->selectedNamespaces = array_values( array_unique( array_merge(
				$this->selectedNamespaces, $namespaces
			) ) );
		}
		return $this;
	}

	/**
	 * Set the included/excluded namespace list based on configuration
	 *
	 * If the default robot policy is "noindex", only namespaces whose own
	 * policy is indexable are allowed. Otherwise, namespaces whose own policy
	 * is "noindex" are excluded. A non-empty SitemapNamespaces setting
	 * replaces the selected namespace list.
	 *
	 * @param Config $config
	 * @return $this
	 */
	public function namespacesFromConfig( Config $config ) {
		$this->allowedNamespaces = null;
		$this->excludedNamespaces = null;

		$sitemapNamespaces = $config->get( MainConfigNames::SitemapNamespaces );
		if ( $sitemapNamespaces ) {
			$this->selectedNamespaces = $sitemapNamespaces;
		}

		$defaultPolicy = $config->get( MainConfigNames::DefaultRobotPolicy );
		$namespacePolicies = $config->get( MainConfigNames::NamespaceRobotPolicies );
		if ( self::isNoIndex( $defaultPolicy ) ) {
			// Everything is noindex by default: build an allow-list. It may be
			// empty, in which case getXml() returns no URLs at all.
			$allowed = [];
			foreach ( $namespacePolicies as $ns => $policy ) {
				if ( !self::isNoIndex( $policy ) ) {
					$allowed[] = $ns;
				}
			}
			$this->allowedNamespaces = $allowed;
		} else {
			// Everything is indexable by default: build a deny-list
			$excluded = [];
			foreach ( $namespacePolicies as $ns => $policy ) {
				if ( self::isNoIndex( $policy ) ) {
					$excluded[] = $ns;
				}
			}
			if ( $excluded ) {
				$this->excludedNamespaces = $excluded;
			}
		}
		return $this;
	}

	/**
	 * Interpret a configured robots policy
	 *
	 * @param string|array $policy
	 * @return bool True if the policy's "index" directive is "noindex"
	 */
	private static function isNoIndex( $policy ): bool {
		$policyArray = Article::formatRobotPolicy( $policy );
		return ( $policyArray['index'] ?? '' ) === 'noindex';
	}

	/**
	 * Limit the page_id range to the given half-open interval
	 *
	 * @param int|null $startId The start ID, or null for unlimited
	 * @param int|null $endId The end ID, or null for unlimited. Only pages
	 *   with a page_id less than this value will be returned.
	 * @return $this
	 */
	public function idRange( ?int $startId, ?int $endId ) {
		$this->startId = $startId;
		$this->endId = $endId;
		return $this;
	}

	/**
	 * Skip redirects
	 *
	 * @param bool $skip
	 * @return $this
	 */
	public function skipRedirects( bool $skip = true ) {
		$this->skipRedirects = $skip;
		return $this;
	}

	/**
	 * Limit the number of returned results.
	 *
	 * The limit counts URLs, not pages: each page contributes one URL plus one
	 * URL per language variant. A limit of zero means unlimited. If the limit
	 * is smaller than the number of URLs per page, getXml() still emits one
	 * page per batch so that batching always makes progress.
	 *
	 * @param int $limit
	 * @return $this
	 */
	public function limit( int $limit ) {
		$this->limit = $limit;
		return $this;
	}

	/**
	 * If a previous call to getXml() reached the limit set by limit() and there
	 * were still more rows, calling this will advance an internal cursor to the
	 * start of the next batch, and return true.
	 *
	 * If there were no more rows, return false.
	 *
	 * @return bool
	 */
	public function nextBatch() {
		if ( $this->nextBatchStart === null ) {
			return false;
		}
		$this->startId = $this->nextBatchStart;
		return true;
	}

	/**
	 * Use the previously set options to generate an XML sitemap
	 *
	 * @param IReadableDatabase $dbr
	 * @return string
	 */
	public function getXml( IReadableDatabase $dbr ) {
		// The left join plus "pp_propname IS NULL" keeps only pages that have
		// no "noindex" page property.
		$sqb = $dbr->newSelectQueryBuilder()
			->select( [ 'page_id', 'page_namespace', 'page_title', 'page_touched' ] )
			->from( 'page' )
			->leftJoin( 'page_props', null, [ 'page_id = pp_page', 'pp_propname' => 'noindex' ] )
			->where( [ 'pp_propname' => null ] )
			->orderBy( 'page_id' )
			->caller( __METHOD__ );

		if ( $this->startId !== null ) {
			$sqb->where( $dbr->expr( 'page_id', '>=', $this->startId ) );
		}
		if ( $this->endId !== null ) {
			$sqb->where( $dbr->expr( 'page_id', '<', $this->endId ) );
		}

		$namespaces = $this->getSelectedAndAllowedNamespaces();
		// An empty (but non-null) list means nothing can match: skip the query
		$empty = $namespaces === [];
		if ( $namespaces !== null && !$empty ) {
			$sqb->where( [ 'page_namespace' => $namespaces ] );
		}
		if ( $this->excludedNamespaces !== null ) {
			$sqb->where( [ $dbr->expr( 'page_namespace', '!=', $this->excludedNamespaces ) ] );
		}
		if ( $this->skipRedirects ) {
			$sqb->where( [ 'page_is_redirect' => 0 ] );
		}

		// null stands for the default (non-variant) URL of each page
		$variants = [ null, ...$this->variants ];
		if ( $this->limit ) {
			// Each page emits one URL per entry in $variants. Always allow at
			// least one page per batch: a page limit of zero would emit nothing
			// and set the next batch start to the same row forever.
			$pageLimit = max( 1, intdiv( $this->limit, count( $variants ) ) );
			// Fetch one extra row to detect whether there is a next batch
			$sqb->limit( $pageLimit + 1 );
		} else {
			$pageLimit = null;
		}

		$res = $empty ? [] : $sqb->fetchResultSet();
		$this->genderCache?->doPageRows( $res );

		$xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
			"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n";
		$count = 0;
		$nextBatchStart = 0;
		foreach ( $res as $row ) {
			if ( $pageLimit !== null && ++$count > $pageLimit ) {
				// This is the extra row: it is where the next batch begins
				$nextBatchStart = (int)$row->page_id;
				break;
			}
			$title = Title::makeTitle( $row->page_namespace, $row->page_title );
			$lastmod = Xml::element( 'lastmod', null, wfTimestamp( TS::ISO_8601, $row->page_touched ) );
			foreach ( $variants as $variant ) {
				$query = $variant === null ? '' : 'variant=' . urlencode( $variant );
				$xml .= '<url>' .
					Xml::element( 'loc', null, $title->getCanonicalURL( $query ) ) .
					$lastmod .
					"</url>\n";
			}
		}
		$xml .= "</urlset>\n";

		$this->nextBatchStart = $nextBatchStart ?: null;

		return $xml;
	}

	/**
	 * Get namespaces that are both selected and allowed, or null if all
	 * namespaces are selected.
	 *
	 * @return int[]|null
	 */
	private function getSelectedAndAllowedNamespaces(): ?array {
		if ( $this->selectedNamespaces === null ) {
			return $this->allowedNamespaces;
		}
		if ( $this->allowedNamespaces === null ) {
			return $this->selectedNamespaces;
		}
		// array_intersect() preserves keys; reindex to return a plain list
		return array_values(
			array_intersect( $this->selectedNamespaces, $this->allowedNamespaces )
		);
	}
}