MediaWiki master
SitemapGenerator.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Page;
4
8use MediaWiki\Languages\LanguageConverterFactory;
13use Wikimedia\Timestamp\TimestampFormat as TS;
14
// Namespaces chosen via namespaces()/additionalNamespaces()/namespacesFromConfig();
// null means no selection filter is applied.
21 private ?array $selectedNamespaces = null;
// Set by namespacesFromConfig() when the default robot policy is noindex:
// the namespaces whose own policy is not noindex. null means no such restriction.
22 private ?array $allowedNamespaces = null;
// Set by namespacesFromConfig() when the default robot policy is not noindex:
// the namespaces whose own policy is noindex. null means nothing is excluded.
23 private ?array $excludedNamespaces = null;
// Inclusive lower bound on page_id (getXml() uses >=), or null for no bound.
24 private ?int $startId = null;
// Exclusive upper bound on page_id (getXml() uses <), or null for no bound.
25 private ?int $endId = null;
// Language variant codes other than the content language's own code.
26 private array $variants = [];
// When true, getXml() only selects rows with page_is_redirect = 0.
27 private bool $skipRedirects = false;
// Value passed to limit(); null (or 0) means getXml() applies no limit.
28 private ?int $limit = null;
// page_id of the first row left out of the last getXml() batch, or null if none.
29 private ?int $nextBatchStart = null;
30
// Only non-null when the content language reports needsGenderDistinction().
35 private ?GenderCache $genderCache;
36
/**
 * Get the language variant codes of the content language, leaving out the
 * content language's own code (the default variant).
 *
 * @param Language $contLang
 * @param LanguageConverterFactory $languageConverterFactory
 * @return string[] Variant codes, re-indexed from zero
 */
public static function getVariants(
	Language $contLang,
	LanguageConverterFactory $languageConverterFactory,
) {
	$allCodes = $languageConverterFactory->getLanguageConverter( $contLang )->getVariants();
	$defaultCode = $contLang->getCode();

	// The default variant is already covered by the plain page URL
	return array_values( array_filter(
		$allCodes,
		static fn ( $code ) => $code !== $defaultCode
	) );
}
52
/**
 * @param Language $contLang Content language; determines the variant list
 *   and whether the gender cache is needed
 * @param LanguageConverterFactory $languageConverterFactory
 * @param GenderCache $genderCache Only retained if the content language
 *   needs gender distinction
 */
public function __construct(
	Language $contLang,
	LanguageConverterFactory $languageConverterFactory,
	GenderCache $genderCache,
) {
	$this->variants = self::getVariants( $contLang, $languageConverterFactory );

	if ( $contLang->needsGenderDistinction() ) {
		$this->genderCache = $genderCache;
	} else {
		$this->genderCache = null;
	}
}
61
/**
 * Set the selected namespaces, replacing any previous selection.
 *
 * @param array|null $namespaces Namespace list, or null to not filter
 *   by a selected list
 * @return $this
 */
public function namespaces( ?array $namespaces ) {
	$this->selectedNamespaces = $namespaces;

	return $this;
}
72
/**
 * Add namespaces to the selected namespace list.
 *
 * Does nothing when $namespaces is null/empty or when no selected list
 * has been set (selection is null).
 *
 * @param array|null $namespaces
 * @return $this
 */
public function additionalNamespaces( ?array $namespaces ) {
	if ( !$namespaces || $this->selectedNamespaces === null ) {
		return $this;
	}

	$combined = array_merge( $this->selectedNamespaces, $namespaces );
	$this->selectedNamespaces = array_unique( $combined );

	return $this;
}
89
/**
 * Set the included/excluded namespace lists based on configuration.
 *
 * Reads SitemapNamespaces, DefaultRobotPolicy and NamespaceRobotPolicies.
 * Any previously derived allowed/excluded lists are reset first. The
 * selected list is only replaced when SitemapNamespaces is non-empty.
 *
 * @param Config $config
 * @return $this
 */
public function namespacesFromConfig( Config $config ) {
	$this->allowedNamespaces = null;
	$this->excludedNamespaces = null;

	$configuredSelection = $config->get( MainConfigNames::SitemapNamespaces );
	if ( $configuredSelection ) {
		$this->selectedNamespaces = $configuredSelection;
	}

	$defaultPolicy = $config->get( MainConfigNames::DefaultRobotPolicy );
	$perNamespacePolicies = $config->get( MainConfigNames::NamespaceRobotPolicies );
	$defaultIsNoIndex = self::isNoIndex( $defaultPolicy );

	// Collect the namespaces whose index/noindex status differs from the default
	$overriding = [];
	foreach ( $perNamespacePolicies as $nsId => $nsPolicy ) {
		if ( self::isNoIndex( $nsPolicy ) !== $defaultIsNoIndex ) {
			$overriding[] = $nsId;
		}
	}

	if ( $defaultIsNoIndex ) {
		// Default is noindex: only the explicitly indexable namespaces are
		// allowed, which may be an empty list
		$this->allowedNamespaces = $overriding;
	} elseif ( $overriding ) {
		// Default is indexable: exclude the explicitly noindex namespaces
		$this->excludedNamespaces = $overriding;
	}

	return $this;
}
128
/**
 * Determine whether a robot policy has its "index" part set to "noindex".
 *
 * @param mixed $policy Policy as accepted by Article::formatRobotPolicy()
 * @return bool
 */
private static function isNoIndex( $policy ) {
	$parsed = Article::formatRobotPolicy( $policy );

	return isset( $parsed['index'] ) && $parsed['index'] === 'noindex';
}
139
/**
 * Limit the page_id range to the half-open interval [$startId, $endId).
 *
 * @param int|null $startId First page_id to include, or null for no lower bound
 * @param int|null $endId First page_id to leave out, or null for no upper bound
 * @return $this
 */
public function idRange( ?int $startId, ?int $endId ) {
	$this->startId = $startId;
	$this->endId = $endId;

	return $this;
}
153
/**
 * Choose whether redirect pages are left out of the sitemap.
 *
 * @param bool $skip True to skip redirects
 * @return $this
 */
public function skipRedirects( bool $skip = true ) {
	$this->skipRedirects = $skip;

	return $this;
}
164
/**
 * Limit the number of returned results.
 *
 * getXml() divides this value by the number of URLs emitted per page
 * (the plain URL plus one per language variant) to get the number of
 * pages per batch. A value of 0 is treated as no limit.
 *
 * @param int $limit
 * @return $this
 */
public function limit( int $limit ) {
	$this->limit = $limit;

	return $this;
}
175
/**
 * Advance to the next batch, if the previous getXml() call left rows over.
 *
 * When a continuation point was recorded, it becomes the new start ID.
 *
 * @return bool True if there is another batch to fetch, false otherwise
 */
public function nextBatch() {
	if ( $this->nextBatchStart === null ) {
		return false;
	}

	$this->startId = $this->nextBatchStart;
	return true;
}
193
/**
 * Use the previously set options to generate an XML sitemap.
 *
 * Pages carrying a "noindex" page property are left out. Each page yields
 * one <url> element for its canonical URL plus one per language variant.
 *
 * If a limit is set, it is divided by the number of URLs per page to get
 * the number of pages in this batch. At least one page is always emitted,
 * even when the limit is smaller than the number of URLs per page, so that
 * a getXml()/nextBatch() loop is guaranteed to make progress. When more
 * rows remain, the continuation point is recorded for nextBatch().
 *
 * @param IReadableDatabase $dbr
 * @return string The sitemap XML document
 */
public function getXml( IReadableDatabase $dbr ) {
	$empty = false;
	$sqb = $dbr->newSelectQueryBuilder()
		->select( [ 'page_id', 'page_namespace', 'page_title', 'page_touched' ] )
		->from( 'page' )
		// Left join plus "pp_propname IS NULL" keeps only pages without a noindex property
		->leftJoin( 'page_props', null, [ 'page_id = pp_page', 'pp_propname' => 'noindex' ] )
		->where( [ 'pp_propname' => null ] )
		->orderBy( 'page_id' )
		->caller( __METHOD__ );

	if ( $this->startId !== null ) {
		$sqb->where( $dbr->expr( 'page_id', '>=', $this->startId ) );
	}
	if ( $this->endId !== null ) {
		$sqb->where( $dbr->expr( 'page_id', '<', $this->endId ) );
	}
	$namespaces = $this->getSelectedAndAllowedNamespaces();
	if ( $namespaces !== null ) {
		if ( $namespaces === [] ) {
			// Nothing can match, so skip the query entirely
			$empty = true;
		} else {
			$sqb->where( [ 'page_namespace' => $namespaces ] );
		}
	}
	if ( $this->excludedNamespaces !== null ) {
		$sqb->where( [ $dbr->expr( 'page_namespace', '!=', $this->excludedNamespaces ) ] );
	}
	if ( $this->skipRedirects ) {
		$sqb->where( [ 'page_is_redirect' => 0 ] );
	}

	// null stands for the plain URL without a variant parameter
	$variants = [ null, ...$this->variants ];
	if ( $this->limit ) {
		// Never let the page count drop to zero: with a zero page limit the
		// first row would become the continuation point again and the
		// caller's batch loop would never advance.
		$pageLimit = max( 1, intdiv( $this->limit, count( $variants ) ) );
		// Fetch one extra row to find out whether there is a next batch
		$sqb->limit( $pageLimit + 1 );
	} else {
		$pageLimit = null;
	}

	$res = $empty ? [] : $sqb->fetchResultSet();
	$this->genderCache?->doPageRows( $res );

	$xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
		"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n";
	$count = 0;
	$nextBatchStart = 0;
	foreach ( $res as $row ) {
		if ( $pageLimit !== null && ++$count > $pageLimit ) {
			// This is the extra row: remember it as the start of the next batch
			$nextBatchStart = (int)$row->page_id;
			break;
		}
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		foreach ( $variants as $variant ) {
			$query = $variant === null ? '' : 'variant=' . urlencode( $variant );
			$xml .= '<url>' .
				Xml::element( 'loc', null, $title->getCanonicalURL( $query ) ) .
				Xml::element( 'lastmod', null, wfTimestamp( TS::ISO_8601, $row->page_touched ) ) .
				"</url>\n";
		}
	}
	$xml .= "</urlset>\n";

	// Clear any stale continuation point if this batch reached the end
	$this->nextBatchStart = $nextBatchStart ?: null;

	return $xml;
}
269
/**
 * Combine the selected and the allowed namespace lists.
 *
 * @return array|null The intersection when both lists are set, the one
 *   that is set when only one is, or null when neither is set
 */
private function getSelectedAndAllowedNamespaces() {
	if ( $this->selectedNamespaces === null ) {
		return $this->allowedNamespaces;
	}
	if ( $this->allowedNamespaces === null ) {
		return $this->selectedNamespaces;
	}

	return array_intersect( $this->selectedNamespaces, $this->allowedNamespaces );
}
287}
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
Look up "gender" user preference.
makeTitle( $linkId)
Convert a link ID to a Title.to override Title
static element( $element, $attribs=[], $contents='')
Identical to rawElement(), but HTML-escapes $contents (like Xml::element()).
Definition Html.php:308
Base class for language-specific code.
Definition Language.php:68
getCode()
Get the internal language code for this language object.
needsGenderDistinction()
Whether this language uses gender-dependent namespace aliases.
Definition Language.php:521
A class containing constants representing the names of configuration variables.
const NamespaceRobotPolicies
Name constant for the NamespaceRobotPolicies setting, for use with Config::get()
const SitemapNamespaces
Name constant for the SitemapNamespaces setting, for use with Config::get()
const DefaultRobotPolicy
Name constant for the DefaultRobotPolicy setting, for use with Config::get()
static formatRobotPolicy( $policy)
Converts a String robot policy into an associative array, to allow merging of several policies using ...
Definition Article.php:1246
Utility for generating a sitemap.
nextBatch()
If a previous call to getXml() reached the limit set by limit() and there were still more rows, set the start ID to the beginning of the next batch and return true; otherwise return false.
__construct(Language $contLang, LanguageConverterFactory $languageConverterFactory, GenderCache $genderCache,)
namespacesFromConfig(Config $config)
Set the included/excluded namespace list based on configuration.
limit(int $limit)
Limit the number of returned results.
idRange(?int $startId, ?int $endId)
Limit the page_id range to the given half-open interval.
namespaces(?array $namespaces)
Set the selected namespaces.
static getVariants(Language $contLang, LanguageConverterFactory $languageConverterFactory,)
skipRedirects(bool $skip=true)
Skip redirects.
additionalNamespaces(?array $namespaces)
Add namespaces to the selected namespace list.
getXml(IReadableDatabase $dbr)
Use the previously set options to generate an XML sitemap.
Represents a title within MediaWiki.
Definition Title.php:69
Module of static functions for generating XML.
Definition Xml.php:19
Interface for configuration instances.
Definition Config.php:18
get( $name)
Get a configuration variable such as "Sitename" or "UploadMaintenance.".
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.
expr(string $field, string $op, $value)
See Expression::__construct()