MediaWiki master
SitemapGenerator.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Page;
4
8use MediaWiki\Languages\LanguageConverterFactory;
13
/** @var array|null Namespaces chosen via namespaces(), additionalNamespaces() or config; null means no selection */
private ?array $selectedNamespaces = null;

/** @var array|null Namespaces permitted by robot policy config; null means no allow-list applies */
private ?array $allowedNamespaces = null;

/** @var array|null Namespaces ruled out by robot policy config; null means nothing is excluded */
private ?array $excludedNamespaces = null;

/** @var int|null Inclusive lower bound on page_id, or null for no lower bound */
private ?int $startId = null;

/** @var int|null Exclusive upper bound on page_id, or null for no upper bound */
private ?int $endId = null;

/** @var array Variant codes, other than the content language code, that get their own URL */
private array $variants = [];

/** @var bool Whether redirect pages are left out of the sitemap */
private bool $skipRedirects = false;

/** @var int|null Maximum number of URLs per getXml() call, or null for no limit */
private ?int $limit = null;

/** @var int|null page_id at which the next batch starts, set by getXml() when the limit was hit */
private ?int $nextBatchStart = null;

/** @var GenderCache|null Only set when the content language needs gender distinction */
private ?GenderCache $genderCache;
35
/**
 * List the variant codes of the content language, leaving out the
 * variant whose code equals the content language's own code.
 *
 * @param Language $contLang
 * @param LanguageConverterFactory $languageConverterFactory
 * @return array Variant codes, in the order the converter reports them
 */
public static function getVariants(
	Language $contLang,
	LanguageConverterFactory $languageConverterFactory,
) {
	$allVariants = $languageConverterFactory
		->getLanguageConverter( $contLang )
		->getVariants();

	$extraVariants = [];
	foreach ( $allVariants as $code ) {
		// The content language's own code is the default variant; it
		// is covered by the plain (query-less) URL, so leave it out.
		if ( $code === $contLang->getCode() ) {
			continue;
		}
		$extraVariants[] = $code;
	}
	return $extraVariants;
}
51
/**
 * @param Language $contLang Content language; determines the variant list
 *   and whether the gender cache is needed
 * @param LanguageConverterFactory $languageConverterFactory
 * @param GenderCache $genderCache Retained only if $contLang needs gender distinction
 */
public function __construct(
	Language $contLang,
	LanguageConverterFactory $languageConverterFactory,
	GenderCache $genderCache,
) {
	$this->variants = self::getVariants( $contLang, $languageConverterFactory );

	if ( $contLang->needsGenderDistinction() ) {
		$this->genderCache = $genderCache;
	} else {
		// No gender-dependent titles, so getXml() can skip the cache warm-up
		$this->genderCache = null;
	}
}
60
/**
 * Set the selected namespaces, replacing any previous selection.
 *
 * @param array|null $namespaces Namespaces to select, or null to clear the selection
 * @return $this
 */
public function namespaces( ?array $namespaces ) {
	$this->selectedNamespaces = $namespaces;

	return $this;
}
71
/**
 * Add namespaces to the selected namespace list.
 *
 * Does nothing when $namespaces is null or empty, or when no selection
 * has been made yet (the selected list is null).
 *
 * @param array|null $namespaces
 * @return $this
 */
public function additionalNamespaces( ?array $namespaces ) {
	if ( !$namespaces || $this->selectedNamespaces === null ) {
		return $this;
	}

	$merged = array_merge( $this->selectedNamespaces, $namespaces );
	$this->selectedNamespaces = array_unique( $merged );

	return $this;
}
88
/**
 * Set the included/excluded namespace list based on configuration.
 *
 * Any previously computed allow/exclude lists are discarded. If
 * SitemapNamespaces is non-empty it replaces the selected namespaces;
 * otherwise the current selection is left untouched.
 *
 * The robot policy settings are then consulted:
 *  - If DefaultRobotPolicy is noindex, only namespaces whose entry in
 *    NamespaceRobotPolicies explicitly asks for "index" are allowed.
 *  - Otherwise, namespaces whose policy is noindex are excluded.
 *
 * @param Config $config
 * @return $this
 */
public function namespacesFromConfig( Config $config ) {
	$this->allowedNamespaces = null;
	$this->excludedNamespaces = null;

	$sitemapNamespaces = $config->get( MainConfigNames::SitemapNamespaces );
	if ( $sitemapNamespaces ) {
		$this->selectedNamespaces = $sitemapNamespaces;
	}

	$defaultPolicy = $config->get( MainConfigNames::DefaultRobotPolicy );
	$namespacePolicies = $config->get( MainConfigNames::NamespaceRobotPolicies );
	if ( self::isNoIndex( $defaultPolicy ) ) {
		$namespaces = [];
		foreach ( $namespacePolicies as $ns => $policy ) {
			// A namespace policy is merged over the default, so one that
			// only sets follow/nofollow leaves the default "noindex" in
			// force. Require an explicit "index" directive rather than
			// merely the absence of "noindex".
			$policyArray = Article::formatRobotPolicy( $policy );
			if ( ( $policyArray['index'] ?? '' ) === 'index' ) {
				$namespaces[] = $ns;
			}
		}
		// May be an empty array, in which case getXml() emits no URLs
		$this->allowedNamespaces = $namespaces;
	} else {
		$excluded = [];
		foreach ( $namespacePolicies as $ns => $policy ) {
			if ( self::isNoIndex( $policy ) ) {
				$excluded[] = $ns;
			}
		}
		// Leave the exclusion list as null when nothing is excluded, so
		// getXml() does not add a condition with an empty value list
		if ( $excluded ) {
			$this->excludedNamespaces = $excluded;
		}
	}
	return $this;
}
127
/**
 * Determine whether a robot policy explicitly contains "noindex".
 *
 * @param mixed $policy Robot policy in a form accepted by Article::formatRobotPolicy()
 * @return bool True only if the formatted policy has an 'index' entry equal to 'noindex'
 */
private static function isNoIndex( $policy ) {
	$directives = Article::formatRobotPolicy( $policy );

	return isset( $directives['index'] ) && $directives['index'] === 'noindex';
}
138
/**
 * Limit the page_id range to the given half-open interval.
 *
 * @param int|null $startId Inclusive lower bound, or null for none
 * @param int|null $endId Exclusive upper bound, or null for none
 * @return $this
 */
public function idRange( ?int $startId, ?int $endId ) {
	[ $this->startId, $this->endId ] = [ $startId, $endId ];

	return $this;
}
152
/**
 * Choose whether redirect pages are left out of the sitemap.
 *
 * @param bool $skip True (the default) to skip redirects
 * @return $this
 */
public function skipRedirects( bool $skip = true ) {
	$this->skipRedirects = $skip;

	return $this;
}
163
/**
 * Limit the number of returned results.
 *
 * The limit is counted in URLs: getXml() divides it by the number of
 * URLs emitted per page (one plus the number of variants) to work out
 * how many pages fit. A limit of 0 is treated as no limit.
 *
 * @param int $limit
 * @return $this
 */
public function limit( int $limit ) {
	$this->limit = $limit;

	return $this;
}
174
/**
 * Advance to the batch following the last getXml() call.
 *
 * If that call stopped because of the limit while more rows remained,
 * move the start of the ID range to the first page that was not
 * emitted, so that the next getXml() call continues from there.
 *
 * @return bool True if there is another batch, false if the last
 *   getXml() call already reached the end
 */
public function nextBatch() {
	if ( $this->nextBatchStart === null ) {
		return false;
	}

	$this->startId = $this->nextBatchStart;
	return true;
}
192
/**
 * Use the previously set options to generate an XML sitemap.
 *
 * Pages are read in page_id order. Each page yields one <url> entry
 * for its canonical URL plus one per language variant. If a limit was
 * set and more pages remain, the ID of the first page that did not fit
 * is remembered so that nextBatch() can continue from it.
 *
 * When a limit is set, at least one page is always emitted per call,
 * even if the limit is smaller than the number of URLs per page. A
 * batch of zero pages would make no progress, and a caller looping on
 * nextBatch() would never terminate.
 *
 * @param IReadableDatabase $dbr
 * @return string The sitemap XML document
 */
public function getXml( IReadableDatabase $dbr ) {
	$empty = false;
	$sqb = $dbr->newSelectQueryBuilder()
		->select( [ 'page_id', 'page_namespace', 'page_title', 'page_touched' ] )
		->from( 'page' )
		// Anti-join: keep only pages that have no "noindex" page property
		->leftJoin( 'page_props', null, [ 'page_id = pp_page', 'pp_propname' => 'noindex' ] )
		->where( [ 'pp_propname' => null ] )
		->orderBy( 'page_id' )
		->caller( __METHOD__ );

	if ( $this->startId !== null ) {
		$sqb->where( $dbr->expr( 'page_id', '>=', $this->startId ) );
	}
	if ( $this->endId !== null ) {
		$sqb->where( $dbr->expr( 'page_id', '<', $this->endId ) );
	}
	$namespaces = $this->getSelectedAndAllowedNamespaces();
	if ( $namespaces !== null ) {
		if ( $namespaces === [] ) {
			// Nothing can match; skip the query rather than pass an
			// empty value list as a condition
			$empty = true;
		} else {
			$sqb->where( [ 'page_namespace' => $namespaces ] );
		}
	}
	if ( $this->excludedNamespaces !== null ) {
		$sqb->where( [ $dbr->expr( 'page_namespace', '!=', $this->excludedNamespaces ) ] );
	}
	if ( $this->skipRedirects ) {
		$sqb->where( [ 'page_is_redirect' => 0 ] );
	}

	// null stands for the plain canonical URL without a variant parameter
	$variants = [ null, ...$this->variants ];
	if ( $this->limit ) {
		// The limit counts URLs, and every page produces count( $variants )
		// of them. Clamp to one page so that a limit smaller than the
		// number of variants cannot produce an empty batch that
		// nextBatch() would repeat forever.
		$pageLimit = max( 1, intdiv( $this->limit, count( $variants ) ) );
		// Fetch one extra row to learn whether there is a next batch
		$sqb->limit( $pageLimit + 1 );
	} else {
		$pageLimit = null;
	}

	$res = $empty ? [] : $sqb->fetchResultSet();
	$this->genderCache?->doPageRows( $res );

	$xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
		"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n";
	$count = 0;
	$nextBatchStart = 0;
	foreach ( $res as $row ) {
		if ( $pageLimit !== null && ++$count > $pageLimit ) {
			// This is the extra row: it is not emitted, it only marks
			// where the following batch begins
			$nextBatchStart = (int)$row->page_id;
			break;
		}
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		foreach ( $variants as $variant ) {
			$query = $variant === null ? '' : 'variant=' . urlencode( $variant );
			$xml .= '<url>' .
				Xml::element( 'loc', null, $title->getCanonicalURL( $query ) ) .
				Xml::element( 'lastmod', null, wfTimestamp( TS_ISO_8601, $row->page_touched ) ) .
				"</url>\n";
		}
	}
	$xml .= "</urlset>\n";

	// Reset to null when the end was reached, so nextBatch() reports false
	$this->nextBatchStart = $nextBatchStart ?: null;

	return $xml;
}
268
/**
 * Combine the selected namespaces with the allowed namespaces.
 *
 * @return array|null The intersection when both lists are set, the one
 *   list that is set when only one is, or null when neither is set
 */
private function getSelectedAndAllowedNamespaces() {
	if ( $this->selectedNamespaces === null ) {
		return $this->allowedNamespaces;
	}
	if ( $this->allowedNamespaces === null ) {
		return $this->selectedNamespaces;
	}
	return array_intersect( $this->selectedNamespaces, $this->allowedNamespaces );
}
286}
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Look up "gender" user preference.
makeTitle( $linkId)
Convert a link ID to a Title. (Stable to override.)
static element( $element, $attribs=[], $contents='')
Identical to rawElement(), but HTML-escapes $contents (like Xml::element()).
Definition Html.php:231
Base class for language-specific code.
Definition Language.php:69
getCode()
Get the internal language code for this language object.
needsGenderDistinction()
Whether this language uses gender-dependent namespace aliases.
Definition Language.php:522
A class containing constants representing the names of configuration variables.
const NamespaceRobotPolicies
Name constant for the NamespaceRobotPolicies setting, for use with Config::get()
const SitemapNamespaces
Name constant for the SitemapNamespaces setting, for use with Config::get()
const DefaultRobotPolicy
Name constant for the DefaultRobotPolicy setting, for use with Config::get()
static formatRobotPolicy( $policy)
Converts a String robot policy into an associative array, to allow merging of several policies using ...
Definition Article.php:1180
Utility for generating a sitemap.
nextBatch()
If a previous call to getXml() reached the limit set by limit() and there were still more rows,...
__construct(Language $contLang, LanguageConverterFactory $languageConverterFactory, GenderCache $genderCache,)
namespacesFromConfig(Config $config)
Set the included/excluded namespace list based on configuration.
limit(int $limit)
Limit the number of returned results.
idRange(?int $startId, ?int $endId)
Limit the page_id range to the given half-open interval.
namespaces(?array $namespaces)
Set the selected namespaces.
static getVariants(Language $contLang, LanguageConverterFactory $languageConverterFactory,)
skipRedirects(bool $skip=true)
Skip redirects.
additionalNamespaces(?array $namespaces)
Add namespaces to the selected namespace list.
getXml(IReadableDatabase $dbr)
Use the previously set options to generate an XML sitemap.
Represents a title within MediaWiki.
Definition Title.php:69
Module of static functions for generating XML.
Definition Xml.php:19
Interface for configuration instances.
Definition Config.php:18
get( $name)
Get a configuration variable such as "Sitename" or "UploadMaintenance.".
A database connection without write operations.
newSelectQueryBuilder()
Create an empty SelectQueryBuilder which can be used to run queries against this connection.
expr(string $field, string $op, $value)
See Expression::__construct()