MediaWiki master
generateSitemap.php
Go to the documentation of this file.
1<?php
20use Wikimedia\Timestamp\TimestampFormat as TS;
21
22// @codeCoverageIgnoreStart
23require_once __DIR__ . '/Maintenance.php';
24// @codeCoverageIgnoreEnd
25
/**
 * The maximum number of URLs in a sitemap file. Set in execute() from the
 * --limit option, falling back to 50,000.
 * NOTE(review): a value supplied on the command line may arrive as a string — confirm.
 */
public $url_limit;

/**
 * Directory the sitemap files are written to. execute() stores the resolved
 * (realpath) directory followed by a directory separator, so a filename can
 * be appended directly.
 */
public $fspath;

/**
 * The URL path to prepend to filenames in the index; should resolve to the
 * same directory as $fspath. execute() appends a trailing slash when the
 * value is non-empty and lacks one.
 */
public $urlpath;

/**
 * Whether or not to use compression. True unless --compress is given as 'no'.
 */
public $compress;


/**
 * A one-dimensional array of namespaces in the wiki.
 * NOTE(review): not read by any method visible in this listing — confirm it is still needed.
 */
public $namespaces = [];

/**
 * When this sitemap batch was generated, as an ISO 8601 timestamp; written
 * into each <lastmod> element of the index.
 */
public $timestamp;

/**
 * A database replica DB object, obtained from getReplicaDB() in execute().
 */
public $dbr;

/**
 * A resource pointing to the sitemap index file, opened for writing in
 * execute() and closed at the end of main().
 */
public $indexFile;

/**
 * A resource pointing to a sitemap file.
 * NOTE(review): main() keeps its file handle in a local variable and does not
 * touch this property in the visible code — confirm it is still needed.
 */
public $file;

/**
 * Site identifier used in the generated filenames. Taken from the
 * --identifier option, falling back to the current wiki's DB domain id.
 */
private $identifier;
111
/**
 * Sets the script description and declares the command-line options.
 *
 * Every option is optional. All of them except --skip-redirects take a
 * value, and --namespaces may be given more than once.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Creates a sitemap for the site' );

	$fspathHelp = 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory';
	$this->addOption( 'fspath', $fspathHelp, false, true );

	$urlpathHelp = 'The URL path corresponding to --fspath, prepended to filenames in the index; '
		. 'defaults to an empty string';
	$this->addOption( 'urlpath', $urlpathHelp, false, true );

	$compressHelp = 'Compress the sitemap files, can take value yes|no, default yes';
	$this->addOption( 'compress', $compressHelp, false, true );

	// Plain flag: no argument.
	$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );

	$identifierHelp = 'What site identifier to use for the wiki, defaults to $wgDBname';
	$this->addOption( 'identifier', $identifierHelp, false, true );

	// No short name; the final `true` allows the option to occur multiple times.
	$namespacesHelp = 'Only include pages in these namespaces in the sitemap, ' .
		'defaults to the value of wgSitemapNamespaces if not defined.';
	$this->addOption( 'namespaces', $namespacesHelp, false, true, false, true );

	$limitHelp = 'Maximum number of URLs per sitemap file. Default 50,000.';
	$this->addOption( 'limit', $limitHelp, false, true );
}
154
/**
 * Script entry point.
 *
 * Reads the command-line options into the object's properties, makes sure
 * the output directory exists, opens the sitemap index file for writing and
 * then hands over to main() to produce the sitemap files and the index.
 *
 * Terminates the script through fatalError() when the output directory
 * cannot be created or the index file cannot be opened for writing.
 */
public function execute() {
	$this->url_limit = $this->getOption( 'limit', 50_000 );

	# Create directory if needed
	$fspath = $this->getOption( 'fspath', getcwd() );
	if ( !wfMkdirParents( $fspath, null, __METHOD__ ) ) {
		$this->fatalError( "Can not create directory $fspath." );
	}

	$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
	$this->fspath = realpath( $fspath ) . DIRECTORY_SEPARATOR;
	$this->urlpath = $this->getOption( 'urlpath', "" );
	// Normalise a non-empty URL path so a filename can be appended directly.
	if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
		$this->urlpath .= '/';
	}
	$this->identifier = $this->getOption( 'identifier', $dbDomain );
	// Anything other than an explicit 'no' keeps compression enabled.
	$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
	$this->skipRedirects = $this->hasOption( 'skip-redirects' );
	$this->dbr = $this->getReplicaDB();
	$this->timestamp = wfTimestamp( TS::ISO_8601, wfTimestampNow() );

	$encIdentifier = rawurlencode( $this->identifier );
	$indexPath = "{$this->fspath}sitemap-index-{$encIdentifier}.xml";

	// fopen() returns false on failure; stop here instead of letting main()
	// pass a boolean to fwrite()/fclose().
	$indexFile = fopen( $indexPath, 'wb' );
	if ( $indexFile === false ) {
		$this->fatalError( "Can not open index file $indexPath for writing." );
	}
	$this->indexFile = $indexFile;

	$this->main();
	$this->output( "Wrote index: $indexPath\n" );
}
184
/**
 * Picks the namespaces to restrict the sitemap to.
 *
 * The --namespaces option wins when it was given; otherwise the
 * SitemapNamespaces configuration value is used. Any falsy result
 * (for example an empty array) is reported as null.
 *
 * @return mixed|null The selected namespaces, or null when none are set
 */
private function getNamespaces() {
	$selected = $this->getOption( 'namespaces' );
	if ( $selected === null ) {
		// Not given on the command line: fall back to site configuration.
		$selected = $this->getConfig()->get( MainConfigNames::SitemapNamespaces );
	}

	if ( !$selected ) {
		return null;
	}

	return $selected;
}
195
/**
 * Writes one sitemap file per batch and records each of them in the index.
 *
 * The index file is opened by execute(); this method writes its header,
 * one entry per generated sitemap and its footer, then closes it.
 */
public function main() {
	$services = $this->getServiceContainer();
	$contentLanguage = $services->getContentLanguage();
	$canonicalServer = $services->getUrlUtils()->getServer( PROTO_CANONICAL ) ?? '';

	fwrite( $this->indexFile, $this->openIndex() );

	$generator = new SitemapGenerator(
		$contentLanguage,
		$services->getLanguageConverterFactory(),
		$services->getGenderCache()
	);
	$generator->skipRedirects( $this->skipRedirects )
		->namespaces( $this->getNamespaces() )
		->limit( $this->url_limit );

	// At least one sitemap file is always written, even for the first batch;
	// the loop ends once the generator reports no further batch.
	for ( $sitemapId = 0; ; $sitemapId++ ) {
		$name = $this->sitemapFilename( $sitemapId );
		$path = $this->fspath . $name;

		$handle = $this->open( $path, 'wb' );
		$this->write( $handle, $generator->getXml( $this->dbr ) );
		$this->close( $handle );

		fwrite( $this->indexFile, $this->indexEntry( $name, $canonicalServer ) );
		$this->output( "Wrote sitemap: $path\n" );

		if ( !$generator->nextBatch() ) {
			break;
		}
	}

	fwrite( $this->indexFile, $this->closeIndex() );
	fclose( $this->indexFile );
}
230
/**
 * Opens a sitemap file, through zlib when compression is enabled.
 *
 * @param string $file Path of the file to open
 * @param string $flags Mode string handed to gzopen()/fopen()
 * @return resource The opened handle
 * @throws RuntimeException When the file cannot be opened
 */
private function open( $file, $flags ) {
	if ( $this->compress ) {
		$resource = gzopen( $file, $flags );
	} else {
		$resource = fopen( $file, $flags );
	}

	if ( $resource !== false ) {
		return $resource;
	}

	throw new RuntimeException( __METHOD__
		. " error opening file $file with flags $flags. Check permissions?" );
}
247
/**
 * Writes a string to a handle obtained from open(), using gzwrite() when
 * compression is enabled and fwrite() otherwise.
 *
 * @param resource &$handle Handle returned by open()
 * @param string $str Data to write
 * @throws InvalidArgumentException When a boolean is passed instead of a handle
 * @throws RuntimeException When the data could not be written completely
 */
private function write( &$handle, $str ) {
	if ( is_bool( $handle ) ) {
		throw new InvalidArgumentException( __METHOD__ . " was passed a boolean as a file handle.\n" );
	}

	$written = $this->compress
		? gzwrite( $handle, $str )
		: fwrite( $handle, $str );

	// Both functions report the number of (uncompressed) bytes written, or
	// false on failure. Treat a failed or short write as an error so that a
	// truncated sitemap is never left behind silently.
	if ( $written === false || $written < strlen( (string)$str ) ) {
		throw new RuntimeException( __METHOD__ . " error writing to file. Check disk space and permissions?" );
	}
}
264
/**
 * Closes a handle obtained from open(), with the function matching the
 * way it was opened.
 *
 * @param resource &$handle Handle returned by open()
 */
private function close( &$handle ) {
	if ( !$this->compress ) {
		fclose( $handle );
		return;
	}

	gzclose( $handle );
}
277
/**
 * Builds the name of a sitemap file from the site identifier and a counter.
 *
 * @param int $count Number of the sitemap file
 * @return string Filename of the form sitemap-<identifier>-<count>.xml,
 *  with a .gz suffix when compression is enabled
 */
private function sitemapFilename( $count ) {
	$name = 'sitemap-' . $this->identifier . '-' . $count . '.xml';
	if ( $this->compress ) {
		$name .= '.gz';
	}

	return $name;
}
289
/**
 * Returns the XML declaration line that starts the index document.
 *
 * @return string The declaration followed by a newline
 */
private function xmlHead() {
	$declaration = '<?xml version="1.0" encoding="UTF-8"?>';

	return $declaration . "\n";
}
298
/**
 * Returns the XML namespace URI used for the sitemap index root element.
 *
 * @return string
 */
private function xmlSchema() {
	$namespaceUri = 'http://www.sitemaps.org/schemas/sitemap/0.9';

	return $namespaceUri;
}
307
/**
 * Returns the opening of the index document: the XML declaration followed
 * by the <sitemapindex> start tag carrying the sitemap namespace.
 *
 * @return string
 */
private function openIndex() {
	$head = $this->xmlHead();
	$rootTag = '<sitemapindex xmlns="' . $this->xmlSchema() . '">';

	return $head . $rootTag . "\n";
}
316
/**
 * Builds the <sitemap> element that lists one sitemap file in the index.
 *
 * The location is the server URL, the configured URL path and the filename,
 * with a "/" inserted when the URL path does not already start with one.
 * The assembled URL is XML-escaped so that characters such as "&" in the
 * server URL, URL path or identifier cannot produce a malformed index.
 *
 * @param string $filename Name of the sitemap file
 * @param string $serverUrl Server URL to prefix the location with
 * @return string XML fragment for the index
 */
private function indexEntry( $filename, $serverUrl ) {
	$separator = substr( $this->urlpath, 0, 1 ) === "/" ? "" : "/";
	$location = $serverUrl . $separator . $this->urlpath . $filename;

	// ENT_SUBSTITUTE keeps the URL instead of returning an empty string
	// when it contains an invalid UTF-8 sequence.
	$encLocation = htmlspecialchars( $location, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8' );

	return "\t<sitemap>\n" .
		"\t\t<loc>" . $encLocation . "</loc>\n" .
		"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
		"\t</sitemap>\n";
}
332
/**
 * Returns the closing tag of the index document.
 *
 * @return string
 */
private function closeIndex() {
	$closingTag = "</sitemapindex>\n";

	return $closingTag;
}
341}
342
343// @codeCoverageIgnoreStart
// Name the maintenance class implemented by this file.
$maintClass = GenerateSitemap::class;
// NOTE(review): presumably runs $maintClass only when this file is the script
// being executed, and does nothing when it is merely included — confirm
// against the definition of RUN_MAINTENANCE_IF_MAIN.
require_once RUN_MAINTENANCE_IF_MAIN;
346// @codeCoverageIgnoreEnd
const PROTO_CANONICAL
Definition Defines.php:223
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
wfMkdirParents( $dir, $mode=null, $caller=null)
Make directory, and make all parent directories if they don't exist.
Maintenance script that generates a sitemap for the site.
string $timestamp
When this sitemap batch was generated.
string $fspath
The path to prepend to the filename.
IReadableDatabase $dbr
A database replica DB object.
bool $skipRedirects
Whether to leave redirect pages out of the sitemap.
__construct()
Default constructor.
resource $indexFile
A resource pointing to the sitemap index file.
bool $compress
Whether or not to use compression.
array $namespaces
A one-dimensional array of namespaces in the wiki.
string $urlpath
The URL path to prepend to filenames in the index; should resolve to the same directory as $fspath.
int $url_limit
The maximum number of URLs in a sitemap file.
resource|false $file
A resource pointing to a sitemap file.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
output( $out, $channel=null)
Throw some output to the user.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
getReplicaDB(string|false $virtualDomain=false)
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Utility for generating a sitemap.
Tools for dealing with other locally-hosted wikis.
Definition WikiMap.php:19
A database connection without write operations.