MediaWiki  1.23.2
generateSitemap.php
Go to the documentation of this file.
1 <?php
29 require_once __DIR__ . '/Maintenance.php';
30 
/**
 * Maintenance script that writes a sitemap index file plus one or more
 * sitemap files per namespace, listing the canonical URL of every page.
 *
 * @ingroup Maintenance
 */
class GenerateSitemap extends Maintenance {
	/**
	 * Key into $priorities holding the fallback priority used by
	 * guessPriority() for subject namespaces without an explicit entry.
	 */
	const GS_MAIN = -2;

	/**
	 * Key into $priorities holding the fallback priority used by
	 * guessPriority() for non-subject (talk) namespaces without an explicit entry.
	 */
	const GS_TALK = -1;

	/**
	 * Maximum number of page rows written to a single sitemap file
	 * before a new file is started (set in execute()).
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Maximum length in bytes of a single sitemap file, measured on the
	 * generated XML text before any compression (set in execute()).
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * Directory the files are written to, including a trailing
	 * directory separator.
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * URL prefix prepended to each sitemap filename in the index file;
	 * either empty or ending in '/'.
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether the sitemap files are written through the gz* functions.
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Whether rows flagged page_is_redirect are left out of the sitemap.
	 *
	 * @var bool
	 */
	public $skipRedirects;

	/**
	 * Lengths computed by generateLimit(): [0] the file header,
	 * [1] an entry built from a maximum-length title, [2] the file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Map of namespace index => sitemap priority.
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * Namespace indexes to generate sitemaps for.
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp written as <lastmod> of every index entry.
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database connection obtained from wfGetDB( DB_SLAVE ).
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file.
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false when
	 * no file is open for the current namespace.
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Site identifier used in the generated file names.
	 *
	 * @var string
	 */
	private $identifier;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
		$this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
		$this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
		$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
		$this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
	}

	/**
	 * Read the options, set up limits and paths, open the index file
	 * and run the generation.
	 *
	 * @throws MWException If the index file cannot be opened for writing
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
		$this->urlpath = $this->getOption( 'urlpath', "" );
		// Make sure a non-empty URL path can be directly followed by a filename.
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}
		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		// Anything other than an explicit "no" keeps compression on.
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		// Fail early, the same way open() does, instead of writing to a false handle.
		if ( $this->findex === false ) {
			throw new MWException( __METHOD__ . " error opening file $indexPath with flags wb. Check permissions?" );
		}

		$this->main();
	}

	/**
	 * Fill $priorities with the built-in defaults and then apply the
	 * overrides from $wgSitemapNamespacesPriorities, clamping numeric
	 * values to the range 0.0 - 1.0.
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities; only an array can be iterated, so anything else
		// (false, null, ...) means "no overrides".
		if ( is_array( $wgSitemapNamespacesPriorities ) ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Create the target directory if needed and return its resolved path
	 * with a trailing directory separator.
	 *
	 * The script is terminated with a message if the directory can not be
	 * created or resolved.
	 *
	 * @param string $fspath Directory to write the sitemap files to
	 * @return string|null Resolved path, or null if $fspath is not set
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}
		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
		}

		$resolved = realpath( $fspath );
		// realpath() returns false on failure; appending the separator to that
		// would silently turn the target into the filesystem root.
		if ( $resolved === false ) {
			die( "Can not resolve directory $fspath.\n" );
		}

		return $resolved . DIRECTORY_SEPARATOR;
	}

	/**
	 * Populate $namespaces, either from $wgSitemapNamespaces when it is an
	 * array or from the distinct page_namespace values in the page table.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a namespace, falling back to guessPriority()
	 * when $priorities has no entry for it.
	 *
	 * @param int $namespace Namespace index
	 * @return string Priority as stored in $priorities
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
	}

	/**
	 * Pick the generic priority for a namespace without an explicit entry:
	 * the GS_MAIN value for subject namespaces, the GS_TALK value otherwise.
	 *
	 * @param int $namespace Namespace index
	 * @return string Priority as stored in $priorities
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isSubject( $namespace ) ? $this->priorities[self::GS_MAIN] : $this->priorities[self::GS_TALK];
	}

	/**
	 * Select namespace, title, touched timestamp and redirect flag of
	 * every page in a namespace.
	 *
	 * @param int $namespace Namespace index
	 * @return mixed Result of the select() call on $dbr
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
				'page_is_redirect'
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Write the index file and, for each namespace, the sitemap files.
	 *
	 * A new sitemap file is started for the first page of a namespace,
	 * after $url_limit page rows, or when another maximum-length entry
	 * plus the footer would exceed $size_limit.
	 *
	 * @throws MWException If a sitemap file cannot be opened or written
	 */
	public function main() {
		// The content language provides namespace names and language variants;
		// without this declaration $wgContLang would be an undefined local.
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$i = $smcount = 0;
			// The priority only depends on the namespace, so look it up once.
			$priority = $this->priority( $namespace );

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );
			$skippedRedirects = 0; // Number of redirects skipped for that namespace
			foreach ( $res as $row ) {
				if ( $this->skipRedirects && $row->page_is_redirect ) {
					$skippedRedirects++;
					continue;
				}

				// $i counts page rows in the current file. Variant entries written
				// below add to $length but are not counted in $i.
				if ( $i++ === 0 || $i === $this->url_limit + 1 || $length + $this->limit[1] + $this->limit[2] > $this->size_limit ) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$i = 1;
				}
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entry = $this->fileEntry( $title->getCanonicalURL(), $date, $priority );
				$length += strlen( $entry );
				$this->write( $this->file, $entry );
				// generate pages for language variants
				if ( $wgContLang->hasVariants() ) {
					$variants = $wgContLang->getVariants();
					foreach ( $variants as $vCode ) {
						if ( $vCode == $wgContLang->getCode() ) {
							continue; // we don't want default variant
						}
						$entry = $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $priority );
						$length += strlen( $entry );
						$this->write( $this->file, $entry );
					}
				}
			}

			if ( $this->skipRedirects && $skippedRedirects > 0 ) {
				$this->output( " skipped $skippedRedirects redirect(s)\n" );
			}

			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Open a file with gzopen() or fopen(), depending on $compress.
	 *
	 * @param string $file Path of the file to open
	 * @param string $flags Mode passed to gzopen()/fopen()
	 * @return resource
	 * @throws MWException If the file cannot be opened
	 */
	public function open( $file, $flags ) {
		$resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
		if ( $resource === false ) {
			throw new MWException( __METHOD__ . " error opening file $file with flags $flags. Check permissions?" );
		}
		return $resource;
	}

	/**
	 * Write a string with gzwrite() or fwrite(), depending on $compress.
	 *
	 * @param resource $handle Handle returned by open()
	 * @param string $str Data to write
	 * @throws MWException If $handle is a boolean instead of a handle
	 */
	public function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Close a handle with gzclose() or fclose(), depending on $compress.
	 *
	 * @param resource $handle Handle returned by open()
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Build the name of a sitemap file; '.gz' is appended when $compress is on.
	 *
	 * @param int $namespace Namespace index
	 * @param int $count Sequence number of the file within the namespace
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML declaration line.
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML namespace URI used for the root elements.
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the XML that opens the sitemap index file.
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML of one <sitemap> entry of the index file.
	 *
	 * @param string $filename Name of the sitemap file the entry points to
	 * @return string
	 */
	public function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			// Escaped like the <loc> in fileEntry(): the URL path may contain
			// characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $this->urlpath . $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML that closes the sitemap index file.
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML that opens a sitemap file.
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML of one <url> entry of a sitemap file.
	 *
	 * @param string $url URL of the page
	 * @param string $date Value written as <lastmod>
	 * @param string $priority Value written as <priority>
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// bug 34666: $url may contain bad characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML that closes a sitemap file.
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Fill $limit with the lengths of the file header, of an entry built
	 * from a maximum-length title in the given namespace, and of the file footer.
	 *
	 * @param int $namespace Namespace index
	 */
	public function generateLimit( $namespace ) {
		// bug 17961: make a title with the longest possible URL in this namespace
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getCanonicalURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
522 
// Name the maintenance class defined in this file, then include the runner.
// NOTE(review): presumably the runner instantiates $maintClass and calls
// execute() only when this file is the script being run - confirm in Maintenance.php.
$maintClass = 'GenerateSitemap';
require_once RUN_MAINTENANCE_IF_MAIN;
Title\makeTitle
static & makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:398
NS_HELP
const NS_HELP
Definition: Defines.php:91
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
wfMkdirParents
wfMkdirParents( $dir, $mode=null, $caller=null)
Make directory, and make all parent directories if they don't exist.
Definition: GlobalFunctions.php:2590
NS_TEMPLATE_TALK
const NS_TEMPLATE_TALK
Definition: Defines.php:90
wfGetDB
& wfGetDB( $db, $groups=array(), $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:3650
$timestamp
if( $limit) $timestamp
Definition: importImages.php:104
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:2483
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false)
Add a parameter to the script.
Definition: Maintenance.php:169
NS_FILE
const NS_FILE
Definition: Defines.php:85
RUN_MAINTENANCE_IF_MAIN
require_once RUN_MAINTENANCE_IF_MAIN
Definition: maintenance.txt:50
$limit
if( $sleep) $limit
Definition: importImages.php:99
NS_TEMPLATE
const NS_TEMPLATE
Definition: Defines.php:89
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: maintenance.txt:39
$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the content language as $wgContLang
Definition: design.txt:56
Makefile.open
open
Definition: Makefile.py:14
$flags
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition: hooks.txt:2113
$maintClass
$maintClass
Definition: attachLatest.php:91
$dbr
$dbr
Definition: testCompression.php:48
NS_MAIN
const NS_MAIN
Definition: Defines.php:79
file
We ve cleaned up the code here by removing clumps of infrequently used code and moving them off somewhere else It s much easier for someone working with this code to see what s _really_ going and make changes or fix bugs In we can take all the code that deals with the little used title reversing we can concentrate it all in an extension file
Definition: hooks.txt:93
namespaces
to move a page</td >< td > &*You are moving the page across namespaces
Definition: All_system_messages.txt:2677
MWException
MediaWiki exception.
Definition: MWException.php:26
Maintenance::__construct
public function __construct()
Definition: maintenance.txt:41
NS_PROJECT
const NS_PROJECT
Definition: Defines.php:83
TS_ISO_8601
const TS_ISO_8601
ISO 8601 format with no timezone: 1986-02-09T20:00:00Z.
Definition: GlobalFunctions.php:2448
NS_MEDIAWIKI_TALK
const NS_MEDIAWIKI_TALK
Definition: Defines.php:88
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:2514
NS_CATEGORY
const NS_CATEGORY
Definition: Defines.php:93
MWNamespace\isSubject
static isSubject( $index)
Is the given namespace a subject (non-talk) namespace?
Definition: Namespace.php:87
wfWikiID
wfWikiID()
Get an ASCII string identifying this wiki. This is used as a prefix in memcached keys.
Definition: GlobalFunctions.php:3604
$title
presenting them properly to the user as errors is done by the caller $title
Definition: hooks.txt:1324
NS_USER_TALK
const NS_USER_TALK
Definition: Defines.php:82
cssjanus.main
def main(argv)
Definition: cssjanus.py:554
NS_PROJECT_TALK
const NS_PROJECT_TALK
Definition: Defines.php:84
$file
if(PHP_SAPI !='cli') $file
Definition: UtfNormalTest2.php:30
$count
$count
Definition: UtfNormalTest2.php:96
DB_SLAVE
const DB_SLAVE
Definition: Defines.php:55
$namespaces
namespace and then decline to actually register it & $namespaces
Definition: hooks.txt:815
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:191
NS_HELP_TALK
const NS_HELP_TALK
Definition: Defines.php:92
$ext
$ext
Definition: NoLocalSettings.php:34
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
NS_USER
const NS_USER
Definition: Defines.php:81
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:314
NS_TALK
const NS_TALK
Definition: Defines.php:80
NS_MEDIAWIKI
const NS_MEDIAWIKI
Definition: Defines.php:87
NS_FILE_TALK
const NS_FILE_TALK
Definition: Defines.php:86
NS_CATEGORY_TALK
const NS_CATEGORY_TALK
Definition: Defines.php:94
Maintenance::execute
public function execute()
Definition: maintenance.txt:45
$res
$res
Definition: database.txt:21