Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 202 |
|
0.00% |
0 / 21 |
CRAP | |
0.00% |
0 / 1 |
GenerateSitemap | |
0.00% |
0 / 199 |
|
0.00% |
0 / 21 |
3080 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
20 | |||
setNamespacePriorities | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
30 | |||
generateNamespaces | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
priority | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
guessPriority | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getPageRes | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
main | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
306 | |||
open | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
write | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
close | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
sitemapFilename | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
xmlHead | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
xmlSchema | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
openIndex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
indexEntry | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
closeIndex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
openFile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fileEntry | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
closeFile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
generateLimit | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * Creates a sitemap for the site. |
4 | * |
5 | * Copyright © 2005, Ævar Arnfjörð Bjarmason, Jens Frank <jeluf@gmx.de> and |
6 | * Brooke Vibber <bvibber@wikimedia.org> |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Maintenance |
25 | * @see http://www.sitemaps.org/ |
26 | * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd |
27 | */ |
28 | |
29 | use MediaWiki\MainConfigNames; |
30 | use MediaWiki\Title\Title; |
31 | use MediaWiki\WikiMap\WikiMap; |
32 | use Wikimedia\Rdbms\IDatabase; |
33 | use Wikimedia\Rdbms\IResultWrapper; |
34 | |
35 | require_once __DIR__ . '/Maintenance.php'; |
36 | |
37 | /** |
38 | * Maintenance script that generates a sitemap for the site. |
39 | * |
40 | * @ingroup Maintenance |
41 | */ |
42 | class GenerateSitemap extends Maintenance { |
43 | private const GS_MAIN = -2; |
44 | private const GS_TALK = -1; |
45 | |
46 | /** |
47 | * The maximum amount of urls in a sitemap file |
48 | * |
49 | * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd |
50 | * |
51 | * @var int |
52 | */ |
53 | public $url_limit; |
54 | |
55 | /** |
56 | * The maximum size of a sitemap file |
57 | * |
58 | * @link http://www.sitemaps.org/faq.php#faq_sitemap_size |
59 | * |
60 | * @var int |
61 | */ |
62 | public $size_limit; |
63 | |
64 | /** |
65 | * The path to prepend to the filename |
66 | * |
67 | * @var string |
68 | */ |
69 | public $fspath; |
70 | |
71 | /** |
72 | * The URL path to prepend to filenames in the index; |
73 | * should resolve to the same directory as $fspath. |
74 | * |
75 | * @var string |
76 | */ |
77 | public $urlpath; |
78 | |
79 | /** |
80 | * Whether or not to use compression |
81 | * |
82 | * @var bool |
83 | */ |
84 | public $compress; |
85 | |
86 | /** |
87 | * Whether or not to skip redirect pages |
88 | * |
89 | * @var bool |
90 | */ |
91 | public $skipRedirects; |
92 | |
93 | /** |
94 | * Byte lengths used for size accounting: [ file header, longest possible entry, file footer ] |
95 | * |
96 | * @var array |
97 | */ |
98 | public $limit = []; |
99 | |
100 | /** |
101 | * Key => value entries of namespaces and their priorities |
102 | * |
103 | * @var array |
104 | */ |
105 | public $priorities = []; |
106 | |
107 | /** |
108 | * A one-dimensional array of namespaces in the wiki |
109 | * |
110 | * @var array |
111 | */ |
112 | public $namespaces = []; |
113 | |
114 | /** |
115 | * When this sitemap batch was generated |
116 | * |
117 | * @var string |
118 | */ |
119 | public $timestamp; |
120 | |
121 | /** |
122 | * A database replica DB object |
123 | * |
124 | * @var IDatabase |
125 | */ |
126 | public $dbr; |
127 | |
128 | /** |
129 | * A resource pointing to the sitemap index file |
130 | * |
131 | * @var resource |
132 | */ |
133 | public $findex; |
134 | |
135 | /** |
136 | * A resource pointing to a sitemap file |
137 | * |
138 | * @var resource|false |
139 | */ |
140 | public $file; |
141 | |
142 | /** |
143 | * Identifier to use in filenames, default $wgDBname |
144 | * |
145 | * @var string |
146 | */ |
147 | private $identifier; |
148 | |
/**
 * Register the script description and the command-line options it accepts.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Creates a sitemap for the site' );

	// Readable names for the two positional flags passed to addOption().
	$notRequired = false;
	$takesValue = true;

	$fspathHelp = 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory';
	$this->addOption( 'fspath', $fspathHelp, $notRequired, $takesValue );

	$urlpathHelp = 'The URL path corresponding to --fspath, prepended to filenames in the index; '
		. 'defaults to an empty string';
	$this->addOption( 'urlpath', $urlpathHelp, $notRequired, $takesValue );

	$compressHelp = 'Compress the sitemap files, can take value yes|no, default yes';
	$this->addOption( 'compress', $compressHelp, $notRequired, $takesValue );

	// Plain switch: registered with the name and help text only.
	$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );

	$identifierHelp = 'What site identifier to use for the wiki, defaults to $wgDBname';
	$this->addOption( 'identifier', $identifierHelp, $notRequired, $takesValue );
}
179 | |
/**
 * Entry point: read the command-line options, prepare the output directory
 * and the sitemap index file, then hand over to main().
 *
 * The per-file limits are fixed here: 50000 URLs and 10 * 2^20 bytes.
 *
 * Calls fatalError() when the output directory cannot be created or resolved,
 * or when the sitemap index file cannot be opened for writing. fatalError() is
 * relied on to stop the script, exactly as the directory-creation check does.
 */
public function execute() {
	$this->setNamespacePriorities();
	$this->url_limit = 50000;
	$this->size_limit = ( 2 ** 20 ) * 10;

	# Create directory if needed
	$fspath = $this->getOption( 'fspath', getcwd() );
	if ( !wfMkdirParents( $fspath, null, __METHOD__ ) ) {
		$this->fatalError( "Can not create directory $fspath." );
	}

	$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();

	// realpath() returns false on failure; concatenating that would silently
	// yield "/" as the output directory, so stop instead.
	$resolvedPath = realpath( $fspath );
	if ( $resolvedPath === false ) {
		$this->fatalError( "Can not resolve directory $fspath." );
	}
	$this->fspath = $resolvedPath . DIRECTORY_SEPARATOR;

	// Make sure a non-empty URL path ends with a slash so filenames can be appended.
	$this->urlpath = $this->getOption( 'urlpath', "" );
	if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
		$this->urlpath .= '/';
	}

	$this->identifier = $this->getOption( 'identifier', $dbDomain );
	// Anything other than an explicit "no" enables compression.
	$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
	$this->skipRedirects = $this->hasOption( 'skip-redirects' );
	$this->dbr = $this->getReplicaDB();
	$this->generateNamespaces();
	$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

	$encIdentifier = rawurlencode( $this->identifier );
	$indexPath = "{$this->fspath}sitemap-index-{$encIdentifier}.xml";
	$this->findex = fopen( $indexPath, 'wb' );
	// Without this check a failed fopen() would only surface later as
	// fwrite() being handed false inside main().
	if ( $this->findex === false ) {
		$this->fatalError( "Can not open sitemap index file $indexPath for writing." );
	}

	$this->main();
}
210 | |
/**
 * Fill $this->priorities with the built-in defaults, then apply the
 * overrides from the SitemapNamespacesPriorities setting, if any.
 *
 * Configured values above 1.0 or below 0.0 are replaced by '1.0' and '0.0'
 * respectively; values inside that range are stored as configured.
 */
private function setNamespacePriorities() {
	$configured = $this->getConfig()->get( MainConfigNames::SitemapNamespacesPriorities );

	$defaults = [
		// Custom main namespaces
		self::GS_MAIN => '0.5',
		// Custom talk namespaces
		self::GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_FILE => '0.5',
		NS_FILE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	];
	foreach ( $defaults as $ns => $value ) {
		$this->priorities[$ns] = $value;
	}

	// Custom priorities
	if ( $configured === false ) {
		return;
	}

	/**
	 * @var array $configured
	 */
	foreach ( $configured as $ns => $value ) {
		$asFloat = floatval( $value );
		// Clamp out-of-range values; keep in-range values exactly as configured.
		$this->priorities[$ns] = $asFloat > 1.0
			? '1.0'
			: ( $asFloat < 0.0 ? '0.0' : $value );
	}
}
252 | |
/**
 * Populate $this->namespaces with the namespaces to generate sitemaps for.
 *
 * When the SitemapNamespaces setting is an array it is used as-is; otherwise
 * the distinct page_namespace values found in the page table are appended,
 * in ascending order.
 */
private function generateNamespaces() {
	$configured = $this->getConfig()->get( MainConfigNames::SitemapNamespaces );

	if ( !is_array( $configured ) ) {
		// No explicit list configured: ask the database which namespaces hold pages.
		$query = $this->dbr->newSelectQueryBuilder()
			->select( [ 'page_namespace' ] )
			->from( 'page' )
			->groupBy( 'page_namespace' )
			->orderBy( 'page_namespace' )
			->caller( __METHOD__ );

		foreach ( $query->fetchResultSet() as $nsRow ) {
			$this->namespaces[] = $nsRow->page_namespace;
		}

		return;
	}

	$this->namespaces = $configured;
}
276 | |
/**
 * Look up the sitemap priority for a namespace, falling back to
 * guessPriority() when no entry is set for it.
 *
 * @param int $namespace The namespace to get the priority for
 * @return string
 */
private function priority( $namespace ) {
	if ( isset( $this->priorities[$namespace] ) ) {
		return $this->priorities[$namespace];
	}

	return $this->guessPriority( $namespace );
}
286 | |
/**
 * Default priority for a namespace that has no entry in the priority list:
 * the GS_MAIN value for subject namespaces, the GS_TALK value for all others.
 *
 * @param int $namespace The namespace to get the priority for
 * @return string
 */
private function guessPriority( $namespace ) {
	$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();

	if ( $namespaceInfo->isSubject( $namespace ) ) {
		return $this->priorities[self::GS_MAIN];
	}

	return $this->priorities[self::GS_TALK];
}
300 | |
/**
 * Fetch every page of one namespace, left-joined with its 'noindex' page
 * property so callers can tell which pages carry it.
 *
 * @param int $namespace Limit the query to this namespace
 * @return IResultWrapper
 */
private function getPageRes( $namespace ) {
	$fields = [ 'page_namespace', 'page_title', 'page_touched', 'page_is_redirect', 'pp_propname' ];
	// Only the 'noindex' property row is joined; pp_propname is NULL for pages without it.
	$noindexJoin = [ 'page_id = pp_page', 'pp_propname' => 'noindex' ];

	$query = $this->dbr->newSelectQueryBuilder()
		->select( $fields )
		->from( 'page' )
		->leftJoin( 'page_props', null, $noindexJoin )
		->where( [ 'page_namespace' => $namespace ] )
		->caller( __METHOD__ );

	return $query->fetchResultSet();
}
315 | |
/**
 * Main loop: write the sitemap index and, for every namespace, one or more
 * sitemap files listing its pages.
 *
 * Pages carrying the 'noindex' page property are always left out; redirects
 * are left out when $this->skipRedirects is set. Each remaining page
 * contributes its canonical URL plus one URL per non-default language variant.
 *
 * A new sitemap file is started whenever adding all entries of the next page
 * would push the current file past $this->url_limit entries or
 * $this->size_limit bytes. Variant entries count toward both limits.
 */
public function main() {
	$services = $this->getServiceContainer();
	$contLang = $services->getContentLanguage();
	$langConverter = $services->getLanguageConverterFactory()->getLanguageConverter( $contLang );

	// The non-default variants are the same for every page, so collect them once.
	$variantCodes = [];
	if ( $langConverter->hasVariants() ) {
		$defaultCode = $contLang->getCode();
		foreach ( $langConverter->getVariants() as $vCode ) {
			if ( $vCode != $defaultCode ) {
				$variantCodes[] = $vCode;
			}
		}
	}

	fwrite( $this->findex, $this->openIndex() );

	foreach ( $this->namespaces as $namespace ) {
		$res = $this->getPageRes( $namespace );
		$this->file = false;
		$this->generateLimit( $namespace );
		// Bytes and URL entries written to the current sitemap file so far.
		$length = $this->limit[0];
		$urlCount = 0;
		// Running number of the sitemap file within this namespace.
		$smcount = 0;
		$priority = $this->priority( $namespace );

		$fns = $contLang->getFormattedNsText( $namespace );
		$this->output( "$namespace ($fns)\n" );
		$skippedRedirects = 0; // Number of redirects skipped for that namespace
		$skippedNoindex = 0; // Number of pages with __NOINDEX__ switch for that NS
		foreach ( $res as $row ) {
			if ( $row->pp_propname === 'noindex' ) {
				$skippedNoindex++;
				continue;
			}

			if ( $this->skipRedirects && $row->page_is_redirect ) {
				$skippedRedirects++;
				continue;
			}

			// Build every entry for this page first so the limit checks below
			// see the real number of URLs and bytes about to be written.
			$title = Title::makeTitle( $row->page_namespace, $row->page_title );
			$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
			$entries = [ $this->fileEntry( $title->getCanonicalURL(), $date, $priority ) ];
			// generate pages for language variants
			foreach ( $variantCodes as $vCode ) {
				$entries[] = $this->fileEntry(
					$title->getCanonicalURL( [ 'variant' => $vCode ] ),
					$date,
					$priority
				);
			}
			$pageText = implode( '', $entries );
			$pageLength = strlen( $pageText );
			$pageUrls = count( $entries );

			// Start a new file for the first page of the namespace, or when this
			// page would overflow the URL limit or the size limit (keeping room
			// for the closing tag).
			if ( $this->file === false
				|| $urlCount + $pageUrls > $this->url_limit
				|| $length + $pageLength + $this->limit[2] > $this->size_limit
			) {
				if ( $this->file !== false ) {
					$this->write( $this->file, $this->closeFile() );
					$this->close( $this->file );
				}
				$filename = $this->sitemapFilename( $namespace, $smcount++ );
				$this->file = $this->open( $this->fspath . $filename, 'wb' );
				$this->write( $this->file, $this->openFile() );
				fwrite( $this->findex, $this->indexEntry( $filename ) );
				$this->output( "\t$this->fspath$filename\n" );
				$length = $this->limit[0];
				$urlCount = 0;
			}

			$this->write( $this->file, $pageText );
			$length += $pageLength;
			$urlCount += $pageUrls;
		}

		if ( $skippedNoindex > 0 ) {
			$this->output( "  skipped $skippedNoindex page(s) with __NOINDEX__ switch\n" );
		}

		if ( $this->skipRedirects && $skippedRedirects > 0 ) {
			$this->output( "  skipped $skippedRedirects redirect(s)\n" );
		}

		// Finish the last file of this namespace, if any page was written.
		if ( $this->file ) {
			$this->write( $this->file, $this->closeFile() );
			$this->close( $this->file );
		}
	}
	fwrite( $this->findex, $this->closeIndex() );
	fclose( $this->findex );
}
403 | |
/**
 * Open a sitemap file with gzopen() when compression is enabled, with
 * fopen() otherwise.
 *
 * @param string $file
 * @param string $flags
 * @return resource
 * @throws RuntimeException When the file cannot be opened
 */
private function open( $file, $flags ) {
	if ( $this->compress ) {
		$handle = gzopen( $file, $flags );
	} else {
		$handle = fopen( $file, $flags );
	}

	if ( $handle !== false ) {
		return $handle;
	}

	throw new RuntimeException( __METHOD__
		. " error opening file $file with flags $flags. Check permissions?" );
}
420 | |
/**
 * Write a string to a sitemap file with gzwrite() when compression is
 * enabled, with fwrite() otherwise.
 *
 * @param resource &$handle
 * @param string $str
 * @throws InvalidArgumentException When $handle is a boolean instead of a handle
 */
private function write( &$handle, $str ) {
	if ( is_bool( $handle ) ) {
		throw new InvalidArgumentException( __METHOD__ . " was passed a boolean as a file handle.\n" );
	}

	if ( !$this->compress ) {
		fwrite( $handle, $str );

		return;
	}

	gzwrite( $handle, $str );
}
437 | |
/**
 * Close a sitemap file with gzclose() when compression is enabled, with
 * fclose() otherwise.
 *
 * @param resource &$handle
 */
private function close( &$handle ) {
	if ( !$this->compress ) {
		fclose( $handle );

		return;
	}

	gzclose( $handle );
}
450 | |
/**
 * Build the name of one sitemap file from the site identifier, the namespace
 * and the file's running number; ".gz" is appended when compression is on.
 *
 * @param int $namespace
 * @param int $count
 * @return string
 */
private function sitemapFilename( $namespace, $count ) {
	$name = 'sitemap-' . $this->identifier . '-NS_' . $namespace . '-' . $count . '.xml';
	if ( $this->compress ) {
		$name .= '.gz';
	}

	return $name;
}
463 | |
/**
 * The XML declaration line that starts every generated file.
 *
 * @return string
 */
private function xmlHead() {
	return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
}
472 | |
/**
 * The XML namespace URI placed in the xmlns attribute of the generated files.
 *
 * @return string
 */
private function xmlSchema() {
	$schemaUri = 'http://www.sitemaps.org/schemas/sitemap/0.9';

	return $schemaUri;
}
481 | |
/**
 * The opening of a sitemap index file: XML declaration followed by the
 * <sitemapindex> start tag.
 *
 * @return string
 */
private function openIndex() {
	$head = $this->xmlHead();
	$schema = $this->xmlSchema();

	return "{$head}<sitemapindex xmlns=\"{$schema}\">\n";
}
490 | |
/**
 * Return the XML for a single sitemap index file entry.
 *
 * The location is the canonical server URL, a "/" separator unless
 * $this->urlpath already starts with one, $this->urlpath and the filename.
 * It is XML-escaped in the same way fileEntry() escapes page URLs, because
 * both the URL path and the identifier embedded in the filename come from
 * command-line options and may contain characters such as ampersands.
 *
 * @param string $filename The filename of the sitemap file
 * @return string
 */
private function indexEntry( $filename ) {
	$separator = substr( $this->urlpath, 0, 1 ) === "/" ? "" : "/";
	$location = wfGetServerUrl( PROTO_CANONICAL ) . $separator . $this->urlpath . $filename;

	return "\t<sitemap>\n" .
		"\t\t<loc>" . htmlspecialchars( $location ) . "</loc>\n" .
		"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
		"\t</sitemap>\n";
}
505 | |
/**
 * The closing tag that ends a sitemap index file.
 *
 * @return string
 */
private function closeIndex() {
	$closingTag = '</sitemapindex>';

	return $closingTag . "\n";
}
514 | |
/**
 * The opening of a sitemap file: XML declaration followed by the <urlset>
 * start tag.
 *
 * @return string
 */
private function openFile() {
	$head = $this->xmlHead();
	$schema = $this->xmlSchema();

	return "{$head}<urlset xmlns=\"{$schema}\">\n";
}
523 | |
/**
 * Build the <url> element for one page of a sitemap file.
 *
 * @param string $url An RFC 2396 compliant URL
 * @param string $date A ISO 8601 date
 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
 * @return string
 */
private function fileEntry( $url, $date, $priority ) {
	// T36666: $url may contain bad characters such as ampersands.
	$escapedUrl = htmlspecialchars( $url );

	return "\t<url>\n"
		. "\t\t<loc>{$escapedUrl}</loc>\n"
		. "\t\t<lastmod>{$date}</lastmod>\n"
		. "\t\t<priority>{$priority}</priority>\n"
		. "\t</url>\n";
}
540 | |
/**
 * The closing tag that ends a sitemap file.
 *
 * @return string
 */
private function closeFile() {
	$closingTag = '</urlset>';

	return $closingTag . "\n";
}
549 | |
/**
 * Populate $this->limit with three byte lengths for the given namespace:
 * the file header, a sample entry built from a maximally long title, and
 * the file footer.
 *
 * @param int $namespace
 */
private function generateLimit( $namespace ) {
	// T19961: make a title with the longest possible URL in this namespace
	$longestText = str_repeat( "\u{28B81}", 63 ) . "\u{5583}";
	$title = Title::makeTitle( $namespace, $longestText );

	$headerLength = strlen( $this->openFile() );

	$sampleEntry = $this->fileEntry(
		$title->getCanonicalURL(),
		wfTimestamp( TS_ISO_8601, wfTimestamp() ),
		$this->priority( $namespace )
	);

	$footerLength = strlen( $this->closeFile() );

	$this->limit = [ $headerLength, strlen( $sampleEntry ), $footerLength ];
}
569 | } |
570 | |
// Store this script's class name in $maintClass, then load the file named by
// the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that file instantiates $maintClass and runs it only
// when this script is the entry point — confirm in Maintenance.php.
571 | $maintClass = GenerateSitemap::class; |
572 | require_once RUN_MAINTENANCE_IF_MAIN; |