#!/usr/bin/env python3
"""This bot downloads a dump from dumps.wikimedia.org.
This script supports the following command line parameters:
-filename:# The name of the file (e.g. abstract.xml)
-storepath:# The stored file's path.
-dumpdate:# The date of the dump, formatted as YYYYMMDD
(defaults to `latest`).
.. note:: This script is a
:class:`ConfigParserBot<bot.ConfigParserBot>`. All options can be set
within a settings file which is scripts.ini by default.
.. versionadded:: 3.0.20180108
"""
#
# (C) Pywikibot team, 2017-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations
import binascii
import os.path
from http import HTTPStatus
from os import remove, replace, symlink, urandom
import pywikibot
from pywikibot.bot import Bot, ConfigParserBot
from pywikibot.comms.http import fetch
class DownloadDumpBot(Bot, ConfigParserBot):

    """Download dump bot.

    Stores one dump file of a wiki below ``storepath``. If the dump is
    found in the local ``/public/dumps`` tree it is symlinked, otherwise
    it is downloaded from dumps.wikimedia.org.

    .. versionchanged:: 7.0
       DownloadDumpBot is a ConfigParserBot
    """

    # Default values of the options read through ``self.opt``.
    available_options = {
        'wikiname': '',
        'filename': '',
        'storepath': './',
        'dumpdate': 'latest',
    }

    @staticmethod
    def get_dump_name(db_name: str, typ: str, dumpdate: str) -> str | None:
        """Check if dump file exists locally in a Toolforge server.

        :param db_name: database name of the wiki; used as directory
            name and as file name prefix
        :param typ: file name suffix of the dump (e.g. abstract.xml)
        :param dumpdate: directory name of the dump date, or ``latest``
            to pick the newest numeric directory holding the file
        :return: path of the local dump file, or None if the wiki has
            no local dump directory or no matching file exists
        """
        db_path = f'/public/dumps/public/{db_name}/'
        if not os.path.isdir(db_path):
            return None

        dump_filepath_template = (
            '/public/dumps/public/{db_name}/{date}/{db_name}-{date}-{typ}')

        if dumpdate != 'latest':
            candidates = [dumpdate]
        else:
            # Search for the "latest" dump: only all-digit directory
            # names are dates; try them from newest to oldest.
            candidates = sorted(
                (int(directory) for directory in os.listdir(db_path)
                 if directory.isdigit()),
                reverse=True)

        for date in candidates:
            dump_filepath = dump_filepath_template.format(
                db_name=db_name, date=date, typ=typ)
            if os.path.isfile(dump_filepath):
                return dump_filepath
        return None

    @staticmethod
    def _convert_from_bytes(total_bytes: float) -> str:
        """Return *total_bytes* as a short string with a unit suffix.

        The value is divided by 1024 (rounded to two decimals) until it
        is below 1024; the suffix is one of B, K, M, G, T or P.
        """
        for unit in ('B', 'K', 'M', 'G', 'T'):
            if abs(total_bytes) < 1024:
                return str(total_bytes) + unit
            total_bytes = float(format(total_bytes / 1024.0, '.2f'))
        return str(total_bytes) + 'P'

    def _store_response(self, response, filepath: str) -> None:
        """Write the streamed *response* to *filepath* with a progress bar.

        :param response: response object returned by ``fetch`` with
            ``stream=True``
        :param filepath: path of the file to (over)write
        :raises OSError: if the file cannot be opened or written
        """
        parts = 50  # width of the progress bar in characters

        with open(filepath, 'wb') as result_file:
            # The header is not guaranteed to be present. Fall back to
            # -1 so that the warning below is shown and the progress bar
            # is skipped, instead of failing with a KeyError.
            total = int(response.headers.get('content-length', -1))
            if total == -1:
                pywikibot.warning("'content-length' missing in "
                                  'response headers')
            downloaded = 0
            display_string = ''

            pywikibot.info()
            for data in response.iter_content(100 * 1024):
                result_file.write(data)

                # No progress bar without a known total size
                if total <= 0:
                    continue

                downloaded += len(data)
                done = int(parts * downloaded / total)
                display = map(self._convert_from_bytes,
                              (downloaded, total))
                prior_display = display_string
                display_string = '\r|{}{}|{}{}/{}'.format(
                    '=' * done,
                    '-' * (parts - done),
                    ' ' * 5,
                    *display)
                # Add whitespace to cover up prior bar
                display_string += ' ' * (
                    len(prior_display.rstrip())
                    - len(display_string.rstrip()))

                pywikibot.info(display_string, newline=False)
            pywikibot.info()

    def run(self) -> None:
        """Run bot.

        The file is first created under a temporary ``.part`` name and
        renamed to its final name on success. If that fails with an
        :exc:`OSError`, a second attempt writes directly to the final
        name. Nothing is stored if the server does not answer with
        status 200.
        """
        pywikibot.info('Downloading dump from ' + self.opt.wikiname)

        download_filename = '{wikiname}-{dumpdate}-{filename}'.format_map(
            self.opt)
        # Random suffix keeps the temporary name from clashing with an
        # existing file in the store path.
        temp_filename = download_filename + '-' \
            + binascii.b2a_hex(urandom(8)).decode('ascii') + '.part'

        file_final_storepath = os.path.join(
            self.opt.storepath, download_filename)
        file_current_storepath = os.path.join(
            self.opt.storepath, temp_filename)

        # https://wikitech.wikimedia.org/wiki/Help:Toolforge/Dumps
        toolforge_dump_filepath = self.get_dump_name(
            self.opt.wikiname, self.opt.filename, self.opt.dumpdate)

        # First iteration for atomic download with temporary file
        # Second iteration for fallback non-atomic download
        for non_atomic in range(2):
            try:
                if toolforge_dump_filepath:
                    pywikibot.info('Symlinking file from '
                                   + toolforge_dump_filepath)
                    # symlink() does not overwrite; in the fallback pass
                    # the link is created at the final path directly.
                    if non_atomic and os.path.exists(file_final_storepath):
                        remove(file_final_storepath)
                    symlink(toolforge_dump_filepath, file_current_storepath)
                else:
                    url = (f'https://dumps.wikimedia.org/{self.opt.wikiname}/'
                           f'{self.opt.dumpdate}/{download_filename}')
                    pywikibot.info('Downloading file from ' + url)
                    response = fetch(url, stream=True)

                    if response.status_code != HTTPStatus.OK:
                        if response.status_code == HTTPStatus.NOT_FOUND:
                            pywikibot.info(
                                'File with name {filename!r}, from dumpdate '
                                '{dumpdate!r}, and wiki {wikiname!r} ({url}) '
                                "isn't available in the Wikimedia Dumps"
                                .format(url=url, **self.opt))
                        else:
                            pywikibot.info(
                                HTTPStatus(response.status_code).description)
                        return

                    self._store_response(response, file_current_storepath)

                # Rename the temporary file to the target file
                # if the download completes successfully
                if not non_atomic:
                    replace(file_current_storepath, file_final_storepath)
                    break

            except OSError as e:
                pywikibot.error(e)

                # Best-effort removal of the partial file
                try:
                    remove(file_current_storepath)
                except OSError as cleanup_error:
                    pywikibot.error(cleanup_error)

                # If the atomic download fails, try without a temporary file
                # If the non-atomic download also fails, exit the script
                if non_atomic:
                    return

                pywikibot.info('Cannot make temporary file, '
                               'falling back to non-atomic download')
                file_current_storepath = file_final_storepath

        pywikibot.info('Done! File stored as ' + file_final_storepath)
def main(*args: str) -> None:
    """Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Recognized options are ``-filename``, ``-storepath`` and
    ``-dumpdate``; an option given without a value is asked for
    interactively. ``-filename`` is mandatory.

    :param args: command line arguments
    """
    opts = {}
    unknown_args = []

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        option, _, value = arg.partition(':')
        # Only dash-prefixed arguments are options; anything else is
        # reported as unknown below.
        if option.startswith('-'):
            option = option[1:]
            if option == 'filename':
                opts[option] = value or pywikibot.input('Enter the filename: ')
                continue
            if option == 'storepath':
                # Prompt before normalizing: abspath('') yields the
                # current directory, which is truthy, so testing the
                # result of abspath() would never trigger the prompt.
                opts[option] = os.path.abspath(
                    value or pywikibot.input('Enter the store path: '))
                continue
            if option == 'dumpdate':
                opts[option] = value or pywikibot.input(
                    'Enter the dumpdate of the dump: ')
                continue

        unknown_args.append(arg)

    missing = []
    if 'filename' not in opts:
        missing.append('-filename')

    # Show usage hints and stop if arguments are missing or unknown
    if pywikibot.bot.suggest_help(missing_parameters=missing,
                                  unknown_parameters=unknown_args):
        return

    site = pywikibot.Site()
    opts['wikiname'] = site.dbName()
    bot = DownloadDumpBot(**opts)
    bot.run()
# Run the bot only when the file is executed as a script
if __name__ == '__main__':
    main()