# Source code for scripts.redirect

#!/usr/bin/env python3
"""Script to resolve double redirects, and to delete broken redirects.

Requires access to MediaWiki's maintenance pages or to a XML dump file.
Delete function requires adminship.

Syntax:

    python pwb.py redirect action [-arguments ...]

where action can be one of these

:double:       Shortcut: **do**. Fix redirects which point to other redirects.

:broken:       Shortcut: **br**. Tries to fix redirects which point to nowhere
               by using the last moved target of the destination page. If this
               fails and the -delete option is set, it either deletes the page
               or marks it for deletion depending on whether the account has
               admin rights. It will not mark the redirect for deletion if
               there is no speedy deletion template available.

:both:         Both of the above. Retrieves redirect pages from live wiki,
               not from a special page.

and arguments can be:

-xml           Retrieve information from a local XML dump
               (https://dumps.wikimedia.org). Argument can also be given as
               "-xml:filename.xml". Cannot be used with -fullscan or -moves.

-fullscan      Retrieve redirect pages from live wiki, not from a special
               page. Cannot be used with -xml or 'both' action.

-moves         Use the page move log to find double-redirect candidates. Only
               works with action "double", does not work with -xml.

               NOTE: You may use only one of these options above.
               If neither of -xml -fullscan -moves is given, info will be
               loaded from a special page of the live wiki.

-offset:n      With -moves, the number of hours ago to start scanning moved
               pages. With -xml, the number of the redirect to restart with
               (see progress). Otherwise, ignored.

-start:title   The starting page title in each namespace. Page need not exist.

-until:title   The possible last page title in each namespace. Page need not
               exist.

-limit:n       The maximum count of redirects to work upon. If omitted, there
               is no limit.

-delete        Prompt the user whether broken redirects should be deleted (or
               marked for deletion if the account has no admin rights) instead
               of just skipping them.

-sdtemplate:x  Add the speedy deletion template string including brackets.
               This enables overriding the default template via i18n or
               to enable speedy deletion for projects other than Wikipedias.

-always        Don't prompt you for each replacement.

Furthermore the following options are provided:

&params;
"""
#
# (C) Pywikibot team, 2004-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import datetime
from contextlib import suppress
from textwrap import fill
from typing import Any, Generator

import pywikibot
import pywikibot.data
from pywikibot import i18n, pagegenerators, xmlreader
from pywikibot.bot import ExistingPageBot, OptionHandler, suggest_help
from pywikibot.exceptions import (
    CircularRedirectError,
    InterwikiRedirectPageError,
    InvalidTitleError,
    IsNotRedirectPageError,
    IsRedirectPageError,
    NoMoveTargetError,
    NoPageError,
    SectionError,
    ServerError,
    SiteDefinitionError,
    UnsupportedPageError,
)
from pywikibot.textlib import extract_templates_and_params_regex_simple


# Maps the '&params;' placeholder used at the end of the module docstring to
# the shared page generator option help text.
# NOTE(review): presumably consumed by the framework's help output, which is
# not visible in this file - confirm.
docuReplacements = {'&params;': pagegenerators.parameterHelp}  # noqa: N816


def space_to_underscore(link) -> str:
    """Convert spaces to underscore.

    :param link: object providing a ``canonical_title()`` method that
        returns the title as a string (the callers in this script pass
        ``pywikibot.Link`` instances)
    :return: the canonical title with every space replaced by ``_``
    """
    # previous versions weren't expecting spaces but underscores
    return link.canonical_title().replace(' ', '_')
class RedirectGenerator(OptionHandler):

    """Redirect generator.

    Iterating over an instance yields redirect candidates for the action
    given to the initializer. Depending on the source options the items
    are page titles (str), ``pywikibot.Page`` objects or, for the 'both'
    action, the tuples described in :meth:`get_redirects_via_api`.
    """

    available_options = {
        'fullscan': False,
        'moves': False,
        'namespaces': {0},
        'offset': -1,
        'start': None,
        'limit': None,
        'until': None,
        'xml': None,
    }

    def __init__(self, action, **kwargs) -> None:
        """Initializer.

        :param action: one of 'double', 'broken' or 'both'; selects the
            retrieval method used when iterating
        :param kwargs: options as listed in ``available_options``
        """
        super().__init__(**kwargs)
        self.site = pywikibot.Site()

        # Connect the generator selected by 'action' parameter.
        # NOTE: __iter__ is assigned on the class, not on the instance, so
        # the most recently created generator decides the iteration method
        # of all instances. For any other action no __iter__ is installed.
        cls = self.__class__
        if action == 'double':
            cls.__iter__ = lambda slf: slf.retrieve_double_redirects()
        elif action == 'broken':
            cls.__iter__ = lambda slf: slf.retrieve_broken_redirects()
        elif action == 'both':
            cls.__iter__ = lambda slf: slf.get_redirects_via_api(maxlen=2)

    def get_redirects_from_dump(
        self,
        alsoGetPageTitles: bool = False
    ) -> tuple[dict[str, str], set[str]]:
        """Extract redirects from dump.

        Load a local XML dump file, look at all pages which have the
        redirect flag set, and find out where they're pointing at. Return
        a dictionary where the redirect names are the keys and the
        redirect targets are the values.

        :param alsoGetPageTitles: if true, additionally collect the
            titles of all pages read from the dump (after namespace
            filtering)
        :return: tuple of the redirect dictionary and the set of collected
            page titles; the set is empty unless *alsoGetPageTitles* is
            set. All titles use underscores instead of spaces.
        """
        redict = {}
        page_titles = set()
        # open xml dump and read page titles out of it
        dump = xmlreader.XmlDump(self.opt.xml, on_error=pywikibot.error)
        redirect_regex = self.site.redirect_regex

        for read_count, entry in enumerate(dump.parse(), start=1):
            # always print status message after 10000 pages
            if read_count % 10000 == 0:
                pywikibot.info(f'{read_count} pages read...')

            if self.opt.namespaces and pywikibot.Page(
                self.site, entry.title
            ).namespace() not in self.opt.namespaces:
                continue

            if alsoGetPageTitles:
                page_titles.add(
                    space_to_underscore(
                        pywikibot.Link(entry.title, self.site)))

            match = redirect_regex.match(entry.text)
            if not match:
                continue

            target = match[1]
            # There might be redirects to another wiki. Ignore these.
            target_link = pywikibot.Link(target, self.site)
            try:
                target_link.parse()
            except SiteDefinitionError as e:
                pywikibot.log(e)
                pywikibot.info(
                    f'NOTE: Ignoring {entry.title} which is a redirect '
                    f'({target}) to an unknown site.'
                )
                target_link = None
            else:
                if target_link.site != self.site:
                    pywikibot.info(
                        f'NOTE: Ignoring {entry.title} which is a '
                        f'redirect to another site {target_link.site}.'
                    )
                    target_link = None

            # if the redirect does not link to another wiki
            if target_link and target_link.title:
                source = pywikibot.Link(entry.title, self.site)
                if target_link.anchor:
                    pywikibot.info(f'HINT: {entry.title} is a redirect'
                                   ' with a pipelink.')
                redict[space_to_underscore(source)] = (
                    space_to_underscore(target_link))

        return redict, page_titles

    def get_redirect_pages_via_api(self) -> Generator[pywikibot.Page]:
        """Yield Pages that are redirects.

        Pages are read per namespace in ``self.opt.namespaces``, starting
        at ``self.opt.start``. The whole generator stops as soon as a
        page title (without namespace) is greater than or equal to
        ``self.opt.until``.
        """
        for ns in self.opt.namespaces:
            gen = self.site.allpages(start=self.opt.start,
                                     namespace=ns,
                                     filterredir=True)
            if self.opt.limit:
                gen.set_maximum_items(self.opt.limit)

            for page in gen:
                if (self.opt.until
                        and page.title(with_ns=False) >= self.opt.until):
                    return
                yield page

    def _next_redirect_group(self) -> Generator[list[str]]:
        """Generator that yields batches of redirect page ids as a list.

        Each batch holds at most ``self.site.maxlimit`` page ids (as
        strings). Every batch is a list of its own, so a consumer may
        keep it after requesting the next one.
        """
        chunk = []
        for page in self.get_redirect_pages_via_api():
            chunk.append(str(page.pageid))
            if len(chunk) >= self.site.maxlimit:
                pywikibot.info('.', newline=False)
                yield chunk
                # start a fresh list instead of clearing the yielded one
                chunk = []
        if chunk:
            yield chunk

    def get_redirects_via_api(
        self,
        maxlen: int = 8
    ) -> Generator[tuple[str, int | None, str, str | None]]:
        r"""Return a generator that yields tuples of data about redirect Pages.

        .. versionchanged:: 7.0
           only yield tuple if type of redirect is not 1 (normal redirect)

        The description of returned tuple items is as follows:

        :[0]: page title of a redirect page
        :[1]: type of redirect:

            :None: start of a redirect chain of unknown length, or loop
            :[0]: broken redirect, target page title missing
            :[1]: normal redirect, target page exists and is not a
                redirect
            :[2\:maxlen]: start of a redirect chain of that many redirects
                (currently, the API seems not to return sufficient data
                to make these return values possible, but that may change)
            :[maxlen+1]: start of an even longer chain, or a loop
                (currently, the API seems not to return sufficient data
                to allow this return values, but that may change)

        :[2]: target page title of the redirect, or chain (may not exist)
        :[3]: target page of the redirect, or end of chain, or page title
            where chain or loop detecton was halted, or None if unknown

        :param maxlen: maximum chain length that is followed
        :raises RuntimeError: the API response contains an error or no
            'query' result
        """
        for apiQ in self._next_redirect_group():
            gen = pywikibot.data.api.Request(
                site=self.site, parameters={'action': 'query',
                                            'redirects': True,
                                            'pageids': apiQ})
            data = gen.submit()
            if 'error' in data:
                raise RuntimeError(f'API query error: {data}')
            if data == [] or 'query' not in data:
                raise RuntimeError('No results given.')

            # A batch without any resolved redirect has no 'redirects' key.
            redirects = {x['from']: x['to']
                         for x in data['query'].get('redirects', [])}

            # title -> whether the page exists
            pages = {}
            for pagetitle in data['query']['pages'].values():
                pages[pagetitle['title']] = \
                    'missing' not in pagetitle or 'pageid' in pagetitle

            for redirect, target in redirects.items():
                result = None
                final = None
                target_exists = pages.get(target)
                if target_exists:
                    # follow the chain until a non-redirect is reached
                    result = 0
                    final = target
                    with suppress(KeyError):
                        while result <= maxlen:
                            result += 1
                            final = redirects[final]
                elif target_exists is not None:
                    # The target is listed but does not exist: this is a
                    # broken redirect (type 0) as documented above and as
                    # expected by the consumers of this generator.
                    result = 0
                # Otherwise the target is not listed at all; the type
                # stays None (chain of unknown length, or loop).

                # only yield multiple or broken redirects
                if result != 1:
                    yield redirect, result, target, final

    def retrieve_broken_redirects(self) -> Generator[
            str | pywikibot.Page]:
        """Retrieve broken redirects.

        With -fullscan or -xml page titles are yielded, otherwise the
        preloaded pages of the broken redirects special page.
        """
        if self.opt.fullscan:
            count = 0
            for pagetitle, type_, *_ in self.get_redirects_via_api(maxlen=2):
                if type_ == 0:
                    yield pagetitle
                    count += 1
                    if self.opt.limit and count >= self.opt.limit:
                        break
        elif self.opt.xml:
            # retrieve information from XML dump
            pywikibot.info(
                'Getting a list of all redirects and of all page titles...')
            redirs, page_titles = self.get_redirects_from_dump(
                alsoGetPageTitles=True)
            for key, value in redirs.items():
                if value not in page_titles:
                    yield key
        else:
            pywikibot.info('Retrieving broken redirect special page...')
            yield from self.site.preloadpages(self.site.broken_redirects())

    def retrieve_double_redirects(self) -> Generator[
            str | pywikibot.Page]:
        """Retrieve double redirects.

        The source is selected by the -moves, -fullscan and -xml options
        in that order; without any of them the double redirects special
        page is used.
        """
        if self.opt.moves:
            yield from self.get_moved_pages_redirects()
        elif self.opt.fullscan:
            count = 0
            for pagetitle, type_, *_ in self.get_redirects_via_api(maxlen=2):
                if type_ not in (0, 1):
                    yield pagetitle
                    count += 1
                    if self.opt.limit and count >= self.opt.limit:
                        break
        elif self.opt.xml:
            redict, _ = self.get_redirects_from_dump()
            total = len(redict)
            for num, (key, value) in enumerate(redict.items(), start=1):
                # check if the value - that is, the redirect target - is a
                # redirect as well
                if num > self.opt.offset and value in redict:
                    pywikibot.info(f'\nChecking redirect {num} of {total}...')
                    yield key
        else:
            pywikibot.info('Retrieving double redirect special page...')
            yield from self.site.preloadpages(self.site.double_redirects())

    def get_moved_pages_redirects(self) -> Generator[pywikibot.Page]:
        """Generate redirects to recently-moved pages.

        The move log is read starting ``self.opt.offset`` hours ago; a
        non-positive offset is replaced by 1 (and stored back into the
        options).
        """
        # NOTE(review): the original comment states this runs until the
        # user interrupts it - not verifiable from this file.
        if self.opt.offset <= 0:
            self.opt.offset = 1
        # self.opt.offset hours ago
        start = (pywikibot.Timestamp.nowutc()
                 - datetime.timedelta(hours=self.opt.offset))
        offset_time = start.strftime('%Y%m%d%H%M%S')
        amount = self.opt.limit if self.opt.limit is not None else 'all'
        pywikibot.info(f'Retrieving {amount} moved pages...')
        move_gen = self.site.logevents(logtype='move', start=offset_time)
        if self.opt.limit:
            move_gen.set_maximum_items(self.opt.limit)
        pywikibot.info('.', newline=False)
        for logentry in move_gen:
            try:
                moved_page = logentry.page()
            except KeyError:  # hidden page
                continue

            pywikibot.info('.', newline=False)
            try:
                if not moved_page.isRedirectPage():
                    continue
            except ServerError:
                continue

            # moved_page is now a redirect, so any redirects pointing
            # to it need to be changed
            try:
                yield from moved_page.getReferences(follow_redirects=True,
                                                    filter_redirects=True)
            except (
                CircularRedirectError,
                InterwikiRedirectPageError,
                NoPageError,
            ):
                continue
class RedirectRobot(ExistingPageBot):

    """Redirect bot.

    The action given to the initializer decides which method is used as
    ``treat_page`` for every redirect page delivered by the generator.
    """

    use_redirects = True

    update_options = {
        'limit': float('inf'),
        'delete': False,
        'sdtemplate': '',
    }

    def __init__(self, action, **kwargs) -> None:
        """Initializer.

        :param action: one of 'double', 'broken' or 'both'
        :param kwargs: bot options, passed to the parent class
        :raises NotImplementedError: *action* is none of the above
        """
        super().__init__(**kwargs)
        # connect the action treat to treat_page method called by treat
        if action == 'double':
            self.treat_page = self.fix_1_double_redirect
        elif action == 'broken':
            self.treat_page = self.delete_1_broken_redirect
        elif action == 'both':
            self.treat_page = self.fix_double_or_delete_broken_redirect
        else:
            raise NotImplementedError(f'No valid action "{action}" found.')

    def get_sd_template(
        self,
        site: pywikibot.site.BaseSite | None = None
    ) -> str | None:
        """Look for speedy deletion template and return it.

        The template given by the ``sdtemplate`` option is preferred; if
        it is empty the i18n message 'redirect-broken-redirect-template'
        is used when available. The template is only returned if its page
        exists on *site*; otherwise a warning is printed.

        :param site: site for which the template has to be given
        :return: A valid speedy deletion template.
        """
        title = None
        if site:
            sd = self.opt.sdtemplate
            if not sd and i18n.twhas_key(site,
                                         'redirect-broken-redirect-template'):
                sd = i18n.twtranslate(site,
                                      'redirect-broken-redirect-template')

            # check whether template exists for this site
            if sd:
                template = extract_templates_and_params_regex_simple(sd)
                if template:
                    title = template[0][0]
                    page = pywikibot.Page(site, title, ns=10)
                    if page.exists():
                        return sd

        pywikibot.warning(
            'No speedy deletion template {}available.'
            .format(f'"{title}" ' if title else ''))
        return None

    @property
    def sdtemplate(self):
        """Gives the speedy deletion template for the current_page."""
        return self.get_sd_template(self.current_page.site)

    def init_page(self, item) -> pywikibot.Page:
        """Ensure that we process page objects.

        :param item: a page title, a tuple whose first two items are the
            redirect title and the redirect type, or anything accepted by
            the parent implementation
        :return: the page to be treated; for tuple items the redirect
            type is stored as its ``_redirect_type`` attribute
        """
        default_site = pywikibot.Site()
        if isinstance(item, str):
            item = pywikibot.Page(default_site, item)
        elif isinstance(item, tuple):
            redir_name, code, *_ = item
            item = pywikibot.Page(default_site, redir_name)
            item._redirect_type = code
        page = super().init_page(item)
        self.repo = page.site.data_repository()
        # the repository only if the page's site is the repository itself
        self.is_repo = self.repo if self.repo == page.site else None
        return page

    def delete_redirect(self, page, summary_key: str) -> None:
        """Delete the redirect page.

        The page is deleted if the account has the 'delete' right.
        Otherwise the speedy deletion template, if one is available, is
        put in front of the page content.

        :param page: The page to delete
        :type page: pywikibot.page.BasePage
        :param summary_key: The message key for the deletion summary
        """
        assert page.site == self.current_page.site, (
            f'target page is on different site {page.site}')
        reason = i18n.twtranslate(page.site, summary_key, bot_prefix=True)
        if page.site.has_right('delete'):
            page.delete(reason, prompt=False)
            return

        # Resolve the property only once: every access repeats the
        # template lookup done by get_sd_template().
        sd_template = self.sdtemplate
        if not sd_template:
            return

        pywikibot.info('User does not have delete right, '
                       'put page to speedy deletion.')
        try:
            content = page.get(get_redirect=True)
        except SectionError:
            # retry with the page title without its section
            content_page = pywikibot.Page(page.site,
                                          page.title(with_section=False))
            content = content_page.get(get_redirect=True)
        content = sd_template + '\n' + content
        self.userPut(page, page.text, content, summary=reason,
                     ignore_save_related_errors=True,
                     ignore_server_errors=True)

    @staticmethod
    def get_redirect_target(page):
        """Get redirect target page and handle some exceptions.

        :param page: the redirect page
        :return: the redirect target, or None if the redirect is circular
            or an interwiki redirect or a RuntimeError occurred
        """
        try:
            return page.getRedirectTarget()
        except (CircularRedirectError, RuntimeError) as e:
            pywikibot.error(e)
            pywikibot.info(f'Skipping {page}.')
        except InterwikiRedirectPageError:
            pywikibot.info(f'{page} is on another site, skipping.')
        return None

    def delete_1_broken_redirect(self) -> None:
        """Treat one broken redirect.

        If the missing target has a moved target the redirect is changed
        to point to it. If that was not done and the -delete option is
        set, the user is asked whether the redirect should be deleted.
        """
        redir_page = self.current_page
        # without the -delete option there is nothing more to do after
        # the attempt to fix the redirect
        done = not self.opt.delete
        try:
            target_page = self.get_redirect_target(redir_page)
        except InvalidTitleError as e:
            pywikibot.error(e)
            target_page = None

        if not target_page:
            return

        try:
            target_page.get()
        except InvalidTitleError as e:
            pywikibot.error(e)
        except NoPageError:
            moved_target = None
            with suppress(NoMoveTargetError):
                moved_target = target_page.moved_target()

            if moved_target:
                if not moved_target.exists():
                    # FIXME: Test to another move
                    pywikibot.info(
                        f'Target page {moved_target} does not exist')
                elif redir_page == moved_target:
                    pywikibot.info(
                        'Redirect to target page forms a redirect loop')
                else:
                    pywikibot.info(
                        f'{redir_page} has been moved to {moved_target}')
                    reason = i18n.twtranslate(
                        redir_page.site, 'redirect-fix-broken-moved',
                        {'from': target_page.title(allow_interwiki=False),
                         'to': moved_target.title(as_link=True,
                                                  allow_interwiki=False)},
                        bot_prefix=True)
                    content = redir_page.get(get_redirect=True)
                    redir_page.set_redirect_target(
                        moved_target, keep_section=True, save=False)
                    pywikibot.info('Summary - ' + reason)
                    done = self.userPut(redir_page, content,
                                        redir_page.text,
                                        summary=reason,
                                        ignore_save_related_errors=True,
                                        ignore_server_errors=True)

            if not done and self.user_confirm(
                f'Redirect target {target_page.title(as_link=True)} does not'
                ' exist.\nDo you want to delete '
                f'{redir_page.title(as_link=True)}?'
            ):
                self.delete_redirect(redir_page, 'redirect-remove-broken')
            elif not (self.opt.delete or moved_target):
                pywikibot.info('Cannot fix or delete the broken redirect')
        except IsRedirectPageError:
            pywikibot.info(
                'Redirect target {} is also a redirect! {}'.format(
                    target_page.title(as_link=True),
                    "Won't delete anything."
                    if self.opt.delete else 'Skipping.'))
        else:
            # we successfully get the target page, meaning that
            # it exists and is not a redirect: no reason to touch it.
            pywikibot.info(
                'Redirect target {} does exist! {}'.format(
                    target_page.title(as_link=True),
                    "Won't delete anything."
                    if self.opt.delete else 'Skipping.'))

    def _skip_on_exception(self, error: Exception, loop_length: int,
                           new_redir: pywikibot.Page) -> bool:
        """Handle exceptions of fix_1_double_redirect.

        :param error: the exception raised while getting the target
        :param loop_length: number of pages visited in the chain so far
        :param new_redir: the page whose target was requested
        :return: True if the current page has to be skipped, False if
            processing continues with the last target found
        :raises Exception: *error* is re-raised if it is not handled here
        """
        if isinstance(error, IsNotRedirectPageError):
            if loop_length != 2:
                return False
            # do nothing
            pywikibot.info(
                f'Skipping: Redirect target {new_redir} is not a redirect.')
        elif isinstance(error, SectionError):
            pywikibot.warning(
                f"Redirect target section {new_redir} doesn't exist.")
            return False
        elif isinstance(error, UnsupportedPageError):
            pywikibot.error(error)
            pywikibot.info(f'Skipping {new_redir}.')
        elif isinstance(error, NoPageError):
            if not self.opt.always:
                pywikibot.warning(
                    f"Redirect target {new_redir} doesn't exist.")
                return False
            # skip if automatic
            pywikibot.info(
                f"Skipping: Redirect target {new_redir} doesn't exist.")
        elif isinstance(error, ServerError):
            pywikibot.info('Skipping due to server error: No textarea found')
        else:
            # all uncatched exceptions
            # Note: elif statements are necessary above because all Errors
            # above derive from Exception class
            raise error
        return True

    def fix_1_double_redirect(self) -> None:
        """Treat one double redirect.

        Follow the redirect chain starting at the current page and let
        the current page point to the last target found.
        """
        new_redir = redir = self.current_page
        redir_list = []  # bookkeeping to detect loops
        first_target = None
        while True:
            redir_list.append(
                f'{new_redir.site.lang}:{new_redir.title(with_section=False)}')
            try:
                target_page = self.get_redirect_target(new_redir)
            except Exception as e:
                if self._skip_on_exception(e, len(redir_list), new_redir):
                    break
                # otherwise fall through and use the last target found
            else:
                if not target_page:
                    break

                first_target = first_target or target_page
                pywikibot.info(f' Links to: {target_page}.')
                mw_msg = None
                with suppress(KeyError):
                    mw_msg = target_page.site.mediawiki_message(
                        'wikieditor-toolbar-tool-redirect-example')

                if mw_msg and target_page.title() == mw_msg:
                    pywikibot.info('Skipping toolbar example: Redirect source '
                                   'is potentially vandalized.')
                    break

                # watch out for redirect loops
                if redir_list.count(f'{target_page.site.lang}:'
                                    f'{target_page.title(with_section=False)}'):
                    pywikibot.warning(
                        f'Redirect target {target_page} forms a redirect loop.')
                    # FIXME: doesn't work. edits twice!
                    # The code deleting both redirects of the loop when
                    # the -delete option is set was unreachable behind
                    # this break and has been dropped; the loop is only
                    # reported.
                    break

                # redirect target found
                if not target_page.isStaticRedirect():
                    # follow the chain
                    new_redir = target_page
                    continue
                pywikibot.info(' Redirect target is STATICREDIRECT.')

            old_text = redir.get(get_redirect=True)
            if self.is_repo and redir.namespace() == self.repo.item_namespace:
                redir = pywikibot.ItemPage(self.repo, redir.title())
                target_page = pywikibot.ItemPage(self.repo,
                                                 target_page.title())
                pywikibot.info('Fixing double item redirect')
                redir.set_redirect_target(target_page)
                break

            redir.set_redirect_target(target_page, keep_section=True,
                                      save=False)
            summary = i18n.twtranslate(
                redir.site, 'redirect-fix-double',
                {'from': first_target.title(as_link=True,
                                            allow_interwiki=False),
                 'to': target_page.title(as_link=True,
                                         allow_interwiki=False)},
                bot_prefix=True)
            self.userPut(redir, old_text, redir.text, summary=summary,
                         ignore_save_related_errors=True,
                         ignore_server_errors=True)
            break

    def fix_double_or_delete_broken_redirect(self) -> None:
        """Treat one broken or double redirect.

        The redirect type stored by :meth:`init_page` decides: type 0 is
        treated as broken redirect, type 1 is left alone and every other
        type is treated as double redirect.
        """
        redirect_type = self.current_page._redirect_type
        if redirect_type == 0:
            self.delete_1_broken_redirect()
        elif redirect_type != 1:
            self.fix_1_double_redirect()

    def treat(self, page) -> None:
        """Treat a page.

        When the number of read pages has reached the ``limit`` option
        the generator is closed; the given page is still passed to the
        parent implementation.

        :param page: Page to be treated.
        :type page: pywikibot.page.BasePage
        """
        if self.counter['read'] >= self.opt.limit:
            pywikibot.info(
                '\nNumber of pages reached the limit. Script terminated.')
            self.generator.close()
        super().treat(page)
def main(*args: str) -> None:
    """Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    :param args: command line arguments
    """
    options: dict[str, Any] = {}
    gen_options: dict[str, Any] = {}
    # what the bot should do (either resolve double redirs, or process broken
    # redirs)
    action = None
    # the source options (-fullscan, -moves, -xml) given by the user
    source = set()
    unknown = []
    gen_factory = pagegenerators.GeneratorFactory()

    local_args = pywikibot.handle_args(args)
    shorts = {'do': 'double', 'br': 'broken'}
    for argument in local_args:
        arg, _, value = argument.partition(':')
        # option name without leading dash; empty for the action names
        option = arg.partition('-')[2]
        # bot options
        if arg in shorts:
            action = shorts[arg]
        elif arg in ('both', 'broken', 'double'):
            action = arg
        elif option in ('always', 'delete'):
            options[option] = True
        elif option == 'sdtemplate':
            options['sdtemplate'] = value or pywikibot.input(
                'Which speedy deletion template to use?')
        # generator options
        elif option in ('fullscan', 'moves'):
            gen_options[option] = True
            source.add(arg)
        elif option == 'xml':
            gen_options[option] = value or i18n.input(
                'pywikibot-enter-xml-filename')
            source.add(arg)
        elif option == 'offset':
            gen_options[option] = int(value)
        elif option in ('start', 'until'):
            gen_options[option] = value
        elif option == 'limit':
            options['limit'] = gen_options['limit'] = int(value)
        elif gen_factory.handle_arg(argument):
            pass
        else:
            unknown.append(arg)

    # sorted() keeps the message stable; a set has no defined order
    problem = 'You can only use one of {} options.'.format(
        ' or '.join(sorted(source))) if len(source) > 1 else ''
    if action == 'both' and '-fullscan' in source:
        problem += (' You can only use either -fullscan together with '
                    "broken/double action or 'both' action")
    if suggest_help(additional_text=fill(problem),
                    unknown_parameters=unknown,
                    missing_action=not action):
        return

    gen = None
    if not gen_factory.gens:
        if gen_factory.namespaces:
            gen_options['namespaces'] = gen_factory.namespaces
        gen = RedirectGenerator(action, **gen_options)

    # With -fullscan or -xml the RedirectGenerator yields page titles and
    # with the 'both' action it yields tuples; those must not be combined
    # with the page generators. 'source' is a set, so it has to be tested
    # for common members: comparing it with the option strings themselves
    # is always unequal.
    if gen_factory.gens or (
        action != 'both' and source.isdisjoint({'-fullscan', '-xml'})
    ):
        gen = gen_factory.getCombinedGenerator(gen=gen)
    bot = RedirectRobot(action, generator=gen, **options)
    bot.run()
# Run the bot only when the module is executed as a script.
if __name__ == '__main__':
    main()