Source code for scripts.reflinks

#!/usr/bin/python3
"""
Fetch and add titles for bare links in references.

This bot will search for references which are only made of a link
without title (i.e. <ref>[https://www.google.fr/]</ref> or
<ref>https://www.google.fr/</ref>) and will fetch the html title from
the link to use it as the title of the wiki link in the reference, i.e.
<ref>[https://www.google.fr/search?q=test test - Google Search]</ref>

The bot checks every 20 edits a special stop page. If the page has been
edited, it stops.

As it uses it, you need to configure noreferences.py for your wiki, or it
will not work.

pdfinfo is needed for parsing pdf titles.

The following parameters are supported:

-xml:dump.xml     Should be used instead of a simple page fetching method
                  from pagegenerators.py for performance and load issues

-xmlstart         Page to start with when using an XML dump

This script is a :py:obj:`ConfigParserBot <pywikibot.bot.ConfigParserBot>`.
The following options can be set within a settings file which is scripts.ini
by default::

-always          Doesn't ask every time whether the bot should make the change.
                 Do it always.

-limit:n          Stops after n edits

-ignorepdf        Do not handle PDF files (handy if you use Windows and
                  can't get pdfinfo)

-summary          Use a custom edit summary. Otherwise it uses the
                  default one from translatewiki

The following generators and filters are supported:

&params;
"""
# (C) Pywikibot team, 2008-2022
#
# Distributed under the terms of the MIT license.
#
import codecs
import http.client as httplib
import itertools
import os
import re
import subprocess
import tempfile
from contextlib import suppress
from enum import IntEnum
from functools import partial
from http import HTTPStatus
from textwrap import shorten

import pywikibot
from pywikibot import comms, config, i18n, pagegenerators, textlib
from pywikibot.backports import removeprefix
from pywikibot.bot import ConfigParserBot, ExistingPageBot, SingleSiteBot
from pywikibot.comms.http import get_charset_from_content_type
from pywikibot.exceptions import (
    FatalServerError,
    Server414Error,
    Server504Error,
)
from pywikibot.pagegenerators import (
    XMLDumpPageGenerator as _XMLDumpPageGenerator,
)
from pywikibot.textlib import replaceExcept
from pywikibot.tools.chars import string2html
from scripts import noreferences


docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

localized_msg = ('fr', 'it', 'pl')  # localized message at MediaWiki

# localized message at specific Wikipedia site
# should be moved to MediaWiki Pywikibot manual


stop_page = {
    'fr': 'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
    'da': 'Bruger:DumZiBoT/EditThisPageToStopMe',
    'de': 'Benutzer:DumZiBoT/EditThisPageToStopMe',
    'fa': 'کاربر:Amirobot/EditThisPageToStopMe',
    'it': 'Utente:Marco27Bot/EditThisPageToStopMe',
    'ko': '사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
    'he': 'User:Matanyabot/EditThisPageToStopMe',
    'hu': 'User:Damibot/EditThisPageToStopMe',
    'en': 'User:DumZiBoT/EditThisPageToStopMe',
    'pl': 'Wikipedysta:MastiBot/EditThisPageToStopMe',
    'ru': 'User:Rubinbot/EditThisPageToStopMe',
    'ur': 'صارف:Shuaib-bot/EditThisPageToStopMe',
    'zh': 'User:Sz-iwbot',
}

deadLinkTag = {
    'ar': '[%s] {{وصلة مكسورة}}',
    'fr': '[%s] {{lien mort}}',
    'da': '[%s] {{dødt link}}',
    'fa': '[%s] {{پیوند مرده}}',
    'he': '{{קישור שבור}}',
    'hi': '[%s] {{Dead link}}',
    'hu': '[%s] {{halott link}}',
    'ko': '[%s] {{죽은 바깥 고리}}',
    'es': '{{enlace roto2|%s}}',
    'it': '{{Collegamento interrotto|%s}}',
    'en': '[%s] {{dead link}}',
    'pl': '[%s] {{Martwy link}}',
    'ru': '[%s] {{Недоступная ссылка}}',
    'sr': '[%s] {{dead link}}',
    'ur': '[%s] {{مردہ ربط}}',
}


soft404 = re.compile(
    r'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog',
    re.IGNORECASE)
# matches an URL at the index of a website
dirIndex = re.compile(
    r'\w+://[^/]+/((default|index)\.'
    r'(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$',
    re.IGNORECASE)
# Extracts the domain name
domain = re.compile(r'^(\w+)://(?:www.|)([^/]+)')

globalbadtitles = r"""
# is
(test|
# starts with
    ^\W*(
            register
            |registration
            |(sign|log)[ \-]?in
            |subscribe
            |sign[ \-]?up
            |log[ \-]?on
            |untitled[ ]?(document|page|\d+|$)
            |404[ ]
        ).*
# anywhere
    |.*(
            403[ ]forbidden
            |(404|page|file|information|resource).*not([ ]*be)?[ ]*
            (available|found)
            |are[ ](?:.+?[ ])?robot
            |site.*disabled
            |error[ ]404
            |error.+not[ ]found
            |not[ ]found.+error
            |404[ ]error
            |\D404\D
            |check[ ]browser[ ]settings
            |log[ \-]?(on|in)[ ]to
            |site[ ]redirection
     ).*
# ends with
    |.*(
            register
            |registration
            |(sign|log)[ \-]?in
            |subscribe|sign[ \-]?up
            |log[ \-]?on
        )\W*$
)
"""
# Language-specific bad titles
badtitles = {
    'en': '',
    'fr': '.*(404|page|site).*en +travaux.*',
    'es': '.*sitio.*no +disponible.*',
    'it': '((pagina|sito) (non trovat[ao]|inesistente)|accedi|errore)',
    'ru': '.*([Сс]траница.*(не[ ]*найдена|отсутствует)|Вы.*человек).*',
}

# Regex that match bare references
linksInRef = re.compile(
    # bracketed URLs
    r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https)://(?:'
    # unbracketed with()
    r'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'
    # unbracketed without ()
    r'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))'
    r'[!?,\s]*\]?\s*</ref>')

# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Dispenser )
listof404pages = '404-links.txt'

XmlDumpPageGenerator = partial(
    _XMLDumpPageGenerator, text_predicate=linksInRef.search)





[docs]class IX(IntEnum): """Index class for references data.""" name = 0 reflist = 1 quoted = 2 change_needed = 3
[docs]class DuplicateReferences: """Helper to de-duplicate references in text. When some references are duplicated in an article, name the first, and remove the content of the others """ def __init__(self, site=None) -> None: """Initializer.""" if not site: site = pywikibot.Site() # Match references self.REFS = re.compile( r'(?is)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') fmt = r'(?i){0}\s*=\s*(?P<quote>["\']?)\s*(?P<{0}>.+)\s*(?P=quote)' self.NAMES = re.compile(fmt.format('name')) self.GROUPS = re.compile(fmt.format('group')) self.autogen = i18n.twtranslate(site, 'reflinks-autogen')
[docs] def process(self, text): """Process the page.""" # keys are ref groups # values are a dict where : # keys are ref content # values are [name, [list of full ref matches], # quoted, need_to_change] found_refs = {} found_ref_names = set() # Replace key by [value, quoted] named_repl = {} # Parse references for match in self.REFS.finditer(text): content = match.group('content') if not content.strip(): continue params = match.group('params') group = self.GROUPS.search(params) or '' if group not in found_refs: found_refs[group] = {} groupdict = found_refs[group] if content in groupdict: v = groupdict[content] v[IX.reflist].append(match.group()) else: v = [None, [match.group()], False, False] found = self.NAMES.search(params) if found: quoted = found.group('quote') in ['"', "'"] name = found.group('name') if not v[IX.name]: # First name associated with this content if name not in found_ref_names: # first time ever we meet this name v[IX.quoted] = quoted v[IX.name] = name else: # if has_key, means that this name is used # with another content. We'll need to change it v[IX.change_needed] = True elif v[IX.name] != name: named_repl[name] = [v[IX.name], v[IX.quoted]] found_ref_names.add(name) groupdict[content] = v # Find used autogenerated numbers used_numbers = set() for name in found_ref_names: number = removeprefix(name, self.autogen) with suppress(ValueError): used_numbers.add(int(number)) # generator to give the next free number for autogenerating names free_number = (str(i) for i in itertools.count(start=1) if i not in used_numbers) # Fix references for groupname, references in found_refs.items(): group = 'group="{}" '.format(groupname) if groupname else '' for ref, v in references.items(): if len(v[IX.reflist]) == 1 and not v[IX.change_needed]: continue name = v[IX.name] if not name: name = '"{}{}"'.format(self.autogen, next(free_number)) elif v[IX.quoted]: name = '"{}"'.format(name) named = '<ref {}name={}>{}</ref>'.format(group, name, ref) text = text.replace(v[IX.reflist][0], named, 1) # make sure that the first (named ref) is not removed later pos = text.index(named) + len(named) header = text[:pos] end = text[pos:] # replace multiple identical references with repeated ref repeated_ref = '<ref {}name={} />'.format(group, name) for ref in v[IX.reflist][1:]: # Don't replace inside templates (T266411) end = replaceExcept(end, re.escape(ref), repeated_ref, exceptions=['template']) text = header + end # Fix references with different names for ref, v in named_repl.items(): # TODO : Support ref groups name = v[IX.name] if v[IX.reflist]: name = '"{}"'.format(name) text = re.sub( r'<ref name\s*=\s*(?P<quote>["\']?)\s*{}\s*(?P=quote)\s*/>' .format(ref), '<ref name={} />'.format(name), text) return text
[docs]class ReferencesRobot(SingleSiteBot, ConfigParserBot, ExistingPageBot): """References bot. .. versionchanged:: 7.0 ReferencesRobot is a ConfigParserBot """ use_redirects = False update_options = { 'ignorepdf': False, 'limit': 0, # stop after n modified pages 'summary': '', } def __init__(self, **kwargs) -> None: """Initializer.""" super().__init__(**kwargs) self._use_fake_user_agent = config.fake_user_agent_default.get( 'reflinks', False) # Check manual = 'mw:Manual:Pywikibot/refLinks' code = None for alt in [self.site.code] + i18n._altlang(self.site.code): if alt in localized_msg: code = alt break if code: manual += '/{}'.format(code) if self.opt.summary: self.msg = self.opt.summary else: self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals()) local = i18n.translate(self.site, badtitles) if local: bad = '({}|{})'.format(globalbadtitles, local) else: bad = globalbadtitles self.titleBlackList = re.compile(bad, re.I | re.S | re.X) self.norefbot = noreferences.NoReferencesBot(verbose=False) self.deduplicator = DuplicateReferences(self.site) self.site_stop_page = i18n.translate(self.site, stop_page) if self.site_stop_page: self.stop_page = pywikibot.Page(self.site, self.site_stop_page) if self.stop_page.exists(): self.stop_page_rev_id = self.stop_page.latest_revision_id else: pywikibot.warning('The stop page {} does not exist' .format(self.stop_page.title(as_link=True))) # Regex to grasp content-type meta HTML tag in HTML source self.META_CONTENT = re.compile( br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>') # Extract html title from page self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)') # Matches content inside <script>/<style>/HTML comments self.NON_HTML = re.compile( br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|' br'<!--.*?-->|<!\[CDATA\[.*?\]\]>') # Authorized mime types for HTML pages self.MIME = re.compile( r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
[docs] @staticmethod def httpError(err_num, link, pagetitleaslink) -> None: """Log HTTP Error.""" pywikibot.stdout('HTTP error ({}) for {} on {}' .format(err_num, link, pagetitleaslink))
[docs] @staticmethod def getPDFTitle(ref, response) -> None: """Use pdfinfo to retrieve title from a PDF.""" # pdfinfo is Unix-only pywikibot.output('Reading PDF file...') try: fd, infile = tempfile.mkstemp() urlobj = os.fdopen(fd, 'w+') urlobj.write(response.text) pdfinfo_out = subprocess.Popen([r'pdfinfo', '/dev/stdin'], stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False).communicate()[0] except ValueError: pywikibot.output('pdfinfo value error.') except OSError: pywikibot.output('pdfinfo OS error.') except Exception as e: # Ignore errors pywikibot.output('PDF processing error.') pywikibot.error(e) else: for aline in pdfinfo_out.splitlines(): if isinstance(aline, bytes): aline = aline.decode() if aline.lower().startswith('title'): ref.title = ' '.join(aline.split()[1:]) if ref.title: pywikibot.output('title: ' + ref.title) break pywikibot.output('PDF done.') finally: urlobj.close() os.unlink(infile)
[docs] def setup(self): """Read dead links from file.""" try: with codecs.open(listof404pages, 'r', 'latin_1') as f: self.dead_links = f.read() except OSError: raise NotImplementedError( '404-links.txt is required for reflinks.py\n' 'You need to download\n' 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n' 'and to unzip it in the same directory')
[docs] def skip_page(self, page): """Skip unwanted pages.""" if super().skip_page(page): return True if not page.has_permission(): pywikibot.warning("You can't edit page {page}" .format(page=page)) return True return False
[docs] def treat(self, page) -> None: """Process one page.""" # Load the page's text from the wiki new_text = page.text raw_text = textlib.removeDisabledParts(new_text) # for each link to change for match in linksInRef.finditer(raw_text): link = match.group('url') if 'jstor.org' in link: # TODO: Clean URL blacklist continue ref = RefLink(link, match.group('name'), site=self.site) try: r = comms.http.fetch( ref.url, use_fake_user_agent=self._use_fake_user_agent) # Try to get Content-Type from server content_type = r.headers.get('content-type') if content_type and not self.MIME.search(content_type): if ref.link.lower().endswith('.pdf') \ and not self.opt.ignorepdf: # If file has a PDF suffix self.getPDFTitle(ref, r) else: pywikibot.output('<<lightyellow>>WARNING<<default>> : ' 'media : {} '.format(ref.link)) if not ref.title: repl = ref.refLink() elif not re.match('(?i) *microsoft (word|excel|visio)', ref.title): ref.transform(ispdf=True) repl = ref.refTitle() else: pywikibot.output('<<lightyellow>>WARNING<<default>> : ' 'PDF title blacklisted : {} ' .format(ref.title)) repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue # Get the real url where we end (http redirects !) redir = r.url if redir != ref.link \ and domain.findall(redir) == domain.findall(link): if soft404.search(redir) \ and not soft404.search(ref.link): pywikibot.output('<<lightyellow>>WARNING<<default>> : ' 'Redirect 404 : {} ' .format(ref.link)) continue if dirIndex.match(redir) \ and not dirIndex.match(ref.link): pywikibot.output('<<lightyellow>>WARNING<<default>> : ' 'Redirect to root : {} ' .format(ref.link)) continue if r.status_code != HTTPStatus.OK: pywikibot.stdout('HTTP error ({}) for {} on {}' .format(r.status_code, ref.url, page.title(as_link=True))) # 410 Gone, indicates that the resource has been # purposely removed if r.status_code == HTTPStatus.GONE \ or (r.status_code == HTTPStatus.NOT_FOUND and '\t{}\t'.format(ref.url) in self.dead_links): repl = ref.refDead() new_text = new_text.replace(match.group(), repl) continue except UnicodeError: # example: # http://www.adminet.com/jo/20010615¦/ECOC0100037D.html # in [[fr:Cyanure]] pywikibot.output('<<lightred>>Bad link<<default>> : {} in {}' .format(ref.url, page.title(as_link=True))) continue except (ValueError, # urllib3.LocationParseError derives from it OSError, httplib.error, FatalServerError, Server414Error, Server504Error) as e: pywikibot.output( "{err.__class__.__name__}: Can't retrieve url {url}: {err}" .format(url=ref.url, err=e)) continue linkedpagetext = r.content # remove <script>/<style>/comments/CDATA tags linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext) meta_content = self.META_CONTENT.search(linkedpagetext) encoding = None if content_type: encoding = get_charset_from_content_type(content_type) if meta_content: tag = None encodings = [encoding] if encoding else [] encodings += list(page.site.encodings()) for enc in encodings: with suppress(UnicodeDecodeError): tag = meta_content.group().decode(enc) break # Prefer the content-type from the HTTP header if not content_type and tag: content_type = tag if not encoding: encoding = get_charset_from_content_type(tag) if encoding: r.encoding = encoding if not content_type: pywikibot.output('No content-type found for ' + ref.link) continue if not self.MIME.search(content_type): pywikibot.output('<<lightyellow>>WARNING<<default>> : media : ' '{} '.format(ref.link)) repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue # Retrieves the first non empty string inside <title> tags for m in self.TITLE.finditer(r.text): t = m.group() if t: ref.title = t ref.transform() if ref.title: break if not ref.title: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) pywikibot.output('{} : No title found...'.format(ref.link)) continue if self.titleBlackList.match(ref.title): repl = ref.refLink() new_text = new_text.replace(match.group(), repl) pywikibot.output('<<lightred>>WARNING<<default>> {} : ' 'Blacklisted title ({})' .format(ref.link, ref.title)) continue # Truncate long titles. 175 is arbitrary ref.title = shorten(ref.title, width=178, placeholder='...') repl = ref.refTitle() new_text = new_text.replace(match.group(), repl) # Add <references/> when needed, but ignore templates ! if page.namespace != 10 and self.norefbot.lacksReferences(new_text): new_text = self.norefbot.addReferences(new_text) new_text = self.deduplicator.process(new_text) old_text = page.text if old_text == new_text: return self.userPut(page, old_text, new_text, summary=self.msg, ignore_save_related_errors=True, ignore_server_errors=True) if not self.counter['write']: return if self.opt.limit and self.counter['write'] >= self.opt.limit: pywikibot.output('Edited {} pages, stopping.' .format(self.opt.limit)) self.generator.close() if self.site_stop_page and self.counter['write'] % 20 == 0: self.stop_page = pywikibot.Page(self.site, self.site_stop_page) if self.stop_page.exists(): pywikibot.output( '<<lightgreen>>Checking stop page...<<default>>') actual_rev = self.stop_page.latest_revision_id if actual_rev != self.stop_page_rev_id: pywikibot.output( '{} has been edited: Someone wants us to stop.' .format(self.stop_page.title(as_link=True))) self.generator.close()
[docs]def main(*args: str) -> None: """ Process command line arguments and invoke bot. If args is an empty list, sys.argv is used. :param args: command line arguments """ xml_filename = None xml_start = None options = {} generator = None # Process global args and prepare generator args parser local_args = pywikibot.handle_args(args) gen_factory = pagegenerators.GeneratorFactory() for arg in local_args: opt, _, value = arg.partition(':') if opt in ('-summary', '-limit'): options[opt[1:]] = value elif opt in ('-always', '-ignorepdf'): options[opt[1:]] = True elif opt == '-xmlstart': xml_start = value or pywikibot.input( 'Please enter the dumped article to start with:') elif opt == '-xml': xml_filename = value or pywikibot.input( "Please enter the XML dump's filename:") else: gen_factory.handle_arg(arg) if xml_filename: generator = XmlDumpPageGenerator(xml_filename, xml_start, gen_factory.namespaces) if not generator: generator = gen_factory.getCombinedGenerator() if not generator: pywikibot.bot.suggest_help(missing_generator=True) return if not gen_factory.nopreload: generator = pagegenerators.PreloadingGenerator(generator) generator = pagegenerators.RedirectFilterPageGenerator(generator) bot = ReferencesRobot(generator=generator, **options) bot.run()
if __name__ == '__main__': main()