# Source code for textlib

#
# (C) Pywikibot team, 2008-2026
#
# Distributed under the terms of the MIT license.
#
"""Functions for manipulating wiki-text."""

from __future__ import annotations

import itertools
import re
import sys
from collections import OrderedDict
from collections.abc import Callable, Container, Iterable, Mapping, Sequence
from contextlib import closing, suppress
from dataclasses import dataclass
from html.parser import HTMLParser
from typing import NamedTuple

import pywikibot
from pywikibot.backports import pairwise
from pywikibot.exceptions import InvalidTitleError, SiteDefinitionError
from pywikibot.family import Family
from pywikibot.time import TZoneFixedOffset
from pywikibot.tools import ModuleDeprecationWrapper, first_lower, first_upper
from pywikibot.tools.chars import INVISIBLE_REGEX
from pywikibot.userinterfaces.transliteration import NON_ASCII_DIGITS


try:
    import wikitextparser
except ImportError:
    import mwparserfromhell as wikitextparser


# Cache for replaceExcept to avoid recompiling regexes on each call.
_regex_cache: dict[str, re.Pattern[str]] = {}

# The regex below collects nested templates, providing simpler
# identification of templates used at the top-level of wikitext.
# It doesn't match {{{1|...}}}, however it also does not match templates
# with a numerical name. e.g. {{1|..}}. It will correctly match {{{x}} as
# being {{x}} with leading '{' left in the wikitext.
# Prefix msg: is not included in the 'name' group, but all others are
# included for backwards compatibility with TEMP_REGEX.
# Only parser functions using # are excluded.
# When more than two levels of templates are found, this regex will
# capture from the beginning of the first {{ to the end of the last }},
# with wikitext between templates as part of the parameters of the first
# template in the wikitext.
# This ensures it falls back to a safe mode for replaceExcept, as it
# ensures that any replacement will not occur within template text.
NESTED_TEMPLATE_REGEX = re.compile(r"""
{{\s*(?:msg:\s*)?
  (?P<name>[^{\|#0-9][^{\|#]*?)\s*
  (?:\|(?P<params> [^{]*?
          (({{{[^{}]+?}}}
            |{{[^{}]+?}}
            |{[^{}]*?}
          ) [^{]*?
        )*?
    )?
  )?
}}
|
(?P<unhandled_depth>{{\s*[^{\|#0-9][^{\|#]*?\s* [^{]* {{ .* }})
""", re.VERBOSE | re.DOTALL)

# Regex matching file links with optional parameters.
#
# Captures the filename and parameters, including nested links
# within the parameters. The regex safely matches the closing
# brackets even if inner wikilinks contain [[ or ]].
# The namespace names must be substituted into the pattern, e.g.:
#     FILE_LINK_REGEX % 'File'
# or: FILE_LINK_REGEX % '|'.join(site.namespaces[6])
#
# Don't use this regex directly; use textlib.get_regexes('file', site)
# instead.
#
# 10.7: Exclude empty filename
FILE_LINK_REGEX = r"""
    \[\[\s*
    (?:%s)  # namespace aliases
    \s*:
    (?=(?P<filename>
        [^]|]+
    ))(?P=filename)
    (
        \|
        (
            (
                (?=(?P<inner_link>
                    \[\[.*?\]\]
                ))(?P=inner_link)
            )?
            (?=(?P<other_chars>
                [^\[\]]*
            ))(?P=other_chars)
        |
            (?=(?P<not_wikilink>
                \[[^]]*\]
            ))(?P=not_wikilink)
        )*?
    )??
    \]\]
"""

# Used in TimeStripper. When a timestamp-like line has longer gaps
# than this between year, month, etc. in it, then the line will not be
# considered to contain a timestamp.
TIMESTAMP_GAP_LIMIT = 10


def to_local_digits(phrase: str | int, lang: str) -> str:
    """Convert ASCII digits in *phrase* to the localized digits of *lang*.

    .. attention:: Be aware that this function only works for several
       languages, and that it returns an unchanged string if an
       unsupported language is given.

    .. versionchanged:: 7.5
       always return a string even `phrase` is an int.

    :param phrase: The phrase to convert to localized numerical
    :param lang: Language code
    :return: The localized version
    """
    text = str(phrase)
    local_digits = NON_ASCII_DIGITS.get(lang)
    if not local_digits:
        # no mapping known for this language; return input unchanged
        return text
    return text.translate(str.maketrans('0123456789', local_digits))
def to_ascii_digits(phrase: str,
                    langs: Sequence[str] | str | None = None) -> str:
    """Change non-ascii digits to ascii digits.

    .. versionadded:: 7.0
    .. versionchanged:: 10.3
       this function was renamed from to_latin_digits.

    :param phrase: The phrase to convert to ascii numerical.
    :param langs: Language codes. If langs parameter is None, use all
        known languages to convert.
    :return: The string with ascii digits
    """
    if isinstance(langs, str):
        codes = [langs]
    elif langs is None:
        codes = NON_ASCII_DIGITS.keys()
    else:
        codes = langs

    digit_sets = [NON_ASCII_DIGITS[code] for code in codes
                  if code in NON_ASCII_DIGITS]
    if not digit_sets:
        return phrase

    # one translation table mapping every known localized digit set
    # back onto '0'-'9'
    table = str.maketrans(''.join(digit_sets),
                          '0123456789' * len(digit_sets))
    return phrase.translate(table)
def case_escape(case: str, string: str, *, underscore: bool = False) -> str:
    """Return an escaped regex pattern which depends on 'first-letter' case.

    .. versionadded:: 7.0
    .. versionchanged:: 8.4
       Added the optional *underscore* parameter.

    :param case: If `case` is 'first-letter', the regex contains an
        inline re.IGNORECASE flag for the first letter
    :param underscore: If True, expand the regex to detect spaces and
        underscores which are interchangeable and collapsible
    """
    if case != 'first-letter':
        pattern = re.escape(string)
    else:
        # only the first character is matched case-insensitively
        first, rest = string[:1], string[1:]
        pattern = f'(?i:{first}){re.escape(rest)}'
    # spaces are escaped by re.escape, hence the r'\\ ' alternative
    return re.sub(r'_|\\ ', '[_ ]+', pattern) if underscore else pattern
class MultiTemplateMatchBuilder:

    """Build template matcher."""

    def __init__(self, site) -> None:
        """Initializer.

        :param site: site whose template namespace is used to build the
            patterns
        """
        self.site = site

    def pattern(self, template, flags=re.DOTALL):
        """Return a compiled regex to match template.

        :param template: template title as str, or a pywikibot.Page in
            the Template namespace (ns 10)
        :param flags: flags passed to :func:`re.compile`
        :raises ValueError: *template* is neither a template Page nor a str
        """
        # TODO: add ability to also match contents within the template
        # TODO: add option for template to be None to match any template
        # TODO: merge regex with NESTED_TEMPLATE_REGEX
        namespace = self.site.namespaces[10]
        if isinstance(template, pywikibot.Page):
            if template.namespace() != 10:
                raise ValueError(f'{template} is not a template Page object')
            old = template.title(with_ns=False)
        elif isinstance(template, str):
            old = template
        else:
            raise ValueError(f'{template!r} is not a valid template')

        pattern = case_escape(namespace.case, old)
        # namespaces may be any mixed case
        namespaces = [ignore_case(ns) for ns in namespace]
        namespaces.append(ignore_case('msg'))
        # spaces and underscores in the title are interchangeable
        pattern = re.sub(r'_|\\ ', r'[_ ]', pattern)
        templateRegexP = (
            r'{{\s*(%(namespace)s:)?%(pattern)s'
            r'(?P<parameters>\s*\|[^{]+?'
            r'((({{{[^{}]+?}}}|{{[^{}]+?}}|{[^{}]*?})[^{]*?)*?)?'
            r'|)\s*}}'
        ) % {'namespace': ':|'.join(namespaces), 'pattern': pattern}
        return re.compile(templateRegexP, flags)

    def search_any_predicate(self, templates):
        """Return a predicate that matches any template.

        :param templates: iterable of templates accepted by :meth:`pattern`
        :return: a callable taking a text and returning True if any of
            the given templates matches
        """
        predicates = [self.pattern(template).search for template in templates]
        return lambda text: any(predicate(text) for predicate in predicates)
def ignore_case(string: str) -> str:
    """Return a case-insensitive pattern for the string.

    .. versionchanged:: 7.2
       `_ignore_case` becomes a public method
    """
    parts = []
    for original, swapped in zip(string, string.swapcase()):
        # characters without a case counterpart stay as-is
        parts.append(original if swapped == original
                     else f'[{swapped}{original}]')
    return ''.join(parts)
def _tag_pattern(tag_name: str) -> str:
    """Return a regex pattern string matching a complete *tag_name* element."""
    # start tag: either a bare '<tag>' or one with attributes that is
    # not self-closing (negative lookbehind on '/')
    open_tag = rf'<{ignore_case(tag_name)}(?:>|\s+[^>]*(?<!/)>)'
    close_tag = rf'</{ignore_case(tag_name)}\s*>'
    # non-greedy contents between the tags, newlines included
    return open_tag + r'[\s\S]*?' + close_tag


def _tag_regex(tag_name: str):
    """Return a compiled regex matching a complete *tag_name* element."""
    return re.compile(_tag_pattern(tag_name))
def _create_default_regexes() -> None:
    """Fill (and possibly overwrite) ``_regex_cache`` with default regexes.

    The following keys are provided: ``category``, ``comment``, ``file``,
    ``header``, ``hyperlink``, ``interwiki``, ``invoke``, ``link``,
    ``pagelist``, ``property``, ``startcolon``, ``startspace``, ``table``,
    ``template``.

    :meta public:
    """
    # Entries are either an already compiled, site-independent pattern,
    # or a (pattern, site -> substitution) tuple compiled lazily per
    # site by get_regexes().
    _regex_cache.update({
        # categories
        'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
                     lambda site: '|'.join(site.namespaces[14])),
        'comment': re.compile(r'<!--[\s\S]*?-->'),
        # files
        'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
        # section headers
        'header': re.compile(
            r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*'
            r'(=(?:[^\n]|<!--[\s\S]*?-->)+=)'
            r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
        # external links
        'hyperlink': compileLinkR(),
        # also finds links to foreign sites with preceding ":"
        'interwiki': (
            r'\[\[:?(%s)\s?:[^\]]*\]\]\s*',
            lambda site: '|'.join(
                ignore_case(i) for i in site.validLanguageLinks()
                + list(site.family.obsolete.keys()))),
        # Module invocations (currently only Lua)
        'invoke': (
            r'\{\{\s*\#(?:%s):[\s\S]*?\}\}',
            lambda site: '|'.join(
                ignore_case(mw) for mw in site.getmagicwords('invoke'))),
        # this matches internal wikilinks, but also interwiki, categories,
        # and images.
        'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
        # pagelist tag (used in Proofread extension).
        'pagelist': re.compile(r'<{}[\s\S]*?/>'
                               .format(ignore_case('pagelist'))),
        # Wikibase property inclusions
        'property': (
            r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}',
            lambda site: '|'.join(
                ignore_case(mw) for mw in site.getmagicwords('property'))),
        # lines that start with a colon or more will be indented
        'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
        # lines that start with a space are shown in a monospace font and
        # have whitespace preserved.
        'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
        # tables often have whitespace that is used to improve wiki
        # source code readability.
        # TODO: handle nested tables.
        'table': re.compile(
            r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
        'template': NESTED_TEMPLATE_REGEX,
    })
def get_regexes(
    keys: str | Iterable[str],
    site: pywikibot.site.BaseSite | None = None
) -> list[re.Pattern[str]]:
    """Fetch compiled regexes.

    .. versionchanged:: 8.2
       ``_get_regexes`` becomes a public function. *keys* may be a
       single string; *site* is optional.

    :param keys: A single key or an iterable of keys whose regex pattern
        should be given
    :param site: A BaseSite object needed for ``category``, ``file``,
        ``interwiki``, ``invoke`` and ``property`` keys
    :raises ValueError: Site cannot be None.
    """
    if not _regex_cache:
        _create_default_regexes()

    if isinstance(keys, str):
        keys = [keys]

    result = []
    for exc in keys:
        if not isinstance(exc, str):
            # assume it's a regular expression
            result.append(exc)
            continue

        # assume the string is a reference to a standard regex above,
        # which may not yet have a site specific re compiled.
        if exc not in _regex_cache:
            # nowiki, noinclude, includeonly, timeline, math and other
            # extensions
            _regex_cache[exc] = _tag_regex(exc)
            result.append(_regex_cache[exc])
        elif not isinstance(_regex_cache[exc], tuple):
            # already compiled, site-independent pattern
            result.append(_regex_cache[exc])
        else:
            # (pattern, site -> substitution) tuple: compile per site
            # and cache under the (key, site) pair
            if not site and exc in ('interwiki', 'property', 'invoke',
                                    'category', 'file'):
                raise ValueError(f'site cannot be None for the {exc!r} regex')

            if (exc, site) not in _regex_cache:
                re_text, re_var = _regex_cache[exc]
                _regex_cache[(exc, site)] = re.compile(
                    re_text % re_var(site), re.VERBOSE)

            result.append(_regex_cache[(exc, site)])

        # handle aliases
        if exc == 'source':
            result.append(_tag_regex('syntaxhighlight'))
        elif exc == 'syntaxhighlight':
            result.append(_tag_regex('source'))
        elif exc == 'chem':
            result.append(_tag_regex('ce'))
        elif exc == 'math':
            result.append(_tag_regex('chem'))
            result.append(_tag_regex('ce'))

    return result
def replaceExcept(text: str,
                  old: str | re.Pattern[str],
                  new: str | Callable[[re.Match[str]], str],
                  exceptions: Sequence[str | re.Pattern[str]],
                  caseInsensitive: bool = False,
                  allowoverlap: bool = False,
                  marker: str = '',
                  site: pywikibot.site.BaseSite | None = None,
                  count: int = 0) -> str:
    """Return text with *old* replaced by *new*, ignoring specified text types.

    Skip occurrences of *old* within *exceptions*; e.g. within nowiki
    tags or HTML comments. If *caseInsensitive* is true, then use case
    insensitive regex matching. If *allowoverlap* is true, overlapping
    occurrences are all replaced

    .. caution:: Watch out when using *allowoverlap*, it might lead to
       infinite loops!

    :param text: Text to be modified
    :param old: A compiled or uncompiled regular expression
    :param new: A string (which can contain regular expression
        references), or a function which takes a match object as
        parameter. See parameter *repl* of ``re.sub()``.
    :param exceptions: A list of strings or already compiled regex
        objects which signal what to leave out. List of strings might be
        like ``['math', 'table', 'template']`` for example.
    :param marker: A string that will be added to the last replacement;
        if nothing is changed, it is added at the end
    :param count: How many replacements to do at most. See parameter
        *count* of ``re.sub()``.
    """
    # if we got a string, compile it as a regular expression
    if isinstance(old, str):
        old = re.compile(old, flags=re.IGNORECASE if caseInsensitive else 0)

    # early termination if not relevant
    if not old.search(text):
        return text + marker

    dontTouchRegexes = get_regexes(exceptions, site)

    index = 0
    replaced = 0
    markerpos = len(text)
    while not count or replaced < count:
        if index > len(text):
            break

        match = old.search(text, index)
        if not match:
            # nothing left to replace
            break

        # check which exception will occur next.
        nextExceptionMatch = None
        for dontTouchR in dontTouchRegexes:
            excMatch = dontTouchR.search(text, index)
            if excMatch and (
                    nextExceptionMatch is None
                    or excMatch.start() < nextExceptionMatch.start()):
                nextExceptionMatch = excMatch

        if nextExceptionMatch is not None \
           and nextExceptionMatch.start() <= match.start():
            # an HTML comment or text in nowiki tags stands before the next
            # valid match. Skip.
            index = nextExceptionMatch.end()
            continue

        # We found a valid match. Replace it.
        if callable(new):
            # the parameter new can be a function which takes the match
            # as a parameter.
            replacement = new(match)
        else:
            # it is not a function, but a string.

            # it is a little hack to make \n work. It would be better
            # to fix it previously, but better than nothing.
            new = new.replace('\\n', '\n')

            # We cannot just insert the new string, as it may contain regex
            # group references such as \2 or \g<name>.
            # On the other hand, this approach does not work because it
            # can't handle lookahead or lookbehind (see bug T123185).
            # So we have to process the group references manually.
            replacement = ''

            group_regex = re.compile(r'\\(\d+)|\\g<(.+?)>')
            last = 0
            for group_match in group_regex.finditer(new):
                group_id = group_match[1] or group_match[2]
                # numbered group references become int keys
                with suppress(ValueError):
                    group_id = int(group_id)

                try:
                    replacement += new[last:group_match.start()]
                    replacement += match[group_id] or ''
                except IndexError:
                    raise IndexError(f'Invalid group reference: {group_id}\n'
                                     f'Groups found: {match.groups()}')
                last = group_match.end()
            replacement += new[last:]

        text = text[:match.start()] + replacement + text[match.end():]

        # continue the search on the remaining text
        if allowoverlap:
            index = match.start() + 1
        else:
            index = match.start() + len(replacement)
        if not match.group():
            # When the regex allows to match nothing, shift by one char
            index += 1
        markerpos = match.start() + len(replacement)
        replaced += 1

    return text[:markerpos] + marker + text[markerpos:]
def removeDisabledParts(text: str,
                        tags: Iterable | None = None,
                        include: Container | None = None,
                        site: pywikibot.site.BaseSite | None = None
                        ) -> str:
    """Return text without portions where wiki markup is disabled.

    Parts that will be removed by default are:

    * HTML comments
    * nowiki tags
    * pre tags
    * includeonly tags
    * source and syntaxhighlight tags

    .. versionchanged:: 7.0
       the order of removals will correspond to the tags argument
       if provided as an ordered collection (list, tuple)

    :param tags: The exact set of parts which should be removed using
        keywords from :func:`get_regexes`.
    :param include: Or, in alternative, default parts that shall not
        be removed.
    :param site: Site to be used for site-dependent regexes. Default
        disabled parts listed above do not need it.
    :return: Text stripped from disabled parts.
    """
    if not tags:
        tags = ['comment', 'includeonly', 'nowiki', 'pre', 'syntaxhighlight']

    # NOTE: *tags* stays an ordered collection, never a set: string
    # hashes are salted per process, so a set would yield a different
    # removal order per run, and overlapping disabled parts may
    # suppress each other depending on that order.
    # see https://docs.python.org/3/reference/datamodel.html#object.__hash__
    # ("Note" at the end of the section)
    if include:
        tags = [candidate for candidate in tags if candidate not in include]

    for pattern in get_regexes(tags, site):
        text = pattern.sub('', text)
    return text
def removeHTMLParts(text: str, keeptags: list[str] | None = None, *,
                    removetags: list[str] | None = None) -> str:
    """Remove selected HTML tags, their content, and comments from text.

    This function removes HTML tags and their contents for tags listed
    in ``removetags``. Tags specified in ``keeptags`` are preserved
    along with their content and markup. It is a thin wrapper around the
    :class:`GetDataHTML` parser class.

    **Example:**

    >>> remove = removeHTMLParts
    >>> remove('<div><b><ref><tt>Hi all!</tt></ref></b></div>')
    '<tt>Hi all!</tt>'
    >>> remove('<style><b>This is stylish</b></style>', keeptags=['style'])
    '<style></style>'
    >>> remove('<a>Note:</a> <b>This is important!<!-- really? --></b>')
    'Note: This is important!'
    >>> remove('<a>Note:</a> <b>This is important!</b>', removetags=['a'])
    ' This is important!'

    .. caution:: Tag names must be given in lowercase.

    .. versionchanged:: 10.3
       The *removetags* parameter was added. Refactored to use
       :class:`GetDataHTML` and its ``__call__`` method. Tag attributes
       will be kept.

    :param text: The input HTML text to clean.
    :param keeptags: List of tag names to keep, including their content
        and markup. Defaults to :code:`['tt', 'nowiki', 'small', 'sup']`
        if None.
    :param removetags: List of tag names whose tags and content should
        be removed. The tags can be preserved if listed in *keeptags*.
        Defaults to :code:`['style', 'script']` if None.
    :return: The cleaned text with specified HTML parts removed.
    """
    parser = GetDataHTML(keeptags=keeptags, removetags=removetags)
    return parser(text)
[docs] @dataclass(init=False, eq=False) class GetDataHTML(HTMLParser): """HTML parser that removes unwanted HTML elements and optionally comments. Tags listed in *keeptags* are preserved. Tags listed in *removetags* are removed entirely along with their content. Optionally strips HTML comments. Use via the callable interface or in a :code:`with closing(...)` block. .. note:: The callable interface is preferred because it is simpler and ensures proper resource management automatically. If using the context manager, be sure to access :attr:`textdata` before calling :meth:`close`. .. tabs:: .. tab:: callable interface .. code-block:: python text = ('<html><head><title>Test</title></head>' '<body><h1><!-- Parse --> me!</h1></body></html>') parser = GetDataHTML(keeptags = ['html']) clean_text = parser(text) .. tab:: closing block .. code-block:: python from contextlib import closing text = ('<html><head><title>Test</title></head>' '<body><h1><!-- Parse --> me!</h1></body></html>') parser = GetDataHTML(keeptags = ['html']) with closing(parser): parser.feed(text) clean_text = parser.textdata .. warning:: Save the :attr:`textdata` **before** :meth:`close` is called; otherwise the cleaned text is empty. **Usage:** >>> text = ('<html><head><title>Test</title></head>' ... '<body><h1><!-- Parse --> me!</h1></body></html>') >>> GetDataHTML()(text) 'Test me!' >>> GetDataHTML(keeptags=['title'])(text) '<title>Test</title> me!' >>> GetDataHTML(removetags=['body'])(text) 'Test' .. caution:: Tag names must be given in lowercase. .. versionchanged:: 9.2 No longer a context manager .. versionchanged:: 10.3 Public class now. Added support for removals of tag contents. .. seealso:: - :func:`removeHTMLParts` - :pylib:`html.parser` :param keeptags: List of tag names to keep, including their content and markup. Defaults to :code:`['tt', 'nowiki', 'small', 'sup']` if None. :param removetags: List of tag names whose tags and content should be removed. 
The tags can be preserved if listed in *keeptags*. Defaults to :code:`['style', 'script']` if None. :param removecomments: Whether to remove HTML comments. Defaults to True. """ def __init__(self, *, keeptags: list[str] | None = None, removetags: list[str] | None = None) -> None: """Initialize default tags and internal state.""" super().__init__() self.keeptags: list[str] = (keeptags if keeptags is not None else ['tt', 'nowiki', 'small', 'sup']) self.removetags: list[str] = (removetags if removetags is not None else ['style', 'script']) #: The cleaned output text collected during parsing. self.textdata = '' self._skiptag: str | None = None def __call__(self, text: str) -> str: """Feed the parser with *text* and return cleaned :attr:`textdata`. :param text: The HTML text to parse and clean. :return: The cleaned text with unwanted tags/content removed. """ with closing(self): self.feed(text) return self.textdata
[docs] def close(self) -> None: """Clean current processing and clear :attr:`textdata`.""" self.textdata = '' self._skiptag = None super().close()
[docs] def handle_data(self, data: str) -> None: """Handle plain text content found between tags. Text is added to the output unless it is located inside a tag marked for removal. :param data: The text data between HTML tags. """ if not self._skiptag: self.textdata += data
[docs] def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: """Handle an opening HTML tag. Tags listed in *keeptags* are preserved in the output. Tags listed in *removetags* begin a skip block, and their content will be excluded from the output. .. versionchanged:: 10.3 Keep tag attributes. :param tag: The tag name (e.g., "div", "script") converted to lowercase. :param attrs: A list of (name, value) pairs with tag attributes. """ if tag in self.keeptags: # Reconstruct attributes for preserved tags attr_text = ''.join( f' {name}' if value is None else f' {name}="{value}"' for name, value in attrs ) self.textdata += f'<{tag}{attr_text}>' if tag in self.removetags: self._skiptag = tag
[docs] def handle_endtag(self, tag: str) -> None: """Handle a closing HTML tag. Tags listed in *keeptags* are preserved in the output. A closing tag that matches the currently skipped tag will end the skip block. :param tag: The name of the closing tag. """ if tag in self.keeptags: self.textdata += f'</{tag}>' if tag in self.removetags and tag == self._skiptag: self._skiptag = None
def isDisabled(text: str, index: int, tags=None) -> bool:
    """Return True if text[index] is disabled, e.g. by a comment or nowiki tag.

    For the tags parameter, see :py:obj:`removeDisabledParts`.
    """
    # Inject a unique sentinel at *index*; if stripping the disabled
    # parts also removes the sentinel, the position was disabled.
    sentinel = findmarker(text)
    probed = text[:index] + sentinel + text[index:]
    return sentinel not in removeDisabledParts(probed, tags)
[docs] def findmarker(text: str, startwith: str = '@@', append: str | None = None) -> str: """Find a string which is not part of text.""" if not append: append = '@' mymarker = startwith while mymarker in text: mymarker += append return mymarker
def expandmarker(text: str, marker: str = '', separator: str = '') -> str:
    """Return a marker expanded whitespace and the separator.

    It searches for the first occurrence of the marker and gets the
    combination of the separator and whitespace directly before it.

    :param text: The text which will be searched.
    :param marker: The marker to be searched.
    :param separator: The separator string allowed before the marker.
        If empty it won't include whitespace too.
    :return: The marker with the separator and whitespace from the text
        in front of it. It'll be just the marker if the separator is
        empty.
    """
    if not separator:
        return marker

    # Walk backwards from the marker, consuming any number of separator
    # occurrences and control/whitespace chars (< ' ') between them.
    marker_start = text.find(marker)
    pos = marker_start
    sep_len = len(separator)
    while pos > 0:
        if pos >= sep_len and text[pos - sep_len:pos] == separator:
            pos -= sep_len
        elif text[pos - 1] < ' ':
            pos -= 1
        else:
            break
    return text[pos:marker_start] + marker
def add_text(text: str, add: str, *, site=None) -> str:
    """Add text to a page content above categories and interwiki.

    .. versionadded:: 6.4

    :param text: The page content to add text to.
    :param add: Text to add.
    :param site: The site that the text is coming from. Required for
        reorder of categories and interlanguage links. The default site
        is used otherwise.
    :type site: pywikibot.Site
    """
    # Interpret literal '\n' sequences (e.g. from the command line).
    addition = add.replace('\\n', '\n')

    # Remember categories and interlanguage links, then strip them so
    # they can be re-appended below the added text.
    categories = getCategoryLinks(text, site)
    text = removeCategoryLinks(text, site)
    langlinks = getLanguageLinks(text, site)
    text = removeLanguageLinks(text, site)

    # Append the new text, then restore categories and interwiki links.
    text += '\n' + addition
    text = replaceCategoryLinks(text, categories, site, add_only=True)
    return replaceLanguageLinks(text, langlinks, site)
# ------------------------------- # Functions dealing with sections # ------------------------------- #: Head pattern HEAD_PATTERN = re.compile(r'(={1,6}).+\1', re.DOTALL) TITLE_PATTERN = re.compile("'{3}([^']+)'{3}") class _Heading(NamedTuple): text: str start: int end: int
class Section(NamedTuple):

    """A namedtuple as part of :class:`Content` describing a page section.

    .. versionchanged:: 8.2
       ``_Section`` becomes a public class.
    """

    title: str  #: section title including equal signs
    content: str  #: section content

    @property
    def level(self) -> int:
        """Return the section level.

        .. versionadded:: 8.2
        """
        # number of leading '=' signs determines the level
        match = HEAD_PATTERN.match(self.title)
        return len(match.group(1))

    @property
    def heading(self) -> str:
        """Return the section title without equal signs.

        .. versionadded:: 8.2
        .. versionchanged:: 11.0
           Invisible chars like LTR or RTO are removed.
        """
        depth = self.level
        stripped = self.title[depth:-depth].strip()
        return INVISIBLE_REGEX.sub('', stripped)
class SectionList(list):

    """List of :class:`Section` objects with heading/level-aware index().

    Introduced for handling lists of sections with custom lookup by
    :attr:`Section.heading` and :attr:`level<Section.level>`.

    .. versionadded:: 10.4
    """

    def __contains__(self, value: object) -> bool:
        """Check if a section matching the given value exists.

        :param value: The section heading string, a (heading, level)
            tuple, or a :class:`Section` instance to search for.
        :return: ``True`` if a matching section exists, ``False``
            otherwise.
        """
        try:
            self.index(value)
        except ValueError:
            return False
        return True

    def count(self, value: str | tuple[str, int] | Section, /) -> int:
        """Count the number of sections matching the given value.

        :param value: The section heading string, a (heading, level)
            tuple, or a :class:`Section` instance to search for.
        :return: The number of matching sections.
        """
        # a Section is itself a tuple, so test it first
        if isinstance(value, Section):
            return super().count(value)

        if isinstance(value, tuple) and len(value) == 2:
            wanted_heading, wanted_level = value
            return len([sec for sec in self
                        if (sec.heading, sec.level) == (wanted_heading,
                                                        wanted_level)])

        if isinstance(value, str):
            return len([sec for sec in self if sec.heading == value])

        return super().count(value)

    def index(
        self,
        value: str | tuple[str, int] | Section,
        start: int = 0,
        stop: int = sys.maxsize,
        /,
    ) -> int:
        """Return the index of a matching section.

        Works like ``list.index(value, start, stop)`` but also allows:

        - *value* as a string → match by :attr:`Section.heading`
          (any level)
        - *value* as a ``(heading, level)`` tuple → match both
          :attr:`heading<Section.heading>` and
          :attr:`level<Section.level>`
        - *value* as a ``Section`` object → normal list.index() behavior

        :param value: The item to search for. May be:

            - ``str`` — search by section heading.
            - ``tuple[str, int]`` — search by heading and section level.
            - :class:`Section` — search for an exact section object.

        :param start: Index to start searching from (inclusive).
        :param stop: Index to stop searching at (exclusive).
        :return: The integer index of the matching section.
        :raises ValueError: If no matching section is found.
        """
        # Normalize negative indices
        length = len(self)
        if start < 0:
            start = max(0, length + start)
        if stop < 0:
            stop = max(0, length + stop)

        # a Section is itself a tuple, so test it first
        if isinstance(value, Section):
            return super().index(value, start, stop)

        if isinstance(value, tuple) and len(value) == 2:
            heading, level = value

            def matches(sec) -> bool:
                return sec.heading == heading and sec.level == level

            failure = f'{value!r} not found in Section headings/levels'
        elif isinstance(value, str):
            def matches(sec) -> bool:
                return sec.heading == value

            failure = f'{value!r} not found in Section headings'
        else:
            return super().index(value, start, stop)

        for pos, sec in enumerate(self[start:stop], start):
            if matches(sec):
                return pos
        raise ValueError(failure)
class Content(NamedTuple):

    """A namedtuple as result of :func:`extract_sections` holding page content.

    .. versionchanged:: 8.2
       ``_Content`` becomes a public class.
    """

    header: str  #: the page header
    sections: SectionList[Section]  #: the page sections
    footer: str  #: the page footer

    @property
    def title(self) -> str:
        """Return the first main title found on the page.

        The first main title is anything enclosed within triple quotes.

        .. versionadded:: 8.2
        """
        found = TITLE_PATTERN.search(self.header)
        if found is None:
            return ''
        return found[1].strip()
def _extract_headings(text: str) -> list[_Heading]:
    """Collect :class:`_Heading` entries for enabled headings in *text*."""
    heading_regex = get_regexes('header')[0]
    found = []
    for match in heading_regex.finditer(text):
        begin, finish = match.span(1)
        # Skip headings inside disabled wikitext (comments, nowiki, ...).
        if isDisabled(text, begin) or isDisabled(text, finish):
            continue
        found.append(_Heading(match[1], begin, finish))
    return found


def _extract_sections(text: str, headings) -> list[Section]:
    """Return a list of :class:`Section` objects."""
    sections = SectionList()
    if not headings:
        return sections

    # Each section's content runs from the end of its heading to the
    # start of the next heading; the last one runs to end of text.
    for current, following in pairwise(headings):
        sections.append(
            Section(current.text, text[current.end:following.start]))
    final = headings[-1]
    sections.append(Section(final.text, text[final.end:]))
    return sections
def extract_sections(
    text: str,
    site: pywikibot.site.BaseSite | None = None,
) -> Content:
    """Return section headings and contents found in text.

    The returned namedtuple :class:`Content` contains the text parsed
    into *header*, *sections* and *footer* parts. The main title found
    in the header which is the first text enclosed with ''' like
    '''page title''' can be given by the *title* property.

    The header part is a string containing text part above the first
    heading.

    The sections part is a list of :class:`Section` namedtuples, each
    tuple containing a string with section title (including equal signs),
    and a string with the section content. In addition the section
    heading (the title without equal signs) can be given by the *heading*
    property. Also the section level can be found by the *level*
    property which is the number of the equal signs around the section
    heading.

    The footer part is also a string containing text part after the
    last section.

    **Examples:**

    >>> text = \"\"\"
    ... '''this''' is a Python module.
    ...
    ... == History of this ==
    ... This set of principles was posted in 1999...
    ...
    ... == Usage of this ==
    ... Enter "import this" for usage...
    ...
    ... === Details ===
    ... The Zen of Python...
    ...
    ... [[Category:Programming principles]]
    ... \"\"\"
    >>> site = pywikibot.Site('wikipedia:en')
    >>> result = extract_sections(text, site)
    >>> result.header.strip()
    "'''this''' is a Python module."
    >>> result.sections[0].title
    '== History of this =='
    >>> result.sections[1].content.strip()
    'Enter "import this" for usage...'
    >>> 'Details' in result.sections
    True
    >>> ('Details', 2) in result.sections
    False
    >>> result.sections.index('Details')
    2
    >>> result.sections.index(('Details', 2))
    Traceback (most recent call last):
    ...
    ValueError: ('Details', 2) not found in Section headings/levels
    >>> result.sections[2].heading
    'Details'
    >>> result.sections[2].level
    3
    >>> result.footer.strip()
    '[[Category:Programming principles]]'
    >>> result.title
    'this'

    .. note:: sections and text from templates are not extracted but
       embedded as plain text.
    .. versionadded:: 3.0
    .. versionchanged:: 8.2
       The :class:`Content` and :class:`Section` class have additional
       properties.
    .. versionchanged:: 10.4
       Added custom ``index()``, ``count()`` and ``in`` operator
       support for :attr:`Content.sections`.

    :return: The parsed namedtuple.
    """  # noqa: D300, D301
    headings = _extract_headings(text)
    sections = _extract_sections(text, headings)

    # Find header and footer contents
    header = text[:headings[0].start] if headings else text

    cat_regex, interwiki_regex = get_regexes(['category', 'interwiki'], site)
    # Dropping ':?' presumably makes the leading colon disallowed, so
    # in-line text links like [[:en:Foo]] do not count — confirm
    # against the 'interwiki' regex definition.
    langlink_pattern = interwiki_regex.pattern.replace(':?', '')
    last_section_content = sections[-1].content if sections else header
    # The footer is the trailing run of language links, category links
    # and whitespace at the very end of the last section (or header).
    footer = re.search(
        fr'({langlink_pattern}|{cat_regex.pattern}|\s)*\Z',
        last_section_content).group().lstrip()

    if footer:
        if sections:
            # Strip the footer from the last section's content.
            sections[-1] = Section(
                sections[-1].title, last_section_content[:-len(footer)])
        else:
            header = header[:-len(footer)]

    return Content(header, sections, footer)
# -----------------------------------------------
# Functions dealing with interwiki language links
# -----------------------------------------------
# Note - MediaWiki supports several kinds of interwiki links; two kinds
# are inter-language links. We deal here with those kinds only.
# A family has by definition only one kind of inter-language links:
# 1 - inter-language links inside the own family.
#     They go to a corresponding page in another language in the same
#     family, such as from 'en.wikipedia' to 'pt.wikipedia', or from
#     'es.wiktionary' to 'ar.wiktionary'.
#     Families with this kind have several language-specific sites.
#     They have their interwiki_forward attribute set to None
# 2 - language links forwarding to another family.
#     They go to a corresponding page in another family, such as from
#     'commons' to 'zh.wikipedia', or from 'incubator' to 'en.wikipedia'.
#     Families having those have one member only, and do not have
#     language-specific sites. The name of the target family of their
#     inter-language links is kept in their interwiki_forward attribute.
# These functions only deal with links of these two kinds. They do not
# find or change links of other kinds, nor any that are formatted as
# in-line interwiki links (e.g., "[[:es:Artículo]]").
def removeLanguageLinksAndSeparator(text: str, site=None, marker: str = '',
                                    separator: str = '') -> str:
    """Return text with inter-language links and preceding separators removed.

    If a link to an unknown language is encountered, a warning is
    printed.

    :param text: The text that needs to be modified.
    :param site: The site that the text is coming from.
    :type site: pywikibot.Site
    :param marker: If defined, marker is placed after the last language
        link, or at the end of text if there are no language links.
    :param separator: The separator string that will be removed if
        followed by the language links.
    :return: The modified text
    """
    if not separator:
        return removeLanguageLinks(text, site, marker)

    # Drop the links with a unique placeholder first, grow the
    # placeholder over the preceding separator, then substitute the
    # caller's marker.
    placeholder = findmarker(text, '@L@')
    stripped = removeLanguageLinks(text, site, placeholder)
    placeholder = expandmarker(stripped, placeholder, separator)
    return stripped.replace(placeholder, marker)
def interwikiFormat(links: dict, insite=None) -> str:
    """Convert interwiki link dict into a wikitext string.

    :param links: Interwiki links to be formatted
    :type links: Dict with the Site objects as keys, and Page or Link
        objects as values.
    :param insite: Site the interwiki links will be formatted for
        (defaulting to the current site).
    :type insite: BaseSite
    :return: String including wiki links formatted for inclusion in
        insite
    :raises ValueError: a dict value is neither a Page nor a Link
    """
    if not links:
        return ''

    if insite is None:
        insite = pywikibot.Site()

    entries = []
    for site in interwikiSort(list(links.keys()), insite):
        target = links[site]
        if isinstance(target, pywikibot.Link):
            # Upgrade Link values to Page objects in place.
            target = pywikibot.Page(target)
            links[site] = target
        if not isinstance(target, pywikibot.Page):
            raise ValueError('links dict must contain Page or Link objects')
        title = target.title(as_link=True, force_interwiki=True,
                             insite=insite)
        entries.append(title.replace('[[:', '[['))

    if insite.code in insite.family.interwiki_on_one_line:
        separator = ' '
    else:
        separator = '\n'
    return separator.join(entries) + '\n'
def interwikiSort(sites, insite=None):
    """Sort sites according to local interwiki sort logic."""
    if not sites:
        return []

    if insite is None:
        insite = pywikibot.Site()

    sites.sort()
    preferred = insite.interwiki_putfirst()
    if not preferred:
        return sites

    # Pull the preferred sites to the front, keeping putfirst order.
    front = []
    valid_links = insite.validLanguageLinks()
    for code in preferred:
        if code not in valid_links:
            continue
        site = insite.getSite(code=code)
        if site in sites:
            sites.remove(site)
            front.append(site)
    return front + sites
# ------------------------------------- # Functions dealing with category links # -------------------------------------
def removeCategoryLinksAndSeparator(text: str, site=None, marker: str = '',
                                    separator: str = '') -> str:
    """Return text with category links and preceding separators removed.

    :param text: The text that needs to be modified.
    :param site: The site that the text is coming from.
    :type site: pywikibot.Site
    :param marker: If defined, marker is placed after the last category
        link, or at the end of text if there are no category links.
    :param separator: The separator string that will be removed if
        followed by the category links.
    :return: The modified text
    """
    if site is None:
        site = pywikibot.Site()

    if not separator:
        return removeCategoryLinks(text, site, marker)

    # Remove the links with a unique placeholder, extend the
    # placeholder over the preceding separator, then substitute the
    # caller's marker.
    placeholder = findmarker(text, '@C@')
    stripped = removeCategoryLinks(text, site, placeholder)
    placeholder = expandmarker(stripped, placeholder, separator)
    return stripped.replace(placeholder, marker)
def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None,
                           add_only: bool = False) -> str:
    """Replace old category with new one and return the modified text.

    :param oldtext: Content of the old category
    :param oldcat: :class:`pywikibot.Category` object of the old category
    :param newcat: :class:`Pywikibot.Category` object of the new category
    :param add_only: If add_only is True, the old category won't be replaced
        and the category given will be added after it.
    :return: The modified text
    """
    if site is None:
        site = pywikibot.Site()

    catNamespace = '|'.join(site.namespaces.CATEGORY)
    title = oldcat.title(with_ns=False)
    if not title:
        return oldtext

    # title might contain regex special characters
    title = case_escape(site.namespaces[14].case, title, underscore=True)

    # categoryR matches the bare category link (optionally with a sort
    # key after '|'); the \u200e/\u200f class also swallows trailing
    # bidi marks.
    categoryR = re.compile(
        rf'\[\[\s*({catNamespace})\s*:\s*{title}[\s\u200e\u200f]*'
        r'((?:\|[^]]+)?\]\])', re.IGNORECASE)
    # categoryRN additionally requires the link to be alone on its line
    # so the whole line (including the newline) can be removed.
    categoryRN = re.compile(
        rf'^[^\S\n]*\[\[\s*({catNamespace})\s*:\s*{title}[\s\u200e\u200f]*'
        r'((?:\|[^]]+)?\]\])[^\S\n]*\n', re.IGNORECASE | re.MULTILINE)
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight']
    if newcat is None:
        # First go through and try the more restrictive regex that removes
        # an entire line, if the category is the only thing on that line
        # (this prevents blank lines left over in category lists following
        # a removal)
        text = replaceExcept(oldtext, categoryRN, '', exceptions, site=site)
        text = replaceExcept(text, categoryR, '', exceptions, site=site)
    elif add_only:
        # Keep the old category and append the new one on the next line.
        text = replaceExcept(
            oldtext, categoryR,
            f'{oldcat.title(as_link=True, allow_interwiki=False)}\n'
            f'{newcat.title(as_link=True, allow_interwiki=False)}',
            exceptions, site=site
        )
    else:
        # \2 restores the original sort key (or bare ']]').
        text = replaceExcept(
            oldtext, categoryR,
            f'[[{site.namespace(14)}:{newcat.title(with_ns=False)}\\2',
            exceptions, site=site
        )
    return text
def categoryFormat(categories, insite=None) -> str:
    """Return a string containing links to all categories in a list.

    :param categories: A list of Category or Page objects or strings
        which can be either the raw name, [[Category:..]] or
        [[cat_localised_ns:...]].
    :type categories: Iterable
    :param insite: Used to localise the category namespace.
    :type insite: pywikibot.Site
    :return: String of categories
    """
    if not categories:
        return ''

    if insite is None:
        insite = pywikibot.Site()

    catLinks = []
    for category in categories:
        if isinstance(category, str):
            # Split an optional '|sortkey' suffix off the raw name.
            category, separator, sortKey = category.strip('[]').partition('|')
            sortKey = sortKey if separator else None
            # whole word if no ":" is present
            prefix = category.split(':', 1)[0]
            if prefix not in insite.namespaces[14]:
                category = f'{insite.namespace(14)}:{category}'
            category = pywikibot.Category(pywikibot.Link(category,
                                                         insite,
                                                         default_namespace=14),
                                          sort_key=sortKey)
        # Make sure a category is casted from Page to Category.
        elif not isinstance(category, pywikibot.Category):
            category = pywikibot.Category(category)
        link = category.aslink()
        catLinks.append(link)

    sep = ' ' if insite.category_on_one_line() else '\n'
    # Some people don't like the categories sorted
    # catLinks.sort()
    return sep.join(catLinks) + '\n'
# ------------------------------------- # Functions dealing with external links # -------------------------------------
def compileLinkR(withoutBracketed: bool = False, onlyBracketed: bool = False):
    """Return a regex that matches external links.

    The compiled pattern exposes the link in a named group ``url``.

    :param withoutBracketed: only match links that are not preceded by
        an opening square bracket
    :param onlyBracketed: only match links that are preceded by an
        opening square bracket; ignored if *withoutBracketed* is also
        set
    :return: compiled external link pattern
    :rtype: re.Pattern[str]
    """
    # RFC 2396 says that URLs may only contain certain characters.
    # For this regex we also accept non-allowed characters, so that the bot
    # will later show these links as broken ('Non-ASCII Characters in URL').
    # Note: While allowing dots inside URLs, MediaWiki will regard
    # dots at the end of the URL as not part of that URL.
    # The same applies to comma, colon and some other characters.
    notAtEnd = r'\]\s\.:;,<>"\|\)}'
    # So characters inside the URL can be anything except whitespace,
    # closing squared brackets, quotation marks, greater than and less
    # than, and the last character also can't be parenthesis or another
    # character disallowed by MediaWiki.
    notInside = r'\]\s<>"'
    # The first half of this regular expression is required because '' is
    # not allowed inside links. For example, in this wiki text:
    #       ''Please see https://www.example.org.''
    # .'' shouldn't be considered as part of the link.
    # Use the idiomatic 'https?' instead of the one-element class
    # '[s]?'; both are equivalent.
    regex = (
        rf"(?P<url>https?://[^{notInside}]*?[^{notAtEnd}]"
        rf"(?=[{notAtEnd}]*'')"
        rf'|https?://[^{notInside}]*[^{notAtEnd}])'
    )

    if withoutBracketed:
        regex = r'(?<!\[)' + regex
    elif onlyBracketed:
        regex = r'\[' + regex
    return re.compile(regex)
# -------------------------------- # Functions dealing with templates # --------------------------------
def extract_templates_and_params(
    text: str,
    remove_disabled_parts: bool = False,
    strip: bool = False,
) -> list[tuple[str, OrderedDict[str, str]]]:
    """Return a list of templates found in text.

    Return value is a list of tuples. There is one tuple for each use of
    a template in the page, with the template title as the first entry
    and a dict of parameters as the second entry. Parameters are
    indexed by strings; as in MediaWiki, an unnamed parameter is given a
    parameter name with an integer value corresponding to its position
    among the unnamed parameters, and if this results multiple
    parameters with the same name only the last value provided will be
    returned.

    This uses the package :py:obj:`mwparserfromhell` or
    :py:obj:`wikitextparser` as MediaWiki markup parser.
    ``mwparserfromhell`` is installed by default.

    There are minor differences between the two implementations.

    The parser packages preserves whitespace in parameter names and
    values.

    If there are multiple numbered parameters in the wikitext for the
    same position, MediaWiki will only use the last parameter value.
    e.g. `{{a| foo | 2 <!-- --> = bar | baz }}` is `{{a|1=foo|2=baz}}`
    To replicate that behaviour, enable both `remove_disabled_parts`
    and `strip` parameters.

    :param text: The wikitext from which templates are extracted
    :param remove_disabled_parts: If enabled, remove disabled wikitext
        such as comments and pre.
    :param strip: If enabled, strip arguments and values of templates.
    :return: List of template name and params

    .. versionchanged:: 6.1
       *wikitextparser* package is supported; either *wikitextparser*
       or *mwparserfromhell* is strictly recommended.
    """
    def explicit(param):
        # wikitextparser exposes 'showkey', mwparserfromhell exposes
        # 'positional' — normalize both to "parameter has an explicit
        # key".
        try:
            attr = param.showkey
        except AttributeError:
            attr = not param.positional
        return attr

    if remove_disabled_parts:
        text = removeDisabledParts(text)

    # wikitextparser is used when installed; mwparserfromhell is the
    # fallback (see the module-level try/except import).
    parser_name = wikitextparser.__name__
    pywikibot.debug(f'Using {parser_name!r} wikitext parser')

    result = []
    parsed = wikitextparser.parse(text)
    if parser_name == 'wikitextparser':
        templates = parsed.templates
        arguments = 'arguments'
    else:
        # Exclude parser functions like {{#if:...}} for mwparserfromhell.
        templates = parsed.ifilter_templates(
            matches=lambda x: not x.name.lstrip().startswith('#'),
            recursive=True)
        arguments = 'params'

    for template in templates:
        params = OrderedDict()
        for param in getattr(template, arguments):
            value = str(param.value)  # mwpfh needs upcast to str
            if strip:
                key = param.name.strip()
                if explicit(param):
                    value = param.value.strip()
                else:
                    value = str(param.value)
            else:
                key = str(param.name)
            params[key] = value
        result.append((template.name.strip(), params))
    return result
def extract_templates_and_params_regex_simple(text: str):
    """Extract top-level templates with params using only a simple regex.

    This function uses only a single regex, and returns
    an entry for each template called at the top-level of the wikitext.
    Nested templates are included in the argument values of the top-level
    template.

    This method will incorrectly split arguments when an
    argument value contains a '|', such as {{template|a={{b|c}} }}.

    :param text: The wikitext from which templates are extracted
    :return: List of template name and params
    :rtype: list of tuple of name and OrderedDict
    """
    found = []
    for match in NESTED_TEMPLATE_REGEX.finditer(text):
        name = match[1]
        raw_params = match[2]

        # {{a}} has no parameter part at all.
        args = [] if raw_params is None else raw_params.split('|')

        # Unnamed arguments get sequential numeric keys, like MediaWiki.
        counter = itertools.count(1)
        params = OrderedDict(
            arg.split('=', 1) if '=' in arg
            else (str(next(counter)), arg)
            for arg in args)

        found.append((name, params))

    return found
def glue_template_and_params(template_and_params) -> str:
    """Return wiki text of template glued from params.

    You can use items from extract_templates_and_params here to get
    an equivalent template wiki text (it may happen that the order
    of the params changes).

    :param template_and_params: a ``(name, params)`` pair as returned
        by :func:`extract_templates_and_params`
    :return: wikitext of the template call
    """
    template, params = template_and_params
    # Build all parameter lines in one join instead of repeated string
    # concatenation (quadratic in the worst case).
    text = ''.join(f'|{name}={value}\n' for name, value in params.items())
    return f'{{{{{template}\n{text}}}}}'
# -------------------------- # Page parsing functionality # --------------------------
def does_text_contain_section(pagetext: str, section: str) -> bool:
    """Determine whether the page text contains the given section title.

    It does not care whether a section string may contain spaces or
    underlines. Both will match.

    If a section parameter contains an internal link, it will match the
    section with or without a preceding colon which is required for a
    text link e.g. for categories and files.

    :param pagetext: The wikitext of a page
    :param section: A section of a page including wikitext markups
    """
    escaped = re.escape(section)
    # Allow an optional leading colon inside internal links.
    escaped = re.sub(r'\\\[\\\[(\\?:)?', r'\[\[\:?', escaped)
    # Treat spaces and underscores as interchangeable.
    escaped = re.sub(r'\\?[ _]', '[ _]', escaped)
    return re.search(f"=+[ ']*{escaped}[ ']*=+", pagetext) is not None
def reformat_ISBNs(text: str, match_func) -> str:
    """Reformat ISBNs.

    :param text: Text containing ISBNs
    :param match_func: Function to reformat matched ISBNs
    :type match_func: Callable
    :return: Reformatted text
    """
    # An ISBN is a run of digits and hyphens ending in a digit or a
    # check character X, immediately after the literal prefix 'ISBN '.
    return re.sub(r'(?<=ISBN )(?P<code>[\d\-]+[\dXx])', match_func, text)
# --------------------------------------- # Time parsing functionality (Archivebot) # --------------------------------------- TIMEGROUPS = ('time', 'tzinfo', 'year', 'month', 'day', 'hour', 'minute')
class TimeStripperPatterns(NamedTuple):

    """Hold precompiled timestamp patterns for :class:`TimeStripper`.

    Attribute order is important to avoid mismatch when searching.

    .. versionadded:: 8.0
    """

    time: re.Pattern[str]    # hour/minute pattern, searched first
    tzinfo: re.Pattern[str]  # timezone abbreviation pattern
    year: re.Pattern[str]
    month: re.Pattern[str]
    day: re.Pattern[str]     # searched last, after month is consumed
class TimeStripper:

    """Find timestamp in page and return it as pywikibot.Timestamp object.

    .. versionchanged:: 8.0
       *group* attribute is a set instead of a list.
       *patterns* is a :class:`TimeStripperPatterns` namedtuple instead
       of a list.

    **Example**:

    >>> site = pywikibot.Site('wikipedia:fr')
    >>> sign = 'Merci bien Xqt (d) 15 mai 2013 à 20:34 (CEST)'
    >>> ts = TimeStripper(site)
    >>> ts.timestripper(sign)  # doctest: +SKIP
    Timestamp(2013, 5, 15, 20, 34, tzinfo=TZoneFixedOffset(3600, Europe/Paris))
    """

    def __init__(self, site=None) -> None:
        """Initializer.

        Builds the month-name lookup table and the timestamp regexes
        for the given site's language.
        """
        self.site = pywikibot.Site() if site is None else site

        # Maps localized month names (long and short forms) to 1-based
        # month numbers.
        self.origNames2monthNum = {}

        # use first_lower/first_upper for those language where month names
        # were changed: T324310, T356175, T415880
        if self.site.lang in ('hy', 'it', 'vi'):
            functions = [first_upper, first_lower]
        else:
            functions = [str]

        for n, (long, short) in enumerate(self.site.months_names, start=1):
            for func in functions:
                self.origNames2monthNum[func(long)] = n
                self.origNames2monthNum[func(short)] = n
                # in some cases month in ~~~~ might end without dot even if
                # site.months_names do not.
                if short.endswith('.'):
                    self.origNames2monthNum[func(short[:-1])] = n

        timeR = (r'(?P<time>(?P<hour>([0-1]\d|2[0-3]))[:\.h]'
                 r'(?P<minute>[0-5]\d))')
        timeznR = r'\((?P<tzinfo>[A-Z]+)\)'
        # '\ub144' is the Korean year suffix character.
        yearR = r'(?P<year>(19|20)\d\d)(?:{})?'.format('\ub144')

        # if months have 'digits' as names, they need to be
        # removed; will be handled as digits in regex, adding d+{1,2}\.?
        escaped_months = [month for month in self.origNames2monthNum
                          if not month.strip('.').isdigit()]
        # match longest names first.
        escaped_months = [re.escape(month) for month in
                          sorted(escaped_months, reverse=True)]
        # work around for cs wiki: if month are in digits, we assume
        # that format is dd. mm. (with dot and spaces optional)
        # the last one is workaround for Korean
        if any(month.isdigit() for month in self.origNames2monthNum):
            self.is_digit_month = True
            monthR = r'(?P<month>({})|(?:1[012]|0?[1-9])\.)' \
                .format('|'.join(escaped_months))
            # '\uc77c' is the Korean day suffix character.
            dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))(?:{})' \
                   r'?\.?\s*(?:[01]?\d\.)?'.format('\uc77c')
        else:
            self.is_digit_month = False
            monthR = r'(?P<month>({}))'.format('|'.join(escaped_months))
            dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))\.?'

        # Attribute order matters: patterns are searched in this order
        # by timestripper().
        self.patterns = TimeStripperPatterns(
            re.compile(timeR),
            re.compile(timeznR),
            re.compile(yearR),
            re.compile(monthR),
            re.compile(dayR),
        )

        self._hyperlink_pat = re.compile(r'\[\s*?http[s]?://[^\]]*?\]')
        self._comment_pat = re.compile(r'<!--(.*?)-->')
        self._wikilink_pat = re.compile(
            r'\[\[(?P<link>[^\]\|]*?)(?P<anchor>\|[^\]]*)?\]\]')

        self.tzinfo = TZoneFixedOffset(self.site.siteinfo['timeoffset'],
                                       self.site.siteinfo['timezone'])

    def _last_match_and_replace(self,
                                txt: str,
                                pat) -> tuple[str, re.Match[str] | None]:
        """Take the rightmost match and replace with marker.

        It does so to prevent spurious earlier matches.

        :param txt: text to search in
        :param pat: compiled pattern to search for
        :return: the text with matches blanked out, and the rightmost
            match (or ``None`` if there was no match)
        """
        all_matches = list(pat.finditer(txt))
        cnt = len(all_matches)

        if not cnt:
            return (txt, None)

        m = all_matches[-1]

        def marker(m: re.Match[str]):
            """Replace exactly the same number of matched characters.

            Same number of chars shall be replaced, in order to be able
            to compare pos for matches reliably (absolute pos of a
            match is not altered by replacement).
            """
            return '@' * (m.end() - m.start())

        # month and day format might be identical (e.g. see bug T71315),
        # avoid to wipe out day, after month is matched.
        # Replace all matches but the last two (i.e. allow to search for
        # dd. mm.)
        if pat != self.patterns.month:
            txt = pat.sub(marker, txt)
        elif self.is_digit_month:
            if cnt > 2:
                txt = pat.sub(marker, txt, cnt - 2)
        else:
            txt = pat.sub(marker, txt)

        return (txt, m)

    @staticmethod
    def _valid_date_dict_positions(dateDict) -> bool:
        """Check consistency of reasonable positions for groups.

        Rejects candidates whose day/month/year fields are too far
        apart, or whose time/tzinfo fields fall inside or before the
        date fields.
        """
        time_pos = dateDict['time']['start']
        tzinfo_pos = dateDict['tzinfo']['start']
        date_pos = sorted(
            (dateDict['day'], dateDict['month'], dateDict['year']),
            key=lambda x: x['start'])
        min_pos, max_pos = date_pos[0]['start'], date_pos[-1]['start']
        max_gap = max(x[1]['start'] - x[0]['end']
                      for x in zip(date_pos, date_pos[1:]))

        # TIMESTAMP_GAP_LIMIT is a module-level constant (not visible in
        # this chunk) bounding the distance between date fields.
        if max_gap > TIMESTAMP_GAP_LIMIT:
            return False
        if tzinfo_pos < min_pos or tzinfo_pos < time_pos:
            return False
        if min_pos < tzinfo_pos < max_pos:
            return False
        return not min_pos < time_pos < max_pos

    def timestripper(self, line: str) -> pywikibot.Timestamp | None:
        """Find timestamp in line and convert it to time zone aware datetime.

        All the following items must be matched, otherwise None is
        returned: -. year, month, hour, time, day, minute, tzinfo

        .. versionchanged:: 7.6
           HTML parts are removed from line

        :return: A timestamp found on the given line
        """
        # Try to maintain gaps that are used in _valid_date_dict_positions()
        def censor_match(match):
            return '_' * (match.end() - match.start())

        # match date fields
        dateDict = {}

        # Analyze comments separately from rest of each line to avoid to
        # skip dates in comments, as the date matched by timestripper is
        # the rightmost one.
        most_recent = []
        for comment in self._comment_pat.finditer(line):
            # Recursion levels can be maximum two. If a comment is found,
            # it will not for sure be found in the next level.
            # Nested comments are excluded by design.
            timestamp = self.timestripper(comment[1])
            most_recent.append(timestamp)

        # Censor comments.
        line = self._comment_pat.sub(censor_match, line)

        # Censor external links.
        line = self._hyperlink_pat.sub(censor_match, line)

        for wikilink in self._wikilink_pat.finditer(line):
            # Recursion levels can be maximum two. If a link is found, it
            # will not for sure be found in the next level.
            # Nested links are excluded by design.
            link, anchor = wikilink['link'], wikilink['anchor']
            timestamp = self.timestripper(link)
            most_recent.append(timestamp)
            if anchor:
                timestamp = self.timestripper(anchor)
                most_recent.append(timestamp)

        # Censor wikilinks.
        line = self._wikilink_pat.sub(censor_match, line)

        # Remove parts that are not supposed to contain the timestamp, in
        # order to reduce false positives.
        line = removeDisabledParts(line)
        line = removeHTMLParts(line)

        line = to_ascii_digits(line)
        for pat in self.patterns:
            line, match_obj = self._last_match_and_replace(line, pat)
            if match_obj:
                for group, value in match_obj.groupdict().items():
                    start, end = (match_obj.start(group),
                                  match_obj.end(group))
                    # The positions are stored for later validation
                    dateDict[group] = {
                        'value': value,
                        'start': start,
                        'end': end
                    }

        # all fields matched -> date valid
        # groups are in a reasonable order.
        if (all(g in dateDict for g in TIMEGROUPS)
                and self._valid_date_dict_positions(dateDict)):
            # remove 'time' key, now split in hour/minute and not needed
            # by datetime.
            del dateDict['time']

            # replace month name in original language with month number
            try:
                value = self.origNames2monthNum[dateDict['month']['value']]
            except KeyError:
                raise KeyError(
                    f"incorrect month name {dateDict['month']['value']!r} "
                    f'in page in site {self.site}'
                )

            dateDict['month']['value'] = value

            # convert to integers and remove the inner dict
            for k, v in dateDict.items():
                if k == 'tzinfo':
                    continue
                try:
                    dateDict[k] = int(v['value'])
                except ValueError:
                    raise ValueError(f"Value: {v['value']} could not be "
                                     f'converted for key: {k}.')

            # find timezone
            dateDict['tzinfo'] = self.tzinfo

            timestamp = pywikibot.Timestamp(**dateDict)
        else:
            timestamp = None

        most_recent.append(timestamp)

        # The rightmost/most recent timestamp found anywhere on the line
        # (including comments and wikilinks) wins.
        try:
            timestamp = max(ts for ts in most_recent if ts is not None)
        except ValueError:
            timestamp = None

        return timestamp
# Deprecated module attribute: 'to_latin_digits' was replaced by
# to_ascii_digits in release 10.3.0.
wrapper = ModuleDeprecationWrapper(__name__)
wrapper.add_deprecated_attr('to_latin_digits', to_ascii_digits,
                            since='10.3.0')