# Source code for pywikibot.page._links

"""Objects representing internal or interwiki link in wikitext.

.. note::
   `Link` objects defined here represent a wiki-page's title, while
   :class:`pywikibot.Page` objects represent the page itself, including
   its contents.
"""
#
# (C) Pywikibot team, 2008-2022
#
# Distributed under the terms of the MIT license.
#
import re
import unicodedata
from html.entities import name2codepoint

import pywikibot
from pywikibot import textlib
from pywikibot.exceptions import InvalidTitleError, SiteDefinitionError
from pywikibot.site import Namespace
from pywikibot.tools import ComparableMixin, first_upper, is_ip_address


# Public names of this module, i.e. what a star-import of it exports.
__all__ = (
    'BaseLink',
    'Link',
    'SiteLink',
    'html2unicode',
)











# Utility functions for parsing page titles

# Matches every numeric character reference, decimal (``&#38;``) or
# hexadecimal (``&#x26;``), as well as anything shaped like a named
# entity (``&amp;``); whether such a name really exists is not checked
# here. Only the bound ``sub`` method of the compiled pattern is kept.
_ENTITY_SUB = re.compile(
    r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));'
).sub
# Numeric entities in the range 128-159 are not legal HTML, yet they do
# turn up in real text. Converting them with chr() directly is
# unsuitable, so each one listed here maps to the code point of the
# character shown in its comment (keys and values written in hex).
_ILLEGAL_HTML_ENTITIES_MAPPING = {
    0x80: 0x20AC,  # 128 -> €
    0x82: 0x201A,  # 130 -> ‚
    0x83: 0x0192,  # 131 -> ƒ
    0x84: 0x201E,  # 132 -> „
    0x85: 0x2026,  # 133 -> …
    0x86: 0x2020,  # 134 -> †
    0x87: 0x2021,  # 135 -> ‡
    0x88: 0x02C6,  # 136 -> ˆ
    0x89: 0x2030,  # 137 -> ‰
    0x8A: 0x0160,  # 138 -> Š
    0x8B: 0x2039,  # 139 -> ‹
    0x8C: 0x0152,  # 140 -> Œ
    0x8E: 0x017D,  # 142 -> Ž
    0x91: 0x2018,  # 145 -> ‘
    0x92: 0x2019,  # 146 -> ’
    0x93: 0x201C,  # 147 -> “
    0x94: 0x201D,  # 148 -> ”
    0x95: 0x2022,  # 149 -> •
    0x96: 0x2013,  # 150 -> –
    0x97: 0x2014,  # 151 -> —
    0x98: 0x02DC,  # 152 -> ˜
    0x99: 0x2122,  # 153 -> ™
    0x9A: 0x0161,  # 154 -> š
    0x9B: 0x203A,  # 155 -> ›
    0x9C: 0x0153,  # 156 -> œ
    0x9E: 0x017E,  # 158 -> ž
    0x9F: 0x0178,  # 159 -> Ÿ
}


def html2unicode(text: str, ignore=None, exceptions=None) -> str:
    """
    Replace HTML entities with equivalent unicode.

    Decimal (``&#8364;``), hexadecimal (``&#x20ac;``) and named
    (``&euro;``) entities are handled. An entity is left unchanged if

    - :func:`textlib.isDisabled` reports its position as disabled,
    - it is a named entity unknown to :mod:`html.entities`,
    - its code point is 0 or is listed in *ignore*, or
    - its code point is rejected by :func:`chr`, e.g. because it lies
      beyond U+10FFFF.

    :param text: the text in which entities are replaced
    :param ignore: HTML entities to ignore, given as code points; a
        value that is one of the illegal codes 128-159 is translated to
        its replacement code point first. The code points 129, 141 and
        157 are always ignored.
    :type ignore: list of int
    :param exceptions: passed as *tags* to :func:`textlib.isDisabled`
    :return: *text* with all convertible entities replaced
    """
    if ignore is None:
        ignore = []
    # The illegal &#129; &#141; and &#157; have no known replacement, so
    # they must never be turned into chr(129), chr(141) or chr(157).
    ignore = {_ILLEGAL_HTML_ENTITIES_MAPPING.get(x, x)
              for x in ignore} | {129, 141, 157}

    def handle_entity(match):
        """Return the replacement for one entity match."""
        # match.string holds the original text, so it does not have to
        # be passed in separately.
        if textlib.isDisabled(match.string, match.start(), tags=exceptions):
            return match.group(0)

        if match.group('decimal'):
            unicode_codepoint = int(match.group('decimal'))
        elif match.group('hex'):
            unicode_codepoint = int(match.group('hex'), 16)
        else:
            # The pattern guarantees that exactly one of the three groups
            # matched, so this is the named form; unknown names yield a
            # falsy value and are left alone below.
            unicode_codepoint = name2codepoint.get(
                match.group('name'), False)

        unicode_codepoint = _ILLEGAL_HTML_ENTITIES_MAPPING.get(
            unicode_codepoint, unicode_codepoint)

        if unicode_codepoint and unicode_codepoint not in ignore:
            try:
                return chr(unicode_codepoint)
            except (ValueError, OverflowError):
                # Numeric entity outside the Unicode range (ValueError)
                # or too large for chr() at all (OverflowError): keep
                # the text as is instead of aborting the substitution.
                pass

        # Leave the entity unchanged
        return match.group(0)

    return _ENTITY_SUB(handle_entity, text)