Source code for pywikibot.page._links

"""Objects representing internal or interwiki link in wikitext.

.. note::
   `Link` objects defined here represent a wiki-page's title, while
   :class:`pywikibot.Page` objects represent the page itself, including
   its contents.
"""
#
# (C) Pywikibot team, 2008-2023
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import re
import unicodedata
from html.entities import name2codepoint
from typing import Any

import pywikibot
from pywikibot import textlib
from pywikibot.exceptions import InvalidTitleError, SiteDefinitionError
from pywikibot.site import Namespace
from pywikibot.tools import ComparableMixin, first_upper, is_ip_address


__all__ = (
    'BaseLink',
    'Link',
    'SiteLink',
    'html2unicode',
)














# Utility functions for parsing page titles

# Bound ``sub`` method of the compiled entity pattern; call it as
# ``_ENTITY_SUB(repl, text)``. Every match fills exactly one named group:
#   decimal - digits of a decimal reference such as ``&#38;``
#   hex     - digits of a hexadecimal reference such as ``&#x26;``; the
#             ``x`` marker is accepted in upper or lower case
#   name    - something that might be a named entity such as ``amp``:
#             a letter followed by letters or digits, so that names
#             containing digits (``sup2``, ``frac12``, ``there4``) are
#             found too. The pattern does not check that the name is a
#             known entity; that is left to the replacement callback.
_ENTITY_SUB = re.compile(
    r'&(#(?P<decimal>\d+)'
    r'|#[xX](?P<hex>[0-9a-fA-F]+)'
    r'|(?P<name>[A-Za-z][A-Za-z0-9]*));').sub
# Numeric references in the range 0x80-0x9F are not legal HTML, yet they
# do turn up in real text. Passing such a number straight to chr() would
# not give the intended character, so each one is translated to the code
# point listed here instead (the same characters Windows-1252 assigns to
# these byte values).
# Layout: illegal reference number -> replacement Unicode code point.
_ILLEGAL_HTML_ENTITIES_MAPPING = {
    0x80: 0x20AC,  # €
    0x82: 0x201A,  # ‚
    0x83: 0x0192,  # ƒ
    0x84: 0x201E,  # „
    0x85: 0x2026,  # …
    0x86: 0x2020,  # †
    0x87: 0x2021,  # ‡
    0x88: 0x02C6,  # ˆ
    0x89: 0x2030,  # ‰
    0x8A: 0x0160,  # Š
    0x8B: 0x2039,  # ‹
    0x8C: 0x0152,  # Œ
    0x8E: 0x017D,  # Ž
    0x91: 0x2018,  # ‘
    0x92: 0x2019,  # ’
    0x93: 0x201C,  # “
    0x94: 0x201D,  # ”
    0x95: 0x2022,  # •
    0x96: 0x2013,  # –
    0x97: 0x2014,  # —
    0x98: 0x02DC,  # ˜
    0x99: 0x2122,  # ™
    0x9A: 0x0161,  # š
    0x9B: 0x203A,  # ›
    0x9C: 0x0153,  # œ
    0x9E: 0x017E,  # ž
    0x9F: 0x0178,  # Ÿ
}


def html2unicode(text: str, ignore=None, exceptions=None) -> str:
    """Replace HTML entities with equivalent unicode.

    Every entity found by ``_ENTITY_SUB`` (decimal, hexadecimal or
    named) is replaced by the character it denotes. Numeric values
    listed in ``_ILLEGAL_HTML_ENTITIES_MAPPING`` are translated to the
    mapped code point first. An entity is left unchanged if

    - ``textlib.isDisabled`` reports its position in *text* as disabled,
    - it is a named entity unknown to ``html.entities.name2codepoint``,
    - its code point is 0 or greater than U+10FFFF, i.e. not a value
      ``chr`` can convert,
    - its code point is given in *ignore*, or is one of 129, 141 and 157.

    :param text: text in which the entities are replaced
    :param ignore: HTML entities to ignore, given as code points; each
        value is also translated through
        ``_ILLEGAL_HTML_ENTITIES_MAPPING``, so either the illegal number
        or its replacement code point may be given
    :type ignore: list of int
    :param exceptions: passed to ``textlib.isDisabled`` as its *tags*
        argument
    :return: *text* with all convertible entities replaced
    """
    if ignore is None:
        ignore = []
    # ensuring that illegal &#129; &#141; and &#157, which have no known
    # values, don't get converted to chr(129), chr(141) or chr(157)
    ignore = {_ILLEGAL_HTML_ENTITIES_MAPPING.get(x, x)
              for x in ignore} | {129, 141, 157}

    def handle_entity(match):
        """Return the replacement text for a single entity match."""
        if textlib.isDisabled(match.string, match.start(), tags=exceptions):
            # match.string stores the original text, so it does not
            # need to be passed to handle_entity separately
            return match[0]

        # The pattern fills exactly one of the three named groups.
        if match['decimal']:
            unicode_codepoint = int(match['decimal'])
        elif match['hex']:
            unicode_codepoint = int(match['hex'], 16)
        else:
            # None for names that are not known entities
            unicode_codepoint = name2codepoint.get(match['name'])

        unicode_codepoint = _ILLEGAL_HTML_ENTITIES_MAPPING.get(
            unicode_codepoint, unicode_codepoint)

        # chr() raises ValueError (or OverflowError for very large
        # numbers) above U+10FFFF; such references cannot denote any
        # character and are kept as they are instead of aborting the
        # whole substitution.
        if (unicode_codepoint
                and unicode_codepoint <= 0x10FFFF
                and unicode_codepoint not in ignore):
            return chr(unicode_codepoint)

        # Leave the entity unchanged
        return match[0]

    return _ENTITY_SUB(handle_entity, text)