Source code for tools.chars

"""Character based helper functions (not wiki-dependent)."""
#
# (C) Pywikibot team, 2015-2023
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import re
import sys
from contextlib import suppress
from urllib.parse import unquote

from pywikibot.backports import Iterable
from pywikibot.tools._unidata import _category_cf


# Collection of all characters this module treats as invisible.
# At the moment it only holds the characters imported as ``_category_cf``
# (presumably the Unicode "Cf" category -- confirm in
# pywikibot.tools._unidata).
_invisible_chars = _category_cf

# Pre-compiled character class built by joining the members of
# ``_invisible_chars`` (unescaped) between square brackets; it is meant to
# match one invisible character at a time.
INVISIBLE_REGEX = re.compile(f"[{''.join(_invisible_chars)}]")


def contains_invisible(text):
    """Return True if the text contains any of the invisible characters.

    The characters of *text* are checked one by one against the module's
    ``_invisible_chars`` collection; scanning stops at the first hit.

    :param text: Text to inspect
    :return: True if at least one character is invisible, False otherwise
        (including for empty text)
    """
    for character in text:
        if character in _invisible_chars:
            return True
    return False


def replace_invisible(text: str) -> str:
    """Replace invisible characters by '<codepoint>'.

    Every character matched by ``INVISIBLE_REGEX`` is substituted by its
    code point written as lowercase hexadecimal digits between angle
    brackets (code point 0x200e would become ``'<200e>'``). All other
    characters are kept unchanged.

    :param text: String to process
    :return: *text* with each matched character replaced
    """
    def replace(match) -> str:
        # INVISIBLE_REGEX is a single character class, so each match is
        # exactly one character. Since Python 3.3 (PEP 393) a str never
        # stores astral characters as surrogate pairs and sys.maxunicode
        # is always 0x10ffff, hence no narrow-build handling is required.
        return f'<{ord(match.group()):x}>'

    return INVISIBLE_REGEX.sub(replace, text)


def string_to_ascii_html(string: str) -> str:
    """Replace every character outside printable ASCII by an HTML entity.

    Characters whose code point lies between 32 and 126 (inclusive) are
    kept as they are. Any other character, control characters included,
    is written as a decimal numeric character reference
    ``&#<codepoint>;``.

    **Example:**

    >>> string_to_ascii_html('Python')
    'Python'
    >>> string_to_ascii_html("Pywikibot's API")
    "Pywikibot's API"
    >>> string_to_ascii_html('Eetße Joohunndot füür Kreůßtůß')
    'Eet&#223;e Joohunndot f&#252;&#252;r Kre&#367;&#223;t&#367;&#223;'

    :param string: String to update
    :return: The converted string consisting of printable ASCII only
    """
    def as_entity(char: str) -> str:
        # Keep printable ASCII verbatim, escape everything else.
        code = ord(char)
        return char if 32 <= code <= 126 else f'&#{code};'

    return ''.join(map(as_entity, string))


def string2html(string: str, encoding: str) -> str:
    """Convert unicode string to requested HTML encoding.

    Try to encode the string with the given *encoding*. If that works,
    the string is returned unchanged. If encoding fails with a
    :exc:`UnicodeError`, the result of :func:`string_to_ascii_html` is
    returned instead, i.e. characters outside printable ASCII become
    ``&#<codepoint>;`` entities.

    **Example:**

    >>> string2html('Referências', 'utf-8')
    'Referências'
    >>> string2html('Referências', 'ascii')
    'Refer&#234;ncias'
    >>> string2html('脚注', 'euc_jp')
    '脚注'
    >>> string2html('脚注', 'iso-8859-1')
    '&#33050;&#27880;'

    :param string: String to update
    :param encoding: Encoding to use
    :return: *string* itself if it is encodable, otherwise its HTML
        entity form
    """
    try:
        # Only the ability to encode matters; the bytes are discarded.
        string.encode(encoding)
    except UnicodeError:
        return string_to_ascii_html(string)
    return string


def url2string(title: str,
               encodings: str | Iterable[str] = 'utf-8') -> str:
    """Convert URL-encoded text to unicode using several encodings.

    Uses the first encoding that doesn't cause an error. Raises the
    first exception if all encodings fail.

    For a single *encodings* string this function is equivalent to
    :samp:`urllib.parse.unquote(title, encodings, errors='strict')`

    .. versionchanged:: 8.4
       Ignore *LookupError* and try other encodings.

    .. seealso:: :python:`urllib.parse.unquote
       <library/urllib.parse.html#urllib.parse.unquote>`

    **Example:**

    >>> url2string('abc%20def')
    'abc def'
    >>> url2string('/El%20Ni%C3%B1o/')
    '/El Niño/'
    >>> url2string('/El%20Ni%C3%B1o/', 'ascii')
    Traceback (most recent call last):
    ...
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6:...
    >>> url2string('/El%20Ni%C3%B1o/', ['ascii', 'utf-8'])
    '/El Niño/'

    :param title: URL-encoded character data to convert
    :param encodings: Encodings to attempt to use during conversion.
    :raise UnicodeError: Could not convert using any encoding.
    :raise LookupError: unknown encoding
    :raise ValueError: *encodings* is an empty iterable
    """
    if isinstance(encodings, str):
        return unquote(title, encodings, errors='strict')

    first_exception = None
    for enc in encodings:
        try:
            return unquote(title, enc, errors='strict')
        except (UnicodeError, LookupError) as e:
            # Remember only the earliest failure; later ones are dropped.
            if first_exception is None:
                first_exception = e

    if first_exception is None:
        # The loop body never ran: without this guard ``raise None``
        # would fail with an unrelated TypeError.
        raise ValueError('url2string() requires at least one encoding')

    # Couldn't convert, raise the first exception
    raise first_exception