Source code for tools.chars
"""Character based helper functions (not wiki-dependent)."""
#
# (C) Pywikibot team, 2015-2022
#
# Distributed under the terms of the MIT license.
#
import re
import sys
from contextlib import suppress
from typing import Union
from urllib.parse import unquote_to_bytes
from pywikibot.backports import List, Tuple
from pywikibot.tools._unidata import _category_cf
# This is a set of all invisible characters
# At the moment we've only added the characters from the Cf category
_invisible_chars = _category_cf
INVISIBLE_REGEX = re.compile('[{}]'.format(''.join(_invisible_chars)))
[docs]def contains_invisible(text):
"""Return True if the text contain any of the invisible characters."""
return any(char in _invisible_chars for char in text)
[docs]def replace_invisible(text):
"""Replace invisible characters by '<codepoint>'."""
def replace(match) -> str:
match = match.group()
if sys.maxunicode < 0x10ffff and len(match) == 2:
mask = (1 << 10) - 1
assert ord(match[0]) & ~mask == 0xd800
assert ord(match[1]) & ~mask == 0xdc00
codepoint = (ord(match[0]) & mask) << 10 | (ord(match[1]) & mask)
else:
codepoint = ord(match)
return f'<{codepoint:x}>'
return INVISIBLE_REGEX.sub(replace, text)
[docs]def string_to_ascii_html(string: str) -> str:
"""Convert unicode chars of str to HTML entities if chars are not ASCII.
:param string: String to update
"""
html = []
for c in string:
cord = ord(c)
if 31 < cord < 127:
html.append(c)
else:
html.append(f'&#{cord};')
return ''.join(html)
[docs]def string2html(string: str, encoding: str) -> str:
"""Convert unicode string to requested HTML encoding.
Attempt to encode the string into the desired format; if that work
return it unchanged. Otherwise encode the non-ASCII characters into
HTML &#; entities.
:param string: String to update
:param encoding: Encoding to use
"""
with suppress(UnicodeError):
string.encode(encoding)
return string
return string_to_ascii_html(string)
[docs]def url2string(
title: str,
encodings: Union[str, List[str], Tuple[str, ...]] = 'utf-8'
) -> str:
"""Convert URL-encoded text to unicode using several encoding.
Uses the first encoding that doesn't cause an error.
:param title: URL-encoded character data to convert
:param encodings: Encodings to attempt to use during conversion.
:raise UnicodeError: Could not convert using any encoding.
"""
if isinstance(encodings, str):
encodings = [encodings]
first_exception = None
for enc in encodings:
try:
t = title.encode(enc)
t = unquote_to_bytes(t)
except UnicodeError as e:
if not first_exception:
first_exception = e
else:
return t.decode(enc)
# Couldn't convert, raise the first exception
raise first_exception