"""Character based helper functions (not wiki-dependent)."""## (C) Pywikibot team, 2015-2024## Distributed under the terms of the MIT license.#from__future__importannotationsimportrefromcontextlibimportsuppressfromurllib.parseimportunquotefrompywikibot.backportsimportIterablefrompywikibot.tools._unidataimport_category_cf# This is a set of all invisible characters# At the moment we've only added the characters from the Cf category_invisible_chars=_category_cfINVISIBLE_REGEX=re.compile(f"[{''.join(_invisible_chars)}]")
def contains_invisible(text):
    """Return True if the text contains any of the invisible characters.

    :param text: Text whose characters are checked one by one against
        the module-level ``_invisible_chars`` collection.
    :return: True as soon as one character of *text* is found in
        ``_invisible_chars``, False otherwise (including for empty text).
    """
    return any(char in _invisible_chars for char in text)
def replace_invisible(text):
    """Replace invisible characters by '<codepoint>'.

    Every match of the module-level ``INVISIBLE_REGEX`` is substituted
    by its code point written as lowercase hexadecimal digits enclosed
    in angle brackets.

    :param text: Text to process.
    :return: *text* with each matched character replaced.
    """
    def replace(match) -> str:
        # The pattern is a single character class, so each match is
        # exactly one character and ord() is safe here.
        codepoint = ord(match.group())
        return f'<{codepoint:x}>'

    return INVISIBLE_REGEX.sub(replace, text)
def string_to_ascii_html(string: str) -> str:
    """Convert unicode chars of str to HTML entities if chars are not ASCII.

    Characters with a code point from 32 to 126 are kept unchanged.
    Every other character (control characters, DEL and everything
    outside ASCII) is replaced by a decimal ``&#N;`` entity.

    **Example:**

    >>> string_to_ascii_html('Python')
    'Python'
    >>> string_to_ascii_html("Pywikibot's API")
    "Pywikibot's API"
    >>> string_to_ascii_html('Eetße Joohunndot füür Kreůßtůß')
    'Eet&#223;e Joohunndot f&#252;&#252;r Kre&#367;&#223;t&#367;&#223;'

    :param string: String to update
    :return: The converted string
    """
    return ''.join(
        char if 31 < ord(char) < 127 else f'&#{ord(char)};'
        for char in string
    )
def string2html(string: str, encoding: str) -> str:
    """Convert unicode string to requested HTML encoding.

    Attempt to encode the string into the desired format; if that works
    return it unchanged. Otherwise encode the non-ASCII characters into
    HTML &#; entities.

    **Example:**

    >>> string2html('Referências', 'utf-8')
    'Referências'
    >>> string2html('Referências', 'ascii')
    'Refer&#234;ncias'
    >>> string2html('脚注', 'euc_jp')
    '脚注'
    >>> string2html('脚注', 'iso-8859-1')
    '&#33050;&#27880;'

    :param string: String to update
    :param encoding: Encoding to use
    :return: *string* itself if it can be encoded with *encoding*,
        otherwise the result of :func:`string_to_ascii_html`
    """
    # Only UnicodeError is suppressed; the encoded bytes are discarded
    # because the call merely probes whether *string* is representable.
    with suppress(UnicodeError):
        string.encode(encoding)
        return string

    return string_to_ascii_html(string)
def url2string(title: str,
               encodings: str | Iterable[str] = 'utf-8') -> str:
    """Convert URL-encoded text to unicode using several encodings.

    Uses the first encoding that doesn't cause an error. Raises the
    first exception if all encodings fail.

    For a single *encodings* string this function is equivalent to
    :samp:`urllib.parse.unquote(title, encodings, errors='strict')`

    .. versionchanged:: 8.4
       Ignore *LookupError* and try other encodings.

    .. seealso:: :python:`urllib.parse.unquote
       <library/urllib.parse.html#urllib.parse.unquote>`

    **Example:**

    >>> url2string('abc%20def')
    'abc def'
    >>> url2string('/El%20Ni%C3%B1o/')
    '/El Niño/'
    >>> url2string('/El%20Ni%C3%B1o/', 'ascii')
    Traceback (most recent call last):
    ...
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6:...
    >>> url2string('/El%20Ni%C3%B1o/', ['ascii', 'utf-8'])
    '/El Niño/'

    :param title: URL-encoded character data to convert
    :param encodings: Encodings to attempt to use during conversion.
    :raise UnicodeError: Could not convert using any encoding.
    :raise LookupError: unknown encoding
    :raise ValueError: *encodings* is an empty iterable
    """
    if isinstance(encodings, str):
        return unquote(title, encodings, errors='strict')

    first_exception = None
    for enc in encodings:
        try:
            return unquote(title, enc, errors='strict')
        except (UnicodeError, LookupError) as e:
            # Remember only the earliest failure; later ones are dropped.
            if first_exception is None:
                first_exception = e

    if first_exception is None:
        # The loop body never ran: there was no encoding to try at all.
        # Without this guard the code below would execute ``raise None``.
        raise ValueError('encodings must contain at least one encoding')

    # Couldn't convert, raise the first exception
    raise first_exception