# Interpreter version string from platform.python_version(); its leading
# characters are compared against '2', '2.6', '2.7' and '3.' elsewhere in this
# file to select Python 2 or Python 3 specific code.
12 pyversion = platform.python_version()
# True when platform.system() reports 'linux' (compared case-insensitively).
13 islinux = platform.system().lower() ==
'linux'
15 if pyversion[:3]
in [
'2.6',
'2.7']:
16 import urllib
as urllib_request
20 if sys.maxunicode < 0x10000:
26 elif pyversion[:2] ==
'3.':
27 import urllib.request
as urllib_request
32 return [
unichr(int(i.split(
'<')[0][2:], 16))
for i
in args]
36 return [
unichr(int(i[2:7], 16))
for i
in args
if i[2:7]]
# scim-tables release version; interpolated into the SourceForge download URL,
# the local tarball file name and the 'scim-tables-%s/tables/zh/%s' member path.
41 SCIM_TABLES_VER =
'0.5.13'
# scim-pinyin release version; interpolated into the SourceForge download URL,
# the local tarball file name and the 'scim-pinyin-%s/data/phrase_lib.txt'
# member path.
42 SCIM_PINYIN_VER =
'0.5.92'
48 if os.path.isfile(dest):
49 print(
'File %s is up to date.' % dest)
55 os.system(
'wget %s -O %s' % (url, dest))
57 print(
'Downloading from [%s] ...' % url)
58 urllib_request.urlretrieve(url, dest)
59 print(
'Download complete.\n')
64 name = member.rsplit(
'/', 1)[-1]
65 print(
'Extracting %s ...' % name)
67 shutil.move(member, name)
69 shutil.rmtree(member.split(
'/', 1)[0])
70 if pyversion[:1]
in [
'2']:
71 fc =
open(name,
'rb', encoding,
'ignore')
73 fc =
open(name,
'r', encoding=encoding, errors=
'ignore')
def unzip(path, member, encoding='U8'):
    """Open the zip archive at ``path`` and hand ``member`` to ``uncompress``.

    Args:
        path: Location of the zip archive, passed to ``zipfile.ZipFile``.
        member: Name of the archive member, forwarded to ``uncompress``.
        encoding: Encoding name forwarded to ``uncompress`` (default 'U8').

    Returns:
        Whatever ``uncompress`` returns for the opened archive.
    """
    # Defined with ``def`` rather than as a lambda bound to a name (PEP 8,
    # E731) so tracebacks and repr() show "unzip" instead of "<lambda>".
    return uncompress(zipfile.ZipFile(path), member, encoding)
def untargz(path, member, encoding='U8'):
    """Open the gzip-compressed tarball at ``path`` and hand ``member`` to ``uncompress``.

    Args:
        path: Location of the tarball, opened with ``tarfile.open(path, 'r:gz')``.
        member: Name of the archive member, forwarded to ``uncompress``.
        encoding: Encoding name forwarded to ``uncompress`` (default 'U8').

    Returns:
        Whatever ``uncompress`` returns for the opened archive.
    """
    # Defined with ``def`` rather than as a lambda bound to a name (PEP 8,
    # E731) so tracebacks and repr() show "untargz" instead of "<lambda>".
    return uncompress(tarfile.open(path, 'r:gz'), member, encoding)
84 if beginmark
and endmark:
90 if beginmark
and line.startswith(beginmark):
93 elif endmark
and line.startswith(endmark):
95 if start
and not line.startswith(
'#'):
99 elif len(elems[0]) > 1
and len(elems[pos]) > 1:
100 mlist.add(elems[pos])
105 """ Read file from scim-tables and parse it. """
106 global SCIM_TABLES_VER
107 src =
'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
109 return parserCore(fp, 1,
'BEGIN_TABLE',
'END_TABLE')
117 """ Read phrase_lib.txt and parse it. """
118 global SCIM_PINYIN_VER
119 src =
'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
125 """ Read tsi.src and parse it. """
126 src =
'libtabe/tsi-src/tsi.src'
127 fp =
untargz(path, src,
'big5hkscs')
132 """ Read Unihan_Variants.txt and parse it. """
133 fp =
unzip(path,
'Unihan_Variants.txt',
'U8')
137 if line.startswith(
'#'):
145 if type ==
'kTraditionalVariant':
146 s2t[elems[0]] = elems[1:]
147 elif type ==
'kSimplifiedVariant':
148 t2s[elems[0]] = elems[1:]
154 """ Apply exclude rules from path to mlist. """
155 if pyversion[:1]
in [
'2']:
156 excludes =
open(path,
'rb',
'U8').read().split()
158 excludes =
open(path,
'r', encoding=
'U8').read().split()
159 excludes = [word.split(
'#')[0].strip()
for word
in excludes]
160 excludes =
'|'.join(excludes)
161 excptn = re.compile(
'.*(?:%s).*' % excludes)
162 diff = [mword
for mword
in mlist
if excptn.search(mword)]
163 mlist.difference_update(diff)
168 fp =
open(path,
'r', encoding=
'U8')
170 elems = line.split(
'#')[0].split(
'|')
173 yield elems[0], elems[1:]
178 if pyversion[:1]
in [
'2']:
179 for (f, t)
in src_table.iteritems():
180 for i
in range(1, len(t)):
183 for (f, t)
in src_table.items():
184 for i
in range(1, len(t)):
190 fp =
open(path,
'r', encoding=
'U8')
193 elems = line.split(
'=>')
194 f = t = elems[0].strip()
197 f = f.strip(
'"').strip(
"'")
198 t = t.strip(
'"').strip(
"'")
206 texcptn = re.compile(
'^(?:%s)$' %
'|'.join(texc))
207 if pyversion[:1]
in [
'2']:
208 for (tmp_f, tmp_t)
in table.copy().iteritems():
209 if texcptn.match(tmp_t):
212 for (tmp_f, tmp_t)
in table.copy().items():
213 if texcptn.match(tmp_t):
219 fp =
open(path,
'r', encoding=
'U8')
222 line = line.rstrip(
'\r\n')
224 line = line.split(
'#')[0].rstrip()
225 elems = line.split(
'\t')
227 ret[elems[0]] = elems[1]
232 return sorted(src_table.items(), key=
lambda m: (m[pos], m[1 - pos]))
238 for j
in range(len(text) - i, 0, -1):
240 t = conv_table.get(f)
242 text = text[:i] + t + text[i:][j:]
250 fp =
open(path,
'r', encoding=
'U8')
251 reconv_table = reconv_table.copy()
253 wordlist = [line.split(
'#')[0].strip()
for line
in fp]
254 wordlist =
list(set(wordlist))
255 wordlist.sort(key=
lambda w: (len(w), w), reverse=
True)
257 word = wordlist.pop()
261 reconv_table[word] = out_table[word] = word
262 reconv_table[new_word] = out_table[new_word] = word
268 wordlist =
list(src_wordlist)
269 wordlist.sort(key=
lambda w: (len(w), w), reverse=
True)
271 word_reconv_table = {}
272 conv_table = char_conv_table.copy()
273 reconv_table = char_reconv_table.copy()
274 tomanyptn = re.compile(
'(?:%s)' %
'|'.join(src_tomany))
276 conv_table.update(word_conv_table)
277 reconv_table.update(word_reconv_table)
278 word = wordlist.pop()
279 new_word_len = word_len = len(word)
280 while new_word_len == word_len:
281 test_word =
translate(word, reconv_table)
283 if not reconv_table.get(new_word)
and \
284 (test_word != word
or
285 (tomanyptn.search(word)
and
286 word !=
translate(new_word, reconv_table))):
287 word_conv_table[word] = new_word
288 word_reconv_table[new_word] = word
290 word = wordlist.pop()
293 new_word_len = len(word)
294 return word_reconv_table
298 lines = [
'\'%s\' => \'%s\',' % (f, t)
for (f, t)
in table
if f
and t]
299 return '\n'.join(lines)
304 url =
'https://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
305 han_dest =
'Unihan-%s.zip' % UNIHAN_VER
308 sfurlbase =
'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR
311 url = sfurlbase +
'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
312 tbe_dest =
'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
316 url = sfurlbase +
'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
317 pyn_dest =
'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
321 url = sfurlbase +
'libtabe/libtabe-%s.tgz' % LIBTABE_VER
322 lbt_dest =
'libtabe-%s.tgz' % LIBTABE_VER
330 s2t_1tomany.update((t[0], [f])
for (f, t)
in charManualTable(
'symme_supp.manual'))
333 if pyversion[:1]
in [
'2']:
334 t2s_1to1 = dict([(f, t[0])
for (f, t)
in t2s_1tomany.iteritems()])
335 s2t_1to1 = dict([(f, t[0])
for (f, t)
in s2t_1tomany.iteritems()])
337 t2s_1to1 = dict([(f, t[0])
for (f, t)
in t2s_1tomany.items()])
338 s2t_1to1 = dict([(f, t[0])
for (f, t)
in s2t_1tomany.items()])
344 t2s_1to1 =
removeRules(
'trad2simp_noconvert.manual', t2s_1to1)
345 s2t_1to1 =
removeRules(
'simp2trad_noconvert.manual', s2t_1to1)
348 t2s_1to1_supp = t2s_1to1.copy()
349 s2t_1to1_supp = s2t_1to1.copy()
350 t2s_1to1_supp.update(
customRules(
'trad2simp_supp_set.manual'))
351 s2t_1to1_supp.update(
customRules(
'simp2trad_supp_set.manual'))
355 s2t_1to1_supp, t2s_1to1_supp)
356 t2s_word2word_manual.update(
customRules(
'toSimp.manual'))
358 t2s_1to1_supp, s2t_1to1_supp)
359 s2t_word2word_manual.update(
customRules(
'toTrad.manual'))
371 s_wordlist =
applyExcludes(s_wordlist,
'simpphrases_exclude.manual')
372 t_wordlist =
applyExcludes(t_wordlist,
'tradphrases_exclude.manual')
374 s2t_supp = s2t_1to1_supp.copy()
375 s2t_supp.update(s2t_word2word_manual)
376 t2s_supp = t2s_1to1_supp.copy()
377 t2s_supp.update(t2s_word2word_manual)
381 s2t_1to1_supp, t2s_supp)
382 t2s_word2word.update(t2s_word2word_manual)
384 t2s_1to1_supp, s2t_supp)
385 s2t_word2word.update(s2t_word2word_manual)
389 if pyversion[:1]
in [
'2']:
390 t2s_1to1 = dict([(f, t)
for (f, t)
in t2s_1to1.iteritems()
if f != t])
392 t2s_1to1 = dict([(f, t)
for (f, t)
in t2s_1to1.items()
if f != t])
395 if pyversion[:1]
in [
'2']:
396 s2t_1to1 = dict([(f, t)
for (f, t)
in s2t_1to1.iteritems()
if f != t])
398 s2t_1to1 = dict([(f, t)
for (f, t)
in s2t_1to1.items()
if f != t])
410 * Simplified / Traditional Chinese conversion tables
412 * Automatically generated using code and data in maintenance/language/zhtable/
413 * Do not modify directly!
418 namespace MediaWiki\Languages\Data;
421 public static $zh2Hant = [\n'''
423 +
'\n];\n\npublic static $zh2Hans = [\n' \
425 +
'\n];\n\npublic static $zh2TW = [\n' \
427 +
'\n];\n\npublic static $zh2HK = [\n' \
429 +
'\n];\n\npublic static $zh2CN = [\n' \
433 if pyversion[:1]
in [
'2']:
434 f =
open(os.path.join(
'..',
'..',
'..',
'languages',
'data',
'ZhConversion.php'),
'wb', encoding=
'utf8')
436 f =
open(os.path.join(
'..',
'..',
'..',
'languages',
'data',
'ZhConversion.php'),
'w', buffering=4096, encoding=
'utf8')
437 print (
'Writing ZhConversion.php ... ')
442 print (
'Deleting temporary files ... ')
443 os.remove(
'EZ-Big.txt.in')
444 os.remove(
'phrase_lib.txt')
446 os.remove(
'Unihan_Variants.txt')
447 os.remove(
'Wubi.txt.in')
448 os.remove(
'Ziranma.txt.in')
451 if __name__ ==
'__main__':