12 pyversion = platform.python_version()
13 islinux = platform.system().lower() ==
'linux'
15 if pyversion[:3]
in [
'2.6',
'2.7']:
16 import urllib
as urllib_request
20 if sys.maxunicode < 0x10000:
26 elif pyversion[:2] ==
'3.':
27 import urllib.request
as urllib_request
32 return [
unichr(int(i.split(
'<')[0][2:], 16))
for i
in args]
36 return [
unichr(int(i[2:7], 16))
for i
in args
if i[2:7]]
41 SCIM_TABLES_VER =
'0.5.13'
42 SCIM_PINYIN_VER =
'0.5.92'
48 if os.path.isfile(dest):
49 print(
'File %s is up to date.' % dest)
55 os.system(
'wget %s -O %s' % (url, dest))
57 print(
'Downloading from [%s] ...' % url)
58 urllib_request.urlretrieve(url, dest)
59 print(
'Download complete.\n')
64 name = member.rsplit(
'/', 1)[-1]
65 print(
'Extracting %s ...' % name)
67 shutil.move(member, name)
69 shutil.rmtree(member.split(
'/', 1)[0])
70 if pyversion[:1]
in [
'2']:
71 fc =
open(name,
'rb', encoding,
'ignore')
73 fc =
open(name,
'r', encoding=encoding, errors='ignore')
76 unzip =
lambda path, member, encoding =
'U8': \
77 uncompress(zipfile.ZipFile(path), member, encoding)
79 untargz =
lambda path, member, encoding =
'U8': \
80 uncompress(tarfile.open(path,
'r:gz'), member, encoding)
84 if beginmark
and endmark:
90 if beginmark
and line.startswith(beginmark):
93 elif endmark
and line.startswith(endmark):
95 if start
and not line.startswith(
'#'):
99 elif len(elems[0]) > 1
and len(elems[pos]) > 1:
100 mlist.add(elems[pos])
105 """ Read file from scim-tables and parse it. """
106 global SCIM_TABLES_VER
107 src =
'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
109 return parserCore(fp, 1,
'BEGIN_TABLE',
'END_TABLE')
117 """ Read phrase_lib.txt and parse it. """
118 global SCIM_PINYIN_VER
119 src =
'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
125 """ Read tsi.src and parse it. """
126 src =
'libtabe/tsi-src/tsi.src'
127 fp =
untargz(path, src,
'big5hkscs')
132 """ Read Unihan_Variants.txt and parse it. """
133 fp =
unzip(path,
'Unihan_Variants.txt',
'U8')
137 if line.startswith(
'#'):
145 if type ==
'kTraditionalVariant':
146 s2t[elems[0]] = elems[1:]
147 elif type ==
'kSimplifiedVariant':
148 t2s[elems[0]] = elems[1:]
154 """ Apply exclude rules from path to mlist. """
155 if pyversion[:1]
in [
'2']:
156 excludes =
open(path,
'rb',
'U8').read().split()
158 excludes =
open(path,
'r', encoding='U8').read().split()
159 excludes = [word.split(
'#')[0].strip()
for word
in excludes]
160 excludes =
'|'.join(excludes)
161 excptn = re.compile(
'.*(?:%s).*' % excludes)
162 diff = [mword
for mword
in mlist
if excptn.search(mword)]
163 mlist.difference_update(diff)
168 fp =
open(path,
'r', encoding='U8')
170 elems = line.split(
'#')[0].split(
'|')
173 yield elems[0], elems[1:]
178 if pyversion[:1]
in [
'2']:
179 for (f, t)
in src_table.iteritems():
180 for i
in range(1, len(t)):
183 for (f, t)
in src_table.items():
184 for i
in range(1, len(t)):
190 fp =
open(path,
'r', encoding='U8')
193 elems = line.split(
'=>')
194 f = t = elems[0].strip()
197 f = f.strip(
'"').strip(
"'")
198 t = t.strip(
'"').strip(
"'")
206 texcptn = re.compile(
'^(?:%s)$' %
'|'.join(texc))
207 if pyversion[:1]
in [
'2']:
208 for (tmp_f, tmp_t)
in table.copy().iteritems():
209 if texcptn.match(tmp_t):
212 for (tmp_f, tmp_t)
in table.copy().items():
213 if texcptn.match(tmp_t):
219 fp =
open(path,
'r', encoding='U8')
222 line = line.rstrip(
'\r\n')
224 line = line.split(
'#')[0].rstrip()
225 elems = line.split(
'\t')
227 ret[elems[0]] = elems[1]
232 return sorted(src_table.items(), key=
lambda m: (m[pos], m[1 - pos]))
238 for j
in range(len(text) - i, 0, -1):
240 t = conv_table.get(f)
242 text = text[:i] + t + text[i:][j:]
250 fp =
open(path,
'r', encoding='U8')
251 reconv_table = reconv_table.copy()
253 wordlist = [line.split(
'#')[0].strip()
for line
in fp]
255 wordlist.sort(key=
lambda w: (len(w), w), reverse=
True)
257 word = wordlist.pop()
261 reconv_table[word] = out_table[word] = word
262 reconv_table[new_word] = out_table[new_word] = word
268 wordlist =
list(src_wordlist)
269 wordlist.sort(key=
lambda w: (len(w), w), reverse=
True)
271 word_reconv_table = {}
272 conv_table = char_conv_table.copy()
273 reconv_table = char_reconv_table.copy()
274 tomanyptn = re.compile(
'(?:%s)' %
'|'.join(src_tomany))
276 conv_table.update(word_conv_table)
277 reconv_table.update(word_reconv_table)
278 word = wordlist.pop()
279 new_word_len = word_len = len(word)
280 while new_word_len == word_len:
281 test_word =
translate(word, reconv_table)
283 if not reconv_table.get(new_word)
and \
284 (test_word != word
or
285 (tomanyptn.search(word)
and
286 word !=
translate(new_word, reconv_table))):
287 word_conv_table[word] = new_word
288 word_reconv_table[new_word] = word
290 word = wordlist.pop()
293 new_word_len = len(word)
294 return word_reconv_table
298 lines = [
'\'%s\' => \'%s\',' % (f, t)
for (f, t)
in table
if f
and t]
299 return '\n'.join(lines)
304 url =
'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
305 han_dest =
'Unihan-%s.zip' % UNIHAN_VER
308 sfurlbase =
'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR
311 url = sfurlbase +
'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
312 tbe_dest =
'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
316 url = sfurlbase +
'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
317 pyn_dest =
'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
321 url = sfurlbase +
'libtabe/libtabe-%s.tgz' % LIBTABE_VER
322 lbt_dest =
'libtabe-%s.tgz' % LIBTABE_VER
330 s2t_1tomany.update((t[0], [f])
for (f, t)
in charManualTable(
'symme_supp.manual'))
333 if pyversion[:1]
in [
'2']:
334 t2s_1to1 = dict([(f, t[0])
for (f, t)
in t2s_1tomany.iteritems()])
335 s2t_1to1 = dict([(f, t[0])
for (f, t)
in s2t_1tomany.iteritems()])
337 t2s_1to1 = dict([(f, t[0])
for (f, t)
in t2s_1tomany.items()])
338 s2t_1to1 = dict([(f, t[0])
for (f, t)
in s2t_1tomany.items()])
344 t2s_1to1 =
removeRules(
'trad2simp_noconvert.manual', t2s_1to1)
345 s2t_1to1 =
removeRules(
'simp2trad_noconvert.manual', s2t_1to1)
348 t2s_1to1_supp = t2s_1to1.copy()
349 s2t_1to1_supp = s2t_1to1.copy()
350 t2s_1to1_supp.update(
customRules(
'trad2simp_supp_set.manual'))
351 s2t_1to1_supp.update(
customRules(
'simp2trad_supp_set.manual'))
355 s2t_1to1_supp, t2s_1to1_supp)
356 t2s_word2word_manual.update(
customRules(
'toSimp.manual'))
358 t2s_1to1_supp, s2t_1to1_supp)
359 s2t_word2word_manual.update(
customRules(
'toTrad.manual'))
371 s_wordlist =
applyExcludes(s_wordlist,
'simpphrases_exclude.manual')
372 t_wordlist =
applyExcludes(t_wordlist,
'tradphrases_exclude.manual')
374 s2t_supp = s2t_1to1_supp.copy()
375 s2t_supp.update(s2t_word2word_manual)
376 t2s_supp = t2s_1to1_supp.copy()
377 t2s_supp.update(t2s_word2word_manual)
381 s2t_1to1_supp, t2s_supp)
382 t2s_word2word.update(t2s_word2word_manual)
384 t2s_1to1_supp, s2t_supp)
385 s2t_word2word.update(s2t_word2word_manual)
389 if pyversion[:1]
in [
'2']:
390 t2s_1to1 = dict([(f, t)
for (f, t)
in t2s_1to1.iteritems()
if f != t])
392 t2s_1to1 = dict([(f, t)
for (f, t)
in t2s_1to1.items()
if f != t])
395 if pyversion[:1]
in [
'2']:
396 s2t_1to1 = dict([(f, t)
for (f, t)
in s2t_1to1.iteritems()
if f != t])
398 s2t_1to1 = dict([(f, t)
for (f, t)
in s2t_1to1.items()
if f != t])
410 * Simplified / Traditional Chinese conversion tables
412 * Automatically generated using code and data in maintenance/language/zhtable/
413 * Do not modify directly!
418 namespace MediaWiki\Languages\Data;
421 public static $zh2Hant = [\n'''
423 +
'\n];\n\npublic static $zh2Hans = [\n' \
425 +
'\n];\n\npublic static $zh2TW = [\n' \
427 +
'\n];\n\npublic static $zh2HK = [\n' \
429 +
'\n];\n\npublic static $zh2CN = [\n' \
433 if pyversion[:1]
in [
'2']:
434 f =
open(os.path.join(
'..',
'..',
'..',
'languages',
'data',
'ZhConversion.php'),
'wb', encoding=
'utf8')
436 f =
open(os.path.join(
'..',
'..',
'..',
'languages',
'data',
'ZhConversion.php'),
'w', buffering=4096, encoding=
'utf8')
437 print (
'Writing ZhConversion.php ... ')
442 print (
'Deleting temporary files ... ')
443 os.remove(
'EZ-Big.txt.in')
444 os.remove(
'phrase_lib.txt')
446 os.remove(
'Unihan_Variants.txt')
447 os.remove(
'Wubi.txt.in')
448 os.remove(
'Ziranma.txt.in')
451 if __name__ ==
'__main__':
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user. For example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred-update object and putting those objects on a global list
it's the revision text itself. In either case, if gzip is set