import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    # codecs.open accepts encoding/errors arguments, unlike the Python 2 builtin open().
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow build: return a UTF-16 surrogate pair for astral code points.
        # 0xD7C0 == 0xD800 - (0x10000 >> 10), so no explicit offset subtraction is needed.
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            return _unichr( 0xD7C0 + ( i >> 10 ) ) + \
                   _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr

def unichr2( *args ):
    """ Convert Unihan-style fields such as 'U+4E2D<kLau' into characters. """
    return [unichr( int( i.split( '<' )[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    """ Convert plain 'U+xxxx' fields into characters, skipping empty ones. """
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
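
# Illustrative examples of the field syntax these helpers handle:
#   unichr2( 'U+4E2D<kLau' )  ->  ['中']
#   unichr3( 'U+4E2D' )       ->  ['中']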

SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'

def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    if islinux:
        # wget is preferred on Linux because it shows download progress.
        os.system( 'wget %s -O %s' % ( url, dest ) )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )

def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    if pyversion[:1] in ['2']:
        # open is codecs.open on Python 2 (see above), so encoding and errors are positional here.
        fc = open( name, 'rb', encoding, 'ignore' )
    else:
        fc = open( name, 'r', encoding = encoding, errors = 'ignore' )
    return fc

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
          uncompress( tf.open( path, 'r:gz' ), member, encoding )
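
# Usage sketch (the archive and member names below also appear later in this script):
#   fp = unzip( 'Unihan.zip', 'Unihan_Variants.txt' )
#   fp = untargz( 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER,
#                 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER )
# Both return a decoded, text-mode file object for the extracted member.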

def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1 and \
                 len( elems[pos] ) > 1:  # keep multi-character entries (words) only
                mlist.add( elems[pos] )
    return mlist
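
# parserCore collects one whitespace-separated column (index pos) from each data
# line into a set. When beginmark/endmark are given (e.g. 'BEGIN_TABLE' /
# 'END_TABLE' in the scim table files), only the lines between the two marks count.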
95 """ Read file from scim-tables and parse it. """
96 global SCIM_TABLES_VER
97 src =
'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
99 return parserCore( fp, 1,
'BEGIN_TABLE',
'END_TABLE' )
106 """ Read phrase_lib.txt and parse it. """
107 global SCIM_PINYIN_VER
108 src =
'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
109 dst =
'phrase_lib.txt'
110 fp =
untargz( path, src,
'U8' )
114 """ Read tsi.src and parse it. """
115 src =
'libtabe/tsi-src/tsi.src'
117 fp =
untargz( path, src,
'big5hkscs' )
121 """ Read Unihan_Variants.txt and parse it. """
122 fp =
unzip( path,
'Unihan_Variants.txt',
'U8' )
126 if line.startswith(
'#' ):
132 type = elems.pop( 1 )
134 if type ==
'kTraditionalVariant':
135 s2t[elems[0]] = elems[1:]
136 elif type ==
'kSimplifiedVariant':
137 t2s[elems[0]] = elems[1:]
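
# Unihan_Variants.txt data lines look roughly like (tab-separated):
#   U+4E7E	kSimplifiedVariant	U+4E7E U+5E72
# After elems.pop( 1 ) removes the property name, unichr2() turns the remaining
# 'U+xxxx' fields into characters: the key character first, its variants after it.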
142 """ Apply exclude rules from path to mlist. """
143 if pyversion[:1]
in [
'2']:
144 excludes =
open( path,
'rb',
'U8' ).read().split()
146 excludes =
open( path,
'r', encoding =
'U8' ).read().split()
147 excludes = [word.split(
'#' )[0].strip()
for word
in excludes]
148 excludes =
'|'.join( excludes )
149 excptn = re.compile(
'.*(?:%s).*' % excludes )
150 diff = [mword
for mword
in mlist
if excptn.search( mword )]
151 mlist.difference_update( diff )
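
# The exclude file is read as whitespace-separated entries; they are joined into a
# single alternation regex, and every word containing any excluded fragment is
# dropped from the set.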

def charManualTable( path ):
    fp = open( path, 'r', encoding = 'U8' )
    ret = {}
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1:]
    return ret

def toManyRules( src_table ):
    """ Collect all secondary (non-first) mapping targets from a one-to-many table. """
    tomany = set()
    if pyversion[:1] in ['2']:
        for ( f, t ) in src_table.iteritems():
            for i in range( 1, len( t ) ):
                tomany.add( t[i] )
    else:
        for ( f, t ) in src_table.items():
            for i in range( 1, len( t ) ):
                tomany.add( t[i] )
    return tomany

def removeRules( path, table ):
    """ Remove the rules listed in path (lines like 'key => value') from table. """
    fp = open( path, 'r', encoding = 'U8' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip( '"' ).strip( "'" )
        t = t.strip( '"' ).strip( "'" )
        if f:
            table.pop( f, None )
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    if pyversion[:1] in ['2']:
        for ( tmp_f, tmp_t ) in table.copy().iteritems():
            if texcptn.match( tmp_t ):
                table.pop( tmp_f )
    else:
        for ( tmp_f, tmp_t ) in table.copy().items():
            if texcptn.match( tmp_t ):
                table.pop( tmp_f )
    return table
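
# Rule file format, as inferred from the parsing above:
#   'X' => 'Y'   drops the rule for key X and also excludes Y as a conversion target
#   'X'          drops the rule for key X (X itself is treated as an excluded target)
# Any remaining table entry whose value exactly matches an excluded target is removed.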

def customRules( path ):
    """ Read a manual mapping table: one 'source target' pair per line, '#' starts a comment. """
    fp = open( path, 'r', encoding = 'U8' )
    ret = dict()
    for line in fp:
        elems = line.split( '#' )[0].split()
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret

def dictToSortedList( src_table, pos ):
    return sorted( src_table.items(), key = lambda m: m[pos] )

def translate( text, conv_table ):
    """ Convert text using conv_table, always taking the longest match at each position. """
    i = 0
    while i < len( text ):
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len( t ) - 1
                break
        i += 1
    return text
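
# Worked example with a hypothetical table (not part of the generated data):
#   translate( 'abc', { 'ab': 'X', 'c': 'Y' } )  ->  'XY'
# The longest key matching at the current position wins, and scanning resumes
# immediately after the inserted replacement.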

def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'r', encoding = 'U8' )
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
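
# Each manually listed phrase is pinned in the reverse table: the phrase itself
# (when the character-level round trip would otherwise mangle it) and its converted
# form both map back to the original phrase.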

def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
               and ( test_word != word \
               or ( tomanyptn.search( word ) \
               and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            if not wordlist:
                break
            word = wordlist.pop()
            new_word_len = len( word )
    return word_reconv_table
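
# defaultWordsTable derives extra word-level rules from the default word lists:
# roughly, a word gets its own rule only when character-level conversion looks
# unsafe for it, e.g. the reverse table already changes the word, or it contains a
# character with several candidate targets (src_tomany) and fails to round-trip.
# Words of equal length are checked against the same table snapshot before the
# newly found rules are merged in.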

def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % ( f, t ) for ( f, t ) in table if f and t]
    return '\n'.join( lines )
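
# Each ( from, to ) pair becomes one PHP array entry, e.g. (illustrative):
#   '乾' => '干',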

def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download( url, han_dest )

    # Get scim-tables:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )

    # Use the first (preferred) variant to build one-to-one character tables.
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
        s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )
    else:
        t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
        s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )

    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Drop identity mappings; they would only bloat the generated tables.
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
    else:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
    else:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )

    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 */

$zh2Hant = array(\n'''
        + '\n);\n\n$zh2Hans = array(\n' \
        + '\n);\n\n$zh2TW = array(\n' \
        + '\n);\n\n$zh2HK = array(\n' \
        + '\n);\n\n$zh2CN = array(\n' \
        + '\n);\n\n$zh2SG = array(\n' \

    if pyversion[:1] in ['2']:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' )
    else:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering = 4096, encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()

    # Remove the intermediate files extracted from the downloaded archives.
    print( 'Deleting temporary files ... ' )
    os.remove( 'EZ-Big.txt.in' )
    os.remove( 'phrase_lib.txt' )
    os.remove( 'Unihan_Variants.txt' )
    os.remove( 'Wubi.txt.in' )
    os.remove( 'Ziranma.txt.in' )

if __name__ == '__main__':
    main()