MediaWiki  1.23.0
Makefile.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @author Philip
4 import tarfile as tf
5 import zipfile as zf
6 import os, re, shutil, sys, platform
7 
# Interpreter and platform detection shared by the helpers below.
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion.startswith(('2.6', '2.7')):
    # Python 2: urlretrieve lives directly in urllib, and codecs.open
    # replaces the builtin open so files can be opened with an encoding.
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # sys.maxunicode below 0x10000: code points from 0x10000 upwards are
        # assembled from two 16-bit units (a surrogate pair) instead of
        # being passed to the builtin directly.
        def unichr(i):
            if i >= 0x10000:
                high = _unichr(0xD7C0 + (i >> 10))
                low = _unichr(0xDC00 + (i & 0x3FF))
                return high + low
            return _unichr(i)
elif pyversion.startswith('3.'):
    # Python 3: chr already covers the whole code point range.
    import urllib.request as urllib_request
    unichr = chr
25 
def unichr2(*args):
    """ Convert prefixed hexadecimal tokens into characters.

    For each argument, everything from the first '<' onwards is discarded,
    the first two characters (presumably a 'U+' prefix -- confirm against
    the input data) are skipped, and the remainder is parsed as a
    hexadecimal code point.  Returns the characters as a list, in argument
    order.
    """
    chars = []
    for token in args:
        hex_digits = token.split('<')[0][2:]
        chars.append(unichr(int(hex_digits, 16)))
    return chars
28 
def unichr3(*args):
    """ Convert fixed-position hexadecimal tokens into characters.

    Characters 2..6 of each argument are parsed as a hexadecimal code
    point; arguments for which that slice is empty are skipped.  Returns
    the characters as a list, in argument order.
    """
    chars = []
    for token in args:
        hex_digits = token[2:7]
        if hex_digits:
            chars.append(unichr(int(hex_digits, 16)))
    return chars
31 
# DEFINE
# Release versions (and the SourceForge mirror name) that main()
# interpolates into the download URLs and local archive names.
UNIHAN_VER = '6.3.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE
39 
def download(url, dest):
    """ Fetch url into the local file dest unless dest already exists.

    On Linux the transfer is handed to wget so that download progress is
    shown; if wget cannot be started, or on any other platform, urlretrieve
    is used instead.  Always returns None.
    """
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress
        import subprocess
        try:
            # An argument list (no shell) keeps '&', '?' or spaces in the
            # URL or file name from being interpreted by a shell.
            status = subprocess.call(['wget', url, '-O', dest])
        except OSError:
            # wget could not be started; fall through to urlretrieve.
            pass
        else:
            if status != 0 and os.path.isfile(dest):
                # "wget -O" creates dest even when the transfer fails;
                # remove it so the next run does not treat the partial
                # file as up to date.
                os.remove(dest)
            return
    print('Downloading from [%s] ...' % url)
    urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')
    return
54 
def uncompress(fp, member, encoding='U8'):
    """ Extract member from the open archive fp and open it for reading.

    The member is extracted relative to the current directory, moved to the
    current directory under its base name, and the top-level directory
    created by the extraction (if the member path contains '/') is removed.

    fp       -- archive object providing extract() (ZipFile or TarFile)
    member   -- path of the member inside the archive, '/'-separated
    encoding -- text encoding used to decode the extracted file

    Returns the extracted file opened for reading as text, with
    undecodable bytes ignored.  The caller is responsible for closing it.
    """
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    # The keyword form works for both the Python 3 builtin open and the
    # codecs.open alias installed on Python 2, so no version switch is
    # needed here.
    return open(name, 'r', encoding=encoding, errors='ignore')
67 
def unzip(path, member, encoding='U8'):
    """ Extract member from the zip archive at path; return it opened as text. """
    archive = zf.ZipFile(path)
    try:
        return uncompress(archive, member, encoding)
    finally:
        # The archive is only needed for the extraction itself.
        archive.close()


def untargz(path, member, encoding='U8'):
    """ Extract member from the gzipped tar archive at path; return it opened as text. """
    archive = tf.open(path, 'r:gz')
    try:
        return uncompress(archive, member, encoding)
    finally:
        # The archive is only needed for the extraction itself.
        archive.close()
73 
def parserCore(fp, pos, beginmark=None, endmark=None):
    """ Collect multi-character words from whitespace-separated lines.

    fp        -- iterable of text lines
    pos       -- index of the column whose value is collected
    beginmark -- line prefix that switches collection on
    endmark   -- line prefix that stops the scan

    Collection starts switched off only when both marks are given.  Lines
    starting with '#' and lines with fewer than two columns are skipped; a
    value is kept only if both the first column and column pos are longer
    than one character.  Returns the collected values as a set.
    """
    collecting = not (beginmark and endmark)
    words = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            collecting = True
            continue
        if endmark and line.startswith(endmark):
            break
        if not collecting or line.startswith('#'):
            continue
        fields = line.split()
        if len(fields) < 2:
            continue
        if len(fields[0]) > 1 and len(fields[pos]) > 1:  # words only
            words.add(fields[pos])
    return words
93 
def tablesParser(path, name):
    """ Read file from scim-tables and parse it.

    path -- the scim-tables .tar.gz archive
    name -- file name below tables/zh/ inside the archive

    Returns the set of words found in the second column between the
    BEGIN_TABLE and END_TABLE lines.
    """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
    finally:
        # Close the extracted file so it can be deleted later on.
        fp.close()
100 
def ezbigParser(path):
    """ Parse EZ-Big.txt.in from the scim-tables archive at path. """
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """ Parse Wubi.txt.in from the scim-tables archive at path. """
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """ Parse Ziranma.txt.in from the scim-tables archive at path. """
    return tablesParser(path, 'Ziranma.txt.in')
104 
def phraseParser(path):
    """ Read phrase_lib.txt and parse it.

    path -- the scim-pinyin .tar.gz archive

    Returns the set of multi-character words found in the first column.
    """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 0)
    finally:
        # Close the extracted file so it can be deleted later on.
        fp.close()
112 
def tsiParser(path):
    """ Read tsi.src and parse it.

    path -- the libtabe .tgz archive

    The file is decoded as big5hkscs.  Returns the set of multi-character
    words found in the first column.
    """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    try:
        return parserCore(fp, 0)
    finally:
        # Close the extracted file so it can be deleted later on.
        fp.close()
119 
def unihanParser(path):
    """ Read Unihan_Variants.txt and parse it.

    path -- the Unihan.zip archive

    Returns a tuple ( t2s, s2t ) of dicts.  Each maps a character to the
    list of its variants: kSimplifiedVariant lines fill t2s and
    kTraditionalVariant lines fill s2t.  Other lines are ignored.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            elems = line.split()
            if len(elems) < 3:
                continue
            # Second column names the relation; the rest are code points.
            variant_kind = elems.pop(1)
            elems = unichr2(*elems)
            if variant_kind == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif variant_kind == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    finally:
        fp.close()
    return (t2s, s2t)
140 
def applyExcludes(mlist, path):
    """ Apply exclude rules from path to mlist.

    mlist -- set of words; modified in place
    path  -- UTF-8 file of whitespace-separated exclude patterns, where
             everything after a '#' on a line is a comment

    Every word containing a match for any of the patterns is removed.
    Patterns are used as regular expressions, unescaped.  Returns mlist.
    """
    excludes = []
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            # Strip the comment before tokenising so that comment words
            # never end up as patterns.
            excludes.extend(line.split('#')[0].split())
    if not excludes:
        # An empty alternation would match (and remove) every word.
        return mlist
    excptn = re.compile('(?:%s)' % '|'.join(excludes))
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist
153 
def charManualTable(path):
    """ Read a manual character table from the UTF-8 file at path.

    On each line the part before '#' is split on '|' and the fields are
    converted to characters with unichr3.  Lines yielding more than one
    character map the first character to the list of the remaining ones.
    Returns the resulting dict.
    """
    ret = {}
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('#')[0].split('|')
            elems = unichr3(*elems)
            if len(elems) > 1:
                ret[elems[0]] = elems[1:]
    return ret
163 
def toManyRules(src_table):
    """ Collect the secondary targets of a one-to-many table.

    src_table -- dict mapping a key to a sequence of targets

    Returns the set of every target except the first one of each sequence.
    """
    tomany = set()
    # values() exists on both Python 2 and 3, so no version switch is needed.
    for targets in src_table.values():
        tomany.update(targets[1:])
    return tomany
175 
def removeRules(path, table):
    """ Remove the rules listed in the UTF-8 file at path from table.

    Each line is either 'from => to' or a single value used as both; the
    values may be wrapped in single or double quotes.  For every line the
    'from' key is dropped from table, and afterwards every remaining entry
    whose value equals one of the 'to' values is dropped as well.

    table is modified in place and returned.
    """
    excluded_targets = set()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('=>')
            f = t = elems[0].strip()
            if len(elems) == 2:
                t = elems[1].strip()
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                # A key that is not in the table is simply ignored.
                table.pop(f, None)
            if t:
                excluded_targets.add(t)
    # Values are compared literally, so rule text containing regular
    # expression metacharacters cannot match unrelated entries.
    doomed = [key for (key, value) in table.items() if value in excluded_targets]
    for key in doomed:
        del table[key]
    return table
203 
def customRules(path):
    """ Read 'from to' rules from the UTF-8 file at path.

    On each line the part before '#' is split on whitespace; lines with at
    least two fields map the first field to the second, and any further
    fields are ignored.  Returns the resulting dict.
    """
    ret = dict()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('#')[0].split()
            if len(elems) > 1:
                ret[elems[0]] = elems[1]
    return ret
212 
def dictToSortedList(src_table, pos):
    """ Return the (key, value) pairs of src_table as a sorted list.

    pos selects the sort field: 0 sorts by key, 1 sorts by value.
    """
    pairs = list(src_table.items())
    pairs.sort(key=lambda pair: pair[pos])
    return pairs
215 
def translate(text, conv_table):
    """ Rewrite text using conv_table, preferring the longest match.

    Scanning left to right, the longest substring starting at the current
    position that has a non-empty entry in conv_table is replaced by that
    entry, and scanning resumes after the replacement.  Returns the
    rewritten text.
    """
    pos = 0
    while pos < len(text):
        for span in range(len(text) - pos, 0, -1):
            end = pos + span
            replacement = conv_table.get(text[pos:end])
            if not replacement:
                continue
            text = text[:pos] + replacement + text[end:]
            # Land on the last replaced character; the increment below then
            # moves past it, so replaced text is never translated again.
            pos += len(replacement) - 1
            break
        pos += 1
    return text
228 
def manualWordsTable(path, conv_table, reconv_table):
    """ Build a reverse-conversion table from the manual word list at path.

    path         -- UTF-8 file with one word per line; text after '#' is
                    a comment
    conv_table   -- table used to translate each word
    reconv_table -- accepted for interface compatibility only; it is never
                    read, the returned table is built from scratch

    Words are processed shortest first.  Each translated word is mapped
    back to its source word; a source word that the table built so far
    would alter is additionally mapped to itself.  Returns the new table.
    """
    with open(path, 'r', encoding='U8') as fp:
        words = set(line.split('#')[0].strip() for line in fp)
    reconv_table = {}
    # Ties between equally long words are broken alphabetically so that
    # the result does not depend on set iteration order.
    for word in sorted(words, key=lambda w: (len(w), w)):
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
243 
def defaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table):
    """ Derive word-level reverse-conversion rules from a word list.

    src_wordlist      -- iterable of words
    src_tomany        -- characters joined into a regex alternation; only
                         words containing one of them are candidates for
                         the round-trip test below
    char_conv_table   -- base table for translating a word
    char_reconv_table -- base table for translating it back

    Words are handled in groups of equal length, shortest first.  Rules
    found in one group are merged into the working tables only when the
    next length group starts.  A rule new_word -> word is recorded when
    new_word has no reverse rule yet and either the reverse table would
    alter word, or word contains a to-many character and does not survive
    the round trip.  Returns the dict of recorded reverse rules.
    """
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    current_len = None
    # Ties between equally long words are broken alphabetically so that
    # the result does not depend on set iteration order.
    for word in sorted(src_wordlist, key=lambda w: (len(w), w)):
        if len(word) != current_len:
            # New length group: fold in the rules found so far.  Every
            # word is examined, including the first one of each group.
            current_len = len(word)
            conv_table.update(word_conv_table)
            reconv_table.update(word_reconv_table)
        test_word = translate(word, reconv_table)
        new_word = translate(word, conv_table)
        if reconv_table.get(new_word):
            continue
        if test_word != word \
           or (tomanyptn.search(word)
               and word != translate(new_word, reconv_table)):
            word_conv_table[word] = new_word
            word_reconv_table[new_word] = word
    return word_reconv_table
273 
def PHPArray(table):
    """ Render (from, to) pairs as the body of a PHP array literal.

    table -- iterable of (from, to) string pairs

    Pairs with an empty side are skipped.  Each remaining pair becomes one
    "'from' => 'to'," line; the lines are joined with newlines.
    """
    def quote(text):
        # Inside a PHP single-quoted string only backslash and the single
        # quote need escaping; backslash first so the escapes added for
        # quotes are not doubled.
        return text.replace('\\', '\\\\').replace("'", "\\'")

    lines = ["'%s' => '%s'," % (quote(f), quote(t)) for (f, t) in table if f and t]
    return '\n'.join(lines)
277 
def main():
    """ Download the source data and regenerate includes/ZhConversion.php.

    Steps: fetch the Unihan, scim-tables, scim-pinyin and libtabe archives
    (skipping files already present), build the character and word
    conversion tables, apply the *.manual rule files found in the current
    directory, write the PHP tables, and delete the extracted temporary
    files.
    """
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download(url, han_dest)

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % (SF_MIRROR, SCIM_TABLES_VER)
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % (SF_MIRROR, SCIM_PINYIN_VER)
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % (SF_MIRROR, LIBTABE_VER)
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan_Variants.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # items() works on both Python 2 and 3, so no version switch is needed.
    t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
    s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the superset for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables (identity mappings are dropped from the character tables)
    # sorted list toHans
    t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toSG
    toSG = dictToSortedList(customRules('toSG.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

'''
    sections = [
        ('$zh2Hant', toHant),
        ('$zh2Hans', toHans),
        ('$zh2TW', toTW),
        ('$zh2HK', toHK),
        ('$zh2CN', toCN),
        ('$zh2SG', toSG),
    ]
    php += '\n\n'.join(['%s = array(\n%s\n);' % (name, PHPArray(rules))
                        for (name, rules) in sections]) + '\n'

    out_path = os.path.join('..', '..', '..', 'includes', 'ZhConversion.php')
    print('Writing ZhConversion.php ... ')
    # The keyword form works for the Python 3 builtin open and for the
    # codecs.open alias used on Python 2; "with" closes the file even if
    # the write fails.
    with open(out_path, 'w', encoding='utf8') as f:
        f.write(php)

    # Remove temporary files
    print('Deleting temporary files ... ')
    temp_files = ('EZ-Big.txt.in', 'phrase_lib.txt', 'tsi.src',
                  'Unihan_Variants.txt', 'Wubi.txt.in', 'Ziranma.txt.in')
    for temp_file in temp_files:
        # A file that is already gone must not abort the run after the
        # output has been written.
        if os.path.isfile(temp_file):
            os.remove(temp_file)


if __name__ == '__main__':
    main()
Makefile.toManyRules
def toManyRules(src_table)
Definition: Makefile.py:164
Makefile.phraseParser
def phraseParser(path)
Definition: Makefile.py:105
Makefile.unichr2
def unichr2(*args)
Definition: Makefile.py:26
Makefile.ezbigParser
def ezbigParser
Definition: Makefile.py:101
Makefile.manualWordsTable
def manualWordsTable(path, conv_table, reconv_table)
Definition: Makefile.py:229
Makefile.wubiParser
def wubiParser
Definition: Makefile.py:102
Makefile._unichr
_unichr
Definition: Makefile.py:15
Makefile.dictToSortedList
def dictToSortedList(src_table, pos)
Definition: Makefile.py:213
Makefile.open
open
Definition: Makefile.py:14
Makefile.defaultWordsTable
def defaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table)
Definition: Makefile.py:244
Makefile.uncompress
def uncompress(fp, member, encoding='U8')
Definition: Makefile.py:55
Makefile.download
def download(url, dest)
Definition: Makefile.py:40
Makefile.main
def main()
Definition: Makefile.py:278
Makefile.charManualTable
def charManualTable(path)
Definition: Makefile.py:154
Makefile.parserCore
def parserCore(fp, pos, beginmark=None, endmark=None)
Definition: Makefile.py:74
Makefile.PHPArray
def PHPArray(table)
Definition: Makefile.py:274
Makefile.unihanParser
def unihanParser(path)
Definition: Makefile.py:120
Makefile.zrmParser
def zrmParser
Definition: Makefile.py:103
Makefile.customRules
def customRules(path)
Definition: Makefile.py:204
list
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user. For example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred-update object and putting those objects on a global list.
Definition: deferred.txt:11
Makefile.unzip
string unzip
Definition: Makefile.py:68
set
it's the revision text itself. In either case, if gzip is set
Definition: hooks.txt:2113
Makefile.tablesParser
def tablesParser(path, name)
Definition: Makefile.py:94
Makefile.applyExcludes
def applyExcludes(mlist, path)
Definition: Makefile.py:141
Makefile.removeRules
def removeRules(path, table)
Definition: Makefile.py:176
Makefile.translate
def translate(text, conv_table)
Definition: Makefile.py:216
Makefile.untargz
string untargz
Definition: Makefile.py:71
Makefile.tsiParser
def tsiParser(path)
Definition: Makefile.py:113
Makefile.unichr3
def unichr3(*args)
Definition: Makefile.py:29
Makefile.unichr
unichr
Definition: Makefile.py:24