MediaWiki REL1_31
Makefile.py
Go to the documentation of this file.
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# @author Philip
4import os
5import platform
6import re
7import shutil
8import sys
9import tarfile
10import zipfile
11
# Interpreter and platform facts consulted by the rest of this script.
pyversion = platform.python_version()
islinux = 'linux' == platform.system().lower()

# Compatibility shim: expose `urllib_request` and `unichr` (and, on
# Python 2, a codec-aware `open`) under the same names on both major versions.
if pyversion.startswith(('2.6', '2.7')):
    import codecs
    import urllib as urllib_request
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow build: code points above the BMP are returned as a
        # surrogate pair built from two 16-bit code units.
        def unichr(i):
            if i >= 0x10000:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
            return _unichr(i)
elif pyversion.startswith('3.'):
    import urllib.request as urllib_request
    unichr = chr
30
def unichr2(*args):
    """Convert hex code point tokens to a list of characters.

    For each token, the text before the first '<' is taken, its first two
    characters are dropped, and the remainder is parsed as a hexadecimal
    code point (presumably tokens look like 'U+4E00<source' -- verify
    against the Unihan data).
    """
    chars = []
    for token in args:
        hexdigits = token.split('<')[0][2:]
        chars.append(unichr(int(hexdigits, 16)))
    return chars
33
34
def unichr3(*args):
    """Convert fixed-position hex tokens to a list of characters.

    Characters 2..6 of each token are parsed as a hexadecimal code point;
    tokens for which that slice is empty are skipped.
    """
    chars = []
    for token in args:
        hexdigits = token[2:7]
        if hexdigits:
            chars.append(unichr(int(hexdigits, 16)))
    return chars
37
# DEFINE
# Versions (and SourceForge mirror) interpolated into the download URLs,
# the local archive names and the archive member paths used below.
UNIHAN_VER = '6.3.0'        # Unicode release providing Unihan.zip
SF_MIRROR = 'dfn'           # host prefix of the SourceForge download mirror
SCIM_TABLES_VER = '0.5.13'  # scim-tables tarball version
SCIM_PINYIN_VER = '0.5.92'  # scim-pinyin tarball version
LIBTABE_VER = '0.2.3'       # libtabe tarball version
# END OF DEFINE
45
46
def download(url, dest):
    """Fetch *url* into the local file *dest* unless *dest* already exists.

    On Linux the transfer is delegated to ``wget`` (it shows progress);
    elsewhere ``urllib_request.urlretrieve`` is used.

    :param url: address of the file to download.
    :param dest: local path to write; an existing file is treated as
        up to date and nothing is downloaded.
    :return: always ``None``.
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress.
        # An argument list (no shell) keeps odd characters in the URL or
        # the destination from being interpreted by a shell.
        try:
            status = subprocess.call(['wget', url, '-O', dest])
        except OSError as err:
            print('Could not run wget: %s' % err)
            return
        if status != 0:
            # `wget -O` creates the output file even when the transfer
            # fails; remove it so the next run does not mistake the
            # partial file for an up-to-date download.
            if os.path.isfile(dest):
                os.remove(dest)
            print('Download of [%s] failed (wget exit status %s).'
                  % (url, status))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
        print('Download complete.\n')
    return
61
62
def uncompress(fp, member, encoding='U8'):
    """Extract *member* from the open archive *fp* and open it as text.

    The member is extracted under its archive path, moved to its base name
    in the current directory, and the top-level directory created by the
    extraction (if the member path contains one) is removed.

    :param fp: archive object providing ``extract(member)``.
    :param member: path of the member inside the archive.
    :param encoding: text encoding used to open the extracted file;
        undecodable bytes are ignored.
    :return: the opened file object for the extracted member.
    """
    basename = member.rpartition('/')[2]
    print('Extracting %s ...' % basename)
    fp.extract(member)
    shutil.move(member, basename)
    if '/' in member:
        top_dir = member.split('/', 1)[0]
        shutil.rmtree(top_dir)
    if pyversion[:1] in ['2']:
        # Python 2: `open` is codecs.open (positional encoding, errors).
        return open(basename, 'rb', encoding, 'ignore')
    return open(basename, 'r', encoding=encoding, errors='ignore')
75
def unzip(path, member, encoding='U8'):
    """Extract *member* from the zip archive at *path* and open it as text.

    The archive handle is closed once the member has been extracted.

    :return: the file object returned by ``uncompress``.
    """
    archive = zipfile.ZipFile(path)
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()


def untargz(path, member, encoding='U8'):
    """Extract *member* from the gzipped tarball at *path* and open it as text.

    The archive handle is closed once the member has been extracted.

    :return: the file object returned by ``uncompress``.
    """
    archive = tarfile.open(path, 'r:gz')
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()
81
82
def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect multi-character words from whitespace-separated lines.

    :param fp: iterable of text lines.
    :param pos: index of the field to collect from each line.
    :param beginmark: line prefix that switches collection on; collection
        starts switched off only when both marks are given.
    :param endmark: line prefix that stops parsing altogether.
    :return: set of collected fields. A field is kept only when the line
        has at least two fields and both the first field and the selected
        field are longer than one character; lines starting with '#' are
        ignored.
    """
    collecting = not (beginmark and endmark)
    words = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            collecting = True
            continue
        if endmark and line.startswith(endmark):
            break
        if not collecting or line.startswith('#'):
            continue
        fields = line.split()
        if len(fields) < 2:
            continue
        if len(fields[0]) > 1 and len(fields[pos]) > 1:  # words only
            words.add(fields[pos])
    return words
102
103
def tablesParser(path, name):
    """Read a table file from the scim-tables tarball and parse it.

    :param path: path of the scim-tables ``.tar.gz`` archive.
    :param name: file name under ``tables/zh/`` inside the archive.
    :return: set of words found in the second column between the
        ``BEGIN_TABLE`` and ``END_TABLE`` marker lines.
    """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
    finally:
        # Release the handle; the extracted file is deleted later on.
        fp.close()
110
def ezbigParser(path):
    """Parse ``EZ-Big.txt.in`` from the scim-tables archive at *path*."""
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """Parse ``Wubi.txt.in`` from the scim-tables archive at *path*."""
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """Parse ``Ziranma.txt.in`` from the scim-tables archive at *path*."""
    return tablesParser(path, 'Ziranma.txt.in')
114
115
def phraseParser(path):
    """Read ``phrase_lib.txt`` from the scim-pinyin tarball and parse it.

    :param path: path of the scim-pinyin ``.tar.gz`` archive.
    :return: set of multi-character words taken from the first column.
    """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 0)
    finally:
        # Release the handle; the extracted file is deleted later on.
        fp.close()
122
123
def tsiParser(path):
    """Read ``tsi.src`` from the libtabe tarball and parse it.

    The file is decoded as ``big5hkscs``.

    :param path: path of the libtabe ``.tgz`` archive.
    :return: set of multi-character words taken from the first column.
    """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    try:
        return parserCore(fp, 0)
    finally:
        # Release the handle; the extracted file is deleted later on.
        fp.close()
129
130
def unihanParser(path):
    """Read ``Unihan_Variants.txt`` from the Unihan zip and parse it.

    Lines starting with '#' and lines with fewer than three fields are
    skipped. The second field selects the table: ``kTraditionalVariant``
    entries go to the simplified-to-traditional map, ``kSimplifiedVariant``
    entries to the traditional-to-simplified map; other types are ignored.

    :param path: path of the Unihan ``.zip`` archive.
    :return: tuple ``(t2s, s2t)`` of dicts mapping a character to the list
        of its variant characters.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            elems = line.split()
            if len(elems) < 3:
                continue
            # `variant_type` rather than `type`: do not shadow the builtin.
            variant_type = elems.pop(1)
            chars = unichr2(*elems)
            if variant_type == 'kTraditionalVariant':
                s2t[chars[0]] = chars[1:]
            elif variant_type == 'kSimplifiedVariant':
                t2s[chars[0]] = chars[1:]
    finally:
        # Close the file even if a malformed line raises.
        fp.close()
    return (t2s, s2t)
151
152
def applyExcludes(mlist, path):
    """Apply exclude rules from *path* to *mlist*.

    Each line of the file may hold whitespace-separated patterns; anything
    after a '#' on a line is a comment. Every word in *mlist* that contains
    a match for any pattern is removed.

    :param mlist: set of words; modified in place.
    :param path: UTF-8 text file with the exclude patterns.
    :return: *mlist* (the same set object).
    """
    excludes = []
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            # Strip the comment BEFORE tokenising, so that words inside a
            # comment never become patterns; split() also drops empties.
            excludes.extend(line.split('#')[0].split())
    if not excludes:
        # An empty alternation would match (and so remove) every word.
        return mlist
    excptn = re.compile('(?:%s)' % '|'.join(excludes))
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist
165
166
def charManualTable(path):
    """Yield character mappings from a manual table file.

    Each line is cut at the first '#', split on '|', and the pieces are
    converted to characters with ``unichr3``. Lines yielding fewer than
    two characters are skipped.

    :param path: UTF-8 text file to read.
    :return: generator of ``(first_char, [remaining_chars])`` tuples.
    """
    # `with` closes the file once the generator is exhausted or discarded.
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('#')[0].split('|')
            elems = unichr3(*elems)
            if len(elems) > 1:
                yield elems[0], elems[1:]
174
175
def toManyRules(src_table):
    """Collect every non-primary target of a one-to-many table.

    :param src_table: dict mapping a key to a sequence of targets.
    :return: set of all targets except the first one of each sequence.
    """
    tomany = set()
    # .values() exists on both Python 2 and 3, so no version branch is
    # needed; the keys are not used.
    for targets in src_table.values():
        tomany.update(targets[1:])
    return tomany
187
188
def removeRules(path, table):
    """Remove the rules listed in *path* from the conversion *table*.

    Each line is either ``from => to`` or a single entry (used as both
    ``from`` and ``to``); surrounding double and single quotes are
    stripped. A rule is dropped when its key equals a listed ``from`` or
    when its value equals a listed ``to``.

    :param path: UTF-8 text file listing the rules to remove.
    :param table: dict of rules; modified in place.
    :return: *table* (the same dict object).
    """
    blocked_targets = set()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('=>')
            f = t = elems[0].strip()
            if len(elems) == 2:
                t = elems[1].strip()
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                # A key that is already absent is not an error.
                table.pop(f, None)
            if t:
                blocked_targets.add(t)
    # Targets are compared literally (set membership), so characters that
    # are special in regular expressions cannot widen the match.
    doomed = [key for (key, value) in table.items()
              if value in blocked_targets]
    for key in doomed:
        del table[key]
    return table
216
217
def customRules(path):
    """Read tab-separated custom rules from *path*.

    Anything after a '#' on a line is a comment. Lines with at least two
    tab-separated fields map the first field to the second; other lines
    are ignored. Later lines override earlier ones with the same key.

    :param path: UTF-8 text file to read.
    :return: dict mapping the first field to the second.
    """
    ret = dict()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            line = line.rstrip('\r\n')
            if '#' in line:
                line = line.split('#')[0].rstrip()
            elems = line.split('\t')
            if len(elems) > 1:
                ret[elems[0]] = elems[1]
    return ret
229
230
def dictToSortedList(src_table, pos):
    """Return the items of *src_table* as a sorted list of pairs.

    :param src_table: dict to convert.
    :param pos: 0 to sort by key first, 1 to sort by value first; the
        other element of the pair breaks ties.
    :return: list of ``(key, value)`` tuples in sorted order.
    """
    other = 1 - pos

    def sort_key(pair):
        return (pair[pos], pair[other])

    pairs = list(src_table.items())
    pairs.sort(key=sort_key)
    return pairs
233
234
235def translate(text, conv_table):
236 i = 0
237 while i < len(text):
238 for j in range(len(text) - i, 0, -1):
239 f = text[i:][:j]
240 t = conv_table.get(f)
241 if t:
242 text = text[:i] + t + text[i:][j:]
243 i += len(t) - 1
244 break
245 i += 1
246 return text
247
248
def manualWordsTable(path, conv_table, reconv_table):
    """Build word-level rules from a manually maintained word list.

    Words are read one per line (text after '#' is a comment), de-duplicated
    and processed from shortest to longest. A word gets rules only when
    translating it with the reverse table would change it; the reverse
    table copy is extended as rules are added, so earlier words influence
    later ones.

    :param path: UTF-8 text file with one word per line.
    :param conv_table: table used to compute each word's converted form.
    :param reconv_table: reverse table used for the check; not modified
        (a copy is extended instead).
    :return: dict mapping both the word and its converted form to the word.
    """
    with open(path, 'r', encoding='U8') as fp:
        words = set(line.split('#')[0].strip() for line in fp)
    reconv_table = reconv_table.copy()
    out_table = {}
    # Ascending (length, word) order: the same order the original obtained
    # by sorting in reverse and popping from the end of the list.
    for word in sorted(words, key=lambda w: (len(w), w)):
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = out_table[word] = word
            reconv_table[new_word] = out_table[new_word] = word
    return out_table
264
265
def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    """Derive word-level rules from a word list.

    Words are processed from shortest to longest, in batches of equal
    length. Rules found in one batch are merged into the working tables
    only before the next batch starts.

    :param src_wordlist: iterable of candidate words.
    :param src_tomany: iterable of strings joined into a regex alternation
        used to spot words containing one-to-many characters.
    :param char_conv_table: base table used to convert each word.
    :param char_reconv_table: base table used for the reverse check.
    :return: dict mapping each converted word back to its source word.
    """
    pending = sorted(src_wordlist, key=lambda w: (len(w), w), reverse=True)
    found_forward = {}
    found_reverse = {}
    forward = char_conv_table.copy()
    backward = char_reconv_table.copy()
    tomany_re = re.compile('(?:%s)' % '|'.join(src_tomany))
    while pending:
        forward.update(found_forward)
        backward.update(found_reverse)
        word = pending.pop()
        batch_len = len(word)
        while len(word) == batch_len:
            roundtrip = translate(word, backward)
            converted = translate(word, forward)
            if not backward.get(converted):
                if roundtrip != word or (
                        tomany_re.search(word) and
                        word != translate(converted, backward)):
                    found_forward[word] = converted
                    found_reverse[converted] = word
            if not pending:
                break
            # NOTE(review): when this pop returns a word of a different
            # length the inner loop ends and the outer loop pops again,
            # so that word is never examined -- confirm this is intended.
            word = pending.pop()
    return found_reverse
295
296
def PHPArray(table):
    """Render pairs as the body of a PHP array literal.

    :param table: iterable of ``(from, to)`` string pairs; pairs with an
        empty element are skipped.
    :return: one ``'from' => 'to',`` line per pair, joined with newlines.
    """
    def quote(text):
        # Inside a PHP single-quoted string only backslash and single
        # quote are special; escape both so any entry stays a valid literal.
        return "'%s'" % text.replace('\\', '\\\\').replace("'", "\\'")

    lines = ['%s => %s,' % (quote(f), quote(t)) for (f, t) in table if f and t]
    return '\n'.join(lines)
300
301
def main():
    """Download the source data, build the tables and write ZhConversion.php.

    Steps, in order: download the Unihan, scim-tables, scim-pinyin and
    libtabe archives (skipped when already present); build the
    character-level tables from Unihan plus the ``*.manual`` files; derive
    word-level tables from the manual phrase lists and the input-method
    word lists; write the five PHP arrays to
    ``../../../languages/data/ZhConversion.php``; finally delete the files
    extracted from the archives.
    """
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # dict.items() works on both Python 2 and 3, so no version branch.
    t2s_1to1 = dict((f, t[0]) for (f, t) in t2s_1tomany.items())
    s2t_1to1 = dict((f, t[0]) for (f, t) in s2t_1tomany.items())

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the super set for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables (identity rules are dropped from the 1-to-1 tables)
    # sorted list toHans
    t2s_1to1 = dict((f, t) for (f, t) in t2s_1to1.items() if f != t)
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    s2t_1to1 = dict((f, t) for (f, t) in s2t_1to1.items() if f != t)
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    # The namespace backslashes are written as '\\' so the literal holds no
    # invalid escape sequences; the resulting text is unchanged.
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

namespace MediaWiki\\Languages\\Data;

class ZhConversion {
public static $zh2Hant = [\n'''
    php += PHPArray(toHant) \
        + '\n];\n\npublic static $zh2Hans = [\n' \
        + PHPArray(toHans) \
        + '\n];\n\npublic static $zh2TW = [\n' \
        + PHPArray(toTW) \
        + '\n];\n\npublic static $zh2HK = [\n' \
        + PHPArray(toHK) \
        + '\n];\n\npublic static $zh2CN = [\n' \
        + PHPArray(toCN) \
        + '\n];\n}\n'

    out_path = os.path.join('..', '..', '..', 'languages', 'data',
                            'ZhConversion.php')
    # `with` guarantees the output is flushed and closed even if the write
    # fails. On Python 2 `open` is codecs.open, which accepts the same
    # mode/encoding arguments.
    with open(out_path, 'w', encoding='utf8') as f:
        print('Writing ZhConversion.php ... ')
        f.write(php)

    # Remove temporary files
    print('Deleting temporary files ... ')
    for extracted in ('EZ-Big.txt.in', 'phrase_lib.txt', 'tsi.src',
                      'Unihan_Variants.txt', 'Wubi.txt.in', 'Ziranma.txt.in'):
        os.remove(extracted)
450
# Build the tables only when run as a script, not when imported.
if '__main__' == __name__:
    main()
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user. For example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred-update object and putting those objects on a global list.
Definition deferred.txt:11
while(( $__line=Maintenance::readconsole()) !==false) print
Definition eval.php:64
translate(text, conv_table)
Definition Makefile.py:235
manualWordsTable(path, conv_table, reconv_table)
Definition Makefile.py:249
unihanParser(path)
Definition Makefile.py:131
removeRules(path, table)
Definition Makefile.py:189
defaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table)
Definition Makefile.py:267
str untargz
Definition Makefile.py:79
customRules(path)
Definition Makefile.py:218
tsiParser(path)
Definition Makefile.py:124
tablesParser ezbigParser
Definition Makefile.py:111
tablesParser zrmParser
Definition Makefile.py:113
str unzip
Definition Makefile.py:76
PHPArray(table)
Definition Makefile.py:297
phraseParser(path)
Definition Makefile.py:116
unichr2(*args)
Definition Makefile.py:31
applyExcludes(mlist, path)
Definition Makefile.py:153
unichr3(*args)
Definition Makefile.py:35
toManyRules(src_table)
Definition Makefile.py:176
charManualTable(path)
Definition Makefile.py:167
download(url, dest)
Definition Makefile.py:47
parserCore(fp, pos, beginmark=None, endmark=None)
Definition Makefile.py:83
uncompress(fp, member, encoding='U8')
Definition Makefile.py:63
tablesParser wubiParser
Definition Makefile.py:112
tablesParser(path, name)
Definition Makefile.py:104
dictToSortedList(src_table, pos)
Definition Makefile.py:231