REL1_28/php/Makefile_8py_source.html

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @author Philip

import os

import platform

import re

import shutil

import sys

import tarfile

import zipfile


# Interpreter version string and host-OS flag; the rest of the script uses
# them to choose between Python 2 / Python 3 and wget / urlretrieve paths.
pyversion = platform.python_version()
islinux = 'linux' == platform.system().lower()

if pyversion[:3] in ['2.6', '2.7']:
    # Python 2: alias urllib, make open() accept an encoding, and keep the
    # builtin unichr reachable under a private name.
    import codecs
    import urllib as urllib_request

    open = codecs.open
    _unichr = unichr

    if sys.maxunicode < 0x10000:

        def unichr(i):
            """Return the character for code point i.

            Code points from 0x10000 upward are returned as a two-unit
            surrogate pair because this interpreter's maxunicode is below
            0x10000.
            """
            if i >= 0x10000:
                high = _unichr(0xD7C0 + (i >> 10))
                low = _unichr(0xDC00 + (i & 0x3FF))
                return high + low
            return _unichr(i)

elif pyversion[:2] == '3.':
    # Python 3: chr() plays the role of unichr.
    import urllib.request as urllib_request

    unichr = chr


def unichr2(*args):
    """Convert code point tokens to characters.

    For each argument, everything from the first '<' onward is dropped, the
    first two characters are skipped, and the rest is parsed as a
    hexadecimal code point (so 'U+4E00<kSource' yields the character
    U+4E00).

    Returns:
        A list with one character per argument, in argument order.
    """
    chars = []
    for token in args:
        code_point = token.split('<')[0]
        chars.append(unichr(int(code_point[2:], 16)))
    return chars


def unichr3(*args):
    """Convert fixed-position code point tokens to characters.

    Characters at indexes 2 to 6 of each argument are parsed as a
    hexadecimal code point; arguments that have nothing at those positions
    are skipped.

    Returns:
        A list of characters for the arguments that were not skipped.
    """
    chars = []
    for token in args:
        digits = token[2:7]
        if digits:
            chars.append(unichr(int(digits, 16)))
    return chars


# DEFINE


# Unicode release whose Unihan.zip archive main() downloads.
UNIHAN_VER = '6.3.0'

# Mirror name substituted into the SourceForge download host in main().
SF_MIRROR = 'dfn'

# scim-tables release: used for the tarball name and for the
# archive-internal path built by tablesParser().
SCIM_TABLES_VER = '0.5.13'

# scim-pinyin release: used for the tarball name and for the
# archive-internal path built by phraseParser().
SCIM_PINYIN_VER = '0.5.92'

# libtabe release: used for the tarball name fetched by main().
LIBTABE_VER = '0.2.3'

# END OF DEFINE


def download(url, dest):
    """Fetch url into the local file dest unless dest already exists.

    On Linux the transfer is delegated to wget; elsewhere
    urllib's urlretrieve is used.

    Args:
        url: Address of the file to download.
        dest: Path of the local file to create.

    Returns:
        None.
    """
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return

    if islinux:
        # Function-scope import: only this branch needs subprocess.
        import subprocess

        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress.
        # An argument list (no shell) keeps url and dest from being
        # re-parsed by a shell.
        try:
            status = subprocess.call(['wget', url, '-O', dest])
        except OSError as err:
            # wget is missing or not executable; report it and carry on,
            # as a failed shell command would have.
            print('Could not run wget: %s' % err)
            return
        if status != 0 and os.path.isfile(dest):
            # wget -O creates dest even when the transfer fails; remove the
            # partial file so the next run does not report it as up to date.
            os.remove(dest)
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
        print('Download complete.\n')


def uncompress(fp, member, encoding='U8'):
    """Extract one archive member into the working directory and open it.

    The member is extracted through fp.extract(), moved to the working
    directory under its base name, and the top-level directory created by
    the extraction (if the member path contains one) is removed again.

    Args:
        fp: Archive object providing extract(member).
        member: '/'-separated path of the member inside the archive.
        encoding: Text encoding used to open the extracted file.

    Returns:
        The extracted file opened for reading, with decoding errors ignored.
    """
    parts = member.split('/')
    name = parts[-1]
    print('Extracting %s ...' % name)

    fp.extract(member)
    shutil.move(member, name)
    if len(parts) > 1:
        # Drop the directory tree the extraction left behind.
        shutil.rmtree(parts[0])

    if pyversion[:1] == '2':
        return open(name, 'rb', encoding, 'ignore')
    return open(name, 'r', encoding=encoding, errors='ignore')


def unzip(path, member, encoding='U8'):
    """Extract member from the zip archive at path and open it as text.

    Args:
        path: Path of the zip archive.
        member: Path of the member inside the archive.
        encoding: Text encoding used to open the extracted file.

    Returns:
        The file object returned by uncompress() for the extracted member.
    """
    archive = zipfile.ZipFile(path)
    try:
        return uncompress(archive, member, encoding)
    finally:
        # uncompress() reopens the extracted copy from disk, so the archive
        # handle is not needed once it returns.
        archive.close()


def untargz(path, member, encoding='U8'):
    """Extract member from the gzipped tar archive at path and open it as text.

    Args:
        path: Path of the .tar.gz / .tgz archive.
        member: Path of the member inside the archive.
        encoding: Text encoding used to open the extracted file.

    Returns:
        The file object returned by uncompress() for the extracted member.
    """
    archive = tarfile.open(path, 'r:gz')
    try:
        return uncompress(archive, member, encoding)
    finally:
        # uncompress() reopens the extracted copy from disk, so the archive
        # handle is not needed once it returns.
        archive.close()


def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect multi-character words from whitespace-separated lines.

    Args:
        fp: Iterable of text lines.
        pos: Index of the field to collect from each line.
        beginmark: Line prefix that switches collection on.
        endmark: Line prefix that stops the scan.

    Collection starts switched off only when both marks are given;
    otherwise every line is eligible from the start. Lines beginning with
    '#' and lines with fewer than two fields are ignored. A field is kept
    only when both the first field and the selected field are longer than
    one character.

    Returns:
        A set of the collected fields.
    """
    active = not (beginmark and endmark)
    words = set()

    for line in fp:
        if beginmark and line.startswith(beginmark):
            active = True
            continue
        if endmark and line.startswith(endmark):
            break
        if not active or line.startswith('#'):
            continue

        fields = line.split()
        if len(fields) < 2:
            continue
        # words only
        if len(fields[0]) > 1 and len(fields[pos]) > 1:
            words.add(fields[pos])

    return words


def tablesParser(path, name):
    """ Read file from scim-tables and parse it.

    Args:
        path: Path of the scim-tables tarball.
        name: File name of the table under tables/zh/ inside the tarball.

    Returns:
        The set produced by parserCore() for field 1 of the lines between
        the BEGIN_TABLE and END_TABLE markers.
    """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
    finally:
        # Release the extracted file even if parsing fails.
        fp.close()


def ezbigParser(path):
    """Parse EZ-Big.txt.in from the scim-tables tarball at path."""
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """Parse Wubi.txt.in from the scim-tables tarball at path."""
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """Parse Ziranma.txt.in from the scim-tables tarball at path."""
    return tablesParser(path, 'Ziranma.txt.in')


def phraseParser(path):
    """ Read phrase_lib.txt and parse it.

    Args:
        path: Path of the scim-pinyin tarball.

    Returns:
        The set produced by parserCore() for field 0 of every line.
    """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 0)
    finally:
        # Release the extracted file even if parsing fails.
        fp.close()


def tsiParser(path):
    """ Read tsi.src and parse it.

    Args:
        path: Path of the libtabe tarball.

    Returns:
        The set produced by parserCore() for field 0 of every line; the
        file is decoded as big5hkscs.
    """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    try:
        return parserCore(fp, 0)
    finally:
        # Release the extracted file even if parsing fails.
        fp.close()


def unihanParser(path):
    """ Read Unihan_Variants.txt and parse it.

    Args:
        path: Path of the Unihan zip archive.

    Returns:
        A tuple (t2s, s2t) of dicts mapping a character to the list of its
        variants: t2s is filled from kSimplifiedVariant lines, s2t from
        kTraditionalVariant lines.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            elems = line.split()
            if len(elems) < 3:
                continue
            # Field 1 names the variant relation; the remaining fields are
            # code point tokens (source first, then its variants).
            variant_type = elems.pop(1)
            elems = unichr2(*elems)
            if variant_type == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif variant_type == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    finally:
        # Close the extracted file even when a line fails to parse.
        fp.close()
    return (t2s, s2t)


def applyExcludes(mlist, path):
    """ Apply exclude rules from path to mlist.

    The rule file holds whitespace-separated patterns; everything from '#'
    to the end of a line is a comment. Every word of mlist in which any
    pattern is found is removed.

    Args:
        mlist: Set of words; it is modified in place.
        path: Path of the UTF-8 rule file.

    Returns:
        mlist itself, after the removals.
    """
    rules = []
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            # Strip the comment first so its words are not read as rules.
            rules.extend(line.split('#')[0].split())

    if not rules:
        # An empty alternation matches every string and would wipe mlist.
        return mlist

    excptn = re.compile('(?:%s)' % '|'.join(rules))
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist


def charManualTable(path):
    """Yield character rules from a '|'-separated manual table file.

    On each line the part before '#' is split on '|' and the pieces are
    converted with unichr3(); lines that yield fewer than two characters
    are skipped.

    Args:
        path: Path of the UTF-8 table file.

    Yields:
        Tuples (first_char, [remaining_chars...]).
    """
    # The with-block closes the file once the generator is exhausted or
    # discarded.
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('#')[0].split('|')
            elems = unichr3(*elems)
            if len(elems) > 1:
                yield elems[0], elems[1:]


def toManyRules(src_table):
    """Collect every non-first target from a one-to-many table.

    Args:
        src_table: Dict mapping a key to a sequence of targets.

    Returns:
        A set holding, for every value of src_table, all of its elements
        except the first.
    """
    tomany = set()
    # dict.values() works on Python 2 and 3 alike, so one loop serves both.
    for targets in src_table.values():
        tomany.update(targets[1:])
    return tomany


def removeRules(path, table):
    """Remove rules listed in a noconvert file from table.

    Each line has the form 'FROM' or 'FROM => TO' (either side may be
    quoted or empty). The rule keyed by FROM is removed, and afterwards
    every rule whose target fully matches one of the collected TO values
    is removed as well. A line without '=>' uses its text for both sides.

    Args:
        path: Path of the UTF-8 rule file.
        table: Dict of source -> target strings; it is modified in place.

    Returns:
        table itself, after the removals.
    """
    texc = list()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('=>')
            f = t = elems[0].strip()
            if len(elems) == 2:
                t = elems[1].strip()
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                # A missing key is fine; nothing else is silenced.
                table.pop(f, None)
            if t:
                texc.append(t)

    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    # Iterate over a snapshot because entries are removed while scanning.
    for (tmp_f, tmp_t) in list(table.items()):
        if texcptn.match(tmp_t):
            table.pop(tmp_f)
    return table


def customRules(path):
    """Read tab-separated 'FROM<TAB>TO' rules from a manual file.

    Line endings are stripped; when a line contains '#', everything from
    the first '#' on is dropped and trailing whitespace removed. Lines
    without a tab are skipped; fields after the second are ignored.

    Args:
        path: Path of the UTF-8 rule file.

    Returns:
        A dict mapping each FROM field to its TO field.
    """
    ret = dict()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            line = line.rstrip('\r\n')
            if '#' in line:
                line = line.split('#')[0].rstrip()
            elems = line.split('\t')
            if len(elems) > 1:
                ret[elems[0]] = elems[1]
    return ret


def dictToSortedList(src_table, pos):
    """Return the items of src_table as a sorted list of pairs.

    Args:
        src_table: Dict to convert.
        pos: 0 to sort by key first, 1 to sort by value first; the other
            element of the pair breaks ties.

    Returns:
        A list of (key, value) tuples in sorted order.
    """
    other = 1 - pos

    def sort_key(pair):
        return (pair[pos], pair[other])

    pairs = list(src_table.items())
    pairs.sort(key=sort_key)
    return pairs


def translate(text, conv_table):
    """Rewrite text using the longest matching keys of conv_table.

    Scanning from left to right, the longest substring starting at the
    current position that has a non-empty replacement in conv_table is
    substituted, and scanning resumes after the inserted replacement.
    Positions with no match are left unchanged.

    Args:
        text: String to convert.
        conv_table: Mapping of substrings to replacement strings.

    Returns:
        The converted string.
    """
    cursor = 0
    while cursor < len(text):
        longest = len(text) - cursor
        for width in range(longest, 0, -1):
            stop = cursor + width
            replacement = conv_table.get(text[cursor:stop])
            if not replacement:
                continue
            text = text[:cursor] + replacement + text[stop:]
            # Land on the last inserted character; the step below moves on.
            cursor += len(replacement) - 1
            break
        cursor += 1
    return text


def manualWordsTable(path, conv_table, reconv_table):
    """Build word rules from a manual phrase list.

    Words are read one per line (text from '#' on is dropped), made unique
    and processed from shortest to longest. For each word, its translation
    through conv_table is mapped back to the word; if the word itself would
    be altered by the rules gathered so far, an identity rule for the word
    is recorded first.

    Args:
        path: Path of the UTF-8 phrase file.
        conv_table: Table used to translate each word.
        reconv_table: Table for the opposite direction; a copy is extended
            while processing, the argument itself is not modified.

    Returns:
        A dict of the recorded rules.
    """
    with open(path, 'r', encoding='U8') as fp:
        wordlist = [line.split('#')[0].strip() for line in fp]

    reconv_table = reconv_table.copy()
    out_table = {}
    wordlist = list(set(wordlist))
    # Sorted descending so that pop() hands out the shortest word first.
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = out_table[word] = word
        reconv_table[new_word] = out_table[new_word] = word
    return out_table


def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    """Derive word-level reverse rules from a word list.

    Words are processed from shortest to longest, one length group at a
    time. Rules found within a group are folded into the working tables
    only when the next group starts, so words of equal length do not
    influence each other.

    A rule new_word -> word is recorded when new_word has no entry in the
    reverse table yet and either the word is altered by the reverse table,
    or it contains a to-many character and does not survive a round trip
    through both tables.

    Args:
        src_wordlist: Iterable of words to examine.
        src_tomany: Iterable of strings joined into the to-many pattern.
        char_conv_table: Base table used to translate each word.
        char_reconv_table: Base table for the opposite direction.

    Returns:
        A dict mapping each translated word back to its source word.
    """
    # Ascending (length, word) order: shortest words first.
    wordlist = sorted(src_wordlist, key=lambda w: (len(w), w))
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))

    group_len = None
    # A plain for-loop visits every word exactly once; no word is lost at
    # the boundary between two length groups.
    for word in wordlist:
        if len(word) != group_len:
            # New length group: make rules from shorter words visible.
            group_len = len(word)
            conv_table.update(word_conv_table)
            reconv_table.update(word_reconv_table)
        test_word = translate(word, reconv_table)
        new_word = translate(word, conv_table)
        if not reconv_table.get(new_word) and \
           (test_word != word or
            (tomanyptn.search(word) and
             word != translate(new_word, reconv_table))):
            word_conv_table[word] = new_word
            word_reconv_table[new_word] = word
    return word_reconv_table


def PHPArray(table):
    """Render pairs as the body of a PHP array literal.

    Args:
        table: Iterable of (from, to) pairs.

    Returns:
        One "'from' => 'to'," line per pair whose two elements are both
        non-empty, joined with newlines.
    """
    entries = []
    for source, target in table:
        if source and target:
            entries.append("'%s' => '%s'," % (source, target))
    return '\n'.join(entries)


def main():
    """Download the source data, build the tables and write ZhConversion.php.

    Steps, in order: fetch the Unihan, scim-tables, scim-pinyin and libtabe
    archives; build character tables from Unihan plus the *.manual files;
    derive word-level tables from the manual phrase lists and the input
    method word lists; render everything as PHP arrays into
    ../../../languages/data/ZhConversion.php; delete the extracted
    temporary files from the working directory.
    """
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # dict.items() exists on Python 2 and 3, so no version switch is needed.
    t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
    s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the supper set for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans (identity rules are dropped first)
    t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant (identity rules are dropped first)
    s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    # The namespace backslashes are written as '\\' so the literal contains
    # no invalid escape sequences; the resulting text is unchanged.
    php = '''<?php

/**

 * Simplified / Traditional Chinese conversion tables

 *

 * Automatically generated using code and data in maintenance/language/zhtable/

 * Do not modify directly!

 *

 * @file

 */


namespace MediaWiki\\Languages\\Data;


class ZhConversion {

public static $zh2Hant = [\n'''
    php += (PHPArray(toHant)
            + '\n];\n\npublic static $zh2Hans = [\n'
            + PHPArray(toHans)
            + '\n];\n\npublic static $zh2TW = [\n'
            + PHPArray(toTW)
            + '\n];\n\npublic static $zh2HK = [\n'
            + PHPArray(toHK)
            + '\n];\n\npublic static $zh2CN = [\n'
            + PHPArray(toCN)
            + '\n];\n}\n')

    out_path = os.path.join('..', '..', '..', 'languages', 'data', 'ZhConversion.php')
    if pyversion[:1] in ['2']:
        f = open(out_path, 'wb', encoding='utf8')
    else:
        f = open(out_path, 'w', buffering=4096, encoding='utf8')
    print ('Writing ZhConversion.php ... ')
    try:
        f.write(php)
    finally:
        # Close the output file even if the write fails.
        f.close()

    # Remove temporary files
    print ('Deleting temporary files ... ')
    for tmp_name in ('EZ-Big.txt.in', 'phrase_lib.txt', 'tsi.src',
                     'Unihan_Variants.txt', 'Wubi.txt.in', 'Ziranma.txt.in'):
        os.remove(tmp_name)


# Run the table generator only when this file is executed as a script.
if __name__ == '__main__':

    main()

list
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user; for example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred-update object and putting those objects on a global list.
Definition deferred.txt:11

print
while(( $__line=Maintenance::readconsole()) !==false) print
Definition eval.php:64

Makefile.translate
translate(text, conv_table)
Definition Makefile.py:235

Makefile.manualWordsTable
manualWordsTable(path, conv_table, reconv_table)
Definition Makefile.py:249

Makefile.unihanParser
unihanParser(path)
Definition Makefile.py:131

Makefile.removeRules
removeRules(path, table)
Definition Makefile.py:189

Makefile._unichr
_unichr
Definition Makefile.py:19

Makefile.defaultWordsTable
defaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table)
Definition Makefile.py:267

Makefile.open
open
Definition Makefile.py:18

Makefile.unichr
unichr
Definition Makefile.py:28

Makefile.untargz
str untargz
Definition Makefile.py:79

Makefile.customRules
customRules(path)
Definition Makefile.py:218

Makefile.tsiParser
tsiParser(path)
Definition Makefile.py:124

Makefile.ezbigParser
tablesParser ezbigParser
Definition Makefile.py:111

Makefile.zrmParser
tablesParser zrmParser
Definition Makefile.py:113

Makefile.unzip
str unzip
Definition Makefile.py:76

Makefile.PHPArray
PHPArray(table)
Definition Makefile.py:297

Makefile.phraseParser
phraseParser(path)
Definition Makefile.py:116

Makefile.unichr2
unichr2(*args)
Definition Makefile.py:31

Makefile.applyExcludes
applyExcludes(mlist, path)
Definition Makefile.py:153

Makefile.unichr3
unichr3(*args)
Definition Makefile.py:35

Makefile.toManyRules
toManyRules(src_table)
Definition Makefile.py:176

Makefile.charManualTable
charManualTable(path)
Definition Makefile.py:167

Makefile.download
download(url, dest)
Definition Makefile.py:47

Makefile.main
main()
Definition Makefile.py:302

Makefile.parserCore
parserCore(fp, pos, beginmark=None, endmark=None)
Definition Makefile.py:83

Makefile.uncompress
uncompress(fp, member, encoding='U8')
Definition Makefile.py:63

Makefile.wubiParser
tablesParser wubiParser
Definition Makefile.py:112

Makefile.tablesParser
tablesParser(path, name)
Definition Makefile.py:104

Makefile.dictToSortedList
dictToSortedList(src_table, pos)
Definition Makefile.py:231