1. Add change log. 2. Change the converting unit from hanji alone to hanji + lomaji, etc.
parent 9b7d8d4432
commit 9f2836067a
7 changed files with 1030 additions and 266 deletions
corpus/教典例句.csv: 520 changed lines
File diff suppressed because it is too large.
.gitignore: 160 lines (new file)
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
model.db: binary file (not shown)
pakkau.py: 18 changed lines
@@ -7,7 +7,7 @@ import os
 import sqlite3
 from itertools import chain

-model_filename = "model.db"
+model_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model.db")

 def genmod():
     corpus_path = "./corpus/"
@@ -44,7 +44,11 @@ def genmod():

     for i in new_data:
         hanji = i[0]
+
         lomaji = i[1]
+        '''111'''
+        hanji = list(zip(hanji, lomaji))
+        hanji = list(map(lambda x : x[0] + x[1], hanji))
         for j in range(len(i[0])):
             if not hanji[j] in char_to_pronounce:
                 char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
@@ -65,7 +69,7 @@ def genmod():


     for i in new_data:
-        head_hanji = i[0][0]
+        head_hanji = i[0][0]+i[1][0]

         if head_hanji in init_freq:
             init_freq[head_hanji] += 1
@@ -86,7 +90,8 @@ def genmod():
     cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")

     for i in new_data:
-        hanji = i[0]
+        hanji_tmp = list(zip(i[0],i[1]))
+        hanji = list(map(lambda x: x[0]+ x[1], hanji_tmp))
         for j in range(len(i[0])-1):
             this_hanji = hanji[j]
             next_hanji = hanji[j+1]
@@ -111,7 +116,6 @@ def genmod():
 def get_homophones(pron, cur, con):
     homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
     homophones = list(map(lambda x: x[0], homophones_raw))
-
     return homophones

 def convert(sentences):
@@ -171,7 +175,7 @@ def convert_one_sentence(sentence):

         for i in homophones_sequence[0]:
             i_freq = cur.execute('''select initial.freq FROM initial
-            WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]
+            WHERE initial.char = ?''', (i['char'],)).fetchall()[0][0]

             i['prob'] = i_freq / head_freq_total

@@ -268,8 +272,10 @@ on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0]
         current = current_ls[0]["char"]
         prev_char = current_ls[0]["prev_char"]

+    return_result = list(filter(lambda x : x != "", return_result))
+    return_result = list(map(lambda x : x[0] if re.match(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎𪜀-\U0002b73f]', x)
+                                       else x, return_result))

     return return_result

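The core of this commit is the change of the converting unit: genmod() now keys the model on each hanji concatenated with its lomaji reading instead of on the hanji alone, so the same character with different readings becomes different units. A minimal sketch of that pairing step, mirroring the added zip/map lines above (the sample row is invented):

# Illustrative only; follows the new zip/map construction in genmod().
hanji = ["我", "來", "去"]        # characters of one corpus row (invented example)
lomaji = ["guá", "lâi", "khì"]    # their readings

units = list(map(lambda x: x[0] + x[1], zip(hanji, lomaji)))
print(units)  # ['我guá', '來lâi', '去khì'] -- each unit now carries both character and reading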
pakkau.py~: 302 lines (new file)
@@ -0,0 +1,302 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3
from itertools import chain

model_filename = "model.db"

def genmod():
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    for index, row in df.iterrows():
        hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if (len(hanji) != len(tl2)):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")

    #model_filename = "model.db"
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")

    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if not hanji[j] in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
            elif not lomaji[j] in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1

    for i in char_to_pronounce.keys():
        hanji = char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))

    all_chars = char_to_pronounce.keys()
    init_freq = {} #詞kap句開始ê字出現次數
    cur.execute("CREATE TABLE initial(char, freq)")

    for i in new_data:
        head_hanji = i[0][0]

        if head_hanji in init_freq:
            init_freq[head_hanji] += 1
        else:
            init_freq[head_hanji] = 1

    #補字
    min_weight = 0.1

    for i in all_chars:
        if not i in init_freq.keys():
            init_freq[i] = 0.1

    for i in init_freq.keys():
        cur.execute("INSERT INTO initial VALUES(?, ?)", (i, init_freq[i]))

    char_transition = {}
    cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")

    for i in new_data:
        hanji = i[0]
        for j in range(len(i[0])-1):
            this_hanji = hanji[j]
            next_hanji = hanji[j+1]
            if not this_hanji in char_transition:
                char_transition[this_hanji] = {next_hanji : 1}
            elif not next_hanji in char_transition[this_hanji]:
                char_transition[this_hanji][next_hanji] = 1
            else:
                char_transition[this_hanji][next_hanji] += 1

    for i in char_transition.keys():
        next_char = char_transition[i]
        for j in next_char.keys():
            cur.execute("INSERT INTO transition VALUES(?, ?, ?)", (i, j, next_char[j]))

    #get_homophones("lí", cur, con)

    con.commit()
    con.close()

def get_homophones(pron, cur, con):
    homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
    homophones = list(map(lambda x: x[0], homophones_raw))

    return homophones

def convert(sentences):
    splitted = re.split(r'(:?[!?;,.\"\'\(\):])', sentences)
    splitted_cleaned = list(filter(lambda x : x != '', splitted))

    result = list(map(lambda s : convert_one_sentence(s), splitted_cleaned))

    flatten_result = [x for xs in result for xss in xs for x in xss]
    result_string = "".join(flatten_result)

    print(result_string)
    return result_string

def convert_one_sentence(sentence):
    full_width = ["!", "?", ";",":",",","。", "(", ")"]
    half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]

    if len(sentence) == 1:
        for i in range(len(half_width)):
            if sentence[0] == half_width[i]:
                return [[full_width[i]]]

    weight = 2/3

    splitted = re.split(r'(--?|\s+)', sentence)
    filtered = list(filter(lambda x :not re.match(r'(--?|\s+)', x), splitted))
    small_capized = list(map(lambda x : x.lower(), filtered))
    print("======", small_capized)
    con = sqlite3.connect(model_filename)
    cur = con.cursor()

    homophones_sequence_raw = list(map(lambda x : get_homophones(x, con, cur), small_capized))

    homophones_sequence = [list(map (lambda x : {"char": x,
                                                 "prev_char": None,
                                                 "prob" : 1}, i)) for i in homophones_sequence_raw]

    head_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
    INNER JOIN pronounce ON pronounce.hanji = initial.char
    WHERE pronounce.lomaji = ?''', (small_capized[0], )).fetchall()))

    return_result = [None] * len(small_capized)

    if head_freqs == []:
        return_result[0] = filtered[0]
        homophones_sequence[0] = [{"char": filtered[0],
                                   "prev_char": None,
                                   "prob" : 1}]

    else:
        head_freq_total = reduce(lambda x , y : x + y, head_freqs)

        for i in homophones_sequence[0]:
            i_freq = cur.execute('''select initial.freq FROM initial
            WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]

            i['prob'] = i_freq / head_freq_total
            print(i)

    #for i in homophones_sequence[0]:

    print("+++++", return_result)

    if len(small_capized) == 1:
        max_prob = -math.inf
        max_prob_char = None
        for i in homophones_sequence[0]:
            if i['prob'] > max_prob:
                max_prob_char = i['char']
                max_prob = i['prob']

        return_result[0] = max_prob_char

    else:
        for i in range(1,len(small_capized)):
            char_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
            INNER JOIN pronounce ON pronounce.hanji = initial.char
            WHERE pronounce.lomaji = ?''', (small_capized[i], )).fetchall()))

            if char_freqs == []:
                return_result[i] = filtered[i]
                homophones_sequence[i] = [{"char": filtered[i],
                                           "prev_char": None,
                                           "prob" : 1}]
                prev_char = ""
                max_prob = -math.inf
                for m in homophones_sequence[i-1]:
                    if m['prob'] > max_prob:
                        max_prob = m['prob']
                        prev_char = m['char']
                homophones_sequence[i][0]['prob'] = max_prob
                homophones_sequence[i][0]['prev_char'] = prev_char
            else:
                total_transition_freq = cur.execute('''
                SELECT sum(t.freq)
                FROM transition as t
                INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
                INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
                where p2.lomaji = ? and p1.lomaji = ?''',
                (small_capized[i], small_capized[i-1])).fetchall()[0][0]
                for j in homophones_sequence[i]:
                    prev_char = None
                    max_prob = -math.inf

                    for k in homophones_sequence[i-1]:
                        k_to_j_freq_raw = cur.execute('''select freq from transition
                        where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
                        if k_to_j_freq_raw == []:
                            den = cur.execute('''
                            SELECT sum(p.freq)
                            FROM pronounce as p
                            inner join pronounce as p2
                            on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0][0]#分母
                            #分子
                            num = cur.execute(''' SELECT sum(freq) FROM pronounce as p where hanji = ?''', (j["char"],)).fetchall()[0][0]
                            print("+++", num, den)
                            k_to_j_freq = num/den * (1-weight)

                        else:
                            num = k_to_j_freq_raw[0][0]
                            don = total_transition_freq
                            k_to_j_freq =num/don * weight
                        print("k_to_j_fr", k["char"], j["char"], k_to_j_freq)
                        if k_to_j_freq * k["prob"] > max_prob:
                            max_prob = k_to_j_freq * k["prob"]
                            prev_char = k["char"]
                    print("~-~_~-~-~-~-", prev_char, j["char"], max_prob)
                    j["prob"] = max_prob
                    j["prev_char"] = prev_char

    max_prob = -math.inf
    current = ""
    prev_char = ""
    for i in homophones_sequence[len(homophones_sequence)-1]:
        if i["prob"] > max_prob:
            max_prob = i["prob"]
            current = i["char"]
            prev_char = i["prev_char"]

    print("~tail~~", current)
    print(homophones_sequence)
    return_result[len(homophones_sequence)-1] = current

    for i in range(len(homophones_sequence)-2, -1, -1):
        current_ls = list(filter(lambda x : x["char"] == prev_char,
                                 homophones_sequence[i]))
        print(prev_char)
        return_result[i] = prev_char
        current = current_ls[0]["char"]
        prev_char = current_ls[0]["prev_char"]

    print(return_result)

    return return_result


def poj_to_tl(sentence):
    return sentence

parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()

if args.genmod == True:
    genmod()
elif args.sentence != None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        convert(sentence)
    else:
        convert(args.sentence)
else:
    parser.print_help()
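A brief usage note for the script above, inferred from its argparse block (the sample sentence is invented; the output depends on the trained model):

python pakkau.py --genmod                   # rebuild model.db from every .csv under ./corpus/
python pakkau.py "guá lâi khì"              # convert a romanized sentence (POJ is the default --form)
python pakkau.py --form tl "guá lâi khì"    # treat the input as Tâi-lô instead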
test2.py~: 207 lines (new file)
@@ -0,0 +1,207 @@
import re
import pandas as pd
import math
from functools import reduce

df1 = pd.read_csv('教典例句.csv', header=0, names=['漢字', '羅馬字'])
df2 = pd.read_csv('教典發音詞.csv',header=0, names=['漢字', '羅馬字'])

df = pd.concat([df1, df2]) # combine 2 csv dataframe

df['羅馬字'] = df['羅馬字'].str.lower()

new_data = []

for index, row in df.iterrows():
    hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['漢字'])))
    tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['羅馬字'])
    tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
    new_data.append((hanji, tl2))
    #if (len(hanji) != len(tl2)):
        #print(tl2, hanji)
    #print(tl2, hanji)

# char-To-Pronounciation Prossibility dict

char_to_pronounce = {}

for i in new_data:
    hanji = i[0]
    lomaji = i[1]
    for j in range(len(i[0])):
        if not hanji[j] in char_to_pronounce:
            char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
        elif not lomaji[j] in char_to_pronounce[hanji[j]]:
            char_to_pronounce[hanji[j]][lomaji[j]] = 1
        else:
            char_to_pronounce[hanji[j]][lomaji[j]] += 1

for char, char_reading in char_to_pronounce.items():
    total_count = reduce((lambda x, y : x + y), list(char_reading.values()))

    for i in char_reading.keys():
        char_reading[i] = char_reading[i] / float(total_count)

#print(char_to_pronounce)

all_chars = char_to_pronounce.keys()

'''{'提': 45, '宋': 7, '完': 18, '刻': 7, '局': 9,
'巡': 8, '畫': 25, '青': 56, '尪': 13}'''
init_freq = {} #詞kap句開始ê字出現次數

for i in new_data:
    head_hanji = i[0][0]

    if head_hanji in init_freq:
        init_freq[head_hanji] += 1
    else:
        init_freq[head_hanji] = 1

#補字
min_weight = 0.1

for i in all_chars:
    if not i in init_freq.keys():
        init_freq[i] = 0.1

#print(init_freq)

# probability of P(next=c2|this=c1)
char_transition = {}

for i in new_data:
    hanji = i[0]
    for j in range(len(i[0])-1):
        this_hanji = hanji[j]
        next_hanji = hanji[j+1]
        if not this_hanji in char_transition:
            char_transition[this_hanji] = {next_hanji : 1}
        elif not next_hanji in char_transition[this_hanji]:
            char_transition[this_hanji][next_hanji] = 1
        else:
            char_transition[this_hanji][next_hanji] += 1

#print(char_transition)

#補字
for i in all_chars:
    if not i in char_transition.keys():
        char_transition[i] = {}
        for j in all_chars:
            char_transition[i][j] = init_freq[j]
    else:
        pass

for i in char_transition.keys():
    for j in all_chars:
        if not j in char_transition[i].keys():
            char_transition[i][j] = min_weight * (0.03+math.log(init_freq[j]))

for char, next_char in char_transition.items():
    total_count = 0
    [total_count := total_count + x for x in list(next_char.values())]

    for i in next_char.keys():
        next_char[i] = next_char[i] / float(total_count)

def get_homophones(pron):
    homophones = []
    for i in char_to_pronounce.keys():
        if pron in char_to_pronounce[i].keys():
            homophones.append(i)
        else:
            pass

    return homophones

input_lomaji = ["guá", "kap", "tshit", "á", "lâi", "khì", "tâi", "tiong", "tshit", "thô", "sūn", "suà", "tsē", "ko", "thih"]

char_candidates = []

for i in input_lomaji:
    homophones = list(map(lambda x : {"char": x,
                                      "prev_char": None,
                                      "prob" : None}, # probibility
                          get_homophones(i)))
    char_candidates.append(homophones)

#print(char_candidates)
def get_max_prob(input_lmj, char_cand):
    for i in range(len(input_lmj)):
        if i == 0:
            for j in char_cand[i]:
                init_freq_sum = reduce(lambda x, y : x + y,
                                       list(
                                           map(lambda x : init_freq[x["char"]] ,
                                               char_cand[0])))
                print(init_freq_sum)
                ch = j["char"]
                init_to_char_prob = init_freq[ch] / init_freq_sum # get the ratio
                char_reading_prob = char_to_pronounce[ch][input_lmj[0]]

                j["prob"] = init_to_char_prob * char_reading_prob

            result = ""
            max_num = -math.inf

            for i in char_cand[0]:
                if i["prob"] >= max_num:
                    max_num = i["prob"]
                    result = i["char"]

            #print(result)
        else:
            for j in char_cand[i]:
                prob = -math.inf
                prev_char = ""
                for k in char_cand[i-1]:
                    k_prob = k["prob"]
                    #print(k["char"], "k_prob:", k_prob)
                    k_to_j_prob = char_transition[k["char"]][j["char"]]
                    #print(k["char"], "->",j["char"] ,"k_to_j_prob:", k_to_j_prob)
                    j_to_pron_prob = char_to_pronounce[j["char"]][input_lmj[i]]
                    total_tmp_prob = k_prob * k_to_j_prob * j_to_pron_prob
                    if prob < total_tmp_prob:
                        prob = total_tmp_prob
                        prev_char = k

                j["prev_char"] = prev_char["char"]
                j["prob"] = prob

    real_last_char = ""
    prev_char = ""
    prob = -math.inf
    for i in char_cand[-1]:
        if i["prob"] > prob:
            prob = i["prob"]
            real_last_char = i["char"]
            prev_char = i["prev_char"]

    print(real_last_char)

    result_hanji = [real_last_char]
    for i in range(len(input_lmj)-2, -1, -1):
        current = list(filter(lambda x : x["char"] == prev_char,
                              char_cand[i]))[0]
        result_hanji.append(current["char"])
        prev_char = current["prev_char"]

    result_hanji.reverse()

    result_hanji_string = "".join(result_hanji)
    print("輸入ê羅馬字陣列(array):", input_lomaji)
    print("輸出ê漢字:", result_hanji_string)


get_max_prob(input_lomaji, char_candidates)
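test2.py~ is the in-memory prototype of the decoder: at each position it keeps, per homophone candidate, the best probability of reaching it plus a back-pointer, i.e. a Viterbi-style recurrence as in get_max_prob() above. A stripped-down, self-contained sketch of that recurrence with invented toy tables (not the real corpus statistics):

# Toy Viterbi-style sketch; all candidates and numbers are made up for illustration.
candidates = [["我", "餓"], ["來", "內"]]            # homophone candidates per position
emit = {("我", 0): 0.7, ("餓", 0): 0.3,              # P(candidate char | reading at position)
        ("來", 1): 0.8, ("內", 1): 0.2}
trans = {("我", "來"): 0.6, ("我", "內"): 0.1,        # P(next char | previous char)
         ("餓", "來"): 0.2, ("餓", "內"): 0.1}

best = [{c: (emit[(c, 0)], None) for c in candidates[0]}]   # (probability, back-pointer)
for i in range(1, len(candidates)):
    layer = {}
    for c in candidates[i]:
        # keep the previous candidate that maximizes prob * transition * emission
        prev, score = max(((p, best[i - 1][p][0] * trans[(p, c)] * emit[(c, i)])
                           for p in candidates[i - 1]), key=lambda t: t[1])
        layer[c] = (score, prev)
    best.append(layer)

last = max(best[-1], key=lambda c: best[-1][c][0])           # best final candidate
path = [last]
for i in range(len(candidates) - 1, 0, -1):                  # follow back-pointers
    path.append(best[i][path[-1]][1])
print("".join(reversed(path)))   # -> 我來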
test3.py~: 89 lines (new file)
@@ -0,0 +1,89 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3

def genmod():
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    for index, row in df.iterrows():
        hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if (len(hanji) != len(tl2)):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")

    model_filename = "model.db"
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")

    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if not hanji[j] in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
            elif not lomaji[j] in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1

    print(char_to_pronounce)

    for i in char_to_pronounce.keys():
        hanji = char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))

    #con.commit()
    con.commit()
    con.close()

def convert(sentence):
    pass

parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()
print(args)
if args.genmod == True:
    genmod()
elif args.sentence != None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        print(convert(sentence))
    else:
        print(convert(args.sentence))
else:
    parser.print_help()
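test3.py~ is an earlier prototype that only builds the pronounce table. Once model.db exists, homophone lookup is a single parameterized query, as in get_homophones() in pakkau.py; a minimal standalone sketch (the reading "guá" is only an example, and the counts depend on the corpus):

# Lookup sketch against the pronounce(hanji, lomaji, freq) table created above.
import sqlite3

con = sqlite3.connect("model.db")
cur = con.cursor()
rows = cur.execute("SELECT hanji, freq FROM pronounce WHERE lomaji = ?", ("guá",)).fetchall()
print(rows)   # e.g. [('我', 123), ...]
con.close()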