# pakkau/test3.py~

import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3

def genmod():
    """Build the hanji -> lomaji pronunciation-frequency model.

    Reads every ``*.csv`` file in ``./corpus/`` as (hanji, lomaji) sentence
    pairs (the first line of each file is treated as a header and replaced by
    those two column names), lower-cases the romanisation, pairs each hanji
    character with one romanised syllable, and counts how often every
    character occurs with every syllable.  The counts are printed and then
    written to a freshly created SQLite database ``model.db`` in the table
    ``pronounce(hanji, lomaji, freq)``.

    Raises:
        ValueError: if the corpus directory contains no CSV file, or if a
            sentence yields a different number of hanji characters and
            romanised syllables.
    """
    corpus_path = "./corpus/"
    model_filename = "model.db"

    # Compiled once instead of being looked up for every character/token.
    hanji_char = re.compile("[^、（）；：，。！？「」『』]")
    lomaji_splitter = re.compile(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)')
    lomaji_syllable = re.compile(r"([^\(\)^!:?; \'\",.\-\u3000])")

    # dtype=str / keep_default_na=False keep every cell as literal text, so a
    # digit-only cell or a string such as "NA" is not converted to a number
    # or NaN (which would break the character/syllable splitting below).
    # Sorting the file names makes the resulting model deterministic.
    frames = [
        pd.read_csv(os.path.join(corpus_path, name), header=0,
                    names=['hanji', 'lomaji'], dtype=str,
                    keep_default_na=False)
        for name in sorted(os.listdir(corpus_path))
        if name.endswith(".csv")
    ]
    if not frames:
        raise ValueError(f"no .csv corpus files found in {corpus_path}")

    corpus = pd.concat(frames)
    corpus['lomaji'] = corpus['lomaji'].str.lower()

    # char_to_pronounce[hanji_char][syllable] -> number of occurrences
    char_to_pronounce = {}
    for hanji_text, lomaji_text in zip(corpus['hanji'], corpus['lomaji']):
        # Drop CJK punctuation; every remaining character needs a syllable.
        hanji = [ch for ch in hanji_text if hanji_char.match(ch)]
        # Split on punctuation, hyphens and whitespace, then discard the
        # separators (and empty strings) that re.split leaves in the result.
        syllables = [tok for tok in lomaji_splitter.split(lomaji_text)
                     if lomaji_syllable.match(tok)]
        if len(hanji) != len(syllables):
            raise ValueError(f"length of hanji {hanji} is different from romaji {syllables}.")
        for ch, syl in zip(hanji, syllables):
            readings = char_to_pronounce.setdefault(ch, {})
            readings[syl] = readings.get(syl, 0) + 1

    print(char_to_pronounce)

    # Start from an empty database; a missing file is not an error.
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    try:
        # The connection context manager commits on success and rolls back
        # on error; the finally clause guarantees the handle is released.
        with con:
            con.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
            con.executemany(
                "INSERT INTO pronounce VALUES(?, ?, ?)",
                ((ch, syl, freq)
                 for ch, readings in char_to_pronounce.items()
                 for syl, freq in readings.items()),
            )
    finally:
        con.close()

def convert(sentence):
    """Convert *sentence* -- placeholder, not implemented yet.

    The argument is currently ignored and ``None`` is returned.
    """
    return None

def main(argv=None):
    """Command-line entry point.

    With ``--genmod`` the model is (re)generated via ``genmod()``.  Otherwise,
    if a SENTENCE is given, the result of ``convert()`` is printed; when no
    SENTENCE is given either, the help text is printed.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--genmod', help='generate the model', action='store_true',
                        required=False)
    parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                        help='the sentence to be converted')
    # A plain (non-nargs) option yields a string instead of a one-item list;
    # the command-line syntax (``--form poj`` / ``--form tl``) is unchanged.
    parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"],
                        default='poj',
                        help='the orthography to be used (poj or tl). Default is poj.')

    args = parser.parse_args(argv)
    if args.genmod:
        genmod()
    elif args.sentence is not None:
        if args.form == 'poj':
            # NOTE(review): poj_to_tl is not defined in the visible part of
            # this file -- confirm it exists before relying on this branch.
            sentence = poj_to_tl(args.sentence)
            print(convert(sentence))
        else:
            print(convert(args.sentence))
    else:
        parser.print_help()


# Only parse the command line when executed as a script, not on import.
if __name__ == "__main__":
    main()