90 lines
2.6 KiB
Python
90 lines
2.6 KiB
Python
|
import re
|
||
|
import pandas as pd
|
||
|
import math
|
||
|
from functools import reduce
|
||
|
import argparse
|
||
|
import os
|
||
|
import sqlite3
|
||
|
|
||
|
# A hanji character is kept when it is not one of these punctuation marks.
_HANJI_KEEP = re.compile("[^、();:,。!?「」『』]")
# Separators between romanised syllables (punctuation, hyphen runs, whitespace).
# NOTE(review): "(:?" is presumably a typo for "(?:"; it is left untouched so
# that tokenisation stays exactly as before.
_LOMAJI_SPLIT = re.compile(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)')
# A split token is kept as a syllable when its first character is not one of
# these punctuation/space characters (empty tokens never match either).
_LOMAJI_KEEP = re.compile(r"([^\(\)^!:?; \'\",.\-\u3000])")


def _load_corpus(corpus_path):
    """Read every ``*.csv`` file in ``corpus_path`` into one DataFrame."""
    frames = [
        pd.read_csv(
            os.path.join(corpus_path, name),
            header=0,
            names=['hanji', 'lomaji'],
            # Read cells as plain text: otherwise empty cells (and strings such
            # as "NA" or "null") become float NaN and break the tokeniser.
            dtype=str,
            keep_default_na=False,
        )
        # Sorted so the generated model does not depend on directory order.
        for name in sorted(os.listdir(corpus_path))
        if name.endswith(".csv")
    ]
    if not frames:
        raise ValueError(f"no .csv corpus files found in {corpus_path}")
    corpus = pd.concat(frames).fillna("")
    corpus['lomaji'] = corpus['lomaji'].str.lower()
    return corpus


def _align_row(hanji_text, lomaji_text):
    """Split one corpus row into parallel lists of characters and syllables."""
    hanji = [char for char in hanji_text if _HANJI_KEEP.match(char)]
    syllables = [
        token for token in _LOMAJI_SPLIT.split(lomaji_text)
        if _LOMAJI_KEEP.match(token)
    ]
    if len(hanji) != len(syllables):
        raise ValueError(f"length of hanji {hanji} is different from romaji {syllables}.")
    return hanji, syllables


def _write_model(model_filename, char_to_pronounce):
    """Replace ``model_filename`` with a fresh ``pronounce`` table."""
    try:
        os.remove(model_filename)
    except FileNotFoundError:
        # Nothing to replace on the first run.
        pass

    con = sqlite3.connect(model_filename)
    try:
        con.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
        con.executemany(
            "INSERT INTO pronounce VALUES(?, ?, ?)",
            (
                (char, syllable, freq)
                for char, readings in char_to_pronounce.items()
                for syllable, freq in readings.items()
            ),
        )
        con.commit()
    finally:
        # Close the connection even when an insert or the commit fails.
        con.close()


def genmod(corpus_path="./corpus/", model_filename="model.db"):
    """Build the character-to-pronunciation frequency model from the corpus.

    Every ``*.csv`` file in ``corpus_path`` is read as two columns (hanji,
    lomaji) with the first line treated as a header.  The lomaji text is
    lower-cased, both columns are stripped of punctuation, and the n-th hanji
    character of a row is paired with the n-th lomaji syllable.  The number of
    times each (character, syllable) pair occurs is written to the
    ``pronounce(hanji, lomaji, freq)`` table of a newly created sqlite
    database, replacing any existing file at ``model_filename``.

    Args:
        corpus_path: directory containing the corpus CSV files.
        model_filename: path of the sqlite database to (re)create.

    Raises:
        ValueError: if the directory contains no ``.csv`` file, or if a row
            has a different number of hanji characters and lomaji syllables.
        FileNotFoundError: if ``corpus_path`` does not exist.
    """
    corpus = _load_corpus(corpus_path)

    # The whole corpus is validated and counted before the old model is
    # removed, so a malformed row never leaves the caller without a model.
    char_to_pronounce = {}
    for hanji_text, lomaji_text in zip(corpus['hanji'], corpus['lomaji']):
        hanji, syllables = _align_row(hanji_text, lomaji_text)
        for char, syllable in zip(hanji, syllables):
            readings = char_to_pronounce.setdefault(char, {})
            readings[syllable] = readings.get(syllable, 0) + 1

    print(char_to_pronounce)

    _write_model(model_filename, char_to_pronounce)
|
||
|
|
||
|
def convert(sentence):
    """Placeholder for the sentence conversion; not implemented yet.

    The argument is currently ignored and ``None`` is returned.
    """
    return None
|
||
|
|
||
|
# Command-line interface: either regenerate the model (--genmod) or convert
# the SENTENCE given on the command line; with neither, show the usage text.
parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
# nargs=1 makes args.form a one-element list, hence the list-valued default
# and the comparison against ['poj'] below.
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()
print(args)
if args.genmod:
    genmod()
elif args.sentence is not None:
    if args.form == ['poj']:
        # NOTE(review): poj_to_tl is not defined or imported anywhere in the
        # visible source; unless it is provided elsewhere this branch (the
        # default --form) raises NameError.
        sentence = poj_to_tl(args.sentence)
        print(convert(sentence))
    else:
        print(convert(args.sentence))
else:
    parser.print_help()
|
||
|
|