# pakkau/test3.py~
#
# 89 lines
# 2.6 KiB
# Python

import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3
def genmod(corpus_path="./corpus/", model_filename="model.db"):
    """Build the hanji-to-lomaji pronunciation frequency model.

    Every ``*.csv`` file directly inside *corpus_path* is read as a table
    with the columns ``hanji`` and ``lomaji``; the first line of each file
    is treated as a header and replaced by those names.  The lomaji text is
    lower-cased, each row is split into hanji characters and lomaji
    syllables, the two sequences are paired positionally, and the number of
    times each (hanji, lomaji) pair occurs is stored in the
    ``pronounce(hanji, lomaji, freq)`` table of a freshly created SQLite
    database at *model_filename*.  The collected mapping is also printed.

    Args:
        corpus_path: Directory holding the corpus CSV files.
        model_filename: Path of the SQLite file to (re)create.

    Raises:
        ValueError: If the directory contains no ``.csv`` file, or if a row
            yields a different number of hanji characters and lomaji
            syllables.  In both cases the model file is left untouched.
    """
    # Compiled once instead of once per row; the patterns are unchanged.
    hanji_char = re.compile("[^、();:,。!?「」『』]")
    separator = re.compile(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)')
    syllable_start = re.compile(r"([^\(\)^!:?; \'\",.\-\u3000])")

    # Sorted so that rows reach the database in the same order everywhere
    # (os.listdir makes no ordering promise).
    csv_names = sorted(
        name for name in os.listdir(corpus_path) if name.endswith(".csv")
    )
    if not csv_names:
        raise ValueError(f"no .csv files found in {corpus_path!r}")

    df = pd.concat([
        pd.read_csv(os.path.join(corpus_path, name), header=0,
                    names=['hanji', 'lomaji'])
        for name in csv_names
    ])
    df['lomaji'] = df['lomaji'].str.lower()

    # hanji character -> {lomaji syllable -> number of occurrences}
    char_to_pronounce = {}
    for hanji_text, lomaji_text in zip(df['hanji'], df['lomaji']):
        # Keep only the characters that are not listed punctuation.
        hanji = [ch for ch in hanji_text if hanji_char.match(ch)]
        # The split keeps the separators (capturing group); the filter then
        # drops them together with the empty strings.
        tl2 = [tok for tok in separator.split(lomaji_text)
               if syllable_start.match(tok)]
        if len(hanji) != len(tl2):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")
        for ch, syllable in zip(hanji, tl2):
            readings = char_to_pronounce.setdefault(ch, {})
            readings[syllable] = readings.get(syllable, 0) + 1

    print(char_to_pronounce)

    rows = [
        (ch, syllable, freq)
        for ch, readings in char_to_pronounce.items()
        for syllable, freq in readings.items()
    ]

    # Start from an empty database; a missing file is the normal case.
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
        cur.executemany("INSERT INTO pronounce VALUES(?, ?, ?)", rows)
        con.commit()
    finally:
        # Release the file handle even when a statement above fails.
        con.close()
def convert(sentence):
    """Placeholder for sentence conversion; not implemented yet.

    The argument is accepted but ignored, and ``None`` is returned.
    """
    return None
# Command-line entry point: rebuild the model when --genmod is given,
# otherwise convert the SENTENCE argument; with neither, show the usage text.
parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)
parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
# nargs=1 makes argparse store the value as a one-element list, which is why
# the default and the comparison further down are lists too.
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')
args = parser.parse_args()
print(args)  # echoes the parsed namespace before doing any work
if args.genmod:
    genmod()
elif args.sentence is not None:
    if args.form == ['poj']:
        # NOTE(review): poj_to_tl is not defined in the visible part of this
        # file — confirm it exists, otherwise this branch raises NameError.
        sentence = poj_to_tl(args.sentence)
        print(convert(sentence))
    else:
        print(convert(args.sentence))
else:
    parser.print_help()