# pakkau/test3.py~

import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3

def _split_row(hanji_text, lomaji_text):
    """Split one corpus row into hanji characters and romanized syllables.

    Characters of ``hanji_text`` that are in the CJK punctuation set are
    dropped.  ``lomaji_text`` is split on ASCII punctuation, hyphen runs and
    whitespace; only pieces that start with a non-separator character are
    kept.

    Returns:
        A ``(hanji, syllables)`` tuple of two lists of strings.
    """
    hanji = [
        ch for ch in hanji_text
        if re.match("[^、（）；：，。！？「」『』]", ch)
    ]
    # The separators are captured by the group, so re.split returns them too;
    # the filter below discards them together with any empty strings.
    # NOTE(review): "(:?" looks like it may have been meant as "(?:" — left
    # unchanged because the filter makes the final result the same.
    pieces = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', lomaji_text)
    syllables = [
        piece for piece in pieces
        if re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", piece)
    ]
    return hanji, syllables


def genmod():
    """Build ``model.db`` from the CSV corpus in ``./corpus/``.

    Every ``*.csv`` file in the corpus directory is read with its first row
    treated as a header and its columns named ``hanji`` and ``lomaji``.  The
    romanization is lower-cased, each row is split into hanji characters and
    syllables, and the number of times each (character, syllable) pair
    occurs is counted.  The counts are printed and written to a freshly
    created ``pronounce(hanji, lomaji, freq)`` table in ``model.db``.

    Raises:
        ValueError: if the corpus contains no CSV files, or if a row has a
            different number of hanji characters and romanized syllables.
    """
    corpus_path = "./corpus/"
    # sorted() makes the processing order (and thus row order in the model)
    # independent of the file system's listing order.
    frames = [
        pd.read_csv(corpus_path + name, header=0, names=['hanji', 'lomaji'])
        for name in sorted(os.listdir(corpus_path))
        if name.endswith(".csv")
    ]
    if not frames:
        # pd.concat([]) would raise a ValueError as well, with a message that
        # says nothing about the corpus.
        raise ValueError(f"no .csv corpus files found in {corpus_path}")
    df = pd.concat(frames)
    df['lomaji'] = df['lomaji'].str.lower()

    # Maps each hanji character to {syllable: occurrence count}.
    char_to_pronounce = {}
    for hanji_text, lomaji_text in zip(df['hanji'], df['lomaji']):
        hanji, syllables = _split_row(hanji_text, lomaji_text)
        if len(hanji) != len(syllables):
            raise ValueError(f"length of hanji {hanji} is different from romaji {syllables}.")
        for char, syllable in zip(hanji, syllables):
            counts = char_to_pronounce.setdefault(char, {})
            counts[syllable] = counts.get(syllable, 0) + 1

    print(char_to_pronounce)

    model_filename = "model.db"
    # Start from an empty database.  Only a missing file is ignored; any other
    # removal failure would leave the old model in place, so let it surface.
    try:
        os.remove(model_filename)
    except FileNotFoundError:
        pass

    rows = [
        (char, syllable, freq)
        for char, counts in char_to_pronounce.items()
        for syllable, freq in counts.items()
    ]

    con = sqlite3.connect(model_filename)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
        cur.executemany("INSERT INTO pronounce VALUES(?, ?, ?)", rows)
        con.commit()
    finally:
        # Close the connection even when table creation or an insert fails.
        con.close()

def convert(sentence):
    """Convert *sentence*; not implemented yet, so this always returns None."""
    return None

# Command-line interface: either regenerate the model (--genmod) or convert
# a single sentence; with neither, show the usage text.
parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
# nargs=1 makes args.form a one-element list, hence the list default and the
# list comparison below.
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()
print(args)
if args.genmod:
    genmod()
elif args.sentence is not None:
    if args.form == ['poj']:
        # NOTE(review): poj_to_tl is not defined in the visible part of this
        # file — confirm it is defined or imported, otherwise this branch
        # raises NameError.
        sentence = poj_to_tl(args.sentence)
        print(convert(sentence))
    else:
        print(convert(args.sentence))
else:
    parser.print_help()
add change log 2. change the converting unit from hanji, etc. 2024-06-28 21:55:08 +08:00			`import re`
			`import pandas as pd`
			`import math`
			`from functools import reduce`
			`import argparse`
			`import os`
			`import sqlite3`

			`def genmod():`
			`corpus_path = "./corpus/"`
			`df_list = []`
			`for file in os.listdir(corpus_path):`
			`if file.endswith(".csv"):`
			`df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])`
			`df_list.append(df)`
			`df = pd.concat(df_list)`
			`df['lomaji'] = df['lomaji'].str.lower()`

			`new_data = []`

			`for index, row in df.iterrows():`
			`hanji = list(filter(lambda x : re.match("[^、（）；：，。！？「」『』]", x), list(row['hanji'])))`
			`tl = re.split(r'(:?[!?;,.\"\'\(\):]\|[-]+\|\s+)', row['lomaji'])`
			`tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))`
			`new_data.append((hanji, tl2))`
			`if (len(hanji) != len(tl2)):`
			`raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")`

			`model_filename = "model.db"`
			`try:`
			`os.remove(model_filename)`
			`except OSError:`
			`pass`

			`con = sqlite3.connect(model_filename)`
			`cur = con.cursor()`
			`cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")`


			`char_to_pronounce = {}`

			`for i in new_data:`
			`hanji = i[0]`
			`lomaji = i[1]`
			`for j in range(len(i[0])):`
			`if not hanji[j] in char_to_pronounce:`
			`char_to_pronounce[hanji[j]] = {lomaji[j] : 1}`
			`elif not lomaji[j] in char_to_pronounce[hanji[j]]:`
			`char_to_pronounce[hanji[j]][lomaji[j]] = 1`
			`else:`
			`char_to_pronounce[hanji[j]][lomaji[j]] += 1`

			`print(char_to_pronounce)`

			`for i in char_to_pronounce.keys():`
			`hanji = char_to_pronounce[i]`
			`for j in hanji.keys():`
			`cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))`

			`#con.commit()`
			`con.commit()`
			`con.close()`

			`def convert(sentence):`
			`pass`

			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--genmod', help='generate the model', action='store_true',`
			`required=False,)`

			`parser.add_argument('sentence', metavar='SENTENCE', nargs='?',`
			`help='the sentence to be converted')`
			`parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,`
			`default=['poj'],`
			`help='the orthography to be used (poj or tl). Default is poj.')`

			`args = parser.parse_args()`
			`print(args)`
			`if args.genmod == True:`
			`genmod()`
			`elif args.sentence != None:`
			`if args.form == ['poj']:`
			`sentence = poj_to_tl(args.sentence)`
			`print(convert(sentence))`
			`else:`
			`print(convert(args.sentence))`
			`else:`
			`parser.print_help()`