This commit is contained in:
Tan, Kian-ting 2024-04-08 02:50:34 +08:00
parent 4acf44911a
commit eeffa16ec9
3 changed files with 330 additions and 0 deletions

28
3rd/LICENSE Normal file
View file

@@ -0,0 +1,28 @@
MIT License
Copyright (c) 2024 Tan, Kian-ting
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Note: all the csv files are published by the Ministry of Education, under CC BY-NC-SA TW 3.0.
The source code is under the MIT License above.

BIN
3rd/model.db Normal file

Binary file not shown.

302
3rd/pakkau.py Normal file
View file

@@ -0,0 +1,302 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3
from itertools import chain
# Absolute path of the sqlite model database, stored next to this script.
_script_dir = os.path.dirname(os.path.realpath(__file__))
model_filename = os.path.join(_script_dir, "model.db")
def genmod(corpus_path="./corpus/"):
    """Build the sqlite language model used for romanization-to-hanji conversion.

    Reads every ``.csv`` file under *corpus_path* (two columns: hanji text and
    its romanization), aligns each hanji character with its romanized syllable,
    and writes three tables into ``model_filename``:

    - ``pronounce(hanji, lomaji, freq)``  -- pronunciation counts per character
    - ``initial(char, freq)``             -- counts of phrase-initial characters
    - ``transition(prev_char, next_char, freq)`` -- character bigram counts

    Any existing model database is removed first.

    Raises:
        ValueError: if a corpus row's hanji and romanization have different
            lengths after punctuation stripping (misaligned corpus entry).
    """
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    # Align each hanji character with its romanized syllable, dropping
    # punctuation on both sides.
    new_data = []
    for _, row in df.iterrows():
        hanji = [c for c in row['hanji'] if re.match("[^、();:,。!?「」『』]", c)]
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = [t for t in tl if re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", t)]
        # Validate alignment before keeping the row.
        if len(hanji) != len(tl2):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")
        new_data.append((hanji, tl2))

    # Rebuild the database from scratch.
    try:
        os.remove(model_filename)
    except OSError:
        pass
    con = sqlite3.connect(model_filename)
    cur = con.cursor()

    # Pronunciation counts: how often each character is read with each syllable.
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
    char_to_pronounce = {}
    for hanji, lomaji in new_data:
        for h, syllable in zip(hanji, lomaji):
            counts = char_to_pronounce.setdefault(h, {})
            counts[syllable] = counts.get(syllable, 0) + 1
    for h, counts in char_to_pronounce.items():
        for syllable, freq in counts.items():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (h, syllable, freq))

    all_chars = char_to_pronounce.keys()
    # Counts of characters appearing at the start of a phrase/sentence.
    init_freq = {}
    cur.execute("CREATE TABLE initial(char, freq)")
    for hanji, _ in new_data:
        if not hanji:  # skip empty (fully-punctuation) rows
            continue
        head_hanji = hanji[0]
        init_freq[head_hanji] = init_freq.get(head_hanji, 0) + 1
    # Smoothing: give never-initial characters a small nonzero weight so the
    # decoder does not assign them probability zero.
    min_weight = 0.1
    for c in all_chars:
        if c not in init_freq:
            init_freq[c] = min_weight
    for c, freq in init_freq.items():
        cur.execute("INSERT INTO initial VALUES(?, ?)", (c, freq))

    # Character bigram (transition) counts.
    char_transition = {}
    cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")
    for hanji, _ in new_data:
        for this_hanji, next_hanji in zip(hanji, hanji[1:]):
            counts = char_transition.setdefault(this_hanji, {})
            counts[next_hanji] = counts.get(next_hanji, 0) + 1
    for this_hanji, nexts in char_transition.items():
        for next_hanji, freq in nexts.items():
            cur.execute("INSERT INTO transition VALUES(?, ?, ?)", (this_hanji, next_hanji, freq))

    con.commit()
    con.close()
def get_homophones(pron, cur, con):
    """Return every hanji recorded in the model with pronunciation *pron*.

    Note: *con* is kept for interface compatibility but is not used here;
    only the cursor *cur* is needed for the query.
    """
    rows = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
    return [row[0] for row in rows]
def convert(sentences):
    """Convert a romanized text to hanji.

    The text is split on punctuation; each fragment (sentence or single
    punctuation mark) is converted independently and the pieces are joined,
    printed, and returned as one string.
    """
    pieces = re.split(r'(:?[!?;,.\"\'\(\):])', sentences)
    nonempty = [p for p in pieces if p != '']
    converted = [convert_one_sentence(p) for p in nonempty]
    # Each conversion yields a list whose items are strings (or, for
    # punctuation, a nested single-item list); flattening down to characters
    # and joining reproduces the full output text.
    result_string = "".join(ch for seq in converted for token in seq for ch in token)
    print(result_string)
    return result_string
def convert_one_sentence(sentence):
    # Convert one romanized fragment to hanji with a Viterbi-style search over
    # the sqlite model (tables: pronounce, initial, transition).
    # Returns a list with one chosen hanji (or the original token as a
    # fallback) per input syllable; a lone half-width punctuation mark is
    # mapped to its full-width counterpart.
    full_width = ["", "", "","","","", "", ""]
    half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]
    # Single-character input: translate half-width punctuation to full-width.
    if len(sentence) == 1:
        for i in range(len(half_width)):
            if sentence[0] == half_width[i]:
                return [[full_width[i]]]
    # Interpolation weight: observed bigram transitions get `weight`,
    # the unigram fallback gets `1 - weight`.
    weight = 2/3
    # Split into syllables on hyphens/whitespace, then drop the separators.
    splitted = re.split(r'(--?|\s+)', sentence)
    filtered = list(filter(lambda x :not re.match(r'(--?|\s+)', x), splitted))
    small_capized = list(map(lambda x : x.lower(), filtered))
    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    # NOTE(review): arguments are swapped relative to the signature
    # get_homophones(pron, cur, con). It still works only because
    # sqlite3.Connection also exposes .execute(); should be (x, cur, con).
    homophones_sequence_raw = list(map(lambda x : get_homophones(x, con, cur), small_capized))
    # One lattice column per syllable: candidate char, Viterbi backpointer,
    # and path probability.
    homophones_sequence = [list(map (lambda x : {"char": x,
                                                 "prev_char": None,
                                                 "prob" : 1}, i)) for i in homophones_sequence_raw]
    # Sentence-initial frequencies for every candidate of the first syllable.
    head_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
        INNER JOIN pronounce ON pronounce.hanji = initial.char
        WHERE pronounce.lomaji = ?''', (small_capized[0], )).fetchall()))
    return_result = [None] * len(small_capized)
    if head_freqs == []:
        # Unknown first syllable: keep the original token as its own candidate.
        return_result[0] = filtered[0]
        homophones_sequence[0] = [{"char": filtered[0],
                                   "prev_char": None,
                                   "prob" : 1}]
    else:
        # Normalize initial frequencies into probabilities for column 0.
        head_freq_total = reduce(lambda x , y : x + y, head_freqs)
        for i in homophones_sequence[0]:
            # NOTE(review): (i['char']) is NOT a tuple — it only works because
            # a one-character string is itself a length-1 sequence. Should be
            # (i['char'],).
            i_freq = cur.execute('''select initial.freq FROM initial
                WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]
            i['prob'] = i_freq / head_freq_total
    #for i in homophones_sequence[0]:
    if len(small_capized) == 1:
        # Single syllable: just pick the most probable initial candidate.
        max_prob = -math.inf
        max_prob_char = None
        for i in homophones_sequence[0]:
            if i['prob'] > max_prob:
                max_prob_char = i['char']
                max_prob = i['prob']
        return_result[0] = max_prob_char
    else:
        # Forward pass: fill each lattice column from the previous one.
        for i in range(1,len(small_capized)):
            char_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
                INNER JOIN pronounce ON pronounce.hanji = initial.char
                WHERE pronounce.lomaji = ?''', (small_capized[i], )).fetchall()))
            if char_freqs == []:
                # Unknown syllable: pass the raw token through and chain it to
                # the best path so far.
                return_result[i] = filtered[i]
                homophones_sequence[i] = [{"char": filtered[i],
                                           "prev_char": None,
                                           "prob" : 1}]
                prev_char = ""
                max_prob = -math.inf
                for m in homophones_sequence[i-1]:
                    if m['prob'] > max_prob:
                        max_prob = m['prob']
                        prev_char = m['char']
                homophones_sequence[i][0]['prob'] = max_prob
                homophones_sequence[i][0]['prev_char'] = prev_char
            else:
                # Total transition mass between any homophone pair of the two
                # syllables; used as the bigram denominator below.
                total_transition_freq = cur.execute('''
                    SELECT sum(t.freq)
                    FROM transition as t
                    INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
                    INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
                    where p2.lomaji = ? and p1.lomaji = ?''',
                    (small_capized[i], small_capized[i-1])).fetchall()[0][0]
                for j in homophones_sequence[i]:
                    prev_char = None
                    max_prob = -math.inf
                    for k in homophones_sequence[i-1]:
                        k_to_j_freq_raw = cur.execute('''select freq from transition
                            where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
                        if k_to_j_freq_raw == []:
                            # Unseen bigram: back off to the unigram
                            # probability of j, weighted by (1 - weight).
                            # denominator: total pronounce frequency of all
                            # homophones of this syllable
                            den = cur.execute('''
                                SELECT sum(p.freq)
                                FROM pronounce as p
                                inner join pronounce as p2
                                on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0][0]
                            # numerator: pronounce frequency of candidate j
                            num = cur.execute(''' SELECT sum(freq) FROM pronounce as p where hanji = ?''', (j["char"],)).fetchall()[0][0]
                            k_to_j_freq = num/den * (1-weight)
                        else:
                            # Observed bigram, weighted by `weight`.
                            num = k_to_j_freq_raw[0][0]
                            don = total_transition_freq  # NOTE(review): 'don' looks like a typo for 'den'
                            k_to_j_freq =num/don * weight
                        if k_to_j_freq * k["prob"] > max_prob:
                            max_prob = k_to_j_freq * k["prob"]
                            prev_char = k["char"]
                    j["prob"] = max_prob
                    j["prev_char"] = prev_char
        # Backward pass: pick the best final candidate, then follow the
        # prev_char backpointers to recover the whole path.
        max_prob = -math.inf
        current = ""
        prev_char = ""
        for i in homophones_sequence[len(homophones_sequence)-1]:
            if i["prob"] > max_prob:
                max_prob = i["prob"]
                current = i["char"]
                prev_char = i["prev_char"]
        return_result[len(homophones_sequence)-1] = current
        for i in range(len(homophones_sequence)-2, -1, -1):
            current_ls = list(filter(lambda x : x["char"] == prev_char,
                                     homophones_sequence[i]))
            return_result[i] = prev_char
            current = current_ls[0]["char"]
            prev_char = current_ls[0]["prev_char"]
    # NOTE(review): the sqlite connection `con` opened above is never closed.
    return return_result
def poj_to_tl(sentence):
    """Convert a POJ-orthography sentence to TL.

    Currently an identity passthrough: no transformation is implemented yet,
    so the input is returned unchanged.
    """
    converted = sentence
    return converted
# Command-line entry point: either rebuild the model (--genmod) or convert a
# sentence, optionally treating the input as POJ (default) or TL orthography.
parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)
parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')
args = parser.parse_args()
if args.genmod:
    genmod()
elif args.sentence is not None:
    # POJ input is normalized to TL before conversion; note nargs=1 makes
    # args.form a one-element list.
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        convert(sentence)
    else:
        convert(args.sentence)
else:
    # No action requested: show usage.
    parser.print_help()