# Licensing (from the accompanying LICENSE file):
#
# All the corpus CSV files are published by the Ministry of Education,
# under CC BY-NC-SA TW 3.0.
#
# The source code is under the MIT License:
#
# MIT License
#
# Copyright (c) 2024 Tan, Kian-ting
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Convert romanized text (lomaji) into Han characters (hanji).

``genmod()`` counts character/pronunciation statistics from the CSV corpus and
stores them in an SQLite model file; ``convert()`` uses that model to pick,
for each romanized syllable, the most probable character given its neighbour.
"""
import argparse
import math
import os
import re
import sqlite3
from functools import reduce
from itertools import chain

import pandas as pd

model_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model.db")

# Half-width punctuation -> full-width replacement.  The values are written as
# escapes so they cannot be silently folded back to ASCII by Unicode
# normalization in editors or tooling.
_FULL_WIDTH = {
    "!": "\uff01",
    "?": "\uff1f",
    ";": "\uff1b",
    ":": "\uff1a",
    ",": "\uff0c",
    ".": "\u3002",
    "(": "\uff08",
    ")": "\uff09",
}

# Characters dropped from the hanji column of the corpus: the marks the corpus
# filter always listed, plus the full-width forms of the ASCII ones.
_HANJI_PUNCTUATION = frozenset(
    "、();:,。!?「」『』"
    "\uff08\uff09\uff1b\uff1a\uff0c\uff01\uff1f"
)

# Sentence-level punctuation; the capturing group makes re.split keep it.
_SENTENCE_PUNCTUATION = re.compile(r'([!?;,.\"\'\(\):])')
# Delimiters between romanized syllables in the corpus.
_LOMAJI_DELIMITER = re.compile(r'([!?;,.\"\'\(\):]|[-]+|\s+)')
# A corpus token counts as a syllable when its first character is not one of these.
_LOMAJI_TOKEN = re.compile(r"([^\(\)^!:?; \'\",.\-\u3000])")
# Delimiters between syllables of the text being converted.
_SYLLABLE_SEPARATOR = re.compile(r'(--?|\s+)')

# Weight of the character-bigram estimate; the remainder goes to the
# single-character back-off used when a bigram was never observed.
_BIGRAM_WEIGHT = 2 / 3


def genmod():
    """Build the SQLite model from every ``.csv`` file in ``./corpus/``.

    Each CSV is read with its first row as header and its two columns taken as
    hanji and lomaji.  Three tables are written to ``model_filename``:
    ``pronounce(hanji, lomaji, freq)``, ``initial(char, freq)`` and
    ``transition(prev_char, next_char, freq)``.  Any existing model file is
    replaced.

    Raises:
        ValueError: if no CSV file is found, or if a row has a different
            number of characters and syllables.
    """
    corpus_path = "./corpus/"
    # Sorted so the row order of the model (which decides ties during
    # conversion) does not depend on directory listing order.
    csv_names = sorted(name for name in os.listdir(corpus_path) if name.endswith(".csv"))
    if not csv_names:
        raise ValueError(f"no .csv corpus files found in {corpus_path}")

    df = pd.concat([
        pd.read_csv(os.path.join(corpus_path, name), header=0, names=['hanji', 'lomaji'])
        for name in csv_names
    ])
    df['lomaji'] = df['lomaji'].str.lower()

    pairs = []
    for hanji_text, lomaji_text in zip(df['hanji'], df['lomaji']):
        hanji = [ch for ch in hanji_text if ch not in _HANJI_PUNCTUATION]
        # re.split with one capturing group alternates text and delimiter, so
        # the even positions are the non-delimiter pieces.
        pieces = _LOMAJI_DELIMITER.split(lomaji_text)[::2]
        syllables = [piece for piece in pieces if _LOMAJI_TOKEN.match(piece)]
        if len(hanji) != len(syllables):
            raise ValueError(f"length of hanji {hanji} is different from romaji {syllables}.")
        if hanji:  # a row with nothing but punctuation carries no statistics
            pairs.append((hanji, syllables))

    pronounce = {}   # hanji -> {lomaji: count}
    initial = {}     # count of characters that begin a word or sentence
    transition = {}  # previous hanji -> {next hanji: count}
    for hanji, syllables in pairs:
        for ch, syllable in zip(hanji, syllables):
            by_syllable = pronounce.setdefault(ch, {})
            by_syllable[syllable] = by_syllable.get(syllable, 0) + 1
        initial[hanji[0]] = initial.get(hanji[0], 0) + 1
        for prev_ch, next_ch in zip(hanji, hanji[1:]):
            followers = transition.setdefault(prev_ch, {})
            followers[next_ch] = followers.get(next_ch, 0) + 1

    # Fill in characters never seen in initial position with a small weight.
    min_weight = 0.1
    for ch in pronounce:
        initial.setdefault(ch, min_weight)

    try:
        os.remove(model_filename)
    except FileNotFoundError:
        pass  # nothing to replace

    con = sqlite3.connect(model_filename)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
        cur.executemany(
            "INSERT INTO pronounce VALUES(?, ?, ?)",
            [(ch, syllable, count)
             for ch, by_syllable in pronounce.items()
             for syllable, count in by_syllable.items()],
        )
        cur.execute("CREATE TABLE initial(char, freq)")
        cur.executemany("INSERT INTO initial VALUES(?, ?)", list(initial.items()))
        cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")
        cur.executemany(
            "INSERT INTO transition VALUES(?, ?, ?)",
            [(prev_ch, next_ch, count)
             for prev_ch, followers in transition.items()
             for next_ch, count in followers.items()],
        )
        con.commit()
    finally:
        con.close()


def get_homophones(pron, cur, con):
    """Return the hanji stored in the model with pronunciation ``pron``.

    Args:
        pron: the romanized syllable to look up (matched exactly).
        cur: an object with an ``execute`` method bound to the model database.
        con: unused; kept so existing callers keep working.

    Returns:
        A list of characters in table order; empty when ``pron`` is unknown.
    """
    rows = cur.execute(
        "SELECT hanji FROM pronounce WHERE lomaji = ? ORDER BY rowid", (pron,))
    return [row[0] for row in rows]


def convert(sentences):
    """Convert romanized text to hanji, print the result and return it.

    The text is cut at ASCII punctuation; each piece (and each punctuation
    mark) is converted by ``convert_one_sentence`` and the results are joined.
    """
    segments = [seg for seg in _SENTENCE_PUNCTUATION.split(sentences) if seg != '']
    converted = [convert_one_sentence(seg) for seg in segments]

    # Each result is a list whose items are strings (or, for punctuation, a
    # one-item list); flattening two levels handles both shapes.
    result_string = "".join(chain.from_iterable(chain.from_iterable(converted)))

    print(result_string)
    return result_string


def _argmax(values):
    """Return the index of the first largest item of a non-empty sequence."""
    return max(range(len(values)), key=values.__getitem__)


def _unknown_column(word, previous):
    """Build the single-candidate column for a word the model does not know.

    The word is kept as typed; it inherits the probability of, and points back
    to, the best candidate of the previous column (if there is one).
    """
    if previous is None:
        return [(word, 1, None)]
    best = _argmax([prob for _, prob, _ in previous])
    return [(word, previous[best][1], best)]


def _initial_column(candidates, syllable, cur):
    """Score candidates for the first syllable by their initial frequency."""
    freq_of = dict(cur.execute(
        '''SELECT pronounce.hanji, initial.freq FROM initial
           INNER JOIN pronounce ON pronounce.hanji = initial.char
           WHERE pronounce.lomaji = ?''', (syllable,)))
    total = sum(freq_of.values())
    if not total:
        return []
    return [(ch, freq_of[ch] / total, None) for ch in candidates if ch in freq_of]


def _transition_column(candidates, prev_syllable, syllable, previous, cur):
    """Score candidates for a later syllable against the previous column.

    For every candidate the best predecessor is chosen.  An observed bigram is
    scored by its share of all bigrams between the two syllables; an
    unobserved one backs off to the candidate's share of the frequency of all
    homophones.  Both are scaled by the interpolation weight and multiplied by
    the predecessor's probability.
    """
    pair_freq = {
        (prev_ch, next_ch): freq
        for prev_ch, next_ch, freq in cur.execute(
            '''SELECT t.prev_char, t.next_char, t.freq
               FROM transition AS t
               INNER JOIN pronounce AS p1 ON p1.hanji = t.prev_char
               INNER JOIN pronounce AS p2 ON p2.hanji = t.next_char
               WHERE p1.lomaji = ? AND p2.lomaji = ?''', (prev_syllable, syllable))
    }
    pair_total = sum(pair_freq.values())  # denominator of the bigram estimate

    # Total frequency (over all pronunciations) of each homophone: the
    # numerator of the back-off; their sum is its denominator.
    char_freq = dict(cur.execute(
        '''SELECT p.hanji, sum(p.freq)
           FROM pronounce AS p
           INNER JOIN pronounce AS p2 ON p.hanji = p2.hanji
           WHERE p2.lomaji = ?
           GROUP BY p.hanji''', (syllable,)))
    char_total = sum(char_freq.values())

    column = []
    for ch in candidates:
        backoff = char_freq[ch] / char_total * (1 - _BIGRAM_WEIGHT)
        scores = []
        for prev_ch, prev_prob, _ in previous:
            # Only pairs found through the two syllables are treated as
            # observed, so pair_total is never zero when it is used.
            if (prev_ch, ch) in pair_freq:
                step = pair_freq[(prev_ch, ch)] / pair_total * _BIGRAM_WEIGHT
            else:
                step = backoff
            scores.append(step * prev_prob)
        best = _argmax(scores)
        column.append((ch, scores[best], best))
    return column


def convert_one_sentence(sentence):
    """Convert one punctuation-free piece of romanized text to hanji.

    Args:
        sentence: either a single punctuation mark, or syllables separated by
            whitespace or one or two hyphens.

    Returns:
        ``[[mark]]`` with the full-width form when ``sentence`` is one of the
        known half-width punctuation marks; otherwise a list with one string
        per syllable: the chosen character, or the syllable as typed when the
        model has no character for it.  Empty when there are no syllables.
    """
    if len(sentence) == 1 and sentence in _FULL_WIDTH:
        return [[_FULL_WIDTH[sentence]]]

    # Even positions of the split are the non-delimiter pieces; empty pieces
    # (leading/trailing/adjacent separators) are not words.
    words = [piece for piece in _SYLLABLE_SEPARATOR.split(sentence)[::2] if piece]
    if not words:
        return []
    syllables = [word.lower() for word in words]

    # Each column holds (char, probability, index of best predecessor).
    columns = []
    con = sqlite3.connect(model_filename)
    try:
        cur = con.cursor()
        for pos, (word, syllable) in enumerate(zip(words, syllables)):
            previous = columns[-1] if columns else None
            candidates = get_homophones(syllable, cur, con)
            if not candidates:
                column = []
            elif previous is None:
                column = _initial_column(candidates, syllable, cur)
            else:
                column = _transition_column(
                    candidates, syllables[pos - 1], syllable, previous, cur)
            columns.append(column or _unknown_column(word, previous))
    finally:
        con.close()

    # Start from the most probable final candidate and follow the
    # predecessor links back to the first syllable.
    index = _argmax([prob for _, prob, _ in columns[-1]])
    result = []
    for column in reversed(columns):
        char, _, index = column[index]
        result.append(char)
    result.reverse()
    return result


def poj_to_tl(sentence):
    """Return ``sentence`` unchanged (placeholder for a POJ-to-TL conversion)."""
    return sentence


parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')


def main(argv=None):
    """Run the command-line interface on ``argv`` (default: ``sys.argv[1:]``)."""
    args = parser.parse_args(argv)

    if args.genmod:
        genmod()
    elif args.sentence is not None:
        if args.form == ['poj']:
            convert(poj_to_tl(args.sentence))
        else:
            convert(args.sentence)
    else:
        parser.print_help()


# Guarded so importing the module does not parse the importer's command line.
if __name__ == "__main__":
    main()