This commit is contained in:
Tan, Kian-ting 2024-03-20 01:03:27 +08:00
parent e15440982d
commit 236f62a5cc
7 changed files with 42085 additions and 0 deletions

LICENSE

@@ -1,3 +1,4 @@
<<<<<<< HEAD
MIT License
Copyright (c) 2024 Tan, Kian-ting
@@ -19,3 +20,9 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
=======
all the csv files are published by the Ministry of Education, under CC BY-NC-SA TW 3.0
source code:
under MIT License
>>>>>>> ba6c568 (initial)

38
README.md Normal file

@@ -0,0 +1,38 @@
### pakkau - a lomaji to hanji Taiwanese (Hokkien) converter
An experimental Hidden Markov Model converter from lomaji (romanized script) to hanji (Chinese characters) for Taiwanese (Hokkien). Still in alpha.
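In rough terms, the converter restricts each position to the hanji that share the pronunciation of the i-th lomaji syllable in the corpus, then picks the sequence that maximizes an initial probability times character-bigram transition probabilities:

$$\hat{h}_{1\dots n} = \operatorname*{arg\,max}_{h_1,\dots,h_n} P(h_1)\prod_{i=2}^{n} P(h_i \mid h_{i-1})$$

(This is a simplified sketch; the decoder in `pakkau.py` also backs off to unigram character frequencies when a bigram is unseen.)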
## Dependencies
- Python 3
- Pandas
## Help
```
usage: pakkau.py [-h] [--genmod] [--form FORM] [SENTENCE]

positional arguments:
  SENTENCE     the sentence to be converted

options:
  -h, --help   show this help message and exit
  --genmod     generate the model
  --form FORM  the orthography to be used (poj or tl). Default is poj.
               (not yet implemented)
```
#### Example 1:
```
python3 ./pakkau.py --form tl "Iâ-soo kóng:guá sī sè-kan ê kng"
```
output:
耶穌講:我是世間的光
#### Example 2:
```
python3 ./pakkau.py --genmod
```
generates the model from the .csv parallel transliteration files in ./corpus
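For reference, each corpus file is read as a two-column CSV (the first row is skipped as a header; the columns are taken as hanji, then lomaji), and after punctuation is stripped the hanji characters and lomaji syllables of a row must line up one-to-one. An illustrative row (the header names here are made up):

```
漢字,羅馬字
我是學生。,Guá sī ha̍k-sing.
```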
## Unfinished
- POJ conversion
- the accuracy of the conversion

13829
corpus/教典例句.csv Normal file

File diff suppressed because it is too large.

27908
corpus/教典發音詞.csv Normal file

File diff suppressed because it is too large.

BIN
model.db Normal file

Binary file not shown.

302
pakkau.py Normal file

@@ -0,0 +1,302 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3
model_filename = "model.db"  # path of the generated SQLite model
def genmod():
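    """Build model.db from the CSV files in ./corpus: a `pronounce` table of
    hanji/lomaji co-occurrence counts, an `initial` table of counts of hanji
    at the start of a corpus entry, and a `transition` table of hanji
    bigram counts."""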
corpus_path = "./corpus/"
df_list = []
for file in os.listdir(corpus_path):
if file.endswith(".csv"):
df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
df_list.append(df)
df = pd.concat(df_list)
df['lomaji'] = df['lomaji'].str.lower()
new_data = []
for index, row in df.iterrows():
hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'([!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        if len(hanji) != len(tl2):
            raise ValueError(f"length of hanji {hanji} is different from lomaji {tl2}.")
        new_data.append((hanji, tl2))
try:
os.remove(model_filename)
except OSError:
pass
con = sqlite3.connect(model_filename)
cur = con.cursor()
cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
char_to_pronounce = {}
for i in new_data:
hanji = i[0]
lomaji = i[1]
for j in range(len(i[0])):
if not hanji[j] in char_to_pronounce:
char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
elif not lomaji[j] in char_to_pronounce[hanji[j]]:
char_to_pronounce[hanji[j]][lomaji[j]] = 1
else:
char_to_pronounce[hanji[j]][lomaji[j]] += 1
for i in char_to_pronounce.keys():
hanji = char_to_pronounce[i]
for j in hanji.keys():
cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))
all_chars = char_to_pronounce.keys()
    init_freq = {}  # occurrence counts of characters at the start of a word or sentence
cur.execute("CREATE TABLE initial(char, freq)")
for i in new_data:
head_hanji = i[0][0]
if head_hanji in init_freq:
init_freq[head_hanji] += 1
else:
init_freq[head_hanji] = 1
    # smoothing: give characters never seen in initial position a small count
    min_weight = 0.1
    for i in all_chars:
        if i not in init_freq:
            init_freq[i] = min_weight
for i in init_freq.keys():
cur.execute("INSERT INTO initial VALUES(?, ?)", (i, init_freq[i]))
char_transition = {}
cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")
for i in new_data:
hanji = i[0]
for j in range(len(i[0])-1):
this_hanji = hanji[j]
next_hanji = hanji[j+1]
if not this_hanji in char_transition:
char_transition[this_hanji] = {next_hanji : 1}
elif not next_hanji in char_transition[this_hanji]:
char_transition[this_hanji][next_hanji] = 1
else:
char_transition[this_hanji][next_hanji] += 1
for i in char_transition.keys():
next_char = char_transition[i]
for j in next_char.keys():
cur.execute("INSERT INTO transition VALUES(?, ?, ?)", (i, j, next_char[j]))
con.commit()
con.close()
def get_homophones(pron, cur, con):
homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
homophones = list(map(lambda x: x[0], homophones_raw))
return homophones
def convert(sentences):
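    # Split the input into fragments on punctuation, convert each fragment
    # separately, and join the converted fragments into one output string.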
    splitted = re.split(r'([!?;,.\"\'\(\):])', sentences)
splitted_cleaned = list(filter(lambda x : x != '', splitted))
result = list(map(lambda s : convert_one_sentence(s), splitted_cleaned))
flatten_result = [x for xs in result for xss in xs for x in xss]
result_string = "".join(flatten_result)
print(result_string)
return result_string
def convert_one_sentence(sentence):
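    # Viterbi-style decoding over a homophone lattice: each lomaji syllable
    # expands to its candidate hanji, and the best path maximizes the initial
    # probability times bigram transition probabilities, backing off to
    # unigram character frequencies (weighted 1 - weight) for unseen bigrams.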
    full_width = ["!", "?", ";", ":", ",", "。", "(", ")"]
half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]
if len(sentence) == 1:
for i in range(len(half_width)):
if sentence[0] == half_width[i]:
return [[full_width[i]]]
    weight = 2/3  # interpolation weight: seen bigrams count 2/3, unigram backoff counts 1/3
splitted = re.split(r'(--?|\s+)', sentence)
filtered = list(filter(lambda x :not re.match(r'(--?|\s+)', x), splitted))
small_capized = list(map(lambda x : x.lower(), filtered))
con = sqlite3.connect(model_filename)
cur = con.cursor()
    homophones_sequence_raw = list(map(lambda x : get_homophones(x, cur, con), small_capized))
homophones_sequence = [list(map (lambda x : {"char": x,
"prev_char": None,
"prob" : 1}, i)) for i in homophones_sequence_raw]
head_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
INNER JOIN pronounce ON pronounce.hanji = initial.char
WHERE pronounce.lomaji = ?''', (small_capized[0], )).fetchall()))
return_result = [None] * len(small_capized)
if head_freqs == []:
return_result[0] = filtered[0]
homophones_sequence[0] = [{"char": filtered[0],
"prev_char": None,
"prob" : 1}]
else:
head_freq_total = reduce(lambda x , y : x + y, head_freqs)
for i in homophones_sequence[0]:
            i_freq = cur.execute('''select initial.freq FROM initial
                WHERE initial.char = ?''', (i['char'],)).fetchall()[0][0]
i['prob'] = i_freq / head_freq_total
if len(small_capized) == 1:
max_prob = -math.inf
max_prob_char = None
for i in homophones_sequence[0]:
if i['prob'] > max_prob:
max_prob_char = i['char']
max_prob = i['prob']
return_result[0] = max_prob_char
else:
for i in range(1,len(small_capized)):
char_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
INNER JOIN pronounce ON pronounce.hanji = initial.char
WHERE pronounce.lomaji = ?''', (small_capized[i], )).fetchall()))
if char_freqs == []:
return_result[i] = filtered[i]
homophones_sequence[i] = [{"char": filtered[i],
"prev_char": None,
"prob" : 1}]
prev_char = ""
max_prob = -math.inf
for m in homophones_sequence[i-1]:
if m['prob'] > max_prob:
max_prob = m['prob']
prev_char = m['char']
homophones_sequence[i][0]['prob'] = max_prob
homophones_sequence[i][0]['prev_char'] = prev_char
else:
total_transition_freq = cur.execute('''
SELECT sum(t.freq)
FROM transition as t
INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
where p2.lomaji = ? and p1.lomaji = ?''',
(small_capized[i], small_capized[i-1])).fetchall()[0][0]
for j in homophones_sequence[i]:
prev_char = None
max_prob = -math.inf
for k in homophones_sequence[i-1]:
k_to_j_freq_raw = cur.execute('''select freq from transition
where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
if k_to_j_freq_raw == []:
den = cur.execute('''
SELECT sum(p.freq)
FROM pronounce as p
inner join pronounce as p2
                                on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0][0]  # denominator
                            # numerator
                            num = cur.execute(''' SELECT sum(freq) FROM pronounce as p where hanji = ?''', (j["char"],)).fetchall()[0][0]
                            k_to_j_freq = num / den * (1 - weight)
                        else:
                            num = k_to_j_freq_raw[0][0]
                            den = total_transition_freq
                            k_to_j_freq = num / den * weight
if k_to_j_freq * k["prob"] > max_prob:
max_prob = k_to_j_freq * k["prob"]
prev_char = k["char"]
j["prob"] = max_prob
j["prev_char"] = prev_char
max_prob = -math.inf
current = ""
prev_char = ""
for i in homophones_sequence[len(homophones_sequence)-1]:
if i["prob"] > max_prob:
max_prob = i["prob"]
current = i["char"]
prev_char = i["prev_char"]
return_result[len(homophones_sequence)-1] = current
for i in range(len(homophones_sequence)-2, -1, -1):
current_ls = list(filter(lambda x : x["char"] == prev_char,
homophones_sequence[i]))
return_result[i] = prev_char
current = current_ls[0]["char"]
prev_char = current_ls[0]["prev_char"]
return return_result
def poj_to_tl(sentence):
    # POJ-to-TL conversion is not implemented yet (listed under "Unfinished"
    # in the README); for now the sentence passes through unchanged.
    return sentence
parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
required=False,)
parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
default=['poj'],
help='the orthography to be used (poj or tl). Default is poj.')
args = parser.parse_args()
if args.genmod:
    genmod()
elif args.sentence is not None:
if args.form == ['poj']:
sentence = poj_to_tl(args.sentence)
convert(sentence)
else:
convert(args.sentence)
else:
parser.print_help()

1
result.json Normal file

File diff suppressed because one or more lines are too long