initial
commit 236f62a5cc (parent e15440982d)
7 changed files with 42085 additions and 0 deletions
7 LICENSE
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 MIT License
 
 Copyright (c) 2024 Tan, Kian-ting
@@ -19,3 +20,9 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+=======
+all the csv files are published by Ministry of Education, under CC BY-NC-SA TW 3.0
+
+source code:
+under MIT License
+>>>>>>> ba6c568 (initial)
38 README.md Normal file
@@ -0,0 +1,38 @@
### pakkau - a lomaji to hanji Taiwanese (Hokkien) converter

A test of a Hidden Markov Model converter from lomaji to hanji for Taiwanese (Hokkien). Still in alpha.
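
## how it works (sketch)

`--genmod` counts, for each hanji character, how often it starts a sentence (`initial`), how often it follows another character (`transition`), and how often it is read as each lomaji syllable (`pronounce`), and stores the counts in `model.db`. Conversion then runs a Viterbi-style search over the homophones of each syllable. A minimal sketch of that idea, with invented toy counts (none of these numbers or mappings come from the real model):

```python
# Toy Viterbi decode over homophone candidates; all counts are invented.
init = {"我": 9, "餓": 1}                    # sentence-initial counts
trans = {("我", "是"): 8, ("餓", "是"): 1}   # hanji bigram counts
homophones = {"guá": ["我", "餓"], "sī": ["是"]}

def decode(syllables):
    # best[char] = (score, best path ending in char), one column per syllable
    best = {c: (init.get(c, 0.1), [c]) for c in homophones[syllables[0]]}
    for syl in syllables[1:]:
        best = {c: max((p * trans.get((prev, c), 0.1), path + [c])
                       for prev, (p, path) in best.items())
                for c in homophones[syl]}
    return max(best.values())[1]

print("".join(decode(["guá", "sī"])))  # -> 我是
```

The real script keeps backpointers instead of whole paths and smooths unseen bigrams with a unigram backoff, but the recurrence has the same shape.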

## Dependencies

- Python3
- Pandas

## Help

usage: pakkau.py [-h] [--genmod] [--form FORM] [SENTENCE]

positional arguments:
  SENTENCE     the sentence to be converted

options:
  -h, --help   show this help message and exit
  --genmod     generate the model
  --form FORM  the orthography to be used (poj or tl). Default is poj. (poj conversion is not implemented yet)

#### example 1:

```
python3 ./pakkau.py --form tl "Iâ-soo kóng:guá sī sè-kan ê kng"
```

output:

耶穌講:我是世間的光

#### example 2:

```
python3 ./pakkau.py --genmod
```

generates the model from the .csv parallel transliteration files in ./corpus

## unfinished

- poj conversion
- the accuracy of the conversion
13829 corpus/教典例句.csv Normal file
File diff suppressed because it is too large

27908 corpus/教典發音詞.csv Normal file
File diff suppressed because it is too large
BIN model.db Normal file
Binary file not shown.
302 pakkau.py Normal file
@@ -0,0 +1,302 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3

model_filename = "model.db"


def genmod():
    # Read every .csv in ./corpus as a (hanji, lomaji) parallel corpus.
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path + file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    # Align each hanji character with one romanized syllable; punctuation is
    # stripped from both sides so the lengths can match.
    for _, row in df.iterrows():
        hanji = list(filter(lambda x: re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x: re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if len(hanji) != len(tl2):
            raise ValueError(f"length of hanji {hanji} is different from lomaji {tl2}.")
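
    # Illustrative alignment (invented row, not from the corpus): the loop
    # above turns hanji = "我是光。" and lomaji = "guá sī kng." into the pair
    # (['我', '是', '光'], ['guá', 'sī', 'kng']), one character per syllable.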

    # Rebuild the model database from scratch.
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")

    # Count how often each hanji character is read as each lomaji syllable.
    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if hanji[j] not in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j]: 1}
            elif lomaji[j] not in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1

    for i in char_to_pronounce.keys():
        hanji = char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i, j, hanji[j]))

    all_chars = char_to_pronounce.keys()
    init_freq = {}  # count of each character appearing at the start of a word or sentence
    cur.execute("CREATE TABLE initial(char, freq)")

    for i in new_data:
        head_hanji = i[0][0]

        if head_hanji in init_freq:
            init_freq[head_hanji] += 1
        else:
            init_freq[head_hanji] = 1

    # Smoothing: give characters never seen in initial position a small
    # minimum weight instead of zero.
    min_weight = 0.1

    for i in all_chars:
        if i not in init_freq.keys():
            init_freq[i] = min_weight

    for i in init_freq.keys():
        cur.execute("INSERT INTO initial VALUES(?, ?)", (i, init_freq[i]))

    # Count hanji-to-hanji bigram transitions.
    char_transition = {}
    cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")

    for i in new_data:
        hanji = i[0]
        for j in range(len(i[0]) - 1):
            this_hanji = hanji[j]
            next_hanji = hanji[j + 1]
            if this_hanji not in char_transition:
                char_transition[this_hanji] = {next_hanji: 1}
            elif next_hanji not in char_transition[this_hanji]:
                char_transition[this_hanji][next_hanji] = 1
            else:
                char_transition[this_hanji][next_hanji] += 1

    for i in char_transition.keys():
        next_char = char_transition[i]
        for j in next_char.keys():
            cur.execute("INSERT INTO transition VALUES(?, ?, ?)", (i, j, next_char[j]))

    con.commit()
    con.close()


def get_homophones(pron, cur, con):
    # Return every hanji recorded with the pronunciation `pron`.
    homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron,)).fetchall()
    homophones = list(map(lambda x: x[0], homophones_raw))

    return homophones
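
# Example call (hypothetical output; the actual list depends on the
# generated model): get_homophones("kng", cur, con) -> ['光', ...]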


def convert(sentences):
    # Split on punctuation but keep the delimiters (the capture group in
    # re.split preserves them), then convert each piece separately.
    splitted = re.split(r'(:?[!?;,.\"\'\(\):])', sentences)
    splitted_cleaned = list(filter(lambda x: x != '', splitted))

    result = list(map(lambda s: convert_one_sentence(s), splitted_cleaned))

    # convert_one_sentence returns a list of strings (or a nested list for a
    # lone punctuation mark), so flatten two levels before joining.
    flatten_result = [x for xs in result for xss in xs for x in xss]
    result_string = "".join(flatten_result)

    print(result_string)
    return result_string
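
# Example from the README (requires a generated model.db):
#   convert("Iâ-soo kóng:guá sī sè-kan ê kng")  ->  "耶穌講:我是世間的光"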


def convert_one_sentence(sentence):
    full_width = ["!", "?", ";", ":", ",", "。", "(", ")"]
    half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]

    # A lone punctuation mark is just mapped to its full-width form.
    if len(sentence) == 1:
        for i in range(len(half_width)):
            if sentence[0] == half_width[i]:
                return [[full_width[i]]]

    # Interpolation weight: seen bigrams count with weight 2/3, unseen
    # bigrams back off to a unigram estimate scaled by 1/3 (see the DP loop).
    weight = 2 / 3

    # Tokenize into syllables: split on hyphens and whitespace.
    splitted = re.split(r'(--?|\s+)', sentence)
    filtered = list(filter(lambda x: not re.match(r'(--?|\s+)', x), splitted))
    small_capized = list(map(lambda x: x.lower(), filtered))

    con = sqlite3.connect(model_filename)
    cur = con.cursor()

    homophones_sequence_raw = list(map(lambda x: get_homophones(x, cur, con), small_capized))

    homophones_sequence = [list(map(lambda x: {"char": x,
                                               "prev_char": None,
                                               "prob": 1}, i)) for i in homophones_sequence_raw]

    # Initial probabilities for the first syllable, normalized over all
    # hanji that share its pronunciation.
    head_freqs = list(map(lambda x: x[0], cur.execute('''select initial.freq FROM initial
                          INNER JOIN pronounce ON pronounce.hanji = initial.char
                          WHERE pronounce.lomaji = ?''', (small_capized[0],)).fetchall()))

    return_result = [None] * len(small_capized)

    if head_freqs == []:
        # Unknown first syllable: pass it through unchanged.
        return_result[0] = filtered[0]
        homophones_sequence[0] = [{"char": filtered[0],
                                   "prev_char": None,
                                   "prob": 1}]

    else:
        head_freq_total = reduce(lambda x, y: x + y, head_freqs)

        for i in homophones_sequence[0]:
            i_freq = cur.execute('''select initial.freq FROM initial
                                    WHERE initial.char = ?''', (i['char'],)).fetchall()[0][0]

            i['prob'] = i_freq / head_freq_total

    if len(small_capized) == 1:
        # Single syllable: just pick the most probable homophone.
        max_prob = -math.inf
        max_prob_char = None
        for i in homophones_sequence[0]:
            if i['prob'] > max_prob:
                max_prob_char = i['char']
                max_prob = i['prob']

        return_result[0] = max_prob_char

    else:
        # Viterbi forward pass over the remaining syllables.
        for i in range(1, len(small_capized)):
            char_freqs = list(map(lambda x: x[0], cur.execute('''select initial.freq FROM initial
                                  INNER JOIN pronounce ON pronounce.hanji = initial.char
                                  WHERE pronounce.lomaji = ?''', (small_capized[i],)).fetchall()))

            if char_freqs == []:
                # Unknown syllable: pass it through and link it to the best
                # node of the previous column.
                return_result[i] = filtered[i]
                homophones_sequence[i] = [{"char": filtered[i],
                                           "prev_char": None,
                                           "prob": 1}]
                prev_char = ""
                max_prob = -math.inf
                for m in homophones_sequence[i-1]:
                    if m['prob'] > max_prob:
                        max_prob = m['prob']
                        prev_char = m['char']
                homophones_sequence[i][0]['prob'] = max_prob
                homophones_sequence[i][0]['prev_char'] = prev_char
            else:
                # Total transition frequency between the two homophone sets,
                # the denominator of the bigram estimate.
                total_transition_freq = cur.execute('''
                    SELECT sum(t.freq)
                    FROM transition as t
                    INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
                    INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
                    where p2.lomaji = ? and p1.lomaji = ?''',
                    (small_capized[i], small_capized[i-1])).fetchall()[0][0]
                for j in homophones_sequence[i]:
                    prev_char = None
                    max_prob = -math.inf

                    for k in homophones_sequence[i-1]:
                        k_to_j_freq_raw = cur.execute('''select freq from transition
                            where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
                        if k_to_j_freq_raw == []:
                            # Unseen bigram: back off to the unigram frequency
                            # of j among its homophones, scaled by (1 - weight).
                            # denominator
                            den = cur.execute('''
                                SELECT sum(p.freq)
                                FROM pronounce as p
                                inner join pronounce as p2
                                on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0][0]
                            # numerator
                            num = cur.execute(''' SELECT sum(freq) FROM pronounce as p where hanji = ?''', (j["char"],)).fetchall()[0][0]

                            k_to_j_freq = num / den * (1 - weight)

                        else:
                            # Seen bigram: relative frequency scaled by weight.
                            num = k_to_j_freq_raw[0][0]
                            den = total_transition_freq
                            k_to_j_freq = num / den * weight

                        if k_to_j_freq * k["prob"] > max_prob:
                            max_prob = k_to_j_freq * k["prob"]
                            prev_char = k["char"]

                    j["prob"] = max_prob
                    j["prev_char"] = prev_char

    # Pick the best node in the last column...
    max_prob = -math.inf
    current = ""
    prev_char = ""
    for i in homophones_sequence[len(homophones_sequence)-1]:
        if i["prob"] > max_prob:
            max_prob = i["prob"]
            current = i["char"]
            prev_char = i["prev_char"]

    return_result[len(homophones_sequence)-1] = current

    # ...then follow the backpointers to recover the best path.
    for i in range(len(homophones_sequence)-2, -1, -1):
        current_ls = list(filter(lambda x: x["char"] == prev_char,
                                 homophones_sequence[i]))

        return_result[i] = prev_char
        current = current_ls[0]["char"]
        prev_char = current_ls[0]["prev_char"]

    return return_result


def poj_to_tl(sentence):
    # Not implemented yet: POJ input is currently passed through unchanged.
    return sentence


parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()

if args.genmod:
    genmod()
elif args.sentence is not None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        convert(sentence)
    else:
        convert(args.sentence)
else:
    parser.print_help()
1 result.json Normal file
File diff suppressed because one or more lines are too long