initial
This commit is contained in:
		
							parent
							
								
									e15440982d
								
							
						
					
					
						commit
						236f62a5cc
					
				
					 7 changed files with 42085 additions and 0 deletions
				
			
		
							
								
								
									
										7
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								LICENSE
									
									
									
									
									
								
							|  | @ -1,3 +1,4 @@ | ||||||
|  | <<<<<<< HEAD | ||||||
| MIT License | MIT License | ||||||
| 
 | 
 | ||||||
| Copyright (c) 2024 Tan, Kian-ting | Copyright (c) 2024 Tan, Kian-ting | ||||||
|  | @ -19,3 +20,9 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||||
| SOFTWARE. | SOFTWARE. | ||||||
|  | ======= | ||||||
|  | all the csv files are published by Ministry of Education, under CC BY-NC-SA TW 3.0 | ||||||
|  | 
 | ||||||
|  | source code: | ||||||
|  | under MIT License | ||||||
|  | >>>>>>> ba6c568 (initial) | ||||||
|  |  | ||||||
							
								
								
									
										38
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,38 @@ | ||||||
|  | ### pakkau - a lomaji to hanji Taiwanese (Hokkien) converter | ||||||
|  | 
 | ||||||
|  | A test of a Hidden Markov Model converter from lomaji to hanji for Taiwanese (Hokkien). Still in alpha version. | ||||||
|  | 
 | ||||||
|  | ## Dependencies | ||||||
|  | - Python3 | ||||||
|  | - Pandas | ||||||
|  | 
 | ||||||
|  | ## Help | ||||||
|  | 
 | ||||||
|  | usage: pakkau.py [-h] [--genmod] [--form FORM] [SENTENCE] | ||||||
|  | 
 | ||||||
|  | positional arguments: | ||||||
|  |   SENTENCE     the sentence to be converted | ||||||
|  | 
 | ||||||
|  | options: | ||||||
|  |   -h, --help   show this help message and exit | ||||||
|  |   --genmod     generate the model | ||||||
|  |   --form FORM  the orthography to be used (poj or tl). Default is poj. (not opened) | ||||||
|  | 
 | ||||||
|  | #### example1: | ||||||
|  | `` | ||||||
|  |   python3 ./pakkau.py --form tl "Iâ-soo kóng:guá sī sè-kan ê kng" | ||||||
|  | `` | ||||||
|  | output: | ||||||
|  | 
 | ||||||
|  | 耶穌講:我是世間的光 | ||||||
|  | 
 | ||||||
|  | #### example2: | ||||||
|  | `` | ||||||
|  | python3 ./pakkau.py --genmod | ||||||
|  | `` | ||||||
|  | Generates the model from the .csv parallel transliteration files in ./corpus. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ## unfinished | ||||||
|  | poj conversion | ||||||
|  | the preciseness of the conversion | ||||||
							
								
								
									
										13829
									
								
								corpus/教典例句.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13829
									
								
								corpus/教典例句.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										27908
									
								
								corpus/教典發音詞.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27908
									
								
								corpus/教典發音詞.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								model.db
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model.db
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										302
									
								
								pakkau.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										302
									
								
								pakkau.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,302 @@ | ||||||
|  | import re | ||||||
|  | import pandas as pd | ||||||
|  | import math | ||||||
|  | from functools import reduce | ||||||
|  | import argparse | ||||||
|  | import os | ||||||
|  | import sqlite3 | ||||||
|  | from itertools import chain | ||||||
|  | 
 | ||||||
model_filename = "model.db"


def genmod():
    """Build the SQLite model file ``model_filename`` from the CSV corpus.

    Every ``*.csv`` file in ``./corpus/`` is read as two columns (hanji,
    lomaji); the first line of each file is treated as a header. Each row is
    split into hanji characters and lower-cased lomaji syllables, which must
    pair up one to one. Three tables are then written:

    * ``pronounce(hanji, lomaji, freq)``  - how often a character is read as
      a given syllable;
    * ``initial(char, freq)``             - how often a character starts a
      row; characters that never start one get ``min_weight``;
    * ``transition(prev_char, next_char, freq)`` - how often ``next_char``
      directly follows ``prev_char``.

    Any existing model file is replaced.

    Raises:
        ValueError: if a row's hanji and lomaji counts differ (or, from
            pandas, if the corpus directory holds no CSV file).
    """
    # Local import: the file's top-level imports do not include collections.
    from collections import Counter, defaultdict

    corpus_path = "./corpus/"
    frames = [
        pd.read_csv(corpus_path + name, header=0, names=['hanji', 'lomaji'])
        for name in os.listdir(corpus_path)
        if name.endswith(".csv")
    ]
    df = pd.concat(frames)
    df['lomaji'] = df['lomaji'].str.lower()

    # Punctuation that must not be counted as a hanji. The full-width forms
    # are written as escapes so they survive width-folding of this file.
    hanji_punct = set(
        "、();:,。!?「」『』"
        "\uff08\uff09\uff1b\uff1a\uff0c\uff01\uff1f"
    )
    splitter = re.compile(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)')
    is_syllable = re.compile(r"([^\(\)^!:?; \'\",.\-\u3000])")

    pairs = []
    for hanji_text, lomaji_text in zip(df['hanji'], df['lomaji']):
        hanji = [c for c in hanji_text if c not in hanji_punct]
        syllables = [t for t in splitter.split(lomaji_text)
                     if is_syllable.match(t)]
        if len(hanji) != len(syllables):
            raise ValueError(f"length of hanji {hanji} is different from romaji {syllables}.")
        pairs.append((hanji, syllables))

    # Nested counters keyed by the first character keep the same insertion
    # order as a hand-written nested dict, so row order in the tables (and
    # therefore tie-breaking at conversion time) is unchanged.
    char_to_pronounce = defaultdict(Counter)
    char_transition = defaultdict(Counter)
    init_freq = {}  # number of times a character starts a word or sentence

    for hanji, syllables in pairs:
        for char, syllable in zip(hanji, syllables):
            char_to_pronounce[char][syllable] += 1
        for this_hanji, next_hanji in zip(hanji, hanji[1:]):
            char_transition[this_hanji][next_hanji] += 1
        if hanji:  # a punctuation-only row has no head character
            init_freq[hanji[0]] = init_freq.get(hanji[0], 0) + 1

    # Pad: give every character that never starts a row a small weight so it
    # can still be chosen as the first character.
    min_weight = 0.1
    for char in char_to_pronounce:
        init_freq.setdefault(char, min_weight)

    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    try:
        cur = con.cursor()

        cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")
        cur.executemany(
            "INSERT INTO pronounce VALUES(?, ?, ?)",
            ((char, syllable, freq)
             for char, readings in char_to_pronounce.items()
             for syllable, freq in readings.items()),
        )

        cur.execute("CREATE TABLE initial(char, freq)")
        cur.executemany("INSERT INTO initial VALUES(?, ?)", init_freq.items())

        cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")
        cur.executemany(
            "INSERT INTO transition VALUES(?, ?, ?)",
            ((prev_char, next_char, freq)
             for prev_char, followers in char_transition.items()
             for next_char, freq in followers.items()),
        )

        con.commit()
    finally:
        # Close the connection even when an insert fails.
        con.close()
|  | 
 | ||||||
def get_homophones(pron, cur, con):
    """Return the hanji stored in the ``pronounce`` table for syllable ``pron``.

    Args:
        pron: the lomaji syllable to look up.
        cur: object whose ``execute`` method runs the query.
        con: accepted for symmetry with the callers; not used here.

    Returns:
        A list with the ``hanji`` column of every matching row (empty when
        the syllable is unknown).
    """
    rows = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, ))
    return [row[0] for row in rows.fetchall()]
|  | 
 | ||||||
def convert(sentences):
    """Convert lomaji text to hanji, print the result and return it.

    The text is cut at every punctuation mark (the marks are kept as their
    own pieces), each piece is passed to ``convert_one_sentence`` and the
    converted pieces are concatenated.

    Args:
        sentences: the lomaji text to convert.

    Returns:
        The converted text as a single string.
    """
    # Each punctuation mark becomes its own piece. The pattern used to allow
    # an optional leading colon, which glued ":," into one two-character
    # piece that the single-character punctuation mapping then skipped.
    pieces = [p for p in re.split(r'([!?;,.\"\'\(\):])', sentences) if p != '']

    converted = [convert_one_sentence(piece) for piece in pieces]

    # Each converted piece is a list whose items are strings or lists of
    # strings; flatten two levels so the result is a flat run of characters.
    characters = [ch for piece in converted for item in piece for ch in item]
    result_string = "".join(characters)

    print(result_string)
    return result_string
|  |      | ||||||
def convert_one_sentence(sentence):
    """Convert one punctuation-free lomaji fragment to hanji.

    A fragment that is a single half-width punctuation mark is mapped to its
    full-width form. Otherwise the fragment is split into syllables on
    hyphens and whitespace, the candidate hanji of every syllable are read
    from the model database, and the most probable character sequence is
    chosen with a Viterbi search: initial frequencies score the first
    character, transition frequencies (weight 2/3) score each following one,
    with a back-off to the character's overall frequency (weight 1/3) when
    the transition was never seen. A syllable with no candidates is kept
    verbatim.

    Args:
        sentence: the fragment to convert.

    Returns:
        ``[[mark]]`` for a single punctuation mark, otherwise a list with one
        string per syllable (empty when the fragment holds no syllable).
    """
    # Written as escapes so the full-width forms survive width-folding.
    full_width = ["\uff01", "\uff1f", "\uff1b", "\uff1a", "\uff0c", "\u3002",
                  "\uff08", "\uff09"]
    half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]

    if len(sentence) == 1 and sentence in half_width:
        return [[full_width[half_width.index(sentence)]]]

    weight = 2/3

    # re.split leaves empty strings next to separators (e.g. for a leading
    # space); they are not syllables and must not enter the search.
    filtered = [token for token in re.split(r'(--?|\s+)', sentence)
                if token and not re.match(r'(--?|\s+)', token)]
    if not filtered:
        return []
    small_capized = [token.lower() for token in filtered]

    con = sqlite3.connect(model_filename)
    try:
        cur = con.cursor()

        # One column of candidate nodes per syllable.
        homophones_sequence = [
            [{"char": char, "prev_char": None, "prob": 1}
             for char in get_homophones(syllable, cur, con)]
            for syllable in small_capized
        ]

        if not homophones_sequence[0]:
            # Unknown first syllable: keep the input text as the only node.
            homophones_sequence[0] = [{"char": filtered[0],
                                       "prev_char": None,
                                       "prob": 1}]
        else:
            head_freqs = [row[0] for row in cur.execute(
                '''select initial.freq FROM initial
    INNER JOIN pronounce ON pronounce.hanji = initial.char
    WHERE pronounce.lomaji = ?''', (small_capized[0], )).fetchall()]
            head_freq_total = sum(head_freqs)

            for node in homophones_sequence[0]:
                node_freq = cur.execute(
                    '''select initial.freq FROM initial
    WHERE initial.char = ?''', (node['char'], )).fetchall()[0][0]
                node['prob'] = node_freq / head_freq_total

        for i in range(1, len(small_capized)):
            previous_column = homophones_sequence[i-1]

            if not homophones_sequence[i]:
                # Unknown syllable: keep it verbatim and carry over the best
                # probability reached so far.
                best_previous = max(previous_column, key=lambda n: n['prob'])
                homophones_sequence[i] = [{"char": filtered[i],
                                           "prev_char": best_previous['char'],
                                           "prob": best_previous['prob']}]
                continue

            total_transition_freq = cur.execute('''
SELECT sum(t.freq)
FROM transition as t
INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
where p2.lomaji = ?  and p1.lomaji = ?''',
                (small_capized[i], small_capized[i-1])).fetchall()[0][0]

            # Denominator of the back-off: total frequency of every
            # character that can be read as this syllable.
            den = cur.execute('''
SELECT sum(p.freq)
FROM pronounce as p
inner join pronounce as p2
on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i], )).fetchall()[0][0]

            for j in homophones_sequence[i]:
                # Numerator of the back-off: total frequency of this character.
                num = cur.execute(
                    ''' SELECT sum(freq) FROM pronounce as p  where hanji = ?''',
                    (j["char"], )).fetchall()[0][0]
                backoff = num/den * (1-weight)

                prev_char = None
                max_prob = -math.inf
                for k in previous_column:
                    seen = cur.execute('''select freq from transition
where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
                    # total_transition_freq is None when the previous node is
                    # verbatim text that is not a known syllable; fall back to
                    # the back-off instead of dividing by None.
                    if seen and total_transition_freq:
                        k_to_j_freq = seen[0][0]/total_transition_freq * weight
                    else:
                        k_to_j_freq = backoff

                    if k_to_j_freq * k["prob"] > max_prob:
                        max_prob = k_to_j_freq * k["prob"]
                        prev_char = k["char"]

                j["prob"] = max_prob
                j["prev_char"] = prev_char
    finally:
        con.close()

    # Backtrack from the most probable node of the last column.
    return_result = [None] * len(homophones_sequence)
    best_last = max(homophones_sequence[-1], key=lambda n: n["prob"])
    return_result[-1] = best_last["char"]
    prev_char = best_last["prev_char"]

    for i in range(len(homophones_sequence)-2, -1, -1):
        node = [n for n in homophones_sequence[i] if n["char"] == prev_char][0]
        return_result[i] = prev_char
        prev_char = node["prev_char"]

    return return_result
|  |      | ||||||
|  | 
 | ||||||
def poj_to_tl(sentence):
    """Return ``sentence`` unchanged.

    Placeholder: no POJ-to-TL rewriting is performed here; the input is
    handed back as is.
    """
    return sentence
|  | 
 | ||||||
# Command-line entry point: either rebuild the model (--genmod) or convert
# the sentence given as positional argument; with neither, show the help.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--genmod',
    action='store_true',
    required=False,
    help='generate the model',
)
parser.add_argument(
    'sentence',
    metavar='SENTENCE',
    nargs='?',
    help='the sentence to be converted',
)
parser.add_argument(
    '--form',
    metavar='FORM',
    choices=["poj", "tl"],
    nargs=1,
    default=['poj'],
    help='the orthography to be used (poj or tl). Default is poj.',
)

args = parser.parse_args()

if args.genmod:
    genmod()
elif args.sentence is None:
    parser.print_help()
elif args.form == ['poj']:
    # POJ input goes through poj_to_tl before conversion.
    sentence = poj_to_tl(args.sentence)
    convert(sentence)
else:
    convert(args.sentence)
|  | 
 | ||||||
							
								
								
									
										1
									
								
								result.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								result.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
		Loading…
	
		Reference in a new issue