1. add change log; 2. change the converting unit from hanji, etc.

commit 9f2836067a (parent 9b7d8d4432)

7 changed files with 1030 additions and 266 deletions
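The central change is visible in the pakkau.py hunks below: the model's unit is no longer a bare hanji but a hanji fused with its romanized reading, built by zipping the two aligned sequences. A minimal sketch of that pairing, with invented sample data:

    hanji = "我好"            # characters of one corpus sentence
    lomaji = ["guá", "hó"]   # aligned romanized syllables

    # Fuse each character with its reading, so the same character with
    # different pronunciations becomes a distinct model key such as "我guá".
    units = [h + l for h, l in zip(hanji, lomaji)]
    print(units)  # ['我guá', '好hó']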
corpus/教典例句.csv (520 changes)

File diff suppressed because it is too large
.gitignore (160 additions, new file)
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
model.db (BIN)

Binary file not shown.
pakkau.py (18 changes)
@@ -7,7 +7,7 @@ import os
 import sqlite3
 from itertools import chain
 
-model_filename = "model.db"
+model_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model.db")
 
 def genmod():
     corpus_path = "./corpus/"
@@ -44,7 +44,11 @@ def genmod():
 
     for i in new_data:
         hanji = i[0]
+
         lomaji = i[1]
+        '''111'''
+        hanji = list(zip(hanji, lomaji))
+        hanji = list(map(lambda x : x[0] + x[1], hanji))
         for j in range(len(i[0])):
             if not hanji[j] in char_to_pronounce:
                 char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
@@ -65,7 +69,7 @@ def genmod():
     
 
     for i in new_data:
-        head_hanji = i[0][0]
+        head_hanji = i[0][0]+i[1][0]
 
         if head_hanji in init_freq:
             init_freq[head_hanji] += 1
@@ -86,7 +90,8 @@ def genmod():
     cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")
 
     for i in new_data:
-        hanji = i[0]
+        hanji_tmp = list(zip(i[0],i[1]))
+        hanji = list(map(lambda x: x[0]+ x[1], hanji_tmp))
         for j in range(len(i[0])-1):
             this_hanji = hanji[j]
             next_hanji = hanji[j+1]
@@ -111,7 +116,6 @@ def genmod():
 def get_homophones(pron, cur, con):
     homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
     homophones = list(map(lambda x: x[0], homophones_raw))
-    
     return homophones
 
 def convert(sentences):
@@ -171,7 +175,7 @@ def convert_one_sentence(sentence):
 
         for i in homophones_sequence[0]:
             i_freq = cur.execute('''select initial.freq FROM initial 
-    WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]
+    WHERE initial.char = ?''', (i['char'],)).fetchall()[0][0]
 
             i['prob'] = i_freq / head_freq_total
     
@@ -268,7 +272,9 @@ on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0]
         current = current_ls[0]["char"]
         prev_char = current_ls[0]["prev_char"]
 
-
+    return_result = list(filter(lambda x : x != "", return_result))
+    return_result = list(map(lambda x : x[0] if re.match(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎𪜀-\U0002b73f]', x)
+                                             else x, return_result))
 
 
     return return_result
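The final hunk is the flip side of the new unit: after decoding, each result token is a fused "hanji+lomaji" string, so the added map keeps only the first character whenever it falls inside the pattern's CJK ranges, while out-of-vocabulary tokens (left as raw romaji) pass through. A minimal sketch of that idea, using a simplified CJK range rather than the full character class from the diff:

    import re

    # Simplified stand-in for the CJK character class in the diff above
    # (the committed pattern covers many more blocks).
    CJK = re.compile(u'[\u4e00-\u9fff]')

    def strip_lomaji(tokens):
        # Keep only the leading hanji of a combined "hanji+lomaji" unit;
        # tokens that never matched the model (raw romaji) pass through.
        return [t[0] if t and CJK.match(t) else t for t in tokens]

    print(strip_lomaji(["我guá", "好hó", "NCC"]))  # ['我', '好', 'NCC']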
pakkau.py~ (302 additions, new file)
@@ -0,0 +1,302 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3
from itertools import chain

model_filename = "model.db"

def genmod():
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    for index, row in df.iterrows():
        hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if (len(hanji) != len(tl2)):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")

    #model_filename = "model.db"
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")


    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if not hanji[j] in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
            elif not lomaji[j] in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1


    for i in char_to_pronounce.keys():
        hanji =  char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))

    all_chars = char_to_pronounce.keys()
    init_freq = {}  # occurrence counts of characters that begin a word or sentence
    cur.execute("CREATE TABLE initial(char, freq)")


    for i in new_data:
        head_hanji = i[0][0]

        if head_hanji in init_freq:
            init_freq[head_hanji] += 1
        else:
            init_freq[head_hanji] = 1

    # pad unseen characters (smoothing)
    min_weight = 0.1

    for i in all_chars:
        if not i in init_freq.keys():
            init_freq[i] = 0.1

    for i in init_freq.keys():
        cur.execute("INSERT INTO initial VALUES(?, ?)", (i, init_freq[i]))

    char_transition = {}
    cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")

    for i in new_data:
        hanji = i[0]
        for j in range(len(i[0])-1):
            this_hanji = hanji[j]
            next_hanji = hanji[j+1]
            if not this_hanji in char_transition:
                char_transition[this_hanji] = {next_hanji : 1}
            elif not next_hanji in char_transition[this_hanji]:
                char_transition[this_hanji][next_hanji] = 1
            else:
                char_transition[this_hanji][next_hanji] += 1

    for i in char_transition.keys():
        next_char = char_transition[i]
        for j in next_char.keys():
            cur.execute("INSERT INTO transition VALUES(?, ?, ?)", (i, j, next_char[j]))


    #get_homophones("lí", cur, con)

    con.commit()
    con.close()

def get_homophones(pron, cur, con):
    homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
    homophones = list(map(lambda x: x[0], homophones_raw))

    return homophones

def convert(sentences):
    splitted = re.split(r'(:?[!?;,.\"\'\(\):])', sentences)
    splitted_cleaned = list(filter(lambda x : x != '', splitted))

    result =  list(map(lambda s : convert_one_sentence(s), splitted_cleaned))

    flatten_result = [x for xs in result for xss in xs for x in xss]
    result_string = "".join(flatten_result)


    print(result_string)
    return result_string

def convert_one_sentence(sentence):
    full_width = ["!", "?", ";",":",",","。", "(", ")"]
    half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]

    if len(sentence) == 1:
        for i in range(len(half_width)):
            if sentence[0] == half_width[i]:
                return [[full_width[i]]]


    weight = 2/3

    splitted = re.split(r'(--?|\s+)', sentence)
    filtered = list(filter(lambda x :not re.match(r'(--?|\s+)', x), splitted))
    small_capized = list(map(lambda x : x.lower(), filtered))
    print("======", small_capized)
    con = sqlite3.connect(model_filename)
    cur = con.cursor()

    homophones_sequence_raw = list(map(lambda x : get_homophones(x, con, cur), small_capized))

    homophones_sequence = [list(map (lambda x : {"char": x,
                                      "prev_char": None,
                                                 "prob" : 1}, i)) for i in homophones_sequence_raw]


    head_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial 
    INNER JOIN pronounce ON pronounce.hanji = initial.char
    WHERE pronounce.lomaji = ?''', (small_capized[0], )).fetchall()))

    return_result = [None] * len(small_capized)

    if head_freqs == []:
        return_result[0] = filtered[0]
        homophones_sequence[0] = [{"char": filtered[0],
                                  "prev_char": None,
                                  "prob" : 1}]

    else:
        head_freq_total = reduce(lambda x , y : x + y, head_freqs)

        for i in homophones_sequence[0]:
            i_freq = cur.execute('''select initial.freq FROM initial 
    WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]

            i['prob'] = i_freq / head_freq_total
            print(i)

    #for i in homophones_sequence[0]:

    print("+++++", return_result)

    if len(small_capized) == 1:
        max_prob = -math.inf
        max_prob_char = None
        for i in homophones_sequence[0]:
            if i['prob'] > max_prob:
                max_prob_char = i['char']
                max_prob = i['prob']

        return_result[0] = max_prob_char

    else:
        for i in range(1,len(small_capized)):
            char_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial 
    INNER JOIN pronounce ON pronounce.hanji = initial.char
    WHERE pronounce.lomaji = ?''', (small_capized[i], )).fetchall()))

            if char_freqs == []:
                return_result[i] = filtered[i]
                homophones_sequence[i] = [{"char": filtered[i],
                                  "prev_char": None,
                                  "prob" : 1}]
                prev_char = ""
                max_prob = -math.inf
                for m in homophones_sequence[i-1]:
                    if m['prob'] > max_prob:
                        max_prob = m['prob']
                        prev_char = m['char']
                homophones_sequence[i][0]['prob'] = max_prob
                homophones_sequence[i][0]['prev_char'] = prev_char
            else:
                total_transition_freq = cur.execute('''
SELECT sum(t.freq)
FROM transition as t
INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
where p2.lomaji = ?  and p1.lomaji = ?''',
                                              (small_capized[i], small_capized[i-1])).fetchall()[0][0]
                for j in homophones_sequence[i]:
                    prev_char = None
                    max_prob = -math.inf

                    for k in homophones_sequence[i-1]:
                        k_to_j_freq_raw = cur.execute('''select freq from transition
where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
                        if k_to_j_freq_raw == []:
                            den = cur.execute('''
SELECT sum(p.freq)
FROM pronounce as p 
inner join pronounce as p2
on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0][0]  # denominator
                            # numerator
                            num = cur.execute(''' SELECT sum(freq) FROM pronounce as p  where hanji = ?''', (j["char"],)).fetchall()[0][0]
                            print("+++", num, den)
                            k_to_j_freq = num/den * (1-weight)

                        else:
                            num = k_to_j_freq_raw[0][0]
                            don = total_transition_freq
                            k_to_j_freq =num/don * weight
                        print("k_to_j_fr", k["char"], j["char"], k_to_j_freq)
                        if k_to_j_freq * k["prob"] > max_prob:
                            max_prob = k_to_j_freq * k["prob"]
                            prev_char = k["char"]
                    print("~-~_~-~-~-~-", prev_char, j["char"], max_prob)
                    j["prob"] = max_prob
                    j["prev_char"] = prev_char

    max_prob = -math.inf
    current = ""
    prev_char = ""
    for i in homophones_sequence[len(homophones_sequence)-1]:
        if i["prob"] > max_prob:
            max_prob = i["prob"]
            current = i["char"]
            prev_char = i["prev_char"]

    print("~tail~~", current)
    print(homophones_sequence)
    return_result[len(homophones_sequence)-1] = current

    for i in range(len(homophones_sequence)-2, -1, -1):
        current_ls = list(filter(lambda x : x["char"] == prev_char,
                              homophones_sequence[i]))
        print(prev_char)
        return_result[i] = prev_char
        current = current_ls[0]["char"]
        prev_char = current_ls[0]["prev_char"]


    print(return_result)

    return return_result


def poj_to_tl(sentence):
    return sentence

parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()

if args.genmod == True:
    genmod()
elif args.sentence != None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        convert(sentence)
    else:
        convert(args.sentence)
else:
    parser.print_help()
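convert_one_sentence above is effectively a bigram Viterbi decode over a lattice of homophones: each position stores, per candidate character, its best path probability plus a back-pointer, and the transition score interpolates bigram counts with pronounce-table counts using weight = 2/3. A stripped-down sketch of the same recurrence with toy in-memory tables instead of model.db (all counts here are invented, and the normalization is simplified relative to the SQL version):

    import math

    # Toy stand-ins for the initial/transition/pronounce tables (invented counts).
    initial = {"我": 5, "餓": 1, "好": 3, "號": 1}
    transition = {("我", "好"): 4}
    homophones = {"guá": ["我", "餓"], "hó": ["好", "號"]}
    WEIGHT = 2 / 3  # same interpolation weight as convert_one_sentence

    def decode(syllables):
        # lattice[i] maps candidate char -> (best path prob, back-pointer)
        total = sum(initial[c] for c in homophones[syllables[0]])
        lattice = [{c: (initial[c] / total, None) for c in homophones[syllables[0]]}]
        for syl in syllables[1:]:
            layer = {}
            for c in homophones[syl]:
                best, back = -math.inf, None
                for prev, (prev_prob, _) in lattice[-1].items():
                    # A seen bigram gets WEIGHT; an unseen one backs off
                    # to the unigram count, as in the SQL fallback branch.
                    score = (WEIGHT * transition.get((prev, c), 0)
                             + (1 - WEIGHT) * initial[c]) * prev_prob
                    if score > best:
                        best, back = score, prev
                layer[c] = (best, back)
            lattice.append(layer)
        # Walk the back-pointers from the best final candidate.
        char = max(lattice[-1], key=lambda c: lattice[-1][c][0])
        out = []
        for layer in reversed(lattice):
            out.append(char)
            char = layer[char][1]
        return "".join(reversed(out))

    print(decode(["guá", "hó"]))  # 我好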
test2.py~ (207 additions, new file)
@@ -0,0 +1,207 @@
import re
import pandas as pd
import math
from functools import reduce

df1 = pd.read_csv('教典例句.csv', header=0, names=['漢字', '羅馬字'])
df2 = pd.read_csv('教典發音詞.csv',header=0, names=['漢字', '羅馬字'])


df = pd.concat([df1, df2]) # combine the two csv dataframes

df['羅馬字'] = df['羅馬字'].str.lower()

new_data = []

for index, row in df.iterrows():
    hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['漢字'])))
    tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['羅馬字'])
    tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
    new_data.append((hanji, tl2))
    #if (len(hanji) != len(tl2)):
        #print(tl2, hanji)
    #print(tl2, hanji)


# char-to-pronunciation probability dict

char_to_pronounce = {}

for i in new_data:
    hanji = i[0]
    lomaji = i[1]
    for j in range(len(i[0])):
        if not hanji[j] in char_to_pronounce:
            char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
        elif not lomaji[j] in char_to_pronounce[hanji[j]]:
            char_to_pronounce[hanji[j]][lomaji[j]] = 1
        else:
            char_to_pronounce[hanji[j]][lomaji[j]] += 1

for char, char_reading in char_to_pronounce.items():
    total_count = reduce((lambda x, y : x + y), list(char_reading.values()))

    for i in char_reading.keys():
        char_reading[i] = char_reading[i] / float(total_count)

#print(char_to_pronounce)

all_chars = char_to_pronounce.keys()

'''{'提': 45, '宋': 7, '完': 18, '刻': 7, '局': 9,
 '巡': 8, '畫': 25, '青': 56, '尪': 13}'''
init_freq = {}  # occurrence counts of characters that begin a word or sentence

for i in new_data:
    head_hanji = i[0][0]

    if head_hanji in init_freq:
        init_freq[head_hanji] += 1
    else:
        init_freq[head_hanji] = 1

# pad unseen characters (smoothing)
min_weight = 0.1

for i in all_chars:
    if not i in init_freq.keys():
        init_freq[i] = 0.1

#print(init_freq)




# probability of P(next=c2|this=c1)
char_transition = {}

for i in new_data:
    hanji = i[0]
    for j in range(len(i[0])-1):
        this_hanji = hanji[j]
        next_hanji = hanji[j+1]
        if not this_hanji in char_transition:
            char_transition[this_hanji] = {next_hanji : 1}
        elif not next_hanji in char_transition[this_hanji]:
            char_transition[this_hanji][next_hanji] = 1
        else:
            char_transition[this_hanji][next_hanji] += 1

#print(char_transition)

# pad unseen characters (smoothing)
for i in all_chars:
    if not i in char_transition.keys():
        char_transition[i] = {}
        for j in all_chars:
            char_transition[i][j] = init_freq[j]
    else:
        pass

for i in char_transition.keys():
    for j in all_chars:
        if not j in char_transition[i].keys():
            char_transition[i][j] = min_weight * (0.03+math.log(init_freq[j]))


for char, next_char in char_transition.items():
    total_count = 0
    [total_count := total_count + x for x in list(next_char.values())]

    for i in next_char.keys():
        next_char[i] = next_char[i] / float(total_count)




def get_homophones(pron):
    homophones = []
    for i in char_to_pronounce.keys():
        if pron in char_to_pronounce[i].keys():
            homophones.append(i)
        else:
            pass

    return homophones

input_lomaji = ["guá", "kap", "tshit", "á", "lâi", "khì", "tâi", "tiong", "tshit", "thô", "sūn", "suà", "tsē", "ko", "thih"]

char_candidates = []

for i in input_lomaji:
    homophones = list(map(lambda x : {"char": x,
                                      "prev_char": None,
                                      "prob" : None}, # probability
                          get_homophones(i)))
    char_candidates.append(homophones)

#print(char_candidates)
def get_max_prob(input_lmj, char_cand):
    for i in range(len(input_lmj)):
        if i == 0:
            for j in char_cand[i]:
                init_freq_sum = reduce(lambda x, y : x + y,
                                       list(
                                           map(lambda x : init_freq[x["char"]] ,
                                               char_cand[0])))
                print(init_freq_sum)
                ch = j["char"]
                init_to_char_prob = init_freq[ch] / init_freq_sum # get the ratio
                char_reading_prob = char_to_pronounce[ch][input_lmj[0]]

                j["prob"] = init_to_char_prob * char_reading_prob

            result = ""
            max_num = -math.inf

            for i in char_cand[0]:
                if i["prob"] >= max_num:
                    max_num = i["prob"]
                    result = i["char"]

            #print(result)
        else:
            for j in char_cand[i]:
                prob = -math.inf
                prev_char = ""
                for k in char_cand[i-1]:
                    k_prob = k["prob"]
                    #print(k["char"], "k_prob:", k_prob)
                    k_to_j_prob = char_transition[k["char"]][j["char"]]
                    #print(k["char"], "->",j["char"] ,"k_to_j_prob:", k_to_j_prob)
                    j_to_pron_prob = char_to_pronounce[j["char"]][input_lmj[i]]
                    total_tmp_prob = k_prob * k_to_j_prob * j_to_pron_prob
                    if prob < total_tmp_prob:
                        prob = total_tmp_prob
                        prev_char = k

                j["prev_char"] = prev_char["char"]
                j["prob"] = prob

    real_last_char = ""
    prev_char = ""
    prob = -math.inf
    for i in char_cand[-1]:
        if i["prob"] > prob:
            prob = i["prob"]
            real_last_char = i["char"]
            prev_char = i["prev_char"]

    print(real_last_char)

    result_hanji = [real_last_char]
    for i in range(len(input_lmj)-2, -1, -1):
        current = list(filter(lambda x : x["char"] == prev_char,
                              char_cand[i]))[0]
        result_hanji.append(current["char"])
        prev_char = current["prev_char"]


    result_hanji.reverse()

    result_hanji_string = "".join(result_hanji)
    print("Input lomaji array:", input_lomaji)
    print("Output hanji:", result_hanji_string)


get_max_prob(input_lomaji, char_candidates)
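One detail worth noting in the padding step above: an unseen transition is scored as min_weight * (0.03 + log(init_freq[next])), so characters that are common sentence-initially still look more plausible as fallbacks. Because init_freq can itself be the 0.1 floor, the log term can go negative. A quick numeric check (values invented):

    import math

    min_weight = 0.1
    init_freq = {"好": 56, "號": 0.1}  # 號 never appeared sentence-initially, so it got the floor

    for nxt, f in init_freq.items():
        print(nxt, min_weight * (0.03 + math.log(f)))
    # 好 ≈ 0.405, 號 ≈ -0.227  (the padded score can dip below zero)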
test3.py~ (89 additions, new file)
@@ -0,0 +1,89 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3

def genmod():
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    for index, row in df.iterrows():
        hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if (len(hanji) != len(tl2)):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")

    model_filename = "model.db"
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")


    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if not hanji[j] in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
            elif not lomaji[j] in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1

    print(char_to_pronounce)

    for i in char_to_pronounce.keys():
        hanji =  char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))

    #con.commit()
    con.commit()
    con.close()

def convert(sentence):
    pass

parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()
print(args)
if args.genmod == True:
    genmod()
elif args.sentence != None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        print(convert(sentence))
    else:
        print(convert(args.sentence))
else:
    parser.print_help()