1. Add change log. 2. Change the converting unit from hanji alone to hanji + lomaji, etc.
parent 9b7d8d4432
commit 9f2836067a
7 changed files with 1030 additions and 266 deletions
corpus/教典例句.csv: 520 changed lines
File diff suppressed because it is too large.
.gitignore: 160 lines (new file)
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
model.db: binary file (not shown)
pakkau.py: 18 changed lines
@@ -7,7 +7,7 @@ import os
 import sqlite3
 from itertools import chain

-model_filename = "model.db"
+model_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model.db")

 def genmod():
     corpus_path = "./corpus/"
@@ -44,7 +44,11 @@ def genmod():

     for i in new_data:
         hanji = i[0]
+
         lomaji = i[1]
+        '''111'''
+        hanji = list(zip(hanji, lomaji))
+        hanji = list(map(lambda x : x[0] + x[1], hanji))
         for j in range(len(i[0])):
             if not hanji[j] in char_to_pronounce:
                 char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
@@ -65,7 +69,7 @@ def genmod():


     for i in new_data:
-        head_hanji = i[0][0]
+        head_hanji = i[0][0]+i[1][0]

         if head_hanji in init_freq:
             init_freq[head_hanji] += 1
@@ -86,7 +90,8 @@ def genmod():
     cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")

     for i in new_data:
-        hanji = i[0]
+        hanji_tmp = list(zip(i[0],i[1]))
+        hanji = list(map(lambda x: x[0]+ x[1], hanji_tmp))
         for j in range(len(i[0])-1):
             this_hanji = hanji[j]
             next_hanji = hanji[j+1]
@@ -111,7 +116,6 @@ def genmod():
 def get_homophones(pron, cur, con):
     homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
     homophones = list(map(lambda x: x[0], homophones_raw))
-
     return homophones

 def convert(sentences):
@@ -171,7 +175,7 @@ def convert_one_sentence(sentence):

         for i in homophones_sequence[0]:
             i_freq = cur.execute('''select initial.freq FROM initial
-            WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]
+            WHERE initial.char = ?''', (i['char'],)).fetchall()[0][0]

             i['prob'] = i_freq / head_freq_total

@@ -268,8 +272,10 @@ on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0]
         current = current_ls[0]["char"]
         prev_char = current_ls[0]["prev_char"]

+    return_result = list(filter(lambda x : x != "", return_result))
+    return_result = list(map(lambda x : x[0] if re.match(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎𪜀-\U0002b73f]', x)
+                                       else x, return_result))

     return return_result

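The core of this commit is the change of the converting unit: genmod() now keys the model on each hanji concatenated with its lomaji reading instead of on the hanji alone, so the same character with different readings becomes different units. A minimal sketch of that pairing step, mirroring the added zip/map lines above (the sample row is invented):

# Illustrative only; follows the new zip/map construction in genmod().
hanji = ["我", "來", "去"]        # characters of one corpus row (invented example)
lomaji = ["guá", "lâi", "khì"]    # their readings

units = list(map(lambda x: x[0] + x[1], zip(hanji, lomaji)))
print(units)  # ['我guá', '來lâi', '去khì'] -- each unit now carries both character and reading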
pakkau.py~: 302 lines (new file)
@@ -0,0 +1,302 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3
from itertools import chain

model_filename = "model.db"

def genmod():
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    for index, row in df.iterrows():
        hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if (len(hanji) != len(tl2)):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")

    #model_filename = "model.db"
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")

    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if not hanji[j] in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
            elif not lomaji[j] in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1

    for i in char_to_pronounce.keys():
        hanji = char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))

    all_chars = char_to_pronounce.keys()
    init_freq = {} #詞kap句開始ê字出現次數
    cur.execute("CREATE TABLE initial(char, freq)")

    for i in new_data:
        head_hanji = i[0][0]

        if head_hanji in init_freq:
            init_freq[head_hanji] += 1
        else:
            init_freq[head_hanji] = 1

    #補字
    min_weight = 0.1

    for i in all_chars:
        if not i in init_freq.keys():
            init_freq[i] = 0.1

    for i in init_freq.keys():
        cur.execute("INSERT INTO initial VALUES(?, ?)", (i, init_freq[i]))

    char_transition = {}
    cur.execute("CREATE TABLE transition(prev_char, next_char, freq)")

    for i in new_data:
        hanji = i[0]
        for j in range(len(i[0])-1):
            this_hanji = hanji[j]
            next_hanji = hanji[j+1]
            if not this_hanji in char_transition:
                char_transition[this_hanji] = {next_hanji : 1}
            elif not next_hanji in char_transition[this_hanji]:
                char_transition[this_hanji][next_hanji] = 1
            else:
                char_transition[this_hanji][next_hanji] += 1

    for i in char_transition.keys():
        next_char = char_transition[i]
        for j in next_char.keys():
            cur.execute("INSERT INTO transition VALUES(?, ?, ?)", (i, j, next_char[j]))

    #get_homophones("lí", cur, con)

    con.commit()
    con.close()

def get_homophones(pron, cur, con):
    homophones_raw = cur.execute("select hanji FROM pronounce where lomaji = ?", (pron, )).fetchall()
    homophones = list(map(lambda x: x[0], homophones_raw))

    return homophones

def convert(sentences):
    splitted = re.split(r'(:?[!?;,.\"\'\(\):])', sentences)
    splitted_cleaned = list(filter(lambda x : x != '', splitted))

    result = list(map(lambda s : convert_one_sentence(s), splitted_cleaned))

    flatten_result = [x for xs in result for xss in xs for x in xss]
    result_string = "".join(flatten_result)

    print(result_string)
    return result_string

def convert_one_sentence(sentence):
    full_width = ["!", "?", ";",":",",","。", "(", ")"]
    half_width = ["!", "?", ";", ":", ",", ".", "(", ")"]

    if len(sentence) == 1:
        for i in range(len(half_width)):
            if sentence[0] == half_width[i]:
                return [[full_width[i]]]

    weight = 2/3

    splitted = re.split(r'(--?|\s+)', sentence)
    filtered = list(filter(lambda x :not re.match(r'(--?|\s+)', x), splitted))
    small_capized = list(map(lambda x : x.lower(), filtered))
    print("======", small_capized)
    con = sqlite3.connect(model_filename)
    cur = con.cursor()

    homophones_sequence_raw = list(map(lambda x : get_homophones(x, con, cur), small_capized))

    homophones_sequence = [list(map (lambda x : {"char": x,
                                                 "prev_char": None,
                                                 "prob" : 1}, i)) for i in homophones_sequence_raw]

    head_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
    INNER JOIN pronounce ON pronounce.hanji = initial.char
    WHERE pronounce.lomaji = ?''', (small_capized[0], )).fetchall()))

    return_result = [None] * len(small_capized)

    if head_freqs == []:
        return_result[0] = filtered[0]
        homophones_sequence[0] = [{"char": filtered[0],
                                   "prev_char": None,
                                   "prob" : 1}]

    else:
        head_freq_total = reduce(lambda x , y : x + y, head_freqs)

        for i in homophones_sequence[0]:
            i_freq = cur.execute('''select initial.freq FROM initial
            WHERE initial.char = ?''', (i['char'])).fetchall()[0][0]

            i['prob'] = i_freq / head_freq_total
            print(i)

    #for i in homophones_sequence[0]:

    print("+++++", return_result)

    if len(small_capized) == 1:
        max_prob = -math.inf
        max_prob_char = None
        for i in homophones_sequence[0]:
            if i['prob'] > max_prob:
                max_prob_char = i['char']
                max_prob = i['prob']

        return_result[0] = max_prob_char

    else:
        for i in range(1,len(small_capized)):
            char_freqs = list(map(lambda x : x[0], cur.execute('''select initial.freq FROM initial
            INNER JOIN pronounce ON pronounce.hanji = initial.char
            WHERE pronounce.lomaji = ?''', (small_capized[i], )).fetchall()))

            if char_freqs == []:
                return_result[i] = filtered[i]
                homophones_sequence[i] = [{"char": filtered[i],
                                           "prev_char": None,
                                           "prob" : 1}]
                prev_char = ""
                max_prob = -math.inf
                for m in homophones_sequence[i-1]:
                    if m['prob'] > max_prob:
                        max_prob = m['prob']
                        prev_char = m['char']
                homophones_sequence[i][0]['prob'] = max_prob
                homophones_sequence[i][0]['prev_char'] = prev_char
            else:
                total_transition_freq = cur.execute('''
                SELECT sum(t.freq)
                FROM transition as t
                INNER JOIN pronounce as p1 ON p1.hanji = t.prev_char
                INNER JOIN pronounce as p2 ON p2.hanji = t.next_char
                where p2.lomaji = ? and p1.lomaji = ?''',
                (small_capized[i], small_capized[i-1])).fetchall()[0][0]
                for j in homophones_sequence[i]:
                    prev_char = None
                    max_prob = -math.inf

                    for k in homophones_sequence[i-1]:
                        k_to_j_freq_raw = cur.execute('''select freq from transition
                        where prev_char = ? and next_char = ? ''', (k["char"], j["char"])).fetchall()
                        if k_to_j_freq_raw == []:
                            den = cur.execute('''
                            SELECT sum(p.freq)
                            FROM pronounce as p
                            inner join pronounce as p2
                            on p.hanji = p2.hanji where p2.lomaji = ?''', (small_capized[i],)).fetchall()[0][0]#分母
                            #分子
                            num = cur.execute(''' SELECT sum(freq) FROM pronounce as p where hanji = ?''', (j["char"],)).fetchall()[0][0]
                            print("+++", num, den)
                            k_to_j_freq = num/den * (1-weight)

                        else:
                            num = k_to_j_freq_raw[0][0]
                            don = total_transition_freq
                            k_to_j_freq =num/don * weight
                        print("k_to_j_fr", k["char"], j["char"], k_to_j_freq)
                        if k_to_j_freq * k["prob"] > max_prob:
                            max_prob = k_to_j_freq * k["prob"]
                            prev_char = k["char"]
                    print("~-~_~-~-~-~-", prev_char, j["char"], max_prob)
                    j["prob"] = max_prob
                    j["prev_char"] = prev_char

    max_prob = -math.inf
    current = ""
    prev_char = ""
    for i in homophones_sequence[len(homophones_sequence)-1]:
        if i["prob"] > max_prob:
            max_prob = i["prob"]
            current = i["char"]
            prev_char = i["prev_char"]

    print("~tail~~", current)
    print(homophones_sequence)
    return_result[len(homophones_sequence)-1] = current

    for i in range(len(homophones_sequence)-2, -1, -1):
        current_ls = list(filter(lambda x : x["char"] == prev_char,
                                 homophones_sequence[i]))
        print(prev_char)
        return_result[i] = prev_char
        current = current_ls[0]["char"]
        prev_char = current_ls[0]["prev_char"]

    print(return_result)

    return return_result


def poj_to_tl(sentence):
    return sentence

parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()

if args.genmod == True:
    genmod()
elif args.sentence != None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        convert(sentence)
    else:
        convert(args.sentence)
else:
    parser.print_help()
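A brief usage note for the script above, inferred from its argparse block (the sample sentence is invented; the output depends on the trained model):

python pakkau.py --genmod                   # rebuild model.db from every .csv under ./corpus/
python pakkau.py "guá lâi khì"              # convert a romanized sentence (POJ is the default --form)
python pakkau.py --form tl "guá lâi khì"    # treat the input as Tâi-lô instead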
test2.py~: 207 lines (new file)
@@ -0,0 +1,207 @@
import re
import pandas as pd
import math
from functools import reduce

df1 = pd.read_csv('教典例句.csv', header=0, names=['漢字', '羅馬字'])
df2 = pd.read_csv('教典發音詞.csv',header=0, names=['漢字', '羅馬字'])

df = pd.concat([df1, df2]) # combine 2 csv dataframe

df['羅馬字'] = df['羅馬字'].str.lower()

new_data = []

for index, row in df.iterrows():
    hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['漢字'])))
    tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['羅馬字'])
    tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
    new_data.append((hanji, tl2))
    #if (len(hanji) != len(tl2)):
        #print(tl2, hanji)
    #print(tl2, hanji)

# char-To-Pronounciation Prossibility dict

char_to_pronounce = {}

for i in new_data:
    hanji = i[0]
    lomaji = i[1]
    for j in range(len(i[0])):
        if not hanji[j] in char_to_pronounce:
            char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
        elif not lomaji[j] in char_to_pronounce[hanji[j]]:
            char_to_pronounce[hanji[j]][lomaji[j]] = 1
        else:
            char_to_pronounce[hanji[j]][lomaji[j]] += 1

for char, char_reading in char_to_pronounce.items():
    total_count = reduce((lambda x, y : x + y), list(char_reading.values()))

    for i in char_reading.keys():
        char_reading[i] = char_reading[i] / float(total_count)

#print(char_to_pronounce)

all_chars = char_to_pronounce.keys()

'''{'提': 45, '宋': 7, '完': 18, '刻': 7, '局': 9,
'巡': 8, '畫': 25, '青': 56, '尪': 13}'''
init_freq = {} #詞kap句開始ê字出現次數

for i in new_data:
    head_hanji = i[0][0]

    if head_hanji in init_freq:
        init_freq[head_hanji] += 1
    else:
        init_freq[head_hanji] = 1

#補字
min_weight = 0.1

for i in all_chars:
    if not i in init_freq.keys():
        init_freq[i] = 0.1

#print(init_freq)

# probability of P(next=c2|this=c1)
char_transition = {}

for i in new_data:
    hanji = i[0]
    for j in range(len(i[0])-1):
        this_hanji = hanji[j]
        next_hanji = hanji[j+1]
        if not this_hanji in char_transition:
            char_transition[this_hanji] = {next_hanji : 1}
        elif not next_hanji in char_transition[this_hanji]:
            char_transition[this_hanji][next_hanji] = 1
        else:
            char_transition[this_hanji][next_hanji] += 1

#print(char_transition)

#補字
for i in all_chars:
    if not i in char_transition.keys():
        char_transition[i] = {}
        for j in all_chars:
            char_transition[i][j] = init_freq[j]
    else:
        pass

for i in char_transition.keys():
    for j in all_chars:
        if not j in char_transition[i].keys():
            char_transition[i][j] = min_weight * (0.03+math.log(init_freq[j]))

for char, next_char in char_transition.items():
    total_count = 0
    [total_count := total_count + x for x in list(next_char.values())]

    for i in next_char.keys():
        next_char[i] = next_char[i] / float(total_count)

def get_homophones(pron):
    homophones = []
    for i in char_to_pronounce.keys():
        if pron in char_to_pronounce[i].keys():
            homophones.append(i)
        else:
            pass

    return homophones

input_lomaji = ["guá", "kap", "tshit", "á", "lâi", "khì", "tâi", "tiong", "tshit", "thô", "sūn", "suà", "tsē", "ko", "thih"]

char_candidates = []

for i in input_lomaji:
    homophones = list(map(lambda x : {"char": x,
                                      "prev_char": None,
                                      "prob" : None}, # probibility
                          get_homophones(i)))
    char_candidates.append(homophones)

#print(char_candidates)
def get_max_prob(input_lmj, char_cand):
    for i in range(len(input_lmj)):
        if i == 0:
            for j in char_cand[i]:
                init_freq_sum = reduce(lambda x, y : x + y,
                                       list(
                                           map(lambda x : init_freq[x["char"]] ,
                                               char_cand[0])))
                print(init_freq_sum)
                ch = j["char"]
                init_to_char_prob = init_freq[ch] / init_freq_sum # get the ratio
                char_reading_prob = char_to_pronounce[ch][input_lmj[0]]

                j["prob"] = init_to_char_prob * char_reading_prob

            result = ""
            max_num = -math.inf

            for i in char_cand[0]:
                if i["prob"] >= max_num:
                    max_num = i["prob"]
                    result = i["char"]

            #print(result)
        else:
            for j in char_cand[i]:
                prob = -math.inf
                prev_char = ""
                for k in char_cand[i-1]:
                    k_prob = k["prob"]
                    #print(k["char"], "k_prob:", k_prob)
                    k_to_j_prob = char_transition[k["char"]][j["char"]]
                    #print(k["char"], "->",j["char"] ,"k_to_j_prob:", k_to_j_prob)
                    j_to_pron_prob = char_to_pronounce[j["char"]][input_lmj[i]]
                    total_tmp_prob = k_prob * k_to_j_prob * j_to_pron_prob
                    if prob < total_tmp_prob:
                        prob = total_tmp_prob
                        prev_char = k

                j["prev_char"] = prev_char["char"]
                j["prob"] = prob

    real_last_char = ""
    prev_char = ""
    prob = -math.inf
    for i in char_cand[-1]:
        if i["prob"] > prob:
            prob = i["prob"]
            real_last_char = i["char"]
            prev_char = i["prev_char"]

    print(real_last_char)

    result_hanji = [real_last_char]
    for i in range(len(input_lmj)-2, -1, -1):
        current = list(filter(lambda x : x["char"] == prev_char,
                              char_cand[i]))[0]
        result_hanji.append(current["char"])
        prev_char = current["prev_char"]

    result_hanji.reverse()

    result_hanji_string = "".join(result_hanji)
    print("輸入ê羅馬字陣列(array):", input_lomaji)
    print("輸出ê漢字:", result_hanji_string)


get_max_prob(input_lomaji, char_candidates)
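test2.py~ is the in-memory prototype of the decoder: at each position it keeps, per homophone candidate, the best probability of reaching it plus a back-pointer, i.e. a Viterbi-style recurrence as in get_max_prob() above. A stripped-down, self-contained sketch of that recurrence with invented toy tables (not the real corpus statistics):

# Toy Viterbi-style sketch; all candidates and numbers are made up for illustration.
candidates = [["我", "餓"], ["來", "內"]]            # homophone candidates per position
emit = {("我", 0): 0.7, ("餓", 0): 0.3,              # P(candidate char | reading at position)
        ("來", 1): 0.8, ("內", 1): 0.2}
trans = {("我", "來"): 0.6, ("我", "內"): 0.1,        # P(next char | previous char)
         ("餓", "來"): 0.2, ("餓", "內"): 0.1}

best = [{c: (emit[(c, 0)], None) for c in candidates[0]}]   # (probability, back-pointer)
for i in range(1, len(candidates)):
    layer = {}
    for c in candidates[i]:
        # keep the previous candidate that maximizes prob * transition * emission
        prev, score = max(((p, best[i - 1][p][0] * trans[(p, c)] * emit[(c, i)])
                           for p in candidates[i - 1]), key=lambda t: t[1])
        layer[c] = (score, prev)
    best.append(layer)

last = max(best[-1], key=lambda c: best[-1][c][0])           # best final candidate
path = [last]
for i in range(len(candidates) - 1, 0, -1):                  # follow back-pointers
    path.append(best[i][path[-1]][1])
print("".join(reversed(path)))   # -> 我來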
test3.py~: 89 lines (new file)
@@ -0,0 +1,89 @@
import re
import pandas as pd
import math
from functools import reduce
import argparse
import os
import sqlite3

def genmod():
    corpus_path = "./corpus/"
    df_list = []
    for file in os.listdir(corpus_path):
        if file.endswith(".csv"):
            df = pd.read_csv(corpus_path+file, header=0, names=['hanji', 'lomaji'])
            df_list.append(df)
    df = pd.concat(df_list)
    df['lomaji'] = df['lomaji'].str.lower()

    new_data = []

    for index, row in df.iterrows():
        hanji = list(filter(lambda x : re.match("[^、();:,。!?「」『』]", x), list(row['hanji'])))
        tl = re.split(r'(:?[!?;,.\"\'\(\):]|[-]+|\s+)', row['lomaji'])
        tl2 = list(filter(lambda x : re.match(r"([^\(\)^!:?; \'\",.\-\u3000])", x), tl))
        new_data.append((hanji, tl2))
        if (len(hanji) != len(tl2)):
            raise ValueError(f"length of hanji {hanji} is different from romaji {tl2}.")

    model_filename = "model.db"
    try:
        os.remove(model_filename)
    except OSError:
        pass

    con = sqlite3.connect(model_filename)
    cur = con.cursor()
    cur.execute("CREATE TABLE pronounce(hanji, lomaji, freq)")

    char_to_pronounce = {}

    for i in new_data:
        hanji = i[0]
        lomaji = i[1]
        for j in range(len(i[0])):
            if not hanji[j] in char_to_pronounce:
                char_to_pronounce[hanji[j]] = {lomaji[j] : 1}
            elif not lomaji[j] in char_to_pronounce[hanji[j]]:
                char_to_pronounce[hanji[j]][lomaji[j]] = 1
            else:
                char_to_pronounce[hanji[j]][lomaji[j]] += 1

    print(char_to_pronounce)

    for i in char_to_pronounce.keys():
        hanji = char_to_pronounce[i]
        for j in hanji.keys():
            cur.execute("INSERT INTO pronounce VALUES(?, ?, ?)", (i,j, hanji[j]))

    #con.commit()
    con.commit()
    con.close()

def convert(sentence):
    pass

parser = argparse.ArgumentParser()
parser.add_argument('--genmod', help='generate the model', action='store_true',
                    required=False,)

parser.add_argument('sentence', metavar='SENTENCE', nargs='?',
                    help='the sentence to be converted')
parser.add_argument('--form', metavar='FORM', choices=["poj", "tl"], nargs=1,
                    default=['poj'],
                    help='the orthography to be used (poj or tl). Default is poj.')

args = parser.parse_args()
print(args)
if args.genmod == True:
    genmod()
elif args.sentence != None:
    if args.form == ['poj']:
        sentence = poj_to_tl(args.sentence)
        print(convert(sentence))
    else:
        print(convert(args.sentence))
else:
    parser.print_help()
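test3.py~ is an earlier prototype that only builds the pronounce table. Once model.db exists, homophone lookup is a single parameterized query, as in get_homophones() in pakkau.py; a minimal standalone sketch (the reading "guá" is only an example, and the counts depend on the corpus):

# Lookup sketch against the pronounce(hanji, lomaji, freq) table created above.
import sqlite3

con = sqlite3.connect("model.db")
cur = con.cursor()
rows = cur.execute("SELECT hanji, freq FROM pronounce WHERE lomaji = ?", ("guá",)).fetchall()
print(rows)   # e.g. [('我', 123), ...]
con.close()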