fix 儂->わし bug

This commit is contained in:
Tan, Kian-ting 2021-08-14 23:13:04 +08:00
parent 3eb0440b58
commit bddf12b89c

View file

@ -1,3 +1,4 @@
import functools
import MeCab import MeCab
import sys import sys
import re import re
@ -28,6 +29,24 @@ def parse(sentence):
return result return result
def is_hira(string):
    """Return True when *string* is non-empty and made only of hiragana.

    Args:
        string: A ``str``, or a sequence of strings (each element is
            checked recursively with the same rule).

    Returns:
        ``True`` if every character lies inside the hiragana range,
        ``False`` for empty input or when any character falls outside it.
    """
    if isinstance(string, str):
        # NOTE(review): the two range bounds were unreadable (empty
        # literals) in the reviewed copy of this file, which made the test
        # unsatisfiable.  The Unicode Hiragana block U+3040..U+309F is
        # assumed here -- confirm against the intended range.
        return bool(string) and all(
            "\u3040" <= char <= "\u309f" for char in string
        )

    # Non-string input: treat it as a sequence of strings.  The previous
    # functools.reduce() fold passed its boolean accumulator back into
    # is_hira(), raising TypeError for any input longer than two items;
    # all() evaluates each item independently instead.
    items = list(string)
    return bool(items) and all(is_hira(item) for item in items)
def contain_kanji(str):
    """Return True when *str* contains at least one kanji character.

    A kanji is any character matched by the class ``[一-龯]``.

    Args:
        str: The text to inspect.  (The name shadows the builtin; it is
            kept so that existing keyword callers continue to work.)  A
            sequence of strings is also accepted; each element is searched.

    Returns:
        ``True`` if a kanji is found, ``False`` otherwise (including for
        empty input).  The result is always a plain ``bool``.
    """
    # The previous functools.reduce() fold passed its accumulator (a Match
    # object or None) back into contain_kanji(), so len() raised TypeError
    # for any input longer than two characters.  any() checks each piece
    # independently and stops at the first hit.
    return any(re.search(r"[一-龯]", piece) is not None for piece in str)
# ひらがなを削除する関数 # ひらがなを削除する関数
# Function to delete hiragana. # Function to delete hiragana.
@ -64,10 +83,18 @@ if __name__ == "__main__":
elif token['lemma'] == '': elif token['lemma'] == '':
prime = "" prime = ""
else: else:
prime = token["lemma"] print(is_hira(token['lemma']))
if is_hira(token['lemma']):
prime = token["form"]
else:
prime = token["lemma"]
else: else:
prime = token["lemma"]
if is_hira(token["lemma"]) and contain_kanji(token["form"]):
prime=token["form"]
else:
prime = token["lemma"]
if (token['lemma'] == '' or token['lemma'] == '貴方' or token['lemma'] == 'お前'): if (token['lemma'] == '' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):
@ -90,6 +117,9 @@ if __name__ == "__main__":
prime = "無之" prime = "無之"
else: else:
prime = prime + "" prime = prime + ""
result_list.append(hira_to_blank(prime)) result_list.append(hira_to_blank(prime))