pseudo-chinese/pseudo-chinese.py

106 lines
2.6 KiB
Python
Raw Normal View History

2021-08-14 20:36:09 +08:00
import MeCab
2020-04-14 18:57:50 +08:00
import sys
2021-08-14 21:26:32 +08:00
import re
2020-04-14 18:57:50 +08:00
# 形態素解析する関数
# Function for morphological analysis.
# 形态学分析功能
2021-08-14 20:36:09 +08:00
def parse(sentence):
    """Run MeCab over *sentence* and return one dict per output line.

    Each returned dict has four string keys:

    * ``form``     - first tab-separated field of the line (e.g. 食べ)
    * ``lemma``    - field 3 (e.g. 食べる)
    * ``pos``      - field 4 (e.g. 動詞-一般)
    * ``features`` - field 6 (e.g. 連用形-一般)

    Lines that do not carry enough fields fall back to ``lemma == form``
    with empty ``pos`` and ``features``.

    NOTE(review): the field positions assume a tab-separated dictionary
    output format (it looks like a UniDic-style layout) -- confirm against
    the dictionary actually installed.
    """
    mecab_tagger = MeCab.Tagger()
    raw_result = mecab_tagger.parse(sentence).split('\n')
    result = []
    # The last two entries are dropped; presumably they are MeCab's "EOS"
    # marker and the empty string after the final newline -- TODO confirm.
    for line in raw_result[:-2]:
        fields = line.split('\t')
        item = dict()
        item['form'] = fields[0]
        # Index 6 is the highest one read below, so require at least seven
        # fields; a shorter line would otherwise raise IndexError.
        if len(fields) > 6:
            item['lemma'] = fields[3]
            item['pos'] = fields[4]
            item['features'] = fields[6]
        else:
            item['lemma'] = fields[0]
            item["pos"] = ""
            item["features"] = ""
        result.append(item)
    return result
2020-04-14 18:57:50 +08:00
# ひらがなを削除する関数
# Function to delete hiragana.
# 删除平假名的功能
2020-04-14 19:14:30 +08:00
def hira_to_blank(str):
    """Return *str* with every hiragana letter removed.

    Characters in the range U+3041 (small a) through U+3096 (small ke) are
    dropped; everything else (kanji, katakana, punctuation, ASCII) is kept
    in its original order.  An empty input yields an empty string.

    The parameter name shadows the ``str`` builtin; it is kept unchanged so
    that existing keyword callers keep working.
    """
    # Explicit escapes are used for the range bounds so the comparison does
    # not depend on the source file's encoding surviving copy/paste intact.
    return "".join(ch for ch in str if not ("\u3041" <= ch <= "\u3096"))
2020-04-14 18:57:50 +08:00
if __name__ == "__main__":
    # Script entry point: analyse a sentence with parse(), apply the
    # replacement rules below token by token, strip hiragana from every
    # emitted piece and print the concatenated result.
    #
    # NOTE(review): several string literals in this block are empty. Some of
    # them may originally have held a single character that was lost when
    # the file was copied -- confirm against the upstream source before
    # relying on the rules marked below.

    # Default sample sentence; the first command-line argument overrides it.
    document = "私は明日、伊豆大島に行きたい"
    args = sys.argv
    if len(args) >= 2:
        document = args[1]

    parse_document = parse(document)

    # Particle classes that the general rules never copy to the output.
    skipped_pos = ("助詞-格助詞", "助詞-接続助詞", "助詞-終助詞")

    result_list = []

    for i, token in enumerate(parse_document):
        lemma = token["lemma"]
        pos = token["pos"]
        features = token["features"]

        # Apply the replacement rules to the morphological analysis result.
        if pos not in skipped_pos:
            if '終止形-一般' in features:
                if ("為る" in lemma) or ("ます" in lemma):
                    prime = ""  # don't translate it.
                elif "たい" in lemma:
                    prime = ""  # NOTE(review): empty replacement -- confirm
                elif lemma in ["ない", "無い"]:
                    prime = ""  # NOTE(review): empty replacement -- confirm
                elif lemma == '':
                    prime = ""
                else:
                    prime = lemma
            else:
                prime = lemma

            # NOTE(review): the first alternative is an empty literal.
            if lemma in ('', '貴方', 'お前'):
                prime = ''

            # Drop 為る when it directly follows a noun tagged as usable with
            # it.  The first token has no predecessor; without the i > 0
            # guard, index -1 would silently look at the LAST token instead.
            if (lemma == '為る' and i > 0
                    and parse_document[i - 1]['pos'] == '名詞-普通名詞-サ変可能'):
                prime = ''

            # Lemmas of the form "<head>-<tag>" keep only the head part.
            compound_matched = re.match("([^-]+)-([^-]+)", lemma)
            if compound_matched:
                prime = compound_matched.group(1)

            # Must run after the compound rule so that it wins for this lemma.
            if lemma == '私-代名詞':
                prime = ''  # NOTE(review): empty replacement -- confirm

            if "連体形-一般" in features:
                if lemma == 'ない':
                    prime = "無之"
                else:
                    # NOTE(review): appends an empty suffix (a no-op as
                    # written) -- a one-character suffix may be missing.
                    prime = prime + ""

            result_list.append(hira_to_blank(prime))

        # Rules for particles that the branch above skips.
        # NOTE(review): the compared and emitted literals are empty here too.
        if lemma == '' and pos == "助詞-格助詞":
            prime = ""
            result_list.append(hira_to_blank(prime))
        if token["form"] == "" and pos == '助詞-終助詞':
            prime = ""
            result_list.append(hira_to_blank(prime))

    print(''.join(result_list))