pseudo-chinese/pseudo-chinese.py

import MeCab
import sys
import re


# 形態素解析する関数
# Function for morphological analysis.
# 形态学分析功能
def parse(sentence):
	"""Run MeCab over ``sentence`` and return one dict per output row.

	Each returned dict has four string entries, taken from the
	tab-separated columns of a MeCab output row:

	* ``form``     -- column 0 (e.g. 食べ)
	* ``lemma``    -- column 3 (e.g. 食べる)
	* ``pos``      -- column 4 (e.g. 動詞-一般)
	* ``features`` -- column 6 (e.g. 連用形-一般)

	Rows that do not carry all seven columns fall back to
	``lemma = form`` with empty ``pos`` and ``features`` instead of
	raising ``IndexError``.

	NOTE(review): the column positions presumably match the output format
	of the installed MeCab dictionary -- confirm against the dictionary
	actually in use.
	"""
	mecab_tagger = MeCab.Tagger()
	raw_result = mecab_tagger.parse(sentence).split('\n')
	result = []
	# The last two pieces of the split output are skipped; presumably the
	# "EOS" marker and the empty string after the final newline -- TODO confirm.
	for line in raw_result[:-2]:
		columns = line.split('\t')
		item = dict()
		item['form'] = columns[0]
		# Column 6 is the highest index read below, so require seven columns
		# before indexing; shorter rows use the fallback branch.
		if len(columns) > 6:
			item['lemma'] = columns[3]
			item['pos'] = columns[4]
			item['features'] = columns[6]
		else:
			item['lemma'] = columns[0]
			item["pos"] = ""
			item["features"] = ""

		result.append(item)
	return result


# ひらがなを削除する関数
# Function to delete hiragana.
# 删除平假名的功能
def hira_to_blank(str):
	"""Return ``str`` with every hiragana letter removed.

	A character is treated as hiragana when its code point lies in
	U+3041 (ぁ) .. U+3096 (ゖ), so the rarely used letters ゔ, ゕ and ゖ
	are stripped as well. All other characters are kept, in their
	original order.
	"""
	# NOTE: the parameter name shadows the built-in ``str``; it is kept so
	# that callers passing it by keyword keep working.
	return "".join(ch for ch in str if not ("\u3041" <= ch <= "\u3096"))

# Part-of-speech tags whose tokens are skipped by the general replacement
# rules. The particles "の" and "か" are still emitted by dedicated rules in
# _convert_token.
# NOTE(review): the original condition listed "助詞-接続助詞" twice; a
# different particle category may have been intended for one of them.
_SKIPPED_PARTICLE_POS = ("助詞-格助詞", "助詞-接続助詞", "助詞-終助詞")


def _convert_token(tokens, index):
	"""Return the list of output strings produced for ``tokens[index]``.

	``tokens`` is the list of dicts returned by ``parse``; the whole list is
	needed because one rule looks at the preceding token. The result holds
	zero, one or two strings.
	"""
	token = tokens[index]
	lemma = token["lemma"]
	features = token["features"]
	pieces = []

	if token["pos"] not in _SKIPPED_PARTICLE_POS:
		# Default: keep the lemma; hiragana is stripped at the end.
		prime = lemma
		if "終止形-一般" in features:
			if "為る" in lemma or "ます" in lemma:
				prime = ""  # don't translate it.
			elif "たい" in lemma:
				prime = "欲"
			elif lemma in ("ない", "無い"):
				prime = "無"
			elif lemma == "た":
				prime = "了"

		if lemma in ("君", "貴方", "お前"):
			prime = "你"

		# Drop 為る when it directly follows a suru-capable noun. The
		# ``index > 0`` guard keeps the first token from being compared
		# against tokens[-1] (the last token of the sentence).
		if (lemma == "為る" and index > 0
				and tokens[index - 1]["pos"] == "名詞-普通名詞-サ変可能"):
			prime = ""

		# Lemmas of the form "head-suffix" are reduced to the part before
		# the first hyphen; this overrides the choices made above.
		compound_matched = re.match("([^-]+)-([^-]+)", lemma)
		if compound_matched:
			prime = compound_matched.group(1)

		if lemma == "私-代名詞":
			prime = "我"

		if "連体形-一般" in features:
			if lemma == "ない":
				prime = "無之"
			else:
				prime = prime + "之"

		pieces.append(hira_to_blank(prime))

	# Dedicated particle rules; these apply to tokens skipped above.
	if lemma == "の" and token["pos"] == "助詞-格助詞":
		pieces.append("之")
	if token["form"] == "か" and token["pos"] == "助詞-終助詞":
		pieces.append("乎")

	return pieces


def _to_pseudo_chinese(document):
	"""Parse ``document`` and return the joined pseudo-Chinese string."""
	tokens = parse(document)
	pieces = []
	for index in range(len(tokens)):
		pieces.extend(_convert_token(tokens, index))
	return "".join(pieces)


if __name__ == "__main__":
	# Built-in sample sentence; replaced by the first command-line argument
	# when one is given.
	document = "私は明日、伊豆大島に行きたい"
	if len(sys.argv) >= 2:
		document = sys.argv[1]

	# Printing stays inside the guard so that importing this module neither
	# prints nor fails on names that only exist when run as a script.
	print(_to_pseudo_chinese(document))
mecab porting 2021-08-14 20:36:09 +08:00			`import MeCab`
First Commit. 2020-04-14 18:57:50 +08:00			`import sys`
fix bugs 2021-08-14 21:26:32 +08:00			`import re`
First Commit. 2020-04-14 18:57:50 +08:00

			`# 形態素解析する関数`
			`# Function for morphological analysis.`
			`# 形态学分析功能`
mecab porting 2021-08-14 20:36:09 +08:00			`def parse(sentence):`
			`mecab_tagger = MeCab.Tagger()`
			`raw_result = mecab_tagger.parse(sentence).split('\n')`
			`result = []`
			`for i in raw_result[:-2]:`
			`j = i.split('\t')`
			`item = dict()`
			`item['form'] = j[0] # 食べ`
hide debug messages 2021-08-14 20:49:54 +08:00			`#print(j)`
mecab porting 2021-08-14 20:36:09 +08:00			`if len(j) > 1:`
			`item['lemma'] = j[3] # 食べる`
			`item['pos'] = j[4] #　動詞-一般`
			`item['features'] = j[6] # 連用形-一般`
			`else:`
			`item['lemma'] = j[0]`
			`item["pos"] = ""`
			`item["features"] = ""`

			`result.append(item)`
			`return result`


First Commit. 2020-04-14 18:57:50 +08:00
			`# ひらがなを削除する関数`
			`# Function to delete hiragana.`
			`# 删除平假名的功能`
Create README.md 2020-04-14 19:14:30 +08:00			`def hira_to_blank(str):`
mecab porting 2021-08-14 20:36:09 +08:00			`return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])`
First Commit. 2020-04-14 18:57:50 +08:00
			`if __name__ == "__main__":`
mecab porting 2021-08-14 20:36:09 +08:00
First Commit. 2020-04-14 18:57:50 +08:00
			`document = "私は明日、伊豆大島に行きたい"`
			`args = sys.argv`
			`if len(args) >= 2:`
			`document = str(args[1])`

mecab porting 2021-08-14 20:36:09 +08:00			`parse_document = parse(document)`
			`#print(parse_document)`
First Commit. 2020-04-14 18:57:50 +08:00			`result_list = list()`
mecab porting 2021-08-14 20:36:09 +08:00
fix bugs 2021-08-14 21:26:32 +08:00			`for i, token in enumerate(parse_document):`

mecab porting 2021-08-14 20:36:09 +08:00			`# 形態素解析結果に置き換えルールを適用する`
			`if (token["pos"] != "助詞-格助詞"`
			`and token["pos"] != "助詞-接続助詞"`
			`and token["pos"] != "助詞-終助詞"`
			`and token["pos"] != "助詞-接続助詞" ):`
			`if '終止形-一般' in token["features"]:`
			`if ("為る" in token["lemma"]) or ("ます" in token["lemma"]):`
			`prime = "" # don't translate it.`
			`elif "たい" in token["lemma"]:`
			`prime = "欲"`
			`elif token["lemma"] in ["ない", "無い"]:`
			`prime = "無"`
			`elif token['lemma'] == 'た':`
			`prime = "了"`
Fix some indent. 2020-04-14 19:18:21 +08:00			`else:`
mecab porting 2021-08-14 20:36:09 +08:00			`prime = token["lemma"]`
			`else:`
			`prime = token["lemma"]`


Fix some indent. 2020-04-14 19:18:21 +08:00
mecab porting 2021-08-14 20:36:09 +08:00			`if (token['lemma'] == '君' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):`
			`prime = '你'`

fix bugs 2021-08-14 21:26:32 +08:00			`if token['lemma'] == '為る' and parse_document[i-1]['pos'] == '名詞-普通名詞-サ変可能':`
			`prime = ''`

mecab porting 2021-08-14 20:36:09 +08:00
fix bugs 2021-08-14 21:26:32 +08:00			`compound_matched = re.match("([^-]+)-([^-]+)", token['lemma'])`
			`if compound_matched:`
			`prime = compound_matched.group(1)`
mecab porting 2021-08-14 20:36:09 +08:00
fix 私->我 2021-08-14 21:42:26 +08:00			`if token['lemma'] == '私-代名詞':`
			`prime = '我'`

mecab porting 2021-08-14 20:36:09 +08:00			`if len(token["features"]) != 0:`
			`if "連体形-一般" in token['features']:`
			`if token['lemma'] == 'ない':`
			`prime = "無之"`
			`else:`
fix bugs 2021-08-14 21:26:32 +08:00			`prime = prime + "之"`
Fix some indent. 2020-04-14 19:18:21 +08:00
mecab porting 2021-08-14 20:36:09 +08:00			`result_list.append(hira_to_blank(prime))`
Fix some indent. 2020-04-14 19:18:21 +08:00
mecab porting 2021-08-14 20:36:09 +08:00
Fix some indent. 2020-04-14 19:18:21 +08:00
mecab porting 2021-08-14 20:36:09 +08:00			`if token['lemma'] == 'の' and token['pos'] == "助詞-格助詞":`
			`prime = "之"`
			`result_list.append(hira_to_blank(prime))`
			`if token["form"] == "か" and token['pos'] == '助詞-終助詞':`
			`prime = "乎"`
			`result_list.append(hira_to_blank(prime))`
First Commit. 2020-04-14 18:57:50 +08:00
mecab porting 2021-08-14 20:36:09 +08:00			`print(''.join(result_list))`