From e566cb2716f67cbbbbebd2785bd00b7ec376f492 Mon Sep 17 00:00:00 2001
From: "Chen, Chien-ting"
Date: Sat, 14 Aug 2021 20:36:09 +0800
Subject: [PATCH] MeCab porting

---
 README.md         |  29 ++++-----
 env.json          |   4 --
 pseudo-chinese.py | 148 ++++++++++++++++++++++------------
 3 files changed, 82 insertions(+), 99 deletions(-)
 delete mode 100644 env.json

diff --git a/README.md b/README.md
index af2ad6f..7f0b974 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,24 @@
-# Pseudo Chinese
+# Pseudo Chinese (with MeCab)
 Convert Japanese to pseudo-Chinese.
 
 ## Description
 This tool will automatically generate fake Chinese from Japanese sentences.
 
-## Demo
-私は本日定時退社します -> 我本日定時退社也
+Uses MeCab to parse Japanese sentences instead of the COTOHA API.
 
-私はお酒を飲みたい -> 我飲酒希望
+## Demo
+私は本日定時退社します -> 我本日定時退社為
+
+私はお酒を飲みたい -> 我御酒飲欲
 
 ## Requirement
 - Python 3.5.1
-- [COTOHA API](https://api.ce-cotoha.com/contents/index.html)
-
-You need to register for a COTOHA API account before you can run this tool.
-
-Once you have registered your COTOHA API account, you will set your Client ID and Client Secret to `env.json` .
-
-```json
-{
-  "client_id": "yourclinetid",
-  "client_secret": "yourclinetsecret"
-}
-```
+- mecab-python3
+- unidic-lite
 
 ## Usage
 ```
-$ python -u pseudo-chinese.py
+$ python -u pseudo-chinese.py [sentence]
 ```
 
 ## Contribution
@@ -42,4 +34,5 @@ MIT
 
 ## Author
 
-[Shoichiro Kono](https://github.com/k2font)
+[Shoichiro Kono](https://github.com/k2font) (original creator)
+[Tan Kian-ting](https://github.com/yoxem) (ported to MeCab and modified)
\ No newline at end of file
diff --git a/env.json b/env.json
deleted file mode 100644
index d7e6573..0000000
--- a/env.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-    "client_id": "",
-    "client_secret": ""
-}
diff --git a/pseudo-chinese.py b/pseudo-chinese.py
index 15566aa..ac4df4a 100644
--- a/pseudo-chinese.py
+++ b/pseudo-chinese.py
@@ -1,104 +1,98 @@
-import requests
-import json
+import MeCab
 import sys
 
-BASE_URL = "https://api.ce-cotoha.com/api/dev/nlp/"
-
-# アクセストークンを取得する関数
-# Function to get the access token.
-# 获取访问令牌的函数
-def auth(client_id, client_secret):
-    token_url = "https://api.ce-cotoha.com/v1/oauth/accesstokens"
-    headers = {
-        "Content-Type": "application/json",
-        "charset": "UTF-8"
-    }
-
-    data = {
-        "grantType": "client_credentials",
-        "clientId": client_id,
-        "clientSecret": client_secret
-    }
-
-    r = requests.post(token_url,headers=headers,data=json.dumps(data))
-
-    return r.json()["access_token"]
 
 # 形態素解析する関数
 # Function for morphological analysis.
 # 形态学分析功能
-def parse(sentence, access_token):
-    base_url = BASE_URL
+def parse(sentence):
+    mecab_tagger = MeCab.Tagger()
+    raw_result = mecab_tagger.parse(sentence).split('\n')
+    result = []
+    for i in raw_result[:-2]:  # drop the trailing "EOS" line and empty string
+        j = i.split('\t')
+        item = dict()
+        item['form'] = j[0]  # surface form, e.g. 食べ
+        if len(j) > 1:
+            item['lemma'] = j[3]  # e.g. 食べる
+            item['pos'] = j[4]  # e.g. 動詞-一般
+            item['features'] = j[6]  # conjugation form, e.g. 連用形-一般
+        else:
+            item['lemma'] = j[0]
+            item['pos'] = ""
+            item['features'] = ""
+
+        result.append(item)
+    return result
 
-    headers = {
-        "Content-Type": "application/json",
-        "charset": "UTF-8",
-        "Authorization": "Bearer {}".format(access_token)
-    }
-
-    data = {
-        "sentence": sentence,
-        "type": "default"
-    }
-
-    r = requests.post(base_url + "v1/parse",headers=headers,data=json.dumps(data))
-    return r.json()
 
 # ひらがなを削除する関数
 # Function to delete hiragana.
 # 删除平假名的功能
 def hira_to_blank(str):
-    return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
+    return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
 
 if __name__ == "__main__":
-    envjson = open('env.json', 'r')
-    json_load = json.load(envjson)
-    CLIENT_ID = json_load["client_id"]
-    CLIENT_SECRET = json_load["client_secret"]
+    document = "私は明日、伊豆大島に行きたい"
 
     args = sys.argv
     if len(args) >= 2:
         document = str(args[1])
 
-    access_token = auth(CLIENT_ID, CLIENT_SECRET)
-    parse_document = parse(document, access_token)
-    print(parse_document)
+    parse_document = parse(document)
 
     result_list = list()
-    for chunks in parse_document['result']:
-        for token in chunks["tokens"]:
-            # 形態素解析結果に置き換えルールを適用する
-            if (token["pos"] != "連用助詞"
-                and token["pos"] != "引用助詞"
-                and token["pos"] != "終助詞"
-                and token["pos"] != "接続接尾辞"
-                and token["pos"] != "動詞活用語尾"):
-                if token["pos"] == "動詞接尾辞" and '終止' in token["features"]:
-                    if ("する" in token["lemma"]) or ("ます" in token["lemma"]):
-                        prime = "也"
-                    elif "たい" in token["lemma"]:
-                        prime = "希望"
-                    elif token['lemma'] != 'ない':
-                        prime = "了"
-                    else:
-                        prime = "実行"
+
+    for token in parse_document:
+        # 形態素解析結果に置き換えルールを適用する
+        # Apply the replacement rules to the morphological analysis result.
+        if (token["pos"] != "助詞-格助詞"
+            and token["pos"] != "助詞-接続助詞"
+            and token["pos"] != "助詞-終助詞"):
+            if '終止形-一般' in token["features"]:
+                if ("為る" in token["lemma"]) or ("ます" in token["lemma"]):
+                    prime = ""  # don't translate it
+                elif "たい" in token["lemma"]:
+                    prime = "欲"
+                elif token["lemma"] in ["ない", "無い"]:
+                    prime = "無"
+                elif token['lemma'] == 'た':
+                    prime = "了"
-                else:
-                    prime = token["form"]
+                else:
+                    prime = token["lemma"]
+            else:
+                prime = token["lemma"]
 
-            if token['lemma'] == '私':
-                prime = '我'
-            if (token['lemma'] == '君' or token['lemma'] == 'あなた' or token['lemma'] == 'お前'):
-                prime = '你'
+            if token['lemma'] == '私-代名詞':
+                prime = '我'
 
-            if len(token["features"]) != 0:
-                if "SURU" in token["features"][0] :
-                    prime = "実行"
-                elif "連体" in token['features'][0]:
-                    prime = "的"
-                elif "疑問符" in token["features"][0]:
-                    prime = "如何?"
+            if (token['lemma'] == '君' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):
+                prime = '你'
 
-            result_list.append(hira_to_blank(prime))
-    print(''.join(result_list))
+            if token['lemma'] == '円-助数詞':
+                prime = '円'
+
+            if len(token["features"]) != 0:
+                if "連体形-一般" in token['features']:
+                    if token['lemma'] == 'ない':
+                        prime = "無之"
+                    else:
+                        prime = "之"
+
+            result_list.append(hira_to_blank(prime))
+
+        if token['lemma'] == 'の' and token['pos'] == "助詞-格助詞":
+            prime = "之"
+            result_list.append(hira_to_blank(prime))
+        if token["form"] == "か" and token['pos'] == '助詞-終助詞':
+            prime = "乎"
+            result_list.append(hira_to_blank(prime))
+
+    print(''.join(result_list))
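
Note for reviewers: below is a minimal sketch of the tab-separated UniDic layout that the new `parse()` indexes into. The field positions (`j[0]` surface, `j[3]` lemma, `j[4]` POS, `j[6]` conjugation form) are an assumption based on the default output of mecab-python3 with unidic-lite, matching the comments in the patch; a different dictionary or a custom output format would shift them, so verify locally before relying on the indices.

```python
import MeCab  # assumes mecab-python3 and unidic-lite are installed

tagger = MeCab.Tagger()
# parse() returns one line per token plus "EOS" and a final empty string,
# hence the [:-2] slice used in the patch.
for line in tagger.parse("私はお酒を飲みたい").split("\n")[:-2]:
    fields = line.split("\t")
    if len(fields) > 6:
        # fields[0]=surface, fields[3]=lemma, fields[4]=POS (e.g. 助詞-格助詞),
        # fields[6]=conjugation form (e.g. 終止形-一般)
        print(fields[0], fields[3], fields[4], fields[6], sep="\t")
    else:
        print(fields[0])  # unknown word: only the surface form is available
```

If the printed POS values do not look like 助詞-格助詞 / 動詞-一般, the replacement rules in the main loop will silently never match, which is the first thing to check if the output stays Japanese.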