3 changed files with 97 additions and 117 deletions
--- a/README.md
+++ b/README.md
@ -1,24 +1,32 @@
-# Pseudo Chinese (with MeCab)
+# Pseudo Chinese
 Convert Japanese to pseudo-Chinese.

 ## Description
 This tool will automatically generate fake Chinese from Japanese sentences.

-Using MeCab to parse and word-tag Japanese sentences instead of COTOHA API.
-
 ## Demo
-私は本日定時退社します -> 我本日定時退社
+私は本日定時退社します -> 我本日定時退社也

-私はお酒を飲みたい -> 我御酒飲欲
+私はお酒を飲みたい -> 我飲酒希望

 ## Requirement
 - Python 3.5.1
- mecab-python3
- unidic-lite
+- [COTOHA API](https://api.ce-cotoha.com/contents/index.html)
+
+You need to register for a COTOHA API account before you can run this tool.
+
+Once you have registered your COTOHA API account, set your Client ID and Client Secret in `env.json`.
+
+```json
+{
+	"client_id": "yourclientid",
+	"client_secret": "yourclientsecret"
+}
+```

 ## Usage
 ```
-$ python -u pseudo-chinese.py [sentence]
+$ python -u pseudo-chinese.py [sentence]
 ```

 ## Contribution
@ -34,5 +42,4 @@ MIT

 ## Author

- [Shoichiro Kono](https://github.com/k2font) (orig. creater)
- [Tan Kian-ting](https://github.com/yoxem) (porting to MeCab, and modified it.)
+[Shoichiro Kono](https://github.com/k2font)
--- a/env.json
+++ b/env.json
@ -0,0 +1,4 @@
+{
+	"client_id": "<Client ID>",
+	"client_secret": "<Client Secret>"
+}
--- a/pseudo-chinese.py
+++ b/pseudo-chinese.py
@ -1,52 +1,48 @@
-import functools
-import MeCab
+import requests
+import json
 import sys
-import re

+BASE_URL = "https://api.ce-cotoha.com/api/dev/nlp/"
+
+# アクセストークンを取得する関数
+# Function to get the access token.
+# 获取访问令牌的函数
+def auth(client_id, client_secret):
+	token_url = "https://api.ce-cotoha.com/v1/oauth/accesstokens"
+	headers = {
+		"Content-Type": "application/json",
+		"charset": "UTF-8"
+	}
+
+	data = {
+		"grantType": "client_credentials",
+		"clientId": client_id,
+		"clientSecret": client_secret
+	}
+
+	r = requests.post(token_url,headers=headers,data=json.dumps(data))
+
+	return r.json()["access_token"]

 # 形態素解析する関数
 # Function for morphological analysis.
 # 形态学分析功能
-def parse(sentence):
-	mecab_tagger = MeCab.Tagger()
-	raw_result = mecab_tagger.parse(sentence).split('\n')
-	result = []
-	for i in raw_result[:-2]:
-		j = i.split('\t')
-		item = dict()
-		item['form'] = j[0] # 食べ
-		#print(j)
-		if len(j) > 1:
-			item['lemma'] = j[3] # 食べる
-			item['pos'] = j[4] #　動詞-一般
-			item['features'] = j[6] # 連用形-一般
-		else:
-			item['lemma'] = j[0]
-			item["pos"] = ""
-			item["features"] = ""
+def parse(sentence, access_token):
+	base_url = BASE_URL

-		result.append(item)
-	return result
+	headers = {
+		"Content-Type": "application/json",
+		"charset": "UTF-8",
+		"Authorization": "Bearer {}".format(access_token)
+	}
 	
+	data = {
+		"sentence": sentence,
+		"type": "default"
+	}
 	
-def is_hira(string):
-	if isinstance(string, str):
-		string = list(string)
-	if len(string) == 0:
-		return False
-	elif len(string) == 1:
-		return (("ぁ" <= string[0]) and (string[0] <= "ん"))
-	if len(string) > 1:
-		return functools.reduce((lambda x, y: (is_hira(x) and is_hira(y))) , string)
-
-def contain_kanji(str):
-	if len(str) == 0:
-		return False
-	elif len(str) == 1:
-		return re.match(r"[一-龯]", str)
-	if len(str) > 1:
-		return functools.reduce(lambda x, y: contain_kanji(x) or contain_kanji(y) , str)	
-
+	r = requests.post(base_url + "v1/parse",headers=headers,data=json.dumps(data))
+	return r.json()

 # ひらがなを削除する関数
 # Function to delete hiragana.
@ -55,81 +51,54 @@ def hira_to_blank(str):
  return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])

 if __name__ == "__main__":
-	
+	envjson = open('env.json', 'r')
+	json_load = json.load(envjson)
+	CLIENT_ID = json_load["client_id"]
+	CLIENT_SECRET = json_load["client_secret"]

 	document = "私は明日、伊豆大島に行きたい"
 	args = sys.argv
 	if len(args) >= 2:
 		document = str(args[1])

-	parse_document = parse(document)
-	#print(parse_document)
+	access_token = auth(CLIENT_ID, CLIENT_SECRET)
+	parse_document = parse(document, access_token)
+	print(parse_document)
 	result_list = list()
-	
-	for i, token in enumerate(parse_document):
-
+	for chunks in parse_document['result']:
+		for token in chunks["tokens"]:
 			# 形態素解析結果に置き換えルールを適用する
-		if (token["pos"] != "助詞-格助詞" 
-			and token["pos"] != "助詞-接続助詞" 
-			and token["pos"] != "助詞-終助詞" 
-			and token["pos"] != "助詞-接続助詞" ):
-			if '終止形-一般' in token["features"]:
-				if ("為る" in token["lemma"]) or ("ます" in token["lemma"]):
-					prime = "" # don't translate it. 
+			if (token["pos"] != "連用助詞" 
+			and token["pos"] != "引用助詞" 
+			and token["pos"] != "終助詞" 
+			and token["pos"] != "接続接尾辞" 
+			and token["pos"] != "動詞活用語尾"):
+				if token["pos"] == "動詞接尾辞" and '終止' in token["features"]:
+					if ("する" in token["lemma"]) or ("ます" in token["lemma"]):
+						prime = "也"
 					elif "たい" in token["lemma"]:
-					prime = "欲"
-				elif token["lemma"] in ["ない", "無い"]:
-					prime = "無"
-				elif token['lemma'] == 'た':
+						prime = "希望"
+					elif token['lemma'] != 'ない':
 						prime = "了"
 					else:
-					print(is_hira(token['lemma']))
-					if is_hira(token['lemma']):
-						
+						prime = "実行"
+				else:
 					prime = token["form"]
-					else:
-						prime = token["lemma"]
-			else:

-				if is_hira(token["lemma"]) and contain_kanji(token["form"]):
-					prime=token["form"]
-				else:
-					prime = token["lemma"]
-
-
-			if (token['lemma'] == '君' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):
-				prime = '你'
-
-			if token['lemma'] == '為る' and parse_document[i-1]['pos'] == '名詞-普通名詞-サ変可能':
-				prime = ''
-
-
-			compound_matched = re.match("([^-]+)-([^-]+)", token['lemma'])
-			if compound_matched:
-				prime = compound_matched.group(1)
-
-			if token['lemma'] == '私-代名詞':
+				if token['lemma'] == '私':
 					prime = '我'

+				if (token['lemma'] == '君' or token['lemma'] == 'あなた' or token['lemma'] == 'お前'):
+					prime = '你'
+
 				if len(token["features"]) != 0:
-				if "連体形-一般" in token['features']:
-					if token['lemma'] == 'ない':
-						prime = "無之"
-					else:
-						prime = prime + "之"
-			
-			
-
+					if "SURU" in token["features"][0] :
+						prime = "実行"
+					elif "連体" in token['features'][0]:
+						prime = "的"
+					elif "疑問符" in token["features"][0]:
+						prime = "如何?"

 				result_list.append(hira_to_blank(prime))

-		
-
-		if token['lemma'] == 'の' and token['pos'] == "助詞-格助詞":
-			prime = "之"
-			result_list.append(hira_to_blank(prime))
-		if token["form"] == "か" and token['pos'] == '助詞-終助詞':
-			prime = "乎"
-			result_list.append(hira_to_blank(prime))
-
-print(''.join(result_list))
+	print(''.join(result_list))