mecab porting
This commit is contained in:
parent
a6fa27b659
commit
e566cb2716
3 changed files with 82 additions and 99 deletions
29
README.md
29
README.md
|
@ -1,32 +1,24 @@
|
||||||
# Pseudo Chinese
|
# Pseudo Chinese (with MeCab)
|
||||||
Convert Japanese to pseudo-Chinese.
|
Convert Japanese to pseudo-Chinese.
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
This tool will automatically generate fake Chinese from Japanese sentences.
|
This tool will automatically generate fake Chinese from Japanese sentences.
|
||||||
|
|
||||||
## Demo
|
This version uses MeCab to parse Japanese sentences instead of the COTOHA API.
|
||||||
私は本日定時退社します -> 我本日定時退社也
|
|
||||||
|
|
||||||
私はお酒を飲みたい -> 我飲酒希望
|
## Demo
|
||||||
|
私は本日定時退社します -> 我本日定時退社為
|
||||||
|
|
||||||
|
私はお酒を飲みたい -> 我御酒飲欲
|
||||||
|
|
||||||
## Requirement
|
## Requirement
|
||||||
- Python 3.5.1
|
- Python 3.5.1
|
||||||
- [COTOHA API](https://api.ce-cotoha.com/contents/index.html)
|
- mecab-python3
|
||||||
|
- unidic-lite
|
||||||
You need to register for a COTOHA API account before you can run this tool.
|
|
||||||
|
|
||||||
Once you have registered your COTOHA API account, you will set your Client ID and Client Secret to `env.json` .
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"client_id": "yourclinetid",
|
|
||||||
"client_secret": "yourclinetsecret"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
```
|
```
|
||||||
$ python -u pseudo-chinese.py
|
$ python -u pseudo-chinese.py [sentence]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Contribution
|
## Contribution
|
||||||
|
@ -42,4 +34,5 @@ MIT
|
||||||
|
|
||||||
## Author
|
## Author
|
||||||
|
|
||||||
[Shoichiro Kono](https://github.com/k2font)
|
[Shoichiro Kono](https://github.com/k2font) (original creator)
|
||||||
|
[Tan Kian-ting](https://github.com/yoxem) (ported it to MeCab and modified it)
|
4
env.json
4
env.json
|
@ -1,4 +0,0 @@
|
||||||
{
|
|
||||||
"client_id": "<Client ID>",
|
|
||||||
"client_secret": "<Client Secret>"
|
|
||||||
}
|
|
|
@ -1,48 +1,32 @@
|
||||||
import requests
|
import MeCab
|
||||||
import json
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
BASE_URL = "https://api.ce-cotoha.com/api/dev/nlp/"
|
|
||||||
|
|
||||||
# アクセストークンを取得する関数
|
|
||||||
# Function to get the access token.
|
|
||||||
# 获取访问令牌的函数
|
|
||||||
def auth(client_id, client_secret):
|
|
||||||
token_url = "https://api.ce-cotoha.com/v1/oauth/accesstokens"
|
|
||||||
headers = {
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
"charset": "UTF-8"
|
|
||||||
}
|
|
||||||
|
|
||||||
data = {
|
|
||||||
"grantType": "client_credentials",
|
|
||||||
"clientId": client_id,
|
|
||||||
"clientSecret": client_secret
|
|
||||||
}
|
|
||||||
|
|
||||||
r = requests.post(token_url,headers=headers,data=json.dumps(data))
|
|
||||||
|
|
||||||
return r.json()["access_token"]
|
|
||||||
|
|
||||||
# 形態素解析する関数
|
# 形態素解析する関数
|
||||||
# Function for morphological analysis.
|
# Function for morphological analysis.
|
||||||
# 形态学分析功能
|
# 形态学分析功能
|
||||||
def parse(sentence, access_token):
|
def parse(sentence):
|
||||||
base_url = BASE_URL
|
mecab_tagger = MeCab.Tagger()
|
||||||
|
raw_result = mecab_tagger.parse(sentence).split('\n')
|
||||||
|
result = []
|
||||||
|
for i in raw_result[:-2]:
|
||||||
|
j = i.split('\t')
|
||||||
|
item = dict()
|
||||||
|
item['form'] = j[0] # 食べ
|
||||||
|
print(j)
|
||||||
|
if len(j) > 1:
|
||||||
|
item['lemma'] = j[3] # 食べる
|
||||||
|
item['pos'] = j[4] # 動詞-一般
|
||||||
|
item['features'] = j[6] # 連用形-一般
|
||||||
|
else:
|
||||||
|
item['lemma'] = j[0]
|
||||||
|
item["pos"] = ""
|
||||||
|
item["features"] = ""
|
||||||
|
|
||||||
headers = {
|
result.append(item)
|
||||||
"Content-Type": "application/json",
|
return result
|
||||||
"charset": "UTF-8",
|
|
||||||
"Authorization": "Bearer {}".format(access_token)
|
|
||||||
}
|
|
||||||
|
|
||||||
data = {
|
|
||||||
"sentence": sentence,
|
|
||||||
"type": "default"
|
|
||||||
}
|
|
||||||
|
|
||||||
r = requests.post(base_url + "v1/parse",headers=headers,data=json.dumps(data))
|
|
||||||
return r.json()
|
|
||||||
|
|
||||||
# ひらがなを削除する関数
|
# ひらがなを削除する関数
|
||||||
# Function to delete hiragana.
|
# Function to delete hiragana.
|
||||||
|
@ -51,54 +35,64 @@ def hira_to_blank(str):
|
||||||
return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
|
return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
envjson = open('env.json', 'r')
|
|
||||||
json_load = json.load(envjson)
|
|
||||||
CLIENT_ID = json_load["client_id"]
|
|
||||||
CLIENT_SECRET = json_load["client_secret"]
|
|
||||||
|
|
||||||
document = "私は明日、伊豆大島に行きたい"
|
document = "私は明日、伊豆大島に行きたい"
|
||||||
args = sys.argv
|
args = sys.argv
|
||||||
if len(args) >= 2:
|
if len(args) >= 2:
|
||||||
document = str(args[1])
|
document = str(args[1])
|
||||||
|
|
||||||
access_token = auth(CLIENT_ID, CLIENT_SECRET)
|
parse_document = parse(document)
|
||||||
parse_document = parse(document, access_token)
|
#print(parse_document)
|
||||||
print(parse_document)
|
|
||||||
result_list = list()
|
result_list = list()
|
||||||
for chunks in parse_document['result']:
|
|
||||||
for token in chunks["tokens"]:
|
for token in parse_document:
|
||||||
# 形態素解析結果に置き換えルールを適用する
|
# 形態素解析結果に置き換えルールを適用する
|
||||||
if (token["pos"] != "連用助詞"
|
if (token["pos"] != "助詞-格助詞"
|
||||||
and token["pos"] != "引用助詞"
|
and token["pos"] != "助詞-接続助詞"
|
||||||
and token["pos"] != "終助詞"
|
and token["pos"] != "助詞-終助詞"
|
||||||
and token["pos"] != "接続接尾辞"
|
and token["pos"] != "助詞-接続助詞" ):
|
||||||
and token["pos"] != "動詞活用語尾"):
|
if '終止形-一般' in token["features"]:
|
||||||
if token["pos"] == "動詞接尾辞" and '終止' in token["features"]:
|
if ("為る" in token["lemma"]) or ("ます" in token["lemma"]):
|
||||||
if ("する" in token["lemma"]) or ("ます" in token["lemma"]):
|
prime = "" # don't translate it.
|
||||||
prime = "也"
|
|
||||||
elif "たい" in token["lemma"]:
|
elif "たい" in token["lemma"]:
|
||||||
prime = "希望"
|
prime = "欲"
|
||||||
elif token['lemma'] != 'ない':
|
elif token["lemma"] in ["ない", "無い"]:
|
||||||
|
prime = "無"
|
||||||
|
elif token['lemma'] == 'た':
|
||||||
prime = "了"
|
prime = "了"
|
||||||
else:
|
else:
|
||||||
prime = "実行"
|
prime = token["lemma"]
|
||||||
else:
|
else:
|
||||||
prime = token["form"]
|
prime = token["lemma"]
|
||||||
|
|
||||||
if token['lemma'] == '私':
|
|
||||||
|
if token['lemma'] == '私-代名詞':
|
||||||
prime = '我'
|
prime = '我'
|
||||||
|
|
||||||
if (token['lemma'] == '君' or token['lemma'] == 'あなた' or token['lemma'] == 'お前'):
|
if (token['lemma'] == '君' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):
|
||||||
prime = '你'
|
prime = '你'
|
||||||
|
|
||||||
|
|
||||||
|
if token['lemma'] == '円-助数詞':
|
||||||
|
prime = '円'
|
||||||
|
|
||||||
if len(token["features"]) != 0:
|
if len(token["features"]) != 0:
|
||||||
if "SURU" in token["features"][0] :
|
if "連体形-一般" in token['features']:
|
||||||
prime = "実行"
|
if token['lemma'] == 'ない':
|
||||||
elif "連体" in token['features'][0]:
|
prime = "無之"
|
||||||
prime = "的"
|
else:
|
||||||
elif "疑問符" in token["features"][0]:
|
prime = "之"
|
||||||
prime = "如何?"
|
|
||||||
|
|
||||||
result_list.append(hira_to_blank(prime))
|
result_list.append(hira_to_blank(prime))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if token['lemma'] == 'の' and token['pos'] == "助詞-格助詞":
|
||||||
|
prime = "之"
|
||||||
|
result_list.append(hira_to_blank(prime))
|
||||||
|
if token["form"] == "か" and token['pos'] == '助詞-終助詞':
|
||||||
|
prime = "乎"
|
||||||
|
result_list.append(hira_to_blank(prime))
|
||||||
|
|
||||||
print(''.join(result_list))
|
print(''.join(result_list))
|
||||||
|
|
Loading…
Reference in a new issue