mecab porting

Tan, Kian-ting 2021-08-14 20:36:09 +08:00
parent a6fa27b659
commit e566cb2716
3 changed files with 82 additions and 99 deletions

README.md

@@ -1,32 +1,24 @@
-# Pseudo Chinese
+# Pseudo Chinese (with MeCab)
 Convert Japanese to pseudo-Chinese.
 ## Description
 This tool will automatically generate fake Chinese from Japanese sentences.
+Using MeCab to parse Japanese sentences instead of the COTOHA API.
 ## Demo
-私は本日定時退社します -> 我本日定時退社也
-私はお酒を飲みたい -> 我飲酒希望
+私は本日定時退社します -> 我本日定時退社為
+私はお酒を飲みたい -> 我御酒飲欲
 ## Requirement
 - Python 3.5.1
-- [COTOHA API](https://api.ce-cotoha.com/contents/index.html)
-
-You need to register for a COTOHA API account before you can run this tool.
-Once you have registered your COTOHA API account, set your Client ID and Client Secret in `env.json`:
-```json
-{
-    "client_id": "yourclinetid",
-    "client_secret": "yourclinetsecret"
-}
-```
+- mecab-python3
+- unidic-lite
 ## Usage
 ```
-$ python -u pseudo-chinese.py
+$ python -u pseudo-chinese.py [sentence]
 ```
 ## Contribution
@@ -42,4 +34,5 @@ MIT
 ## Author
-[Shoichiro Kono](https://github.com/k2font)
+[Shoichiro Kono](https://github.com/k2font) (original creator)
+[Tan Kian-ting](https://github.com/yoxem) (ported it to MeCab and modified it)
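
For reference, tokenization now comes from mecab-python3 rather than the COTOHA REST API. A minimal sketch of what that looks like, assuming `mecab-python3` and `unidic-lite` are installed (the exact column layout depends on the installed dictionary):

```python
# Minimal sketch: tokenize a sentence with MeCab, as the ported script does.
# Assumes `pip install mecab-python3 unidic-lite`; output columns vary by dictionary.
import MeCab

tagger = MeCab.Tagger()  # picks up unidic-lite as the default dictionary
print(tagger.parse("私はお酒を飲みたい"))
# Each token is one tab-separated line, ending with an EOS marker, e.g. (abridged):
# 私	ワタシ	ワタクシ	私-代名詞	代名詞	...
# EOS
```

With UniDic-style output, the lemma, part-of-speech, and conjugation-form fields sit at tab indexes 3, 4, and 6, which is exactly what `parse()` in pseudo-chinese.py reads below.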

env.json (deleted)

@@ -1,4 +0,0 @@
-{
-    "client_id": "<Client ID>",
-    "client_secret": "<Client Secret>"
-}

pseudo-chinese.py

@@ -1,48 +1,32 @@
-import requests
-import json
+import MeCab
 import sys
 
-BASE_URL = "https://api.ce-cotoha.com/api/dev/nlp/"
-
-# アクセストークンを取得する関数
-# Function to get the access token.
-# 获取访问令牌的函数
-def auth(client_id, client_secret):
-    token_url = "https://api.ce-cotoha.com/v1/oauth/accesstokens"
-    headers = {
-        "Content-Type": "application/json",
-        "charset": "UTF-8"
-    }
-    data = {
-        "grantType": "client_credentials",
-        "clientId": client_id,
-        "clientSecret": client_secret
-    }
-    r = requests.post(token_url, headers=headers, data=json.dumps(data))
-    return r.json()["access_token"]
-
 # 形態素解析する関数
 # Function for morphological analysis.
 # 形态学分析功能
-def parse(sentence, access_token):
-    base_url = BASE_URL
-    headers = {
-        "Content-Type": "application/json",
-        "charset": "UTF-8",
-        "Authorization": "Bearer {}".format(access_token)
-    }
-    data = {
-        "sentence": sentence,
-        "type": "default"
-    }
-    r = requests.post(base_url + "v1/parse", headers=headers, data=json.dumps(data))
-    return r.json()
+def parse(sentence):
+    mecab_tagger = MeCab.Tagger()
+    raw_result = mecab_tagger.parse(sentence).split('\n')
+    result = []
+    for i in raw_result[:-2]:
+        j = i.split('\t')
+        item = dict()
+        item['form'] = j[0]           # surface form, e.g. 食べ
+        print(j)
+        if len(j) > 1:
+            item['lemma'] = j[3]      # lemma, e.g. 食べる
+            item['pos'] = j[4]        # part of speech, e.g. 動詞-一般
+            item['features'] = j[6]   # conjugation form, e.g. 連用形-一般
+        else:
+            item['lemma'] = j[0]
+            item["pos"] = ""
+            item["features"] = ""
+        result.append(item)
+    return result
 
 # ひらがなを削除する関数
 # Function to delete hiragana.
@@ -51,54 +35,64 @@ def hira_to_blank(str):
     return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
 
 if __name__ == "__main__":
-    envjson = open('env.json', 'r')
-    json_load = json.load(envjson)
-    CLIENT_ID = json_load["client_id"]
-    CLIENT_SECRET = json_load["client_secret"]
     document = "私は明日、伊豆大島に行きたい"
     args = sys.argv
     if len(args) >= 2:
         document = str(args[1])
-    access_token = auth(CLIENT_ID, CLIENT_SECRET)
-    parse_document = parse(document, access_token)
-    print(parse_document)
+    parse_document = parse(document)
+    #print(parse_document)
     result_list = list()
-    for chunks in parse_document['result']:
-        for token in chunks["tokens"]:
-            # 形態素解析結果に置き換えルールを適用する (apply the replacement rules)
-            if (token["pos"] != "連用助詞"
-                and token["pos"] != "引用助詞"
-                and token["pos"] != "終助詞"
-                and token["pos"] != "接続接尾辞"
-                and token["pos"] != "動詞活用語尾"):
-                if token["pos"] == "動詞接尾辞" and '終止' in token["features"]:
-                    if ("する" in token["lemma"]) or ("ます" in token["lemma"]):
-                        prime = "也"
-                    elif "たい" in token["lemma"]:
-                        prime = "希望"
-                    elif token['lemma'] != 'ない':
-                        prime = ""
-                    elif token['lemma'] == 'ない':
-                        prime = "無"
-                    else:
-                        prime = "実行"
-                else:
-                    prime = token["form"]
-                if token['lemma'] == '私':
-                    prime = '我'
-                if (token['lemma'] == '君' or token['lemma'] == 'あなた' or token['lemma'] == 'お前'):
-                    prime = '汝'
-                if len(token["features"]) != 0:
-                    if "SURU" in token["features"][0]:
-                        prime = "実行"
-                    elif "連体" in token['features'][0]:
-                        prime = "的"
-                    elif "疑問符" in token["features"][0]:
-                        prime = "如何?"
-                result_list.append(hira_to_blank(prime))
+    for token in parse_document:
+        # 形態素解析結果に置き換えルールを適用する (apply the replacement rules)
+        if (token["pos"] != "助詞-格助詞"
+            and token["pos"] != "助詞-接続助詞"
+            and token["pos"] != "助詞-終助詞"):
+            if '終止形-一般' in token["features"]:
+                if ("為る" in token["lemma"]) or ("ます" in token["lemma"]):
+                    prime = ""  # don't translate it.
+                elif "たい" in token["lemma"]:
+                    prime = "欲"
+                elif token["lemma"] in ["ない", "無い"]:
+                    prime = "無"
+                else:
+                    prime = token["lemma"]
+            else:
+                prime = token["lemma"]
+            if token['lemma'] == '私-代名詞':
+                prime = '我'
+            if (token['lemma'] == '君' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):
+                prime = '汝'
+            if token['lemma'] == '円-助数詞':
+                prime = '円'
+            if len(token["features"]) != 0:
+                if "連体形-一般" in token['features']:
+                    if token['lemma'] == 'ない':
+                        prime = "無之"
+                    else:
+                        prime = "的"
+            result_list.append(hira_to_blank(prime))
+        if token['lemma'] == '' and token['pos'] == "助詞-格助詞":
+            prime = ""
+            result_list.append(hira_to_blank(prime))
+        if token["form"] == "" and token['pos'] == '助詞-終助詞':
+            prime = ""
+            result_list.append(hira_to_blank(prime))
     print(''.join(result_list))
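
As a sanity check on the pipeline's last step, here is a standalone, hedged sketch of the hiragana-stripping helper shown above (same name and character range as in the script):

```python
# Standalone sketch of hira_to_blank from pseudo-chinese.py:
# drop characters in the hiragana range ぁ..ん, keep kanji and everything else.
def hira_to_blank(s):
    return "".join("" if "ぁ" <= ch <= "ん" else ch for ch in s)

print(hira_to_blank("飲みたい"))      # -> 飲
print(hira_to_blank("定時退社為る"))  # -> 定時退社為
```

Together with the particle filtering and lemma substitutions above, this is the step that turns 私はお酒を飲みたい into 我御酒飲欲, as the README demo shows.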