From e566cb2716f67cbbbbebd2785bd00b7ec376f492 Mon Sep 17 00:00:00 2001
From: "Chen, Chien-ting"
Date: Sat, 14 Aug 2021 20:36:09 +0800
Subject: [PATCH] MeCab porting

---
 README.md         |  29 ++++-----
 env.json          |   4 --
 pseudo-chinese.py | 148 ++++++++++++++++++++++------------
 3 files changed, 82 insertions(+), 99 deletions(-)
 delete mode 100644 env.json

diff --git a/README.md b/README.md
index af2ad6f..7f0b974 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,24 @@
-# Pseudo Chinese
+# Pseudo Chinese (with MeCab)
 Convert Japanese to pseudo-Chinese.
 
 ## Description
 This tool will automatically generate fake Chinese from Japanese sentences.
 
-## Demo
-私は本日定時退社します -> 我本日定時退社也
+Uses MeCab to parse Japanese sentences instead of the COTOHA API.
 
-私はお酒を飲みたい -> 我飲酒希望
+## Demo
+私は本日定時退社します -> 我本日定時退社為
+
+私はお酒を飲みたい -> 我御酒飲欲
 
 ## Requirement
 - Python 3.5.1
-- [COTOHA API](https://api.ce-cotoha.com/contents/index.html)
-
-You need to register for a COTOHA API account before you can run this tool.
-
-Once you have registered your COTOHA API account, you will set your Client ID and Client Secret to `env.json` .
-
-```json
-{
-  "client_id": "yourclinetid",
-  "client_secret": "yourclinetsecret"
-}
-```
+- mecab-python3
+- unidic-lite
 
 ## Usage
 ```
-$ python -u pseudo-chinese.py
+$ python -u pseudo-chinese.py [sentence]
 ```
 
 ## Contribution
@@ -42,4 +34,5 @@ MIT
 
 ## Author
 
-[Shoichiro Kono](https://github.com/k2font)
+[Shoichiro Kono](https://github.com/k2font) (original creator)
+[Tan Kian-ting](https://github.com/yoxem) (ported to MeCab and modified)
\ No newline at end of file
diff --git a/env.json b/env.json
deleted file mode 100644
index d7e6573..0000000
--- a/env.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-    "client_id": "",
-    "client_secret": ""
-}
diff --git a/pseudo-chinese.py b/pseudo-chinese.py
index 15566aa..ac4df4a 100644
--- a/pseudo-chinese.py
+++ b/pseudo-chinese.py
@@ -1,104 +1,98 @@
-import requests
-import json
+import MeCab
 import sys
 
-BASE_URL = "https://api.ce-cotoha.com/api/dev/nlp/"
-
-# アクセストークンを取得する関数
-# Function to get the access token.
-# 获取访问令牌的函数
-def auth(client_id, client_secret):
-    token_url = "https://api.ce-cotoha.com/v1/oauth/accesstokens"
-    headers = {
-        "Content-Type": "application/json",
-        "charset": "UTF-8"
-    }
-
-    data = {
-        "grantType": "client_credentials",
-        "clientId": client_id,
-        "clientSecret": client_secret
-    }
-
-    r = requests.post(token_url,headers=headers,data=json.dumps(data))
-
-    return r.json()["access_token"]
 
 # 形態素解析する関数
 # Function for morphological analysis.
 # 形态学分析功能
-def parse(sentence, access_token):
-    base_url = BASE_URL
+def parse(sentence):
+    mecab_tagger = MeCab.Tagger()
+    raw_result = mecab_tagger.parse(sentence).split('\n')
+    result = []
+    for i in raw_result[:-2]:  # drop the trailing "EOS" line and empty string
+        j = i.split('\t')
+        item = dict()
+        item['form'] = j[0]  # surface form, e.g. 食べ
+        if len(j) > 1:
+            item['lemma'] = j[3]  # e.g. 食べる
+            item['pos'] = j[4]  # e.g. 動詞-一般
+            item['features'] = j[6]  # conjugation form, e.g. 連用形-一般
+        else:
+            item['lemma'] = j[0]
+            item['pos'] = ""
+            item['features'] = ""
+
+        result.append(item)
+    return result
 
-    headers = {
-        "Content-Type": "application/json",
-        "charset": "UTF-8",
-        "Authorization": "Bearer {}".format(access_token)
-    }
-
-    data = {
-        "sentence": sentence,
-        "type": "default"
-    }
-
-    r = requests.post(base_url + "v1/parse",headers=headers,data=json.dumps(data))
-    return r.json()
 
 # ひらがなを削除する関数
 # Function to delete hiragana.
 # 删除平假名的功能
 def hira_to_blank(str):
-    return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
+    return "".join(["" if ("ぁ" <= ch <= "ん") else ch for ch in str])
 
 if __name__ == "__main__":
-    envjson = open('env.json', 'r')
-    json_load = json.load(envjson)
-    CLIENT_ID = json_load["client_id"]
-    CLIENT_SECRET = json_load["client_secret"]
+    document = "私は明日、伊豆大島に行きたい"
 
     args = sys.argv
     if len(args) >= 2:
         document = str(args[1])
 
-    access_token = auth(CLIENT_ID, CLIENT_SECRET)
-    parse_document = parse(document, access_token)
-    print(parse_document)
+    parse_document = parse(document)
 
     result_list = list()
-    for chunks in parse_document['result']:
-        for token in chunks["tokens"]:
-            # 形態素解析結果に置き換えルールを適用する
-            if (token["pos"] != "連用助詞"
-                and token["pos"] != "引用助詞"
-                and token["pos"] != "終助詞"
-                and token["pos"] != "接続接尾辞"
-                and token["pos"] != "動詞活用語尾"):
-                if token["pos"] == "動詞接尾辞" and '終止' in token["features"]:
-                    if ("する" in token["lemma"]) or ("ます" in token["lemma"]):
-                        prime = "也"
-                    elif "たい" in token["lemma"]:
-                        prime = "希望"
-                    elif token['lemma'] != 'ない':
-                        prime = "了"
-                    else:
-                        prime = "実行"
+
+    for token in parse_document:
+        # 形態素解析結果に置き換えルールを適用する
+        # Apply the replacement rules to the morphological analysis result.
+        if (token["pos"] != "助詞-格助詞"
+            and token["pos"] != "助詞-接続助詞"
+            and token["pos"] != "助詞-終助詞"):
+            if '終止形-一般' in token["features"]:
+                if ("為る" in token["lemma"]) or ("ます" in token["lemma"]):
+                    prime = ""  # don't translate it
+                elif "たい" in token["lemma"]:
+                    prime = "欲"
+                elif token["lemma"] in ["ない", "無い"]:
+                    prime = "無"
+                elif token['lemma'] == 'た':
+                    prime = "了"
-                else:
-                    prime = token["form"]
+                else:
+                    prime = token["lemma"]
+            else:
+                prime = token["lemma"]
 
-            if token['lemma'] == '私':
-                prime = '我'
-            if (token['lemma'] == '君' or token['lemma'] == 'あなた' or token['lemma'] == 'お前'):
-                prime = '你'
+            if token['lemma'] == '私-代名詞':
+                prime = '我'
 
-            if len(token["features"]) != 0:
-                if "SURU" in token["features"][0] :
-                    prime = "実行"
-                elif "連体" in token['features'][0]:
-                    prime = "的"
-                elif "疑問符" in token["features"][0]:
-                    prime = "如何?"
+            if (token['lemma'] == '君' or token['lemma'] == '貴方' or token['lemma'] == 'お前'):
+                prime = '你'
 
-            result_list.append(hira_to_blank(prime))
-    print(''.join(result_list))
+            if token['lemma'] == '円-助数詞':
+                prime = '円'
+
+            if len(token["features"]) != 0:
+                if "連体形-一般" in token['features']:
+                    if token['lemma'] == 'ない':
+                        prime = "無之"
+                    else:
+                        prime = "之"
+
+            result_list.append(hira_to_blank(prime))
+
+        if token['lemma'] == 'の' and token['pos'] == "助詞-格助詞":
+            prime = "之"
+            result_list.append(hira_to_blank(prime))
+        if token["form"] == "か" and token['pos'] == '助詞-終助詞':
+            prime = "乎"
+            result_list.append(hira_to_blank(prime))
+
+    print(''.join(result_list))
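
Note for reviewers: below is a minimal sketch of the tab-separated UniDic layout that the new `parse()` indexes into. The field positions (`j[0]` surface, `j[3]` lemma, `j[4]` POS, `j[6]` conjugation form) are an assumption based on the default output of mecab-python3 with unidic-lite, matching the comments in the patch; a different dictionary or a custom output format would shift them, so verify locally before relying on the indices.

```python
import MeCab  # assumes mecab-python3 and unidic-lite are installed

tagger = MeCab.Tagger()
# parse() returns one line per token plus "EOS" and a final empty string,
# hence the [:-2] slice used in the patch.
for line in tagger.parse("私はお酒を飲みたい").split("\n")[:-2]:
    fields = line.split("\t")
    if len(fields) > 6:
        # fields[0]=surface, fields[3]=lemma, fields[4]=POS (e.g. 助詞-格助詞),
        # fields[6]=conjugation form (e.g. 終止形-一般)
        print(fields[0], fields[3], fields[4], fields[6], sep="\t")
    else:
        print(fields[0])  # unknown word: only the surface form is available
```

If the printed POS values do not look like 助詞-格助詞 / 動詞-一般, the replacement rules in the main loop will silently never match, which is the first thing to check if the output stays Japanese.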