# -*- coding: utf-8 -*-

import re


class Parser:

    def __init__(self):
        float_pattern = r"(?P<flo>[+-]?\d+[.]\d+)"
        bool_pattern = r"(?P<bool>True|False)"
        int_pattern = r"(?P<int>[+-]?\d+)"
        symbol_pattern = r"(?P<sym>[_a-zA-Z][-!._0-9a-zA-Z]*)"
        string_pattern = r"(?P<str>[\"]([^\"\\]|[\][\\\"\n\t])*[\"])"
        parenthesis_pattern = r"(?P<paren>[[]|[]])"
        percent_pattern = r"(?P<percent>[%])"
        space_pattern = r"(?P<space>[ \t]+)"
        newline_pattern = r"(?P<nl>\n)"
        inside_docu_pattern = r"(?P<other>([^%\[\]\n\s\\]|[\\][%\[\]]?)+)"

        self.total_pattern = re.compile("|".join([float_pattern, bool_pattern, int_pattern,
                                                  symbol_pattern, string_pattern,
                                                  parenthesis_pattern, percent_pattern,
                                                  inside_docu_pattern, space_pattern,
                                                  newline_pattern]))
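        # Note: the order of the alternatives matters.  float_pattern is tried
        # before int_pattern, so "1.23" becomes a single float token instead of
        # an int token followed by leftovers.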

        self.clc_sexp = None
        self.tokenized = None
        self.parse_tree = None
        self.index = None

    def get_clc_sexp(self, clc):
        self.clc_sexp = clc
        self.tokenized = self.remove_comment(self.tokenize(self.clc_sexp))
        self.parse_tree = self.parse_main(self.tokenized)

    def generate_printable_sexp(self, sexp):
        if isinstance(sexp, list):
            result = "["
            for i in sexp:
                result += (self.generate_printable_sexp(i) + " ")
            result += "]"

            return result
        else:
            if sexp["type"] == "str":
                result = sexp["token"].replace("\\", "\\\\")
                result = "\"" + result[1:-1].replace("\"", "\\\"") + "\""
                return result
            else:
                return str(sexp["token"])

    def tokenize(self, clc):
        line_no = 1
        column = 0
        column_offset = 0
        find_iterator = re.finditer(self.total_pattern, clc)
        result = []
        for i in find_iterator:
            column = i.start() - column_offset

            item = {"token": i.group(0), "line": line_no, "col": column, "type": i.lastgroup}
            if i.group(0) == '\n':
                line_no += 1
                column_offset = i.end()

            result.append(item)

        return result

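    # For example, tokenize('[abc 12]') yields token dicts roughly like:
    #   {"token": "[",   "line": 1, "col": 0, "type": "paren"}
    #   {"token": "abc", "line": 1, "col": 1, "type": "sym"}
    #   {"token": " ",   "line": 1, "col": 4, "type": "space"}
    #   {"token": "12",  "line": 1, "col": 5, "type": "int"}
    #   {"token": "]",   "line": 1, "col": 7, "type": "paren"}
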
    def remove_comment(self, series):
        result = []
        is_comment_token = False
        for i in series:
            if i["token"] == "%":
                is_comment_token = True
            elif i["token"] == "\n":
                if is_comment_token:
                    is_comment_token = False
                else:
                    result.append(i)
            elif is_comment_token:
                pass
            else:
                result.append(i)

        return result

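    # For example, for the tokens of "abc % a comment\ndef", everything from
    # the "%" up to and including the newline is dropped, leaving only the
    # abc, space, and def tokens.
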
    def move_forward(self):
        self.index += 1

    def parse_main(self, series):
        self.index = 0

        processed_series = [{"token": "[", "line": None, "col": None, "type": None}] + series + \
                           [{"token": "]", "line": None, "col": None, "type": None}]
        result = self.parse(processed_series)

        if self.index < len(processed_series):
            raise Exception("the parenthesis ] is not balanced.")
        else:
            return result

    def atom(self, series):
        result = series[self.index]
        if result["type"] == "int":
            result["token"] = int(result["token"])
        elif result["type"] == "flo":
            result["token"] = float(result["token"])
        elif result["type"] == "bool":
            # convert the boolean literal to a Python bool
            result["token"] = (result["token"] == "True")
        else:
            pass
        self.move_forward()
        return result

    def parse(self, series):
        result = None
        if series[self.index]["token"] == "[":
            result = []
            self.move_forward()
            try:
                while series[self.index]["token"] != "]":
                    item = self.parse(series)
                    result.append(item)

                self.move_forward()

                return result
            except IndexError:
                raise Exception("the parenthesis [ is not balanced.")

        else:
            result = self.atom(series)
            return result

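    # For example (a sketch), parse_main over the tokens of "[+ 1 2]" returns a
    # one-element list whose element is the list of token dicts for "+", " ",
    # "1", " " and "2"; whitespace tokens are only stripped later, by the
    # interpreter's remove_spaces_and_newlines.
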
'''
macro expansion, for example:

the ellipsis (...) shouldn't be separated from the variable.

[def-syntax foo
    [[_ x y] [+ x y]]
    [[_ x y z...] [+ x [foo y z...]]]]'''


class Intepreter:

    def __init__(self):
        self.macro_env = [dict()]
        # environment
        self.env = [dict()]
        # macro table, e.g. {"foo": [{"before": [_ x y], "after": [+ x y]}, ...]}
        self.macro_list = dict()

    def remove_spaces_and_newlines(self, sexp):
        if isinstance(sexp, list):
            if sexp and isinstance(sexp[0], dict) and sexp[0]["token"] == "docu":
                # inside a [docu ...] form the whitespace tokens are kept verbatim
                result = []
                for i in sexp[1:]:
                    if isinstance(i, dict) and i["type"] in ["space", "nl"]:
                        result.append(i)
                    else:
                        result.append(self.remove_spaces_and_newlines(i))

                result = [sexp[0]] + result
            else:
                result = [self.remove_spaces_and_newlines(i)
                          for i in sexp if not isinstance(i, dict) or (i["type"] not in ["space", "nl"])]
            return result
        else:
            return sexp

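    # For example, the spelling [define  x   3] keeps only the define/x/3
    # tokens after this pass, while whitespace inside a [docu ...] form is
    # preserved.
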
    def interprete(self, sexps):
        sexps = self.remove_spaces_and_newlines(sexps)

        for sexp in sexps:
            self.interprete_aux(sexp)

    def interprete_aux(self, sexp):
        if isinstance(sexp, dict):
            if sexp["type"] == "sym":
                res = None
                for i in reversed(self.env):
                    if sexp["token"] in i.keys():
                        res = i[sexp["token"]]
                        break

                if res is None:
                    raise Exception("Ln %d, Col %d: the variable %s is not found!" %
                                    (sexp["line"], sexp["col"], sexp["token"]))
                else:
                    return res

            else:
                return sexp["token"]

elif sexp[0]["token"] in ["+", "-", "*", "/", "<", "=", ">", "<=", ">="]:
|
|
if len(sexp) != 3:
|
|
raise Exception("Ln %d, Col %d: the argument length %d of %s is not correct." %
|
|
(sexp[0]["line"], sexp[0]["col"], len(sexp), a.generate_printable_sexp(sexp)))
|
|
else:
|
|
if sexp[0]["token"] == "+":
|
|
return self.interprete_aux(sexp[1]) + self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == "-":
|
|
return self.interprete_aux(sexp[1]) - self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == "*":
|
|
return self.interprete_aux(sexp[1]) * self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == "/":
|
|
return self.interprete_aux(sexp[1]) / self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == "<":
|
|
return self.interprete_aux(sexp[1]) < self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == "=":
|
|
return self.interprete_aux(sexp[1]) == self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == ">":
|
|
return self.interprete_aux(sexp[1]) > self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == ">=":
|
|
return self.interprete_aux(sexp[1]) <= self.interprete_aux(sexp[2])
|
|
else:
|
|
return self.interprete_aux(sexp[1]) >= self.interprete_aux(sexp[2])
|
|
|
|
|
|
        # macro expanding
        elif sexp[0]["token"] in self.macro_list.keys():
            macro_pair = self.macro_list[sexp[0]["token"]]

            is_found = False
            macro_len = len(sexp)

            unification = dict()

            for i in macro_pair:
                if len(i["before"]) == macro_len:
                    unification = self.unify(sexp, i["before"], unification)
                    syntax_after = i["after"]
                    is_found = True
                    break
                elif len(i["before"]) < macro_len and re.match(r".+[.]{3}$", i["before"][-1]["token"]):
                    unification = self.unify(sexp, i["before"], unification)
                    syntax_after = i["after"]
                    is_found = True
                    break
                else:
                    pass

            if not is_found:
                raise Exception("The syntax pattern for %s is not found." % a.generate_printable_sexp(sexp))
            else:
                new_sexp = self.macro_expand(syntax_after, unification)

                return self.interprete_aux(new_sexp)

elif sexp[0]["token"] == "define":
|
|
if sexp[1]["type"] != "sym":
|
|
raise Exception("Ln %d, Col %d: the type of %s should be symbol, not %s" %
|
|
(sexp[1]["line"], sexp[1]["col"], sexp[1], sexp[1]["type"]))
|
|
else:
|
|
self.env[-1][sexp[1]["token"]] = self.interprete_aux(sexp[2])
|
|
elif sexp[0]["token"] == "if":
|
|
if len(sexp) != 4:
|
|
raise Exception("Ln %d, Col %d: the number of argument of if should be 3." %
|
|
(sexp[0]["line"], sexp[0]["col"]))
|
|
cond = self.interprete_aux(sexp[1])
|
|
if cond:
|
|
return self.interprete_aux(sexp[2])
|
|
else:
|
|
return self.interprete_aux(sexp[3])
|
|
|
|
elif sexp[0]["token"] == "str":
|
|
if len(sexp) != 2:
|
|
raise Exception("Ln %d, Col %d: the argument number of str should be 1" %
|
|
(sexp[0]["line"], sexp[0]["col"]))
|
|
else:
|
|
return str(self.interprete_aux(sexp[1]))
|
|
elif sexp[0]["token"] == "print":
|
|
if len(sexp) != 2:
|
|
raise Exception("Ln %d, Col %d: the argument number of print should be 1" %
|
|
(sexp[0]["line"], sexp[0]["col"]))
|
|
else:
|
|
print(self.interprete_aux(sexp[1]))
|
|
elif sexp[0]["token"] == "set!":
|
|
if sexp[1]["type"] != "sym":
|
|
raise Exception("Ln %d, Col %d: the type of %s should be symbol, not %s" %
|
|
(sexp[1]["line"], sexp[1]["col"], sexp[1], sexp[1]["type"]))
|
|
else:
|
|
is_found = False
|
|
for i in reversed(self.env):
|
|
if sexp[1]["token"] in i.keys():
|
|
i[sexp[1]["token"]] = self.interprete_aux(sexp[2])
|
|
is_found = True
|
|
break
|
|
|
|
if not is_found:
|
|
raise Exception("Ln %d, Col %d: the variable %s is not found!" %
|
|
(sexp[1]["line"], sexp[1]["col"], sexp[1]["token"]))
|
|
|
|
|
|
elif sexp[0]["token"] == "def-syntax":
|
|
if len(sexp) < 3:
|
|
raise Exception("Ln %d, Col %d: def-syntax should have 2 or more arguments!" %
|
|
(sexp[0]["line"], sexp[0]["col"]))
|
|
else:
|
|
syntax_name = sexp[1]["token"] # {"foo": {"before":[_ x y], "after":[+ x y]}, ...}
|
|
#removed_dict_form = self.remove_dict_form(sexp[2:])
|
|
|
|
result_list = []
|
|
|
|
for i in sexp[2:]:
|
|
#for i in removed_dict_form:
|
|
syntax_before = i[0]
|
|
syntax_after = i[1]
|
|
item = {"before": syntax_before, "after": syntax_after}
|
|
result_list.append(item)
|
|
|
|
self.macro_list[syntax_name] = result_list
|
|
|
|
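        # For example, after [def-syntax bar [[_ x y] [+ x y]] [[_ x y z...] [+ x [bar y z...]]]]
        # self.macro_list["bar"] holds two {"before": ..., "after": ...} pairs,
        # one per pattern/template pair, tried in order at expansion time.
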
elif sexp[0]["token"] == "begin":
|
|
if len(sexp) == 1:
|
|
raise Exception("Ln %d, Col %d: begin should have argument(s)!" %
|
|
(sexp[0]["line"], sexp[0]["col"]))
|
|
else:
|
|
for i in sexp[1:]:
|
|
ret = self.interprete_aux(i)
|
|
return ret
|
|
|
|
elif sexp[0]["token"] == "lambda":
|
|
if not isinstance(sexp[1], list):
|
|
raise Exception("Ln %d, Col %d: %s should be a variable list" %
|
|
(sexp[1]["line"], sexp[1]["col"], sexp[1]))
|
|
else:
|
|
return Lambda(sexp[1], sexp[2], self.env)
|
|
        else:
            sexp_new = [self.interprete_aux(x) for x in sexp]

            if isinstance(sexp_new[0], Lambda):
                vars = sexp_new[0].vars
                env = sexp_new[0].env
                body = sexp_new[0].body
                vars_input = sexp_new[1:]

                if len(vars) != len(vars_input):
                    raise Exception("Ln %d, Col %d: argument length is not matched." %
                                    (self.get_first_item(sexp[0])["line"], self.get_first_item(sexp[0])["col"]))

                new_env_block = dict()
                for i in range(len(vars)):
                    new_env_block[vars[i]] = vars_input[i]

                # the argument block becomes the innermost scope; lookups walk
                # self.env from the end, so the block is appended, not prepended
                new_env = env + [new_env_block]

                old_env = self.env
                self.env = new_env
                ret = self.interprete_aux(body)
                self.env = old_env
                return ret

    def remove_dict_form(self, sexp):
        if isinstance(sexp, list):
            return [self.remove_dict_form(x) for x in sexp]
        else:
            return sexp["token"]

    def get_first_item(self, sexp):
        if isinstance(sexp, list):
            sexp = sexp[0]

        return sexp

    def unify(self, sexp, before_stx, unification):
        for i in range(len(before_stx)):
            if isinstance(before_stx[i], list):
                unification = self.unify(sexp[i], before_stx[i], unification)
            elif before_stx[i]["token"] in unification.keys():
                raise Exception("the variable %s is defined twice." % before_stx[i]["token"])
            elif re.match(r".+[.]{3}$", before_stx[i]["token"]):
                if i == len(before_stx) - 1:
                    unification[before_stx[i]["token"]] = {"content": sexp[i:], "dotted": True}
                else:
                    raise Exception("the variable %s is put in the wrong position." % before_stx[i]["token"])
            else:
                unification[before_stx[i]["token"]] = {"content": sexp[i], "dotted": False}

        return unification

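    # For example (a sketch), unifying the call [bar 1 2 3] against the pattern
    # [_ x y z...] produces roughly:
    #   "_"    -> {"content": <bar token>, "dotted": False}
    #   "x"    -> {"content": <1 token>,   "dotted": False}
    #   "y"    -> {"content": <2 token>,   "dotted": False}
    #   "z..." -> {"content": [<3 token>], "dotted": True}
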
    def macro_expand(self, after_stx, unification):
        if isinstance(after_stx, list):
            raw_list = [self.macro_expand(i, unification) for i in after_stx]

            result_list = []

            for i in raw_list:
                if isinstance(i, list):
                    result_list.append(i)
                elif "dotted" not in i.keys():
                    result_list.append(i)
                elif i["dotted"]:
                    result_list += i["content"]
                else:
                    result_list.append(i["content"])

            return result_list

        else:
            if after_stx["token"] in unification.keys():
                return unification[after_stx["token"]]
            else:
                return after_stx

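    # Continuing the sketch above, expanding the template [+ x [bar y z...]]
    # with that unification gives [+ 1 [bar 2 3]]: plain bindings are
    # substituted in place and dotted bindings are spliced into the list.

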
class Lambda:

    def __init__(self, vars, body, env):
        for i in vars:
            if i["type"] != "sym":
                raise Exception("Line %d, Col %d: the parameter of a lambda should be a symbol, not %s"
                                % (i["line"], i["col"], str(i["token"])))

        self.vars = [i["token"] for i in vars]
        self.body = body
        self.env = env


'''test'''
a = Parser()
text = '''
[def-syntax bar
    [[_ x y] [+ x y]]
    [[_ x y z...] [+ x [bar y z...]]]]

[print[str[bar 156 6546 146514 10 6]]]

[define fac [lambda [x] [if [= x 1] 1 [* x [fac [- x 1]]]]]]

[print [fac 6]]

[+[- 2 3][* 5.0 6]]
[define var1 [+[- 2 3][* 5.0 6]]]
[set! var1 [* 10 2]]
[define foo [lambda [x y] [begin [+ x y][set! var1 10] 7]]]
[foo 12 5]
[print [+ var1 5]]
'''

"""text = '''[[[ 123 1.23 abc "\\123\\\"貓貓貓"] 我是貓,喵\[喵\]貓\%。喵喵%喵
]]'''
"""

interp = Intepreter()

a.get_clc_sexp(text)

interp.interprete(a.parse_tree)

#print(a.parse_tree)
print(a.generate_printable_sexp(a.parse_tree))
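
# With the definitions above, this test program is expected to print roughly:
#   153232
#   720
#   15
# followed by the printable form of the whole parse tree.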