add parser and lexer

Tan, Kian-ting 2021-06-08 22:07:14 +08:00
parent 0dcd45f118
commit e8c94c857d
6 changed files with 146 additions and 4 deletions

.gitignore

@@ -129,4 +129,7 @@ dmypy.json
 .pyre/
 #backup file
-*~
+*~
+# VS Code
+launch.json


@@ -1,3 +1,5 @@
 #!/usr/bin/env python3
 #-*-coding:utf-8-*-
+import re
+
 from PyQt5.Qsci import QsciLexerCustom, QsciScintilla
@@ -31,6 +33,8 @@ class ClochurLexer(QsciLexerCustom):
         self.PRIMARY = ['define', 'let', '#t', '#f', 'lambda', '@', 'cond', 'if', 'docu']
+        self.split_pattern = re.compile(r'(\s+|\\%|%|\\\[|\\\]|[[]|[]])')
+
         font = QFont()
         font.setFamily(parent.font_family)
         font.setPointSize(parent.font_size)
@@ -110,9 +114,7 @@ class ClochurLexer(QsciLexerCustom):
             line_utf8 = line.decode('utf-8')
-            split_pattern = re.compile(r'(\s+|\\%|%|\\\[|\\\]|[[]|[]])')
-            line_utf8_splitted = split_pattern.split(line_utf8)
+            line_utf8_splitted = self.split_pattern.split(line_utf8)
             line_utf8_splitted_len_pair = [{"str": item, "len": len(bytearray(item, "utf-8"))} for item in line_utf8_splitted]
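
Note: the two hunks above hoist the split-pattern regex out of the lexer's per-line styling loop; it is now compiled once in __init__ and reused as self.split_pattern. A rough sketch of the difference, using only the standard library — the sample line and the timings are illustrative, not taken from this repo:

import re
import timeit

pat = re.compile(r'(\s+|\\%|%|\\\[|\\\]|[[]|[]])')
line = 'docu [ abc \\% def ] % comment'

# recompile on every call, as the old per-line code did
print(timeit.timeit(
    lambda: re.compile(r'(\s+|\\%|%|\\\[|\\\]|[[]|[]])').split(line),
    number=100000))
# reuse the precompiled pattern, as the new code does
print(timeit.timeit(lambda: pat.split(line), number=100000))

Python's re module caches recently compiled patterns, so the gain is mostly the repeated cache lookup and call overhead; still, for a lexer that runs on every repaint, reusing one compiled pattern is the right call.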


@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+#-*-coding:utf-8-*-
+
 from PyQt5.QtGui import *
 from PyQt5.Qsci import QsciScintilla


@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+#-*-coding:utf-8-*-
+
 import sys
 from PyQt5.QtWidgets import *
 from PyQt5.Qsci import QsciScintilla


@@ -1,3 +1,7 @@
+#!/usr/bin/env python3
+#-*-coding:utf-8-*-
+
 version_no = "0.0.1"
 about_info = '''An S-expression-like typesetting language powered by the SILE engine, with a simple text editor.
 http://yoxem.github.com

src/Interpreter/__init__.py (new file)

@@ -0,0 +1,127 @@
#-*-coding:utf-8-*-
import re


class Parser():
    def __init__(self):
        # token patterns; the named group that matches becomes the
        # token's "type" (via match.lastgroup)
        float_pattern = r"(?P<flo>[+-]?\d+[.]\d+)"
        int_pattern = r"(?P<int>[+-]?\d+)"
        symbol_pattern = r"(?P<sym>[_a-zA-Z][_0-9a-zA-Z]*)"
        string_pattern = r"(?P<str>[\"]([^\"\\]|[\\][\"\\nt])+[\"])"
        parenthesis_pattern = r"(?P<paren>[[]|[]])"
        percent_pattern = r"(?P<percent>[%])"
        space_pattern = r"(?P<space>[ \t]+)"
        newline_pattern = r"(?P<nl>\n)"
        inside_docu_pattern = r"(?P<other>([^%\[\]\n\s\\]|[\\][%\[\]]?)+)"
        # order matters: earlier alternatives win, so numbers are tried
        # before symbols, and symbols before plain document text
        self.total_pattern = re.compile("|".join(
            [float_pattern, int_pattern, symbol_pattern, string_pattern,
             parenthesis_pattern, percent_pattern, inside_docu_pattern,
             space_pattern, newline_pattern]))
        self.clc_sexp = None
        self.tokenized = None
        self.index = None

    def get_clc_sexp(self, clc):
        self.clc_sexp = clc
        self.tokenized = self.remove_comment(self.tokenize(self.clc_sexp))
        print(self.parse_main(self.tokenized))

    def tokenize(self, clc):
        line_no = 1
        column_offset = 0
        result = []
        for i in re.finditer(self.total_pattern, clc):
            # the column is counted from the last newline seen so far
            item = {"token": i.group(0), "line": line_no,
                    "col": i.start() - column_offset, "type": i.lastgroup}
            if i.group(0) == '\n':
                line_no += 1
                column_offset = i.end()
            result.append(item)
        for i in result:
            print(i["token"])
        return result
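
    # For example (illustrative, not part of this commit),
    # tokenize("[ab 12]") yields one dict per match, spaces included:
    #   {'token': '[',  'line': 1, 'col': 0, 'type': 'paren'}
    #   {'token': 'ab', 'line': 1, 'col': 1, 'type': 'sym'}
    #   {'token': ' ',  'line': 1, 'col': 3, 'type': 'space'}
    #   {'token': '12', 'line': 1, 'col': 4, 'type': 'int'}
    #   {'token': ']',  'line': 1, 'col': 6, 'type': 'paren'}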

    def remove_comment(self, series):
        '''drop every token from a "%" up to and including the next newline'''
        result = []
        is_comment_token = False
        for i in series:
            if i["token"] == "%":
                is_comment_token = True
            elif i["token"] == "\n":
                if is_comment_token:
                    # the newline closes the comment and is dropped with it
                    is_comment_token = False
                else:
                    result.append(i)
            elif is_comment_token:
                pass
            else:
                result.append(i)
        return result
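
    # For example (again illustrative): applied to the tokens of
    # "aa % bb\ncc", remove_comment keeps "aa", the space before "%",
    # and "cc"; the "%", " bb", and the closing newline are all dropped.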

    def move_forward(self):
        self.index += 1

    def parse_main(self, series):
        # wrap the tokens in a virtual outermost [ ... ] pair, so the
        # whole document parses as a single list
        self.index = 0
        processed_series = \
            [{"token": "[", "line": None, "col": None, "type": None}] + \
            series + \
            [{"token": "]", "line": None, "col": None, "type": None}]
        result = self.parse(processed_series)
        if self.index < len(processed_series):
            raise Exception("the parenthesis ] is not balanced.")
        else:
            return result

    def atom(self, series):
        # a leaf token; "int" and "flo" tokens become Python numbers
        result = series[self.index]
        if result["type"] == "int":
            result["token"] = int(result["token"])
        elif result["type"] == "flo":
            result["token"] = float(result["token"])
        self.move_forward()
        return result

    def parse(self, series):
        # recursive descent: "[" opens a sub-list that is consumed up to
        # its matching "]"; any other token is an atom
        if series[self.index]["token"] == "[":
            result = []
            self.move_forward()
            try:
                while series[self.index]["token"] != "]":
                    result.append(self.parse(series))
                self.move_forward()
                return result
            except IndexError:
                raise Exception("the parenthesis [ is not balanced.")
        else:
            return self.atom(series)


# test
if __name__ == '__main__':
    a = Parser()
    # sample input: nested lists, numbers, a string, CJK document text
    # with escaped brackets/percent, and a "%" comment
    text = '''[[[ 123 1.23 abc "\\123\\"喵喵"] 我是貓,喵\[喵\]貓\%。喵喵%
]]'''
    a.get_clc_sexp(text)
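
For reference, a minimal sketch of what the new parser returns on a small input — illustrative only, not part of the commit; the dict fields are the ones built in tokenize():

p = Parser()
tokens = p.remove_comment(p.tokenize("[foo 1 2.5]% a comment\n"))
tree = p.parse_main(tokens)
# parse_main adds one implicit outer list, atom() has already turned
# "int"/"flo" tokens into Python numbers, and space tokens are kept
# (spacing is significant in document text), so tree is:
# [[{'token': 'foo', 'line': 1, 'col': 1, 'type': 'sym'},
#   {'token': ' ', 'line': 1, 'col': 4, 'type': 'space'},
#   {'token': 1, 'line': 1, 'col': 5, 'type': 'int'},
#   {'token': ' ', 'line': 1, 'col': 6, 'type': 'space'},
#   {'token': 2.5, 'line': 1, 'col': 7, 'type': 'flo'}]]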