archivesOfToyLang/ataabu/archive/tokenizer.ml

292 lines
No EOL
9.6 KiB
OCaml

open Uchar;;
open List;;
open StringLabels;;
(* TODO: Add parsing attribute and operator + - * / \n \r # = == < > ; ( ) [ ]*)
type tokenizeroutput =
| Fail
| Success of string * string;;
type token = Token of string * string;;
type aux_middle = Aux of tokenizeroutput * string
exception IndexException of string
let token_to_string token = match token with
| Token (str, token_type) -> "Token(\"" ^ str ^ "\", \"" ^ token_type ^"\")";;
let print_token token = print_string (token_to_string token);;
let print_parse_output output = match output with
| Success (car, cdr) -> print_string ("Success(" ^ car ^ ", " ^ cdr ^")\n")
| Fail -> print_string ("Fail!\n");;
let consume1char str =
match str with
| "" -> Fail
| _ -> Success ((sub str 0 1), (sub str 1 ((length str) - 1)));;
let match_range min max =
fun input ->
if min > max then
raise (IndexException (min ^ " should be not less than " ^ max))
else
let initial_result = consume1char input in
match initial_result with
| Fail -> Fail
| Success (fst, rest) ->
let fst_code = to_int (of_char (get fst 0)) in
let max_code = to_int (of_char (get max 0)) in
let min_code = to_int (of_char (get min 0)) in
if fst_code >= min_code && fst_code <= max_code
then Success (fst, rest)
else Fail;;
let match_char pattern =
fun input ->
let initial_result = consume1char input in
match initial_result with
| Fail -> Fail
| Success (fst, rest) ->
if equal fst pattern
then Success (fst, rest)
else Fail;;
let (>>=) input parser =
match input with
| Fail -> input
| Success (fst , snd_rest) ->
let middle = parser snd_rest in
match middle with
| Fail -> Fail
| Success (snd, rest) ->
let fst_snd = fst ^ snd in
Success (fst_snd, rest);;
let ( >>=* ) (input : tokenizeroutput) parser =
if input == Fail then
Fail
else
let middle0 = input >>= parser in
if middle0 == Fail then
input
else
let rec parser_recursive i parser =
let middle = i >>= parser in
match middle with
| Fail -> i
| Success (a , b) -> (parser_recursive middle parser) in
(parser_recursive input parser);;
let not_match_char pattern =
fun input ->
let initial_result = consume1char input in
match initial_result with
| Fail -> Fail
| Success (fst, rest) ->
if not (equal fst pattern)
then Success (fst, rest)
else Fail;;
let ( >>=? ) (input : tokenizeroutput) parser =
let middle = input >>= parser in
match middle with
| Fail -> input
| Success (a , b) -> middle;;
let ( || ) parser1 parser2 =
fun input ->
let middle1 = parser1 input in
match middle1 with
| Success (_ , _) -> middle1
| Fail ->
let middle2 = parser2 input in
match middle2 with
| Fail -> Fail
| Success (_ , _) -> middle2;;
let ( ||** ) parser1 parser2 =
fun input ->
let middle1 = parser1 input in
match middle1 with
| Aux (Success (_ , _), _) -> middle1
| Aux(Fail, _) ->
let middle2 = parser2 input in
middle2;;
let parse_int input = (input
>>=* ((match_char "+") || (match_char "-")))
>>= (match_range "0" "9")
>>=* (match_range "0" "9");;
let parse_float input = (input
>>=* ((match_char "+") || (match_char "-")))
>>= (match_range "0" "9")
>>=* (match_range "0" "9")
>>= (match_char ".")
>>= (match_range "0" "9")
>>=* (match_range "0" "9");;
(*concat 2 parser
let ( >> ) parser1 parser2 =
fun input ->
if input == Fail then
Fail
else
let middle1 = parser1 input in
match middle1 with
| Fail -> Fail
| Success (_ , _) -> parser2 input *)
let inside_quote_mark = ((fun i -> Success("", i) >>= (match_char "\\") >>= (match_char "\"")) || (fun i -> Success("", i) >>= (not_match_char "\""))) ;;
let parse_string input = (input
>>= (match_char "\""))
>>=* (inside_quote_mark)
>>= (match_char "\"") ;;
let parse_operator input = input >>= ((match_char "+") || (match_char "-") || (match_char "*") || (match_char "/") || (match_char "%"));;
let parse_number_mark input = input >>= (match_char "#");;
let parse_equal input = input >>= (match_char "=") >>= (match_char "=");; (* == *)
let parse_imply input = input >>= (match_char "-") >>= (match_char ">");; (* -> *)
let parse_assign input = input >>= (match_char "=");;
let parse_semicolon input = input >>= (match_char ";");;
let parse_comma input = input >>= (match_char ",");;
let parse_parenthesis input = input >>= ((match_char "(")|| (match_char ")"));;
let parse_bracket input = input >>= ((match_char "[")|| (match_char "]"));;
let parse_brace input = input >>= ((match_char "{")|| (match_char "}"));;
let parse_newline input = input >>= (match_char "\n");;
let parse_spaces input = input >>= (match_char " ") >>=* (match_char " ");;
let parse_id input = (input
>>= ((match_char "_") || (match_range "a" "z") ||(match_range "A" "Z"))
>>=* ((match_char "_") || (match_range "0" "9") || (match_range "a" "z") ||(match_range "A" "Z")));;
let rec total_parser_aux input list =
match input with
| Success(_,"") -> list
| _ ->
let initial = ((fun i -> Aux ((parse_id i), "ID"))
||** (fun i -> Aux ((parse_float i) ,"FLO"))
||** (fun i -> Aux ((parse_int i) ,"INT"))
||** (fun i -> Aux ((parse_imply i), "IMPLY"))
||** (fun i -> Aux ((parse_operator i) ,"OP"))
||** (fun i -> Aux ((parse_number_mark i) ,"NUM_MRK"))
||** (fun i -> Aux ((parse_brace i) ,"BRACE"))
||** (fun i -> Aux ((parse_comma i) ,"COMMA"))
||** (fun i -> Aux ((parse_assign i), "ASSIGN"))
||** (fun i -> Aux ((parse_bracket i) ,"BRACK"))
||** (fun i -> Aux ((parse_parenthesis i) ,"PAREN"))
||** (fun i -> Aux ((parse_semicolon i), "SEMICO"))
||** (fun i -> Aux ((parse_newline i), "NL"))
||** (fun i -> Aux ((parse_spaces i) ,"SPACE")))
input in
match initial with
| Aux (Fail, _) -> let _ = print_string "Error" in []
| Aux (Success(matched, remained), token_type) -> total_parser_aux (
Success("", remained))
(append list [Token(matched, token_type );]);;
let rec total_parser input = total_parser_aux (Success("", input)) [];;
(* tests
List.iter (print_token) (total_parser "lambda(x){let a = 2;
return a + x;};");;
List.iter (print_token) (total_parser "12+中34;");;
print_parse_output (parse_id (Success ("", "_")));;
print_parse_output (parse_id (Success ("", "_abc12c")));;
print_parse_output (parse_id (Success ("", "_9")));;
print_parse_output (parse_id (Success ("", "a_9A")));;
print_parse_output (parse_id (Success ("", "if")));;
print_parse_output (parse_float (Success ("", "+2.0;")));;
print_parse_output (parse_id (Success ("", "Class")));;
print_parse_output (parse_id (Success ("", "BIGLETTER123__")));;
print_parse_output (parse_id (Success ("", "12a")));;
print_string ("Test 5\n");;
print_parse_output (((Success ("", "+1234a")) >>=? (match_char "+")) >>=* (match_range "0" "9"));;
print_parse_output (((Success ("", "1234a")) >>=? (match_char "+")) >>=* (match_range "0" "9"));;
print_parse_output (((Success ("", "-1234a")) >>=? (match_char "+")) >>=* (match_range "0" "9"));;
print_parse_output ((Success ("", "-1234a")) >>=* (match_range "0" "9"));;
print_string ("Test 6\n");;
print_parse_output ((Success ("", "+1234a")) >>= ( (match_char "+") || (match_char "-")));;
print_parse_output ((Success ("", "-1234a")) >>= ( (match_char "+") || (match_char "-")));;
print_parse_output ((Success ("", "1234a")) >>= ( (match_char "+") || (match_char "-")));;
print_string ("Test 7\n");;
print_parse_output (parse_int (Success ("", "+1234a")));
print_parse_output (parse_int (Success ("", "-1234a")));
print_parse_output (parse_int (Success ("", "1234a")));
print_parse_output (parse_int (Success ("", "+a")));
print_string ("Test 8\n");;
print_parse_output (parse_float (Success ("", "+1234.58a")));
print_parse_output (parse_float (Success ("", "-1234.58a")));
print_parse_output (parse_float (Success ("", "0.0a")));
print_parse_output (parse_float (Success ("", "+0.58a")));
print_parse_output (parse_float (Success ("", "0.58a")));
print_parse_output (parse_float (Success ("", "-0.58a")));
print_parse_output (parse_float (Success ("", "1234.8a")));
print_parse_output (parse_float (Success ("", "1234a")));
print_parse_output (parse_float (Success ("", "+1234a")));
print_parse_output (parse_float (Success ("", "-1234a")));
print_string ("Test 9\n");;
(* print_parse_output (inside_quote_mark (Success ("", "abc"))); *)
(* print_parse_output (inside_quote_mark (Success ("", "\"abc"))); *)
(* print_parse_output (inside_quote_mark (Success ("", "\\\"abc"))); *)
print_parse_output (parse_string (Success ("","\"123\"")));;
print_parse_output (parse_string (Success ("","\"12\\\"3\"")));;
print_parse_output (parse_string (Success ("","\"\\\"\\\"\"")));;
print_parse_output (parse_string (Success ("","\"\"")));;
*)