292 lines
No EOL
9.6 KiB
OCaml
292 lines
No EOL
9.6 KiB
OCaml
open Uchar;;
|
|
open List;;
|
|
open StringLabels;;
|
|
|
|
(* TODO: Add parsing attribute and operator + - * / \n \r # = == < > ; ( ) [ ]*)
|
|
|
|
type tokenizeroutput =
|
|
| Fail
|
|
| Success of string * string;;
|
|
|
|
type token = Token of string * string;;
|
|
|
|
type aux_middle = Aux of tokenizeroutput * string
|
|
|
|
|
|
exception IndexException of string
|
|
|
|
let token_to_string token = match token with
|
|
| Token (str, token_type) -> "Token(\"" ^ str ^ "\", \"" ^ token_type ^"\")";;
|
|
|
|
|
|
let print_token token = print_string (token_to_string token);;
|
|
|
|
|
|
let print_parse_output output = match output with
|
|
| Success (car, cdr) -> print_string ("Success(" ^ car ^ ", " ^ cdr ^")\n")
|
|
| Fail -> print_string ("Fail!\n");;
|
|
|
|
|
|
let consume1char str =
|
|
match str with
|
|
| "" -> Fail
|
|
| _ -> Success ((sub str 0 1), (sub str 1 ((length str) - 1)));;
|
|
|
|
let match_range min max =
|
|
fun input ->
|
|
if min > max then
|
|
raise (IndexException (min ^ " should be not less than " ^ max))
|
|
else
|
|
let initial_result = consume1char input in
|
|
match initial_result with
|
|
| Fail -> Fail
|
|
| Success (fst, rest) ->
|
|
let fst_code = to_int (of_char (get fst 0)) in
|
|
let max_code = to_int (of_char (get max 0)) in
|
|
let min_code = to_int (of_char (get min 0)) in
|
|
if fst_code >= min_code && fst_code <= max_code
|
|
then Success (fst, rest)
|
|
else Fail;;
|
|
|
|
let match_char pattern =
|
|
fun input ->
|
|
let initial_result = consume1char input in
|
|
match initial_result with
|
|
| Fail -> Fail
|
|
| Success (fst, rest) ->
|
|
if equal fst pattern
|
|
then Success (fst, rest)
|
|
else Fail;;
|
|
|
|
let (>>=) input parser =
|
|
match input with
|
|
| Fail -> input
|
|
| Success (fst , snd_rest) ->
|
|
let middle = parser snd_rest in
|
|
match middle with
|
|
| Fail -> Fail
|
|
| Success (snd, rest) ->
|
|
let fst_snd = fst ^ snd in
|
|
Success (fst_snd, rest);;
|
|
|
|
let ( >>=* ) (input : tokenizeroutput) parser =
|
|
if input == Fail then
|
|
Fail
|
|
else
|
|
let middle0 = input >>= parser in
|
|
if middle0 == Fail then
|
|
input
|
|
else
|
|
let rec parser_recursive i parser =
|
|
let middle = i >>= parser in
|
|
match middle with
|
|
| Fail -> i
|
|
| Success (a , b) -> (parser_recursive middle parser) in
|
|
(parser_recursive input parser);;
|
|
|
|
|
|
let not_match_char pattern =
|
|
fun input ->
|
|
let initial_result = consume1char input in
|
|
match initial_result with
|
|
| Fail -> Fail
|
|
| Success (fst, rest) ->
|
|
if not (equal fst pattern)
|
|
then Success (fst, rest)
|
|
else Fail;;
|
|
|
|
let ( >>=? ) (input : tokenizeroutput) parser =
|
|
let middle = input >>= parser in
|
|
match middle with
|
|
| Fail -> input
|
|
| Success (a , b) -> middle;;
|
|
|
|
let ( || ) parser1 parser2 =
|
|
fun input ->
|
|
let middle1 = parser1 input in
|
|
match middle1 with
|
|
| Success (_ , _) -> middle1
|
|
| Fail ->
|
|
let middle2 = parser2 input in
|
|
match middle2 with
|
|
| Fail -> Fail
|
|
| Success (_ , _) -> middle2;;
|
|
|
|
|
|
let ( ||** ) parser1 parser2 =
|
|
fun input ->
|
|
let middle1 = parser1 input in
|
|
match middle1 with
|
|
| Aux (Success (_ , _), _) -> middle1
|
|
| Aux(Fail, _) ->
|
|
let middle2 = parser2 input in
|
|
middle2;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let parse_int input = (input
|
|
>>=* ((match_char "+") || (match_char "-")))
|
|
>>= (match_range "0" "9")
|
|
>>=* (match_range "0" "9");;
|
|
|
|
let parse_float input = (input
|
|
>>=* ((match_char "+") || (match_char "-")))
|
|
>>= (match_range "0" "9")
|
|
>>=* (match_range "0" "9")
|
|
>>= (match_char ".")
|
|
>>= (match_range "0" "9")
|
|
>>=* (match_range "0" "9");;
|
|
|
|
(*concat 2 parser
|
|
let ( >> ) parser1 parser2 =
|
|
fun input ->
|
|
if input == Fail then
|
|
Fail
|
|
else
|
|
let middle1 = parser1 input in
|
|
match middle1 with
|
|
| Fail -> Fail
|
|
| Success (_ , _) -> parser2 input *)
|
|
|
|
let inside_quote_mark = ((fun i -> Success("", i) >>= (match_char "\\") >>= (match_char "\"")) || (fun i -> Success("", i) >>= (not_match_char "\""))) ;;
|
|
|
|
let parse_string input = (input
|
|
>>= (match_char "\""))
|
|
>>=* (inside_quote_mark)
|
|
>>= (match_char "\"") ;;
|
|
|
|
let parse_operator input = input >>= ((match_char "+") || (match_char "-") || (match_char "*") || (match_char "/") || (match_char "%"));;
|
|
|
|
let parse_number_mark input = input >>= (match_char "#");;
|
|
|
|
let parse_equal input = input >>= (match_char "=") >>= (match_char "=");; (* == *)
|
|
|
|
let parse_imply input = input >>= (match_char "-") >>= (match_char ">");; (* -> *)
|
|
|
|
|
|
let parse_assign input = input >>= (match_char "=");;
|
|
|
|
let parse_semicolon input = input >>= (match_char ";");;
|
|
|
|
let parse_comma input = input >>= (match_char ",");;
|
|
|
|
let parse_parenthesis input = input >>= ((match_char "(")|| (match_char ")"));;
|
|
|
|
let parse_bracket input = input >>= ((match_char "[")|| (match_char "]"));;
|
|
|
|
let parse_brace input = input >>= ((match_char "{")|| (match_char "}"));;
|
|
|
|
let parse_newline input = input >>= (match_char "\n");;
|
|
|
|
let parse_spaces input = input >>= (match_char " ") >>=* (match_char " ");;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let parse_id input = (input
|
|
>>= ((match_char "_") || (match_range "a" "z") ||(match_range "A" "Z"))
|
|
>>=* ((match_char "_") || (match_range "0" "9") || (match_range "a" "z") ||(match_range "A" "Z")));;
|
|
|
|
let rec total_parser_aux input list =
|
|
match input with
|
|
| Success(_,"") -> list
|
|
| _ ->
|
|
let initial = ((fun i -> Aux ((parse_id i), "ID"))
|
|
||** (fun i -> Aux ((parse_float i) ,"FLO"))
|
|
||** (fun i -> Aux ((parse_int i) ,"INT"))
|
|
||** (fun i -> Aux ((parse_imply i), "IMPLY"))
|
|
||** (fun i -> Aux ((parse_operator i) ,"OP"))
|
|
||** (fun i -> Aux ((parse_number_mark i) ,"NUM_MRK"))
|
|
||** (fun i -> Aux ((parse_brace i) ,"BRACE"))
|
|
||** (fun i -> Aux ((parse_comma i) ,"COMMA"))
|
|
||** (fun i -> Aux ((parse_assign i), "ASSIGN"))
|
|
||** (fun i -> Aux ((parse_bracket i) ,"BRACK"))
|
|
||** (fun i -> Aux ((parse_parenthesis i) ,"PAREN"))
|
|
||** (fun i -> Aux ((parse_semicolon i), "SEMICO"))
|
|
||** (fun i -> Aux ((parse_newline i), "NL"))
|
|
||** (fun i -> Aux ((parse_spaces i) ,"SPACE")))
|
|
input in
|
|
match initial with
|
|
| Aux (Fail, _) -> let _ = print_string "Error" in []
|
|
| Aux (Success(matched, remained), token_type) -> total_parser_aux (
|
|
Success("", remained))
|
|
(append list [Token(matched, token_type );]);;
|
|
|
|
|
|
let rec total_parser input = total_parser_aux (Success("", input)) [];;
|
|
|
|
|
|
|
|
|
|
(* tests
|
|
|
|
List.iter (print_token) (total_parser "lambda(x){let a = 2;
|
|
|
|
return a + x;};");;
|
|
List.iter (print_token) (total_parser "12+中34;");;
|
|
|
|
print_parse_output (parse_id (Success ("", "_")));;
|
|
print_parse_output (parse_id (Success ("", "_abc12c")));;
|
|
print_parse_output (parse_id (Success ("", "_9")));;
|
|
print_parse_output (parse_id (Success ("", "a_9A")));;
|
|
print_parse_output (parse_id (Success ("", "if")));;
|
|
|
|
|
|
print_parse_output (parse_float (Success ("", "+2.0;")));;
|
|
print_parse_output (parse_id (Success ("", "Class")));;
|
|
print_parse_output (parse_id (Success ("", "BIGLETTER123__")));;
|
|
print_parse_output (parse_id (Success ("", "12a")));;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print_string ("Test 5\n");;
|
|
print_parse_output (((Success ("", "+1234a")) >>=? (match_char "+")) >>=* (match_range "0" "9"));;
|
|
print_parse_output (((Success ("", "1234a")) >>=? (match_char "+")) >>=* (match_range "0" "9"));;
|
|
print_parse_output (((Success ("", "-1234a")) >>=? (match_char "+")) >>=* (match_range "0" "9"));;
|
|
print_parse_output ((Success ("", "-1234a")) >>=* (match_range "0" "9"));;
|
|
|
|
|
|
print_string ("Test 6\n");;
|
|
print_parse_output ((Success ("", "+1234a")) >>= ( (match_char "+") || (match_char "-")));;
|
|
print_parse_output ((Success ("", "-1234a")) >>= ( (match_char "+") || (match_char "-")));;
|
|
print_parse_output ((Success ("", "1234a")) >>= ( (match_char "+") || (match_char "-")));;
|
|
|
|
print_string ("Test 7\n");;
|
|
print_parse_output (parse_int (Success ("", "+1234a")));
|
|
print_parse_output (parse_int (Success ("", "-1234a")));
|
|
print_parse_output (parse_int (Success ("", "1234a")));
|
|
print_parse_output (parse_int (Success ("", "+a")));
|
|
|
|
print_string ("Test 8\n");;
|
|
print_parse_output (parse_float (Success ("", "+1234.58a")));
|
|
print_parse_output (parse_float (Success ("", "-1234.58a")));
|
|
print_parse_output (parse_float (Success ("", "0.0a")));
|
|
print_parse_output (parse_float (Success ("", "+0.58a")));
|
|
print_parse_output (parse_float (Success ("", "0.58a")));
|
|
print_parse_output (parse_float (Success ("", "-0.58a")));
|
|
print_parse_output (parse_float (Success ("", "1234.8a")));
|
|
print_parse_output (parse_float (Success ("", "1234a")));
|
|
print_parse_output (parse_float (Success ("", "+1234a")));
|
|
print_parse_output (parse_float (Success ("", "-1234a")));
|
|
|
|
print_string ("Test 9\n");;
|
|
(* print_parse_output (inside_quote_mark (Success ("", "abc"))); *)
|
|
(* print_parse_output (inside_quote_mark (Success ("", "\"abc"))); *)
|
|
(* print_parse_output (inside_quote_mark (Success ("", "\\\"abc"))); *)
|
|
print_parse_output (parse_string (Success ("","\"123\"")));;
|
|
print_parse_output (parse_string (Success ("","\"12\\\"3\"")));;
|
|
print_parse_output (parse_string (Success ("","\"\\\"\\\"\"")));;
|
|
print_parse_output (parse_string (Success ("","\"\"")));;
|
|
|
|
*) |