292 lines
		
	
	
	
		
			9.6 KiB
		
	
	
	
		
			OCaml
		
	
	
	
	
	
		
		
			
		
	
	
			292 lines
		
	
	
	
		
			9.6 KiB
		
	
	
	
		
			OCaml
		
	
	
	
	
	
|  | open Uchar;; | ||
|  | open List;; | ||
|  | open StringLabels;; | ||
|  | 
 | ||
|  | (* TODO: Add parsing attribute and operator + - * / \n \r # = == < > ; ( ) [ ]*) | ||
|  | 
 | ||
|  | type tokenizeroutput = | ||
|  |     | Fail | ||
|  |     | Success of string * string;; | ||
|  | 
 | ||
|  | type token = Token of string * string;; | ||
|  | 
 | ||
|  | type aux_middle = Aux of tokenizeroutput * string | ||
|  | 
 | ||
|  | 
 | ||
|  | exception IndexException of string | ||
|  | 
 | ||
|  | let token_to_string token =  match token with | ||
|  | | Token (str, token_type) -> "Token(\"" ^ str ^ "\", \"" ^ token_type ^"\")";; | ||
|  | 
 | ||
|  | 
 | ||
|  | let print_token token =  print_string (token_to_string token);; | ||
|  | 
 | ||
|  | 
 | ||
|  | let print_parse_output output =  match output with | ||
|  |     | Success (car, cdr) -> print_string ("Success(" ^ car ^ ", " ^ cdr ^")\n") | ||
|  |     | Fail -> print_string ("Fail!\n");; | ||
|  |    | ||
|  | 
 | ||
|  | let consume1char str = | ||
|  |   match str with | ||
|  |   | "" -> Fail | ||
|  |   | _ -> Success ((sub str 0 1), (sub str 1 ((length str) - 1)));; | ||
|  | 
 | ||
|  | let match_range min max =  | ||
|  |   fun input -> | ||
|  |     if min > max then | ||
|  |       raise (IndexException (min ^ " should be not less than " ^ max)) | ||
|  |   else | ||
|  |     let initial_result = consume1char input in | ||
|  |       match initial_result with | ||
|  |       | Fail -> Fail | ||
|  |       | Success (fst, rest) -> | ||
|  |         let fst_code = to_int (of_char (get fst 0)) in | ||
|  |         let max_code = to_int (of_char (get max 0)) in | ||
|  |         let min_code = to_int (of_char (get min 0)) in | ||
|  |         if fst_code >= min_code && fst_code <= max_code | ||
|  |           then Success (fst, rest) | ||
|  |           else Fail;;       | ||
|  | 
 | ||
|  | let match_char pattern =  | ||
|  |   fun input ->  | ||
|  |     let initial_result = consume1char input in | ||
|  |       match initial_result with  | ||
|  |         | Fail -> Fail | ||
|  |         | Success (fst, rest) -> | ||
|  |           if equal fst pattern | ||
|  |           then Success (fst, rest) | ||
|  |           else Fail;; | ||
|  | 
 | ||
|  | let  (>>=) input parser =  | ||
|  |   match input with  | ||
|  |     | Fail -> input | ||
|  |     | Success (fst , snd_rest) -> | ||
|  |       let middle = parser snd_rest in | ||
|  |           match middle with | ||
|  |           | Fail -> Fail | ||
|  |           | Success (snd, rest) ->  | ||
|  |             let fst_snd = fst ^ snd in | ||
|  |             Success (fst_snd, rest);; | ||
|  | 
 | ||
|  | let ( >>=* ) (input : tokenizeroutput) parser =  | ||
|  |   if input == Fail then | ||
|  |     Fail | ||
|  |   else | ||
|  |     let middle0 = input >>= parser in | ||
|  |     if middle0 == Fail then | ||
|  |       input | ||
|  |     else | ||
|  |       let rec parser_recursive i parser = | ||
|  |         let middle = i >>= parser in | ||
|  |         match middle with | ||
|  |           | Fail -> i | ||
|  |           | Success (a  , b) -> (parser_recursive middle parser) in | ||
|  |         (parser_recursive input parser);;  | ||
|  |      | ||
|  | 
 | ||
|  | let not_match_char pattern =  | ||
|  |   fun input ->  | ||
|  |     let initial_result = consume1char input in | ||
|  |       match initial_result with  | ||
|  |         | Fail -> Fail | ||
|  |         | Success (fst, rest) -> | ||
|  |           if not (equal fst pattern) | ||
|  |           then Success (fst, rest) | ||
|  |           else Fail;; | ||
|  | 
 | ||
|  | let ( >>=? ) (input : tokenizeroutput) parser =  | ||
|  |   let middle = input >>= parser in | ||
|  |     match middle with | ||
|  |       | Fail -> input | ||
|  |       | Success (a  , b) -> middle;; | ||
|  | 
 | ||
|  | let ( || ) parser1  parser2 =  | ||
|  |     fun input -> | ||
|  |       let middle1 = parser1 input in | ||
|  |         match middle1 with | ||
|  |           | Success (_ , _) -> middle1 | ||
|  |           | Fail -> | ||
|  |             let middle2 = parser2 input in | ||
|  |             match middle2 with  | ||
|  |               | Fail -> Fail | ||
|  |               | Success (_ , _) -> middle2;; | ||
|  | 
 | ||
|  | 
 | ||
|  | let ( ||** ) parser1  parser2 =  | ||
|  |   fun input -> | ||
|  |     let middle1 = parser1 input in | ||
|  |       match middle1 with | ||
|  |         | Aux (Success (_ , _), _) -> middle1 | ||
|  |         | Aux(Fail, _) -> | ||
|  |           let middle2 = parser2 input in | ||
|  |             middle2;; | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | let parse_int input = (input  | ||
|  |                         >>=* ((match_char "+") || (match_char "-"))) | ||
|  |                         >>= (match_range "0" "9") | ||
|  |                         >>=* (match_range "0" "9");; | ||
|  | 
 | ||
|  | let parse_float input = (input  | ||
|  |                         >>=* ((match_char "+") || (match_char "-"))) | ||
|  |                         >>= (match_range "0" "9") | ||
|  |                         >>=* (match_range "0" "9") | ||
|  |                         >>= (match_char ".") | ||
|  |                         >>= (match_range "0" "9") | ||
|  |                         >>=* (match_range "0" "9");; | ||
|  | 
 | ||
|  | (*concat 2 parser
 | ||
|  | let ( >> ) parser1 parser2 =  | ||
|  |   fun input -> | ||
|  |     if input == Fail then | ||
|  |       Fail | ||
|  |     else | ||
|  |       let middle1 = parser1 input in | ||
|  |       match middle1 with | ||
|  |       | Fail -> Fail | ||
|  |       | Success (_ , _) -> parser2 input *) | ||
|  | 
 | ||
|  | let inside_quote_mark  = ((fun i -> Success("", i)  >>= (match_char "\\") >>= (match_char "\"")) ||  (fun i -> Success("", i)  >>= (not_match_char "\""))) ;; | ||
|  | 
 | ||
|  | let parse_string input = (input  | ||
|  |                         >>= (match_char "\""))  | ||
|  |                         >>=* (inside_quote_mark) | ||
|  |                         >>= (match_char "\"")  ;; | ||
|  | 
 | ||
|  | let parse_operator input = input >>= ((match_char "+") || (match_char "-") || (match_char "*") || (match_char "/") || (match_char "%"));; | ||
|  | 
 | ||
|  | let parse_number_mark input = input >>= (match_char "#");; | ||
|  | 
 | ||
|  | let parse_equal input = input >>= (match_char "=") >>= (match_char "=");; (* == *) | ||
|  | 
 | ||
|  | let parse_imply input = input >>= (match_char "-") >>= (match_char ">");; (* -> *) | ||
|  | 
 | ||
|  | 
 | ||
|  | let parse_assign input = input >>= (match_char "=");; | ||
|  | 
 | ||
|  | let parse_semicolon input = input >>= (match_char ";");; | ||
|  | 
 | ||
|  | let parse_comma input = input >>= (match_char ",");; | ||
|  | 
 | ||
|  | let parse_parenthesis input = input >>= ((match_char "(")|| (match_char ")"));; | ||
|  | 
 | ||
|  | let parse_bracket input = input >>= ((match_char "[")|| (match_char "]"));; | ||
|  | 
 | ||
|  | let parse_brace input = input >>= ((match_char "{")|| (match_char "}"));; | ||
|  | 
 | ||
|  | let parse_newline input = input >>= (match_char "\n");; | ||
|  | 
 | ||
|  | let parse_spaces input = input >>= (match_char " ") >>=* (match_char " ");; | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | let parse_id input = (input  | ||
|  |           >>= ((match_char "_") || (match_range "a" "z") ||(match_range "A" "Z")) | ||
|  |           >>=* ((match_char "_") || (match_range "0" "9") || (match_range "a" "z") ||(match_range "A" "Z")));; | ||
|  | 
 | ||
|  | let rec total_parser_aux input list =  | ||
|  |   match input with | ||
|  |   | Success(_,"") -> list | ||
|  |   | _ ->  | ||
|  |     let initial = ((fun i -> Aux ((parse_id i), "ID")) | ||
|  |                 ||** (fun i -> Aux ((parse_float i) ,"FLO")) | ||
|  |                 ||** (fun i -> Aux ((parse_int i) ,"INT")) | ||
|  |                 ||** (fun i -> Aux ((parse_imply i), "IMPLY")) | ||
|  |                 ||** (fun i -> Aux ((parse_operator i) ,"OP")) | ||
|  |                 ||** (fun i -> Aux ((parse_number_mark i) ,"NUM_MRK")) | ||
|  |                 ||** (fun i -> Aux ((parse_brace i) ,"BRACE")) | ||
|  |                 ||** (fun i -> Aux ((parse_comma i) ,"COMMA")) | ||
|  |                 ||** (fun i -> Aux ((parse_assign i), "ASSIGN")) | ||
|  |                 ||** (fun i -> Aux ((parse_bracket i) ,"BRACK")) | ||
|  |                 ||** (fun i -> Aux ((parse_parenthesis i) ,"PAREN")) | ||
|  |                 ||** (fun i -> Aux ((parse_semicolon i), "SEMICO")) | ||
|  |                 ||** (fun i -> Aux ((parse_newline i), "NL")) | ||
|  |                 ||** (fun i -> Aux ((parse_spaces i) ,"SPACE"))) | ||
|  |                 input in | ||
|  |     match initial with | ||
|  |       | Aux (Fail, _) -> let _ =  print_string "Error" in [] | ||
|  |       | Aux (Success(matched, remained), token_type) -> total_parser_aux ( | ||
|  |                                                             Success("", remained)) | ||
|  |                                                             (append list [Token(matched, token_type );]);; | ||
|  | 
 | ||
|  | 
 | ||
|  | let rec total_parser input = total_parser_aux (Success("", input)) [];; | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | (* tests
 | ||
|  | 
 | ||
|  | List.iter (print_token) (total_parser "lambda(x){let a = 2;
 | ||
|  | 
 | ||
|  | return a + x;};");;
 | ||
|  | List.iter (print_token) (total_parser "12+中34;");; | ||
|  | 
 | ||
|  | print_parse_output (parse_id (Success ("", "_")));; | ||
|  | print_parse_output (parse_id (Success ("", "_abc12c")));; | ||
|  | print_parse_output (parse_id (Success ("", "_9")));; | ||
|  | print_parse_output (parse_id (Success ("", "a_9A")));; | ||
|  | print_parse_output (parse_id (Success ("", "if")));; | ||
|  | 
 | ||
|  | 
 | ||
|  | print_parse_output (parse_float (Success ("", "+2.0;")));; | ||
|  | print_parse_output (parse_id (Success ("", "Class")));; | ||
|  | print_parse_output (parse_id (Success ("", "BIGLETTER123__")));; | ||
|  | print_parse_output (parse_id (Success ("", "12a")));; | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | print_string  ("Test 5\n");; | ||
|  | print_parse_output  (((Success ("", "+1234a"))  >>=? (match_char "+")) >>=* (match_range "0" "9"));; | ||
|  | print_parse_output  (((Success ("", "1234a"))  >>=? (match_char "+")) >>=* (match_range "0" "9"));; | ||
|  | print_parse_output  (((Success ("", "-1234a"))  >>=? (match_char "+")) >>=* (match_range "0" "9"));; | ||
|  | print_parse_output  ((Success ("", "-1234a"))  >>=* (match_range "0" "9"));; | ||
|  | 
 | ||
|  | 
 | ||
|  | print_string  ("Test 6\n");; | ||
|  | print_parse_output  ((Success ("", "+1234a"))  >>= ( (match_char "+") || (match_char "-")));; | ||
|  | print_parse_output  ((Success ("", "-1234a"))  >>= ( (match_char "+") || (match_char "-")));; | ||
|  | print_parse_output  ((Success ("", "1234a"))  >>= ( (match_char "+") || (match_char "-")));; | ||
|  | 
 | ||
|  | print_string  ("Test 7\n");; | ||
|  | print_parse_output (parse_int (Success ("", "+1234a"))); | ||
|  | print_parse_output (parse_int (Success ("", "-1234a"))); | ||
|  | print_parse_output (parse_int (Success ("", "1234a"))); | ||
|  | print_parse_output (parse_int (Success ("", "+a"))); | ||
|  | 
 | ||
|  | print_string  ("Test 8\n");; | ||
|  | print_parse_output (parse_float (Success ("", "+1234.58a"))); | ||
|  | print_parse_output (parse_float (Success ("", "-1234.58a"))); | ||
|  | print_parse_output (parse_float (Success ("", "0.0a"))); | ||
|  | print_parse_output (parse_float (Success ("", "+0.58a"))); | ||
|  | print_parse_output (parse_float (Success ("", "0.58a"))); | ||
|  | print_parse_output (parse_float (Success ("", "-0.58a"))); | ||
|  | print_parse_output (parse_float (Success ("", "1234.8a"))); | ||
|  | print_parse_output (parse_float (Success ("", "1234a"))); | ||
|  | print_parse_output (parse_float (Success ("", "+1234a"))); | ||
|  | print_parse_output (parse_float (Success ("", "-1234a"))); | ||
|  | 
 | ||
|  | print_string  ("Test 9\n");; | ||
|  | (* print_parse_output (inside_quote_mark (Success ("", "abc"))); *) | ||
|  | (* print_parse_output (inside_quote_mark (Success ("", "\"abc"))); *) | ||
|  | (* print_parse_output (inside_quote_mark (Success ("", "\\\"abc"))); *) | ||
|  | print_parse_output (parse_string (Success ("","\"123\"")));; | ||
|  | print_parse_output (parse_string (Success ("","\"12\\\"3\"")));; | ||
|  | print_parse_output (parse_string (Success ("","\"\\\"\\\"\"")));; | ||
|  | print_parse_output (parse_string (Success ("","\"\"")));; | ||
|  | 
 | ||
|  | *) |