diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f22a73b --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +doc : docs stringCodepointSplitter.ml + ocamlfind ocamldoc -package uutf -html -charset=utf-8 stringCodepointSplitter.ml -d docs + +docs : + mkdir docs \ No newline at end of file diff --git a/docs/StringCodepointSplitter.html b/docs/StringCodepointSplitter.html new file mode 100644 index 0000000..7454207 --- /dev/null +++ b/docs/StringCodepointSplitter.html @@ -0,0 +1,41 @@ + + +
+ + + + + + + +module StringCodepointSplitter:sig
..end
The Module needs Uutf
Module.
It only contains split_string_by_unicode_codepoint
, which splits an OCaml string str
to a string list
val split_string_by_unicode_codepoint : string -> string list
Split an OCaml string str
to a string list
Arguments
+str
the string to be split.
+ Example
+ +let units = split_string_by_unicode_codepoint "m̄知 who you're." (*don't know who you are*) in
+
+List.iter (fun x -> print_string (x ^ ", ")) units;;
+
+(*it will output : "m, ̄, 知,  , w, h, o,  , y, o, u, ', r, e, ., "*)
StringCodepointSplitter |
S | |
StringCodepointSplitter | +
S | |
split_string_by_unicode_codepoint [StringCodepointSplitter] | +
+
+Split an OCaml string |
sig end
diff --git a/stringCodepointSplitter.ml b/stringCodepointSplitter.ml
new file mode 100644
index 0000000..4464364
--- /dev/null
+++ b/stringCodepointSplitter.ml
@@ -0,0 +1,54 @@
+(*#use "topfind";;*)
+open Stdlib
+open Uutf
+
+(**
+ This module depends on the [Uutf] library.
+
+ Its only function is [split_string_by_unicode_codepoint], which splits an OCaml string [str] into a [string list]
+*)
+
+
+(** [split_string_by_unicode_codepoint str] splits the UTF-8 string [str] into a
+    [string list] of consecutive byte slices, in order of appearance.
+
+    There is one slice per step of [Uutf.String.fold_utf_8]; only the byte
+    offset passed to each step is used, the decoded value is ignored.
+    Combining marks are code points of their own, so they come out as separate
+    elements (see the example). The empty string yields [[]].
+
+    {b Arguments}
+    {ul
+    {- [str] the string to be split.}}
+
+    {b Example}
+
+{[let units = split_string_by_unicode_codepoint "m̄知 who you're." (*don't know who you are*) in
+List.iter (fun x -> print_string (x ^ ", ")) units;;
+
+(*it will output : "m, ̄, 知,  , w, h, o,  , y, o, u, ', r, e, ., "*)]}
+*)
+let split_string_by_unicode_codepoint str =
+  (* [open Uutf] shadows [String], hence the explicit [Stdlib.String]. *)
+  let slice start stop = Stdlib.String.sub str start (stop - start) in
+  (* Fold state: byte offset where the pending unit starts, plus the finished
+     units in reverse order. Consing and reversing once at the end keeps the
+     pass linear; appending with [@] on every step would be quadratic. *)
+  let step (pending, units) pos _decoded =
+    match pending with
+    | None -> (Some pos, units)
+    | Some start -> (Some pos, slice start pos :: units)
+  in
+  match Uutf.String.fold_utf_8 step (None, []) str with
+  | None, _ ->
+      (* Empty input: no step ran, so there is no start offset to slice
+         from. Slicing anyway would raise [Invalid_argument]. *)
+      []
+  | Some start, units ->
+      (* The last unit runs from its start offset to the end of [str]. *)
+      List.rev (slice start (Stdlib.String.length str) :: units);;
+
+
+(* Demo run at module load: prints every unit of the sample string followed by ", ". *)
+let () = List.iter (fun x -> print_string (x ^ ", ")) (split_string_by_unicode_codepoint "m̄知 who you're.");;
+