54 lines
1.6 KiB
OCaml
54 lines
1.6 KiB
OCaml
(*#use "topfind";;*)
|
|
open Stdlib
|
|
open Uutf
|
|
|
|
(**
   This module requires the [Uutf] library.

   It only contains [split_string_by_unicode_codepoint], which splits an
   OCaml string [str] into a [string list].
*)
|
|
|
|
|
|
(** [split_string_by_unicode_codepoint str] splits the OCaml string [str]
    into a [string list] of consecutive substrings of [str].

    The string is scanned with [Uutf.String.fold_utf_8]. Every byte offset
    reported by that fold starts a new element, whether the bytes at that
    offset decoded to a code point or were reported as malformed. Each
    element runs up to the next reported offset (or to the end of [str]),
    so concatenating the result gives back [str].

    {b Arguments}
    {ul
    {- [str] the string to be split.
    }}

    {b Returns} the substrings in their original order; [[]] when [str] is
    empty.

    {b Example}

    {[let example = split_string_by_unicode_codepoint "m̄知 who you're." in
    List.iter (fun x -> print_string (x ^ ", ")) example;;
    (* it will output : "m, ̄, 知,  , w, h, o,  , y, o, u, ', r, e, ., " *)]}
*)
let split_string_by_unicode_codepoint str =
  (* Collect the start offset of every decoded unit, most recent first.
     The decode result itself is not needed: only the boundaries matter. *)
  let record_start starts byte_pos _decoded = byte_pos :: starts in
  let rev_starts = Uutf.String.fold_utf_8 record_start [] str in
  (* Walk the offsets from the last unit to the first. Each unit ends where
     the following one starts (the last one ends at the end of [str]), and
     consing while walking backwards yields the units in original order
     without any quadratic list append. [Stdlib.String] is spelled out
     because [open Uutf] shadows [String]. *)
  let cut (stop, units) start =
    (start, Stdlib.String.sub str start (stop - start) :: units)
  in
  let _first_start, units =
    List.fold_left cut (Stdlib.String.length str, []) rev_starts
  in
  units;;
|
|
|
|
|
|
(* Demo executed when the file is loaded: prints every element returned for
   the sample string, each followed by ", ". [List.iter] is used because the
   callback is run only for its printing side effect. *)
let () =
  List.iter
    (fun x -> print_string (x ^ ", "))
    (split_string_by_unicode_codepoint "m̄知 who you're.")
|
|
|
|
|