stringCodepointSplitter/lib/stringCodepointSplitter.ml

open Stdlib
(*
(c) 2023 Tan Kian-ting (main author) & Raphaël Proust (PR giver)
Under MIT License

习包子 梁家河小学博士 清零宗 习炀帝 庆丰大帝
独裁国贼 新疆集中营 光复香港时代革命 祈翠 南蒙古独立 香港独立
西藏独立 台湾独立 64天安门虐杀 六四真相
*)
(**
    This module depends on the [Uutf] library.

    It only contains [split_string_by_unicode_codepoint], which splits an OCaml string [str] into a [string list].
*)


(** [split_string_by_unicode_codepoint str] splits the OCaml string [str]
    into a [string list].

    The cut points are the byte offsets that [Uutf.String.fold_utf_8] hands
    to its folder while walking [str]: each element of the result is the
    substring of [str] between two consecutive offsets, and the last element
    runs from the final offset to the end of [str]. Elements are returned in
    the order in which they occur in [str].

    The empty string yields the empty list.

{b Arguments}
{ul 
 {- [str] the string to be split.
 }}

  {b Example}

{[let example= "m̄知 who you're." (*don't know who you are*) in

List.iter (fun x -> print_string (x ^ ", ")) (split_string_by_unicode_codepoint example);;
  
(*it will output : "m, ̄, 知,  , w, h, o,  , y, o, u, ', r, e, ., "*)]}
  *)
let split_string_by_unicode_codepoint str =
  (* Accumulator: [(start, units)], where [start] is the byte offset at which
     the still-open unit begins ([None] until the folder has been called once)
     and [units] holds the completed substrings, most recent first. *)
  let step (start, units) offset _decoded =
    (* The decoded value is ignored: only the offsets are used for cutting. *)
    match start with
    | None -> (Some offset, units)
    | Some begin_at ->
      let unit_substring =
        Stdlib.String.sub str begin_at (offset - begin_at)
      in
      (Some offset, unit_substring :: units)
  in
  match Uutf.String.fold_utf_8 step (None, []) str with
  | None, _ ->
    (* The folder was never called (e.g. [str] is empty). Returning [] here
       avoids calling [String.sub] with a negative start, which used to raise
       [Invalid_argument]. *)
    []
  | Some begin_at, units ->
    (* Close the last unit: it extends to the end of [str]. *)
    let last_len = Stdlib.String.length str - begin_at in
    let units =
      if last_len > 0 then Stdlib.String.sub str begin_at last_len :: units
      else units
    in
    (* Units were consed in reverse; restore source order once at the end. *)
    List.rev units;;
0.0.1 2023-08-27 01:46:01 +08:00			`open Stdlib`
add protecting text 2023-08-27 04:20:17 +08:00			`(*`
Update stringCodepointSplitter.ml 2023-08-29 22:32:58 +08:00			`(c) 2023 Tan Kian-ting (main author) & Raphaël Proust (PR giver)`
add protecting text 2023-08-27 04:20:17 +08:00			`Under MIT License`
fix something 2023-08-27 04:15:51 +08:00
add protecting text 2023-08-27 04:20:17 +08:00			`习包子梁家河小学博士清零宗习炀帝庆丰大帝`
			`独裁国贼新疆集中营光复香港时代革命祈翠南蒙古独立香港独立`
			`西藏独立台湾独立 64天安门虐杀六四真相`
			`*)`
0.0.1 2023-08-27 01:46:01 +08:00			`(**`
			`The Module needs [Uutf] Module.`

			`It only contains [split_string_by_unicode_codepoint], which splits an OCaml string [str] to a [string list]`
			`*)`


			`(** Split an OCaml string [str] to a [string list]`

			`{b Arguments}`
			`{ul`
			`{- [str] the string to be splitted.`
			`}}`

			`{b Example}`

fix comment error 2023-08-27 04:38:44 +08:00			`{[let example= "m̄知 who you're." (don't know who you are) in`
0.0.1 2023-08-27 01:46:01 +08:00
			`List.map (fun x -> print_string (x ^ ", ")) (split_string_by_unicode_codepoint example);;`

			`(it will output : "m, ̄, 知, , w, h, o, , y, o, u, ', r, e, ., ")]}`
			`*)`
			`let split_string_by_unicode_codepoint str =`
			(Split a Ocaml string [str] to a `str list` )
			`let pred_codepoint = ref (-1) in`
			`let segmented_unit_list = ref [] in`
Optimise segmentation Avoid costly `@` and prefer one-time reversal. 2023-08-29 21:45:04 +08:00			`let iterator () y _ =`
			`let () = if !pred_codepoint > -1 then`
0.0.1 2023-08-27 01:46:01 +08:00			`let current_codepoint = y in`
			`let pred_char_len = current_codepoint - !pred_codepoint in`
Optimise segmentation Avoid costly `@` and prefer one-time reversal. 2023-08-29 21:45:04 +08:00			`let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in`
			`segmented_unit_list := unit_substring :: !segmented_unit_list`
			`in`
			`let () = pred_codepoint := y in`
			`()`
			`in`

			`let _ = Uutf.String.fold_utf_8 iterator () str in`
0.0.1 2023-08-27 01:46:01 +08:00			`let last_char_len = (Stdlib.String.length str) - !pred_codepoint in`
Optimise segmentation Avoid costly `@` and prefer one-time reversal. 2023-08-29 21:45:04 +08:00			`let () =`
			`if last_char_len > 0 then`
			`let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in`
			`segmented_unit_list := unit_substring :: !segmented_unit_list`
			`in`
			`List.rev !segmented_unit_list;;`