Optimise segmentation

Avoid costly `@` list appends inside the fold; prepend with `::` instead and reverse the list once at the end.
This commit is contained in:
Raphaël Proust 2023-08-29 15:45:04 +02:00
parent 67acf25cd7
commit 30bb11b988

View file

@@ -33,23 +33,22 @@ let split_string_by_unicode_codepoint str =
(*Split an OCaml string [str] into a `string list` *) (*Split an OCaml string [str] into a `string list` *)
let pred_codepoint = ref (-1) in let pred_codepoint = ref (-1) in
let segmented_unit_list = ref [] in let segmented_unit_list = ref [] in
let iterator x y _ = let iterator () y _ =
let _ = if !pred_codepoint > -1 then let () = if !pred_codepoint > -1 then
let current_codepoint = y in let current_codepoint = y in
let pred_char_len = current_codepoint - !pred_codepoint in let pred_char_len = current_codepoint - !pred_codepoint in
let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in segmented_unit_list := unit_substring :: !segmented_unit_list
unit_substring in
else let () = pred_codepoint := y in
"" in ()
let _ = pred_codepoint := y in x in in
let _ = Uutf.String.fold_utf_8 iterator str str in let _ = Uutf.String.fold_utf_8 iterator () str in
let last_char_len = (Stdlib.String.length str) - !pred_codepoint in let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
if last_char_len > 0 then let () =
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in if last_char_len > 0 then
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
!segmented_unit_list segmented_unit_list := unit_substring :: !segmented_unit_list
else in
!segmented_unit_list;; List.rev !segmented_unit_list;;