From 30bb11b9889e94f140272c9a797eb9b36d1c3168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Proust?= Date: Tue, 29 Aug 2023 15:45:04 +0200 Subject: [PATCH] Optimise segmentation Avoid costly `@` and prefer one-time reversal. --- lib/stringCodepointSplitter.ml | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/lib/stringCodepointSplitter.ml b/lib/stringCodepointSplitter.ml index e5657b5..da9d636 100644 --- a/lib/stringCodepointSplitter.ml +++ b/lib/stringCodepointSplitter.ml @@ -33,23 +33,22 @@ let split_string_by_unicode_codepoint str = (*Split a Ocaml string [str] to a `str list` *) let pred_codepoint = ref (-1) in let segmented_unit_list = ref [] in - let iterator x y _ = - let _ = if !pred_codepoint > -1 then + let iterator () y _ = + let () = if !pred_codepoint > -1 then let current_codepoint = y in let pred_char_len = current_codepoint - !pred_codepoint in - let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in - let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in - unit_substring - else - "" in - let _ = pred_codepoint := y in x in + let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in + segmented_unit_list := unit_substring :: !segmented_unit_list + in + let () = pred_codepoint := y in + () + in - let _ = Uutf.String.fold_utf_8 iterator str str in + let _ = Uutf.String.fold_utf_8 iterator () str in let last_char_len = (Stdlib.String.length str) - !pred_codepoint in - if last_char_len > 0 then - let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in - let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in - !segmented_unit_list - else - !segmented_unit_list;; - + let () = + if last_char_len > 0 then + let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in + segmented_unit_list := unit_substring :: !segmented_unit_list + in + List.rev !segmented_unit_list;;