Merge pull request #2 from raphael-proust/main

Improve performance of splitting function
This commit is contained in:
Tan, Kian-ting 2023-08-30 00:02:54 +08:00 committed by GitHub
commit 6a1f3000c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 16 deletions

View file

@ -33,23 +33,22 @@ let split_string_by_unicode_codepoint str =
(*Split a Ocaml string [str] to a `str list` *) (*Split a Ocaml string [str] to a `str list` *)
let pred_codepoint = ref (-1) in let pred_codepoint = ref (-1) in
let segmented_unit_list = ref [] in let segmented_unit_list = ref [] in
let iterator x y _ = let iterator () y _ =
let _ = if !pred_codepoint > -1 then let () = if !pred_codepoint > -1 then
let current_codepoint = y in let current_codepoint = y in
let pred_char_len = current_codepoint - !pred_codepoint in let pred_char_len = current_codepoint - !pred_codepoint in
let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in segmented_unit_list := unit_substring :: !segmented_unit_list
unit_substring in
else let () = pred_codepoint := y in
"" in ()
let _ = pred_codepoint := y in x in in
let _ = Uutf.String.fold_utf_8 iterator str str in let _ = Uutf.String.fold_utf_8 iterator () str in
let last_char_len = (Stdlib.String.length str) - !pred_codepoint in let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
if last_char_len > 0 then let () =
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in if last_char_len > 0 then
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
!segmented_unit_list segmented_unit_list := unit_substring :: !segmented_unit_list
else in
!segmented_unit_list;; List.rev !segmented_unit_list;;

View file

@ -9,3 +9,21 @@ Still simple but not just ASCII
« «
» »
Example from the docstring of the lib
$ ./stringCodepointSplitterTest.exe "m̄知 who you're."
m
̄
w
h
o
y
o
u
'
r
e
.