Merge pull request #2 from raphael-proust/main

Improve performance of splitting function
This commit is contained in:
Tan, Kian-ting 2023-08-30 00:02:54 +08:00 committed by GitHub
commit 6a1f3000c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 16 deletions

View file

@ -33,23 +33,22 @@ let split_string_by_unicode_codepoint str =
(* Split an OCaml string [str] into a `string list`, one element per Unicode code point *)
let pred_codepoint = ref (-1) in
let segmented_unit_list = ref [] in
let iterator x y _ =
let _ = if !pred_codepoint > -1 then
let iterator () y _ =
let () = if !pred_codepoint > -1 then
let current_codepoint = y in
let pred_char_len = current_codepoint - !pred_codepoint in
let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
unit_substring
else
"" in
let _ = pred_codepoint := y in x in
let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in
segmented_unit_list := unit_substring :: !segmented_unit_list
in
let () = pred_codepoint := y in
()
in
let _ = Uutf.String.fold_utf_8 iterator str str in
let _ = Uutf.String.fold_utf_8 iterator () str in
let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
let () =
if last_char_len > 0 then
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
!segmented_unit_list
else
!segmented_unit_list;;
segmented_unit_list := unit_substring :: !segmented_unit_list
in
List.rev !segmented_unit_list;;

View file

@ -9,3 +9,21 @@ Still simple but not just ASCII
«
»
Example from the docstring of the lib
$ ./stringCodepointSplitterTest.exe "m̄知 who you're."
m
̄
w
h
o
y
o
u
'
r
e
.