Merge pull request #2 from raphael-proust/main
Improve performance of splitting function
This commit is contained in:
commit
6a1f3000c5
2 changed files with 33 additions and 16 deletions
|
@ -33,23 +33,22 @@ let split_string_by_unicode_codepoint str =
|
||||||
(* Split an OCaml string [str] into a `string list` *)
|
(* Split an OCaml string [str] into a `string list` *)
|
||||||
let pred_codepoint = ref (-1) in
|
let pred_codepoint = ref (-1) in
|
||||||
let segmented_unit_list = ref [] in
|
let segmented_unit_list = ref [] in
|
||||||
let iterator x y _ =
|
let iterator () y _ =
|
||||||
let _ = if !pred_codepoint > -1 then
|
let () = if !pred_codepoint > -1 then
|
||||||
let current_codepoint = y in
|
let current_codepoint = y in
|
||||||
let pred_char_len = current_codepoint - !pred_codepoint in
|
let pred_char_len = current_codepoint - !pred_codepoint in
|
||||||
let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in
|
let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in
|
||||||
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
|
segmented_unit_list := unit_substring :: !segmented_unit_list
|
||||||
unit_substring
|
in
|
||||||
else
|
let () = pred_codepoint := y in
|
||||||
"" in
|
()
|
||||||
let _ = pred_codepoint := y in x in
|
in
|
||||||
|
|
||||||
let _ = Uutf.String.fold_utf_8 iterator str str in
|
let _ = Uutf.String.fold_utf_8 iterator () str in
|
||||||
let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
|
let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
|
||||||
if last_char_len > 0 then
|
let () =
|
||||||
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
|
if last_char_len > 0 then
|
||||||
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
|
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
|
||||||
!segmented_unit_list
|
segmented_unit_list := unit_substring :: !segmented_unit_list
|
||||||
else
|
in
|
||||||
!segmented_unit_list;;
|
List.rev !segmented_unit_list;;
|
||||||
|
|
||||||
|
|
|
@ -9,3 +9,21 @@ Still simple but not just ASCII
|
||||||
«
|
«
|
||||||
—
|
—
|
||||||
»
|
»
|
||||||
|
|
||||||
|
Example from the docstring of the lib
|
||||||
|
$ ./stringCodepointSplitterTest.exe "m̄知 who you're."
|
||||||
|
m
|
||||||
|
̄
|
||||||
|
知
|
||||||
|
|
||||||
|
w
|
||||||
|
h
|
||||||
|
o
|
||||||
|
|
||||||
|
y
|
||||||
|
o
|
||||||
|
u
|
||||||
|
'
|
||||||
|
r
|
||||||
|
e
|
||||||
|
.
|
||||||
|
|
Loading…
Reference in a new issue