Merge pull request #2 from raphael-proust/main
Improve performance of splitting function
This commit is contained in:
		
						commit
						6a1f3000c5
					
				
					 2 changed files with 33 additions and 16 deletions
				
			
		| 
						 | 
					@ -33,23 +33,22 @@ let split_string_by_unicode_codepoint str =
 | 
				
			||||||
(* Split an OCaml string [str] into a `string list` *)
 | 
					(* Split an OCaml string [str] into a `string list` *)
 | 
				
			||||||
  let pred_codepoint = ref (-1) in
 | 
					  let pred_codepoint = ref (-1) in
 | 
				
			||||||
  let segmented_unit_list = ref [] in
 | 
					  let segmented_unit_list = ref [] in
 | 
				
			||||||
  let iterator x y _ =
 | 
					  let iterator () y _ =
 | 
				
			||||||
    let _ = if  !pred_codepoint > -1 then
 | 
					    let () = if  !pred_codepoint > -1 then
 | 
				
			||||||
      let current_codepoint = y in
 | 
					      let current_codepoint = y in
 | 
				
			||||||
      let pred_char_len = current_codepoint - !pred_codepoint in
 | 
					      let pred_char_len = current_codepoint - !pred_codepoint in
 | 
				
			||||||
      let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in
 | 
					      let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in
 | 
				
			||||||
      let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
 | 
					      segmented_unit_list := unit_substring :: !segmented_unit_list
 | 
				
			||||||
      unit_substring
 | 
					    in
 | 
				
			||||||
    else
 | 
					    let () =  pred_codepoint := y in
 | 
				
			||||||
      "" in
 | 
					    ()
 | 
				
			||||||
    let _ =  pred_codepoint := y in x in
 | 
					  in
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  let _ = Uutf.String.fold_utf_8 iterator str str in
 | 
					  let _ = Uutf.String.fold_utf_8 iterator () str in
 | 
				
			||||||
  let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
 | 
					  let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
 | 
				
			||||||
 | 
					  let () =
 | 
				
			||||||
    if last_char_len > 0 then
 | 
					    if last_char_len > 0 then
 | 
				
			||||||
      let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
 | 
					      let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
 | 
				
			||||||
    let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
 | 
					      segmented_unit_list := unit_substring :: !segmented_unit_list
 | 
				
			||||||
    !segmented_unit_list
 | 
					  in
 | 
				
			||||||
  else
 | 
					  List.rev !segmented_unit_list;;
 | 
				
			||||||
    !segmented_unit_list;;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,3 +9,21 @@ Still simple but not just ASCII
 | 
				
			||||||
  «
 | 
					  «
 | 
				
			||||||
  —
 | 
					  —
 | 
				
			||||||
  »
 | 
					  »
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Example from the docstring of the lib
 | 
				
			||||||
 | 
					  $ ./stringCodepointSplitterTest.exe "m̄知 who you're."
 | 
				
			||||||
 | 
					  m
 | 
				
			||||||
 | 
					  ̄
 | 
				
			||||||
 | 
					  知
 | 
				
			||||||
 | 
					   
 | 
				
			||||||
 | 
					  w
 | 
				
			||||||
 | 
					  h
 | 
				
			||||||
 | 
					  o
 | 
				
			||||||
 | 
					   
 | 
				
			||||||
 | 
					  y
 | 
				
			||||||
 | 
					  o
 | 
				
			||||||
 | 
					  u
 | 
				
			||||||
 | 
					  '
 | 
				
			||||||
 | 
					  r
 | 
				
			||||||
 | 
					  e
 | 
				
			||||||
 | 
					  .
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue