Compare commits

..

12 commits
v0.0.1 ... main

Author SHA1 Message Date
c1611f3b55 Merge branch 'main' of github.com:Yoxem/stringCodepointSplitter 2023-08-31 19:13:32 +08:00
8b93467985 add changes 2023-08-31 19:12:14 +08:00
Tan, Kian-ting
4226b6bb40
Update README.md
dep version correcting
2023-08-31 07:19:18 +08:00
Tan, Kian-ting
6a1f3000c5
Merge pull request #2 from raphael-proust/main
Improve performance of splitting function
2023-08-30 00:02:54 +08:00
11fbe53e74
Merge pull request #1 from raphael-proust/main
Remove findlib dynlink
2023-08-29 23:10:21 +08:00
aeee3f79cc
Update stringCodepointSplitter.ml 2023-08-29 22:32:58 +08:00
6ca99f7399
Update stringCodepointSplitter.ml 2023-08-29 22:30:30 +08:00
0bb465f663
Update LICENSE 2023-08-29 22:30:05 +08:00
Raphaël Proust
a9659a7b7e more test 2023-08-29 15:45:33 +02:00
Raphaël Proust
30bb11b988 Optimise segmentation
Avoid costly `@` and prefer one-time reversal.
2023-08-29 15:45:04 +02:00
Raphaël Proust
67acf25cd7 Replacing dynlink by just using dune to link at compile time 2023-08-29 15:38:36 +02:00
Raphaël Proust
1a4afd5d41 Adding tests to make sure things work 2023-08-29 15:38:23 +02:00
11 changed files with 68 additions and 30 deletions

View file

@ -1,2 +1,6 @@
## v0.0.2 (2023-08-30)
- add more tests
- optimize the list processing
- disable dynamic linking to uutf
## v0.0.1 (2023-08-27)
- add `split_string_by_unicode_codepoint` initially

View file

@ -1,4 +1,4 @@
Copyright 2023 Tan Kian-ting
Copyright 2023 Tan Kian-ting & Raphaël Proust
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

View file

@ -5,7 +5,7 @@ Split a string to a list of strings of a character by the unicode codepoint.
It requires module Uutf.
## Dependencies
- OCaml >= 4.13
- OCaml >= 4.06
- dune
- uutf
- findlib

View file

@ -1,7 +1,8 @@
(lang dune 3.9)
(cram enable)
(name stringCodepointSplitter)
(version 0.0.1)
(version 0.0.2)
(generate_opam_files true)
(source
@ -21,7 +22,7 @@
(description "Split a string to a list of strings of a character by the unicode codepoint.
It requires module Uutf.")
(depends ocaml ocamlfind dune uutf)
(depends (ocaml (>= 4.06)) ocamlfind dune uutf)
(tags (string utf8)))
; See the complete stanza docs at https://dune.readthedocs.io/en/stable/dune-files.html#dune-project

View file

@ -1,4 +1,4 @@
(library
(name stringCodepointSplitter)
(public_name stringCodepointSplitter)
(libraries uutf findlib.dynload))
(libraries uutf))

View file

@ -1,6 +1,6 @@
open Stdlib
(*
(c) Tan Kian-ting 2023
(c) 2023 Tan Kian-ting (main author) & Raphaël Proust (PR giver)
Under MIT License
习包子 梁家河小学博士 清零宗 习炀帝 庆丰大帝
@ -13,9 +13,6 @@ Under MIT License
It only contains [split_string_by_unicode_codepoint], which splits an OCaml string [str] to a [string list]
*)
let _ = Findlib.init ();;
Fl_dynload.load_packages ["uutf"];;
(** Split an OCaml string [str] to a [string list]
@ -36,23 +33,22 @@ let split_string_by_unicode_codepoint str =
(* Split an OCaml string [str] into a [string list] *)
let pred_codepoint = ref (-1) in
let segmented_unit_list = ref [] in
let iterator x y _ =
let _ = if !pred_codepoint > -1 then
let iterator () y _ =
let () = if !pred_codepoint > -1 then
let current_codepoint = y in
let pred_char_len = current_codepoint - !pred_codepoint in
let unit_substring = Stdlib.String.sub x !pred_codepoint pred_char_len in
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
unit_substring
else
"" in
let _ = pred_codepoint := y in x in
let unit_substring = Stdlib.String.sub str !pred_codepoint pred_char_len in
segmented_unit_list := unit_substring :: !segmented_unit_list
in
let () = pred_codepoint := y in
()
in
let _ = Uutf.String.fold_utf_8 iterator str str in
let _ = Uutf.String.fold_utf_8 iterator () str in
let last_char_len = (Stdlib.String.length str) - !pred_codepoint in
if last_char_len > 0 then
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
let _ = segmented_unit_list := !segmented_unit_list @ [unit_substring] in
!segmented_unit_list
else
!segmented_unit_list;;
let () =
if last_char_len > 0 then
let unit_substring = Stdlib.String.sub str !pred_codepoint last_char_len in
segmented_unit_list := unit_substring :: !segmented_unit_list
in
List.rev !segmented_unit_list;;

View file

@ -1,6 +1,6 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
version: "0.0.1"
version: "0.0.2"
synopsis:
"Split a string to a list of strings of a character by the unicode codepoint"
description: """
@ -13,7 +13,7 @@ tags: ["string" "utf8"]
homepage: "https://github.com/yoxem/stringCodepointSplitter"
bug-reports: "https://github.com/Yoxem/stringCodepointSplitter/issues"
depends: [
"ocaml"
"ocaml" {>= "4.06"}
"ocamlfind"
"dune" {>= "3.9"}
"uutf"

View file

@ -1,2 +1,6 @@
(test
(name stringCodepointSplitter))
(executable
(libraries stringCodepointSplitter)
(name stringCodepointSplitterTest))
(cram
(deps ./stringCodepointSplitterTest.exe))

View file

@ -0,0 +1,29 @@
ASCII only
$ ./stringCodepointSplitterTest.exe abc
a
b
c
Still simple but not just ASCII
$ ./stringCodepointSplitterTest.exe «»
«
»
Example from the docstring of the lib
$ ./stringCodepointSplitterTest.exe "m̄知 who you're."
m
̄
w
h
o
y
o
u
'
r
e
.

View file

@ -0,0 +1,4 @@
(* Test driver used by the cram tests: takes the first command-line
   argument, splits it with
   [StringCodepointSplitter.split_string_by_unicode_codepoint], and prints
   each resulting piece on its own line.
   Raises [Invalid_argument] if no argument is supplied, because
   [Sys.argv.(1)] is then out of bounds. *)
let () =
  let input = Sys.argv.(1) in
  let pieces =
    StringCodepointSplitter.split_string_by_unicode_codepoint input
  in
  List.iter print_endline pieces