uahgi/src/passes.jl
Tan Kian-ting f216906b41
Some checks failed
CI / Julia 1.6 - ubuntu-latest - x64 (push) Has been cancelled
CI / Julia 1.7 - ubuntu-latest - x64 (push) Has been cancelled
CI / Julia pre - ubuntu-latest - x64 (push) Has been cancelled
add some function incl. cjk+latn passes
2025-02-04 00:52:37 +08:00

97 lines
No EOL
3 KiB
Julia

module Passes
include("classes.jl")
using .Classes
export processed_passes, Pass
"""
    Pass(pattern, func)

One rewriting pass: something to look for plus a function producing its replacement.

# Fields
- `pattern`: the sequence of items to match. Every pattern built in this module is a
  `Vector` of `Classes` items.
- `func`: a one-argument callable. The pass functions defined in this module index
  their argument positionally and return a vector of replacement items.
  NOTE(review): the code that applies a pass is not in this file; presumably `func`
  receives the items matched by `pattern` -- confirm against the caller.
"""
struct Pass
    pattern
    func
end

# Registry of every pass defined by this module, in registration order.
# It is `const` and typed so that readers of the global get a concrete `Vector{Pass}`
# instead of an untyped, rebindable `Vector{Any}`; `push!` still works as before.
const processed_passes = Pass[]
#### Definition of passes ####
# Two consecutive newlines become @par{}.

"""
    two_nl_to_par_pass_func(two_nl)

Return the replacement for a pair of newlines: a one-element vector holding a
`Classes.SEQ` that wraps the identifier `par`. The argument is not inspected.
"""
function two_nl_to_par_pass_func(two_nl)
    par_id = Classes.ID("par")
    return [Classes.SEQ([par_id])]
end

# Pattern: two `NL` items in a row.
two_nl_to_par_pattern = [
    Classes.NL([]),
    Classes.NL([]),
]
two_nl_to_par_pass = Pass(two_nl_to_par_pattern, two_nl_to_par_pass_func)
push!(processed_passes, two_nl_to_par_pass)
# A hyphen between two non-hyphen characters becomes a `disc` sequence.

"""
    discretize_hyphen(two_nl)

Rewrite a three-item match (character, hyphen, character).

Despite its name, `two_nl` holds three items here. The first and third are kept
unchanged; the middle one is replaced by a `Classes.SEQ` of four `Classes.ELE`
entries: the identifier `disc`, the character `-`, an empty element, and the
character `-` again.

Returns the concatenation `[first; seq; third]`.
"""
function discretize_hyphen(two_nl)
    disc_parts = [
        Classes.ELE([Classes.ID("disc")]),
        Classes.ELE([Classes.CHAR("-")]),
        Classes.ELE([]),
        Classes.ELE([Classes.CHAR("-")]),
    ]
    disc_seq = Classes.SEQ(disc_parts)
    return vcat(two_nl[1], disc_seq, two_nl[3])
end

# Pattern: non-hyphen, hyphen, non-hyphen.
hyphen_pattern = [
    Classes.CHAR(r"[^\-]"),
    Classes.CHAR(r"[-]"),
    Classes.CHAR(r"[^\-]"),
]
hyphen_disc_pass = Pass(hyphen_pattern, discretize_hyphen)
push!(processed_passes, hyphen_disc_pass)
# Between a Han character and a neighbouring "Latin" character (anything that is
# neither Han nor one of the listed punctuation marks), in either order, insert
# `cjk_lat_spacing`.

"""
    insert_hglue_in_adjacent_cjk_lat(two_nl)

Return the two items of `two_nl` with `Classes.ID("cjk_lat_spacing")` placed
between them, as the concatenation `[first; id; second]`.
"""
function insert_hglue_in_adjacent_cjk_lat(two_nl)
    inner = Classes.ID("cjk_lat_spacing")
    return [two_nl[1]; inner; two_nl[2]]
end

# In the negated classes below, `\]` and `\[` are escaped on purpose: an unescaped
# `]` in the middle of a character class closes the class, which would turn the rest
# of the list into a literal sequence that no single character can match.
# NOTE(review): the punctuation listed here is ASCII (`, ! ? : ] } [ {`); if the
# fullwidth forms were intended they have to be added -- confirm.

# Han followed by a non-Han, non-punctuation character.
adjacent_cjk_lat_pattern = [
    Classes.CHAR(r"[\p{Han}]"),
    Classes.CHAR(r"[^\p{Han}·,。!?:」』》】\]〕』〗〉}…—「『《〔\[【『〖〈{]"),
]
# Non-Han, non-punctuation character followed by Han.
adjacent_cjk_lat_pattern2 = [
    Classes.CHAR(r"[^\p{Han}·,。!?:」』》】\]〕』〗〉}…—「『《〔\[【『〖〈{]"),
    Classes.CHAR(r"[\p{Han}]"),
]

adjacent_cjk_lat_pass = Pass(adjacent_cjk_lat_pattern, insert_hglue_in_adjacent_cjk_lat)
push!(processed_passes, adjacent_cjk_lat_pass)
adjacent_cjk_lat2_pass = Pass(adjacent_cjk_lat_pattern2, insert_hglue_in_adjacent_cjk_lat)
push!(processed_passes, adjacent_cjk_lat2_pass)
# Between two adjacent Han characters, insert `cjk_spacing`.

"""
    insert_hglue_in_adjacent_cjk(two_nl)

Return the two items of `two_nl` with `Classes.ID("cjk_spacing")` placed between
them, as the concatenation `[first; id; second]`.
"""
function insert_hglue_in_adjacent_cjk(two_nl)
    spacing = Classes.ID("cjk_spacing")
    return vcat(two_nl[1], spacing, two_nl[2])
end

# Pattern: Han character followed by Han character.
adjacent_cjk_pattern = [
    Classes.CHAR(r"[\p{Han}]"),
    Classes.CHAR(r"[\p{Han}]"),
]
adjacent_cjk_pass = Pass(adjacent_cjk_pattern, insert_hglue_in_adjacent_cjk)
push!(processed_passes, adjacent_cjk_pass)
# CJK line-breaking rules around punctuation (Japanese: "kinsoku shori").
# Each pattern below is a pair of single-character classes; the matching pass
# inserts `cjk_spacing` between the two characters via `insert_hglue_in_adjacent_cjk`.
#
# `\]` and `\[` are escaped inside the classes: an unescaped `]` would close the
# class early and the remaining characters would be read as a literal sequence.
# NOTE(review): the punctuation listed here is ASCII (`, ! ? : ] } [ {`); if the
# fullwidth forms were intended they have to be added -- confirm.

# A closing/terminal punctuation mark followed by anything that is not one.
adjacent_cjk_punc_pattern = [
    Classes.CHAR(r"[·,。!?:」』》】\]〕』〗〉}]"),
    Classes.CHAR(r"[^·,。!?:」』》】\]〕』〗〉}]"),
]
# Anything that is not an opening bracket/quote, followed by one.
adjacent_cjk_punc_pattern2 = [
    Classes.CHAR(r"[^「『《〔\[【『〖〈{]"),
    Classes.CHAR(r"[「『《〔\[【『〖〈{]"),
]
# An ellipsis or dash followed by anything that is not one.
adjacent_cjk_punc_pattern3 = [
    Classes.CHAR(r"[…—]"),
    Classes.CHAR(r"[^…—]"),
]

# The first pass uses `adjacent_cjk_punc_pattern`. It previously reused
# `adjacent_cjk_pattern`, which only registered the Han-Han pass a second time and
# left the punctuation pattern unused.
adjacent_glue_pass = Pass(adjacent_cjk_punc_pattern, insert_hglue_in_adjacent_cjk)
push!(processed_passes, adjacent_glue_pass)
adjacent_glue_pass2 = Pass(adjacent_cjk_punc_pattern2, insert_hglue_in_adjacent_cjk)
push!(processed_passes, adjacent_glue_pass2)
adjacent_glue_pass3 = Pass(adjacent_cjk_punc_pattern3, insert_hglue_in_adjacent_cjk)
push!(processed_passes, adjacent_glue_pass3)
end