add hyphenation algorithm
Some checks are pending
CI / Julia 1.6 - ubuntu-latest - x64 (push) Waiting to run
CI / Julia 1.7 - ubuntu-latest - x64 (push) Waiting to run
CI / Julia pre - ubuntu-latest - x64 (push) Waiting to run

This commit is contained in:
Tan, Kian-ting 2025-01-27 23:24:47 +08:00
parent 52d4b1b3cb
commit c1b4a45d6a
7 changed files with 231 additions and 15 deletions

15
docs/grammar.md Normal file
View file

@ -0,0 +1,15 @@
語法:
變數輸入法:
`@varname`
後面如果接英文字,要加`%%`,變成`@varname%%english blah`
註解:`%comment%`,註解內可以換行。
輸入指令:
`{@foo|bar}`
指定斷字語言:
`{@lang|en}`等,要輸入於第一行才有效。

View file

@ -4,4 +4,16 @@
{@foo} {@foo}
{@foo|@bar} {@foo|@bar}
貓咪的眼睛,
狐狸的耳朵。
我是
貓,還沒有名字。@foo%我是註釋
%
{@foo|@bar|12\|}

View file

@ -1,11 +1,4 @@
貓咪的眼睛, {@lang|en}
therapy of communication and pronounciation123%comment
anotherline% processing
狐狸的耳朵。
我是
貓,還沒有名字。@foo%我是註釋
%
{@foo|@bar|12\|}

37
src/hyphenRules/en.jl Normal file

File diff suppressed because one or more lines are too long

149
src/hyphenating.jl Normal file
View file

@ -0,0 +1,149 @@
"""
Hyphenation using the Knuth–Liang pattern-matching algorithm.
"""
module Hyphenating
using Match
# Short alias for the module holding the AST node types used below
# (`c.ID`, `c.CHAR`, `c.ELE`, `c.SEQ`, `c.PROG`). Declared `const` so the
# binding has a fixed type; a non-const global is looked up as `Any` on every
# access from the functions in this module.
const c = Main.uahgi.Parsing.Passes.Classes
"""
    match_lang(item)

Return the language code carried by `item` when it has the shape of a
`{@lang|xx}` command, i.e. a `SEQ` made of `ELE([ID("lang")])` followed by an
`ELE` holding exactly one `CHAR`. Return `false` for any other item.
"""
function match_lang(item)
    return @match item begin
        c.SEQ([c.ELE([c.ID("lang")]), c.ELE([c.CHAR(code)])]) => code
        _ => false
    end
end
"""
    check_lang(ast)

Return the hyphenation language declared by the first top-level item of
`ast.val` (the parsed `.ug` file), or `nothing` when the document is empty or
its first item is not a language declaration.
"""
function check_lang(ast)
    items = ast.val
    # An empty document has no first item; indexing it would throw.
    isempty(items) && return nothing
    # Only the first item can declare the language, so the matcher is run on
    # that item alone instead of on the whole tree.
    lang = match_lang(first(items))
    return lang === false ? nothing : lang
end
"""
    remove_num_from_pattern(ptn)

Turn a hyphenation pattern string into regular-expression source text: a
leading `.` becomes the `^` anchor, a trailing `.` becomes the `\$` anchor, and
every digit is dropped. The rewrites are applied in that order.
"""
function remove_num_from_pattern(ptn)
    rewrites = (r"^\." => "^", r"\.$" => "\$", r"\d" => "")
    return foldl((text, rule) -> replace(text, rule), rewrites; init=ptn)
end
# Raise the inter-letter levels in `levels` according to one occurrence of the
# pattern `liang` whose first letter sits at character index `start`.
function _apply_pattern_levels!(levels, liang, start)
    # `pos` is the index of the character that precedes the slot being scored;
    # 0 means "before the first character", which is never a break point.
    pos = start - 1
    for sym in liang
        if 'a' <= sym <= 'z'
            pos += 1
        elseif sym == '.'
            pos = start - 1
        elseif pos != 0
            # Anything else is expected to be a digit; the highest level seen
            # for a slot wins.
            levels[pos] = max(levels[pos], parse(Int, sym))
        end
    end
    return levels
end

"""
    hyphenate_aux(chars, patterns)

Return `chars` with `~` inserted at every position where a break is allowed.

# Arguments
- `chars`: the word to hyphenate.
- `patterns`: collection of `(regex_source, pattern)` pairs, where `pattern` is
  the original pattern (lowercase letters, digits between letters and `.` as a
  word-boundary mark) and `regex_source` is the text used to locate it.

# Returns
A `String` made of the characters of `chars`, with `~` after each character
whose accumulated level is odd. No `~` is ever placed after the last character.
"""
function hyphenate_aux(chars, patterns)
    letters = collect(chars)
    levels = zeros(Int, length(letters))
    # Patterns are lowercase, so search a lowercased copy; the original
    # spelling is what gets written to the output.
    word = lowercase(chars)
    for ptn in patterns
        regex = Regex(ptn[1])  # compiled once per pattern
        # overlap=true: every occurrence must contribute, including ones that
        # overlap a previous occurrence of the same pattern.
        for m in eachmatch(regex, word; overlap=true)
            # `m.offset` is a byte index; convert it to a character index so
            # it lines up with `levels`.
            start = length(word, 1, m.offset - 1) + 1
            _apply_pattern_levels!(levels, ptn[2], start)
        end
    end
    out = IOBuffer()
    last_idx = length(letters)
    for (idx, ch) in enumerate(letters)
        print(out, ch)
        # Odd level => break allowed. A break after the final character is
        # meaningless and would leave an empty trailing fragment.
        if idx < last_idx && isodd(levels[idx])
            print(out, '~')
        end
    end
    return String(take!(out))
end
"""
    match_char(ast_item, patterns)

Hyphenate one AST item.

- A `CHAR` whose text contains ASCII letters is run through `hyphenate_aux`
  and returned as a vector: the word fragments as `CHAR` nodes, separated by
  `{@disc|||-}` sequences (a `SEQ` of `ID("disc")`, two empty `ELE`s and an
  `ELE` holding `CHAR("-")`).
- A `SEQ` is returned as a new `SEQ` with `match_char` applied to each child.
- Any other item is returned unchanged.

`patterns` is passed through to `hyphenate_aux`.
"""
function match_char(ast_item, patterns)
    @match ast_item begin
        # Guard on an actual regex hit: `occursin` yields a Bool, so CHARs
        # without letters fall through to the catch-all arm untouched.
        c.CHAR(chars), if occursin(r"[a-zA-Z]+", chars) end => begin
            # Drop empty fragments so a stray separator at a word edge never
            # produces an empty CHAR node.
            pieces = split(hyphenate_aux(chars, patterns), "~"; keepempty=false)
            nodes = Any[]
            for (k, piece) in enumerate(pieces)
                if k > 1
                    # Discretionary break between two consecutive fragments.
                    push!(nodes, c.SEQ([c.ELE([c.ID("disc")]),
                                        c.ELE([]),
                                        c.ELE([]),
                                        c.ELE([c.CHAR("-")])]))
                end
                push!(nodes, c.CHAR(piece))
            end
            return nodes
        end
        c.SEQ(v) => c.SEQ(map(x -> match_char(x, patterns), v))
        _ => ast_item
    end
end
"""
    hyphenate(ast)

Insert discretionary hyphens into the alphabetic words of `ast`.

The language is taken from `check_lang(ast)`. When no language is declared, or
no rule file exists for it under `hyphenRules/`, `ast` is returned unchanged
(with a warning in the latter case). Otherwise the rule file is loaded, every
top-level item is passed through `match_char`, the results are flattened, and
each alphabetic `CHAR` is split into one `CHAR` per character.

# Returns
`ast` itself, or a new `PROG` wrapping the rewritten item list.
"""
function hyphenate(ast)
    lang = check_lang(ast)
    lang === nothing && return ast

    # Resolve relative to this source file, not to the working directory or
    # whatever file happens to be loading at call time.
    rules_path = joinpath(@__DIR__, "hyphenRules", string(lang, ".jl"))
    if !isfile(rules_path)
        @warn "No hyphenation rules found; text is left unhyphenated" lang rules_path
        return ast
    end
    include(rules_path)
    # `Hyphen` is created by the `include` above, i.e. after this method
    # started running; read it in the latest world so the new binding is
    # visible.
    raw_patterns = Base.invokelatest(() -> Hyphen.patterns)
    patterns = map(p -> (remove_num_from_pattern(p), p), raw_patterns)

    @debug "AST before hyphenation" ast

    # `match_char` yields either a single node or a vector of nodes; splice
    # vectors into the top-level list.
    flattened = Any[]
    for item in map(x -> match_char(x, patterns), ast.val)
        if item isa AbstractVector
            append!(flattened, item)
        else
            push!(flattened, item)
        end
    end

    # Break every alphabetic CHAR into one CHAR per character.
    exploded = Any[]
    for item in flattened
        @match item begin
            c.CHAR(val=r"[a-zA-Z]+") => begin
                append!(exploded, map(q -> c.CHAR(q), split(item.val, "")))
            end
            _ => push!(exploded, item)
        end
    end

    @debug "AST items after hyphenation" exploded
    return c.PROG(exploded)
end
end

View file

@ -3,7 +3,9 @@ using ParserCombinator
using Match using Match
include("passes.jl") include("passes.jl")
include("hyphenating.jl")
using .Passes using .Passes
using .Hyphenating
#= #=
grammar rules of uahgi grammar rules of uahgi
@ -16,7 +18,10 @@ space = p"[ \t]" > Passes.Classes.SPACE
id_name = p"[_a-zA-Z][_0-9a-zA-Z]*" > Passes.Classes.ID id_name = p"[_a-zA-Z][_0-9a-zA-Z]*" > Passes.Classes.ID
id = E"@" + id_name id = E"@" + id_name
char = p"[^ \n\r\t\\]" > Passes.Classes.CHAR #[1:2,:?] empty_char = P"" # empty char
#make alphabet series a group for hyphenating
char = p"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])" > Passes.Classes.CHAR #[1:2,:?]
# chars should be preceded by "\" are \, {, }, |, @, % # chars should be preceded by "\" are \, {, }, |, @, %
esc_char = p"[\{\|\}\@\%]" > Passes.Classes.ESC_CHAR esc_char = p"[\{\|\}\@\%]" > Passes.Classes.ESC_CHAR
@ -26,7 +31,7 @@ seq = (foo x1 x2 " ")
=> {@foo|x1|x2| } => {@foo|x1|x2| }
=# =#
char_and_combined = char | esc_combined char_and_combined = char | esc_combined
seq_item = id | Repeat(char_and_combined) |> Passes.Classes.ELE seq_item = id | Repeat(char_and_combined) | empty_char |> Passes.Classes.ELE
seq_item_rest = E"|" + seq_item seq_item_rest = E"|" + seq_item
seq_inner = seq_item + (seq_item_rest)[0:end] |> Passes.Classes.SEQ seq_inner = seq_item + (seq_item_rest)[0:end] |> Passes.Classes.SEQ
seq = E"{" + seq_inner + E"}" seq = E"{" + seq_inner + E"}"
@ -42,6 +47,8 @@ function parse(input)
#print(parse_one(, Pattern(r".b."))) #print(parse_one(, Pattern(r".b.")))
ast = Hyphenating.hyphenate(ast)
passes = Passes.processed_passes passes = Passes.processed_passes
@ -112,6 +119,7 @@ function ast_to_string(ast)
return "(" * prog_inner * ")" return "(" * prog_inner * ")"
end end
Passes.Classes.ID(v) => "[ID: " * v * "]" Passes.Classes.ID(v) => "[ID: " * v * "]"
Passes.Classes.ELE([]) => return "[ELE : ()]"
Passes.Classes.ELE(v) => begin Passes.Classes.ELE(v) => begin
prog_inner = reduce( (x, y) -> x*" "*y, map(i -> ast_to_string(i), v)) prog_inner = reduce( (x, y) -> x*" "*y, map(i -> ast_to_string(i), v))
return "[ELE : (" * prog_inner * ")]" return "[ELE : (" * prog_inner * ")]"

View file

@ -8,9 +8,9 @@ using TeX-like metrics, in px
- weight - weight
- depth - depth
""" """
struct FontMetrics struct CharMetrics
height::Float64 height::Float64
weight::Float64 width::Float64
depth::Float64 depth::Float64
end end
@ -110,6 +110,8 @@ function check_char_size(char, font_path, font_size, font_index=0)
println("width ", width, "px") println("width ", width, "px")
println("height ", height, "px") println("height ", height, "px")
println("depth ", depth, "px") println("depth ", depth, "px")
return CharMetrics(height, width, depth)
end end
""" """