add hyphenation algorithm
Some checks are pending
CI / Julia 1.6 - ubuntu-latest - x64 (push) Waiting to run
CI / Julia 1.7 - ubuntu-latest - x64 (push) Waiting to run
CI / Julia pre - ubuntu-latest - x64 (push) Waiting to run

This commit is contained in:
Tan, Kian-ting 2025-01-27 23:24:47 +08:00
parent 52d4b1b3cb
commit c1b4a45d6a
7 changed files with 231 additions and 15 deletions

15
docs/grammar.md Normal file
View file

@ -0,0 +1,15 @@
語法:
變數輸入法:
`@varname`
後面如果接英文字,要加`%%`,變成`@varname%%english blah`
註解:`%comment%`,註解內可以換行。
輸入指令:
`{@foo|bar}`
指定斷字語言:
`{@lang|en}`等,要輸入於第一行才有效。

View file

@ -4,4 +4,16 @@
{@foo}
{@foo|@bar}
{@foo|@bar}
貓咪的眼睛,
狐狸的耳朵。
我是
貓,還沒有名字。@foo%我是註釋
%
{@foo|@bar|12\|}

View file

@ -1,11 +1,4 @@
貓咪的眼睛,
{@lang|en}
therapy of communication and pronounciation123%comment
狐狸的耳朵。
我是
貓,還沒有名字。@foo%我是註釋
%
{@foo|@bar|12\|}
anotherline% processing

37
src/hyphenRules/en.jl Normal file

File diff suppressed because one or more lines are too long

149
src/hyphenating.jl Normal file
View file

@ -0,0 +1,149 @@
"""
Hyphenation of words using the Knuth-Liang pattern-matching algorithm.
"""
module Hyphenating
using Match
# Short alias for the module whose node constructors (`c.SEQ`, `c.ELE`, `c.ID`,
# `c.CHAR`, `c.PROG`) are used throughout this file. Declared `const` so the
# binding has a fixed type and lookups through it are not dynamically typed.
const c = Main.uahgi.Parsing.Passes.Classes
"""
    match_lang(item)

Return the language code carried by `item` when it is a two-element `SEQ` of
the form `{@lang|code}` (an `ID("lang")` element followed by a single `CHAR`
element); return `false` for any other node.
"""
function match_lang(item)
    return @match item begin
        c.SEQ([c.ELE([c.ID("lang")]), c.ELE([c.CHAR(code)])]) => code
        _ => false
    end
end
"""
    check_lang(ast)

Return the hyphenation language declared by the program `ast`, or `nothing`
when none is declared.

Only the first node of `ast.val` is inspected: a `{@lang|code}` directive is
honoured only when it is the very first item. An empty program yields
`nothing` instead of raising a `BoundsError`.
"""
function check_lang(ast)
    ast_tree = ast.val
    # Nothing to inspect in an empty program.
    isempty(ast_tree) && return nothing
    # Only the first node can carry the directive, so there is no need to run
    # the matcher over the whole tree.
    lang = match_lang(first(ast_tree))
    if lang != false
        return lang
    else
        return nothing
    end
end
"""
    remove_num_from_pattern(ptn)

Turn a numbered hyphenation pattern into the source text of a regex.

A leading `.` becomes the `^` anchor, a trailing `.` becomes the `\$` anchor,
and every digit is dropped, leaving only the letters to search for.
"""
function remove_num_from_pattern(ptn)
    # Applied strictly in this order: anchors first, then digit removal.
    rewrite_rules = (r"^\." => "^", r"\.$" => "\$", r"\d" => "")
    return foldl((text, rule) -> replace(text, rule), rewrite_rules; init=ptn)
end
"""
    hyphenate_aux(chars, patterns)

Return `chars` with a `~` inserted at every permitted hyphenation point.

# Arguments
- `chars`: the word to hyphenate. Offsets reported by the regex engine are used
  directly as character positions, so the word is assumed to consist of
  single-byte (ASCII) letters.
- `patterns`: a collection of `(regex_source, numbered_pattern)` pairs, where
  `regex_source` is the pattern with its digits removed and its dots turned
  into anchors, and `numbered_pattern` is the original pattern text.

# Details
For every occurrence of every pattern, each digit in the numbered pattern
proposes a level for the gap after the letter that precedes it. The highest
level proposed for a gap wins, and a gap with an odd level is a break point.
A digit placed before the first letter of the word is ignored, and no break is
ever emitted after the last letter.
"""
function hyphenate_aux(chars, patterns)
    # The pattern letters are lowercase, so search a lowercased copy while
    # emitting the caller's original characters.
    word = lowercase(chars)
    levels = fill(0, length(chars))
    for (regex_src, numbered) in patterns
        # overlap=true: a pattern may apply at positions that overlap an
        # earlier occurrence (e.g. "aa" occurs twice in "aaa").
        for occurrence in eachmatch(Regex(regex_src), word; overlap=true)
            letters_seen = 0
            for ptn_char in numbered
                if 'a' <= ptn_char <= 'z'
                    letters_seen += 1
                elseif ptn_char == '.'
                    # Word-boundary marker: occupies no letter position.
                    letters_seen = 0
                else
                    # A digit: level for the gap after the preceding letter.
                    slot = occurrence.offset + letters_seen - 1
                    # slot 0 would be the gap before the first letter.
                    if slot != 0
                        levels[slot] = max(levels[slot], parse(Int, ptn_char))
                    end
                end
            end
        end
    end
    @debug "hyphenation levels" chars levels
    last_idx = length(chars)
    out = IOBuffer()
    for (idx, char) in enumerate(chars)
        print(out, char)
        # Odd level means a break is allowed; never break after the last letter.
        if idx < last_idx && isodd(levels[idx])
            print(out, "~") # for hyphenation
        end
    end
    return String(take!(out))
end
"""
    match_char(ast_item, patterns)

Insert discretionary hyphens into a single AST node.

- A `CHAR` node whose text contains ASCII letters is hyphenated with
  `hyphenate_aux` and returned as a vector: the word pieces as `CHAR` nodes,
  separated by `{@disc|||-}` sequences.
- A `SEQ` node is rebuilt with `match_char` applied to each of its members.
- Any other node is returned unchanged.

`patterns` is the collection of `(regex_source, numbered_pattern)` pairs that
`hyphenate_aux` expects.
"""
function match_char(ast_item, patterns)
    @match ast_item begin
        # Compare against the value `nothing`; comparing against the type
        # `Nothing` is always true and let every CHAR through the guard.
        c.CHAR(chars), if match(r"[a-zA-Z]+", chars) !== nothing end =>
            begin
                pieces = split(hyphenate_aux(chars, patterns), "~")
                final = []
                for (idx, piece) in enumerate(pieces)
                    # A discretionary break goes between pieces only.
                    if idx > 1
                        push!(final, c.SEQ([c.ELE([c.ID("disc")]),
                                            c.ELE([]),
                                            c.ELE([]),
                                            c.ELE([c.CHAR("-")])]))
                    end
                    push!(final, c.CHAR(piece))
                end
                return final
            end
        c.SEQ(v) => c.SEQ(map(x -> match_char(x, patterns), v))
        _ => ast_item
    end
end
"""
    hyphenate(ast)

Return a `PROG` in which every alphabetic word of `ast` carries discretionary
hyphens, or `ast` itself when no hyphenation can be applied.

The language comes from `check_lang`. Its rules are loaded from
`hyphenRules/<lang>.jl` next to this source file; that file is expected to
define `Hyphen.patterns`. When the program declares no language, or no rule
file exists for the declared language, `ast` is returned unchanged (with a
warning in the latter case).

After hyphenation, every remaining alphabetic `CHAR` node is split into one
`CHAR` node per letter.
"""
function hyphenate(ast)
    lang = check_lang(ast)
    lang === nothing && return ast

    # Anchor the path to this file's directory; a bare relative path given to
    # `include` resolves against whichever file is currently being included.
    rules_path = joinpath(@__DIR__, "hyphenRules", "$lang.jl")
    if !isfile(rules_path)
        @warn "no hyphenation rules found; text left unhyphenated" lang rules_path
        return ast
    end
    include(rules_path)
    # `Hyphen` has only just been defined by the include above, so read it in
    # the latest world rather than the one this function started running in.
    patterns = Base.invokelatest(() -> Hyphen.patterns)
    pattern_with_orig = map(x -> (remove_num_from_pattern(x), x), patterns)
    @debug "AST before hyphenation" ast

    # match_char returns either a single node or a vector of nodes; splice the
    # vectors in so the program stays a flat list.
    flattened = []
    for node in map(x -> match_char(x, pattern_with_orig), ast.val)
        if node isa AbstractVector
            append!(flattened, node)
        else
            push!(flattened, node)
        end
    end

    # Break each alphabetic piece into one CHAR node per letter.
    new_ast_val3 = []
    for i in flattened
        @match i begin
            c.CHAR(val=r"[a-zA-Z]+") => begin
                append!(new_ast_val3, map(q -> c.CHAR(q), split(i.val, "")))
            end
            _ => push!(new_ast_val3, i)
        end
    end
    @debug "AST after hyphenation" new_ast_val3
    return c.PROG(new_ast_val3)
end
end

View file

@ -3,7 +3,9 @@ using ParserCombinator
using Match
include("passes.jl")
include("hyphenating.jl")
using .Passes
using .Hyphenating
#=
grammar rules of uahgi
@ -16,7 +18,10 @@ space = p"[ \t]" > Passes.Classes.SPACE
id_name = p"[_a-zA-Z][_0-9a-zA-Z]*" > Passes.Classes.ID
id = E"@" + id_name
char = p"[^ \n\r\t\\]" > Passes.Classes.CHAR #[1:2,:?]
empty_char = P"" # empty char
#make alphabet series a group for hyphenating
char = p"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])" > Passes.Classes.CHAR #[1:2,:?]
# chars should be preceded by "\" are \, {, }, |, @, %
esc_char = p"[\{\|\}\@\%]" > Passes.Classes.ESC_CHAR
@ -26,7 +31,7 @@ seq = (foo x1 x2 " ")
=> {@foo|x1|x2| }
=#
char_and_combined = char | esc_combined
seq_item = id | Repeat(char_and_combined) |> Passes.Classes.ELE
seq_item = id | Repeat(char_and_combined) | empty_char |> Passes.Classes.ELE
seq_item_rest = E"|" + seq_item
seq_inner = seq_item + (seq_item_rest)[0:end] |> Passes.Classes.SEQ
seq = E"{" + seq_inner + E"}"
@ -42,6 +47,8 @@ function parse(input)
#print(parse_one(, Pattern(r".b.")))
ast = Hyphenating.hyphenate(ast)
passes = Passes.processed_passes
@ -112,6 +119,7 @@ function ast_to_string(ast)
return "(" * prog_inner * ")"
end
Passes.Classes.ID(v) => "[ID: " * v * "]"
Passes.Classes.ELE([]) => return "[ELE : ()]"
Passes.Classes.ELE(v) => begin
prog_inner = reduce( (x, y) -> x*" "*y, map(i -> ast_to_string(i), v))
return "[ELE : (" * prog_inner * ")]"

View file

@ -8,9 +8,9 @@ using TeX-like metrics, in px
- weight
- depth
"""
struct FontMetrics
struct CharMetrics
height::Float64
weight::Float64
width::Float64
depth::Float64
end
@ -110,6 +110,8 @@ function check_char_size(char, font_path, font_size, font_index=0)
println("width ", width, "px")
println("height ", height, "px")
println("depth ", depth, "px")
return CharMetrics(height, width, depth)
end
"""