add hyphenation algorithm

parent 52d4b1b3cb
commit c1b4a45d6a

7 changed files with 231 additions and 15 deletions

docs/grammar.md (new file, 15 lines)
@@ -0,0 +1,15 @@
Grammar:

Variable input:

`@varname`

If a variable is immediately followed by English text, add `%%`, as in `@varname%%english blah`.

Comments: `%comment%`; a comment may contain line breaks.

Commands:

`{@foo|bar}`

Specifying the hyphenation language:

`{@lang|en}` and the like; this only takes effect when placed on the first line.
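Taken together, these rules describe documents like the test inputs changed later in this commit. A minimal sketch of a .ug file, assuming `foo` is a variable the processor already knows about (hypothetical content, shown only to tie the rules together):

    {@lang|en}
    {@foo|bar}
    Some English text@foo%%more text %a comment
    that continues on a second line%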
@@ -4,4 +4,16 @@
{@foo}
{@foo|@bar}

貓咪的眼睛,

狐狸的耳朵。

我是

貓,還沒有名字。@foo%我是註釋

%

{@foo|@bar|12\|}
@@ -1,11 +1,4 @@
貓咪的眼睛,
{@lang|en}
therapy of communication and pronounciation123%comment

anotherline% processing
狐狸的耳朵。

我是

貓,還沒有名字。@foo%我是註釋

%
{@foo|@bar|12\|}
src/hyphenRules/en.jl (new file, 37 lines)
File diff suppressed because one or more lines are too long
src/hyphenating.jl (new file, 149 lines)

@@ -0,0 +1,149 @@
"""
|
||||||
|
using Knuth-liang's pattern-matching hyphenating algorithm.
|
||||||
|
"""
|
||||||
|
module Hyphenating
|
||||||
|
using Match
|
||||||
|
c = Main.uahgi.Parsing.Passes.Classes
|
||||||
|
|
||||||
|
function match_lang(item)
|
||||||
|
@match item begin
|
||||||
|
c.SEQ([c.ELE([c.ID("lang")]), c.ELE([c.CHAR(v1)])]) => begin
|
||||||
|
return v1
|
||||||
|
end
|
||||||
|
_ => false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
check the language indicated by .ug file
|
||||||
|
"""
|
||||||
|
function check_lang(ast)
|
||||||
|
ast_tree = ast.val
|
||||||
|
result = map(x -> match_lang(x), ast_tree)
|
||||||
|
lang = result[1]
|
||||||
|
|
||||||
|
if lang != false
|
||||||
|
return lang
|
||||||
|
else
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
# Turn a hyphenation pattern into a plain regex: a leading "." anchors at the
# start of the word, a trailing "." at the end, and the level digits are
# stripped out, e.g. a (hypothetical) pattern ".ab3c." becomes "^abc$".
function remove_num_from_pattern(ptn)
    ptn1 = replace(ptn, r"^\." => "^")
    ptn2 = replace(ptn1, r"\.$" => "\$")
    ptn3 = replace(ptn2, r"\d" => "")
    return ptn3
end
# Apply the Knuth-Liang rule: for every pattern that matches the word, record
# the highest level digit seen at each letter position, then mark positions
# whose final level is odd with "~" (an allowed break point).
function hyphenate_aux(chars, patterns)
    level_of_chars = fill(0, length(chars))
    println(level_of_chars)

    # keep only the patterns whose letter part occurs in the word,
    # together with every offset at which they occur
    y = filter(x -> (match(Regex(x[1]), chars) !== nothing), patterns)
    z = map(x -> (x[1], x[2],
                  map(x -> x.offset, collect(eachmatch(Regex(x[1]), chars)))), y)
    for ptn in z
        for offset in ptn[3]
            counter = 0
            for ptn_char in ptn[2]
                if match(r"[a-z]", string(ptn_char)) !== nothing
                    counter += 1
                elseif match(r"[.]", string(ptn_char)) !== nothing
                    counter = 0
                else
                    # a digit 1-5: keep the maximum level seen at this position
                    if offset + counter - 1 != 0
                        orig = level_of_chars[offset + counter - 1]
                        new = parse(Int, ptn_char)
                        if new > orig
                            level_of_chars[offset + counter - 1] = new
                        end
                    end
                    counter += 0
                end
            end
        end
    end
    new_chars = ""
    for (idx, char) in enumerate(chars)
        new_chars *= char
        level_after_char = level_of_chars[idx]
        if (level_after_char > 0) && (level_after_char % 2 == 1)
            new_chars *= "~" # odd level: allowed hyphenation point
        end
    end
    return new_chars
end
# Hyphenate a single AST item: alphabetic CHAR runs are split at the "~"
# markers produced by hyphenate_aux, with a discretionary-hyphen ("disc")
# sequence inserted between the pieces; other items are left untouched.
function match_char(ast_item, patterns)
    @match ast_item begin
        c.CHAR(chars), if match(r"[a-zA-Z]+", chars) !== nothing end =>
            begin
                raw_result = hyphenate_aux(chars, patterns)
                splitted = split(raw_result, "~")
                final = []
                for i in splitted
                    push!(final, c.CHAR(i))
                    push!(final, c.SEQ([c.ELE([c.ID("disc")]),
                                        c.ELE([]),
                                        c.ELE([]),
                                        c.ELE([c.CHAR("-")])]))
                end

                # drop the disc sequence inserted after the last piece
                final = final[1:end-1]
                return final
                #c.CHAR(hyphenate_aux(chars, patterns))
            end
        c.SEQ(v) => c.SEQ(map(x -> match_char(x, patterns), v))
        _ => ast_item
    end
end
# Entry point: if the document declares a language, load its pattern file
# (which defines Hyphen.patterns), hyphenate every alphabetic run, and flatten
# the result back into a PROG; otherwise return the AST unchanged.
function hyphenate(ast)
    lang = check_lang(ast)
    if lang !== nothing
        include("hyphenRules/$lang.jl")
        patterns = Hyphen.patterns
        pattern_with_orig = map(x -> (remove_num_from_pattern(x), x),
                                patterns)

        println("AST=====", ast)
        new_ast_val = map(x -> match_char(x, pattern_with_orig), ast.val)

        # splice the lists returned by match_char back into a flat vector
        new_ast_val2 = []
        for i in new_ast_val
            @match i begin
                [x] => push!(new_ast_val2, x)
                [x, y] => begin new_ast_val2 = vcat(new_ast_val2, i) end
                [x, y..., z] => begin new_ast_val2 = vcat(new_ast_val2, i) end
                _ => push!(new_ast_val2, i)
            end
        end

        # break the remaining alphabetic CHAR runs back into single characters
        new_ast_val3 = []
        for i in new_ast_val2
            @match i begin
                c.CHAR(val=r"[a-zA-Z]+") => begin
                    new_ast_val3 = vcat(new_ast_val3,
                                        map(q -> c.CHAR(q), split(i.val, "")))
                end
                _ => push!(new_ast_val3, i)
            end
        end

        println("New AST Val=====", new_ast_val3)

        return c.PROG(new_ast_val3)
    else
        return ast
    end
end

end
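The heart of hyphenate_aux is the Knuth-Liang odd-level rule: every matching pattern contributes its digits to the gaps between letters, the highest digit wins, and a gap whose final level is odd becomes a break point. A small self-contained sketch of just that last step, using a hypothetical helper and made-up levels (the real levels come from the en.jl pattern table loaded by hyphenate):

# Standalone illustration only: `breaks_from_levels` is not part of this
# commit, and the level vector below is invented for the example.
function breaks_from_levels(word::AbstractString, levels::Vector{Int})
    out = IOBuffer()
    for (i, ch) in enumerate(word)
        print(out, ch)
        # mirror hyphenate_aux: append the break marker after odd levels
        if i < length(word) && isodd(levels[i])
            print(out, '~')
        end
    end
    return String(take!(out))
end

breaks_from_levels("hyphenation", [0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0])
# returns "hy~phen~ation"; match_char then splits on "~" and rejoins the
# pieces with discretionary-hyphen ("disc") sequences.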
@@ -3,7 +3,9 @@ using ParserCombinator
using Match

include("passes.jl")
include("hyphenating.jl")
using .Passes
using .Hyphenating

#=
grammar rules of uahgi
@@ -16,7 +18,10 @@ space = p"[ \t]" > Passes.Classes.SPACE
id_name = p"[_a-zA-Z][_0-9a-zA-Z]*" > Passes.Classes.ID
id = E"@" + id_name

char = p"[^ \n\r\t\\]" > Passes.Classes.CHAR #[1:2,:?]
empty_char = P"" # empty char

# make alphabet runs a single group, for hyphenating
char = p"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])" > Passes.Classes.CHAR #[1:2,:?]

# chars that must be preceded by "\" are \, {, }, |, @, %
esc_char = p"[\{\|\}\@\%]" > Passes.Classes.ESC_CHAR
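The widened `char` rule is what lets the hyphenator see whole words: a run of ASCII letters now becomes a single CHAR token, while any other character (CJK text included) still tokenizes on its own. A quick illustration of the regex behaviour (standalone sketch, not part of the grammar file):

char_re = r"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])"
match(char_re, "cat咪").match   # "cat": the whole letter run becomes one CHAR
match(char_re, "咪cat").match   # "咪": non-ASCII characters still match one at a time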
@@ -26,7 +31,7 @@ seq = (foo x1 x2 " ")
=> {@foo|x1|x2| }
=#
char_and_combined = char | esc_combined
seq_item = id | Repeat(char_and_combined) |> Passes.Classes.ELE
seq_item = id | Repeat(char_and_combined) | empty_char |> Passes.Classes.ELE
seq_item_rest = E"|" + seq_item
seq_inner = seq_item + (seq_item_rest)[0:end] |> Passes.Classes.SEQ
seq = E"{" + seq_inner + E"}"
@@ -42,6 +47,8 @@ function parse(input)

#print(parse_one(, Pattern(r".b.")))

ast = Hyphenating.hyphenate(ast)

passes = Passes.processed_passes
@@ -112,6 +119,7 @@ function ast_to_string(ast)
return "(" * prog_inner * ")"
end
Passes.Classes.ID(v) => "[ID: " * v * "]"
Passes.Classes.ELE([]) => return "[ELE : ()]"
Passes.Classes.ELE(v) => begin
prog_inner = reduce( (x, y) -> x*" "*y, map(i -> ast_to_string(i), v))
return "[ELE : (" * prog_inner * ")]"
@@ -8,9 +8,9 @@ using TeX-like metrics, in px
- weight
- depth
"""
struct FontMetrics
struct CharMetrics
height::Float64
weight::Float64
width::Float64
depth::Float64
end

@@ -110,6 +110,8 @@ function check_char_size(char, font_path, font_size, font_index=0)
println("width ", width, "px")
println("height ", height, "px")
println("depth ", depth, "px")

return CharMetrics(height, width, depth)
end

"""