add hyphenation algorithm
This commit is contained in:
parent
52d4b1b3cb
commit
c1b4a45d6a
7 changed files with 231 additions and 15 deletions
15
docs/grammar.md
Normal file
15
docs/grammar.md
Normal file
|
@ -0,0 +1,15 @@
|
|||
語法:
|
||||
變數輸入法:
|
||||
`@varname`
|
||||
|
||||
後面如果接英文字,要加`%%`,變成`@varname%%english blah`
|
||||
|
||||
註解:`%comment%`,註解內可以換行。
|
||||
|
||||
輸入指令:
|
||||
|
||||
`{@foo|bar}`
|
||||
|
||||
指定斷字語言:
|
||||
|
||||
`{@lang|en}`等,要輸入於第一行才有效。
|
|
@ -4,4 +4,16 @@
|
|||
|
||||
{@foo}
|
||||
|
||||
{@foo|@bar}
|
||||
{@foo|@bar}
|
||||
|
||||
貓咪的眼睛,
|
||||
|
||||
|
||||
狐狸的耳朵。
|
||||
|
||||
我是
|
||||
|
||||
貓,還沒有名字。@foo%我是註釋
|
||||
|
||||
%
|
||||
{@foo|@bar|12\|}
|
|
@ -1,11 +1,4 @@
|
|||
貓咪的眼睛,
|
||||
{@lang|en}
|
||||
therapy of communication and pronounciation123%comment
|
||||
|
||||
|
||||
狐狸的耳朵。
|
||||
|
||||
我是
|
||||
|
||||
貓,還沒有名字。@foo%我是註釋
|
||||
|
||||
%
|
||||
{@foo|@bar|12\|}
|
||||
anotherline% processing
|
37
src/hyphenRules/en.jl
Normal file
37
src/hyphenRules/en.jl
Normal file
File diff suppressed because one or more lines are too long
149
src/hyphenating.jl
Normal file
149
src/hyphenating.jl
Normal file
|
@ -0,0 +1,149 @@
|
|||
"""
|
||||
Hyphenation using the Knuth-Liang pattern-matching algorithm.
|
||||
"""
|
||||
module Hyphenating
|
||||
using Match
|
||||
# Shorthand for the module holding the AST node types used below (SEQ, ELE, ID,
# CHAR, PROG). Declared `const` so the functions that read it are not slowed
# down by an untyped global.
const c = Main.uahgi.Parsing.Passes.Classes
|
||||
|
||||
"""
    match_lang(item)

Return the language code carried by `item` when it is a `SEQ` made of exactly
two `ELE`s: the first holding the single identifier `lang`, the second holding
a single `CHAR`. Return `false` for every other node.
"""
function match_lang(item)
    return @match item begin
        c.SEQ([c.ELE([c.ID("lang")]), c.ELE([c.CHAR(code)])]) => code
        _ => false
    end
end
|
||||
|
||||
"""
    check_lang(ast)

Return the hyphenation language declared by the `.ug` document, or `nothing`
when none is declared.

Only the first node of `ast.val` is inspected, so the language command has to
be the first thing in the document. An empty `ast.val` yields `nothing`.
"""
function check_lang(ast)
    ast_tree = ast.val

    # An empty document declares no language; indexing it would throw.
    isempty(ast_tree) && return nothing

    # `match_lang` answers `false` when the node is not a language command.
    lang = match_lang(first(ast_tree))
    return lang === false ? nothing : lang
end
|
||||
|
||||
"""
    remove_num_from_pattern(ptn)

Turn a hyphenation pattern into the source text of a regular expression.

A leading dot becomes a start-of-string anchor, a trailing dot becomes an
end-of-string anchor, and every digit is removed. The rewrites are applied one
after another, in that order.
"""
function remove_num_from_pattern(ptn)
    rewrites = (r"^\." => "^", r"\.$" => "\$", r"\d" => "")
    return foldl((text, rule) -> replace(text, rule), rewrites; init=ptn)
end
|
||||
|
||||
"""
    hyphenate_aux(chars, patterns)

Return `chars` with a `~` inserted at every permitted hyphenation point.

# Arguments
- `chars`: the word to hyphenate.
- `patterns`: a collection of pairs whose first item is the regular-expression
  source of a pattern (digits removed, dots turned into anchors) and whose
  second item is the original pattern with its digits and dots.

Every occurrence of every pattern in `chars` is visited, including overlapping
occurrences. A digit in the original pattern is a level for the gap just before
the pattern character that follows it; each gap keeps the highest level seen.
A gap whose final level is odd becomes a hyphenation point. No point is ever
placed before the first or after the last character.
"""
function hyphenate_aux(chars, patterns)
    # level_of_chars[i] is the level of the gap right after the i-th character.
    level_of_chars = fill(0, length(chars))

    for ptn in patterns
        regex = Regex(ptn[1])  # compiled once per pattern
        for m in eachmatch(regex, chars; overlap=true)
            # `m.offset` is a byte index; convert it to a character position so
            # it can index `level_of_chars`.
            start = length(chars, 1, m.offset)
            counter = 0  # pattern characters consumed so far
            for ptn_char in ptn[2]
                if ptn_char == '.'
                    # Word-boundary marker: it stands for no character.
                    counter = 0
                elseif isdigit(ptn_char)
                    pos = start + counter - 1
                    # pos == 0 is the gap before the first character: ignored.
                    if pos != 0
                        level = parse(Int, ptn_char)
                        level_of_chars[pos] = max(level_of_chars[pos], level)
                    end
                else
                    counter += 1
                end
            end
        end
    end

    out = IOBuffer()
    last_idx = length(chars)
    for (idx, char) in enumerate(chars)
        print(out, char)
        # Odd level means a break is allowed; never break after the last char.
        if idx < last_idx && isodd(level_of_chars[idx])
            print(out, "~")
        end
    end
    return String(take!(out))
end
|
||||
|
||||
"""
    match_char(ast_item, patterns)

Hyphenate one AST node.

- A `CHAR` whose text contains ASCII letters is run through `hyphenate_aux`
  and returned as a vector: the `CHAR` pieces of the word, with a `disc`
  command sequence between each pair of neighbouring pieces
  (presumably a discretionary hyphen; confirm against the passes that consume it).
- A `SEQ` is rebuilt with `match_char` applied to each of its items.
- Any other node is returned unchanged.

`patterns` is passed straight through to `hyphenate_aux`.
"""
function match_char(ast_item, patterns)
    @match ast_item begin
        # The guard must test against the value `nothing`; comparing with the
        # type `Nothing` is always true and sends every CHAR through the scan.
        c.CHAR(chars), if occursin(r"[a-zA-Z]+", chars) end => begin
            pieces = split(hyphenate_aux(chars, patterns), "~")
            final = []
            for (idx, piece) in enumerate(pieces)
                if idx > 1
                    # Separator node placed between two pieces of the word.
                    push!(final, c.SEQ([c.ELE([c.ID("disc")]),
                                        c.ELE([]),
                                        c.ELE([]),
                                        c.ELE([c.CHAR("-")])]))
                end
                push!(final, c.CHAR(piece))
            end
            return final
        end
        c.SEQ(v) => c.SEQ(map(x -> match_char(x, patterns), v))
        _ => ast_item
    end
end
|
||||
|
||||
"""
    hyphenate(ast)

Insert hyphenation points into the words of `ast`.

When `check_lang` finds no language, `ast` is returned untouched. Otherwise the
rule file `hyphenRules/<lang>.jl` is included, its `Hyphen.patterns` are
prepared with `remove_num_from_pattern`, every top-level node goes through
`match_char`, the vectors that `match_char` returns for words are spliced into
the node list, and each `CHAR` whose text contains ASCII letters is split into
single-character `CHAR`s. The result is wrapped in a new `PROG`.
"""
function hyphenate(ast)
    lang = check_lang(ast)
    lang === nothing && return ast

    # NOTE(review): the rule file is included on every call and `lang` comes
    # from the document being processed; confirm that this is intended.
    include("hyphenRules/$lang.jl")
    patterns = Hyphen.patterns
    pattern_with_orig = map(x -> (remove_num_from_pattern(x), x), patterns)

    @debug "AST before hyphenation" ast
    new_ast_val = map(x -> match_char(x, pattern_with_orig), ast.val)

    # `match_char` returns a vector for a hyphenated word and a single node
    # otherwise; splice the vectors so the node list is flat again.
    new_ast_val2 = []
    for item in new_ast_val
        if item isa AbstractVector
            append!(new_ast_val2, item)
        else
            push!(new_ast_val2, item)
        end
    end

    # Break every alphabetic CHAR into one CHAR per character.
    new_ast_val3 = []
    for node in new_ast_val2
        @match node begin
            c.CHAR(val=r"[a-zA-Z]+") => begin
                append!(new_ast_val3, map(q -> c.CHAR(q), split(node.val, "")))
            end
            _ => push!(new_ast_val3, node)
        end
    end

    @debug "AST after hyphenation" new_ast_val3
    return c.PROG(new_ast_val3)
end
|
||||
|
||||
end
|
|
@ -3,7 +3,9 @@ using ParserCombinator
|
|||
using Match
|
||||
|
||||
include("passes.jl")
|
||||
include("hyphenating.jl")
|
||||
using .Passes
|
||||
using .Hyphenating
|
||||
|
||||
#=
|
||||
grammar rules of uahgi
|
||||
|
@ -16,7 +18,10 @@ space = p"[ \t]" > Passes.Classes.SPACE
|
|||
id_name = p"[_a-zA-Z][_0-9a-zA-Z]*" > Passes.Classes.ID
|
||||
id = E"@" + id_name
|
||||
|
||||
char = p"[^ \n\r\t\\]" > Passes.Classes.CHAR #[1:2,:?]
|
||||
empty_char = P"" # empty char
|
||||
|
||||
#make alphabet series a group for hyphenating
|
||||
char = p"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])" > Passes.Classes.CHAR #[1:2,:?]
|
||||
|
||||
# chars should be preceded by "\" are \, {, }, |, @, %
|
||||
esc_char = p"[\{\|\}\@\%]" > Passes.Classes.ESC_CHAR
|
||||
|
@ -26,7 +31,7 @@ seq = (foo x1 x2 " ")
|
|||
=> {@foo|x1|x2| }
|
||||
=#
|
||||
char_and_combined = char | esc_combined
|
||||
seq_item = id | Repeat(char_and_combined) |> Passes.Classes.ELE
|
||||
seq_item = id | Repeat(char_and_combined) | empty_char |> Passes.Classes.ELE
|
||||
seq_item_rest = E"|" + seq_item
|
||||
seq_inner = seq_item + (seq_item_rest)[0:end] |> Passes.Classes.SEQ
|
||||
seq = E"{" + seq_inner + E"}"
|
||||
|
@ -42,6 +47,8 @@ function parse(input)
|
|||
|
||||
|
||||
#print(parse_one(, Pattern(r".b.")))
|
||||
|
||||
ast = Hyphenating.hyphenate(ast)
|
||||
|
||||
passes = Passes.processed_passes
|
||||
|
||||
|
@ -112,6 +119,7 @@ function ast_to_string(ast)
|
|||
return "(" * prog_inner * ")"
|
||||
end
|
||||
Passes.Classes.ID(v) => "[ID: " * v * "]"
|
||||
Passes.Classes.ELE([]) => return "[ELE : ()]"
|
||||
Passes.Classes.ELE(v) => begin
|
||||
prog_inner = reduce( (x, y) -> x*" "*y, map(i -> ast_to_string(i), v))
|
||||
return "[ELE : (" * prog_inner * ")]"
|
||||
|
|
|
@ -8,9 +8,9 @@ using TeX-like metrics, in px
|
|||
- weight
|
||||
- depth
|
||||
"""
|
||||
struct FontMetrics
|
||||
struct CharMetrics
|
||||
height::Float64
|
||||
weight::Float64
|
||||
width::Float64
|
||||
depth::Float64
|
||||
end
|
||||
|
||||
|
@ -110,6 +110,8 @@ function check_char_size(char, font_path, font_size, font_index=0)
|
|||
println("width ", width, "px")
|
||||
println("height ", height, "px")
|
||||
println("depth ", depth, "px")
|
||||
|
||||
return CharMetrics(height, width, depth)
|
||||
end
|
||||
|
||||
"""
|
||||
|
|
Loading…
Reference in a new issue