add hyphenation algorithm

parent 52d4b1b3cb
commit c1b4a45d6a

7 changed files with 231 additions and 15 deletions

docs/grammar.md (new file, 15 lines)
@@ -0,0 +1,15 @@
Grammar:

Variable input:

`@varname`

If a variable is immediately followed by English text, add `%%`, as in `@varname%%english blah`.

Comments: `%comment%`; a comment may contain line breaks.

Commands:

`{@foo|bar}`

Specifying the hyphenation language:

`{@lang|en}` and the like; this only takes effect when placed on the first line.
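Taken together, these rules describe documents like the test inputs changed later in this commit. A minimal sketch of a .ug file, assuming `foo` is a variable the processor already knows about (hypothetical content, shown only to tie the rules together):

    {@lang|en}
    {@foo|bar}
    Some English text@foo%%more text %a comment
    that continues on a second line%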
@@ -4,4 +4,16 @@
{@foo}
{@foo|@bar}

貓咪的眼睛,

狐狸的耳朵。

我是

貓,還沒有名字。@foo%我是註釋

%

{@foo|@bar|12\|}
@@ -1,11 +1,4 @@
貓咪的眼睛,
{@lang|en}
therapy of communication and pronounciation123%comment

anotherline% processing
狐狸的耳朵。

我是

貓,還沒有名字。@foo%我是註釋

%
{@foo|@bar|12\|}
src/hyphenRules/en.jl (new file, 37 lines)
File diff suppressed because one or more lines are too long
src/hyphenating.jl (new file, 149 lines)

@@ -0,0 +1,149 @@
"""
|
||||||
|
using Knuth-liang's pattern-matching hyphenating algorithm.
|
||||||
|
"""
|
||||||
|
module Hyphenating
|
||||||
|
using Match
|
||||||
|
c = Main.uahgi.Parsing.Passes.Classes
|
||||||
|
|
||||||
|
function match_lang(item)
|
||||||
|
@match item begin
|
||||||
|
c.SEQ([c.ELE([c.ID("lang")]), c.ELE([c.CHAR(v1)])]) => begin
|
||||||
|
return v1
|
||||||
|
end
|
||||||
|
_ => false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
check the language indicated by .ug file
|
||||||
|
"""
|
||||||
|
function check_lang(ast)
|
||||||
|
ast_tree = ast.val
|
||||||
|
result = map(x -> match_lang(x), ast_tree)
|
||||||
|
lang = result[1]
|
||||||
|
|
||||||
|
if lang != false
|
||||||
|
return lang
|
||||||
|
else
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
# Turn a hyphenation pattern into a plain regex: a leading "." anchors at the
# start of the word, a trailing "." at the end, and the level digits are
# stripped out, e.g. a (hypothetical) pattern ".ab3c." becomes "^abc$".
function remove_num_from_pattern(ptn)
    ptn1 = replace(ptn, r"^\." => "^")
    ptn2 = replace(ptn1, r"\.$" => "\$")
    ptn3 = replace(ptn2, r"\d" => "")
    return ptn3
end
# Apply the Knuth-Liang rule: for every pattern that matches the word, record
# the highest level digit seen at each letter position, then mark positions
# whose final level is odd with "~" (an allowed break point).
function hyphenate_aux(chars, patterns)
    level_of_chars = fill(0, length(chars))
    println(level_of_chars)

    # keep only the patterns whose letter part occurs in the word,
    # together with every offset at which they occur
    y = filter(x -> (match(Regex(x[1]), chars) !== nothing), patterns)
    z = map(x -> (x[1], x[2],
                  map(x -> x.offset, collect(eachmatch(Regex(x[1]), chars)))), y)
    for ptn in z
        for offset in ptn[3]
            counter = 0
            for ptn_char in ptn[2]
                if match(r"[a-z]", string(ptn_char)) !== nothing
                    counter += 1
                elseif match(r"[.]", string(ptn_char)) !== nothing
                    counter = 0
                else
                    # a digit 1-5: keep the maximum level seen at this position
                    if offset + counter - 1 != 0
                        orig = level_of_chars[offset + counter - 1]
                        new = parse(Int, ptn_char)
                        if new > orig
                            level_of_chars[offset + counter - 1] = new
                        end
                    end
                    counter += 0
                end
            end
        end
    end
    new_chars = ""
    for (idx, char) in enumerate(chars)
        new_chars *= char
        level_after_char = level_of_chars[idx]
        if (level_after_char > 0) && (level_after_char % 2 == 1)
            new_chars *= "~" # odd level: allowed hyphenation point
        end
    end
    return new_chars
end
# Hyphenate a single AST item: alphabetic CHAR runs are split at the "~"
# markers produced by hyphenate_aux, with a discretionary-hyphen ("disc")
# sequence inserted between the pieces; other items are left untouched.
function match_char(ast_item, patterns)
    @match ast_item begin
        c.CHAR(chars), if match(r"[a-zA-Z]+", chars) !== nothing end =>
            begin
                raw_result = hyphenate_aux(chars, patterns)
                splitted = split(raw_result, "~")
                final = []
                for i in splitted
                    push!(final, c.CHAR(i))
                    push!(final, c.SEQ([c.ELE([c.ID("disc")]),
                                        c.ELE([]),
                                        c.ELE([]),
                                        c.ELE([c.CHAR("-")])]))
                end

                # drop the disc sequence inserted after the last piece
                final = final[1:end-1]
                return final
                #c.CHAR(hyphenate_aux(chars, patterns))
            end
        c.SEQ(v) => c.SEQ(map(x -> match_char(x, patterns), v))
        _ => ast_item
    end
end
# Entry point: if the document declares a language, load its pattern file
# (which defines Hyphen.patterns), hyphenate every alphabetic run, and flatten
# the result back into a PROG; otherwise return the AST unchanged.
function hyphenate(ast)
    lang = check_lang(ast)
    if lang !== nothing
        include("hyphenRules/$lang.jl")
        patterns = Hyphen.patterns
        pattern_with_orig = map(x -> (remove_num_from_pattern(x), x),
                                patterns)

        println("AST=====", ast)
        new_ast_val = map(x -> match_char(x, pattern_with_orig), ast.val)

        # splice the lists returned by match_char back into a flat vector
        new_ast_val2 = []
        for i in new_ast_val
            @match i begin
                [x] => push!(new_ast_val2, x)
                [x, y] => begin new_ast_val2 = vcat(new_ast_val2, i) end
                [x, y..., z] => begin new_ast_val2 = vcat(new_ast_val2, i) end
                _ => push!(new_ast_val2, i)
            end
        end

        # break the remaining alphabetic CHAR runs back into single characters
        new_ast_val3 = []
        for i in new_ast_val2
            @match i begin
                c.CHAR(val=r"[a-zA-Z]+") => begin
                    new_ast_val3 = vcat(new_ast_val3,
                                        map(q -> c.CHAR(q), split(i.val, "")))
                end
                _ => push!(new_ast_val3, i)
            end
        end

        println("New AST Val=====", new_ast_val3)

        return c.PROG(new_ast_val3)
    else
        return ast
    end
end

end
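The heart of hyphenate_aux is the Knuth-Liang odd-level rule: every matching pattern contributes its digits to the gaps between letters, the highest digit wins, and a gap whose final level is odd becomes a break point. A small self-contained sketch of just that last step, using a hypothetical helper and made-up levels (the real levels come from the en.jl pattern table loaded by hyphenate):

# Standalone illustration only: `breaks_from_levels` is not part of this
# commit, and the level vector below is invented for the example.
function breaks_from_levels(word::AbstractString, levels::Vector{Int})
    out = IOBuffer()
    for (i, ch) in enumerate(word)
        print(out, ch)
        # mirror hyphenate_aux: append the break marker after odd levels
        if i < length(word) && isodd(levels[i])
            print(out, '~')
        end
    end
    return String(take!(out))
end

breaks_from_levels("hyphenation", [0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0])
# returns "hy~phen~ation"; match_char then splits on "~" and rejoins the
# pieces with discretionary-hyphen ("disc") sequences.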
@@ -3,7 +3,9 @@ using ParserCombinator
using Match

include("passes.jl")
include("hyphenating.jl")
using .Passes
using .Hyphenating

#=
grammar rules of uahgi
@@ -16,7 +18,10 @@ space = p"[ \t]" > Passes.Classes.SPACE
id_name = p"[_a-zA-Z][_0-9a-zA-Z]*" > Passes.Classes.ID
id = E"@" + id_name

char = p"[^ \n\r\t\\]" > Passes.Classes.CHAR #[1:2,:?]
empty_char = P"" # empty char

# make alphabet runs a single group, for hyphenating
char = p"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])" > Passes.Classes.CHAR #[1:2,:?]

# chars that must be preceded by "\" are \, {, }, |, @, %
esc_char = p"[\{\|\}\@\%]" > Passes.Classes.ESC_CHAR
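The widened `char` rule is what lets the hyphenator see whole words: a run of ASCII letters now becomes a single CHAR token, while any other character (CJK text included) still tokenizes on its own. A quick illustration of the regex behaviour (standalone sketch, not part of the grammar file):

char_re = r"([a-zA-Z]+|[^ a-zA-Z\n\r\t\\])"
match(char_re, "cat咪").match   # "cat": the whole letter run becomes one CHAR
match(char_re, "咪cat").match   # "咪": non-ASCII characters still match one at a time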
@@ -26,7 +31,7 @@ seq = (foo x1 x2 " ")
=> {@foo|x1|x2| }
=#
char_and_combined = char | esc_combined
seq_item = id | Repeat(char_and_combined) |> Passes.Classes.ELE
seq_item = id | Repeat(char_and_combined) | empty_char |> Passes.Classes.ELE
seq_item_rest = E"|" + seq_item
seq_inner = seq_item + (seq_item_rest)[0:end] |> Passes.Classes.SEQ
seq = E"{" + seq_inner + E"}"
@@ -42,6 +47,8 @@ function parse(input)

#print(parse_one(, Pattern(r".b.")))

ast = Hyphenating.hyphenate(ast)

passes = Passes.processed_passes
@@ -112,6 +119,7 @@ function ast_to_string(ast)
return "(" * prog_inner * ")"
end
Passes.Classes.ID(v) => "[ID: " * v * "]"
Passes.Classes.ELE([]) => return "[ELE : ()]"
Passes.Classes.ELE(v) => begin
prog_inner = reduce( (x, y) -> x*" "*y, map(i -> ast_to_string(i), v))
return "[ELE : (" * prog_inner * ")]"
@@ -8,9 +8,9 @@ using TeX-like metrics, in px
- weight
- depth
"""
struct FontMetrics
struct CharMetrics
height::Float64
weight::Float64
width::Float64
depth::Float64
end

@@ -110,6 +110,8 @@ function check_char_size(char, font_path, font_size, font_index=0)
println("width ", width, "px")
println("height ", height, "px")
println("depth ", depth, "px")

return CharMetrics(height, width, depth)
end

"""