From 5fbb467bdf501dee03c78817242581d962547c4f Mon Sep 17 00:00:00 2001 From: Tan Kian-ting Date: Thu, 7 Sep 2023 00:01:15 +0800 Subject: [PATCH] add tokenize's function, add interface `Token` --- README.md | 1 + src/index.js | 96 +++++++++++++++++++++++++++++++++---- src/index.ts | 126 +++++++++++++++++++++++++++++++++++++++++++++---- tests/index.js | 21 +++++++++ tests/index.ts | 27 ++++++++++- 5 files changed, 250 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 73ea9ce..aaba90e 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,4 @@ another personal draught of a typesetting language and engine. - 20230904 建立 thenDo、matchRange的函數、refactor harfbuzzjs 以及libpdf 等測試界面 - 20230905-06: 建立 : `toSome`, initial of basic tokenizer (`tokenize`), `matchAny`, `notDo`, `orDo`, `zeroOrMoreDo`, `zeroOrOnceDo` + - 20230905-07:強化`tokenize`, 加強功能,加`Token`界面。 diff --git a/src/index.js b/src/index.js index 23320ac..efa8b68 100644 --- a/src/index.js +++ b/src/index.js @@ -1,6 +1,6 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.tokenize = exports.zeroOrOnceDo = exports.notDo = exports.zeroOrMoreDo = exports.orDo = exports.thenDo = exports.charToCodepoint = exports.matchRange = exports.matchAny = exports.match1Char = void 0; +exports.tokenize = exports.zeroOrOnceDo = exports.notDo = exports.zeroOrMoreDo = exports.orDo = exports.thenDo = exports.charToCodepoint = exports.matchRange = exports.matchAny = exports.match1Char = exports.TokenType = void 0; var fs = require('fs'); /** * wrap a x in a `Some(T)` @@ -10,6 +10,32 @@ var fs = require('fs'); function toSome(x) { return { _tag: "Some", value: x }; } +/** + * The types of Token + * NL, // newline + * + * SP, // half-width space and tab + * + * ID, // identifier + * + * STR, // string + * + * OP, // operator or something like it + * + * FLO, // float num + * + * INT, // Integer + */ +var TokenType; +(function (TokenType) { + TokenType[TokenType["NL"] = 0] = "NL"; + TokenType[TokenType["SP"] = 1] = "SP"; + TokenType[TokenType["ID"] = 2] = "ID"; + TokenType[TokenType["STR"] = 3] = "STR"; + TokenType[TokenType["OP"] = 4] = "OP"; + TokenType[TokenType["FLO"] = 5] = "FLO"; + TokenType[TokenType["INT"] = 6] = "INT"; +})(TokenType || (exports.TokenType = TokenType = {})); /** * @description * it returns a function which test if the first char of the `remained` part of @@ -211,20 +237,70 @@ exports.zeroOrOnceDo = zeroOrOnceDo; function tokenize(input) { var input_matchee_pair = toSome({ matched: "", remained: input }); - // integer = ([+]|[-])\d\d? + // integer = ([+]|[-])?\d\d* let integer = (x) => { let wrapped_x = toSome(x); let plusMinus = orDo(match1Char('+'), match1Char('-')); // ([+]|[-]) let d = matchRange('0', '9'); // \d - return thenDo(thenDo(thenDo(wrapped_x, zeroOrOnceDo(plusMinus)), d), zeroOrMoreDo(d)); + var result = thenDo(thenDo(thenDo(wrapped_x, zeroOrOnceDo(plusMinus)), d), zeroOrMoreDo(d)); + if (result._tag == "Some") { + result.value.matched_type = TokenType.INT; + } + return result; }; - console.log(input + ", result: "); - console.log(thenDo(input_matchee_pair, integer)); + let space = (x) => { + let wrapped_x = toSome(x); + let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t") + var result = thenDo(thenDo(wrapped_x, s_aux), zeroOrMoreDo(s_aux)); + if (result._tag == "Some") { + result.value.matched_type = TokenType.SP; + } + return result; + }; + let newline = (x) => { + let wrapped_x = toSome(x); + // nl = \r?\n + let result = thenDo(thenDo(wrapped_x, zeroOrOnceDo(match1Char('\r'))), match1Char('\n')); + if (result._tag == "Some") { + result.value.matched_type = TokenType.NL; + } + return result; + }; + let term = (token_list, x) => { + var ln = 1; + var col = 0; + var old_x = x; + let term_list = [newline, space, integer]; + let term_aux = term_list.reduce((x, y) => orDo(x, y)); + var new_x = thenDo(old_x, term_aux); + while (new_x._tag != "None") { + if (new_x.value.matched_type != TokenType.NL) { + col += new_x.value.matched.length; + token_list.push({ text: new_x.value.matched, + type: new_x.value.matched_type, + ln: ln, + col: col }); + } + else { + col = 0; + ln += 1; + token_list.push({ text: new_x.value.matched, + type: new_x.value.matched_type, + ln: ln, + col: col }); + } + old_x = toSome({ matched: "", + remained: new_x.value.remained }); + new_x = thenDo(old_x, term_aux); + } + if (old_x.value.remained.length) { + console.log(token_list); + throw new Error("the code can't be tokenized is near Ln. " + ln + ", Col." + col + + ", starting with " + old_x.value.remained.substring(0, 10)); + } + return token_list; + }; + console.log(term([], input_matchee_pair)); // TODO: id, string, space, basic operator, 3 marks: @, {, }. } exports.tokenize = tokenize; -tokenize("+123"); -tokenize("123"); -tokenize("-123"); -tokenize(" 123"); -tokenize("c123"); diff --git a/src/index.ts b/src/index.ts index 8cbd145..883d94b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -27,12 +27,54 @@ export type Maybe = Some | None; /** * @description * the pair of the string to be matched later and the string that have been matched + * @var matched : have been matched + * @var remained : will be tested whether it'll be matched. + * @var matched_type (optional): the type of the matched string */ export interface MatcheePair { - /** have been matched */ matched : string - /** will be tested whether it'll be matched. */ remained : string + matched_type?: TokenType +} + +/** + * The types of Token + * NL, // newline + * + * SP, // half-width space and tab + * + * ID, // identifier + * + * STR, // string + * + * OP, // operator or something like it + * + * FLO, // float num + * + * INT, // Integer + */ +export enum TokenType{ + NL, // newlinw + SP, // half-width space and tab + ID, // identifier + STR, // string + OP, // operator + FLO, // float num + INT, // integer +} + +/** + * tokenized token. + * @var text : the content text + * @var type (optional): the type of the token + * @var col : the column number + * @var ln : the line number + */ +export interface Token{ + text: string, + type?: TokenType, + col: number, + ln: number, } /** @@ -239,18 +281,82 @@ export function tokenize(input : string){ { let wrapped_x = toSome(x); let plusMinus = orDo(match1Char('+'), match1Char('-')); // ([+]|[-]) let d = matchRange('0','9'); // \d - return thenDo(thenDo(thenDo(wrapped_x, + var result = thenDo(thenDo(thenDo(wrapped_x, zeroOrOnceDo(plusMinus)),d), zeroOrMoreDo(d)); + + if (result._tag == "Some"){ + result.value.matched_type = TokenType.INT; + } + return result; } - console.log(input+", result: "); - console.log(thenDo(input_matchee_pair, integer)); + let space = (x : MatcheePair) =>{ + let wrapped_x = toSome(x); + let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t") + var result = thenDo(thenDo(wrapped_x, s_aux), zeroOrMoreDo(s_aux)); + if (result._tag == "Some"){ + result.value.matched_type = TokenType.SP; + } + return result; + } + let newline = (x : MatcheePair) =>{ + let wrapped_x = toSome(x); + // nl = \r?\n + let result = thenDo(thenDo(wrapped_x, + zeroOrOnceDo(match1Char('\r'))), match1Char('\n')); + if (result._tag == "Some"){ + result.value.matched_type = TokenType.NL; + } + return result; + } + + let term = (token_list : Array, x : Some)=>{ + var ln = 1; + var col = 0; + var old_x = x; + let term_list = [newline, space, integer]; + let term_aux = term_list.reduce((x,y)=> orDo(x,y)); + + var new_x : Maybe = thenDo(old_x, term_aux); + while (new_x._tag != "None"){ + if (new_x.value.matched_type != TokenType.NL){ + col += new_x.value.matched.length; + token_list.push({text : new_x.value.matched, + type: new_x.value.matched_type, + ln : ln, + col : col}); + + } + else{ + col = 0; + ln += 1; + + token_list.push({text : new_x.value.matched, + type: new_x.value.matched_type, + ln : ln, + col : col}); + + } + + + old_x = toSome({matched : "", + remained : new_x.value.remained}); + new_x = thenDo(old_x, term_aux); + } + + if (old_x.value.remained.length){ + console.log(token_list); + throw new Error("the code can't be tokenized is near Ln. "+ln+", Col."+col + +", starting with "+ old_x.value.remained.substring(0,10)); + } + + return token_list; + } + + console.log(term([], input_matchee_pair)); + // TODO: id, string, space, basic operator, 3 marks: @, {, }. } -tokenize("+123"); -tokenize("123"); -tokenize("-123"); -tokenize(" 123"); -tokenize("c123"); + diff --git a/tests/index.js b/tests/index.js index 891511c..b50c103 100644 --- a/tests/index.js +++ b/tests/index.js @@ -57,6 +57,27 @@ let doTestRes9 = thenDo(doThenTestee9, src_1.matchAny); assert(doTestRes9._tag == "Some"); assert(doTestRes9.value.matched == "妳"); assert(doTestRes9.value.remained == "的"); +(0, src_1.tokenize)("+123"); +(0, src_1.tokenize)("123"); +(0, src_1.tokenize)("-123"); +(0, src_1.tokenize)(" 123"); +try { + (0, src_1.tokenize)("c123"); +} +catch (error) { + console.log(error); +} +(0, src_1.tokenize)(" "); +(0, src_1.tokenize)(" "); +(0, src_1.tokenize)(" \t"); +(0, src_1.tokenize)(" \t123"); +try { + (0, src_1.tokenize)(" \t123aaa456"); +} +catch (error) { + console.log(error); +} +(0, src_1.tokenize)(" \t123\n456"); // harfbuzz test let harfbuzz = require("../src/harfbuzz.js"); harfbuzz.harfbuzzTest("123.abc"); diff --git a/tests/index.ts b/tests/index.ts index 653f3cf..e4c7344 100644 --- a/tests/index.ts +++ b/tests/index.ts @@ -1,4 +1,4 @@ -import { matchAny } from "../src"; +import { matchAny, tokenize } from "../src"; let assert = require("assert"); let cloMain = require("../src"); @@ -74,6 +74,31 @@ assert(doTestRes9._tag == "Some"); assert(doTestRes9.value.matched == "妳"); assert(doTestRes9.value.remained == "的"); +tokenize("+123"); +tokenize("123"); +tokenize("-123"); +tokenize(" 123"); +try { + tokenize("c123"); + +} catch (error) { + console.log(error); +} + +tokenize(" "); +tokenize(" "); +tokenize(" \t"); +tokenize(" \t123"); + +try { + tokenize(" \t123aaa456"); + + +} catch (error) { + console.log(error); +} +tokenize(" \t123\n456"); + // harfbuzz test let harfbuzz = require("../src/harfbuzz.js");