package lexer_gen

import "core:os"
import "core:fmt"
import "core:strings"
import "core:testing"
import "core:log"

logger := log.create_console_logger()

INDENTATION :: "  "

print_indentation :: proc (level: int) {
    for ii := 0; ii < level; ii += 1 {
        fmt.print(INDENTATION)
    }
}

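// Token kinds produced by the tokenizer: one entry per regex metacharacter,
// plus LITERAL for runs of ordinary characters and _EOS as the end-of-stream
// sentinel returned by peek_type when no tokens remain.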
Tokens :: enum {
    ASTERISK,
    BACK_SLASH,
    CARROT,
    CLOSE_BRACKET,
    CLOSE_PAREN,
    DOLLAR,
    DOT,
    FORWARD_SLASH,
    LITERAL,
    OPEN_BRACKET,
    OPEN_PAREN,
    PIPE,
    TACK,
    UNDERLINE,
    _EOS,
}

token_type_string :: proc (tt: Tokens) -> string {
    switch tt {
    case .ASTERISK:      return "ASTERISK"
    case .BACK_SLASH:    return "BACK_SLASH"
    case .CARROT:        return "CARROT"
    case .CLOSE_BRACKET: return "CLOSE_BRACKET"
    case .CLOSE_PAREN:   return "CLOSE_PAREN"
    case .DOLLAR:        return "DOLLAR"
    case .DOT:           return "DOT"
    case .FORWARD_SLASH: return "FORWARD_SLASH"
    case .LITERAL:       return "LITERAL"
    case .OPEN_BRACKET:  return "OPEN_BRACKET"
    case .OPEN_PAREN:    return "OPEN_PAREN"
    case .PIPE:          return "PIPE"
    case .TACK:          return "TACK"
    case .UNDERLINE:     return "UNDERLINE"
    case ._EOS:          return "_EOS"
    }
    panic("Unreachable");
}

print_token_type :: proc(tt: Tokens) {
    fmt.print(token_type_string(tt));
}

Token :: struct {
    type_: Tokens,
    start: int,
    end: int,
}

print_token :: proc (token: ^Token) {
    using token
    fmt.printf("(%s start=%d end=%d)", token_type_string(type_), start, end)
}

Tokenizer :: struct {
    buffer: []u8,
    tokens: [dynamic](^Token),
    idx: int,
}

Parser :: struct {
    tokenizer: ^Tokenizer,
    ast: ^AstNode,
    current: ^AstNode,
    stack: [dynamic]^AstNode,
}

new_parser :: proc(tokenizer: ^Tokenizer) -> ^Parser {
    p := new(Parser);
    p.tokenizer = tokenizer;
    p.ast = new_ast_node(Group);
    p.current = p.ast;
    p.ast.start = 0;
    return p;
}

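// AST nodes use the common Odin "base struct + variant union of pointers"
// idiom: every concrete node embeds AstNode via `using`, and `variant` points
// back at the concrete node so print_node can type-switch on it.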
AstNode :: struct {
    start: int,
    end: int,
    next: ^AstNode,
    variant: union {
        ^Group, ^Literal, ^Range, ^Repeater,
    },
}

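// Sketch of how a node is created and tagged (mirrors what new_parser does):
//   g := new_ast_node(Group)   // g: ^Group, with g.variant == g
//   g.start = 0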
new_ast_node :: proc($T: typeid) -> ^T {
    e := new(T)
    e.variant = e
    return e
}

Literal :: struct {
    using node: AstNode,
}

print_literal :: proc(parser: ^Parser, node: ^Literal, level := 0) {
    using parser
    value := tokenizer.buffer[node.start:node.end]
    print_indentation(level)
    fmt.printf("(Literal start=%d end=%d value='%s')", node.start, node.end, value)
}

Range :: struct {
    using node: AstNode,
    lbound: rune,
    ubound: rune,
}

print_range :: proc (parser: ^Parser, range: ^Range, level := 0) {
    print_indentation(level);
    fmt.printf("(range %c %c)", range.lbound, range.ubound);
}

Group :: struct {
    using node: AstNode,
    child: ^AstNode,
}

print_group :: proc(parser: ^Parser, group: ^Group, level := 0) {
    // TODO handle nil inputs more gracefully than silently returning
    if parser == nil || group == nil {
        return
    }
    fmt.print("(group ")
    current: ^AstNode = group.child;
    for current != nil {
        print_node(parser, current, level + 1)
        current = current.next
    }
    fmt.print(")")
}

RepeaterType :: enum {
    One,
    OneOrMore,
    ZeroOrMore,
    ExactRange,
    _None,
}

repeater_type_string :: proc(rt: RepeaterType) -> string {
    switch rt {
    case .One:        return "One"
    case .OneOrMore:  return "OneOrMore"
    case .ZeroOrMore: return "ZeroOrMore"
    case .ExactRange: return "ExactRange"
    case ._None:      return "_None"
    }
    panic("Unreachable")
}

Repeater :: struct {
    using node: AstNode,
    target: ^AstNode,
    type_: RepeaterType,
}

print_repeater :: proc(parser: ^Parser, node: ^Repeater, level := 0) {
    using node
    print_indentation(level);
    fmt.printf("(repeat %s \n", repeater_type_string(type_));
    if target != nil {
        // Guard: the parser does not yet attach a target to repeaters
        print_node(parser, target, level + 1);
    }
    fmt.print(")");
}

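// Dispatch on the variant tag to the matching printer. A node whose variant
// was never set (e.g. allocated with plain new(T)) matches no case and prints
// nothing, so nodes are expected to come from new_ast_node.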
print_node :: proc(parser: ^Parser, node: $T/^AstNode, level := 0) {
    switch node_ in node.variant {
    case ^Group:    print_group(parser, node_, level);
    case ^Literal:  print_literal(parser, node_, level);
    case ^Repeater: print_repeater(parser, node_, level);
    case ^Range:    print_range(parser, node_, level);
    }
}

new_tokenizer :: proc() -> (state: ^Tokenizer) {
    state = new(Tokenizer)
    using state
    tokens = make([dynamic](^Token))
    idx = 0
    return
}

scan_one :: proc(tokenizer: ^Tokenizer, tt: Tokens) {
    using tokenizer;
    fmt.printf("%s: Call with type %v\n", #procedure, tt);
    new_tok := new(Token);
    new_tok.type_ = tt;
    new_tok.start = idx;
    new_tok.end = idx + 1;
    append(&tokens, new_tok);
}

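// Non-destructive peeks at either end of a dynamic array. Both return
// (nil, false) for a nil or empty array instead of panicking, mirroring the
// built-in pop_safe / pop_front_safe pair used elsewhere in the parser.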
peek_front_safe :: proc(array: [dynamic]$E) -> (E, bool) {
    if array == nil || len(array) == 0 {
        return nil, false;
    }
    return array[0], true;
}

peek_safe :: proc(array: [dynamic]$E) -> (E, bool) {
    if array == nil || len(array) == 0 {
        return nil, false;
    }
    return array[len(array) - 1], true;
}

peek_type :: proc (tokens: [dynamic]^Token, start := 0) -> Tokens {
    tok, ok := peek_front_safe(tokens);
    if !ok {
        return ._EOS;
    }
    return tok.type_;
}

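// Try to scan exactly one single-character token at the current index.
// Returns false when the character is not a metacharacter, which tokenize
// treats as part of a LITERAL run.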
scan_one_maybe :: proc(tokenizer: ^Tokenizer) -> bool {
    switch tokenizer.buffer[tokenizer.idx] {
    case '[':  scan_one(tokenizer, Tokens.OPEN_BRACKET);
    case ']':  scan_one(tokenizer, Tokens.CLOSE_BRACKET);
    case '(':  scan_one(tokenizer, Tokens.OPEN_PAREN);
    case ')':  scan_one(tokenizer, Tokens.CLOSE_PAREN);
    case '|':  scan_one(tokenizer, Tokens.PIPE);
    case '^':  scan_one(tokenizer, Tokens.CARROT);
    case '$':  scan_one(tokenizer, Tokens.DOLLAR);
    case '*':  scan_one(tokenizer, Tokens.ASTERISK);
    case '-':  scan_one(tokenizer, Tokens.TACK);
    case '_':  scan_one(tokenizer, Tokens.UNDERLINE);
    case '.':  scan_one(tokenizer, Tokens.DOT);
    case '\\': scan_one(tokenizer, Tokens.BACK_SLASH);
    case '/':  scan_one(tokenizer, Tokens.FORWARD_SLASH);
    case:
        return false;
    }
    return true;
}

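// Tokenize the whole buffer: metacharacters become one-character tokens and
// maximal runs of other characters become LITERAL tokens carrying start/end
// offsets into the buffer.
//
// Hedged example (assuming an input resembling "ab*", which is the shape
// test_tokenize_basic expects from its input file):
//   tokens[0] = LITERAL  start=0 end=2   // "ab"
//   tokens[1] = ASTERISK start=2 end=3   // "*"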
tokenize :: proc (buffer: []u8) -> ^Tokenizer {
    tokenizer := new_tokenizer();
    tokenizer.buffer = buffer;
    literal_tok: ^Token = nil;

    for ; tokenizer.idx < len(tokenizer.buffer); tokenizer.idx += 1 {
        scanned := scan_one_maybe(tokenizer);
        if scanned && literal_tok != nil {
            // End a literal
            literal_tok.end = tokenizer.idx;
            literal_tok = nil;
        } else if !scanned && literal_tok == nil {
            // Start a literal
            literal_tok = new(Token);
            literal_tok.type_ = .LITERAL;
            literal_tok.start = tokenizer.idx;
            literal_tok.end = tokenizer.idx;
            log.debugf("Starting literal at idx=%d\n", tokenizer.idx);
            append(&tokenizer.tokens, literal_tok);
        }
    }

    // Close a literal that runs to the end of the buffer
    if literal_tok != nil {
        literal_tok.end = tokenizer.idx;
    }

    assert(len(tokenizer.tokens) > 0);
    return tokenizer;
}

print_token_stream :: proc(stream: []^Token) {
    for tok, index in stream {
        fmt.printf("%d: ", index);
        print_token(tok);
        fmt.print("\n");
    }
}

@(test)
test_tokenize_basic :: proc(sig: ^testing.T) {
    buf, ok := os.read_entire_file_from_filename("./test/test_tokenizer__basic.input");
    assert(ok);
    tokenizer := tokenize(buf);
    assert(
        len(tokenizer.tokens) == 2,
        fmt.tprintf("len(tokens) = %d != 2\n", len(tokenizer.tokens)));
    print_token_stream(tokenizer.tokens[:]);
    assert(
        tokenizer.tokens[0].type_ == Tokens.LITERAL,
        fmt.tprintf("tt = %v\n", tokenizer.tokens[0].type_));
    assert(
        tokenizer.tokens[1].type_ == Tokens.ASTERISK,
        fmt.tprintf("tt = %v\n", tokenizer.tokens[1].type_));
}

parse_escape :: proc(parser: ^Parser) -> ^AstNode {
    using parser
    node := new_ast_node(Literal)
    tok, ok := pop_front_safe(&(tokenizer.tokens))
    if !ok {
        free(node)
        return nil
    }
    node.start = tok.start
    assert(tok.type_ == .BACK_SLASH)
    tok, ok = pop_front_safe(&tokenizer.tokens)
    if !ok {
        // Nothing follows the backslash
        free(node)
        return nil
    }
    if tok.type_ == .LITERAL {
        // TODO Cannot escape a literal
    }
    node.end = tok.end
    return node
}

parse_zero_or_more_repeater :: proc(parser: ^Parser) -> ^AstNode {
    using parser

    // Consume the repeater token from the front of the stream, like the
    // other parse_* procs
    ok: bool
    repeater_token: ^Token
    repeater_token, ok = pop_front_safe(&tokenizer.tokens)
    if !ok {
        // TODO Handle error
        return nil
    }
    assert(repeater_token.type_ == .ASTERISK)

    // TODO Attach the ast node that the repeater operates on (node.target)
    node := new_ast_node(Repeater)
    node.type_ = .ZeroOrMore
    node.start = repeater_token.start
    node.end = repeater_token.end

    return node
}

parse_literal :: proc(parser: ^Parser) -> ^AstNode {
    tok, ok := pop_front_safe(&parser.tokenizer.tokens)
    if !ok { return nil }
    node := new_ast_node(Literal) // new_ast_node so the variant tag is set
    node.start = tok.start
    node.end = tok.end
    node.next = nil
    value := parser.tokenizer.buffer[node.start:node.end]
    log.debugf("Finished parsing type=Literal with value='%s'\n", value)
    return node
}

parse_token_stream :: proc(tokenizer: ^Tokenizer) -> ^Parser {
    parser := new_parser(tokenizer)
    // The root node created by new_parser stays at the head of the chain;
    // the inner loop links parsed nodes onto it via `next`.
    parse_token_stream__inner(parser)
    log.debugf("parsed AST root at %p\n", parser.ast)
    return parser
}

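// Consume tokens from the front of the stream and link each parsed node onto
// the previous one through `next`, starting from the Group node created by
// new_parser. Parsing stops early (returning the last good node) when a
// sub-parser yields nil.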
parse_token_stream__inner :: proc(parser: ^Parser) -> ^AstNode {
    using parser
    next: ^AstNode
    for len(tokenizer.tokens) > 0 {
        type_ := peek_type(tokenizer.tokens)
        log.debugf("Parsing token of type=%v\n", type_)
        #partial switch type_ {
        case .LITERAL:
            next = parse_literal(parser)
        case .BACK_SLASH:
            next = parse_escape(parser)
        case .ASTERISK:
            next = parse_zero_or_more_repeater(parser)
        case:
            msg := fmt.tprintf("Don't know how to handle token of type=%v\n", type_)
            assert(false, msg)
        }
        if next == nil {
            return current
        } else if current != nil {
            current.next = next
        }
        current = next
    }
    return current
}

print_parse_tree :: proc(parser: ^Parser) {
    for current := parser.ast; current != nil; current = current.next {
        print_node(parser, current, 0);
    }
}

@(test)
test_parse_token_stream :: proc(sig: ^testing.T) {
    buf, ok := os.read_entire_file_from_filename("./test/test_parser__basic.input");
    assert(ok)
    tokenizer := tokenize(buf)
    parser := parse_token_stream(tokenizer)
    print_parse_tree(parser)
}

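// Minimal line iterator over a byte buffer. `index` is the caller-owned
// cursor; each call returns the next line (without its trailing newline) and
// a flag that becomes true once the buffer is exhausted.
//
// Typical call pattern (see test_iter_lines):
//   index := 0
//   for line, stop := iter_lines(buf, &index); !stop; line, stop = iter_lines(buf, &index) {
//       // use line
//   }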
iter_lines :: proc(buffer: []u8, index: ^int) -> ([]u8, bool) {
    if index^ < 0 || index^ >= len(buffer) {
        return buffer, true
    }
    lbound := index^
    for index^ < len(buffer) {
        if buffer[index^] == '\n' {
            index^ += 1 // Skip the newline
            return buffer[lbound:index^ - 1], false
        }
        index^ += 1
    }
    return buffer[lbound:index^], false
}

@(test)
test_iter_lines :: proc(sig: ^testing.T) {
    input, ok_ := os.read_entire_file_from_filename("./test/test_iter_lines__basic.input")
    assert(ok_)
    linum := 0
    index := 0
    line, stop_iter := iter_lines(input, &index)
    for !stop_iter {
        linum += 1
        fmt.printf("Line %0d: %s\n", linum, line)
        line, stop_iter = iter_lines(input, &index)
    }
}

parse_token_expectation :: proc(filename: string) {
    buffer, ok := os.read_entire_file_from_filename(filename);
    if !ok {
        // TODO handle error
        log.errorf("%s\n", os.get_last_error_string());
        return;
    }
    start := 0
    for line, stop := iter_lines(buffer, &start); !stop; line, stop = iter_lines(buffer, &start) {
        _ = line // TODO parse the expected tokens on this line
    }
}

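// Public entry point sketch: compile a pattern string into an AST. The
// matching procs below are still stubs, so for now the result is only useful
// for printing/inspection, e.g. (hypothetical pattern):
//   ast := re_compile("a*")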
re_compile :: proc(pattern: string) -> ^AstNode {
    pattern_ := strings.clone(pattern);
    tokenizer := tokenize(transmute([]u8)pattern_)
    parser := parse_token_stream(tokenizer);
    return parser.ast;
}

re_match :: proc() {
    // TODO not yet implemented
    assert(false);
    return;
}

re_search :: proc() {
    // TODO not yet implemented
    assert(false);
    return;
}