// advent-of-code-2023/lexer_gen/re.odin

package lexer_gen
import "core:os"
import "core:fmt"
import "core:strings"
import "core:testing"
import "core:log"
logger := log.create_console_logger()
INDENTATION :: " "
print_indentation :: proc(level: int) {
    for ii := 0; ii < level; ii += 1 {
        fmt.print(INDENTATION)
    }
}
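
// Token kinds produced by the tokenizer: one kind per single-character
// regex metacharacter, LITERAL for runs of ordinary characters, and the
// internal _EOS marker for end-of-stream.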
Tokens :: enum {
    ASTERISK,
    BACK_SLASH,
    CARROT,
    CLOSE_BRACKET,
    CLOSE_PAREN,
    DOLLAR,
    DOT,
    FORWARD_SLASH,
    LITERAL,
    OPEN_BRACKET,
    OPEN_PAREN,
    PIPE,
    TACK,
    UNDERLINE,
    _EOS,
}
token_type_string :: proc(tt: Tokens) -> string {
    switch tt {
    case .ASTERISK:      return "ASTERISK"
    case .BACK_SLASH:    return "BACK_SLASH"
    case .CARROT:        return "CARROT"
    case .CLOSE_BRACKET: return "CLOSE_BRACKET"
    case .CLOSE_PAREN:   return "CLOSE_PAREN"
    case .DOLLAR:        return "DOLLAR"
    case .DOT:           return "DOT"
    case .FORWARD_SLASH: return "FORWARD_SLASH"
    case .LITERAL:       return "LITERAL"
    case .OPEN_BRACKET:  return "OPEN_BRACKET"
    case .OPEN_PAREN:    return "OPEN_PAREN"
    case .PIPE:          return "PIPE"
    case .TACK:          return "TACK"
    case .UNDERLINE:     return "UNDERLINE"
    case ._EOS:          return "_EOS"
    }
    panic("Unreachable")
}
print_token_type :: proc(tt: Tokens) {
    fmt.print(token_type_string(tt))
}
Token :: struct {
    type_: Tokens,
    start: int, // byte offset of the token's first character
    end:   int, // byte offset one past the token's last character
}

print_token :: proc(token: ^Token) {
    using token
    fmt.printf("(%s start=%d end=%d)", token_type_string(type_), start, end)
}
Tokenizer :: struct {
    buffer: []u8,
    tokens: [dynamic]^Token,
    idx:    int,
}

Parser :: struct {
    tokenizer: ^Tokenizer,
    ast:       ^AstNode,
    current:   ^AstNode,
    stack:     [dynamic]^AstNode,
}
new_parser :: proc(tokenizer: ^Tokenizer) -> ^Parser {
    p := new(Parser)
    p.tokenizer = tokenizer
    p.ast = new_ast_node(Group)
    p.current = p.ast
    p.ast.start = 0
    return p
}
AstNode :: struct {
    start:   int,
    end:     int,
    next:    ^AstNode,
    variant: union {
        ^Group, ^Literal, ^Range, ^Repeater,
    },
}

new_ast_node :: proc($T: typeid) -> ^T {
    e := new(T)
    e.variant = e
    return e
}
Literal :: struct {
    using node: AstNode,
}

print_literal :: proc(parser: ^Parser, node: ^Literal, level := 0) {
    using parser
    value := tokenizer.buffer[node.start:node.end]
    print_indentation(level)
    fmt.printf("(Literal start=%d end=%d value='%s')", node.start, node.end, value)
}
Range :: struct {
    using node: AstNode,
    lbound: rune,
    ubound: rune,
}

print_range :: proc(parser: ^Parser, range: ^Range, level := 0) {
    print_indentation(level)
    fmt.printf("(range %c %c)", range.lbound, range.ubound)
}
Group :: struct {
    using node: AstNode,
    child: ^AstNode,
}

print_group :: proc(parser: ^Parser, group: ^Group, level := 0) {
    if parser == nil || group == nil {
        return // TODO handle error
    }
    fmt.print("(group ")
    for current := group.child; current != nil; current = current.next {
        print_node(parser, current, level + 1)
    }
    fmt.print(")")
}
RepeaterType :: enum {
    One,
    OneOrMore,
    ZeroOrMore,
    ExactRange,
    _None,
}

repeater_type_string :: proc(rt: RepeaterType) -> string {
    switch rt {
    case .One:        return "One"
    case .OneOrMore:  return "OneOrMore"
    case .ZeroOrMore: return "ZeroOrMore"
    case .ExactRange: return "ExactRange"
    case ._None:      return "_None"
    }
    panic("Unreachable")
}
Repeater :: struct {
    using node: AstNode,
    target: ^AstNode,
    type_:  RepeaterType,
}

print_repeater :: proc(parser: ^Parser, node: ^Repeater, level := 0) {
    using node
    print_indentation(level)
    fmt.printf("(repeat %s \n", repeater_type_string(type_))
    if target != nil { // target may be unset while parsing is incomplete
        print_node(parser, target, level + 1)
    }
    fmt.print(")")
}
print_node :: proc(parser: ^Parser, node: ^AstNode, level := 0) {
    switch node_ in node.variant {
    case ^Group:    print_group(parser, node_, level)
    case ^Literal:  print_literal(parser, node_, level)
    case ^Repeater: print_repeater(parser, node_, level)
    case ^Range:    print_range(parser, node_, level)
    }
}
new_tokenizer :: proc() -> (state: ^Tokenizer) {
    state = new(Tokenizer)
    using state
    tokens = make([dynamic]^Token)
    idx = 0
    return
}
scan_one :: proc(tokenizer: ^Tokenizer, tt: Tokens) {
    using tokenizer
    log.debugf("%s: Call with type %s\n", #procedure, tt)
    new_tok := new(Token)
    new_tok.type_ = tt
    new_tok.start = idx
    new_tok.end = idx + 1
    append(&tokens, new_tok)
}
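
// Safe peeking helpers: each returns false instead of panicking when the
// array has no element to inspect.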
peek_front_safe :: proc(array: [dynamic]$E) -> (E, bool) {
    if len(array) == 0 { // also covers a nil array
        return nil, false
    }
    return array[0], true
}

peek_safe :: proc(array: [dynamic]$E) -> (E, bool) {
    if len(array) == 0 {
        return nil, false
    }
    return array[len(array) - 1], true
}
peek_type :: proc(tokens: [dynamic]^Token, start := 0) -> Tokens {
    if start < 0 || start >= len(tokens) {
        return ._EOS
    }
    return tokens[start].type_
}
scan_one_maybe :: proc(tokenizer: ^Tokenizer) -> bool {
    switch tokenizer.buffer[tokenizer.idx] {
    case '[':  scan_one(tokenizer, .OPEN_BRACKET)
    case ']':  scan_one(tokenizer, .CLOSE_BRACKET)
    case '(':  scan_one(tokenizer, .OPEN_PAREN)
    case ')':  scan_one(tokenizer, .CLOSE_PAREN)
    case '|':  scan_one(tokenizer, .PIPE)
    case '^':  scan_one(tokenizer, .CARROT)
    case '$':  scan_one(tokenizer, .DOLLAR)
    case '*':  scan_one(tokenizer, .ASTERISK)
    case '-':  scan_one(tokenizer, .TACK)
    case '_':  scan_one(tokenizer, .UNDERLINE)
    case '.':  scan_one(tokenizer, .DOT)
    case '\\': scan_one(tokenizer, .BACK_SLASH)
    case '/':  scan_one(tokenizer, .FORWARD_SLASH)
    case:      return false
    }
    return true
}
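
// tokenize walks the buffer once. Metacharacters become one-character
// tokens via scan_one_maybe; any other character opens (or extends) a
// LITERAL token, which is closed when the next metacharacter appears or
// the buffer ends.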
tokenize :: proc(buffer: []u8) -> ^Tokenizer {
    tokenizer := new_tokenizer()
    tokenizer.buffer = buffer
    literal_tok: ^Token = nil
    for ; tokenizer.idx < len(tokenizer.buffer); tokenizer.idx += 1 {
        scanned := scan_one_maybe(tokenizer)
        if scanned && literal_tok != nil {
            // End a literal
            literal_tok.end = tokenizer.idx
            literal_tok = nil
        } else if !scanned && literal_tok == nil {
            // Start a literal
            literal_tok = new(Token)
            literal_tok.type_ = .LITERAL
            literal_tok.start = tokenizer.idx
            literal_tok.end = tokenizer.idx
            log.debugf("Starting literal at idx=%d\n", tokenizer.idx)
            append(&tokenizer.tokens, literal_tok)
        }
    }
    // Close a literal that runs to the end of the buffer
    if literal_tok != nil {
        literal_tok.end = tokenizer.idx
    }
    assert(len(tokenizer.tokens) > 0)
    return tokenizer
}
print_token_stream :: proc(stream: []^Token) {
    for tok, index in stream {
        fmt.printf("%d: ", index)
        print_token(tok)
        fmt.print("\n")
    }
}
@(test)
test_tokenize_basic :: proc(sig: ^testing.T) {
    buf, ok := os.read_entire_file_from_filename("./test/test_tokenizer__basic.input")
    assert(ok)
    tokenizer := tokenize(buf)
    assert(
        len(tokenizer.tokens) == 2,
        fmt.tprintf("len(tokens) = %d != 2\n", len(tokenizer.tokens)))
    print_token_stream(tokenizer.tokens[:])
    assert(
        tokenizer.tokens[0].type_ == Tokens.LITERAL,
        fmt.tprintf("tt = %s\n", tokenizer.tokens[0].type_))
    assert(
        tokenizer.tokens[1].type_ == Tokens.ASTERISK,
        fmt.tprintf("tt = %s\n", tokenizer.tokens[1].type_))
}
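
// parse_escape consumes a BACK_SLASH token plus the token it escapes and
// yields a Literal node spanning both.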
parse_escape :: proc(parser: ^Parser) -> ^AstNode {
    using parser
    node := new_ast_node(Literal)
    tok, ok := pop_front_safe(&tokenizer.tokens)
    if !ok {
        free(node)
        return nil
    }
    node.start = tok.start
    assert(tok.type_ == .BACK_SLASH)
    tok, ok = pop_front_safe(&tokenizer.tokens)
    if !ok {
        // Dangling escape at end of input
        free(node)
        return nil
    }
    if tok.type_ == .LITERAL {
        // TODO Cannot escape a literal
    }
    node.end = tok.end
    return node
}
parse_zero_or_more_repeater :: proc(parser: ^Parser) -> ^AstNode {
    using parser
    // Create an AST node representing the repeater. Tokens are consumed
    // from the front of the stream, matching peek_type and the other
    // parse_* procs.
    repeater_token, ok := pop_front_safe(&tokenizer.tokens)
    if !ok {
        // TODO Handle error
        return nil
    }
    assert(repeater_token.type_ == .ASTERISK)
    node := new_ast_node(Repeater)
    node.type_ = .ZeroOrMore
    node.start = repeater_token.start
    node.end = repeater_token.end
    // TODO attach the node the repeater operates on (node.target is
    // still nil here)
    return node
}
parse_literal :: proc(parser: ^Parser) -> ^AstNode {
    tok, ok := pop_front_safe(&parser.tokenizer.tokens)
    if !ok { return nil }
    // new_ast_node (not plain new) so the variant tag is set and
    // print_node can dispatch on it
    node := new_ast_node(Literal)
    node.start = tok.start
    node.end = tok.end
    value := parser.tokenizer.buffer[node.start:node.end]
    log.debugf("Finished parsing type=Literal with value='%s'\n", value)
    return node
}
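
// parse_token_stream drives the parse: it repeatedly dispatches on the
// front token's type and appends the resulting node to the chain rooted
// at the parser's initial Group node.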
parse_token_stream :: proc(tokenizer: ^Tokenizer) -> ^Parser {
    parser := new_parser(tokenizer)
    // The inner loop links nodes onto parser.ast; keep that root rather
    // than overwriting it with the last node parsed
    parse_token_stream__inner(parser)
    log.debugf("ast root at %p\n", parser.ast)
    return parser
}
parse_token_stream__inner :: proc(parser: ^Parser) -> ^AstNode {
    using parser
    next: ^AstNode
    for len(tokenizer.tokens) > 0 {
        type_ := peek_type(tokenizer.tokens)
        log.debugf("Parsing token of type=%s\n", type_)
        #partial switch type_ {
        case .LITERAL:
            next = parse_literal(parser)
        case .BACK_SLASH:
            next = parse_escape(parser)
        case .ASTERISK:
            next = parse_zero_or_more_repeater(parser)
        case:
            msg := fmt.tprintf("Don't know how to handle token of type=%s\n", type_)
            assert(false, msg)
        }
        if next == nil {
            return current
        } else if current != nil {
            current.next = next
        }
        current = next
    }
    return current
}
print_parse_tree :: proc(parser: ^Parser) {
    for current := parser.ast; current != nil; current = current.next {
        print_node(parser, current, 0)
    }
}
@(test)
test_parse_token_stream :: proc(sig: ^testing.T) {
    buf, ok := os.read_entire_file_from_filename("./test/test_parser__basic.input")
    assert(ok)
    tokenizer := tokenize(buf)
    parser := parse_token_stream(tokenizer)
    print_parse_tree(parser)
}
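
// iter_lines yields one line per call, without its trailing newline, and
// advances index past it. The second return value is true once the
// buffer is exhausted.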
iter_lines :: proc(buffer: []u8, index: ^int) -> ([]u8, bool) {
    if index^ < 0 || index^ >= len(buffer) {
        return nil, true
    }
    lbound := index^
    for index^ < len(buffer) {
        if buffer[index^] == '\n' {
            index^ += 1 // Skip the newline
            return buffer[lbound:index^ - 1], false
        }
        index^ += 1
    }
    // Final line with no trailing newline
    return buffer[lbound:index^], false
}
@(test)
test_iter_lines :: proc(sig: ^testing.T) {
    input, ok_ := os.read_entire_file_from_filename("./test/test_iter_lines__basic.input")
    assert(ok_)
    linum := 1
    index := 0
    line, stop_iter := iter_lines(input, &index)
    for !stop_iter {
        fmt.printf("Line %0d: %s\n", linum, line)
        linum += 1
        line, stop_iter = iter_lines(input, &index)
    }
}
parse_token_expectation :: proc(filename: string) {
    buffer, ok := os.read_entire_file_from_filename(filename)
    if !ok {
        // TODO handle error
        log.errorf("%s\n", os.get_last_error_string())
        return
    }
    start := 0
    line, stop := iter_lines(buffer, &start)
    for ; !stop; line, stop = iter_lines(buffer, &start) {
        // TODO parse the expectation on each line
        _ = line
    }
}
re_compile :: proc(pattern: string) -> ^AstNode {
    pattern_ := strings.clone(pattern)
    tokenizer := tokenize(transmute([]u8)pattern_)
    parser := parse_token_stream(tokenizer)
    return parser.ast
}
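
// Example usage (sketch):
//
//     ast := re_compile("abc*")
//     for node := ast; node != nil; node = node.next {
//         // dispatch on node.variant, or print via print_node
//     }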
re_match :: proc() {
    // TODO not yet implemented
    assert(false)
}

re_search :: proc() {
    // TODO not yet implemented
    assert(false)
}