package lexer_gen

import "core:os"
import "core:fmt"
import "core:strings"
import "core:testing"
import "core:log"

logger := log.create_console_logger()

INDENTATION :: " "

print_indentation :: proc(level: int) {
	for ii := 0; ii < level; ii += 1 {
		fmt.print(INDENTATION)
	}
}

Tokens :: enum {
	ASTERISK,
	BACK_SLASH,
	CARROT,
	CLOSE_BRACKET,
	CLOSE_PAREN,
	DOLLAR,
	DOT,
	FORWARD_SLASH,
	LITERAL,
	OPEN_BRACKET,
	OPEN_PAREN,
	PIPE,
	TACK,
	UNDERLINE,
	_EOS,
}

token_type_string :: proc(tt: Tokens) -> string {
	switch tt {
	case .ASTERISK:      return "ASTERISK"
	case .BACK_SLASH:    return "BACK_SLASH"
	case .CARROT:        return "CARROT"
	case .CLOSE_BRACKET: return "CLOSE_BRACKET"
	case .CLOSE_PAREN:   return "CLOSE_PAREN"
	case .DOLLAR:        return "DOLLAR"
	case .DOT:           return "DOT"
	case .FORWARD_SLASH: return "FORWARD_SLASH"
	case .LITERAL:       return "LITERAL"
	case .OPEN_BRACKET:  return "OPEN_BRACKET"
	case .OPEN_PAREN:    return "OPEN_PAREN"
	case .PIPE:          return "PIPE"
	case .TACK:          return "TACK"
	case .UNDERLINE:     return "UNDERLINE"
	case ._EOS:          return "_EOS"
	}
	panic("Unreachable");
}

print_token_type :: proc(tt: Tokens) {
	fmt.print(token_type_string(tt));
}

// A token is a tag plus the half-open byte range [start, end) it covers in the input buffer.
Token :: struct {
	type_: Tokens,
	start: int,
	end:   int,
}

print_token :: proc(token: ^Token) {
	using token
	fmt.printf("(%s start=%d end=%d)", token_type_string(type_), start, end)
}

Tokenizer :: struct {
	buffer: []u8,
	tokens: [dynamic]^Token,
	idx:    int,
}

Parser :: struct {
	tokenizer: ^Tokenizer,
	ast:       ^AstNode,
	current:   ^AstNode,
	stack:     [dynamic]^AstNode,
}

new_parser :: proc(tokenizer: ^Tokenizer) -> ^Parser {
	p := new(Parser);
	p.tokenizer = tokenizer;
	p.ast = new_ast_node(Group);
	p.current = p.ast;
	p.ast.start = 0;
	return p;
}

AstNode :: struct {
	start:   int,
	end:     int,
	next:    ^AstNode,
	variant: union{^Group, ^Literal, ^Range, ^Repeater},
}

// Allocate a node of the concrete variant type and point its variant tag at itself.
new_ast_node :: proc($T: typeid) -> ^T {
	e := new(T)
	e.variant = e
	return e
}

Literal :: struct {
	using node: AstNode,
}

print_literal :: proc(parser: ^Parser, node: ^Literal, level := 0) {
	using parser
	value := tokenizer.buffer[node.start:node.end]
	print_indentation(level)
	// Close the s-expression so the printed tree stays balanced.
	fmt.printf("(Literal start=%d end=%d value='%s')", node.start, node.end, value)
}

Range :: struct {
	using node: AstNode,
	lbound: rune,
	ubound: rune,
};

print_range :: proc(parser: ^Parser, range: ^Range, level := 0) {
	print_indentation(level);
	// Both bounds are runes, so print both with %c.
	fmt.printf("(range %c %c)", range.lbound, range.ubound);
}

Group :: struct {
	using node: AstNode,
	child: ^AstNode,
};

print_group :: proc(parser: ^Parser, group: ^Group, level := 0) {
	// TODO
	if parser == nil || group == nil {
		// TODO handle error
	}
	fmt.print("(group ")
	current: ^AstNode = group.child;
	for current != nil {
		print_node(parser, current, level + 1)
		current = current.next
	}
	fmt.print(")")
}

RepeaterType :: enum {
	One,
	OneOrMore,
	ZeroOrMore,
	ExactRange,
	_None,
}

repeater_type_string :: proc(rt: RepeaterType) -> string {
	switch rt {
	case .One:        return "One"
	case .OneOrMore:  return "OneOrMore"
	case .ZeroOrMore: return "ZeroOrMore"
	case .ExactRange: return "ExactRange"
	case ._None:      return "_None"
	}
	panic("Unreachable")
}

Repeater :: struct {
	using node: AstNode,
	target: ^AstNode,
	type_:  RepeaterType,
}

print_repeater :: proc(parser: ^Parser, node: ^Repeater, level := 0) {
	using node
	print_indentation(level);
	fmt.printf("(repeat %s \n", repeater_type_string(type_));
	print_node(parser, target, level + 1);
	fmt.print(")");
}

// Dispatch on the variant tag to the matching printer.
print_node :: proc(parser: ^Parser, node: $T/^AstNode, level := 0) {
	switch node_ in node.variant {
	case ^Group:
		print_group(parser, node_, level);
	case ^Literal:
		print_literal(parser, node_, level);
	case ^Repeater:
		print_repeater(parser, node_, level);
	case ^Range:
		print_range(parser, node_, level);
	}
}

new_tokenizer :: proc() -> (state: ^Tokenizer) {
	state = new(Tokenizer)
	using state
	tokens = make([dynamic]^Token)
	idx = 0
	return
}

// Emit a single-character token of the given type at the current index.
scan_one :: proc(tokenizer: ^Tokenizer, tt: Tokens) {
	using tokenizer;
	fmt.printf("%s: Call with type %s\n", #procedure, tt);
	new_tok := new(Token);
	new_tok.type_ = tt;
	new_tok.start = idx;
	new_tok.end = idx + 1;
	append(&tokens, new_tok);
}

peek_front_safe :: proc(array: [dynamic]$E) -> (E, bool) {
	// Guard against empty as well as nil arrays before indexing.
	if array == nil || len(array) == 0 {
		return nil, false;
	}
	return array[0], true;
}

peek_safe :: proc(array: [dynamic]$E) -> (E, bool) {
	if array == nil || len(array) == 0 {
		return nil, false;
	}
	return array[len(array) - 1], true;
}

peek_type :: proc(tokens: [dynamic]^Token, start := 0) -> Tokens {
	tok, ok := peek_front_safe(tokens);
	if !ok {
		return ._EOS;
	}
	return tok.type_;
}

// Try to scan a single-character operator token; returns false when the current
// byte is not an operator (i.e. it belongs to a literal).
scan_one_maybe :: proc(tokenizer: ^Tokenizer) -> bool {
	switch tokenizer.buffer[tokenizer.idx] {
	case '[':  scan_one(tokenizer, Tokens.OPEN_BRACKET);
	case ']':  scan_one(tokenizer, Tokens.CLOSE_BRACKET);
	case '(':  scan_one(tokenizer, Tokens.OPEN_PAREN);
	case ')':  scan_one(tokenizer, Tokens.CLOSE_PAREN);
	case '|':  scan_one(tokenizer, Tokens.PIPE);
	case '^':  scan_one(tokenizer, Tokens.CARROT);
	case '$':  scan_one(tokenizer, Tokens.DOLLAR);
	case '*':  scan_one(tokenizer, Tokens.ASTERISK);
	case '-':  scan_one(tokenizer, Tokens.TACK);
	case '_':  scan_one(tokenizer, Tokens.UNDERLINE);
	case '.':  scan_one(tokenizer, Tokens.DOT);
	case '\\': scan_one(tokenizer, Tokens.BACK_SLASH);
	case '/':  scan_one(tokenizer, Tokens.FORWARD_SLASH);
	case:      return false;
	}
	return true;
}

tokenize :: proc(buffer: []u8) -> ^Tokenizer {
	tokenizer := new_tokenizer();
	tokenizer.buffer = buffer;
	literal_tok: ^Token = nil;
	for ; tokenizer.idx < len(tokenizer.buffer); tokenizer.idx += 1 {
		scanned := scan_one_maybe(tokenizer);
		if scanned && literal_tok != nil {
			// End a literal
			literal_tok.end = tokenizer.idx;
			literal_tok = nil;
		} else if !scanned && literal_tok == nil {
			// Start a literal
			literal_tok = new(Token);
			literal_tok.type_ = .LITERAL;
			literal_tok.start = tokenizer.idx;
			literal_tok.end = tokenizer.idx;
			fmt.printf("Appending!\n");
			append(&tokenizer.tokens, literal_tok);
		}
	}
	if literal_tok != nil {
		// Close a literal that runs to the end of the buffer.
		literal_tok.end = tokenizer.idx;
	}
	assert(len(tokenizer.tokens) > 0);
	return tokenizer;
}

print_token_stream :: proc(stream: []^Token) {
	for tok, index in stream {
		fmt.printf("%d: ", index);
		print_token(tok);
		fmt.print("\n");
	}
}

@(test)
test_tokenize_basic :: proc(sig: ^testing.T) {
	buf, ok := os.read_entire_file_from_filename("./test/test_tokenizer__basic.input");
	assert(ok);
	tokenizer := tokenize(buf);
	assert(
		len(tokenizer.tokens) == 2,
		fmt.tprintf("len(tokens) = %d != 2\n", len(tokenizer.tokens)));
	print_token_stream(tokenizer.tokens[:]);
	assert(
		tokenizer.tokens[0].type_ == Tokens.LITERAL,
		fmt.tprintf("tt = %s\n", tokenizer.tokens[0].type_));
	assert(
		tokenizer.tokens[1].type_ == Tokens.ASTERISK,
		fmt.tprintf("tt = %s\n", tokenizer.tokens[1].type_));
}

// Parse a backslash escape: a BACK_SLASH token followed by the escaped token,
// folded into a single Literal node.
parse_escape :: proc(parser: ^Parser) -> ^AstNode {
	using parser
	node := new_ast_node(Literal)
	tok, ok := pop_front_safe(&tokenizer.tokens)
	if !ok {
		free(node)
		return nil
	}
	node.start = tok.start
	assert(tok.type_ == .BACK_SLASH)
	tok, ok = pop_front_safe(&tokenizer.tokens)
	if !ok {
		// TODO
	}
	if tok.type_ == .LITERAL {
		// TODO Cannot escape a literal
	}
	node.end = tok.end
	return node
}

parse_zero_or_more_repeater :: proc(parser: ^Parser) -> ^AstNode {
	using parser
	// Create an AST node representing the repeater
	ok: bool
	repeater_token: ^Token
	// Tokens are consumed from the front of the stream, matching peek_type.
	repeater_token, ok = pop_front_safe(&tokenizer.tokens)
	if !ok {
		// TODO Handle error
		return nil
	}
	assert(repeater_token.type_ == .ASTERISK)
	// Attach the ast node that the repeater operates on
	node := new_ast_node(Repeater)
	node.type_ = .ZeroOrMore
	node.start = repeater_token.start
	node.end = repeater_token.end
	return node
}

parse_literal :: proc(parser: ^Parser) -> ^AstNode {
	tok, ok := pop_front_safe(&parser.tokenizer.tokens)
	if !ok {
		return nil
	}
	// Use new_ast_node so the variant tag is set and print_node can dispatch on it.
	node := new_ast_node(Literal)
	node.start = tok.start
	node.end = tok.end
	node.next = nil
	value := parser.tokenizer.buffer[node.start:node.end]
	log.debugf("Finished parsing type=Literal with value='%s'\n", value)
	return node
}

parse_token_stream :: proc(tokenizer: ^Tokenizer) -> ^Parser {
	parser := new_parser(tokenizer)
	parser.ast = parse_token_stream__inner(parser)
	fmt.printf("%x\n", &parser.ast)
	return parser
}

parse_token_stream__inner :: proc(parser: ^Parser) -> ^AstNode {
	using parser
	next: ^AstNode
	for len(tokenizer.tokens) > 0 {
		type_ := peek_type(tokenizer.tokens)
		log.debugf("Parsing token of type=%s\n", type_)
		#partial switch type_ {
		case .LITERAL:
			next = parse_literal(parser)
		case .BACK_SLASH:
			next = parse_escape(parser)
		case .ASTERISK:
			next = parse_zero_or_more_repeater(parser)
		case:
			msg := fmt.tprintf("Don't know how to handle token of type=%s\n", type_)
			assert(false, msg)
		}
		if next == nil {
			return current
		} else if current != nil {
			current.next = next
		}
		current = next
	}
	return current
}

print_parse_tree :: proc(parser: ^Parser) {
	for current := parser.ast; current != nil; current = current.next {
		print_node(parser, current, 0);
	}
}

@(test)
test_parse_token_stream :: proc(sig: ^testing.T) {
	buf, ok := os.read_entire_file_from_filename("./test/test_parser__basic.input");
	assert(ok)
	tokenizer := tokenize(buf)
	parser := parse_token_stream(tokenizer)
	print_parse_tree(parser)
}

// Yield the next line of buffer (without its newline), advancing index past it.
// The second return value is true once the whole buffer has been consumed.
iter_lines :: proc(buffer: []u8, index: ^int) -> ([]u8, bool) {
	if index^ < 0 || index^ >= len(buffer) {
		return buffer, true
	}
	lbound := index^
	for index^ < len(buffer) {
		if buffer[index^] == '\n' {
			index^ += 1 // Skip the newline
			return buffer[lbound:index^ - 1], false
		}
		index^ += 1
	}
	return buffer[lbound:index^], false
}

@(test)
test_iter_lines :: proc(sig: ^testing.T) {
	input, ok_ := os.read_entire_file_from_filename("./test/test_iter_lines__basic.input")
	assert(ok_)
	linum := 1
	index := 0
	line, stop_iter := iter_lines(input, &index)
	for !stop_iter {
		fmt.printf("Line %d: %s\n", linum, line)
		linum += 1
		line, stop_iter = iter_lines(input, &index)
	}
}

parse_token_expectation :: proc(filename: string) {
	buffer, ok := os.read_entire_file_from_filename(filename);
	if !ok {
		// TODO handle error
		log.errorf("%s\n", os.get_last_error_string());
		return;
	}
	start := 0
	line, stop_iter := iter_lines(buffer, &start)
	for !stop_iter {
		// TODO compare each line against the expected token stream
		_ = line
		line, stop_iter = iter_lines(buffer, &start)
	}
}

re_compile :: proc(pattern: string) -> ^AstNode {
	pattern_ := strings.clone(pattern);
	tokenizer := tokenize(transmute([]u8)pattern_)
	parser := parse_token_stream(tokenizer);
	return parser.ast;
}

re_match :: proc() {
	assert(false);
	return;
}

re_search :: proc() {
	assert(false);
	return;
}