From c955f2fb933865a50e791e2be91adbcb34bbadc9 Mon Sep 17 00:00:00 2001 From: Nic Gaffney Date: Fri, 28 Jun 2024 00:54:59 -0500 Subject: feat(tokenizer): Move tokenizer into a struct Moved the tokenizer into a struct and created a string iterator to make tokenizing easier --- src/tokenize.zig | 132 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 84 insertions(+), 48 deletions(-) (limited to 'src/tokenize.zig') diff --git a/src/tokenize.zig b/src/tokenize.zig index b5d5d23..6225814 100644 --- a/src/tokenize.zig +++ b/src/tokenize.zig @@ -5,62 +5,98 @@ const TokenError = error{UnknownToken}; const Token = union(enum) { ret: []const u8, intLit: i32, - semiCol: u8, - nil: void, + semiCol, + nil, }; pub const TokenIterator = struct { - tokens: []const Token, + tokens: std.ArrayList(Token), index: usize = 0, pub fn next(self: *TokenIterator) ?Token { - defer self.*.index = self.*.index + 1; - if (self.*.index >= self.*.tokens.len) return null; - return self.*.tokens[self.*.index]; + defer self.index = self.index + 1; + if (self.index >= self.tokens.items.len) return null; + return self.tokens.items[self.index]; } }; -pub fn tokenize(allocator: std.mem.Allocator, buff: []const u8) ![]const Token { - var toks = std.ArrayList(Token).init(allocator); - defer toks.deinit(); - var str = std.ArrayList(u8).init(allocator); - defer str.deinit(); - - var i: u32 = 0; - while (i < buff.len) { - switch (buff[i]) { - ' ', '\n', '\t' => { - i = i + 1; - continue; - }, - '0'...'9' => { - while (std.ascii.isDigit(buff[i])) { - try str.append(buff[i]); - i = i + 1; - } - const num: i32 = try std.fmt.parseInt(i32, str.items, 10); - try toks.append(.{ .intLit = num }); - str.deinit(); - str = std.ArrayList(u8).init(allocator); - }, - 'a'...'z', 'A'...'Z' => { - while (std.ascii.isAlphanumeric(buff[i])) { - try str.append(buff[i]); - i = i + 1; - } - try toks.append(.{ .ret = try str.toOwnedSlice() }); - str.deinit(); - str = std.ArrayList(u8).init(allocator); - }, - ';' => { - i = i + 1; - try toks.append(.{ .semiCol = ';' }); - }, - '+', '-', '*', '/' => { - // Process operator - }, - else => {}, +pub const StringIterator = struct { + string: []const u8, + index: usize = 0, + + pub fn init(string: []const u8) StringIterator { + return StringIterator{ .string = string }; + } + + pub fn peek(self: StringIterator) ?u8 { + if (self.index >= self.string.len) return null; + return self.string[self.index]; + } + + pub fn consume(self: *StringIterator) ?u8 { + defer self.index += 1; + return self.peek(); + } + + pub fn skip(self: *StringIterator) void { + self.index += 1; + } +}; + +pub const Tokenizer = struct { + src: StringIterator, + allocator: std.mem.Allocator, + toks: std.ArrayList(Token), + + pub fn init(allocator: std.mem.Allocator, src: []const u8) Tokenizer { + return Tokenizer{ + .src = StringIterator.init(src), + .allocator = allocator, + .toks = std.ArrayList(Token).init(allocator), + }; + } + + pub fn deinit(self: *Tokenizer) void { + self.toks.deinit(); + } + + pub fn tokenize(self: *Tokenizer) !std.ArrayList(Token) { + var str = std.ArrayList(u8).init(self.allocator); + defer str.deinit(); + + while (self.src.peek()) |char| { + switch (char) { + ' ', '\n', '\t' => { + self.src.skip(); + continue; + }, + '0'...'9' => { + while (std.ascii.isDigit(self.src.peek().?)) + try str.append(self.src.consume().?); + + const num: i32 = try std.fmt.parseInt(i32, str.items, 10); + try self.toks.append(.{ .intLit = num }); + str.deinit(); + str = std.ArrayList(u8).init(self.allocator); + }, + 'a'...'z', 'A'...'Z' => { + while (std.ascii.isAlphanumeric(self.src.peek().?)) + try str.append(self.src.consume().?); + + try self.toks.append(.{ .ret = try str.toOwnedSlice() }); + str.deinit(); + str = std.ArrayList(u8).init(self.allocator); + }, + ';' => { + self.src.skip(); + try self.toks.append(.semiCol); + }, + '+', '-', '*', '/' => { + // Process operator + }, + else => {}, + } } + return self.toks; } - return toks.toOwnedSlice(); -} +}; -- cgit v1.2.3