summaryrefslogtreecommitdiff
path: root/src/tokenizer.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer.zig')
-rw-r--r--src/tokenizer.zig290
1 files changed, 290 insertions, 0 deletions
diff --git a/src/tokenizer.zig b/src/tokenizer.zig
new file mode 100644
index 0000000..dc700c8
--- /dev/null
+++ b/src/tokenizer.zig
@@ -0,0 +1,290 @@
+const std = @import("std");
+
+/// Tag set for the `Token` union; each tag names one kind of lexeme.
+/// The trailing comment on each tag shows the source text it stands for.
+pub const Token_enum = enum {
+ RARROW, // ->
+ LARROW, // <-
+ BACKTICK, // `
+ PERIOD, // .
+ COMMA, // ,
+ QMARK, // ?
+ LPAREN, // (
+ RPAREN, // )
+ STRING, // "..." (text between double quotes)
+ BUILTIN, // !word
+ FUNC, // bare word: any run of characters not claimed by another tag
+ TYP, // word beginning with a capital letter
+ INT, // decimal number
+};
+
+/// A single lexeme. Punctuation tags carry no payload; STRING, BUILTIN, FUNC
+/// and TYP carry their text, and INT carries its parsed value.
+pub const Token = union(Token_enum) {
+    RARROW, // ->
+    LARROW, // <-
+    BACKTICK, // `
+    PERIOD, // .
+    COMMA, // ,
+    QMARK, // ?
+    LPAREN, // (
+    RPAREN, // )
+    STRING: []const u8, // "..."
+    BUILTIN: []const u8, // !word
+    FUNC: []const u8, // bare word
+    TYP: []const u8, // word beginning with a capital letter
+    INT: i64, // decimal number
+
+    /// Returns a textual form of the token.
+    ///
+    /// Ownership is mixed, so callers must not free the result blindly:
+    /// * INT: a new string allocated from `alloc`.
+    /// * STRING / BUILTIN / FUNC / TYP: the payload slice itself (borrowed).
+    /// * punctuation: a static string literal.
+    ///
+    /// Fails only if the INT formatting allocation fails.
+    pub fn print(self: Token, alloc: std.mem.Allocator) ![]const u8 {
+        switch (self) {
+            // The only case that allocates.
+            .INT => |value| return try std.fmt.allocPrint(alloc, "{d}", .{value}),
+            // Text-carrying tokens hand back their payload unchanged.
+            .STRING, .BUILTIN, .FUNC, .TYP => |text| return text,
+            .RARROW => return "->",
+            .LARROW => return "<-",
+            .BACKTICK => return "`",
+            .PERIOD => return ".",
+            .COMMA => return ",",
+            .QMARK => return "?",
+            .LPAREN => return "(",
+            .RPAREN => return ")",
+        }
+    }
+};
+
+/// Creates a forward-only cursor type over a borrowed slice of `typ`.
+///
+/// The cursor never copies or owns `items`. The `consume*` helpers that return
+/// a slice allocate it with the allocator passed in; the caller owns the
+/// result and frees it with the same allocator.
+pub fn Iterator(comptime typ: type) type {
+    return struct {
+        /// Borrowed input being walked.
+        items: []const typ,
+        /// Position of the current (not yet consumed) item.
+        index: usize = 0,
+
+        const SelfType = @This();
+        const Error = error{
+            OutOfBounds,
+            ExpectedItem,
+            EndOfItems,
+        };
+
+        /// Creates a cursor positioned at the first item of `items`.
+        pub fn init(items: []const typ) SelfType {
+            return .{ .items = items };
+        }
+
+        /// Returns the item `ahead` positions past the current one without
+        /// moving the cursor, or null when that position is past the end.
+        pub fn peekAhead(self: *SelfType, ahead: u32) ?typ {
+            const pos = self.index + ahead;
+            if (pos >= self.items.len) return null;
+            return self.items[pos];
+        }
+
+        /// Returns the current item without consuming it, or null at the end.
+        pub fn peek(self: *SelfType) ?typ {
+            return self.peekAhead(0);
+        }
+
+        /// Returns the current item (null at the end) and advances the cursor.
+        pub fn next(self: *SelfType) ?typ {
+            const ret = self.peek();
+            self.skip();
+            return ret;
+        }
+
+        /// Consumes and returns the current item if it equals `expected`.
+        /// Returns `Error.EndOfItems` when the input is exhausted and
+        /// `Error.ExpectedItem` on a mismatch; the cursor does not move on error.
+        pub fn consume(self: *SelfType, expected: typ) !?typ {
+            const current = self.peek() orelse return Error.EndOfItems;
+            if (!std.meta.eql(current, expected)) return Error.ExpectedItem;
+            return self.next();
+        }
+
+        /// Collects items up to (not including) the first item found in
+        /// `delims`, or to the end of input. The delimiter is left unconsumed.
+        /// Caller owns the returned slice.
+        pub fn consumeuntil(self: *SelfType, alloc: std.mem.Allocator, delims: []const typ) !?[]typ {
+            var arr = try std.ArrayList(typ).initCapacity(alloc, 128);
+            errdefer arr.deinit(alloc);
+            while (self.peek()) |item| {
+                if (contains(delims, item)) break;
+                self.skip();
+                try arr.append(alloc, item);
+            }
+            return try arr.toOwnedSlice(alloc);
+        }
+
+        /// Like `consumeuntil`, but a delimiter directly preceded by an
+        /// unescaped `escape` item does not stop the scan. Escape items are
+        /// kept in the output. An escape that is itself escaped does not
+        /// protect the item after it. Caller owns the returned slice.
+        pub fn consumeuntilescape(self: *SelfType, alloc: std.mem.Allocator, delims: []const typ, escape: typ) !?[]typ {
+            var arr = try std.ArrayList(typ).initCapacity(alloc, 128);
+            errdefer arr.deinit(alloc);
+            // True when the previously consumed item was an active escape.
+            var escaped = false;
+            while (self.peek()) |item| {
+                if (!escaped and contains(delims, item)) break;
+                self.skip();
+                try arr.append(alloc, item);
+                escaped = !escaped and std.meta.eql(item, escape);
+            }
+            return try arr.toOwnedSlice(alloc);
+        }
+
+        /// Collects items for as long as the current item is one of `allowed`.
+        /// The first disallowed item is left unconsumed. Caller owns the
+        /// returned slice.
+        pub fn consumewhile(self: *SelfType, alloc: std.mem.Allocator, allowed: []const typ) !?[]typ {
+            var arr = try std.ArrayList(typ).initCapacity(alloc, 128);
+            errdefer arr.deinit(alloc);
+            while (self.peek()) |item| {
+                if (!contains(allowed, item)) break;
+                self.skip();
+                try arr.append(alloc, item);
+            }
+            return try arr.toOwnedSlice(alloc);
+        }
+
+        /// Consumes the current item if it equals `expected`; otherwise (or at
+        /// the end of input) returns null and leaves the cursor where it was.
+        pub fn maybe(self: *SelfType, expected: typ) ?typ {
+            return self.consume(expected) catch null;
+        }
+
+        /// Skip over current item
+        pub fn skip(self: *SelfType) void {
+            self.index += 1;
+        }
+
+        /// Reports whether `item` equals any element of `set`.
+        fn contains(set: []const typ, item: typ) bool {
+            for (set) |candidate| {
+                if (std.meta.eql(item, candidate)) return true;
+            }
+            return false;
+        }
+    };
+}
+
+/// Splits `input` into a newly allocated slice of `Token`s.
+///
+/// Ownership: the returned slice and the payloads of STRING, BUILTIN, FUNC and
+/// TYP tokens are allocated with `allocator`; the caller frees them. If an
+/// error is returned, everything allocated so far has already been released.
+///
+/// Errors: allocation failure, any `std.fmt.parseInt` error for a malformed or
+/// out-of-range integer, and `error.UnterminatedString` when a `"` is never
+/// closed.
+pub fn tokenize(allocator: std.mem.Allocator, input: []const u8) ![]Token {
+    var toks = std.ArrayList(Token){};
+    // On failure, release the token list together with every owned payload.
+    errdefer {
+        for (toks.items) |tok| {
+            switch (tok) {
+                .STRING, .BUILTIN, .FUNC, .TYP => |text| allocator.free(text),
+                else => {},
+            }
+        }
+        toks.deinit(allocator);
+    }
+    // Scratch buffer holding the characters of the token being built.
+    var buff = std.ArrayList(u8){};
+    defer buff.deinit(allocator);
+    var src = Iterator(u8).init(input);
+
+    const internals = struct {
+        /// Emits the pending word in `buf` (if any) as a FUNC token.
+        /// `toOwnedSlice` already leaves `buf` empty, so no extra clear is needed.
+        fn clearbuff(alloc: std.mem.Allocator, tok: *std.ArrayList(Token), buf: *std.ArrayList(u8)) !void {
+            if (buf.items.len == 0) return;
+            const str = try buf.toOwnedSlice(alloc);
+            errdefer alloc.free(str);
+            try tok.append(alloc, .{ .FUNC = str });
+        }
+
+        /// Reports whether the current item exists and equals `expected`.
+        fn nextIs(it: *Iterator(u8), expected: u8) bool {
+            const current = it.peek() orelse return false;
+            return current == expected;
+        }
+
+        /// Moves items from `it` into `buf` while `keep` accepts them;
+        /// stops cleanly at the end of input.
+        fn takeWhile(alloc: std.mem.Allocator, it: *Iterator(u8), buf: *std.ArrayList(u8), comptime keep: fn (u8) bool) !void {
+            while (it.peek()) |c| {
+                if (!keep(c)) break;
+                try buf.append(alloc, c);
+                it.skip();
+            }
+        }
+    };
+
+    while (src.peek()) |char| {
+        switch (char) {
+            // Single-character punctuation: ends any pending word, then emits itself.
+            '`', ',', '.', '(', ')' => {
+                src.skip();
+                try internals.clearbuff(allocator, &toks, &buff);
+                const punct: Token = switch (char) {
+                    '`' => .BACKTICK,
+                    ',' => .COMMA,
+                    '.' => .PERIOD,
+                    '(' => .LPAREN,
+                    ')' => .RPAREN,
+                    else => unreachable,
+                };
+                try toks.append(allocator, punct);
+            },
+            '-' => {
+                src.skip();
+                if (internals.nextIs(&src, '>')) {
+                    src.skip();
+                    try internals.clearbuff(allocator, &toks, &buff);
+                    try toks.append(allocator, .RARROW);
+                } else {
+                    // A lone '-' is kept as word text. When digits follow, the
+                    // INT branch parses it as the sign of the number.
+                    try buff.append(allocator, '-');
+                }
+            },
+            '<' => {
+                src.skip();
+                if (internals.nextIs(&src, '-')) {
+                    src.skip();
+                    try internals.clearbuff(allocator, &toks, &buff);
+                    try toks.append(allocator, .LARROW);
+                } else {
+                    try buff.append(allocator, '<');
+                }
+            },
+            // NOTE(review): INT, TYP, BUILTIN and STRING do not flush a pending
+            // word first, so text already in `buff` becomes part of their
+            // payload (for INT that makes parseInt fail unless it is a sign).
+            // Kept as-is because "-5" relies on it; confirm for the others.
+            '0'...'9' => {
+                try internals.takeWhile(allocator, &src, &buff, std.ascii.isDigit);
+                // Parse at the full width of Token.INT.
+                const num = try std.fmt.parseInt(i64, buff.items, 10);
+                try toks.append(allocator, .{ .INT = num });
+                buff.clearRetainingCapacity();
+            },
+            'A'...'Z' => {
+                try internals.takeWhile(allocator, &src, &buff, std.ascii.isAlphabetic);
+                const str = try buff.toOwnedSlice(allocator);
+                errdefer allocator.free(str);
+                try toks.append(allocator, .{ .TYP = str });
+            },
+            '!' => {
+                src.skip();
+                try internals.takeWhile(allocator, &src, &buff, std.ascii.isAlphanumeric);
+                const str = try buff.toOwnedSlice(allocator);
+                errdefer allocator.free(str);
+                try toks.append(allocator, .{ .BUILTIN = str });
+            },
+            '"' => {
+                src.skip(); // opening quote
+                while (true) {
+                    const c = src.next() orelse return error.UnterminatedString;
+                    if (c == '"') break; // closing quote, already consumed
+                    try buff.append(allocator, c);
+                }
+                const str = try buff.toOwnedSlice(allocator);
+                errdefer allocator.free(str);
+                try toks.append(allocator, .{ .STRING = str });
+            },
+            // Whitespace only separates words ('\r' included for CRLF input).
+            ' ', '\t', '\n', '\r' => {
+                src.skip();
+                try internals.clearbuff(allocator, &toks, &buff);
+            },
+            // NOTE(review): '?' also lands here and becomes word text, so
+            // QMARK is never produced; confirm whether that is intended.
+            else => {
+                try buff.append(allocator, char);
+                src.skip();
+            },
+        }
+    }
+    // Emit a word that runs up to the end of the input.
+    try internals.clearbuff(allocator, &toks, &buff);
+    return toks.toOwnedSlice(allocator);
+}
+
+// pub fn tokenize(allocator: std.mem.Allocator, input: []const u8) ![]Token {
+// var arr = try std.ArrayList(Token).initCapacity(allocator, 1024);
+// defer arr.deinit(allocator);
+// var iterator = Iterator(u8).init(input);
+// return parse: switch (iterator.next().?) {
+// '-' => {
+// if (iterator.maybe('>')) |_| try arr.append(allocator, .RARROW);
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '<' => {
+// if (iterator.maybe('-')) |_| try arr.append(allocator, .LARROW);
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '.' => {
+// try arr.append(allocator, .PERIOD);
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// ',' => {
+// try arr.append(allocator, .COMMA);
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '`' => {
+// try arr.append(allocator, .QMARK);
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '?' => {
+// try arr.append(allocator, .QMARK);
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '!' => {
+// const name = try iterator.consumeuntil(allocator, &std.ascii.whitespace);
+// try arr.append(allocator, Token{.BUILTIN = name.?});
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '"' => {
+// const name = try iterator.consumeuntilescape(allocator, "\"", '\\');
+// try arr.append(allocator, Token{.STRING = name.?});
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// 'A'...'Z' => {
+// const name = try iterator.consumeuntil(allocator, &std.ascii.whitespace);
+// try arr.append(allocator, Token{.TYP = name.?});
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// '0'...'9' => {
+// const name = try iterator.consumewhile(allocator, "0123456789");
+// try arr.append(allocator, Token{.FUNC = name.?});
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// else => {
+// const name = try iterator.consumeuntil(allocator, @constCast(&std.ascii.whitespace));
+// try arr.append(allocator, Token{.FUNC = name.?});
+// if (iterator.peek()) |pk| continue :parse pk else break :parse arr.items; },
+// };
+
+// }