summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNic Gaffney <gaffney_nic@protonmail.com>2024-06-28 00:54:59 -0500
committerNic Gaffney <gaffney_nic@protonmail.com>2024-06-28 00:54:59 -0500
commitc955f2fb933865a50e791e2be91adbcb34bbadc9 (patch)
treeefd6def99007c8936721931d02e42c77357f3fd7
parent36e990c5bdfffb145b7255b8159d3ac879344996 (diff)
downloadcalico-c955f2fb933865a50e791e2be91adbcb34bbadc9.tar.gz
feat(tokenizer): Move tokenizer into a struct
Moved the tokenizer into a struct and created a string iterator to make tokenizing easier
-rw-r--r--.gitignore4
-rw-r--r--build.zig4
-rw-r--r--build.zig.zon64
-rw-r--r--src/main.zig45
-rw-r--r--src/tokenize.zig132
5 files changed, 125 insertions, 124 deletions
diff --git a/.gitignore b/.gitignore
index 9ae02d6..093eb95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
-out/
+calico-out/
zig-out/
-zig-cache/
+.zig-cache/
diff --git a/build.zig b/build.zig
index f04131f..052d68b 100644
--- a/build.zig
+++ b/build.zig
@@ -6,7 +6,7 @@ pub fn build(b: *std.Build) void {
const optimize = b.standardOptimizeOption(.{});
const exe = b.addExecutable(.{
- .name = "compiler",
+ .name = "calico",
.root_source_file = b.path("src/main.zig"),
.target = target,
.optimize = optimize,
@@ -22,7 +22,7 @@ pub fn build(b: *std.Build) void {
run_cmd.addArgs(args);
}
- const run_step = b.step("run", "Run the app");
+ const run_step = b.step("run", "Run the compiler");
run_step.dependOn(&run_cmd.step);
const exe_unit_tests = b.addTest(.{
diff --git a/build.zig.zon b/build.zig.zon
index 39f7607..cf62921 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -1,67 +1,9 @@
.{
- .name = "compiler",
- // This is a [Semantic Version](https://semver.org/).
- // In a future version of Zig it will be used for package deduplication.
- .version = "0.0.0",
+ .name = "calico",
+ .version = "0.0.1",
- // This field is optional.
- // This is currently advisory only; Zig does not yet do anything
- // with this value.
- //.minimum_zig_version = "0.11.0",
-
- // This field is optional.
- // Each dependency must either provide a `url` and `hash`, or a `path`.
- // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
- // Once all dependencies are fetched, `zig build` no longer requires
- // internet connectivity.
- .dependencies = .{
- // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
- //.example = .{
- // // When updating this field to a new URL, be sure to delete the corresponding
- // // `hash`, otherwise you are communicating that you expect to find the old hash at
- // // the new URL.
- // .url = "https://example.com/foo.tar.gz",
- //
- // // This is computed from the file contents of the directory of files that is
- // // obtained after fetching `url` and applying the inclusion rules given by
- // // `paths`.
- // //
- // // This field is the source of truth; packages do not come from a `url`; they
- // // come from a `hash`. `url` is just one of many possible mirrors for how to
- // // obtain a package matching this `hash`.
- // //
- // // Uses the [multihash](https://multiformats.io/multihash/) format.
- // .hash = "...",
- //
- // // When this is provided, the package is found in a directory relative to the
- // // build root. In this case the package's hash is irrelevant and therefore not
- // // computed. This field and `url` are mutually exclusive.
- // .path = "foo",
-
- // // When this is set to `true`, a package is declared to be lazily
- // // fetched. This makes the dependency only get fetched if it is
- // // actually used.
- // .lazy = false,
- //},
- },
-
- // Specifies the set of files and directories that are included in this package.
- // Only files and directories listed here are included in the `hash` that
- // is computed for this package.
- // Paths are relative to the build root. Use the empty string (`""`) to refer to
- // the build root itself.
- // A directory listed here means that all files within, recursively, are included.
+ .dependencies = .{},
.paths = .{
- // This makes *all* files, recursively, included in this package. It is generally
- // better to explicitly list the files and directories instead, to insure that
- // fetching from tarballs, file system paths, and version control all result
- // in the same contents hash.
"",
- // For example...
- //"build.zig",
- //"build.zig.zon",
- //"src",
- //"LICENSE",
- //"README.md",
},
}
diff --git a/src/main.zig b/src/main.zig
index 18239b1..99b8130 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -4,19 +4,27 @@ const tok = @import("tokenize.zig");
const gftCompilerError = error{NoInputFile};
pub fn main() !void {
- if (std.os.argv.len != 2) return gftCompilerError.NoInputFile;
+ if (std.os.argv.len < 2) return gftCompilerError.NoInputFile;
+
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
+
var args = std.process.args();
_ = args.skip();
const inputFileName = args.next();
+
+ var out_name: []const u8 = "out";
+ if (std.os.argv.len == 3) out_name = args.next().?;
+
const inputFile = try std.fs.cwd().openFile(inputFileName.?, .{});
defer inputFile.close();
- std.fs.cwd().makeDir("out") catch |err| {
+ std.fs.cwd().makeDir("calico-out") catch |err|
if (err != error.PathAlreadyExists) return err;
- };
- const outfile = try std.fs.cwd().createFile("out/out.asm", .{});
+
+ const outFileName = try getFileName(gpa.allocator(), out_name, "asm");
+ defer gpa.allocator().free(outFileName);
+ const outfile = try std.fs.cwd().createFile(outFileName, .{});
const outWriter = outfile.writer();
defer outfile.close();
@@ -24,9 +32,10 @@ pub fn main() !void {
const all = try inputFile.readToEndAlloc(gpa.allocator(), 2048);
defer gpa.allocator().free(all);
- const toks = try tok.tokenize(gpa.allocator(), all);
- defer gpa.allocator().free(toks);
- var tokIter = tok.TokenIterator{ .tokens = toks };
+ var tokenizer = tok.Tokenizer.init(gpa.allocator(), all);
+ defer tokenizer.deinit();
+ var tokIter = tok.TokenIterator{ .tokens = try tokenizer.tokenize() };
+
try outWriter.print("global _start:\n", .{});
while (tokIter.next()) |t| {
switch (t) {
@@ -52,13 +61,27 @@ pub fn main() !void {
}
}
- const nasmargv = [_][]const u8{ "nasm", "-felf64", "out/out.asm" };
- const nasmproc = try std.ChildProcess.run(.{ .argv = &nasmargv, .allocator = gpa.allocator() });
+ // Run nasm and ld to build the executable
+    // TODO: switch to qbe or llvm (preferably qbe)
+ const nasmFile = try getFileName(gpa.allocator(), out_name, "asm");
+ defer gpa.allocator().free(nasmFile);
+ const nasmargv = [_][]const u8{ "nasm", "-felf64", nasmFile };
+ const nasmproc = try std.process.Child.run(.{ .argv = &nasmargv, .allocator = gpa.allocator() });
defer gpa.allocator().free(nasmproc.stdout);
defer gpa.allocator().free(nasmproc.stderr);
- const ldargv = [_][]const u8{ "ld", "-o", "out/out", "out/out.o" };
- const ldproc = try std.ChildProcess.run(.{ .argv = &ldargv, .allocator = gpa.allocator() });
+ const ldFile = try getFileName(gpa.allocator(), out_name, "o");
+ defer gpa.allocator().free(ldFile);
+ const binFile = try getFileName(gpa.allocator(), out_name, "");
+ defer gpa.allocator().free(binFile);
+ const ldargv = [_][]const u8{ "ld", "-o", binFile, ldFile };
+ const ldproc = try std.process.Child.run(.{ .argv = &ldargv, .allocator = gpa.allocator() });
defer gpa.allocator().free(ldproc.stdout);
defer gpa.allocator().free(ldproc.stderr);
}
+
+inline fn getFileName(allocator: std.mem.Allocator, out_name: []const u8, fileType: []const u8) ![]const u8 {
+ var hasDot: []const u8 = ".";
+ if (fileType.len == 0) hasDot = "";
+ return try std.fmt.allocPrint(allocator, "calico-out/{s}{s}{s}", .{ out_name, hasDot, fileType });
+}
diff --git a/src/tokenize.zig b/src/tokenize.zig
index b5d5d23..6225814 100644
--- a/src/tokenize.zig
+++ b/src/tokenize.zig
@@ -5,62 +5,98 @@ const TokenError = error{UnknownToken};
const Token = union(enum) {
ret: []const u8,
intLit: i32,
- semiCol: u8,
- nil: void,
+ semiCol,
+ nil,
};
pub const TokenIterator = struct {
- tokens: []const Token,
+ tokens: std.ArrayList(Token),
index: usize = 0,
pub fn next(self: *TokenIterator) ?Token {
- defer self.*.index = self.*.index + 1;
- if (self.*.index >= self.*.tokens.len) return null;
- return self.*.tokens[self.*.index];
+ defer self.index = self.index + 1;
+ if (self.index >= self.tokens.items.len) return null;
+ return self.tokens.items[self.index];
}
};
-pub fn tokenize(allocator: std.mem.Allocator, buff: []const u8) ![]const Token {
- var toks = std.ArrayList(Token).init(allocator);
- defer toks.deinit();
- var str = std.ArrayList(u8).init(allocator);
- defer str.deinit();
-
- var i: u32 = 0;
- while (i < buff.len) {
- switch (buff[i]) {
- ' ', '\n', '\t' => {
- i = i + 1;
- continue;
- },
- '0'...'9' => {
- while (std.ascii.isDigit(buff[i])) {
- try str.append(buff[i]);
- i = i + 1;
- }
- const num: i32 = try std.fmt.parseInt(i32, str.items, 10);
- try toks.append(.{ .intLit = num });
- str.deinit();
- str = std.ArrayList(u8).init(allocator);
- },
- 'a'...'z', 'A'...'Z' => {
- while (std.ascii.isAlphanumeric(buff[i])) {
- try str.append(buff[i]);
- i = i + 1;
- }
- try toks.append(.{ .ret = try str.toOwnedSlice() });
- str.deinit();
- str = std.ArrayList(u8).init(allocator);
- },
- ';' => {
- i = i + 1;
- try toks.append(.{ .semiCol = ';' });
- },
- '+', '-', '*', '/' => {
- // Process operator
- },
- else => {},
+pub const StringIterator = struct {
+ string: []const u8,
+ index: usize = 0,
+
+ pub fn init(string: []const u8) StringIterator {
+ return StringIterator{ .string = string };
+ }
+
+ pub fn peek(self: StringIterator) ?u8 {
+ if (self.index >= self.string.len) return null;
+ return self.string[self.index];
+ }
+
+ pub fn consume(self: *StringIterator) ?u8 {
+ defer self.index += 1;
+ return self.peek();
+ }
+
+ pub fn skip(self: *StringIterator) void {
+ self.index += 1;
+ }
+};
+
+pub const Tokenizer = struct {
+ src: StringIterator,
+ allocator: std.mem.Allocator,
+ toks: std.ArrayList(Token),
+
+ pub fn init(allocator: std.mem.Allocator, src: []const u8) Tokenizer {
+ return Tokenizer{
+ .src = StringIterator.init(src),
+ .allocator = allocator,
+ .toks = std.ArrayList(Token).init(allocator),
+ };
+ }
+
+ pub fn deinit(self: *Tokenizer) void {
+ self.toks.deinit();
+ }
+
+ pub fn tokenize(self: *Tokenizer) !std.ArrayList(Token) {
+ var str = std.ArrayList(u8).init(self.allocator);
+ defer str.deinit();
+
+ while (self.src.peek()) |char| {
+ switch (char) {
+ ' ', '\n', '\t' => {
+ self.src.skip();
+ continue;
+ },
+ '0'...'9' => {
+ while (std.ascii.isDigit(self.src.peek().?))
+ try str.append(self.src.consume().?);
+
+ const num: i32 = try std.fmt.parseInt(i32, str.items, 10);
+ try self.toks.append(.{ .intLit = num });
+ str.deinit();
+ str = std.ArrayList(u8).init(self.allocator);
+ },
+ 'a'...'z', 'A'...'Z' => {
+ while (std.ascii.isAlphanumeric(self.src.peek().?))
+ try str.append(self.src.consume().?);
+
+ try self.toks.append(.{ .ret = try str.toOwnedSlice() });
+ str.deinit();
+ str = std.ArrayList(u8).init(self.allocator);
+ },
+ ';' => {
+ self.src.skip();
+ try self.toks.append(.semiCol);
+ },
+ '+', '-', '*', '/' => {
+ // Process operator
+ },
+ else => {},
+ }
}
+ return self.toks;
}
- return toks.toOwnedSlice();
-}
+};