diff options
| author | Cody <cody@codyq.dev> | 2022-12-15 13:23:48 -0600 |
|---|---|---|
| committer | Cody <cody@codyq.dev> | 2022-12-15 13:23:48 -0600 |
| commit | bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0 (patch) | |
| tree | 874d175f352f1a4688e7e62d1f9222a192ae9bff /src/lexer.rs | |
| download | sloth-bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0.tar.gz | |
Initial language designs & lexer from crafting interpreters
The very initial language designs I came up with for Sloth. Likely
contains inconsistencies and definitely contains things that will be
changed in the future. This is basically just a dump of every idea I've
had for the language thus far.
As for the lexer right now it is heavily based on the one from the
Crafting Interpreters book and doesn't yet parse Sloth grammar.
Diffstat (limited to 'src/lexer.rs')
| -rw-r--r-- | src/lexer.rs | 285 |
1 files changed, 285 insertions, 0 deletions
#![allow(dead_code)]

/// Sentinel returned by the cursor helpers when reading past the end of the
/// input. NUL cannot appear in well-formed Sloth source, so it doubles as an
/// unambiguous end-of-file marker.
const EOF: char = '\u{0000}';

/// Every kind of token the Sloth lexer can produce.
#[derive(Debug, Eq, PartialEq)]
pub enum TokenType {
    // Short
    Plus,  // +
    Minus, // -
    Slash, // /
    Star,  // *
    Perc,  // %

    PlusEq,  // +=
    MinusEq, // -=
    SlashEq, // /=
    StarEq,  // *=
    PercEq,  // %=

    Eq,     // =
    EqEq,   // ==
    Bang,   // !
    BangEq, // !=

    Gt,   // >
    GtEq, // >=
    Lt,   // <
    LtEq, // <=

    Amp,      // &
    AmpAmp,   // &&
    Pipe,     // |
    PipePipe, // ||

    LeftParen,    // (
    RightParen,   // )
    LeftBracket,  // [
    RightBracket, // ]
    LeftBrace,    // {
    RightBrace,   // }

    Comma,     // ,
    Dot,       // .
    Colon,     // :
    SemiColon, // ;

    // Literals
    Literal(Literal),
    Identifier(String),

    // Keywords
    Let,
    Fn,

    If,
    Else,

    For,
    In,

    While,

    Loop,
    Break,
    Continue,

    Print, // TODO: Change to std library function
}

/// Literal values embedded directly in source text.
#[derive(Debug, Eq, PartialEq)]
pub enum Literal {
    String(String),
    Character(char),
    // TODO: there is no float variant yet. The lexer consumes a fractional
    // part (e.g. `3.14`) but keeps only the integer portion — see the digit
    // arm in `next()`.
    Number(i32),
    Bool(bool),
    Nil,
}

/// A single lexed token together with the source span it came from.
#[derive(Debug)]
pub struct Token {
    pub tt: TokenType,
    pub lexeme: String,

    start: usize,  // byte offset of the first byte of the lexeme
    length: usize, // length of the lexeme in bytes
    line: u32,     // 0-based line the lexeme started on
}

/// A streaming lexer over raw source bytes; yields one `Token` per call to
/// `Iterator::next`.
///
/// NOTE(review): the source is handled byte-by-byte and each byte is cast to
/// `char`, so non-ASCII UTF-8 input will be mangled. Acceptable while the
/// grammar is ASCII-only, but worth revisiting.
pub struct Lexer<'a> {
    source: &'a [u8],
    start: usize, // Start of the current lexeme
    pos: usize,   // One past the end of the current lexeme
    line: u32,    // Current (0-based) line number
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `source`.
    pub fn new(source: &'a str) -> Lexer<'a> {
        Self {
            source: source.as_bytes(),
            start: 0,
            pos: 0,
            line: 0,
        }
    }

    /// Returns the current character without consuming it (`EOF` past the end).
    fn peek(&self) -> char {
        self.source.get(self.pos).map(|&b| b as char).unwrap_or(EOF)
    }

    /// Returns the character `nth` positions ahead without consuming anything
    /// (`EOF` past the end).
    fn peek_nth(&self, nth: usize) -> char {
        self.source
            .get(self.pos + nth)
            .map(|&b| b as char)
            .unwrap_or(EOF)
    }

    /// Consumes and returns the current character (`EOF` past the end).
    fn advance(&mut self) -> char {
        self.pos += 1;
        self.source
            .get(self.pos - 1)
            .map(|&b| b as char)
            .unwrap_or(EOF)
    }

    /// Consumes the current character only if it equals `next`; reports
    /// whether it did. Used for two-character operators like `+=`.
    fn advance_if(&mut self, next: char) -> bool {
        if self.peek() != next {
            return false;
        }

        self.advance();
        true
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        // Skip whitespace and `#` line comments before the next lexeme.
        loop {
            match self.peek() {
                '#' => {
                    // Consume up to — but not including — the newline, so the
                    // `'\n'` arm below still counts it. Stopping at EOF fixes
                    // an infinite loop on a comment that ends the file.
                    while self.peek() != '\n' && self.peek() != EOF {
                        self.advance();
                    }
                }
                '\n' => {
                    self.line += 1;
                    self.advance();
                }
                ' ' | '\r' | '\t' => {
                    self.advance();
                }
                _ => break,
            }
        }

        // Resetting the lexeme
        self.start = self.pos;

        // Parse the next lexeme
        let character = self.advance();
        let tt = match character {
            // Arithmetic (compound-assignment forms must be tried first)
            '+' if self.advance_if('=') => TokenType::PlusEq,
            '-' if self.advance_if('=') => TokenType::MinusEq,
            '*' if self.advance_if('=') => TokenType::StarEq,
            '/' if self.advance_if('=') => TokenType::SlashEq,
            '%' if self.advance_if('=') => TokenType::PercEq,
            '+' => TokenType::Plus,
            '-' => TokenType::Minus,
            '*' => TokenType::Star,
            '/' => TokenType::Slash,
            '%' => TokenType::Perc,

            '0'..='9' => {
                let mut digits = String::new();
                digits.push(character);
                while self.peek().is_ascii_digit() {
                    digits.push(self.advance());
                }
                // Consume a fractional part only when the `.` is actually
                // followed by a digit, so `1.x` lexes as `1` `.` `x` instead
                // of panicking on `"1.".parse::<i32>()`.
                if self.peek() == '.' && self.peek_nth(1).is_ascii_digit() {
                    self.advance(); // the '.'
                    while self.peek().is_ascii_digit() {
                        self.advance();
                    }
                    // TODO: `Literal` has no float variant yet; the fraction
                    // is consumed (it stays in the lexeme) but only the
                    // integer part becomes the token value.
                }
                let value = digits
                    .parse::<i32>()
                    .expect("integer literal does not fit in i32");
                TokenType::Literal(Literal::Number(value))
            }

            // Logical & Bitwise (two-character forms must be tried first)
            '!' if self.advance_if('=') => TokenType::BangEq,
            '=' if self.advance_if('=') => TokenType::EqEq,
            '>' if self.advance_if('=') => TokenType::GtEq,
            '<' if self.advance_if('=') => TokenType::LtEq,
            '!' => TokenType::Bang,
            '=' => TokenType::Eq,
            '>' => TokenType::Gt,
            '<' => TokenType::Lt,

            '&' if self.advance_if('&') => TokenType::AmpAmp,
            '|' if self.advance_if('|') => TokenType::PipePipe,
            '&' => TokenType::Amp,
            '|' => TokenType::Pipe,

            // Scope
            '(' => TokenType::LeftParen,
            ')' => TokenType::RightParen,
            '[' => TokenType::LeftBracket,
            ']' => TokenType::RightBracket,
            '{' => TokenType::LeftBrace,
            '}' => TokenType::RightBrace,
            ',' => TokenType::Comma,
            '.' => TokenType::Dot,
            ':' => TokenType::Colon,
            ';' => TokenType::SemiColon,

            '"' => {
                let mut value = String::new();
                loop {
                    match self.peek() {
                        '"' => break,
                        // Previously this looped forever; fail loudly instead.
                        EOF => panic!("Unterminated string literal"),
                        _ => {}
                    }

                    let c = self.advance();

                    if c == '\n' {
                        // Keep the line counter accurate across multi-line
                        // strings.
                        self.line += 1;
                    }

                    if c == '\\' {
                        match self.advance() {
                            '\\' => value.push('\\'),
                            '"' => value.push('"'),
                            other => panic!("Unknown escape sequence: \\{}", other),
                        }
                        continue;
                    }

                    value.push(c);
                }

                self.advance(); // closing '"'
                TokenType::Literal(Literal::String(value))
            }

            // Keywords & Identifiers
            'a'..='z' | 'A'..='Z' | '_' => {
                let mut value = String::new();
                value.push(character);
                while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
                    value.push(self.advance())
                }

                match value.as_str() {
                    "let" => TokenType::Let,
                    "fn" => TokenType::Fn,
                    "if" => TokenType::If,
                    "else" => TokenType::Else,
                    "for" => TokenType::For,
                    "in" => TokenType::In,
                    "while" => TokenType::While,
                    "loop" => TokenType::Loop,
                    "break" => TokenType::Break,
                    "continue" => TokenType::Continue,
                    "print" => TokenType::Print,
                    _ => TokenType::Identifier(value),
                }
            }

            // Misc.
            EOF => return None,
            _ => panic!("Failed to parse"),
        };

        // Rebuild the lexeme text from the consumed byte span (same
        // byte-as-char mapping the cursor helpers use).
        let lexeme: String = self.source[self.start..self.pos]
            .iter()
            .map(|&b| b as char)
            .collect();

        let token = Token {
            tt,
            lexeme,
            start: self.start,
            length: self.pos - self.start,
            line: self.line,
        };

        Some(token)
    }
}
