// Commit bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0
// Author:  Cody
// Date:    Thu, 15 Dec 2022 13:23:48 -0600
// Subject: Initial language designs & lexer from crafting interpreters
//
// The very initial language designs I came up with for Sloth. Likely contains
// inconsistencies and definitely contains things that will be changed in the
// future. This is basically just a dump of every idea I've had for the
// language thus far.
//
// As for the lexer right now it is heavily based on the one from the Crafting
// Interpreters book and doesn't yet parse Sloth grammar.
//
// The commit adds two files, `src/lexer.rs` and `src/main.rs`. They are kept
// together here: the inline `lexer` module holds the former, the items after
// it hold the latter.

#![warn(
    clippy::wildcard_imports,
    clippy::string_add,
    clippy::string_add_assign,
    clippy::manual_ok_or,
    unused_lifetimes
)]

/// Hand-written lexer that turns source text into a stream of [`lexer::Token`]s.
pub mod lexer {
    // Several token kinds, the `Literal` variants that are not lexed yet, the
    // `peek_nth` helper and the position fields of `Token` have no consumer
    // so far.
    #![allow(dead_code)]

    /// Every kind of token the lexer can produce.
    #[derive(Debug, Eq, PartialEq)]
    pub enum TokenType {
        // Short
        Plus,  // +
        Minus, // -
        Slash, // /
        Star,  // *
        Perc,  // %

        PlusEq,  // +=
        MinusEq, // -=
        SlashEq, // /=
        StarEq,  // *=
        PercEq,  // %=

        Eq,     // =
        EqEq,   // ==
        Bang,   // !
        BangEq, // !=

        Gt,   // >
        GtEq, // >=
        Lt,   // <
        LtEq, // <=

        Amp,      // &
        AmpAmp,   // &&
        Pipe,     // |
        PipePipe, // ||

        LeftParen,    // (
        RightParen,   // )
        LeftBracket,  // [
        RightBracket, // ]
        LeftBrace,    // {
        RightBrace,   // }

        Comma,     // ,
        Dot,       // .
        Colon,     // :
        SemiColon, // ;

        // Literals
        Literal(Literal),
        Identifier(String),

        // Keywords
        Let,
        Fn,

        If,
        Else,

        For,
        In,

        While,

        Loop,
        Break,
        Continue,

        Print, // TODO: Change to std library function
    }

    /// Value carried by a [`TokenType::Literal`] token.
    ///
    /// Only `String` and `Number` are produced by the lexer at the moment.
    #[derive(Debug, Eq, PartialEq)]
    pub enum Literal {
        String(String),
        Character(char),
        Number(i32),
        Bool(bool),
        Nil,
    }

    /// A single lexed token together with the source text it was read from.
    #[derive(Debug)]
    pub struct Token {
        /// Kind of the token (and its payload for literals / identifiers).
        pub tt: TokenType,
        /// Exact source text of the token; string literals include their quotes.
        pub lexeme: String,

        /// Byte offset of the first byte of the lexeme in the source.
        start: usize,
        /// Length of the lexeme in bytes.
        length: usize,
        /// Zero-based line on which the lexeme starts.
        line: u32,
    }

    /// Iterator-style lexer over a borrowed source string.
    ///
    /// # Panics
    ///
    /// Iterating panics on input that cannot be lexed: an unexpected
    /// character, an unterminated string literal, an unknown escape sequence
    /// or an integer literal that does not fit in an `i32`.
    pub struct Lexer<'a> {
        source: &'a str,
        start: usize, // Start of the lexeme (byte offset)
        pos: usize,   // End of the lexeme (byte offset, always a char boundary)
        line: u32,    // Zero-based line of `pos`
    }

    impl<'a> Lexer<'a> {
        /// Creates a lexer positioned at the beginning of `source`.
        pub fn new(source: &'a str) -> Lexer<'a> {
            Self {
                source,
                start: 0,
                pos: 0,
                line: 0,
            }
        }

        /// Returns the next character without consuming it, or `None` at the
        /// end of the input.
        fn peek(&self) -> Option<char> {
            self.source[self.pos..].chars().next()
        }

        /// Returns the character `nth` characters ahead of the current
        /// position without consuming anything.
        fn peek_nth(&self, nth: usize) -> Option<char> {
            self.source[self.pos..].chars().nth(nth)
        }

        /// Consumes and returns the next character, or `None` at the end of
        /// the input.
        ///
        /// Line counting lives here so that newlines inside comments and
        /// string literals are counted as well.
        fn advance(&mut self) -> Option<char> {
            let c = self.peek()?;
            // Step by the encoded width so `pos` stays on a char boundary.
            self.pos += c.len_utf8();
            if c == '\n' {
                self.line += 1;
            }
            Some(c)
        }

        /// Consumes the next character only if it equals `next`; reports
        /// whether it did.
        fn advance_if(&mut self, next: char) -> bool {
            if self.peek() != Some(next) {
                return false;
            }

            self.advance();
            true
        }

        /// Skips whitespace and `#` line comments.
        fn skip_trivia(&mut self) {
            while let Some(c) = self.peek() {
                match c {
                    '#' => {
                        // A comment runs up to, but not including, the next
                        // newline. It may also be ended by the end of the
                        // input, which must terminate the loop.
                        while !matches!(self.peek(), None | Some('\n')) {
                            self.advance();
                        }
                    }
                    ' ' | '\r' | '\t' | '\n' => {
                        self.advance();
                    }
                    _ => break,
                }
            }
        }

        /// Lexes the remainder of an integer literal whose first digit has
        /// already been consumed.
        ///
        /// `Literal::Number` holds an `i32`, so only digits are consumed; a
        /// following `.` is left in place and becomes a `Dot` token.
        fn number(&mut self) -> TokenType {
            while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
                self.advance();
            }

            let source = self.source;
            let digits = &source[self.start..self.pos];
            let value = digits.parse::<i32>().unwrap_or_else(|err| {
                panic!(
                    "Invalid integer literal `{}` on line {}: {}",
                    digits, self.line, err
                )
            });
            TokenType::Literal(Literal::Number(value))
        }

        /// Lexes the remainder of a string literal whose opening quote has
        /// already been consumed. Supports the escapes `\\` and `\"`.
        fn string(&mut self) -> TokenType {
            let mut value = String::new();
            loop {
                match self.advance() {
                    Some('"') => break,
                    Some('\\') => match self.advance() {
                        Some('\\') => value.push('\\'),
                        Some('"') => value.push('"'),
                        Some(other) => panic!(
                            "Invalid escape sequence `\\{}` on line {}",
                            other, self.line
                        ),
                        None => panic!("Unterminated string literal"),
                    },
                    Some(c) => value.push(c),
                    // Without this arm the lexer would spin forever on input
                    // such as `"abc`.
                    None => panic!("Unterminated string literal"),
                }
            }
            TokenType::Literal(Literal::String(value))
        }

        /// Lexes the remainder of a keyword or identifier whose first
        /// character has already been consumed.
        fn identifier(&mut self) -> TokenType {
            while matches!(self.peek(), Some(c) if c.is_ascii_alphanumeric() || c == '_') {
                self.advance();
            }

            let source = self.source;
            match &source[self.start..self.pos] {
                "let" => TokenType::Let,
                "fn" => TokenType::Fn,
                "if" => TokenType::If,
                "else" => TokenType::Else,
                "for" => TokenType::For,
                "in" => TokenType::In,
                "while" => TokenType::While,
                "loop" => TokenType::Loop,
                "break" => TokenType::Break,
                "continue" => TokenType::Continue,
                "print" => TokenType::Print,
                other => TokenType::Identifier(other.to_owned()),
            }
        }
    }

    impl<'a> Iterator for Lexer<'a> {
        type Item = Token;

        /// Returns the next token, or `None` once the input is exhausted.
        fn next(&mut self) -> Option<Self::Item> {
            // Ignore all whitespace & comments
            self.skip_trivia();

            // Resetting the lexeme
            self.start = self.pos;
            let line = self.line;

            // Parse the next lexeme
            let character = self.advance()?;
            let tt = match character {
                // Arithmetic
                '+' if self.advance_if('=') => TokenType::PlusEq,
                '-' if self.advance_if('=') => TokenType::MinusEq,
                '*' if self.advance_if('=') => TokenType::StarEq,
                '/' if self.advance_if('=') => TokenType::SlashEq,
                '%' if self.advance_if('=') => TokenType::PercEq,
                '+' => TokenType::Plus,
                '-' => TokenType::Minus,
                '*' => TokenType::Star,
                '/' => TokenType::Slash,
                '%' => TokenType::Perc,

                '0'..='9' => self.number(),

                // Logical & Bitwise
                '!' if self.advance_if('=') => TokenType::BangEq,
                '=' if self.advance_if('=') => TokenType::EqEq,
                '>' if self.advance_if('=') => TokenType::GtEq,
                '<' if self.advance_if('=') => TokenType::LtEq,
                '!' => TokenType::Bang,
                '=' => TokenType::Eq,
                '>' => TokenType::Gt,
                '<' => TokenType::Lt,

                '&' if self.advance_if('&') => TokenType::AmpAmp,
                '|' if self.advance_if('|') => TokenType::PipePipe,
                '&' => TokenType::Amp,
                '|' => TokenType::Pipe,

                // Scope
                '(' => TokenType::LeftParen,
                ')' => TokenType::RightParen,
                '[' => TokenType::LeftBracket,
                ']' => TokenType::RightBracket,
                '{' => TokenType::LeftBrace,
                '}' => TokenType::RightBrace,
                ',' => TokenType::Comma,
                '.' => TokenType::Dot,
                ':' => TokenType::Colon,
                ';' => TokenType::SemiColon,

                '"' => self.string(),

                // Keywords & Identifiers
                'a'..='z' | 'A'..='Z' | '_' => self.identifier(),

                // Misc.
                _ => panic!("Failed to parse"),
            };

            // Getting the lexeme and then making the token to be returned.
            // Slicing the original `&str` keeps multi-byte characters intact.
            let lexeme = self.source[self.start..self.pos].to_owned();

            Some(Token {
                tt,
                lexeme,
                start: self.start,
                length: self.pos - self.start,
                line,
            })
        }
    }
}

use lexer::Lexer;

/// Sample program lexed by `main`.
const SOURCE: &str = r#"

if 5 >= 7 {
    print "Hello World";
}

"#;

/// Lexes [`SOURCE`] and prints every lexeme followed by a space.
fn main() {
    let lexer = Lexer::new(SOURCE);
    for token in lexer {
        print!("{} ", token.lexeme);
    }
}