diff options
| author | Cody <cody@codyq.dev> | 2022-12-15 13:23:48 -0600 |
|---|---|---|
| committer | Cody <cody@codyq.dev> | 2022-12-15 13:23:48 -0600 |
| commit | bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0 (patch) | |
| tree | 874d175f352f1a4688e7e62d1f9222a192ae9bff /src/lexer.rs | |
| download | sloth-bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0.tar.gz | |
Initial language designs & lexer from crafting interpreters
The very initial language designs I came up with for Sloth. Likely
contains inconsistencies and definitely contains things that will be
changed in the future. This is basically just a dump of every idea I've
had for the language thus far.
As for the lexer right now it is heavily based on the one from the
Crafting Interpreters book and doesn't yet parse Sloth grammar.
Diffstat (limited to 'src/lexer.rs')
| -rw-r--r-- | src/lexer.rs | 285 |
1 files changed, 285 insertions, 0 deletions
#![allow(dead_code)]

/// Sentinel returned by the cursor helpers when reading past the end of the
/// input. NUL cannot appear in well-formed Sloth source, so it doubles as an
/// unambiguous end-of-file marker.
const EOF: char = '\u{0000}';

/// Every kind of token the Sloth lexer can produce.
#[derive(Debug, Eq, PartialEq)]
pub enum TokenType {
    // Short
    Plus,  // +
    Minus, // -
    Slash, // /
    Star,  // *
    Perc,  // %

    PlusEq,  // +=
    MinusEq, // -=
    SlashEq, // /=
    StarEq,  // *=
    PercEq,  // %=

    Eq,     // =
    EqEq,   // ==
    Bang,   // !
    BangEq, // !=

    Gt,   // >
    GtEq, // >=
    Lt,   // <
    LtEq, // <=

    Amp,      // &
    AmpAmp,   // &&
    Pipe,     // |
    PipePipe, // ||

    LeftParen,    // (
    RightParen,   // )
    LeftBracket,  // [
    RightBracket, // ]
    LeftBrace,    // {
    RightBrace,   // }

    Comma,     // ,
    Dot,       // .
    Colon,     // :
    SemiColon, // ;

    // Literals
    Literal(Literal),
    Identifier(String),

    // Keywords
    Let,
    Fn,

    If,
    Else,

    For,
    In,

    While,

    Loop,
    Break,
    Continue,

    Print, // TODO: Change to std library function
}

/// Literal values embedded directly in source text.
#[derive(Debug, Eq, PartialEq)]
pub enum Literal {
    String(String),
    Character(char),
    // TODO: there is no float variant yet. The lexer consumes a fractional
    // part (e.g. `3.14`) but keeps only the integer portion — see the digit
    // arm in `next()`.
    Number(i32),
    Bool(bool),
    Nil,
}

/// A single lexed token together with the source span it came from.
#[derive(Debug)]
pub struct Token {
    pub tt: TokenType,
    pub lexeme: String,

    start: usize,  // byte offset of the first byte of the lexeme
    length: usize, // length of the lexeme in bytes
    line: u32,     // 0-based line the lexeme started on
}

/// A streaming lexer over raw source bytes; yields one `Token` per call to
/// `Iterator::next`.
///
/// NOTE(review): the source is handled byte-by-byte and each byte is cast to
/// `char`, so non-ASCII UTF-8 input will be mangled. Acceptable while the
/// grammar is ASCII-only, but worth revisiting.
pub struct Lexer<'a> {
    source: &'a [u8],
    start: usize, // Start of the current lexeme
    pos: usize,   // One past the end of the current lexeme
    line: u32,    // Current (0-based) line number
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `source`.
    pub fn new(source: &'a str) -> Lexer<'a> {
        Self {
            source: source.as_bytes(),
            start: 0,
            pos: 0,
            line: 0,
        }
    }

    /// Returns the current character without consuming it (`EOF` past the end).
    fn peek(&self) -> char {
        self.source.get(self.pos).map(|&b| b as char).unwrap_or(EOF)
    }

    /// Returns the character `nth` positions ahead without consuming anything
    /// (`EOF` past the end).
    fn peek_nth(&self, nth: usize) -> char {
        self.source
            .get(self.pos + nth)
            .map(|&b| b as char)
            .unwrap_or(EOF)
    }

    /// Consumes and returns the current character (`EOF` past the end).
    fn advance(&mut self) -> char {
        self.pos += 1;
        self.source
            .get(self.pos - 1)
            .map(|&b| b as char)
            .unwrap_or(EOF)
    }

    /// Consumes the current character only if it equals `next`; reports
    /// whether it did. Used for two-character operators like `+=`.
    fn advance_if(&mut self, next: char) -> bool {
        if self.peek() != next {
            return false;
        }

        self.advance();
        true
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        // Skip whitespace and `#` line comments before the next lexeme.
        loop {
            match self.peek() {
                '#' => {
                    // Consume up to — but not including — the newline, so the
                    // `'\n'` arm below still counts it. Stopping at EOF fixes
                    // an infinite loop on a comment that ends the file.
                    while self.peek() != '\n' && self.peek() != EOF {
                        self.advance();
                    }
                }
                '\n' => {
                    self.line += 1;
                    self.advance();
                }
                ' ' | '\r' | '\t' => {
                    self.advance();
                }
                _ => break,
            }
        }

        // Resetting the lexeme
        self.start = self.pos;

        // Parse the next lexeme
        let character = self.advance();
        let tt = match character {
            // Arithmetic (compound-assignment forms must be tried first)
            '+' if self.advance_if('=') => TokenType::PlusEq,
            '-' if self.advance_if('=') => TokenType::MinusEq,
            '*' if self.advance_if('=') => TokenType::StarEq,
            '/' if self.advance_if('=') => TokenType::SlashEq,
            '%' if self.advance_if('=') => TokenType::PercEq,
            '+' => TokenType::Plus,
            '-' => TokenType::Minus,
            '*' => TokenType::Star,
            '/' => TokenType::Slash,
            '%' => TokenType::Perc,

            '0'..='9' => {
                let mut digits = String::new();
                digits.push(character);
                while self.peek().is_ascii_digit() {
                    digits.push(self.advance());
                }
                // Consume a fractional part only when the `.` is actually
                // followed by a digit, so `1.x` lexes as `1` `.` `x` instead
                // of panicking on `"1.".parse::<i32>()`.
                if self.peek() == '.' && self.peek_nth(1).is_ascii_digit() {
                    self.advance(); // the '.'
                    while self.peek().is_ascii_digit() {
                        self.advance();
                    }
                    // TODO: `Literal` has no float variant yet; the fraction
                    // is consumed (it stays in the lexeme) but only the
                    // integer part becomes the token value.
                }
                let value = digits
                    .parse::<i32>()
                    .expect("integer literal does not fit in i32");
                TokenType::Literal(Literal::Number(value))
            }

            // Logical & Bitwise (two-character forms must be tried first)
            '!' if self.advance_if('=') => TokenType::BangEq,
            '=' if self.advance_if('=') => TokenType::EqEq,
            '>' if self.advance_if('=') => TokenType::GtEq,
            '<' if self.advance_if('=') => TokenType::LtEq,
            '!' => TokenType::Bang,
            '=' => TokenType::Eq,
            '>' => TokenType::Gt,
            '<' => TokenType::Lt,

            '&' if self.advance_if('&') => TokenType::AmpAmp,
            '|' if self.advance_if('|') => TokenType::PipePipe,
            '&' => TokenType::Amp,
            '|' => TokenType::Pipe,

            // Scope
            '(' => TokenType::LeftParen,
            ')' => TokenType::RightParen,
            '[' => TokenType::LeftBracket,
            ']' => TokenType::RightBracket,
            '{' => TokenType::LeftBrace,
            '}' => TokenType::RightBrace,
            ',' => TokenType::Comma,
            '.' => TokenType::Dot,
            ':' => TokenType::Colon,
            ';' => TokenType::SemiColon,

            '"' => {
                let mut value = String::new();
                loop {
                    match self.peek() {
                        '"' => break,
                        // Previously this looped forever; fail loudly instead.
                        EOF => panic!("Unterminated string literal"),
                        _ => {}
                    }

                    let c = self.advance();

                    if c == '\n' {
                        // Keep the line counter accurate across multi-line
                        // strings.
                        self.line += 1;
                    }

                    if c == '\\' {
                        match self.advance() {
                            '\\' => value.push('\\'),
                            '"' => value.push('"'),
                            other => panic!("Unknown escape sequence: \\{}", other),
                        }
                        continue;
                    }

                    value.push(c);
                }

                self.advance(); // closing '"'
                TokenType::Literal(Literal::String(value))
            }

            // Keywords & Identifiers
            'a'..='z' | 'A'..='Z' | '_' => {
                let mut value = String::new();
                value.push(character);
                while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
                    value.push(self.advance())
                }

                match value.as_str() {
                    "let" => TokenType::Let,
                    "fn" => TokenType::Fn,
                    "if" => TokenType::If,
                    "else" => TokenType::Else,
                    "for" => TokenType::For,
                    "in" => TokenType::In,
                    "while" => TokenType::While,
                    "loop" => TokenType::Loop,
                    "break" => TokenType::Break,
                    "continue" => TokenType::Continue,
                    "print" => TokenType::Print,
                    _ => TokenType::Identifier(value),
                }
            }

            // Misc.
            EOF => return None,
            _ => panic!("Failed to parse"),
        };

        // Rebuild the lexeme text from the consumed byte span (same
        // byte-as-char mapping the cursor helpers use).
        let lexeme: String = self.source[self.start..self.pos]
            .iter()
            .map(|&b| b as char)
            .collect();

        let token = Token {
            tt,
            lexeme,
            start: self.start,
            length: self.pos - self.start,
            line: self.line,
        };

        Some(token)
    }
}
