diff options
Diffstat (limited to 'src/lexer.rs')
| -rw-r--r-- | src/lexer.rs | 285 |
1 files changed, 285 insertions, 0 deletions
diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 0000000..602b5e1 --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,285 @@ +#![allow(dead_code)] + +use itertools::Itertools; + +#[derive(Debug, Eq, PartialEq)] +pub enum TokenType { + // Short + Plus, // + + Minus, // - + Slash, // / + Star, // * + Perc, // % + + PlusEq, // += + MinusEq, // -= + SlashEq, // /= + StarEq, // *= + PercEq, // %= + + Eq, // = + EqEq, // == + Bang, // ! + BangEq, // != + + Gt, // > + GtEq, // >= + Lt, // < + LtEq, // <= + + Amp, // & + AmpAmp, // && + Pipe, // | + PipePipe, // || + + LeftParen, // ( + RightParen, // ) + LeftBracket, // [ + RightBracket, // ] + LeftBrace, // { + RightBrace, // } + + Comma, // , + Dot, // . + Colon, // : + SemiColon, // ; + + // Literals + Literal(Literal), + Identifier(String), + + // Keywords + Let, + Fn, + + If, + Else, + + For, + In, + + While, + + Loop, + Break, + Continue, + + Print, // TODO: Change to std library function +} + +#[derive(Debug, Eq, PartialEq)] +pub enum Literal { + String(String), + Character(char), + Number(i32), + Bool(bool), + Nil, +} + +#[derive(Debug)] +pub struct Token { + pub tt: TokenType, + pub lexeme: String, + + start: usize, + length: usize, + line: u32, +} + +pub struct Lexer<'a> { + source: &'a [u8], + start: usize, // Start of the lexme + pos: usize, // End of the lexme + line: u32, +} + +impl<'a> Lexer<'a> { + pub fn new(source: &'a str) -> Lexer<'a> { + Self { + source: source.as_bytes(), + start: 0, + pos: 0, + line: 0, + } + } + + fn peek(&self) -> char { + self.source + .get(self.pos) + .map(|it| *it as char) + .unwrap_or('\u{0000}') + } + + fn peek_nth(&self, nth: usize) -> char { + self.source + .get(self.pos + nth) + .map(|it| *it as char) + .unwrap_or('\u{0000}') + } + + fn advance(&mut self) -> char { + self.pos += 1; + self.source + .get(self.pos - 1) + .map(|it| *it as char) + .unwrap_or('\u{0000}') + } + + fn advance_if(&mut self, next: char) -> bool { + if self.peek() != next { + return false; + } + + self.advance(); + true + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Token; + + fn next(&mut self) -> Option<Self::Item> { + // Ignore all whitespace & comments + loop { + match self.peek() { + '#' => { + while self.peek() != '\n' { + self.advance(); + } + } + '\n' => self.line += 1, + ' ' | '\r' | '\t' => (), + _ => break, + } + self.advance(); + } + + // Resetting the lexeme + self.start = self.pos; + + // Parse the next lexeme + let character = self.advance(); + let tt = match character { + // Arithmetic + '+' if self.advance_if('=') => TokenType::PlusEq, + '-' if self.advance_if('=') => TokenType::MinusEq, + '*' if self.advance_if('=') => TokenType::StarEq, + '/' if self.advance_if('=') => TokenType::SlashEq, + '%' if self.advance_if('=') => TokenType::PercEq, + '+' => TokenType::Plus, + '-' => TokenType::Minus, + '*' => TokenType::Star, + '/' => TokenType::Slash, + '%' => TokenType::Perc, + + '0'..='9' => { + let mut value = String::new(); + value.push(character); + while ('0'..='9').contains(&self.peek()) { + value.push(self.advance()); + } + if self.advance_if('.') { + value.push('.'); + while ('0'..='9').contains(&self.peek()) { + let c = self.advance(); + value.push(c); + } + } + TokenType::Literal(Literal::Number(value.parse::<i32>().unwrap())) + } + + // Logical & Bitwise + '!' if self.advance_if('=') => TokenType::BangEq, + '=' if self.advance_if('=') => TokenType::EqEq, + '>' if self.advance_if('=') => TokenType::GtEq, + '<' if self.advance_if('=') => TokenType::LtEq, + '!' => TokenType::Bang, + '=' => TokenType::Eq, + '>' => TokenType::Gt, + '<' => TokenType::Lt, + + '&' if self.advance_if('&') => TokenType::AmpAmp, + '|' if self.advance_if('|') => TokenType::PipePipe, + '&' => TokenType::Amp, + '|' => TokenType::Pipe, + + // Scope + '(' => TokenType::LeftParen, + ')' => TokenType::RightParen, + '[' => TokenType::LeftBracket, + ']' => TokenType::RightBracket, + '{' => TokenType::LeftBrace, + '}' => TokenType::RightBrace, + ',' => TokenType::Comma, + '.' => TokenType::Dot, + ':' => TokenType::Colon, + ';' => TokenType::SemiColon, + + '"' => { + let mut value = String::new(); + while self.peek() != '"' { + let character = self.advance(); + + if character == '\\' { + match self.advance() { + '\\' => value.push('\\'), + '"' => value.push('"'), + _ => panic!(), + } + continue; + } + + value.push(character); + } + + self.advance(); + TokenType::Literal(Literal::String(value)) + } + + // Keywords & Identifiers + 'a'..='z' | 'A'..='Z' | '_' => { + let mut value = String::new(); + value.push(character); + while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { + value.push(self.advance()) + } + + match value.as_str() { + "let" => TokenType::Let, + "fn" => TokenType::Fn, + "if" => TokenType::If, + "else" => TokenType::Else, + "for" => TokenType::For, + "in" => TokenType::In, + "while" => TokenType::While, + "loop" => TokenType::Loop, + "break" => TokenType::Break, + "continue" => TokenType::Continue, + "print" => TokenType::Print, + _ => TokenType::Identifier(value), + } + } + + // Misc. + '\u{0000}' => return None, + _ => panic!("Failed to parse"), + }; + + // Getting the lexeme and then making the token to be returned + // let lexeme = self.source[self.start..self.pos].iter().join(""); + let lexeme = self.source[self.start..self.pos] + .iter() + .map(|it| *it as char) + .join(""); + + let token = Token { + tt, + lexeme, + start: self.start, + length: self.pos - self.start, + line: self.line, + }; + + Some(token) + } +} |
