Diffstat (limited to 'src/lexer.rs')
-rw-r--r--   src/lexer.rs   418
1 file changed, 46 insertions, 372 deletions
diff --git a/src/lexer.rs b/src/lexer.rs
index 88d86bd..ef79716 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -1,84 +1,49 @@
 #![allow(dead_code)]
 
-#[derive(Debug, Clone, Eq, PartialEq)]
-pub enum TokenType {
-    // Utility
-    DocComment(String),
-    Comment(String),
-
-    // Short
-    Plus, // +
-    Minus, // -
-    Slash, // /
-    Star, // *
-    Perc, // %
-
-    PlusEq, // +=
-    MinusEq, // -=
-    SlashEq, // /=
-    StarEq, // *=
-    PercEq, // %=
-
-    Eq, // =
-    EqEq, // ==
-    Bang, // !
-    BangEq, // !=
+use thiserror::Error;
 
-    Gt, // >
-    GtGt, // >>
-    GtEq, // >=
-    Lt, // <
-    LtLt, // <<
-    LtEq, // <=
-
-    Amp, // &
-    AmpAmp, // &&
-    Pipe, // |
-    PipePipe, // ||
-
-    DotDot, // .
-
-    LeftParen, // (
-    RightParen, // )
-    LeftBracket, // [
-    RightBracket, // ]
-    LeftBrace, // {
-    RightBrace, // }
-
-    Comma, // ,
-    Dot, // .
-    Colon, // :
-    SemiColon, // ;
+#[derive(Debug, Error)]
+pub enum LexerError {
+    #[error("Unexpected token")]
+    UnexpectedToken,
+}
 
-    // Literals
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub enum TokenType {
+    // Meta
+    DocComment,
+    Comment,
+
+    // Operatiors
+    Plus,
+    Minus,
+    Star,
+    Slash,
+    Perc,
+
+    PlusEq,
+    MinusEq,
+    StarEq,
+    SlashEq,
+    PercEq,
+
+    // Misc
     Literal(Literal),
-    Identifier(String),
-
-    // Keywords
-    Val,
-    Var,
-    Fn,
-
-    If,
-    Else,
-
-    For,
-    In,
-
-    While,
-
-    Loop,
-    Break,
-    Continue,
 }
 
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub enum Literal {
-    String(String),
-    Character(char),
-    Number(i32),
-    Bool(bool),
-    Nil,
+    Numeric,
+    Boolean,
+    Character,
+    String,
+    Regex,
+}
+
+#[derive(Debug, Default)]
+pub struct Location {
+    row: u32,
+    column: u32,
 }
 
 #[derive(Debug)]
@@ -86,52 +51,24 @@ pub struct Token<'a> {
     pub tt: TokenType,
     pub lexeme: &'a str,
 
-    start: usize,
-    length: usize,
-    line: u32,
+    start: Location,
+    end: Location,
 }
 
 pub struct Lexer<'a> {
     source: &'a [u8],
-    start: usize, // Start of the lexme
-    pos: usize, // End of the lexme
-    line: u32,
+
+    start: Location,
+    end: Location,
 }
 
 impl<'a> Lexer<'a> {
-    pub fn new(source: &'a str) -> Lexer<'a> {
+    fn new(source: &'a str) -> Self {
         Self {
             source: source.as_bytes(),
-            start: 0,
-            pos: 0,
-            line: 0,
-        }
-    }
-
-    fn peek(&self) -> Option<char> {
-        self.source.get(self.pos).map(|it| *it as char)
-    }
-
-    fn peek_nth(&self, nth: usize) -> Option<char> {
-        self.source.get(self.pos + nth).map(|it| *it as char)
-    }
-
-    fn advance(&mut self) -> Option<char> {
-        self.pos += 1;
-        self.source.get(self.pos - 1).map(|it| *it as char)
-    }
-
-    fn advance_if(&mut self, next: impl FnOnce(Option<char>) -> bool) -> bool {
-        if next(self.peek()) {
-            self.advance();
-            return true;
+            start: Default::default(),
+            end: Default::default(),
         }
-
-        false
-    }
-
-    fn advance_if_eq(&mut self, next: Option<char>) -> bool {
-        self.advance_if(|it| it == next)
     }
 }
 
@@ -139,269 +76,6 @@ impl<'a> Iterator for Lexer<'a> {
     type Item = Token<'a>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        // Ignore all whitespace
-        loop {
-            match self.peek() {
-                Some('\n') => self.line += 1,
-                Some(' ') | Some('\r') | Some('\t') => (),
-                _ => break,
-            }
-            self.advance();
-        }
-
-        // Resetting the lexeme
-        self.start = self.pos;
-
-        // Parse the next lexeme- If it is EOF return nothing
-        let Some(character) = self.advance() else {
-            return None;
-        };
-
-        let tt = match character {
-            // Whitespace & Comments
-            '#' if self.advance_if_eq(Some('#')) => {
-                let mut value = String::new();
-                while self.peek() != Some('\n') {
-                    value.push(self.advance().unwrap());
-                }
-
-                TokenType::DocComment(value)
-            }
-
-            '#' => {
-                let mut value = String::new();
-                while self.peek() != Some('\n') {
-                    value.push(self.advance().unwrap());
-                }
-
-                TokenType::Comment(value)
-            }
-
-            // Arithmetic
-            '+' if self.advance_if_eq(Some('=')) => TokenType::PlusEq,
-            '-' if self.advance_if_eq(Some('=')) => TokenType::MinusEq,
-            '*' if self.advance_if_eq(Some('=')) => TokenType::StarEq,
-            '/' if self.advance_if_eq(Some('=')) => TokenType::SlashEq,
-            '%' if self.advance_if_eq(Some('=')) => TokenType::PercEq,
-            '+' => TokenType::Plus,
-            '-' => TokenType::Minus,
-            '*' => TokenType::Star,
-            '/' => TokenType::Slash,
-            '%' => TokenType::Perc,
-
-            '0'..='9' => {
-                let mut value = String::new();
-                value.push(character);
-                while let Some('0'..='9') = &self.peek() {
-                    value.push(self.advance().unwrap());
-                }
-
-                if self.peek() == Some('.') && self.peek_nth(1) != Some('.') {
-                    self.advance();
-                    value.push('.');
-                    while self.peek().unwrap().is_ascii_digit() {
-                        value.push(self.advance().unwrap());
-                    }
-                }
-                TokenType::Literal(Literal::Number(value.parse::<i32>().unwrap()))
-            }
-
-            // Logical & Bitwise
-            '!' if self.advance_if_eq(Some('=')) => TokenType::BangEq,
-            '=' if self.advance_if_eq(Some('=')) => TokenType::EqEq,
-            '>' if self.advance_if_eq(Some('>')) => TokenType::GtGt,
-            '>' if self.advance_if_eq(Some('=')) => TokenType::GtEq,
-            '<' if self.advance_if_eq(Some('<')) => TokenType::LtLt,
-            '<' if self.advance_if_eq(Some('=')) => TokenType::LtEq,
-            '!' => TokenType::Bang,
-            '=' => TokenType::Eq,
-            '>' => TokenType::Gt,
-            '<' => TokenType::Lt,
-
-            '&' if self.advance_if_eq(Some('&')) => TokenType::AmpAmp,
-            '|' if self.advance_if_eq(Some('|')) => TokenType::PipePipe,
-            '&' => TokenType::Amp,
-            '|' => TokenType::Pipe,
-
-            // Misc. Operators
-            '.' if self.advance_if_eq(Some('.')) => TokenType::DotDot,
-
-            // Scope
-            '(' => TokenType::LeftParen,
-            ')' => TokenType::RightParen,
-            '[' => TokenType::LeftBracket,
-            ']' => TokenType::RightBracket,
-            '{' => TokenType::LeftBrace,
-            '}' => TokenType::RightBrace,
-            ',' => TokenType::Comma,
-            '.' => TokenType::Dot,
-            ':' => TokenType::Colon,
-            ';' => TokenType::SemiColon,
-
-            '"' => {
-                let mut value = String::new();
-                while self.peek() != Some('"') {
-                    let Some(character) = self.advance() else {
-                        panic!("Syntax Error: String invalid");
-                    };
-
-                    if character == '\\' {
-                        match self.advance().unwrap() {
-                            '\\' => value.push('\\'),
-                            '"' => value.push('"'),
-                            'n' => value.push('\n'),
-                            _ => panic!(),
-                        }
-                        continue;
-                    }
-
-                    value.push(character);
-                }
-
-                self.advance();
-                TokenType::Literal(Literal::String(value))
-            }
-
-            // Keywords & Identifiers
-            'a'..='z' | 'A'..='Z' | '_' => {
-                let mut value = String::new();
-                value.push(character);
-
-                while let Some(character) = self.peek() && matches!(character, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
-                    value.push(self.advance().unwrap());
-                }
-
-                match value.as_str() {
-                    "val" => TokenType::Val,
-                    "var" => TokenType::Var,
-                    "fn" => TokenType::Fn,
-                    "if" => TokenType::If,
-                    "else" => TokenType::Else,
-                    "for" => TokenType::For,
-                    "in" => TokenType::In,
-                    "while" => TokenType::While,
-                    "loop" => TokenType::Loop,
-                    "break" => TokenType::Break,
-                    "continue" => TokenType::Continue,
-                    "true" => TokenType::Literal(Literal::Bool(true)),
-                    "false" => TokenType::Literal(Literal::Bool(false)),
-                    _ => TokenType::Identifier(value),
-                }
-            }
-
-            // Misc.
-            _ => panic!("Failed to parse"),
-        };
-
-        let lexeme = unsafe {
-            // If it got to this point we know the slice is valid UTF-8. The only area in
-            // the language that UTF-8 characters are recognized is within strings.
-            std::str::from_utf8_unchecked(&self.source[self.start..self.pos])
-        };
-
-        let token = Token {
-            tt,
-            lexeme,
-            start: self.start,
-            length: self.pos - self.start,
-            line: self.line,
-        };
-
-        Some(token)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    extern crate test;
-
-    use test::Bencher;
-
-    use super::{Lexer, Literal, TokenType};
-
-    const SAMPLE_PROGRAM: &str = r#"
-val variable = 5;
-
-if variable >= 7 {
-    print("Hello World");
-}
-
-if variable < 52 {
-    variable += 1;
-    print("Hello ${variable}");
-}
-
-for person in ["Cody", "Johnny"] {
-    print("Hello ${person}");
-}
-"#;
-
-    #[test]
-    fn simple_code() {
-        let tokens = vec![
-            // top
-            TokenType::Val,
-            TokenType::Identifier("variable".to_owned()),
-            TokenType::Eq,
-            TokenType::Literal(Literal::Number(5)),
-            TokenType::SemiColon,
-            // 1st block
-            TokenType::If,
-            TokenType::Identifier("variable".to_owned()),
-            TokenType::GtEq,
-            TokenType::Literal(Literal::Number(7)),
-            TokenType::LeftBrace,
-            TokenType::Identifier("print".to_owned()),
-            TokenType::LeftParen,
-            TokenType::Literal(Literal::String("Hello World".to_owned())),
-            TokenType::RightParen,
-            TokenType::SemiColon,
-            TokenType::RightBrace,
-            // 2nd block
-            TokenType::If,
-            TokenType::Identifier("variable".to_owned()),
-            TokenType::Lt,
-            TokenType::Literal(Literal::Number(52)),
-            TokenType::LeftBrace,
-            TokenType::Identifier("variable".to_owned()),
-            TokenType::PlusEq,
-            TokenType::Literal(Literal::Number(1)),
-            TokenType::SemiColon,
-            TokenType::Identifier("print".to_owned()),
-            TokenType::LeftParen,
-            TokenType::Literal(Literal::String("Hello ${variable}".to_owned())),
-            TokenType::RightParen,
-            TokenType::SemiColon,
-            TokenType::RightBrace,
-            // 3rd block
-            TokenType::For,
-            TokenType::Identifier("person".to_owned()),
-            TokenType::In,
-            TokenType::LeftBracket,
-            TokenType::Literal(Literal::String("Cody".to_owned())),
-            TokenType::Comma,
-            TokenType::Literal(Literal::String("Johnny".to_owned())),
-            TokenType::RightBracket,
-            TokenType::LeftBrace,
-            TokenType::Identifier("print".to_owned()),
-            TokenType::LeftParen,
-            TokenType::Literal(Literal::String("Hello ${person}".to_owned())),
-            TokenType::RightParen,
-            TokenType::SemiColon,
-            TokenType::RightBrace,
-        ];
-
-        let lexed_code = Lexer::new(SAMPLE_PROGRAM)
-            .map(|it| it.tt)
-            .collect::<Vec<_>>();
-
-        assert_eq!(tokens, lexed_code);
-    }
-
-    #[bench]
-    fn bench_lexer(b: &mut Bencher) {
-        b.iter(|| {
-            let _ = Lexer::new(SAMPLE_PROGRAM).collect::<Vec<_>>();
-        });
+        unimplemented!()
    }
 }
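
The commit leaves Lexer's next() as unimplemented!() while introducing the Location { row, column } pair that Token and Lexer now carry. Below is a minimal, self-contained sketch of how that row/column bookkeeping could eventually work; the advance helper and the example usage are assumptions, not code from this commit.

// Sketch only -- not part of the commit. `Location` mirrors the struct added
// in the diff; the `advance` helper and the example below are assumptions.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Location {
    row: u32,
    column: u32,
}

impl Location {
    // Move the cursor past one character, starting a new row on '\n'.
    fn advance(&mut self, c: char) {
        if c == '\n' {
            self.row += 1;
            self.column = 0;
        } else {
            self.column += 1;
        }
    }
}

fn main() {
    let mut end = Location::default();
    for c in "val x =\n5;".chars() {
        end.advance(c);
    }
    // After scanning two lines, the cursor sits at row 1, column 2.
    assert_eq!(end, Location { row: 1, column: 2 });
}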
