From 1790d6a9091167b8bcc5f9136480ff2659c0660a Mon Sep 17 00:00:00 2001 From: Cody Date: Sat, 17 Dec 2022 13:52:32 -0600 Subject: Improvements to lexer, add test and benchmarks to lexer - Change lexer functions to return a option - Return None variant instead of \u0000 in case of EOF - Test lexer against a sample program --- src/ast.rs | 0 src/lexer.rs | 203 +++++++++++++++++++++++++++++++++++++++++++---------------- src/main.rs | 3 +- 3 files changed, 152 insertions(+), 54 deletions(-) create mode 100644 src/ast.rs (limited to 'src') diff --git a/src/ast.rs b/src/ast.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/lexer.rs b/src/lexer.rs index 75b1993..74a006c 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -2,6 +2,9 @@ #[derive(Debug, Eq, PartialEq)] pub enum TokenType { + // Utility + Comment(String), + // Short Plus, // + Minus, // - @@ -102,35 +105,30 @@ impl<'a> Lexer<'a> { } } - fn peek(&self) -> char { - self.source - .get(self.pos) - .map(|it| *it as char) - .unwrap_or('\u{0000}') + fn peek(&self) -> Option { + self.source.get(self.pos).map(|it| *it as char) } - fn peek_nth(&self, nth: usize) -> char { - self.source - .get(self.pos + nth) - .map(|it| *it as char) - .unwrap_or('\u{0000}') + fn peek_nth(&self, nth: usize) -> Option { + self.source.get(self.pos + nth).map(|it| *it as char) } - fn advance(&mut self) -> char { + fn advance(&mut self) -> Option { self.pos += 1; - self.source - .get(self.pos - 1) - .map(|it| *it as char) - .unwrap_or('\u{0000}') + self.source.get(self.pos - 1).map(|it| *it as char) } - fn advance_if(&mut self, next: char) -> bool { - if self.peek() != next { - return false; + fn advance_if(&mut self, next: impl FnOnce(Option) -> bool) -> bool { + if next(self.peek()) { + self.advance(); + return true; } - self.advance(); - true + false + } + + fn advance_if_eq(&mut self, next: Option) -> bool { + self.advance_if(|it| it == next) } } @@ -138,16 +136,11 @@ impl<'a> Iterator for Lexer<'a> { type Item = Token<'a>; fn next(&mut self) -> Option { - // Ignore all whitespace & comments + // Ignore all whitespace loop { match self.peek() { - '#' => { - while self.peek() != '\n' { - self.advance(); - } - } - '\n' => self.line += 1, - ' ' | '\r' | '\t' => (), + Some('\n') => self.line += 1, + Some(' ') | Some('\r') | Some('\t') => (), _ => break, } self.advance(); @@ -156,15 +149,28 @@ impl<'a> Iterator for Lexer<'a> { // Resetting the lexeme self.start = self.pos; - // Parse the next lexeme - let character = self.advance(); + // Parse the next lexeme- If it is EOF return nothing + let Some(character) = self.advance() else { + return None; + }; + let tt = match character { + // Whitespace & Comments + '#' => { + let mut value = String::new(); + while self.peek() != Some('\n') { + value.push(self.advance().unwrap()); + } + + TokenType::Comment(value) + } + // Arithmetic - '+' if self.advance_if('=') => TokenType::PlusEq, - '-' if self.advance_if('=') => TokenType::MinusEq, - '*' if self.advance_if('=') => TokenType::StarEq, - '/' if self.advance_if('=') => TokenType::SlashEq, - '%' if self.advance_if('=') => TokenType::PercEq, + '+' if self.advance_if_eq(Some('=')) => TokenType::PlusEq, + '-' if self.advance_if_eq(Some('=')) => TokenType::MinusEq, + '*' if self.advance_if_eq(Some('=')) => TokenType::StarEq, + '/' if self.advance_if_eq(Some('=')) => TokenType::SlashEq, + '%' if self.advance_if_eq(Some('=')) => TokenType::PercEq, '+' => TokenType::Plus, '-' => TokenType::Minus, '*' => TokenType::Star, @@ -174,31 +180,31 @@ impl<'a> Iterator for Lexer<'a> { '0'..='9' => { let mut value = String::new(); value.push(character); - while ('0'..='9').contains(&self.peek()) { - value.push(self.advance()); + while ('0'..='9').contains(&self.peek().unwrap()) { + value.push(self.advance().unwrap()); } - if self.advance_if('.') { + + if self.advance_if_eq(Some('.')) { value.push('.'); - while ('0'..='9').contains(&self.peek()) { - let c = self.advance(); - value.push(c); + while ('0'..='9').contains(&self.peek().unwrap()) { + value.push(self.advance().unwrap()); } } TokenType::Literal(Literal::Number(value.parse::().unwrap())) } // Logical & Bitwise - '!' if self.advance_if('=') => TokenType::BangEq, - '=' if self.advance_if('=') => TokenType::EqEq, - '>' if self.advance_if('=') => TokenType::GtEq, - '<' if self.advance_if('=') => TokenType::LtEq, + '!' if self.advance_if_eq(Some('=')) => TokenType::BangEq, + '=' if self.advance_if_eq(Some('=')) => TokenType::EqEq, + '>' if self.advance_if_eq(Some('=')) => TokenType::GtEq, + '<' if self.advance_if_eq(Some('=')) => TokenType::LtEq, '!' => TokenType::Bang, '=' => TokenType::Eq, '>' => TokenType::Gt, '<' => TokenType::Lt, - '&' if self.advance_if('&') => TokenType::AmpAmp, - '|' if self.advance_if('|') => TokenType::PipePipe, + '&' if self.advance_if_eq(Some('&')) => TokenType::AmpAmp, + '|' if self.advance_if_eq(Some('|')) => TokenType::PipePipe, '&' => TokenType::Amp, '|' => TokenType::Pipe, @@ -216,11 +222,13 @@ impl<'a> Iterator for Lexer<'a> { '"' => { let mut value = String::new(); - while self.peek() != '"' { - let character = self.advance(); + while self.peek() != Some('"') { + let Some(character) = self.advance() else { + panic!("Syntax Error: String invalid"); + }; if character == '\\' { - match self.advance() { + match self.advance().unwrap() { '\\' => value.push('\\'), '"' => value.push('"'), _ => panic!(), @@ -239,8 +247,9 @@ impl<'a> Iterator for Lexer<'a> { 'a'..='z' | 'A'..='Z' | '_' => { let mut value = String::new(); value.push(character); - while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { - value.push(self.advance()) + + while let Some(character) = self.peek() && matches!(character, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { + value.push(self.advance().unwrap()); } match value.as_str() { @@ -261,7 +270,6 @@ impl<'a> Iterator for Lexer<'a> { } // Misc. - '\u{0000}' => return None, _ => panic!("Failed to parse"), }; @@ -282,3 +290,92 @@ impl<'a> Iterator for Lexer<'a> { Some(token) } } + +#[cfg(test)] +mod tests { + extern crate test; + + use test::Bencher; + + use super::{Lexer, Literal, TokenType}; + + const SAMPLE_PROGRAM: &str = r#" +val variable = 5; + +if variable >= 7 { + print "Hello World"; +} + +if variable < 52 { + variable += 1; + print "Hello ${variable}"; +} + +for person in ["Cody", "Johnny"] { + print "Hello ${person}"; +} +"#; + + #[test] + fn simple_code() { + let tokens = vec![ + // top + TokenType::Val, + TokenType::Identifier("variable".to_owned()), + TokenType::Eq, + TokenType::Literal(Literal::Number(5)), + TokenType::SemiColon, + // 1st block + TokenType::If, + TokenType::Identifier("variable".to_owned()), + TokenType::GtEq, + TokenType::Literal(Literal::Number(7)), + TokenType::LeftBrace, + TokenType::Print, + TokenType::Literal(Literal::String("Hello World".to_owned())), + TokenType::SemiColon, + TokenType::RightBrace, + // 2nd block + TokenType::If, + TokenType::Identifier("variable".to_owned()), + TokenType::Lt, + TokenType::Literal(Literal::Number(52)), + TokenType::LeftBrace, + TokenType::Identifier("variable".to_owned()), + TokenType::PlusEq, + TokenType::Literal(Literal::Number(1)), + TokenType::SemiColon, + TokenType::Print, + TokenType::Literal(Literal::String("Hello ${variable}".to_owned())), + TokenType::SemiColon, + TokenType::RightBrace, + // 3rd block + TokenType::For, + TokenType::Identifier("person".to_owned()), + TokenType::In, + TokenType::LeftBracket, + TokenType::Literal(Literal::String("Cody".to_owned())), + TokenType::Comma, + TokenType::Literal(Literal::String("Johnny".to_owned())), + TokenType::RightBracket, + TokenType::LeftBrace, + TokenType::Print, + TokenType::Literal(Literal::String("Hello ${person}".to_owned())), + TokenType::SemiColon, + TokenType::RightBrace, + ]; + + let lexed_code = Lexer::new(SAMPLE_PROGRAM) + .map(|it| it.tt) + .collect::>(); + + assert_eq!(tokens, lexed_code); + } + + #[bench] + fn bench_lexer(b: &mut Bencher) { + b.iter(|| { + let _ = Lexer::new(SAMPLE_PROGRAM).collect::>(); + }); + } +} diff --git a/src/main.rs b/src/main.rs index fa355c9..8f3c001 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +#![feature(test, let_chains)] #![warn( clippy::wildcard_imports, clippy::string_add, @@ -24,6 +25,6 @@ if variable >= 7 { fn main() { let lexer = Lexer::new(SOURCE); for token in lexer { - print!("({}) ", token.lexeme); + print!("{} ", token.lexeme); } } -- cgit v1.2.3