about | summary | refs | log | tree | commit | diff
path: root/src/lexer.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lexer.rs')
-rw-r--r-- src/lexer.rs 285
1 files changed, 285 insertions, 0 deletions
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..602b5e1
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,285 @@
+#![allow(dead_code)]
+
+use itertools::Itertools;
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum TokenType {
+ // Short
+ Plus, // +
+ Minus, // -
+ Slash, // /
+ Star, // *
+ Perc, // %
+
+ PlusEq, // +=
+ MinusEq, // -=
+ SlashEq, // /=
+ StarEq, // *=
+ PercEq, // %=
+
+ Eq, // =
+ EqEq, // ==
+ Bang, // !
+ BangEq, // !=
+
+ Gt, // >
+ GtEq, // >=
+ Lt, // <
+ LtEq, // <=
+
+ Amp, // &
+ AmpAmp, // &&
+ Pipe, // |
+ PipePipe, // ||
+
+ LeftParen, // (
+ RightParen, // )
+ LeftBracket, // [
+ RightBracket, // ]
+ LeftBrace, // {
+ RightBrace, // }
+
+ Comma, // ,
+ Dot, // .
+ Colon, // :
+ SemiColon, // ;
+
+ // Literals
+ Literal(Literal),
+ Identifier(String),
+
+ // Keywords
+ Let,
+ Fn,
+
+ If,
+ Else,
+
+ For,
+ In,
+
+ While,
+
+ Loop,
+ Break,
+ Continue,
+
+ Print, // TODO: Change to std library function
+}
+
/// A literal value carried by a `Literal` token.
#[derive(Debug, Eq, PartialEq)]
pub enum Literal {
    /// Text value.
    String(String),
    /// Single character value.
    Character(char),
    /// Signed 32-bit integer value.
    Number(i32),
    /// Boolean value.
    Bool(bool),
    /// The `Nil` value, which carries no data.
    Nil,
}
+
+#[derive(Debug)]
+pub struct Token {
+ pub tt: TokenType,
+ pub lexeme: String,
+
+ start: usize,
+ length: usize,
+ line: u32,
+}
+
/// Byte-oriented cursor over a borrowed source string.
///
/// Every byte is handed out as the `char` with the same numeric value, and
/// reads past the end of the source yield `'\u{0000}'`.
pub struct Lexer<'a> {
    /// Raw bytes of the source text.
    source: &'a [u8],
    /// Byte offset where the current lexeme starts.
    start: usize,
    /// Byte offset of the next unread byte, i.e. the end of the current lexeme.
    pos: usize,
    /// Line counter; begins at zero.
    line: u32,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the first byte of `source`.
    pub fn new(source: &'a str) -> Lexer<'a> {
        Lexer {
            source: source.as_bytes(),
            start: 0,
            pos: 0,
            line: 0,
        }
    }

    /// Returns the byte at `index` as a `char`, or `'\u{0000}'` when `index`
    /// lies outside the source.
    fn char_at(&self, index: usize) -> char {
        match self.source.get(index) {
            Some(&byte) => char::from(byte),
            None => '\u{0000}',
        }
    }

    /// Looks at the next unread character without consuming it.
    fn peek(&self) -> char {
        self.char_at(self.pos)
    }

    /// Looks `nth` characters beyond the next unread one without consuming
    /// anything; `peek_nth(0)` is the same as `peek()`.
    fn peek_nth(&self, nth: usize) -> char {
        self.char_at(self.pos + nth)
    }

    /// Consumes and returns the next character.
    ///
    /// The cursor moves forward even when it is already past the end of the
    /// source, in which case `'\u{0000}'` is returned.
    fn advance(&mut self) -> char {
        let consumed = self.char_at(self.pos);
        self.pos += 1;
        consumed
    }

    /// Consumes the next character only when it equals `next`, and reports
    /// whether it did.
    fn advance_if(&mut self, next: char) -> bool {
        let matched = self.peek() == next;
        if matched {
            self.advance();
        }
        matched
    }
}
+
+impl<'a> Iterator for Lexer<'a> {
+ type Item = Token;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ // Ignore all whitespace & comments
+ loop {
+ match self.peek() {
+ '#' => {
+ while self.peek() != '\n' {
+ self.advance();
+ }
+ }
+ '\n' => self.line += 1,
+ ' ' | '\r' | '\t' => (),
+ _ => break,
+ }
+ self.advance();
+ }
+
+ // Resetting the lexeme
+ self.start = self.pos;
+
+ // Parse the next lexeme
+ let character = self.advance();
+ let tt = match character {
+ // Arithmetic
+ '+' if self.advance_if('=') => TokenType::PlusEq,
+ '-' if self.advance_if('=') => TokenType::MinusEq,
+ '*' if self.advance_if('=') => TokenType::StarEq,
+ '/' if self.advance_if('=') => TokenType::SlashEq,
+ '%' if self.advance_if('=') => TokenType::PercEq,
+ '+' => TokenType::Plus,
+ '-' => TokenType::Minus,
+ '*' => TokenType::Star,
+ '/' => TokenType::Slash,
+ '%' => TokenType::Perc,
+
+ '0'..='9' => {
+ let mut value = String::new();
+ value.push(character);
+ while ('0'..='9').contains(&self.peek()) {
+ value.push(self.advance());
+ }
+ if self.advance_if('.') {
+ value.push('.');
+ while ('0'..='9').contains(&self.peek()) {
+ let c = self.advance();
+ value.push(c);
+ }
+ }
+ TokenType::Literal(Literal::Number(value.parse::<i32>().unwrap()))
+ }
+
+ // Logical & Bitwise
+ '!' if self.advance_if('=') => TokenType::BangEq,
+ '=' if self.advance_if('=') => TokenType::EqEq,
+ '>' if self.advance_if('=') => TokenType::GtEq,
+ '<' if self.advance_if('=') => TokenType::LtEq,
+ '!' => TokenType::Bang,
+ '=' => TokenType::Eq,
+ '>' => TokenType::Gt,
+ '<' => TokenType::Lt,
+
+ '&' if self.advance_if('&') => TokenType::AmpAmp,
+ '|' if self.advance_if('|') => TokenType::PipePipe,
+ '&' => TokenType::Amp,
+ '|' => TokenType::Pipe,
+
+ // Scope
+ '(' => TokenType::LeftParen,
+ ')' => TokenType::RightParen,
+ '[' => TokenType::LeftBracket,
+ ']' => TokenType::RightBracket,
+ '{' => TokenType::LeftBrace,
+ '}' => TokenType::RightBrace,
+ ',' => TokenType::Comma,
+ '.' => TokenType::Dot,
+ ':' => TokenType::Colon,
+ ';' => TokenType::SemiColon,
+
+ '"' => {
+ let mut value = String::new();
+ while self.peek() != '"' {
+ let character = self.advance();
+
+ if character == '\\' {
+ match self.advance() {
+ '\\' => value.push('\\'),
+ '"' => value.push('"'),
+ _ => panic!(),
+ }
+ continue;
+ }
+
+ value.push(character);
+ }
+
+ self.advance();
+ TokenType::Literal(Literal::String(value))
+ }
+
+ // Keywords & Identifiers
+ 'a'..='z' | 'A'..='Z' | '_' => {
+ let mut value = String::new();
+ value.push(character);
+ while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
+ value.push(self.advance())
+ }
+
+ match value.as_str() {
+ "let" => TokenType::Let,
+ "fn" => TokenType::Fn,
+ "if" => TokenType::If,
+ "else" => TokenType::Else,
+ "for" => TokenType::For,
+ "in" => TokenType::In,
+ "while" => TokenType::While,
+ "loop" => TokenType::Loop,
+ "break" => TokenType::Break,
+ "continue" => TokenType::Continue,
+ "print" => TokenType::Print,
+ _ => TokenType::Identifier(value),
+ }
+ }
+
+ // Misc.
+ '\u{0000}' => return None,
+ _ => panic!("Failed to parse"),
+ };
+
+ // Getting the lexeme and then making the token to be returned
+ // let lexeme = self.source[self.start..self.pos].iter().join("");
+ let lexeme = self.source[self.start..self.pos]
+ .iter()
+ .map(|it| *it as char)
+ .join("");
+
+ let token = Token {
+ tt,
+ lexeme,
+ start: self.start,
+ length: self.pos - self.start,
+ line: self.line,
+ };
+
+ Some(token)
+ }
+}