author    Cody <cody@codyq.dev>  2022-12-15 13:23:48 -0600
committer Cody <cody@codyq.dev>  2022-12-15 13:23:48 -0600
commit    bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0 (patch)
tree      874d175f352f1a4688e7e62d1f9222a192ae9bff /src
download  sloth-bddb011df4999f7ffeeddf6a4b66e2da6ab19ea0.tar.gz
Initial language designs & lexer from Crafting Interpreters
The very first language designs I came up with for Sloth. They likely contain inconsistencies and definitely contain things that will change in the future; this is basically a dump of every idea I've had for the language thus far. As for the lexer: right now it is heavily based on the one from the Crafting Interpreters book and doesn't yet parse Sloth's grammar.
Diffstat (limited to 'src')
-rw-r--r--  src/lexer.rs  285
-rw-r--r--  src/main.rs    26
2 files changed, 311 insertions, 0 deletions
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..602b5e1
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,285 @@
+#![allow(dead_code)]
+
+use itertools::Itertools;
+
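+/// The kind of a token. Literal and identifier variants carry their value;
+/// the comment beside each punctuation variant shows its spelling.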
+#[derive(Debug, Eq, PartialEq)]
+pub enum TokenType {
+    // Short
+    Plus,  // +
+    Minus, // -
+    Slash, // /
+    Star,  // *
+    Perc,  // %
+
+    PlusEq,  // +=
+    MinusEq, // -=
+    SlashEq, // /=
+    StarEq,  // *=
+    PercEq,  // %=
+
+    Eq,     // =
+    EqEq,   // ==
+    Bang,   // !
+    BangEq, // !=
+
+    Gt,   // >
+    GtEq, // >=
+    Lt,   // <
+    LtEq, // <=
+
+    Amp,      // &
+    AmpAmp,   // &&
+    Pipe,     // |
+    PipePipe, // ||
+
+    LeftParen,    // (
+    RightParen,   // )
+    LeftBracket,  // [
+    RightBracket, // ]
+    LeftBrace,    // {
+    RightBrace,   // }
+
+    Comma,     // ,
+    Dot,       // .
+    Colon,     // :
+    SemiColon, // ;
+
+    // Literals
+    Literal(Literal),
+    Identifier(String),
+
+    // Keywords
+    Let,
+    Fn,
+
+    If,
+    Else,
+
+    For,
+    In,
+
+    While,
+
+    Loop,
+    Break,
+    Continue,
+
+    Print, // TODO: Change to std library function
+}
+
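+/// A literal value embedded directly in the source text.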
+#[derive(Debug, Eq, PartialEq)]
+pub enum Literal {
+    String(String),
+    Character(char),
+    Number(i32),
+    Bool(bool),
+    Nil,
+}
+
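+/// A single token scanned from the source, together with the raw lexeme it
+/// was built from and the span (byte offset, length, line) it covers.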
+#[derive(Debug)]
+pub struct Token {
+    pub tt: TokenType,
+    pub lexeme: String,
+
+    start: usize,
+    length: usize,
+    line: u32,
+}
+
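+/// A hand-rolled scanner over the raw source bytes, closely following the
+/// scanner from the Crafting Interpreters book.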
+pub struct Lexer<'a> {
+    source: &'a [u8], // Raw source bytes; the lexer currently assumes ASCII
+    start: usize,     // Start of the current lexeme
+    pos: usize,       // End of the current lexeme
+    line: u32,
+}
+
+impl<'a> Lexer<'a> {
+    pub fn new(source: &'a str) -> Lexer<'a> {
+        Self {
+            source: source.as_bytes(),
+            start: 0,
+            pos: 0,
+            line: 0,
+        }
+    }
+
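+    /// Look at the current byte without consuming it. Reads past the end of
+    /// the source are mapped to NUL ('\u{0000}'), used as an EOF sentinel.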
+    fn peek(&self) -> char {
+        self.source
+            .get(self.pos)
+            .map(|it| *it as char)
+            .unwrap_or('\u{0000}')
+    }
+
+    fn peek_nth(&self, nth: usize) -> char {
+        self.source
+            .get(self.pos + nth)
+            .map(|it| *it as char)
+            .unwrap_or('\u{0000}')
+    }
+
+    fn advance(&mut self) -> char {
+        self.pos += 1;
+        self.source
+            .get(self.pos - 1)
+            .map(|it| *it as char)
+            .unwrap_or('\u{0000}')
+    }
+
+    fn advance_if(&mut self, next: char) -> bool {
+        if self.peek() != next {
+            return false;
+        }
+
+        self.advance();
+        true
+    }
+}
+
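+// The lexer is driven as an iterator: every call to `next` skips whitespace
+// and `#` line comments, then scans exactly one token, yielding `None` once
+// the end of the source is reached.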
+impl<'a> Iterator for Lexer<'a> {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Ignore all whitespace & comments
+        loop {
+            match self.peek() {
+                '#' => {
+                    // Line comment: consume up to (but not including) the
+                    // newline, so the '\n' arm below can count it. Stopping
+                    // at NUL avoids spinning forever on a comment at EOF.
+                    while self.peek() != '\n' && self.peek() != '\u{0000}' {
+                        self.advance();
+                    }
+                    continue;
+                }
+                '\n' => self.line += 1,
+                ' ' | '\r' | '\t' => (),
+                _ => break,
+            }
+            self.advance();
+        }
+
+        // Resetting the lexeme
+        self.start = self.pos;
+
+        // Parse the next lexeme
+        let character = self.advance();
+        let tt = match character {
+            // Arithmetic
+            '+' if self.advance_if('=') => TokenType::PlusEq,
+            '-' if self.advance_if('=') => TokenType::MinusEq,
+            '*' if self.advance_if('=') => TokenType::StarEq,
+            '/' if self.advance_if('=') => TokenType::SlashEq,
+            '%' if self.advance_if('=') => TokenType::PercEq,
+            '+' => TokenType::Plus,
+            '-' => TokenType::Minus,
+            '*' => TokenType::Star,
+            '/' => TokenType::Slash,
+            '%' => TokenType::Perc,
+
+            '0'..='9' => {
+                let mut value = String::new();
+                value.push(character);
+                while self.peek().is_ascii_digit() {
+                    value.push(self.advance());
+                }
+                // Only consume a '.' when a digit follows it, so something
+                // like `5.foo` still lexes the dot as TokenType::Dot. The
+                // fractional digits are scanned but discarded for now,
+                // because Literal::Number can only hold an i32.
+                if self.peek() == '.' && self.peek_nth(1).is_ascii_digit() {
+                    self.advance();
+                    while self.peek().is_ascii_digit() {
+                        self.advance();
+                    }
+                }
+                TokenType::Literal(Literal::Number(value.parse::<i32>().unwrap()))
+            }
+
+            // Logical & Bitwise
+            '!' if self.advance_if('=') => TokenType::BangEq,
+            '=' if self.advance_if('=') => TokenType::EqEq,
+            '>' if self.advance_if('=') => TokenType::GtEq,
+            '<' if self.advance_if('=') => TokenType::LtEq,
+            '!' => TokenType::Bang,
+            '=' => TokenType::Eq,
+            '>' => TokenType::Gt,
+            '<' => TokenType::Lt,
+
+            '&' if self.advance_if('&') => TokenType::AmpAmp,
+            '|' if self.advance_if('|') => TokenType::PipePipe,
+            '&' => TokenType::Amp,
+            '|' => TokenType::Pipe,
+
+            // Scope
+            '(' => TokenType::LeftParen,
+            ')' => TokenType::RightParen,
+            '[' => TokenType::LeftBracket,
+            ']' => TokenType::RightBracket,
+            '{' => TokenType::LeftBrace,
+            '}' => TokenType::RightBrace,
+            ',' => TokenType::Comma,
+            '.' => TokenType::Dot,
+            ':' => TokenType::Colon,
+            ';' => TokenType::SemiColon,
+
+ '"' => {
+ let mut value = String::new();
+ while self.peek() != '"' {
+ let character = self.advance();
+
+ if character == '\\' {
+ match self.advance() {
+ '\\' => value.push('\\'),
+ '"' => value.push('"'),
+ _ => panic!(),
+ }
+ continue;
+ }
+
+ value.push(character);
+ }
+
+ self.advance();
+ TokenType::Literal(Literal::String(value))
+ }
+
+            // Keywords & Identifiers
+            'a'..='z' | 'A'..='Z' | '_' => {
+                let mut value = String::new();
+                value.push(character);
+                while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
+                    value.push(self.advance());
+                }
+
+                match value.as_str() {
+                    "let" => TokenType::Let,
+                    "fn" => TokenType::Fn,
+                    "if" => TokenType::If,
+                    "else" => TokenType::Else,
+                    "for" => TokenType::For,
+                    "in" => TokenType::In,
+                    "while" => TokenType::While,
+                    "loop" => TokenType::Loop,
+                    "break" => TokenType::Break,
+                    "continue" => TokenType::Continue,
+                    "print" => TokenType::Print,
+                    _ => TokenType::Identifier(value),
+                }
+            }
+
+            // Misc.
+            '\u{0000}' => return None,
+            _ => panic!("Unexpected character '{}' on line {}", character, self.line),
+        };
+
+        // Reconstruct the lexeme from the source bytes. Casting each byte to
+        // a char is only correct for ASCII, which is all the lexer handles
+        // right now.
+        let lexeme = self.source[self.start..self.pos]
+            .iter()
+            .map(|it| *it as char)
+            .join("");
+
+        let token = Token {
+            tt,
+            lexeme,
+            start: self.start,
+            length: self.pos - self.start,
+            line: self.line,
+        };
+
+        Some(token)
+    }
+}
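+
+// A minimal smoke-test sketch (not part of the original commit): it assumes
+// the token stream for a tiny snippet and shows how the lexer is driven as
+// an iterator.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn lexes_simple_expression() {
+        let tokens: Vec<Token> = Lexer::new("1 + 2;").collect();
+        let types: Vec<&TokenType> = tokens.iter().map(|t| &t.tt).collect();
+        assert_eq!(types, vec![
+            &TokenType::Literal(Literal::Number(1)),
+            &TokenType::Plus,
+            &TokenType::Literal(Literal::Number(2)),
+            &TokenType::SemiColon,
+        ]);
+    }
+}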
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..65e6ee9
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,26 @@
+#![warn(
+    clippy::wildcard_imports,
+    clippy::string_add,
+    clippy::string_add_assign,
+    clippy::manual_ok_or,
+    unused_lifetimes
+)]
+
+pub mod lexer;
+
+use lexer::Lexer;
+
+const SOURCE: &str = r#"
+
+if 5 >= 7 {
+ print "Hello World";
+}
+
+"#;
+
+fn main() {
+    let lexer = Lexer::new(SOURCE);
+    for token in lexer {
+        print!("{} ", token.lexeme);
+    }
+}
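+
+// Running this should print each token's lexeme separated by spaces,
+// along the lines of: if 5 >= 7 { print "Hello World" ; }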