Implement basic lexer

This commit is contained in:
2021-05-15 18:04:54 -04:00
parent 9ffaca54a5
commit 3e8dd3f698
2 changed files with 138 additions and 1 deletions

134
src/lexer.rs Normal file
View File

@@ -0,0 +1,134 @@
/// A lexical token produced by the scanner.
///
/// `Clone`, `PartialEq` and `Eq` are derived in addition to `Debug` so token
/// streams can be duplicated and compared directly (for example with
/// `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A symbol name such as `a` or `+`.
    Identifier(String),
    /// A boolean literal, written `#t` or `#f` in the source text.
    Boolean(bool),
    /// A run of decimal digits parsed as an `i64`.
    Number(i64),
    /// An opening parenthesis, `(`.
    LeftRoundBracket,
    /// A closing parenthesis, `)`.
    RightRoundBracket,
    /// A single quote character, `'`.
    Quote,
}
/// The token list that the scanner functions take by value, append to, and return.
type Tokens = Vec<Token>;
/// Lexes `code` from its first byte and prints the resulting tokens to
/// standard output using their `Debug` representation.
///
/// No trailing newline is written. Any panic raised while scanning (for
/// example on an unsupported character) propagates to the caller.
pub fn read(code: &str) {
    let tokens = scan(code, 0, Vec::new());
    print!("{:?}", tokens);
}
/// Lexes `code` starting at byte offset `ix`, appending to `tokens`, and
/// returns the token list once the end of the input is reached.
///
/// Whitespace is skipped and the single-character tokens `(`, `)` and `'` are
/// handled here in a loop. Booleans, comments, numbers and identifiers are
/// delegated to their dedicated `scan_*` helper, which continues the scan
/// after the lexeme it consumed.
///
/// # Panics
///
/// Panics with `Unexpected char '<c>' at index <ix>` when the character at
/// `ix` does not start any supported token, and if `ix` is not a valid
/// character boundary inside `code`.
fn scan(code: &str, mut ix: usize, mut tokens: Tokens) -> Tokens {
    loop {
        // Decode the next character from the remaining input. Slicing exactly
        // one byte (`code[ix..ix + 1]`) would panic with a slicing error on a
        // multi-byte character instead of reaching the diagnostic below.
        let c = match code[ix..].chars().next() {
            Some(c) => c,
            None => return tokens,
        };

        match c {
            _ if c.is_ascii_whitespace() => {}
            _ if is_boolean(code, ix) => return scan_boolean(code, ix, tokens),
            ';' => return scan_comment(code, ix, tokens),
            _ if c.is_ascii_digit() => return scan_number(code, ix, tokens),
            _ if c.is_ascii_alphabetic() || SPECIAL_INITIAL.contains(&c) => {
                return scan_identifier(code, ix, tokens);
            }
            '(' => tokens.push(Token::LeftRoundBracket),
            ')' => tokens.push(Token::RightRoundBracket),
            '\'' => tokens.push(Token::Quote),
            // TODO: character | string | #(||`|,|,@|.
            _ => panic!("Unexpected char '{}' at index {}", c, ix),
        }

        // Every arm that falls through consumed exactly one ASCII byte.
        // Looping here, rather than recursing once per character, keeps long
        // runs of whitespace or brackets from deepening the call stack.
        ix += 1;
    }
}
/// Reports whether the two bytes of `code` beginning at byte offset `ix`
/// spell a boolean literal, `#t` or `#f`.
///
/// Returns `false` when that two-byte range runs past the end of `code` or
/// does not fall on character boundaries.
fn is_boolean(code: &str, ix: usize) -> bool {
    matches!(code.get(ix..ix + 2), Some("#t") | Some("#f"))
}
/// Consumes the two-byte boolean literal at byte offset `ix`, pushes the
/// matching `Token::Boolean`, and continues scanning right after it.
///
/// # Panics
///
/// Panics if the two bytes at `ix` are neither `#t` nor `#f`, or if that
/// range is not a valid slice of `code`.
fn scan_boolean(code: &str, ix: usize, mut tokens: Tokens) -> Tokens {
    let c = &code[ix..ix + 2];
    let value = match c {
        "#t" => true,
        "#f" => false,
        _ => panic!("Expected boolean but got {}", c),
    };
    tokens.push(Token::Boolean(value));
    scan(code, ix + 2, tokens)
}
/// Skips a `;` line comment that begins at byte offset `ix` and continues
/// scanning at the terminating newline, or at the end of input when the
/// comment is not terminated.
///
/// The newline itself is left for `scan` to consume.
///
/// # Panics
///
/// Panics if the text at `ix` does not start with `;`.
fn scan_comment(code: &str, ix: usize, tokens: Tokens) -> Tokens {
    let rest = &code[ix..];
    assert!(rest.starts_with(';'));
    // `find` yields a byte offset, so the resume position stays correct even
    // when the comment contains multi-byte characters. Advancing the offset
    // by one per `char` would fall short and resume inside the comment.
    let end = rest.find('\n').map_or(code.len(), |offset| ix + offset);
    scan(code, end, tokens)
}
/// Consumes the run of ASCII digits that starts at byte offset `ix`, pushes
/// it as a `Token::Number`, and continues scanning right after the digits.
///
/// # Panics
///
/// Panics with a message naming the literal and its offset when the digits
/// do not form a valid `i64` (for example on overflow, or when there is no
/// digit at `ix`).
fn scan_number(code: &str, ix: usize, mut tokens: Tokens) -> Tokens {
    // ASCII digits are one byte each, so the character count is also the
    // byte length of the literal.
    let digit_count = code[ix..]
        .chars()
        .take_while(|c| c.is_ascii_digit())
        .count();
    let end = ix + digit_count;
    let literal = &code[ix..end];
    let number = literal.parse::<i64>().unwrap_or_else(|err| {
        panic!("Invalid number literal '{}' at index {}: {}", literal, ix, err)
    });
    tokens.push(Token::Number(number));
    scan(code, end, tokens)
}
/// Consumes the identifier that starts at byte offset `ix`, pushes it as a
/// `Token::Identifier`, and continues scanning right after it.
///
/// The first character must be an ASCII letter or one of `SPECIAL_INITIAL`.
/// Each following character may be an ASCII letter or digit, any of
/// `SPECIAL_INITIAL`, or any of `SPECIAL_SUBSEQUENT`. This follows the R5RS
/// rule `subsequent = initial | digit | special subsequent`, so names such
/// as `set!`, `null?` and `list->string` lex as a single identifier.
///
/// # Panics
///
/// Panics if there is no character at `ix` or if it cannot start an
/// identifier.
fn scan_identifier(code: &str, ix: usize, mut tokens: Tokens) -> Tokens {
    let mut chars = code[ix..].chars();
    let first = chars
        .next()
        .expect("scan_identifier called at end of input");
    assert!(first.is_ascii_alphabetic() || SPECIAL_INITIAL.contains(&first));

    // Every accepted character is ASCII, so counting characters gives the
    // byte length of the identifier's tail.
    let tail_len = chars
        .take_while(|c| {
            c.is_ascii_alphanumeric()
                || SPECIAL_INITIAL.contains(c)
                || SPECIAL_SUBSEQUENT.contains(c)
        })
        .count();
    let end = ix + 1 + tail_len;

    tokens.push(Token::Identifier(code[ix..end].to_string()));
    scan(code, end, tokens)
}
/// Non-alphabetic characters accepted as the first character of an identifier.
///
/// `scan` and `scan_identifier` check this list alongside ASCII letters.
const SPECIAL_INITIAL: &[char] = &[
// FIXME: + and - are peculiar identifiers but let's keep it simple for now
'!', '$', '%', '&', '*', '/', ':', '<', '=', '>', '?', '^', '_', '~', '+', '-',
];
/// Punctuation that `scan_identifier` accepts after the first character of an
/// identifier, alongside ASCII letters and digits.
const SPECIAL_SUBSEQUENT: &[char] = &[
'+', '-', '.', '@',
];

View File

@@ -1,3 +1,6 @@
+mod lexer;
+
 fn main() {
-    println!("Hello, world!");
+    let scm_code = "(+ a 32)";
+    lexer::read(scm_code);
 }