//! Jack language tokenizer (N2T/jack_compiler/src/tokenizer.rs).
//! Splits a .jack source file into keyword, symbol, identifier,
//! integer-constant and string-constant tokens.
use crate::tokenizer;
use std::fs;
use std::path::Path;
/// A single lexical token of the Jack language, one variant per
/// token category the tokenizer can produce.
#[derive(Debug, Clone)]
pub enum Token {
    Keyword(Keyword),
    Symbol(Symbol),
    Identifier(String),
    // Integer constant value as parsed from a run of ASCII digits.
    IntConst(usize),
    // String constant payload, without the surrounding double quotes
    // (see `parse_string_constant`, which slices them off).
    StringConst(String),
}
/// Keywords recognized by the tokenizer, plus extra variants.
///
/// NOTE(review): `Constant`, `Argument`, `Local`, `That`, `Pointer` and
/// `Temp` are never produced by `parse_keyword_or_identifier` — they look
/// like VM memory-segment names, presumably reused by a later compiler
/// stage; confirm against the code generator.
/// `AnyKeyword` is a wildcard: in `equal` it matches any keyword token.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum Keyword {
    Class,
    Constructor,
    Function,
    Method,
    Field,
    Var,
    Int,
    Char,
    Boolean,
    Constant,
    Argument,
    Local,
    Static,
    That,
    Pointer,
    Temp,
    Void,
    True,
    False,
    Null,
    This,
    Let,
    Do,
    If,
    Else,
    While,
    Return,
    // Wildcard used for matching in `equal`; also the fallback returned
    // by `Token::to_keyword` for non-keyword tokens.
    AnyKeyword,
}
/// Symbol tokens; the character mapping is defined in `parse_symbol`.
///
/// NOTE(review): `ExclusiveAnd`/`ExclusiveOr` are produced for '&' and
/// '|', which are plain (non-exclusive) AND/OR — the names look like
/// misnomers. `LBrace`/`RBrace` actually map to parentheses. Confirm
/// before renaming, since the parser matches on these variants.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Symbol {
    LCurly,       // '{'
    RCurly,       // '}'
    LBrace,       // '(' — despite the name, this is a parenthesis
    RBrace,       // ')'
    LSquare,      // '['
    RSquare,      // ']'
    Dot,          // '.'
    Comma,        // ','
    Semicolon,    // ';'
    Plus,         // '+'
    Minus,        // '-'
    Mul,          // '*'
    Div,          // '/'
    ExclusiveAnd, // '&'
    ExclusiveOr,  // '|'
    Smaller,      // '<'
    Greater,      // '>'
    Equal,        // '='
    // Never produced by `parse_symbol` ('-' always becomes `Minus`);
    // presumably the parser reclassifies unary minus — confirm.
    UnaryMinus,
    Not,          // '~'
    // Wildcard used for matching in `equal`.
    AnySymbol,
}
/// A tokenized source file: the full token list plus a cursor used to
/// consume tokens in order (see `peek`, `eat`, `is`).
#[derive(Debug)]
pub struct Tokens {
    pub tokens: Vec<Token>,
    // Index of the next token to be inspected/consumed; starts at 0.
    index: usize,
}
/// Builds a wildcard `Identifier` token. `equal` compares identifiers by
/// category only, so the payload text is irrelevant for matching.
pub fn identifier() -> Token {
    Token::Identifier(String::default())
}
/// Builds a wildcard `IntConst` token. `equal` compares integer
/// constants by category only, so the stored value is irrelevant.
pub fn int_const() -> Token {
    Token::IntConst(Default::default())
}
/// Builds a wildcard `StringConst` token. `equal` compares string
/// constants by category only, so the payload text is irrelevant.
pub fn string_const() -> Token {
    Token::StringConst(Default::default())
}
/// Compares two tokens for matching purposes.
///
/// Keywords and symbols match when they are identical, or when `t1` is
/// the corresponding wildcard (`AnyKeyword` / `AnySymbol`). Identifiers,
/// integer constants and string constants match by category alone — the
/// payload is ignored. Tokens of different categories never match.
/// Note the asymmetry: the wildcard is only honored in the `t1` position.
pub fn equal(t1: &Token, t2: &Token) -> bool {
    match (t1, t2) {
        (Token::Keyword(k1), Token::Keyword(k2)) => k1 == k2 || *k1 == Keyword::AnyKeyword,
        (Token::Symbol(s1), Token::Symbol(s2)) => s1 == s2 || *s1 == Symbol::AnySymbol,
        (Token::Identifier(_), Token::Identifier(_)) => true,
        (Token::IntConst(_), Token::IntConst(_)) => true,
        (Token::StringConst(_), Token::StringConst(_)) => true,
        _ => false,
    }
}
impl Token {
pub fn to_string(&self) -> String {
match self {
Token::Identifier(s) => s.to_string(),
Token::StringConst(s) => s.to_string(),
_ => panic!("Cannot convert {:?} to string.", self),
}
}
pub fn to_keyword(&self) -> tokenizer::Keyword {
match self {
tokenizer::Token::Keyword(k) => k.clone(),
_ => tokenizer::Keyword::AnyKeyword,
}
}
pub fn to_int(&self) -> usize {
match self {
tokenizer::Token::IntConst(i) => *i,
_ => panic!("Cannot convert {:?} to int.", self),
}
}
pub fn to_symbol(&self) -> tokenizer::Symbol {
match self {
tokenizer::Token::Symbol(s) => s.clone(),
_ => panic!("Cannot convert {:?} to symbol.", self),
}
}
}
impl Tokens {
    /// Returns a clone of the current token without consuming it.
    /// Panics when the cursor is past the end of the stream.
    pub fn peek(&self) -> Token {
        self.get_token(self.index)
    }

    /// Returns a clone of the token at `index`; panics when out of bounds.
    fn get_token(&self, index: usize) -> Token {
        self.tokens
            .get(index)
            .expect("token index out of bounds")
            .clone()
    }

    /// Consumes and returns the current token, panicking when it does not
    /// match `expected_token` (wildcard rules per `equal`).
    pub fn eat(&mut self, expected_token: Token) -> Token {
        let t = self.get_token(self.index);
        self.index += 1;
        if !equal(&expected_token, &t) {
            panic!("Expected {:?} but got {:?}.", expected_token, t);
        }
        t
    }

    /// Consumes and returns the current token when it matches any entry
    /// of `tokens`; panics otherwise.
    pub fn eat_one_of(&mut self, tokens: Vec<Token>) -> Token {
        let t2 = self.get_token(self.index);
        if tokens.iter().any(|t1| equal(t1, &t2)) {
            self.index += 1;
            return t2;
        }
        panic!("Expected one of {:?} but got {:?}.", tokens, t2);
    }

    /// Returns whether the current token matches `expected_token`,
    /// without consuming it.
    pub fn is(&self, expected_token: Token) -> bool {
        equal(&expected_token, &self.get_token(self.index))
    }

    /// Returns whether the current token matches any entry of `tokens`,
    /// without consuming anything.
    pub fn is_one_of(&self, tokens: Vec<Token>) -> bool {
        let t2 = self.get_token(self.index);
        tokens.iter().any(|t1| equal(t1, &t2))
    }

    /// Returns whether the upcoming tokens match `tokens` in order,
    /// without consuming anything. Panics when the sequence would run
    /// past the end of the stream before a mismatch is found.
    pub fn is_sequence(&self, tokens: Vec<Token>) -> bool {
        tokens
            .iter()
            .enumerate()
            .all(|(offset, t1)| equal(t1, &self.get_token(self.index + offset)))
    }
}
/// Reads the file at `file` and tokenizes its whole contents.
///
/// Comments and whitespace are skipped; characters that start no token
/// are reported on stdout and ignored. The returned `Tokens` has its
/// cursor at position 0.
///
/// # Panics
/// Panics when the file cannot be read as UTF-8 text.
pub fn tokenize_file(file: &Path) -> Tokens {
    let mut tokens = vec![];
    let chars: Vec<char> = fs::read_to_string(file)
        .expect("cannot read source file")
        .chars()
        .collect();
    let length = chars.len();
    let mut index: usize = 0;
    while index < length {
        index = eat_comment(&chars, index);
        // A trailing comment (or comments right before EOF) can consume
        // the rest of the input; without this guard the indexing below
        // would panic past the end of `chars`.
        if index >= length {
            break;
        }
        let c = chars[index];
        if c.is_whitespace() {
            index += 1;
        } else if parse_symbol(&chars, &mut tokens, index) != index {
            // A symbol token has already been pushed by `parse_symbol`.
            index += 1;
        } else if c.is_ascii_alphabetic() {
            index = parse_keyword_or_identifier(&chars, &mut tokens, index);
        } else if c.is_ascii_digit() {
            index = parse_integer_constant(&chars, &mut tokens, index);
        } else if c == '"' {
            index = parse_string_constant(&chars, &mut tokens, index);
        } else {
            println!("Unexpected char {:?}", c);
            index += 1;
        }
    }
    Tokens { tokens, index: 0 }
}
/// Skips any run of `//` line comments and `/* ... */` block comments
/// starting at `index`, and returns the index of the first character
/// after them (which may be `chars.len()` when a comment reaches EOF).
///
/// Fixes three out-of-bounds panics in the original: a lone '/' as the
/// last character, a line comment with no trailing newline, and an
/// unterminated block comment (now treated as running to EOF).
fn eat_comment(chars: &[char], index: usize) -> usize {
    let mut index = index;
    loop {
        if index + 1 < chars.len() && chars[index] == '/' && chars[index + 1] == '/' {
            // Line comment: skip to the end of the line (or EOF).
            index += 2;
            while index < chars.len() && chars[index] != '\n' {
                index += 1;
            }
            if index < chars.len() {
                index += 1; // consume the newline itself
            }
        } else if index + 1 < chars.len() && chars[index] == '/' && chars[index + 1] == '*' {
            // Block comment: skip to the closing "*/" (or EOF if missing).
            index += 2;
            while index + 1 < chars.len() && !(chars[index] == '*' && chars[index + 1] == '/') {
                index += 1;
            }
            index = (index + 2).min(chars.len());
        } else {
            // No further comment starts here; adjacent comments have been
            // consumed by looping instead of the original's recursion.
            return index;
        }
    }
}
/// When the character at `index` is a Jack symbol, pushes the matching
/// `Symbol` token and returns `index + 1`; otherwise pushes nothing and
/// returns `index` unchanged (the caller uses this to detect a match).
fn parse_symbol(chars: &Vec<char>, tokens: &mut Vec<Token>, index: usize) -> usize {
    use Symbol::*;
    let symbol = match chars[index] {
        '{' => LCurly,
        '}' => RCurly,
        '(' => LBrace,
        ')' => RBrace,
        '[' => LSquare,
        ']' => RSquare,
        '.' => Dot,
        ',' => Comma,
        ';' => Semicolon,
        '+' => Plus,
        '-' => Minus,
        '*' => Mul,
        '/' => Div,
        '&' => ExclusiveAnd,
        '|' => ExclusiveOr,
        '<' => Smaller,
        '>' => Greater,
        '=' => Equal,
        '~' => Not,
        // Not a symbol character: report "no progress" to the caller.
        _ => return index,
    };
    tokens.push(Token::Symbol(symbol));
    index + 1
}
/// Parses a run of ASCII digits starting at `index` into an `IntConst`
/// token and returns the index of the first unconsumed character. When
/// there is no digit at `index`, nothing is pushed and `index` is
/// returned unchanged.
///
/// Fixes an out-of-bounds panic when the digit run extends to EOF.
fn parse_integer_constant(chars: &[char], tokens: &mut Vec<Token>, index: usize) -> usize {
    let start_index = index;
    let mut index = index;
    let mut number = String::new();
    // Bounds check first so a file ending in a digit does not panic.
    while index < chars.len() && chars[index].is_ascii_digit() {
        number.push(chars[index]);
        index += 1;
    }
    if start_index == index {
        return index;
    }
    // NOTE(review): accepts any usize-sized literal; no range check is
    // applied here — confirm whether later stages enforce Jack's limits.
    let number: usize = number.parse().expect("digit run must parse as usize");
    tokens.push(Token::IntConst(number));
    index
}
/// Parses a double-quoted string constant starting at `index` into a
/// `StringConst` token (quotes stripped) and returns the index of the
/// first unconsumed character. When `index` does not sit on a '"',
/// nothing is pushed and `index` is returned unchanged.
///
/// Fixes an out-of-bounds panic on an unterminated string constant:
/// the scan now stops at EOF and the partial content becomes the token.
fn parse_string_constant(chars: &[char], tokens: &mut Vec<Token>, index: usize) -> usize {
    if index >= chars.len() || chars[index] != '"' {
        return index;
    }
    let content_start = index + 1; // first character after the opening quote
    let mut end = content_start;
    while end < chars.len() && chars[end] != '"' {
        end += 1;
    }
    let s: String = chars[content_start..end].iter().collect();
    tokens.push(Token::StringConst(s));
    // Consume the closing quote when one was found before EOF.
    if end < chars.len() {
        end + 1
    } else {
        end
    }
}
/// Parses a keyword or identifier starting at `index`, pushes the
/// matching token, and returns the index of the first unconsumed
/// character. When `index` does not sit on an ASCII letter, nothing is
/// pushed and `index` is returned unchanged.
///
/// Fixes an out-of-bounds panic when the word extends to EOF.
///
/// NOTE(review): continuation characters use the Unicode-aware
/// `is_alphanumeric`, so non-ASCII letters are accepted after the first
/// character; also an identifier cannot start with '_' here — confirm
/// both against the intended Jack grammar.
fn parse_keyword_or_identifier(chars: &[char], tokens: &mut Vec<Token>, index: usize) -> usize {
    let mut index = index;
    if index >= chars.len() || !chars[index].is_ascii_alphabetic() {
        return index;
    }
    let mut token_string = String::new();
    token_string.push(chars[index]);
    index += 1;
    // Bounds check first so a word running to EOF does not panic.
    while index < chars.len() && (chars[index].is_alphanumeric() || chars[index] == '_') {
        token_string.push(chars[index]);
        index += 1;
    }
    use Keyword::*;
    // Reserved words become keyword tokens; anything else is an identifier.
    let t = match token_string.as_str() {
        "class" => Token::Keyword(Class),
        "constructor" => Token::Keyword(Constructor),
        "function" => Token::Keyword(Function),
        "method" => Token::Keyword(Method),
        "field" => Token::Keyword(Field),
        "static" => Token::Keyword(Static),
        "var" => Token::Keyword(Var),
        "int" => Token::Keyword(Int),
        "char" => Token::Keyword(Char),
        "boolean" => Token::Keyword(Boolean),
        "void" => Token::Keyword(Void),
        "true" => Token::Keyword(True),
        "false" => Token::Keyword(False),
        "null" => Token::Keyword(Null),
        "this" => Token::Keyword(This),
        "let" => Token::Keyword(Let),
        "do" => Token::Keyword(Do),
        "if" => Token::Keyword(If),
        "else" => Token::Keyword(Else),
        "while" => Token::Keyword(While),
        "return" => Token::Keyword(Return),
        s => Token::Identifier(s.to_string()),
    };
    tokens.push(t);
    index
}
/// Debug helper: prints the characters in `chars[start..stop]` as a
/// single string (Debug-quoted) on stdout.
#[allow(dead_code)]
fn print_vector_slice(chars: &Vec<char>, start: usize, stop: usize) {
    let slice: String = chars[start..stop].iter().collect();
    println!("{:?}", slice);
}