diff --git a/qlearning_robot/QLearner.py b/qlearning_robot/QLearner.py index dbae5f8..dd1ac9a 100644 --- a/qlearning_robot/QLearner.py +++ b/qlearning_robot/QLearner.py @@ -1,72 +1,115 @@ -""" -Template for implementing QLearner (c) 2015 Tucker Balch - -Copyright 2018, Georgia Institute of Technology (Georgia Tech) -Atlanta, Georgia 30332 -All Rights Reserved - -Template code for CS 4646/7646 - -Georgia Tech asserts copyright ownership of this template and all derivative -works, including solutions to the projects assigned in this course. Students -and other users of this template code are advised not to share it with others -or to make it available on publicly viewable websites including repositories -such as github and gitlab. This copyright statement should not be removed -or edited. - -We do grant permission to share solutions privately with non-students such -as potential employers. However, sharing with other current or future -students of CS 7646 is prohibited and subject to being investigated as a -GT honor code violation. - ------do not edit anything above this line--- - -Student Name: Tucker Balch (replace with your name) -GT User ID: tb34 (replace with your User ID) -GT ID: 900897987 (replace with your GT ID) -""" - -import numpy as np -import random as rand - -class QLearner(object): - - def __init__(self, \ - num_states=100, \ - num_actions = 4, \ - alpha = 0.2, \ - gamma = 0.9, \ - rar = 0.5, \ - radr = 0.99, \ - dyna = 0, \ - verbose = False): - - self.verbose = verbose - self.num_actions = num_actions - self.s = 0 - self.a = 0 - - def querysetstate(self, s): - """ - @summary: Update the state without updating the Q-table - @param s: The new state - @returns: The selected action - """ - self.s = s - action = rand.randint(0, self.num_actions-1) - if self.verbose: print(f"s = {s}, a = {action}") - return action - - def query(self,s_prime,r): - """ - @summary: Update the Q table and return an action - @param s_prime: The new state - @param r: The reward - @returns: The selected action - """ - action = rand.randint(0, self.num_actions-1) - if self.verbose: print(f"s = {s_prime}, a = {action}, r={r}") - return action - -if __name__=="__main__": - print("Remember Q from Star Trek? Well, this isn't him") +""" +Template for implementing QLearner (c) 2015 Tucker Balch + +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- +""" + +import numpy as np +import random as rand + + +class QLearner(object): + + def __init__(self, + num_states=100, + num_actions=4, + alpha=0.2, + gamma=0.9, + rar=0.5, + radr=0.99, + dyna=0, + verbose=False): + + self.verbose = verbose + self.num_actions = num_actions + self.num_states = num_states + self.s = 0 + self.a = 0 + self.alpha = alpha + self.gamma = gamma + self.rar = rar + self.radr = radr + self.dyna = dyna + + # self.q = np.random.random((num_states, num_actions)) + self.q = np.zeros((num_states, num_actions)) + + def _get_a(self, s): + """Get best action for state. Considers rar.""" + if rand.random() < self.rar: + a = rand.randint(0, self.num_actions - 1) + else: + a = np.argmax(self.q[s]) + return a + + def _update_q(self, s, a, s_prime, r): + """Updates the Q table.""" + q_old = self.q[s][a] + + # estimate optimal future value + a_max = np.argmax(self.q[s_prime]) + q_future = self.q[s_prime][a_max] + + # calculate new value and update table + q_new = q_old + self.alpha * (r + self.gamma * q_future - q_old) + self.q[s][a] = q_new + + if self.verbose: + print(f"{q_old=} {q_future=} {q_new=}") + + def querysetstate(self, s): + """ + @summary: Update the state without updating the Q-table + @param s: The new state + @returns: The selected action + """ + a = self._get_a(s) + if self.verbose: + print(f"s = {s}, a = {a}") + self.s = s + self.a = a + return self.a + + def query(self, s_prime, r): + """ + @summary: Update the Q table and return an action + @param s_prime: The new state + @param r: The reward + @returns: The selected action + """ + self._update_q(self.s, self.a, s_prime, r) + self.a = self._get_a(s_prime) + self.s = s_prime + if self.verbose: + print(f"s = {s_prime}, a = {self.a}, r={r}") + # Update random action rate + self.rar = self.rar * self.radr + return self.a + + def author(self): + return 'felixm' + + +if __name__ == "__main__": + q = QLearner(verbose=True) + print(q.querysetstate(2)) + q.query(15, 1.00) + print(q.querysetstate(15)) diff --git a/qlearning_robot/testqlearner.py b/qlearning_robot/testqlearner.py index febbf28..223b562 100644 --- a/qlearning_robot/testqlearner.py +++ b/qlearning_robot/testqlearner.py @@ -1,189 +1,189 @@ -""" -Test a Q Learner in a navigation problem. (c) 2015 Tucker Balch -2016-10-20 Added "quicksand" and uncertain actions. - -Copyright 2018, Georgia Institute of Technology (Georgia Tech) -Atlanta, Georgia 30332 -All Rights Reserved - -Template code for CS 4646/7646 - -Georgia Tech asserts copyright ownership of this template and all derivative -works, including solutions to the projects assigned in this course. Students -and other users of this template code are advised not to share it with others -or to make it available on publicly viewable websites including repositories -such as github and gitlab. This copyright statement should not be removed -or edited. - -We do grant permission to share solutions privately with non-students such -as potential employers. However, sharing with other current or future -students of CS 7646 is prohibited and subject to being investigated as a -GT honor code violation. - ------do not edit anything above this line--- - -Student Name: Tucker Balch (replace with your name) -GT User ID: tb34 (replace with your User ID) -GT ID: 900897987 (replace with your GT ID) -""" - -import numpy as np -import random as rand -import time -import math -import QLearner as ql - -# print out the map -def printmap(data): - print("--------------------") - for row in range(0, data.shape[0]): - for col in range(0, data.shape[1]): - if data[row,col] == 0: # Empty space - print(" ", end=' ') - if data[row,col] == 1: # Obstacle - print("O", end=' ') - if data[row,col] == 2: # El roboto - print("*", end=' ') - if data[row,col] == 3: # Goal - print("X", end=' ') - if data[row,col] == 4: # Trail - print(".", end=' ') - if data[row,col] == 5: # Quick sand - print("~", end=' ') - if data[row,col] == 6: # Stepped in quicksand - print("@", end=' ') - print() - print("--------------------") - -# find where the robot is in the map -def getrobotpos(data): - R = -999 - C = -999 - for row in range(0, data.shape[0]): - for col in range(0, data.shape[1]): - if data[row,col] == 2: - C = col - R = row - if (R+C)<0: - print("warning: start location not defined") - return R, C - -# find where the goal is in the map -def getgoalpos(data): - R = -999 - C = -999 - for row in range(0, data.shape[0]): - for col in range(0, data.shape[1]): - if data[row,col] == 3: - C = col - R = row - if (R+C)<0: - print("warning: goal location not defined") - return (R, C) - -# move the robot and report reward -def movebot(data,oldpos,a): - testr, testc = oldpos - - randomrate = 0.20 # how often do we move randomly - quicksandreward = -100 # penalty for stepping on quicksand - - # decide if we're going to ignore the action and - # choose a random one instead - if rand.uniform(0.0, 1.0) <= randomrate: # going rogue - a = rand.randint(0,3) # choose the random direction - - # update the test location - if a == 0: #north - testr = testr - 1 - elif a == 1: #east - testc = testc + 1 - elif a == 2: #south - testr = testr + 1 - elif a == 3: #west - testc = testc - 1 - - reward = -1 # default reward is negative one - # see if it is legal. if not, revert - if testr < 0: # off the map - testr, testc = oldpos - elif testr >= data.shape[0]: # off the map - testr, testc = oldpos - elif testc < 0: # off the map - testr, testc = oldpos - elif testc >= data.shape[1]: # off the map - testr, testc = oldpos - elif data[testr, testc] == 1: # it is an obstacle - testr, testc = oldpos - elif data[testr, testc] == 5: # it is quicksand - reward = quicksandreward - data[testr, testc] = 6 # mark the event - elif data[testr, testc] == 6: # it is still quicksand - reward = quicksandreward - data[testr, testc] = 6 # mark the event - elif data[testr, testc] == 3: # it is the goal - reward = 1 # for reaching the goal - - return (testr, testc), reward #return the new, legal location - -# convert the location to a single integer -def discretize(pos): - return pos[0]*10 + pos[1] - -def test(map, epochs, learner, verbose): -# each epoch involves one trip to the goal - startpos = getrobotpos(map) #find where the robot starts - goalpos = getgoalpos(map) #find where the goal is - scores = np.zeros((epochs,1)) - for epoch in range(1,epochs+1): - total_reward = 0 - data = map.copy() - robopos = startpos - state = discretize(robopos) #convert the location to a state - action = learner.querysetstate(state) #set the state and get first action - count = 0 - while (robopos != goalpos) & (count<10000): - - #move to new location according to action and then get a new action - newpos, stepreward = movebot(data,robopos,action) - if newpos == goalpos: - r = 1 # reward for reaching the goal - else: - r = stepreward # negative reward for not being at the goal - state = discretize(newpos) - action = learner.query(state,r) - - if data[robopos] != 6: - data[robopos] = 4 # mark where we've been for map printing - if data[newpos] != 6: - data[newpos] = 2 # move to new location - robopos = newpos # update the location - #if verbose: time.sleep(1) - total_reward += stepreward - count = count + 1 - if count == 100000: - print("timeout") - if verbose: printmap(data) - if verbose: print(f"{epoch}, {total_reward}") - scores[epoch-1,0] = total_reward - return np.median(scores) - -# run the code to test a learner -def test_code(): - - verbose = True # print lots of debug stuff if True - - # read in the map - filename = 'testworlds/world01.csv' - inf = open(filename) - data = np.array([list(map(float,s.strip().split(','))) for s in inf.readlines()]) - originalmap = data.copy() #make a copy so we can revert to the original map later - - if verbose: printmap(data) - - rand.seed(5) - - ######## run non-dyna test ######## +""" +Test a Q Learner in a navigation problem. (c) 2015 Tucker Balch +2016-10-20 Added "quicksand" and uncertain actions. + +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- + +Student Name: Tucker Balch (replace with your name) +GT User ID: tb34 (replace with your User ID) +GT ID: 900897987 (replace with your GT ID) +""" + +import numpy as np +import random as rand +import time +import math +import QLearner as ql + +# print out the map +def printmap(data): + print("--------------------") + for row in range(0, data.shape[0]): + for col in range(0, data.shape[1]): + if data[row,col] == 0: # Empty space + print(" ", end=' ') + if data[row,col] == 1: # Obstacle + print("O", end=' ') + if data[row,col] == 2: # El roboto + print("*", end=' ') + if data[row,col] == 3: # Goal + print("X", end=' ') + if data[row,col] == 4: # Trail + print(".", end=' ') + if data[row,col] == 5: # Quick sand + print("~", end=' ') + if data[row,col] == 6: # Stepped in quicksand + print("@", end=' ') + print() + print("--------------------") + +# find where the robot is in the map +def getrobotpos(data): + R = -999 + C = -999 + for row in range(0, data.shape[0]): + for col in range(0, data.shape[1]): + if data[row,col] == 2: + C = col + R = row + if (R+C)<0: + print("warning: start location not defined") + return R, C + +# find where the goal is in the map +def getgoalpos(data): + R = -999 + C = -999 + for row in range(0, data.shape[0]): + for col in range(0, data.shape[1]): + if data[row,col] == 3: + C = col + R = row + if (R+C)<0: + print("warning: goal location not defined") + return (R, C) + +# move the robot and report reward +def movebot(data,oldpos,a): + testr, testc = oldpos + + randomrate = 0.20 # how often do we move randomly + quicksandreward = -100 # penalty for stepping on quicksand + + # decide if we're going to ignore the action and + # choose a random one instead + if rand.uniform(0.0, 1.0) <= randomrate: # going rogue + a = rand.randint(0,3) # choose the random direction + + # update the test location + if a == 0: #north + testr = testr - 1 + elif a == 1: #east + testc = testc + 1 + elif a == 2: #south + testr = testr + 1 + elif a == 3: #west + testc = testc - 1 + + reward = -1 # default reward is negative one + # see if it is legal. if not, revert + if testr < 0: # off the map + testr, testc = oldpos + elif testr >= data.shape[0]: # off the map + testr, testc = oldpos + elif testc < 0: # off the map + testr, testc = oldpos + elif testc >= data.shape[1]: # off the map + testr, testc = oldpos + elif data[testr, testc] == 1: # it is an obstacle + testr, testc = oldpos + elif data[testr, testc] == 5: # it is quicksand + reward = quicksandreward + data[testr, testc] = 6 # mark the event + elif data[testr, testc] == 6: # it is still quicksand + reward = quicksandreward + data[testr, testc] = 6 # mark the event + elif data[testr, testc] == 3: # it is the goal + reward = 1 # for reaching the goal + + return (testr, testc), reward #return the new, legal location + +# convert the location to a single integer +def discretize(pos): + return pos[0]*10 + pos[1] + +def test(map, epochs, learner, verbose): +# each epoch involves one trip to the goal + startpos = getrobotpos(map) #find where the robot starts + goalpos = getgoalpos(map) #find where the goal is + scores = np.zeros((epochs,1)) + for epoch in range(1,epochs+1): + total_reward = 0 + data = map.copy() + robopos = startpos + state = discretize(robopos) #convert the location to a state + action = learner.querysetstate(state) #set the state and get first action + count = 0 + while (robopos != goalpos) & (count<10000): + + #move to new location according to action and then get a new action + newpos, stepreward = movebot(data,robopos,action) + if newpos == goalpos: + r = 1 # reward for reaching the goal + else: + r = stepreward # negative reward for not being at the goal + state = discretize(newpos) + action = learner.query(state,r) + + if data[robopos] != 6: + data[robopos] = 4 # mark where we've been for map printing + if data[newpos] != 6: + data[newpos] = 2 # move to new location + robopos = newpos # update the location + #if verbose: time.sleep(1) + total_reward += stepreward + count = count + 1 + if count == 100000: + print("timeout") + if verbose: printmap(data) + if verbose: print(f"{epoch}, {total_reward}") + scores[epoch-1,0] = total_reward + return np.median(scores) + +# run the code to test a learner +def test_code(): + + verbose = True # print lots of debug stuff if True + + # read in the map + filename = 'testworlds/world01.csv' + inf = open(filename) + data = np.array([list(map(float,s.strip().split(','))) for s in inf.readlines()]) + originalmap = data.copy() #make a copy so we can revert to the original map later + + if verbose: printmap(data) + + rand.seed(5) + + ######## run non-dyna test ######## learner = ql.QLearner(num_states=100,\ num_actions = 4, \ alpha = 0.2, \ @@ -191,14 +191,14 @@ def test_code(): rar = 0.98, \ radr = 0.999, \ dyna = 0, \ - verbose=False) #initialize the learner - epochs = 500 - total_reward = test(data, epochs, learner, verbose) - print(f"{epochs}, median total_reward {total_reward}") - print() - non_dyna_score = total_reward - - ######## run dyna test ######## + verbose=False) #initialize the learner + epochs = 500 + total_reward = test(data, epochs, learner, verbose) + print(f"{epochs}, median total_reward {total_reward}") + print() + non_dyna_score = total_reward + + ######## run dyna test ######## learner = ql.QLearner(num_states=100,\ num_actions = 4, \ alpha = 0.2, \ @@ -206,18 +206,18 @@ def test_code(): rar = 0.5, \ radr = 0.99, \ dyna = 200, \ - verbose=False) #initialize the learner - epochs = 50 - data = originalmap.copy() - total_reward = test(data, epochs, learner, verbose) - print(f"{epochs}, median total_reward {total_reward}") - dyna_score = total_reward - - print() - print() - print(f"results for {filename}") - print(f"non_dyna_score: {non_dyna_score}") - print(f"dyna_score : {dyna_score}") - -if __name__=="__main__": - test_code() + verbose=False) #initialize the learner + epochs = 50 + data = originalmap.copy() + total_reward = test(data, epochs, learner, verbose) + print(f"{epochs}, median total_reward {total_reward}") + dyna_score = total_reward + + print() + print() + print(f"results for {filename}") + print(f"non_dyna_score: {non_dyna_score}") + print(f"dyna_score : {dyna_score}") + +if __name__=="__main__": + test_code()