Implement Q learner

2020-10-18 14:44:32 -04:00
parent f5e91eba0a
commit d5aa22e9dd
2 changed files with 324 additions and 281 deletions
@@ -1,72 +1,115 @@
-"""  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Template for implementing QLearner  (c) 2015 Tucker Balch  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Copyright 2018, Georgia Institute of Technology (Georgia Tech)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Atlanta, Georgia 30332  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-All Rights Reserved  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Template code for CS 4646/7646  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Georgia Tech asserts copyright ownership of this template and all derivative  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-works, including solutions to the projects assigned in this course. Students  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-and other users of this template code are advised not to share it with others  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-or to make it available on publicly viewable websites including repositories  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-such as github and gitlab.  This copyright statement should not be removed  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-or edited.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-We do grant permission to share solutions privately with non-students such  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-as potential employers. However, sharing with other current or future  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-students of CS 7646 is prohibited and subject to being investigated as a  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-GT honor code violation.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-----do not edit anything above this line---  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Student Name: Tucker Balch (replace with your name)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-GT User ID: tb34 (replace with your User ID)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-GT ID: 900897987 (replace with your GT ID)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-"""  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import numpy as np  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import random as rand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-class QLearner(object):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    def __init__(self, \
-        num_states=100, \
-        num_actions = 4, \
-        alpha = 0.2, \
-        gamma = 0.9, \
-        rar = 0.5, \
-        radr = 0.99, \
-        dyna = 0, \
-        verbose = False):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        self.verbose = verbose  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        self.num_actions = num_actions  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        self.s = 0  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        self.a = 0  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    def querysetstate(self, s):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @summary: Update the state without updating the Q-table  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @param s: The new state  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @returns: The selected action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        self.s = s  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        action = rand.randint(0, self.num_actions-1)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        if self.verbose: print(f"s = {s}, a = {action}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        return action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    def query(self,s_prime,r):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @summary: Update the Q table and return an action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @param s_prime: The new state  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @param r: The reward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        @returns: The selected action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        action = rand.randint(0, self.num_actions-1)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        if self.verbose: print(f"s = {s_prime}, a = {action}, r={r}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        return action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-if __name__=="__main__":  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print("Remember Q from Star Trek? Well, this isn't him")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+"""
+Template for implementing QLearner  (c) 2015 Tucker Balch
+
+Copyright 2018, Georgia Institute of Technology (Georgia Tech)
+Atlanta, Georgia 30332
+All Rights Reserved
+
+Template code for CS 4646/7646
+
+Georgia Tech asserts copyright ownership of this template and all derivative
+works, including solutions to the projects assigned in this course. Students
+and other users of this template code are advised not to share it with others
+or to make it available on publicly viewable websites including repositories
+such as github and gitlab.  This copyright statement should not be removed
+or edited.
+
+We do grant permission to share solutions privately with non-students such
+as potential employers. However, sharing with other current or future
+students of CS 7646 is prohibited and subject to being investigated as a
+GT honor code violation.
+
+-----do not edit anything above this line---
+"""
+
+import numpy as np
+import random as rand
+
+
+class QLearner(object):
+    """
+    Tabular Q-learner with epsilon-greedy exploration and optional Dyna-Q.
+
+    The Q-table is a (num_states, num_actions) array initialised to zeros.
+    Call querysetstate() to set the state without learning, then query()
+    once per environment step to learn from the observed transition and
+    obtain the next action.
+    """
+
+    def __init__(self,
+                 num_states=100,
+                 num_actions=4,
+                 alpha=0.2,
+                 gamma=0.9,
+                 rar=0.5,
+                 radr=0.99,
+                 dyna=0,
+                 verbose=False):
+        """
+        @summary: Create the learner and its zero-initialised Q-table
+        @param num_states: Number of states (rows of the Q-table)
+        @param num_actions: Number of actions (columns of the Q-table)
+        @param alpha: Learning rate used in the Q update
+        @param gamma: Discount rate applied to the future value estimate
+        @param rar: Probability of choosing a random action
+        @param radr: Factor rar is multiplied by after every query()
+        @param dyna: Number of simulated (Dyna-Q) updates per query()
+        @param verbose: Print debugging information if True
+        """
+        self.verbose = verbose
+        self.num_actions = num_actions
+        self.num_states = num_states
+        self.s = 0
+        self.a = 0
+        self.alpha = alpha
+        self.gamma = gamma
+        self.rar = rar
+        self.radr = radr
+        self.dyna = dyna
+
+        self.q = np.zeros((num_states, num_actions))
+
+        # Dyna-Q model: the most recently observed successor state and
+        # reward for each (state, action) pair. _seen lists every pair that
+        # has been experienced at least once so replay can sample from it.
+        self._model_s = np.zeros((num_states, num_actions), dtype=int)
+        self._model_r = np.zeros((num_states, num_actions))
+        self._known = np.zeros((num_states, num_actions), dtype=bool)
+        self._seen = []
+
+    def _get_a(self, s):
+        """Return a random action with probability rar, else the greedy one."""
+        if rand.random() < self.rar:
+            return rand.randint(0, self.num_actions - 1)
+        # int() so both branches hand back a plain Python int
+        return int(np.argmax(self.q[s]))
+
+    def _learn(self, s, a, s_prime, r):
+        """Apply one Q update and return (q_old, q_future, q_new)."""
+        q_old = self.q[s][a]
+        # estimate of the optimal future value from s_prime
+        q_future = np.max(self.q[s_prime])
+        q_new = q_old + self.alpha * (r + self.gamma * q_future - q_old)
+        self.q[s][a] = q_new
+        return q_old, q_future, q_new
+
+    def _update_q(self, s, a, s_prime, r):
+        """Updates the Q table from a real experience tuple."""
+        q_old, q_future, q_new = self._learn(s, a, s_prime, r)
+        if self.verbose:
+            print(f"{q_old=} {q_future=} {q_new=}")
+
+    def _update_model(self, s, a, s_prime, r):
+        """Remember the latest outcome of taking action a in state s."""
+        if not self._known[s][a]:
+            self._known[s][a] = True
+            self._seen.append((s, a))
+        self._model_s[s][a] = s_prime
+        self._model_r[s][a] = r
+
+    def _run_dyna(self):
+        """Replay self.dyna remembered transitions to refine the Q-table."""
+        for _ in range(self.dyna):
+            s, a = rand.choice(self._seen)
+            # silent update: verbose output is reserved for real experience
+            self._learn(s, a, self._model_s[s][a], self._model_r[s][a])
+
+    def querysetstate(self, s):
+        """
+        @summary: Update the state without updating the Q-table
+        @param s: The new state
+        @returns: The selected action
+        """
+        a = self._get_a(s)
+        if self.verbose:
+            print(f"s = {s}, a = {a}")
+        self.s = s
+        self.a = a
+        return self.a
+
+    def query(self, s_prime, r):
+        """
+        @summary: Update the Q table and return an action
+        @param s_prime: The new state
+        @param r: The reward
+        @returns: The selected action
+        """
+        self._update_q(self.s, self.a, s_prime, r)
+        if self.dyna > 0:
+            # Dyna-Q: record the real transition, then learn from replayed
+            # ones. Skipped entirely for dyna == 0, so plain Q-learning
+            # consumes exactly the same random numbers as before.
+            self._update_model(self.s, self.a, s_prime, r)
+            self._run_dyna()
+        self.a = self._get_a(s_prime)
+        self.s = s_prime
+        if self.verbose:
+            print(f"s = {s_prime}, a = {self.a}, r={r}")
+        # Update random action rate
+        self.rar = self.rar * self.radr
+        return self.a
+
+    def author(self):
+        """Return the user ID of the code's author."""
+        return 'felixm'
+
+
+if __name__ == "__main__":
+    # Manual smoke test: choose an action, learn from one transition, then
+    # choose again from the state that was just reached.
+    learner = QLearner(verbose=True)
+    first_action = learner.querysetstate(2)
+    print(first_action)
+    learner.query(15, 1.00)
+    second_action = learner.querysetstate(15)
+    print(second_action)
@@ -1,189 +1,189 @@
-"""  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Test a Q Learner in a navigation problem.  (c) 2015 Tucker Balch  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-2016-10-20 Added "quicksand" and uncertain actions.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Copyright 2018, Georgia Institute of Technology (Georgia Tech)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Atlanta, Georgia 30332  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-All Rights Reserved  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Template code for CS 4646/7646  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Georgia Tech asserts copyright ownership of this template and all derivative  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-works, including solutions to the projects assigned in this course. Students  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-and other users of this template code are advised not to share it with others  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-or to make it available on publicly viewable websites including repositories  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-such as github and gitlab.  This copyright statement should not be removed  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-or edited.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-We do grant permission to share solutions privately with non-students such  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-as potential employers. However, sharing with other current or future  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-students of CS 7646 is prohibited and subject to being investigated as a  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-GT honor code violation.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-----do not edit anything above this line---  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-Student Name: Tucker Balch (replace with your name)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-GT User ID: tb34 (replace with your User ID)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-GT ID: 900897987 (replace with your GT ID)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-"""  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import numpy as np  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import random as rand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import time  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import math  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-import QLearner as ql  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# print out the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def printmap(data):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print("--------------------")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    for row in range(0, data.shape[0]):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        for col in range(0, data.shape[1]):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 0: # Empty space  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print(" ", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 1: # Obstacle  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print("O", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 2: # El roboto  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print("*", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 3: # Goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print("X", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 4: # Trail  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print(".", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 5: # Quick sand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print("~", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 6: # Stepped in quicksand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                print("@", end=' ')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        print()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print("--------------------")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# find where the robot is in the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def getrobotpos(data):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    R = -999  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    C = -999  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    for row in range(0, data.shape[0]):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        for col in range(0, data.shape[1]):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 2:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                C = col  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                R = row  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    if (R+C)<0:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        print("warning: start location not defined")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    return R, C  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# find where the goal is in the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def getgoalpos(data):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    R = -999  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    C = -999  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    for row in range(0, data.shape[0]):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        for col in range(0, data.shape[1]):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[row,col] == 3:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                C = col  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                R = row  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    if (R+C)<0:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        print("warning: goal location not defined")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    return (R, C)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# move the robot and report reward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def movebot(data,oldpos,a):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    randomrate = 0.20 # how often do we move randomly  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    quicksandreward = -100 # penalty for stepping on quicksand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    # decide if we're going to ignore the action and  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    # choose a random one instead  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    if rand.uniform(0.0, 1.0) <= randomrate: # going rogue  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        a = rand.randint(0,3) # choose the random direction  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    # update the test location  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    if a == 0: #north  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr = testr - 1  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif a == 1: #east  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testc = testc + 1  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif a == 2: #south  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr = testr + 1  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif a == 3: #west  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testc = testc - 1  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    reward = -1 # default reward is negative one  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    # see if it is legal. if not, revert  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    if testr < 0: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif testr >= data.shape[0]: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif testc < 0: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif testc >= data.shape[1]: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif data[testr, testc] == 1: # it is an obstacle  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif data[testr, testc] == 5: # it is quicksand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        reward = quicksandreward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        data[testr, testc] = 6 # mark the event  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif data[testr, testc] == 6: # it is still quicksand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        reward = quicksandreward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        data[testr, testc] = 6 # mark the event  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    elif data[testr, testc] == 3:  # it is the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        reward = 1 # for reaching the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    return (testr, testc), reward #return the new, legal location  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# convert the location to a single integer  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def discretize(pos):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    return pos[0]*10 + pos[1]  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def test(map, epochs, learner, verbose):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# each epoch involves one trip to the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    startpos = getrobotpos(map) #find where the robot starts  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    goalpos = getgoalpos(map) #find where the goal is  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    scores = np.zeros((epochs,1))  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    for epoch in range(1,epochs+1):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        total_reward = 0  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        data = map.copy()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        robopos = startpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        state = discretize(robopos) #convert the location to a state  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        action = learner.querysetstate(state) #set the state and get first action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        count = 0  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        while (robopos != goalpos) & (count<10000):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            #move to new location according to action and then get a new action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            newpos, stepreward = movebot(data,robopos,action)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if newpos == goalpos:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                r = 1 # reward for reaching the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            else:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                r = stepreward # negative reward for not being at the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            state = discretize(newpos)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            action = learner.query(state,r)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[robopos] != 6:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                data[robopos] = 4 # mark where we've been for map printing  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            if data[newpos] != 6:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-                data[newpos] = 2 # move to new location  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            robopos = newpos # update the location  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            #if verbose: time.sleep(1)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            total_reward += stepreward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            count = count + 1  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        if count == 100000:  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-            print("timeout")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        if verbose: printmap(data)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        if verbose: print(f"{epoch}, {total_reward}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        scores[epoch-1,0] = total_reward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    return np.median(scores)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-# run the code to test a learner  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-def test_code():  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    verbose = True # print lots of debug stuff if True  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    # read in the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    filename = 'testworlds/world01.csv'  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    inf = open(filename)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    data = np.array([list(map(float,s.strip().split(','))) for s in inf.readlines()])  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    originalmap = data.copy() #make a copy so we can revert to the original map later  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    if verbose: printmap(data)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    rand.seed(5)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    ######## run non-dyna test ########  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+"""
+Test a Q Learner in a navigation problem.  (c) 2015 Tucker Balch
+2016-10-20 Added "quicksand" and uncertain actions.
+
+Copyright 2018, Georgia Institute of Technology (Georgia Tech)
+Atlanta, Georgia 30332
+All Rights Reserved
+
+Template code for CS 4646/7646
+
+Georgia Tech asserts copyright ownership of this template and all derivative
+works, including solutions to the projects assigned in this course. Students
+and other users of this template code are advised not to share it with others
+or to make it available on publicly viewable websites including repositories
+such as github and gitlab.  This copyright statement should not be removed
+or edited.
+
+We do grant permission to share solutions privately with non-students such
+as potential employers. However, sharing with other current or future
+students of CS 7646 is prohibited and subject to being investigated as a
+GT honor code violation.
+
+-----do not edit anything above this line---
+
+Student Name: Tucker Balch (replace with your name)
+GT User ID: tb34 (replace with your User ID)
+GT ID: 900897987 (replace with your GT ID)
+"""
+
+import numpy as np
+import random as rand
+import time
+import math
+import QLearner as ql
+
+# print out the map
+def printmap(data):
+    """
+    @summary: Print the map as text, one symbol per cell, between two rules
+    @param data: 2D array of cell codes; cells with a code outside 0-6
+                 print nothing
+    """
+    symbols = {
+        0: " ",  # Empty space
+        1: "O",  # Obstacle
+        2: "*",  # El roboto
+        3: "X",  # Goal
+        4: ".",  # Trail
+        5: "~",  # Quick sand
+        6: "@",  # Stepped in quicksand
+    }
+    n_rows, n_cols = data.shape[0], data.shape[1]
+    print("--------------------")
+    for r in range(n_rows):
+        for c in range(n_cols):
+            symbol = symbols.get(data[r, c])
+            if symbol is not None:
+                print(symbol, end=' ')
+        print()
+    print("--------------------")
+
+# find where the robot is in the map
+def getrobotpos(data):
+    """
+    @summary: Locate the robot (cell code 2) in the map
+    @param data: 2D array of cell codes
+    @returns: (row, col) of the last robot cell in row-major order, or
+              (-999, -999) after printing a warning if there is none
+    """
+    n_rows, n_cols = data.shape[0], data.shape[1]
+    # Scanning backwards makes the first hit the last match in row-major order
+    for r in reversed(range(n_rows)):
+        for c in reversed(range(n_cols)):
+            if data[r, c] == 2:
+                return r, c
+    print("warning: start location not defined")
+    return -999, -999
+
+# find where the goal is in the map
+def getgoalpos(data):
+    """
+    @summary: Locate the goal (cell code 3) in the map
+    @param data: 2D array of cell codes
+    @returns: (row, col) of the last goal cell in row-major order, or
+              (-999, -999) after printing a warning if there is none
+    """
+    found = None
+    for r in range(data.shape[0]):
+        for c in range(data.shape[1]):
+            if data[r, c] == 3:
+                found = (r, c)  # keep overwriting: the last match wins
+    if found is None:
+        print("warning: goal location not defined")
+        return (-999, -999)
+    return found
+
+# move the robot and report reward
+def movebot(data,oldpos,a):
+    """
+    @summary: Try to move the robot one cell and report the reward
+    @param data: 2D array of cell codes; a quicksand cell that is stepped
+                 on is marked in place with code 6
+    @param oldpos: Current (row, col) of the robot
+    @param a: Requested action: 0 north, 1 east, 2 south, 3 west
+    @returns: ((row, col), reward) with the new, legal location
+    """
+    randomrate = 0.20  # how often do we move randomly
+    quicksandreward = -100  # penalty for stepping on quicksand
+
+    # with probability randomrate the request is replaced by a random direction
+    if rand.uniform(0.0, 1.0) <= randomrate:
+        a = rand.randint(0,3)
+
+    # (row delta, col delta) per action; any other action leaves the robot put
+    steps = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}
+    row, col = oldpos
+    d_row, d_col = steps.get(a, (0, 0))
+    testr = row + d_row
+    testc = col + d_col
+
+    reward = -1  # default reward is negative one
+    on_map = 0 <= testr < data.shape[0] and 0 <= testc < data.shape[1]
+    if not on_map or data[testr, testc] == 1:
+        # off the map or into an obstacle: stay where we were
+        return (row, col), reward
+
+    cell = data[testr, testc]
+    if cell == 5 or cell == 6:  # fresh or already-marked quicksand
+        reward = quicksandreward
+        data[testr, testc] = 6  # mark the event
+    elif cell == 3:  # it is the goal
+        reward = 1
+
+    return (testr, testc), reward
+
+# convert the location to a single integer
+def discretize(pos):
+    """
+    @summary: Convert a location to a single integer state
+    @param pos: (row, col) location
+    @returns: row * 10 + col
+    """
+    row = pos[0]
+    col = pos[1]
+    return 10 * row + col
+
+def test(map, epochs, learner, verbose):
+    """
+    @summary: Run the learner through the map and score it
+    @param map: 2D array of cell codes holding the start (2) and goal (3)
+    @param epochs: Number of trips to run; each epoch is one trip to the goal
+    @param learner: Object providing querysetstate(s) and query(s_prime, r)
+    @param verbose: Print the map and the total reward after every epoch
+    @returns: Median of the per-epoch total rewards
+    """
+    max_steps = 10000  # give up on an epoch after this many moves
+    startpos = getrobotpos(map)  # find where the robot starts
+    goalpos = getgoalpos(map)  # find where the goal is
+    scores = np.zeros((epochs, 1))
+    for epoch in range(1, epochs + 1):
+        total_reward = 0
+        data = map.copy()  # each epoch draws its trail on a fresh map
+        robopos = startpos
+        state = discretize(robopos)  # convert the location to a state
+        action = learner.querysetstate(state)  # set the state and get first action
+        count = 0
+        while robopos != goalpos and count < max_steps:
+
+            # move to new location according to action and then get a new action
+            newpos, stepreward = movebot(data, robopos, action)
+            if newpos == goalpos:
+                r = 1  # reward for reaching the goal
+            else:
+                r = stepreward  # negative reward for not being at the goal
+            state = discretize(newpos)
+            action = learner.query(state, r)
+
+            if data[robopos] != 6:
+                data[robopos] = 4  # mark where we've been for map printing
+            if data[newpos] != 6:
+                data[newpos] = 2  # move to new location
+            robopos = newpos  # update the location
+            total_reward += stepreward
+            count = count + 1
+        # The loop only ends at the goal or at the step limit, so not being
+        # at the goal here means the epoch timed out.
+        if robopos != goalpos:
+            print("timeout")
+        if verbose:
+            printmap(data)
+            print(f"{epoch}, {total_reward}")
+        scores[epoch - 1, 0] = total_reward
+    return np.median(scores)
+
+# run the code to test a learner
+def test_code():
+    # Demo driver: load one world map, score a non-dyna and a dyna QLearner on it, print both scores.
+    verbose = True # print lots of debug stuff if True
+
+    # read in the map
+    filename = 'testworlds/world01.csv'
+    inf = open(filename) # NOTE(review): no matching close() is visible in this function - confirm / consider a with-block
+    data = np.array([list(map(float,s.strip().split(','))) for s in inf.readlines()]) # one row of floats per comma-separated line
+    originalmap = data.copy() #make a copy so we can revert to the original map later
+
+    if verbose: printmap(data)
+
+    rand.seed(5) # constant seed; presumably so repeated runs are comparable
+
+    ######## run non-dyna test ########
    learner = ql.QLearner(num_states=100,\
        num_actions = 4, \
        alpha = 0.2, \
@@ -191,14 +191,14 @@ def test_code():
        rar = 0.98, \
        radr = 0.999, \
        dyna = 0, \
-        verbose=False) #initialize the learner  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    epochs = 500  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    total_reward = test(data, epochs, learner, verbose)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print(f"{epochs}, median total_reward {total_reward}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    non_dyna_score = total_reward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    ######## run dyna test ########  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        verbose=False) #initialize the learner
+    epochs = 500 # the non-dyna run uses 500 epochs
+    total_reward = test(data, epochs, learner, verbose) # score that is reported below as the median total_reward
+    print(f"{epochs}, median total_reward {total_reward}")
+    print()
+    non_dyna_score = total_reward # kept for the summary printed at the end
+
+    ######## run dyna test ########
    learner = ql.QLearner(num_states=100,\
        num_actions = 4, \
        alpha = 0.2, \
@@ -206,18 +206,18 @@ def test_code():
        rar = 0.5, \
        radr = 0.99, \
        dyna = 200, \
-        verbose=False) #initialize the learner  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    epochs = 50  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    data = originalmap.copy()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    total_reward = test(data, epochs, learner, verbose)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print(f"{epochs}, median total_reward {total_reward}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    dyna_score = total_reward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print(f"results for {filename}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print(f"non_dyna_score: {non_dyna_score}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    print(f"dyna_score    : {dyna_score}")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-if __name__=="__main__":  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    test_code()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        verbose=False) #initialize the learner
+    epochs = 50 # the dyna run uses 50 epochs (the non-dyna run above uses 500)
+    data = originalmap.copy() # start the second run from a fresh copy of the map as loaded
+    total_reward = test(data, epochs, learner, verbose)
+    print(f"{epochs}, median total_reward {total_reward}")
+    dyna_score = total_reward # kept for the summary printed below
+
+    print()
+    print()
+    print(f"results for {filename}") # final side-by-side summary of both runs
+    print(f"non_dyna_score: {non_dyna_score}")
+    print(f"dyna_score    : {dyna_score}")
+
+if __name__=="__main__": # only when this file is executed directly, not when it is imported
+    test_code() # run the non-dyna / dyna comparison defined above