diff --git a/qlearning_robot/QLearner.py b/qlearning_robot/QLearner.py new file mode 100644 index 0000000..dbae5f8 --- /dev/null +++ b/qlearning_robot/QLearner.py @@ -0,0 +1,72 @@ +""" +Template for implementing QLearner (c) 2015 Tucker Balch + +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. 
+ +-----do not edit anything above this line--- + +Student Name: Tucker Balch (replace with your name) +GT User ID: tb34 (replace with your User ID) +GT ID: 900897987 (replace with your GT ID) +""" + +import numpy as np +import random as rand + +class QLearner(object): + + def __init__(self, \ + num_states=100, \ + num_actions = 4, \ + alpha = 0.2, \ + gamma = 0.9, \ + rar = 0.5, \ + radr = 0.99, \ + dyna = 0, \ + verbose = False): + + self.verbose = verbose + self.num_actions = num_actions + self.s = 0 + self.a = 0 + + def querysetstate(self, s): + """ + @summary: Update the state without updating the Q-table + @param s: The new state + @returns: The selected action + """ + self.s = s + action = rand.randint(0, self.num_actions-1) + if self.verbose: print(f"s = {s}, a = {action}") + return action + + def query(self,s_prime,r): + """ + @summary: Update the Q table and return an action + @param s_prime: The new state + @param r: The reward + @returns: The selected action + """ + action = rand.randint(0, self.num_actions-1) + if self.verbose: print(f"s = {s_prime}, a = {action}, r={r}") + return action + +if __name__=="__main__": + print("Remember Q from Star Trek? Well, this isn't him") diff --git a/qlearning_robot/grade_robot_qlearning.py b/qlearning_robot/grade_robot_qlearning.py new file mode 100644 index 0000000..cb6c6ae --- /dev/null +++ b/qlearning_robot/grade_robot_qlearning.py @@ -0,0 +1,387 @@ +"""MC3-P2: Q-learning & Dyna - grading script. + +Usage: +- Switch to a student feedback directory first (will write "points.txt" and "comments.txt" in pwd). 
+- Run this script with both ml4t/ and student solution in PYTHONPATH, e.g.: + PYTHONPATH=ml4t:MC1-P2/jdoe7 python ml4t/mc2_p1_grading/grade_marketsim.py + +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- + +Student Name: Tucker Balch (replace with your name) +GT User ID: tb34 (replace with your User ID) +GT ID: 900897987 (replace with your GT ID) + +""" + +import pytest +from grading.grading import grader, GradeResult, run_with_timeout, IncorrectOutput + +import os +import sys +import traceback as tb + +import datetime as dt + +import random + +import numpy as np +import pandas as pd +from collections import namedtuple + +import util + +# Student modules to import +main_code = "QLearner" # module name to import + +robot_qlearning_testing_seed=1490652871 +QLearningTestCase = namedtuple('QLearning', ['description', 'group','world_file','best_reward','median_reward','max_time','points']) +qlearning_test_cases = [ + QLearningTestCase( + description="World 1", + group='nodyna', + world_file='world01.csv', + best_reward=-17, + median_reward=-29.5, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 2", + group='nodyna', + world_file='world02.csv', + best_reward=-14, + 
median_reward=-19, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 4", + group='nodyna', + world_file='world04.csv', + best_reward=-24, + median_reward=-33, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 6", + group='nodyna', + world_file='world06.csv', + best_reward=-16, + median_reward=-23.5, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 7", + group='nodyna', + world_file='world07.csv', + best_reward=-14, + median_reward=-26, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 8", + group='nodyna', + world_file='world08.csv', + best_reward=-14, + median_reward=-19, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 9", + group='nodyna', + world_file='world09.csv', + best_reward=-15, + median_reward=-20, + max_time=2, + points=9.5 + ), + QLearningTestCase( + description="World 10", + group='nodyna', + world_file='world10.csv', + best_reward=-28, + median_reward=-42, + max_time=2, + points=9.5 + ), + # Dyna test cases + QLearningTestCase( + description="World 1, dyna=200", + group='dyna', + world_file='world01.csv', + best_reward=-12, + median_reward=-29.5, + max_time=10, + points=2.5 + ), + QLearningTestCase( + description="World 2, dyna=200", + group='dyna', + world_file='world02.csv', + best_reward=-14, + median_reward=-19, + max_time=10, + points=2.5 + ), + QLearningTestCase( + description="Author check", + group='author', + world_file='world01.csv', + best_reward=0, + median_reward=0, + max_time=10, + points=0 + ), +] + +max_points = 100.0 +html_pre_block = True # surround comments with HTML
 <pre> tag (for T-Square comments field)
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# Test function(s)
+@pytest.mark.parametrize("description,group,world_file,best_reward,median_reward,max_time,points", qlearning_test_cases)
+def test_qlearning(description, group, world_file, best_reward, median_reward, max_time, points, grader):
+    """Grade one Q-learning test case and report the outcome to ``grader``.
+
+    The test case's ``group`` selects what is checked:
+
+    * ``'nodyna'`` - build a learner with ``dyna=0`` and run ``qltest`` for
+      500 iterations.
+    * ``'dyna'``   - build a learner with ``dyna=200`` and run ``qltest`` for
+      50 iterations.
+    * ``'author'`` - call ``learner.author()``; the case fails (and keeps a
+      -20 point penalty) when the result is ``None`` or ``'tb34'``.
+
+    For the two reward groups the case fails when the value returned by
+    ``qltest`` is below ``1.5 * median_reward``.
+
+    @param description: Human-readable test case name, used in failure messages
+    @param group: One of 'nodyna', 'dyna' or 'author'
+    @param world_file: World CSV name passed to ``util.get_robot_world_file``
+    @param best_reward: Part of the test case tuple; not used by the checks here
+    @param median_reward: Reference reward the student's result is compared to
+    @param max_time: Time limit handed to ``run_with_timeout``
+    @param points: Points awarded when the case passes
+    @param grader: Grader fixture that collects ``GradeResult`` objects
+    @raises Exception: any failure (including ``IncorrectOutput``) is recorded
+        with the grader and then re-raised so that pytest marks the case failed
+    """
+    points_earned = 0.0  # initialize points for this test case
+    try:
+        incorrect = True
+        if 'QLearner' not in globals():
+            import importlib
+            m = importlib.import_module('QLearner')
+            globals()['QLearner'] = m
+        # Unpack test case
+        # NOTE(review): assumes util.get_robot_world_file returns a file-like
+        # object usable as a context manager (e.g. the result of open()), so
+        # the handle is closed once the map has been parsed - confirm in util.
+        with util.get_robot_world_file(world_file) as world_fh:
+            world = np.array([list(map(float, s.strip().split(','))) for s in world_fh.readlines()])
+        student_reward = None
+        student_author = None
+        msgs = []
+        if group == 'nodyna':
+            def timeoutwrapper_nodyna():
+                # Note: the following will NOT be commented during final grading
+                # random.seed(robot_qlearning_testing_seed)
+                # np.random.seed(robot_qlearning_testing_seed)
+                learner = QLearner.QLearner(
+                    num_states=100,
+                    num_actions=4,
+                    alpha=0.2,
+                    gamma=0.9,
+                    rar=0.98,
+                    radr=0.999,
+                    dyna=0,
+                    verbose=False,
+                )
+                return qltest(worldmap=world, iterations=500, max_steps=10000, learner=learner, verbose=False)
+            student_reward = run_with_timeout(timeoutwrapper_nodyna, max_time, (), {})
+            incorrect = False
+            if student_reward < 1.5*median_reward:
+                incorrect = True
+                msgs.append("   Reward too low, expected %s, found %s"%(median_reward,student_reward))
+        elif group == 'dyna':
+            def timeoutwrapper_dyna():
+                # Note: the following will NOT be commented during final grading
+                # random.seed(robot_qlearning_testing_seed)
+                # np.random.seed(robot_qlearning_testing_seed)
+                learner = QLearner.QLearner(
+                    num_states=100,
+                    num_actions=4,
+                    alpha=0.2,
+                    gamma=0.9,
+                    rar=0.5,
+                    radr=0.99,
+                    dyna=200,
+                    verbose=False,
+                )
+                return qltest(worldmap=world, iterations=50, max_steps=10000, learner=learner, verbose=False)
+            student_reward = run_with_timeout(timeoutwrapper_dyna, max_time, (), {})
+            incorrect = False
+            if student_reward < 1.5*median_reward:
+                incorrect = True
+                msgs.append("   Reward too low, expected %s, found %s"%(median_reward,student_reward))
+        elif group == 'author':
+            points_earned = -20  # penalty that stays in place unless author() is valid
+            def timeoutwrapper_author():
+                # Note: the following will NOT be commented during final grading
+                # random.seed(robot_qlearning_testing_seed)
+                # np.random.seed(robot_qlearning_testing_seed)
+                learner = QLearner.QLearner(
+                    num_states=100,
+                    num_actions=4,
+                    alpha=0.2,
+                    gamma=0.9,
+                    rar=0.98,
+                    radr=0.999,
+                    dyna=0,
+                    verbose=False,
+                )
+                return learner.author()
+            student_author = run_with_timeout(timeoutwrapper_author, max_time, (), {})
+            incorrect = False
+            if (student_author is None) or (student_author == 'tb34'):
+                incorrect = True
+                msgs.append("   author() method not implemented correctly. Found {}".format(student_author))
+            else:
+                # Clear the penalty only; the shared "+= points" below awards
+                # the points exactly once (previously they were added twice).
+                points_earned = 0.0
+        if not incorrect:
+            points_earned += points
+        if incorrect:
+            inputs_str = "    group: {}\n" \
+                         "    world_file: {}\n"\
+                         "    median_reward: {}\n".format(group, world_file, median_reward)
+            raise IncorrectOutput("Test failed on one or more output criteria.\n  Inputs:\n{}\n  Failures:\n{}".format(inputs_str, "\n".join(msgs)))
+    except Exception as e:
+        # Test result: failed
+        msg = "Test case description: {}\n".format(description)
+
+        # Generate a filtered stacktrace, only showing erroneous lines in student file(s)
+        # (show only the filename instead of the long absolute path)
+        tb_list = [
+            (os.path.basename(row[0]), row[1], row[2], row[3])
+            for row in tb.extract_tb(sys.exc_info()[2])
+        ]
+        tb_list = [row for row in tb_list if row[0] in ['QLearner.py','StrategyLearner.py']]
+        if tb_list:
+            msg += "Traceback:\n"
+            msg += ''.join(tb.format_list(tb_list))  # contains newlines
+        elif hasattr(e, 'grading_traceback'):
+            msg += "Traceback:\n"
+            msg += ''.join(tb.format_list(e.grading_traceback))
+        msg += "{}: {}".format(e.__class__.__name__, str(e))
+
+        # Report failure result to grader, with stacktrace
+        grader.add_result(GradeResult(outcome='failed', points=points_earned, msg=msg))
+        raise
+    else:
+        # Test result: passed (no exceptions)
+        grader.add_result(GradeResult(outcome='passed', points=points_earned, msg=None))
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def getrobotpos(data):
+    """Locate the robot's start cell (value 2) in the world grid.
+
+    @param data: 2-D array-like indexed as ``data[row, col]`` with a ``shape``
+    @returns: ``(row, col)`` of the last matching cell in row-major order, or
+        ``(-999, -999)`` after printing a warning when no cell holds 2
+    """
+    n_rows, n_cols = data.shape[0], data.shape[1]
+    # Scan backwards: the first hit is the last marker a forward scan would keep.
+    for r in reversed(range(n_rows)):
+        for c in reversed(range(n_cols)):
+            if data[r, c] == 2:
+                return r, c
+    print("warning: start location not defined")
+    return -999, -999
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# find where the goal is in the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def getgoalpos(data):
+    """Locate the goal cell (value 3) in the world grid.
+
+    @param data: 2-D array-like indexed as ``data[row, col]`` with a ``shape``
+    @returns: ``(row, col)`` of the last matching cell in row-major order, or
+        ``(-999, -999)`` after printing a warning when no cell holds 3
+    """
+    matches = [
+        (r, c)
+        for r in range(data.shape[0])
+        for c in range(data.shape[1])
+        if data[r, c] == 3
+    ]
+    if not matches:
+        print("warning: goal location not defined")
+        return (-999, -999)
+    return matches[-1]
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# move the robot and report reward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def movebot(data,oldpos,a):
+    """Apply one (possibly randomized) move and report where the robot ends up.
+
+    With probability 0.20 the requested action is replaced by a random one in
+    0..3. Actions: 0 = north, 1 = east, 2 = south, 3 = west. Moves that leave
+    the grid or hit an obstacle (cell value 1) keep the robot where it was.
+    Stepping on quicksand (5, or 6 once visited) costs -100 and marks the cell
+    as 6 in ``data``; reaching the goal (3) yields +1; any other step yields -1.
+
+    @param data: 2-D world grid; quicksand cells are updated in place
+    @param oldpos: Current ``(row, col)`` of the robot
+    @param a: Requested action
+    @returns: ``((row, col), reward)`` for the resulting legal location
+    """
+    row0, col0 = oldpos
+    random_rate = 0.20  # how often the requested action is ignored
+    quicksand_penalty = -100  # penalty for stepping on quicksand
+
+    # Occasionally go rogue and pick a random direction instead.
+    if random.uniform(0.0, 1.0) <= random_rate:
+        a = random.randint(0, 3)
+
+    # Translate the action code into a candidate cell.
+    nxt_r, nxt_c = row0, col0
+    for code, d_row, d_col in ((0, -1, 0), (1, 0, 1), (2, 1, 0), (3, 0, -1)):
+        if a == code:
+            nxt_r, nxt_c = row0 + d_row, col0 + d_col
+            break
+
+    # Illegal candidates (off the map or an obstacle) revert to the old cell.
+    off_map = (
+        nxt_r < 0
+        or nxt_r >= data.shape[0]
+        or nxt_c < 0
+        or nxt_c >= data.shape[1]
+    )
+    if off_map or data[nxt_r, nxt_c] == 1:
+        return (row0, col0), -1
+
+    cell = data[nxt_r, nxt_c]
+    if cell == 5 or cell == 6:
+        data[nxt_r, nxt_c] = 6  # mark the quicksand event
+        return (nxt_r, nxt_c), quicksand_penalty
+    if cell == 3:
+        return (nxt_r, nxt_c), 1  # reached the goal
+    return (nxt_r, nxt_c), -1  # default step cost
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# convert the location to a single integer  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def discretize(pos):
+    """Collapse a ``(row, col)`` location into one integer: ``row * 10 + col``."""
+    row_index = pos[0]
+    col_index = pos[1]
+    return row_index * 10 + col_index
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def qltest(worldmap, iterations, max_steps, learner, verbose):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# each iteration involves one trip to the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    startpos = getrobotpos(worldmap) #find where the robot starts  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    goalpos = getgoalpos(worldmap) #find where the goal is  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    # max_reward = -float('inf')  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    all_rewards = list()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    for iteration in range(1,iterations+1):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        total_reward = 0  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        data = worldmap.copy()  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        robopos = startpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        state = discretize(robopos) #convert the location to a state  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        action = learner.querysetstate(state) #set the state and get first action  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        count = 0  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        while (robopos != goalpos) & (count= data.shape[0]: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    elif testc < 0: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    elif testc >= data.shape[1]: # off the map  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    elif data[testr, testc] == 1: # it is an obstacle  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        testr, testc = oldpos  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    elif data[testr, testc] == 5: # it is quicksand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        reward = quicksandreward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        data[testr, testc] = 6 # mark the event  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    elif data[testr, testc] == 6: # it is still quicksand  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        reward = quicksandreward  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        data[testr, testc] = 6 # mark the event  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    elif data[testr, testc] == 3:  # it is the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        reward = 1 # for reaching the goal  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    return (testr, testc), reward #return the new, legal location  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# convert the location to a single integer  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def discretize(pos):
+    """Map a ``(row, col)`` location to a single integer state, ``row * 10 + col``."""
+    r = pos[0]
+    c = pos[1]
+    return 10 * r + c
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def test(map, epochs, learner, verbose):
+    """Run ``learner`` through the world for ``epochs`` trips and score it.
+
+    Each epoch restarts from a fresh copy of the map, walks the robot until it
+    reaches the goal or the per-epoch step cap is hit, and records the sum of
+    the step rewards returned by ``movebot``.
+
+    @param map: 2-D world grid (the name shadows the builtin, but it is part of
+        the existing signature and is kept for callers)
+    @param epochs: Number of trips to run
+    @param learner: Object providing ``querysetstate(s)`` and ``query(s_prime, r)``
+    @param verbose: When true, print the map and total reward after each epoch
+    @returns: Median of the per-epoch total rewards
+    """
+    max_steps = 10000  # per-epoch step cap; also drives the timeout report below
+    startpos = getrobotpos(map) #find where the robot starts
+    goalpos = getgoalpos(map) #find where the goal is
+    scores = np.zeros((epochs,1))
+    for epoch in range(1,epochs+1):
+        total_reward = 0
+        data = map.copy()
+        robopos = startpos
+        state = discretize(robopos) #convert the location to a state
+        action = learner.querysetstate(state) #set the state and get first action
+        count = 0
+        while robopos != goalpos and count < max_steps:
+
+            #move to new location according to action and then get a new action
+            newpos, stepreward = movebot(data,robopos,action)
+            if newpos == goalpos:
+                r = 1 # reward for reaching the goal
+            else:
+                r = stepreward # negative reward for not being at the goal
+            state = discretize(newpos)
+            action = learner.query(state,r)
+
+            if data[robopos] != 6:
+                data[robopos] = 4 # mark where we've been for map printing
+            if data[newpos] != 6:
+                data[newpos] = 2 # move to new location
+            robopos = newpos # update the location
+            #if verbose: time.sleep(1)
+            total_reward += stepreward
+            count = count + 1
+        # The loop only exits short of the goal when the step cap was reached.
+        # (The old check compared against 100000 and so could never fire.)
+        if robopos != goalpos:
+            print("timeout")
+        if verbose: printmap(data)
+        if verbose: print(f"{epoch}, {total_reward}")
+        scores[epoch-1,0] = total_reward
+    return np.median(scores)
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+# run the code to test a learner  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def test_code():
+    """Run a non-dyna and a dyna learner on one test world and print both median scores."""
+    verbose = True  # print lots of debug stuff if True
+
+    # read in the map; the with-block closes the file even if parsing raises
+    filename = 'testworlds/world01.csv'
+    with open(filename) as inf:
+        data = np.array([list(map(float, s.strip().split(','))) for s in inf])
+    originalmap = data.copy()  # pristine copy so the dyna run starts from the original map
+
+    if verbose: printmap(data)
+
+    rand.seed(5)
+
+    ######## run non-dyna test ########
+    learner = ql.QLearner(num_states=100,
+        num_actions=4,
+        alpha=0.2,
+        gamma=0.9,
+        rar=0.98,
+        radr=0.999,
+        dyna=0,
+        verbose=False)  # initialize the learner
+    epochs = 500
+    total_reward = test(data, epochs, learner, verbose)
+    print(f"{epochs}, median total_reward {total_reward}")
+    print()
+    non_dyna_score = total_reward
+
+    ######## run dyna test ########
+    learner = ql.QLearner(num_states=100,
+        num_actions=4,
+        alpha=0.2,
+        gamma=0.9,
+        rar=0.5,
+        radr=0.99,
+        dyna=200,
+        verbose=False)  # initialize the learner
+    epochs = 50
+    data = originalmap.copy()  # the first run drew its trail on the map, so reset it
+    total_reward = test(data, epochs, learner, verbose)
+    print(f"{epochs}, median total_reward {total_reward}")
+    dyna_score = total_reward
+
+    print()
+    print()
+    print(f"results for {filename}")
+    print(f"non_dyna_score: {non_dyna_score}")
+    print(f"dyna_score    : {dyna_score}")
+  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    test_code()
diff --git a/qlearning_robot/testworlds/world01.csv b/qlearning_robot/testworlds/world01.csv
new file mode 100644
index 0000000..d56d367
--- /dev/null
+++ b/qlearning_robot/testworlds/world01.csv
@@ -0,0 +1,10 @@
+3,0,0,0,0,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,1,1,1,1,1,0,0,0
+0,5,1,0,0,0,1,0,0,0
+0,5,1,0,0,0,1,0,0,0
+0,0,1,0,0,0,1,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,0,0,2,0,0,0,0,0
diff --git a/qlearning_robot/testworlds/world02.csv b/qlearning_robot/testworlds/world02.csv
new file mode 100644
index 0000000..fc2833f
--- /dev/null
+++ b/qlearning_robot/testworlds/world02.csv
@@ -0,0 +1,10 @@
+0,1,0,1,0,0,0,0,0,0
+0,1,0,1,0,0,0,0,0,0
+0,1,0,0,0,0,0,0,0,0
+0,1,0,1,1,1,1,1,1,1
+2,1,0,1,0,0,0,0,0,0
+0,1,0,1,0,0,1,0,0,3
+0,0,0,1,0,0,1,0,0,0
+0,1,0,0,0,0,1,1,1,1
+0,1,0,1,0,0,0,0,0,0
+0,0,0,1,0,0,0,0,0,0
diff --git a/qlearning_robot/testworlds/world03.csv b/qlearning_robot/testworlds/world03.csv
new file mode 100644
index 0000000..924da5f
--- /dev/null
+++ b/qlearning_robot/testworlds/world03.csv
@@ -0,0 +1,10 @@
+0,0,0,1,0,0,0,1,0,3
+0,1,0,1,0,1,0,1,0,0
+0,1,0,1,0,1,0,1,0,1
+0,1,0,1,0,1,0,1,0,0
+0,1,0,1,0,1,0,1,1,0
+0,1,0,1,0,1,0,1,0,0
+0,1,0,1,0,1,0,1,0,1
+0,1,0,1,0,1,0,1,0,0
+0,1,0,1,0,1,0,1,1,0
+2,1,0,0,0,1,0,0,0,0
diff --git a/qlearning_robot/testworlds/world04.csv b/qlearning_robot/testworlds/world04.csv
new file mode 100644
index 0000000..88c108f
--- /dev/null
+++ b/qlearning_robot/testworlds/world04.csv
@@ -0,0 +1,10 @@
+0,0,0,0,0,1,0,1,0,3
+0,0,0,0,0,1,0,1,0,0
+0,0,0,1,0,1,0,1,0,1
+0,0,0,1,0,1,0,1,0,0
+0,0,0,1,0,0,0,1,1,0
+2,0,0,1,1,1,0,1,0,0
+0,0,0,1,0,1,0,0,0,1
+0,0,5,0,0,1,0,1,0,0
+0,0,1,1,1,1,0,1,1,0
+0,0,0,0,0,1,0,0,0,0
diff --git a/qlearning_robot/testworlds/world05.csv b/qlearning_robot/testworlds/world05.csv
new file mode 100644
index 0000000..da714ff
--- /dev/null
+++ b/qlearning_robot/testworlds/world05.csv
@@ -0,0 +1,10 @@
+0,1,0,0,0,1,0,1,0,3
+1,0,0,0,0,0,1,0,0,0
+0,0,1,1,0,1,0,1,0,1
+1,1,0,0,0,0,1,0,0,0
+0,0,0,0,0,0,0,0,1,0
+0,1,0,1,1,0,0,1,0,0
+1,0,0,0,0,1,0,0,0,1
+0,0,0,0,0,0,0,1,0,0
+1,0,1,0,0,1,0,0,1,0
+2,0,0,0,0,0,1,0,0,0
diff --git a/qlearning_robot/testworlds/world06.csv b/qlearning_robot/testworlds/world06.csv
new file mode 100644
index 0000000..e299798
--- /dev/null
+++ b/qlearning_robot/testworlds/world06.csv
@@ -0,0 +1,10 @@
+0,1,0,0,0,1,0,1,0,2
+1,0,0,0,0,0,1,0,0,0
+0,0,1,1,0,1,0,1,0,1
+1,1,0,0,0,0,1,0,0,0
+0,0,0,0,0,0,0,0,1,0
+0,1,0,1,1,0,0,1,0,0
+1,0,0,0,0,1,0,0,0,1
+0,0,0,0,0,0,0,1,0,0
+1,0,1,0,0,1,0,0,1,0
+3,0,0,0,0,0,1,0,0,0
diff --git a/qlearning_robot/testworlds/world07.csv b/qlearning_robot/testworlds/world07.csv
new file mode 100644
index 0000000..394f4e3
--- /dev/null
+++ b/qlearning_robot/testworlds/world07.csv
@@ -0,0 +1,10 @@
+0,0,0,0,2,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,1,1,1,1,1,0,0,0
+0,0,1,0,3,0,1,0,0,0
+0,0,1,0,0,0,1,0,0,0
+0,0,1,0,0,0,1,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
+0,0,0,0,0,0,0,0,0,0
diff --git a/qlearning_robot/testworlds/world08.csv b/qlearning_robot/testworlds/world08.csv
new file mode 100644
index 0000000..936c865
--- /dev/null
+++ b/qlearning_robot/testworlds/world08.csv
@@ -0,0 +1,10 @@
+0,1,0,1,0,0,0,0,0,0
+0,1,0,1,0,0,0,0,0,0
+0,1,0,0,0,0,0,0,0,0
+0,1,0,1,1,1,1,1,1,1
+3,1,0,1,0,0,0,0,0,0
+0,1,0,1,0,0,1,0,0,2
+0,0,0,1,0,0,1,0,0,0
+0,1,0,0,0,0,1,1,1,1
+0,1,0,1,0,0,0,0,0,0
+0,0,0,1,0,0,0,0,0,0
diff --git a/qlearning_robot/testworlds/world09.csv b/qlearning_robot/testworlds/world09.csv
new file mode 100644
index 0000000..1595036
--- /dev/null
+++ b/qlearning_robot/testworlds/world09.csv
@@ -0,0 +1,10 @@
+0,0,0,0,0,2,0,0,0,0
+0,0,0,1,0,0,1,0,0,0
+0,1,0,1,0,0,1,0,1,0
+0,1,0,1,1,1,1,0,1,0
+0,1,0,0,1,0,0,0,1,0
+0,1,1,1,1,1,1,1,1,0
+0,0,0,0,1,0,0,0,0,0
+0,0,0,0,1,0,0,1,0,0
+0,0,0,0,1,0,0,1,0,0
+0,0,0,0,1,3,0,1,0,0
diff --git a/qlearning_robot/testworlds/world10.csv b/qlearning_robot/testworlds/world10.csv
new file mode 100644
index 0000000..69bd052
--- /dev/null
+++ b/qlearning_robot/testworlds/world10.csv
@@ -0,0 +1,10 @@
+0,0,0,0,0,0,0,0,0,0
+0,0,0,1,0,0,1,0,0,0
+0,1,0,1,0,0,1,0,1,0
+0,1,0,1,1,1,1,0,1,0
+0,1,0,0,1,0,0,0,1,0
+0,1,1,1,1,0,1,1,1,0
+0,0,0,0,1,0,0,0,0,0
+0,0,0,0,1,0,0,1,0,0
+0,0,0,0,1,0,0,1,0,0
+0,0,0,2,1,3,0,1,0,0
diff --git a/zips/19fall_optimize_something.zip b/zips/19Fall_optimize_something.zip
similarity index 100%
rename from zips/19fall_optimize_something.zip
rename to zips/19Fall_optimize_something.zip
diff --git a/zips/20Spring_qlearning_robot.zip b/zips/20Spring_qlearning_robot.zip
new file mode 100644
index 0000000..02054f1
Binary files /dev/null and b/zips/20Spring_qlearning_robot.zip differ