diff --git a/.gitignore b/.gitignore index af56c2d..9f6db51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__ +assess_learners/Data data grading util.py diff --git a/README.md b/README.md index 08f2fb2..a831415 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ exercises. This makes sure that you do not override any of the existing files. I might add a makefile to automize this later. ``` -unzip -n zips/20Spring_martingale.zip -d ./ -unzip -n zips/19fall_optimize_something.zip -d ./ +unzip -n zips/*.zip -d ./ ``` # Reports diff --git a/assess_learners/DTLearner.py b/assess_learners/DTLearner.py new file mode 100644 index 0000000..bf20308 --- /dev/null +++ b/assess_learners/DTLearner.py @@ -0,0 +1,36 @@ +import numpy as np + +class DTLearner(object): + + def __init__(self, leaf_size = 1, verbose = False): + pass # move along, these aren't the drones you're looking for + + def author(self): + return 'felixm' # replace tb34 with your Georgia Tech username + + def addEvidence(self, dataX, dataY): + """ + @summary: Add training data to learner + @param dataX: X values of data to add + @param dataY: the Y training values + """ + + # slap on 1s column so linear regression finds a constant term + newdataX = np.ones([dataX.shape[0], dataX.shape[1]+1]) + newdataX[:,0:dataX.shape[1]] = dataX + + # build and save the model + self.model_coefs, residuals, rank, s = np.linalg.lstsq(newdataX, + dataY, + rcond=None) + + def query(self,points): + """ + @summary: Estimate a set of test points given the model we built. + @param points: should be a numpy array with each row corresponding to a specific query. + @returns the estimated values according to the saved model. + """ + return (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1] + +if __name__=="__main__": + print("the secret clue is 'zzyzx'") diff --git a/assess_learners/LinRegLearner.py b/assess_learners/LinRegLearner.py new file mode 100644 index 0000000..fd60be8 --- /dev/null +++ b/assess_learners/LinRegLearner.py @@ -0,0 +1,58 @@ +""" +A simple wrapper for linear regression. (c) 2015 Tucker Balch + +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- +""" + +import numpy as np + +class LinRegLearner(object): + + def __init__(self, verbose = False): + pass # move along, these aren't the drones you're looking for + + def author(self): + return 'tb34' # replace tb34 with your Georgia Tech username + + def addEvidence(self,dataX,dataY): + """ + @summary: Add training data to learner + @param dataX: X values of data to add + @param dataY: the Y training values + """ + + # slap on 1s column so linear regression finds a constant term + newdataX = np.ones([dataX.shape[0],dataX.shape[1]+1]) + newdataX[:,0:dataX.shape[1]]=dataX + + # build and save the model + self.model_coefs, residuals, rank, s = np.linalg.lstsq(newdataX, dataY, rcond=None) + + def query(self,points): + """ + @summary: Estimate a set of test points given the model we built. + @param points: should be a numpy array with each row corresponding to a specific query. + @returns the estimated values according to the saved model. + """ + return (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1] + +if __name__=="__main__": + print("the secret clue is 'zzyzx'") diff --git a/assess_learners/grade_learners.py b/assess_learners/grade_learners.py new file mode 100644 index 0000000..bf74119 --- /dev/null +++ b/assess_learners/grade_learners.py @@ -0,0 +1,507 @@ +"""MC3-P1: Assess learners - grading script. + +Usage: +- Switch to a student feedback directory first (will write "points.txt" and "comments.txt" in pwd). +- Run this script with both ml4t/ and student solution in PYTHONPATH, e.g.: + PYTHONPATH=ml4t:MC3-P1/jdoe7 python ml4t/mc3_p1_grading/grade_learners.py + +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- +""" + +import pytest +from grading.grading import grader, GradeResult, time_limit, run_with_timeout, IncorrectOutput +import util + +import os +import sys +import traceback as tb + +import numpy as np +import pandas as pd +from collections import namedtuple + +import math + +import string + +import time + +import random + +# Grading parameters +# rmse_margins = dict(KNNLearner=1.10, BagLearner=1.10) # 1.XX = +XX% margin of RMS error +# points_per_test_case = dict(KNNLearner=3.0, BagLearner=2.0) # points per test case for each group +# seconds_per_test_case = 10 # execution time limit +# seconds_per_test_case = 6 + +# More grading parameters (picked up by module-level grading fixtures) +max_points = 50.0 # 3.0*5 + 3.0*5 + 2.0*10 = 50 +html_pre_block = True # surround comments with HTML
tag (for T-Square comments field)
+
+# Test cases
+LearningTestCase = namedtuple('LearningTestCase', ['description', 'group', 'datafile', 'seed', 'outputs'])
+learning_test_cases = [
+ ########################
+ # DTLearner test cases #
+ ########################
+ LearningTestCase(
+ description="Test Case 01: Deterministic Tree",
+ group='DTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090001,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ LearningTestCase(
+ description="Test Case 02: Deterministic Tree",
+ group='DTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090002,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ LearningTestCase(
+ description="Test Case 03: Deterministic Tree",
+ group='DTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090003,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ LearningTestCase(
+ description="Test Case 04: Deterministic Tree",
+ group='DTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090004,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ ########################
+ # RTLearner test cases #
+ ########################
+ LearningTestCase(
+ description="Test Case 01: Random Tree",
+ group='RTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090001,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ LearningTestCase(
+ description="Test Case 02: Random Tree",
+ group='RTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090002,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ LearningTestCase(
+ description="Test Case 03: Random Tree",
+ group='RTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090003,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+ LearningTestCase(
+ description="Test Case 04: Random Tree",
+ group='RTLearner',
+ datafile='Istanbul.csv',
+ seed=1481090004,
+ outputs=dict(
+ insample_corr_min=0.95,
+ outsample_corr_min=0.15,
+ insample_corr_max=0.95
+ )
+ ),
+
+ ######################
+ # Bagging test cases #
+ ######################
+ LearningTestCase(
+ description="Test Case 01: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090001,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 02: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090002,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 03: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090003,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 04: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090004,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 05: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090005,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 06: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090006,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 07: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090007,
+ outputs=None
+ ),
+ LearningTestCase(
+ description="Test Case 08: Bagging",
+ group='BagLearner',
+ datafile='Istanbul.csv',
+ seed=1481090008,
+ outputs=None
+ ),
+ ##############################
+ # RandomName + InsaneLearner #
+ ##############################
+ LearningTestCase(
+ description="InsaneLearner Test Case",
+ group='InsaneLearner',
+ datafile='simple.csv',
+ seed=1498076428,
+ outputs=None,
+ ),
+ LearningTestCase(
+ description="Random Classname Test Case",
+ group='RandomName',
+ datafile='simple.csv',
+ seed=1498076428,
+ outputs=None),
+]
+
+
+# Test functon(s)
+@pytest.mark.parametrize("description,group,datafile,seed,outputs", learning_test_cases)
+def test_learners(description, group, datafile, seed, outputs, grader):
+ """Test ML models returns correct predictions.
+
+ Requires test description, test case group, inputs, expected outputs, and a grader fixture.
+ """
+
+ points_earned = 0.0 # initialize points for this test case
+ try:
+ learner_class = None
+ kwargs = {'verbose':False}
+
+ # (BPH) Copied from grade_strategy_qlearning.py
+ #Set fixed seed for repetability
+ np.random.seed(seed)
+ random.seed(seed)
+ #remove ability to seed either np.random or python random
+ tmp_numpy_seed = np.random.seed
+ tmp_random_seed = random.seed
+ np.random.seed = fake_seed
+ random.seed = fake_rseed
+
+ # Try to import KNNLearner (only once)
+ # if not 'KNNLearner' in globals():
+ # from KNNLearner import KNNLearner
+ if not 'RTLearner' in globals():
+ from RTLearner import RTLearner
+ if not 'DTLearner' in globals():
+ from DTLearner import DTLearner
+ if (group is 'BagLearner') or (group is 'InsaneLearner') or (group is 'RandomName') and (not 'BagLearner' in globals()):
+ from BagLearner import BagLearner
+ #put seeds back for the moment
+ np.random.seed = tmp_numpy_seed
+ random.seed = tmp_random_seed
+ # Tweak kwargs
+ # kwargs.update(inputs.get('kwargs', {}))
+
+ # Read separate training and testing data files
+ # with open(inputs['train_file']) as f:
+ # data_partitions=list()
+ testX,testY,trainX,trainY = None,None, None,None
+ permutation = None
+ author = None
+ with util.get_learner_data_file(datafile) as f:
+ alldata = np.genfromtxt(f,delimiter=',')
+ # Skip the date column and header row if we're working on Istanbul data
+ if datafile == 'Istanbul.csv':
+ alldata = alldata[1:,1:]
+ datasize = alldata.shape[0]
+ cutoff = int(datasize*0.6)
+ permutation = np.random.permutation(alldata.shape[0])
+ col_permutation = np.random.permutation(alldata.shape[1]-1)
+ train_data = alldata[permutation[:cutoff],:]
+ # trainX = train_data[:,:-1]
+ trainX = train_data[:,col_permutation]
+ trainY = train_data[:,-1]
+ test_data = alldata[permutation[cutoff:],:]
+ # testX = test_data[:,:-1]
+ testX = test_data[:,col_permutation]
+ testY = test_data[:,-1]
+ msgs = []
+
+ if (group is "RTLearner") or (group is "DTLearner"):
+ clss_name = RTLearner if group is "RTLearner" else DTLearner
+ tree_sptc = 3 if group is "RTLearner" else 10
+ corr_in, corr_out, corr_in_50 = None,None,None
+ def oneleaf():
+ np.random.seed(seed)
+ random.seed(seed)
+ np.random.seed = fake_seed
+ random.seed = fake_rseed
+ learner = clss_name(leaf_size=1,verbose=False)
+ learner.addEvidence(trainX,trainY)
+ insample = learner.query(trainX)
+ outsample = learner.query(testX)
+ np.random.seed = tmp_numpy_seed
+ random.seed = tmp_random_seed
+ author_rv = None
+ try:
+ author_rv = learner.author()
+ except:
+ pass
+ return insample, outsample, author_rv
+ def fiftyleaves():
+ np.random.seed(seed)
+ random.seed(seed)
+ np.random.seed = fake_seed
+ random.seed = fake_rseed
+ learner = clss_name(leaf_size=50,verbose=False)
+ learner.addEvidence(trainX,trainY)
+ np.random.seed = tmp_numpy_seed
+ random.seed = tmp_random_seed
+ return learner.query(trainX)
+
+ predY_in, predY_out, author = run_with_timeout(oneleaf,tree_sptc,(),{})
+ predY_in_50 = run_with_timeout(fiftyleaves,tree_sptc,(),{})
+ corr_in = np.corrcoef(predY_in,y=trainY)[0,1]
+ corr_out = np.corrcoef(predY_out,y=testY)[0,1]
+ corr_in_50 = np.corrcoef(predY_in_50,y=trainY)[0,1]
+ incorrect = False
+
+ if corr_in < outputs['insample_corr_min'] or np.isnan(corr_in):
+ incorrect = True
+ msgs.append(" In-sample with leaf_size=1 correlation less than allowed: got {} expected {}".format(corr_in,outputs['insample_corr_min']))
+ else:
+ points_earned += 1.0
+ if corr_out < outputs['outsample_corr_min'] or np.isnan(corr_out):
+ incorrect = True
+ msgs.append(" Out-of-sample correlation less than allowed: got {} expected {}".format(corr_out,outputs['outsample_corr_min']))
+ else:
+ points_earned += 1.0
+ if corr_in_50 > outputs['insample_corr_max'] or np.isnan(corr_in_50):
+ incorrect = True
+ msgs.append(" In-sample correlation with leaf_size=50 greater than allowed: got {} expected {}".format(corr_in_50,outputs['insample_corr_max']))
+ else:
+ points_earned += 1.0
+ # Check author string
+ if (author is None) or (author =='tb34'):
+ incorrect = True
+ msgs.append(" Invalid author: {}".format(author))
+ points_earned += -2.0
+
+ elif group is "BagLearner":
+ corr1, corr20 = None,None
+ bag_sptc = 10
+ def onebag():
+ np.random.seed(seed)
+ random.seed(seed)
+ np.random.seed = fake_seed
+ random.seed = fake_rseed
+ learner1 = BagLearner(learner=RTLearner,kwargs={"leaf_size":1},bags=1,boost=False,verbose=False)
+ learner1.addEvidence(trainX,trainY)
+ q_rv = learner1.query(testX)
+ a_rv = learner1.author()
+ np.random.seed = tmp_numpy_seed
+ random.seed = tmp_random_seed
+ return q_rv,a_rv
+ def twentybags():
+ np.random.seed(seed)
+ random.seed(seed)
+ np.random.seed = fake_seed
+ random.seed = fake_rseed
+ learner20 = BagLearner(learner=RTLearner,kwargs={"leaf_size":1},bags=20,boost=False,verbose=False)
+ learner20.addEvidence(trainX,trainY)
+ q_rv = learner20.query(testX)
+ np.random.seed = tmp_numpy_seed
+ random.seed = tmp_random_seed
+ return q_rv
+ predY1,author = run_with_timeout(onebag,bag_sptc,pos_args=(),keyword_args={})
+ predY20 = run_with_timeout(twentybags,bag_sptc,(),{})
+
+ corr1 = np.corrcoef(predY1,testY)[0,1]
+ corr20 = np.corrcoef(predY20,testY)[0,1]
+ incorrect = False
+ # msgs = []
+ if corr20 <= corr1:
+ incorrect = True
+ msgs.append(" Out-of-sample correlation for 20 bags is not greater than for 1 bag. 20 bags:{}, 1 bag:{}".format(corr20,corr1))
+ else:
+ points_earned += 2.0
+ # Check author string
+ if (author is None) or (author=='tb34'):
+ incorrect = True
+ msgs.append(" Invalid author: {}".format(author))
+ points_earned += -1.0
+ elif group is "InsaneLearner":
+ try:
+ def insane():
+ import InsaneLearner as it
+ learner = it.InsaneLearner(verbose=False)
+ learner.addEvidence(trainX,trainY)
+ Y = learner.query(testX)
+ run_with_timeout(insane,10,pos_args=(),keyword_args={})
+ incorrect = False
+ except Exception as e:
+ incorrect = True
+ msgs.append(" Exception calling InsaneLearner: {}".format(e))
+ points_earned = -10
+ elif group is "RandomName":
+ try:
+ il_name,il_code = gen_class()
+ exec(il_code) in globals(), locals()
+ il_cobj = eval(il_name)
+ def rnd_name():
+ np.random.seed(seed)
+ random.seed(seed)
+ np.random.seed=fake_seed
+ random.seed = fake_rseed
+ learner = BagLearner(learner=il_cobj,kwargs={'verbose':False},bags=20,boost=False,verbose=False)
+ learner.addEvidence(trainX,trainY)
+ Y = learner.query(testX)
+ np.random.seed = tmp_numpy_seed
+ random.seed = tmp_random_seed
+ return il_cobj.init_callcount_dict, il_cobj.add_callcount_dict, il_cobj.query_callcount_dict
+ iccd, accd, qccd = run_with_timeout(rnd_name,10,pos_args=(),keyword_args={})
+ incorrect = False
+ if (len(iccd)!=20) or (any([v!=1 for v in iccd.values()])):
+ incorrect = True
+ msgs.append(" Unexpected number of calls to __init__, sum={} (should be 20), max={} (should be 1), min={} (should be 1)".format(len(iccd),max(iccd.values()),min(iccd.values())))
+ points_earned = -10
+ if (len(accd)!=20) or (any([v!=1 for v in accd.values()])):
+ incorrect = True
+ msgs.append(" Unexpected number of calls to addEvidence sum={} (should be 20), max={} (should be 1), min={} (should be 1)".format(len(accd),max(accd.values()),min(accd.values())))
+ points_earned = -10
+ if (len(qccd)!=20) or (any([v!=1 for v in qccd.values()])):
+ incorrect = True
+ msgs.append(" Unexpected number of calls to query, sum={} (should be 20), max={} (should be 1), min={} (should be 1)".format(len(qccd),max(qccd.values()),min(qccd.values())))
+ points_earned = -10
+ except Exception as e:
+ incorrect = True
+ msgs.append(" Exception calling BagLearner: {}".format(e))
+ points_earned = -10
+ if incorrect:
+ inputs_str = " data file: {}\n" \
+ " permutation: {}".format(datafile, permutation)
+ raise IncorrectOutput("Test failed on one or more output criteria.\n Inputs:\n{}\n Failures:\n{}".format(inputs_str, "\n".join(msgs)))
+ except Exception as e:
+ # Test result: failed
+ msg = "Description: {} (group: {})\n".format(description, group)
+
+ # Generate a filtered stacktrace, only showing erroneous lines in student file(s)
+ tb_list = tb.extract_tb(sys.exc_info()[2])
+ for i in range(len(tb_list)):
+ row = tb_list[i]
+ tb_list[i] = (os.path.basename(row[0]), row[1], row[2], row[3]) # show only filename instead of long absolute path
+ tb_list = [row for row in tb_list if (row[0] == 'RTLearner.py') or (row[0] == 'BagLearner.py')]
+ if tb_list:
+ msg += "Traceback:\n"
+ msg += ''.join(tb.format_list(tb_list)) # contains newlines
+ msg += "{}: {}".format(e.__class__.__name__, str(e))
+
+ # Report failure result to grader, with stacktrace
+ grader.add_result(GradeResult(outcome='failed', points=points_earned, msg=msg))
+ raise
+ else:
+ # Test result: passed (no exceptions)
+ grader.add_result(GradeResult(outcome='passed', points=points_earned, msg=None))
+
+def gen_class():
+ c_def = "class {}(object):\n"
+ c_def+= " foo=4\n"
+ c_def+= " init_callcount_dict=dict()\n"
+ c_def+= " add_callcount_dict=dict()\n"
+ c_def+= " query_callcount_dict=dict()\n"
+ c_def+= " def __init__(self,**kwargs):\n"
+ c_def+= " self.ctor_args = kwargs\n"
+ c_def+= " self.init_callcount_dict[str(self)] = self.init_callcount_dict.get(str(self),0)+1\n"
+ c_def+= " if ('verbose' in self.ctor_args) and (self.ctor_args['verbose']==True):\n"
+ c_def+= " print('creating class')\n"
+ c_def+= " def addEvidence(self,trainX,trainY):\n"
+ c_def+= " self.trainX = trainX\n"
+ c_def+= " self.trainY = trainY\n"
+ c_def+= " self.add_callcount_dict[str(self)] = self.add_callcount_dict.get(str(self),0)+1\n"
+ c_def+= " if ('verbose' in self.ctor_args) and (self.ctor_args['verbose']==True):\n"
+ c_def+= " print('addEvidence()')\n"
+ c_def+= " def query(self,testX):\n"
+ c_def+= " rv = np.zeros(len(testX))\n"
+ c_def+= " rv[:] = self.trainY.mean()\n"
+ c_def+= " self.query_callcount_dict[str(self)] = self.query_callcount_dict.get(str(self),0)+1\n"
+ c_def+= " if ('verbose' in self.ctor_args) and (self.ctor_args['verbose']==True):\n"
+ c_def+= " print('query()')\n"
+ c_def+= " return rv"
+ c_name = ''.join(np.random.permutation(np.array(tuple(string.ascii_letters)))[:10].tolist())
+ return c_name,c_def.format(c_name)
+
+def fake_seed(*args):
+ pass
+def fake_rseed(*args):
+ pass
+
+if __name__ == "__main__":
+ pytest.main(["-s", __file__])
diff --git a/assess_learners/testlearner.py b/assess_learners/testlearner.py
new file mode 100644
index 0000000..d45854f
--- /dev/null
+++ b/assess_learners/testlearner.py
@@ -0,0 +1,74 @@
+"""
+Test a learner. (c) 2015 Tucker Balch
+
+Copyright 2018, Georgia Institute of Technology (Georgia Tech)
+Atlanta, Georgia 30332
+All Rights Reserved
+
+Template code for CS 4646/7646
+
+Georgia Tech asserts copyright ownership of this template and all derivative
+works, including solutions to the projects assigned in this course. Students
+and other users of this template code are advised not to share it with others
+or to make it available on publicly viewable websites including repositories
+such as github and gitlab. This copyright statement should not be removed
+or edited.
+
+We do grant permission to share solutions privately with non-students such
+as potential employers. However, sharing with other current or future
+students of CS 7646 is prohibited and subject to being investigated as a
+GT honor code violation.
+
+-----do not edit anything above this line---
+"""
+
+import numpy as np
+import math
+import LinRegLearner as lrl
+import DTLearner as dtl
+import sys
+
+if __name__=="__main__":
+ if len(sys.argv) != 2:
+ print("Usage: python testlearner.py ")
+ sys.exit(1)
+ inf = open(sys.argv[1])
+ data = np.array([list(map(float,s.strip().split(',')[1:]))
+ for s in inf.readlines()[1:]])
+
+ # compute how much of the data is training and testing
+ train_rows = int(0.6* data.shape[0])
+ test_rows = data.shape[0] - train_rows
+
+ # separate out training and testing data
+ trainX = data[:train_rows,0:-1]
+ trainY = data[:train_rows,-1]
+ testX = data[train_rows:,0:-1]
+ testY = data[train_rows:,-1]
+
+ print(f"{testX.shape}")
+ print(f"{testY.shape}")
+
+ # create a learner and train it
+ # learner = lrl.LinRegLearner(verbose = True) # create a LinRegLearner
+ learner = dtl.DTLearner(verbose = True) # create a LinRegLearner
+ learner.addEvidence(trainX, trainY) # train it
+ print(learner.author())
+
+ # evaluate in sample
+ predY = learner.query(trainX) # get the predictions
+ rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
+ print()
+ print("In sample results")
+ print(f"RMSE: {rmse}")
+ c = np.corrcoef(predY, y=trainY)
+ print(f"corr: {c[0,1]}")
+
+ # evaluate out of sample
+ predY = learner.query(testX) # get the predictions
+ rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
+ print()
+ print("Out of sample results")
+ print(f"RMSE: {rmse}")
+ c = np.corrcoef(predY, y=testY)
+ print(f"corr: {c[0,1]}")
diff --git a/zips/20Spring_assess_learners.zip b/zips/20Spring_assess_learners.zip
new file mode 100644
index 0000000..6f99552
Binary files /dev/null and b/zips/20Spring_assess_learners.zip differ