Start working on project assess learners.

This commit is contained in:
2020-09-21 22:15:46 -04:00
parent 927c5eb9de
commit 9697add7a6
7 changed files with 677 additions and 2 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,5 @@
__pycache__ __pycache__
assess_learners/Data
data data
grading grading
util.py util.py

View File

@@ -29,8 +29,7 @@ exercises. This makes sure that you do not override any of the existing files. I
might add a makefile to automate this later. might add a makefile to automate this later.
``` ```
unzip -n zips/20Spring_martingale.zip -d ./ unzip -n zips/*.zip -d ./
unzip -n zips/19fall_optimize_something.zip -d ./
``` ```
# Reports # Reports

View File

@@ -0,0 +1,36 @@
import numpy as np
class DTLearner(object):
    """Decision-tree learner stub.

    NOTE: the current implementation is a placeholder that fits an
    ordinary least-squares linear model (identical behavior to
    LinRegLearner); the actual decision tree is not implemented yet.
    `leaf_size` is accepted for interface compatibility but unused.
    """

    def __init__(self, leaf_size=1, verbose=False):
        # No state to set up for the linear-regression stand-in.
        pass

    def author(self):
        """Return the author's Georgia Tech username."""
        return 'felixm'

    def addEvidence(self, dataX, dataY):
        """Add training data to the learner.

        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # Append a column of ones so the least-squares fit includes a
        # constant (intercept) term.
        augmented = np.hstack([dataX, np.ones((dataX.shape[0], 1))])
        self.model_coefs, _residuals, _rank, _sv = np.linalg.lstsq(
            augmented, dataY, rcond=None)

    def query(self, points):
        """Estimate a set of test points given the model we built.

        @param points: numpy array with one query per row
        @returns the estimated values according to the saved model
        """
        weights = self.model_coefs[:-1]
        intercept = self.model_coefs[-1]
        return points.dot(weights) + intercept
if __name__ == "__main__":
    # Running this module directly does nothing useful; it only prints the
    # course's canary string (used by the autograder template).
    print("the secret clue is 'zzyzx'")

View File

@@ -0,0 +1,58 @@
"""
A simple wrapper for linear regression. (c) 2015 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
"""
import numpy as np
class LinRegLearner(object):
    """Ordinary least-squares linear-regression learner.

    Fits y = w . x + b via numpy's lstsq and predicts with the saved
    coefficients.  `verbose` is accepted for interface parity only.
    """

    def __init__(self, verbose=False):
        # Nothing to configure up front.
        pass

    def author(self):
        """Return the author's Georgia Tech username."""
        return 'tb34'

    def addEvidence(self, dataX, dataY):
        """Add training data to the learner.

        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        n_rows = dataX.shape[0]
        # Tack a ones column onto X so lstsq also solves for the
        # constant (intercept) term.
        design = np.column_stack((dataX, np.ones(n_rows)))
        solution = np.linalg.lstsq(design, dataY, rcond=None)
        self.model_coefs = solution[0]

    def query(self, points):
        """Estimate a set of test points given the model we built.

        @param points: numpy array with one query per row
        @returns the estimated values according to the saved model
        """
        weights = self.model_coefs[:-1]
        bias = self.model_coefs[-1]
        return (points * weights).sum(axis=1) + bias
if __name__ == "__main__":
    # Running this module directly does nothing useful; it only prints the
    # course's canary string (used by the autograder template).
    print("the secret clue is 'zzyzx'")

View File

@@ -0,0 +1,507 @@
"""MC3-P1: Assess learners - grading script.
Usage:
- Switch to a student feedback directory first (will write "points.txt" and "comments.txt" in pwd).
- Run this script with both ml4t/ and student solution in PYTHONPATH, e.g.:
PYTHONPATH=ml4t:MC3-P1/jdoe7 python ml4t/mc3_p1_grading/grade_learners.py
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
"""
import pytest
from grading.grading import grader, GradeResult, time_limit, run_with_timeout, IncorrectOutput
import util
import os
import sys
import traceback as tb
import numpy as np
import pandas as pd
from collections import namedtuple
import math
import string
import time
import random
# Grading parameters (picked up by module-level grading fixtures)
max_points = 50.0  # 3.0*5 + 3.0*5 + 2.0*10 = 50
html_pre_block = True  # surround comments with HTML <pre> tag (for T-Square comments field)

# Test cases
LearningTestCase = namedtuple('LearningTestCase', ['description', 'group', 'datafile', 'seed', 'outputs'])

# Correlation thresholds shared by all deterministic/random tree cases.
_TREE_OUTPUTS = dict(
    insample_corr_min=0.95,
    outsample_corr_min=0.15,
    insample_corr_max=0.95,
)


def _numbered_cases(label, group, count, outputs):
    """Build `count` Istanbul.csv test cases with sequential seeds.

    Each case gets a fresh copy of `outputs` (or None) so later mutation of
    one case cannot leak into another.
    """
    return [
        LearningTestCase(
            description="Test Case {:02d}: {}".format(i, label),
            group=group,
            datafile='Istanbul.csv',
            seed=1481090000 + i,
            outputs=dict(outputs) if outputs is not None else None,
        )
        for i in range(1, count + 1)
    ]


learning_test_cases = (
    # DTLearner test cases
    _numbered_cases("Deterministic Tree", 'DTLearner', 4, _TREE_OUTPUTS)
    # RTLearner test cases
    + _numbered_cases("Random Tree", 'RTLearner', 4, _TREE_OUTPUTS)
    # Bagging test cases
    + _numbered_cases("Bagging", 'BagLearner', 8, None)
    # RandomName + InsaneLearner
    + [
        LearningTestCase(
            description="InsaneLearner Test Case",
            group='InsaneLearner',
            datafile='simple.csv',
            seed=1498076428,
            outputs=None,
        ),
        LearningTestCase(
            description="Random Classname Test Case",
            group='RandomName',
            datafile='simple.csv',
            seed=1498076428,
            outputs=None,
        ),
    ]
)
# Test function(s)
@pytest.mark.parametrize("description,group,datafile,seed,outputs", learning_test_cases)
def test_learners(description, group, datafile, seed, outputs, grader):
    """Test that ML learners return correct predictions.

    Requires test description, test case group, inputs, expected outputs,
    and a grader fixture.  Partial credit accumulates in `points_earned`
    and the result is reported to the grader; failures re-raise so pytest
    records them too.
    """
    points_earned = 0.0  # initialize points for this test case
    try:
        # (BPH) Copied from grade_strategy_qlearning.py
        # Set a fixed seed for repeatability.
        np.random.seed(seed)
        random.seed(seed)
        # Remove the student code's ability to re-seed either np.random or
        # python's random during the run; the real functions are restored
        # afterwards.
        tmp_numpy_seed = np.random.seed
        tmp_random_seed = random.seed
        np.random.seed = fake_seed
        random.seed = fake_rseed
        # Import student learners once.
        # BUGFIX: use '==' / 'in' instead of 'is' for string comparison --
        # identity of equal string literals is an interpreter detail and
        # raises SyntaxWarning on modern CPython.
        if 'RTLearner' not in globals():
            from RTLearner import RTLearner
        if 'DTLearner' not in globals():
            from DTLearner import DTLearner
        # BUGFIX: the original condition "A or B or C and D" bound the
        # globals() check to the last clause only (operator precedence);
        # parenthesize so BagLearner is imported once for all three groups.
        if (group in ('BagLearner', 'InsaneLearner', 'RandomName')
                and 'BagLearner' not in globals()):
            from BagLearner import BagLearner
        # Put the real seed functions back for the moment.
        np.random.seed = tmp_numpy_seed
        random.seed = tmp_random_seed
        # Read the data and split 60/40 into train/test with a seeded row
        # permutation; columns (features) are permuted too so learners
        # cannot rely on column order.
        testX, testY, trainX, trainY = None, None, None, None
        permutation = None
        author = None
        with util.get_learner_data_file(datafile) as f:
            alldata = np.genfromtxt(f, delimiter=',')
            # Skip the date column and header row if we're working on
            # Istanbul data.
            if datafile == 'Istanbul.csv':
                alldata = alldata[1:, 1:]
            datasize = alldata.shape[0]
            cutoff = int(datasize * 0.6)
            permutation = np.random.permutation(alldata.shape[0])
            col_permutation = np.random.permutation(alldata.shape[1] - 1)
            train_data = alldata[permutation[:cutoff], :]
            trainX = train_data[:, col_permutation]
            trainY = train_data[:, -1]
            test_data = alldata[permutation[cutoff:], :]
            testX = test_data[:, col_permutation]
            testY = test_data[:, -1]
        msgs = []
        incorrect = False  # defensive default; branches set it explicitly
        if (group == "RTLearner") or (group == "DTLearner"):
            clss_name = RTLearner if group == "RTLearner" else DTLearner
            tree_sptc = 3 if group == "RTLearner" else 10  # per-call time limit (s)
            corr_in, corr_out, corr_in_50 = None, None, None

            def oneleaf():
                # Train/query with leaf_size=1 under the fixed, guarded seed.
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner = clss_name(leaf_size=1, verbose=False)
                learner.addEvidence(trainX, trainY)
                insample = learner.query(trainX)
                outsample = learner.query(testX)
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                author_rv = None
                try:
                    author_rv = learner.author()
                except Exception:
                    pass  # best-effort: a missing author() is penalized below
                return insample, outsample, author_rv

            def fiftyleaves():
                # Train/query with leaf_size=50; should underfit in-sample.
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner = clss_name(leaf_size=50, verbose=False)
                learner.addEvidence(trainX, trainY)
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                return learner.query(trainX)

            predY_in, predY_out, author = run_with_timeout(oneleaf, tree_sptc, (), {})
            predY_in_50 = run_with_timeout(fiftyleaves, tree_sptc, (), {})
            corr_in = np.corrcoef(predY_in, y=trainY)[0, 1]
            corr_out = np.corrcoef(predY_out, y=testY)[0, 1]
            corr_in_50 = np.corrcoef(predY_in_50, y=trainY)[0, 1]
            incorrect = False
            if corr_in < outputs['insample_corr_min'] or np.isnan(corr_in):
                incorrect = True
                msgs.append(" In-sample with leaf_size=1 correlation less than allowed: got {} expected {}".format(corr_in, outputs['insample_corr_min']))
            else:
                points_earned += 1.0
            if corr_out < outputs['outsample_corr_min'] or np.isnan(corr_out):
                incorrect = True
                msgs.append(" Out-of-sample correlation less than allowed: got {} expected {}".format(corr_out, outputs['outsample_corr_min']))
            else:
                points_earned += 1.0
            if corr_in_50 > outputs['insample_corr_max'] or np.isnan(corr_in_50):
                incorrect = True
                msgs.append(" In-sample correlation with leaf_size=50 greater than allowed: got {} expected {}".format(corr_in_50, outputs['insample_corr_max']))
            else:
                points_earned += 1.0
            # Check author string (template default 'tb34' is not accepted).
            if (author is None) or (author == 'tb34'):
                incorrect = True
                msgs.append(" Invalid author: {}".format(author))
                points_earned += -2.0
        elif group == "BagLearner":
            corr1, corr20 = None, None
            bag_sptc = 10  # per-call time limit (s)

            def onebag():
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner1 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1}, bags=1, boost=False, verbose=False)
                learner1.addEvidence(trainX, trainY)
                q_rv = learner1.query(testX)
                a_rv = learner1.author()
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                return q_rv, a_rv

            def twentybags():
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner20 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1}, bags=20, boost=False, verbose=False)
                learner20.addEvidence(trainX, trainY)
                q_rv = learner20.query(testX)
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                return q_rv

            predY1, author = run_with_timeout(onebag, bag_sptc, pos_args=(), keyword_args={})
            predY20 = run_with_timeout(twentybags, bag_sptc, (), {})
            corr1 = np.corrcoef(predY1, testY)[0, 1]
            corr20 = np.corrcoef(predY20, testY)[0, 1]
            incorrect = False
            # More bags should reduce variance and improve out-of-sample fit.
            if corr20 <= corr1:
                incorrect = True
                msgs.append(" Out-of-sample correlation for 20 bags is not greater than for 1 bag. 20 bags:{}, 1 bag:{}".format(corr20, corr1))
            else:
                points_earned += 2.0
            # Check author string
            if (author is None) or (author == 'tb34'):
                incorrect = True
                msgs.append(" Invalid author: {}".format(author))
                points_earned += -1.0
        elif group == "InsaneLearner":
            try:
                def insane():
                    import InsaneLearner as it
                    learner = it.InsaneLearner(verbose=False)
                    learner.addEvidence(trainX, trainY)
                    Y = learner.query(testX)
                run_with_timeout(insane, 10, pos_args=(), keyword_args={})
                incorrect = False
            except Exception as e:
                incorrect = True
                msgs.append(" Exception calling InsaneLearner: {}".format(e))
                points_earned = -10
        elif group == "RandomName":
            try:
                il_name, il_code = gen_class()
                # BUGFIX: "exec(il_code) in globals(), locals()" was a
                # mistranslated Python-2 exec statement; execute the
                # generated source into module globals explicitly so the
                # class is visible to eval() below.
                exec(il_code, globals())
                il_cobj = eval(il_name)

                def rnd_name():
                    np.random.seed(seed)
                    random.seed(seed)
                    np.random.seed = fake_seed
                    random.seed = fake_rseed
                    learner = BagLearner(learner=il_cobj, kwargs={'verbose': False}, bags=20, boost=False, verbose=False)
                    learner.addEvidence(trainX, trainY)
                    Y = learner.query(testX)
                    np.random.seed = tmp_numpy_seed
                    random.seed = tmp_random_seed
                    return il_cobj.init_callcount_dict, il_cobj.add_callcount_dict, il_cobj.query_callcount_dict

                iccd, accd, qccd = run_with_timeout(rnd_name, 10, pos_args=(), keyword_args={})
                incorrect = False
                # BagLearner must construct, train and query exactly 20
                # distinct learner instances, each exactly once.
                if (len(iccd) != 20) or (any([v != 1 for v in iccd.values()])):
                    incorrect = True
                    msgs.append(" Unexpected number of calls to __init__, sum={} (should be 20), max={} (should be 1), min={} (should be 1)".format(len(iccd), max(iccd.values()), min(iccd.values())))
                    points_earned = -10
                if (len(accd) != 20) or (any([v != 1 for v in accd.values()])):
                    incorrect = True
                    msgs.append(" Unexpected number of calls to addEvidence sum={} (should be 20), max={} (should be 1), min={} (should be 1)".format(len(accd), max(accd.values()), min(accd.values())))
                    points_earned = -10
                if (len(qccd) != 20) or (any([v != 1 for v in qccd.values()])):
                    incorrect = True
                    msgs.append(" Unexpected number of calls to query, sum={} (should be 20), max={} (should be 1), min={} (should be 1)".format(len(qccd), max(qccd.values()), min(qccd.values())))
                    points_earned = -10
            except Exception as e:
                incorrect = True
                msgs.append(" Exception calling BagLearner: {}".format(e))
                points_earned = -10
        if incorrect:
            inputs_str = " data file: {}\n" \
                         " permutation: {}".format(datafile, permutation)
            raise IncorrectOutput("Test failed on one or more output criteria.\n Inputs:\n{}\n Failures:\n{}".format(inputs_str, "\n".join(msgs)))
    except Exception as e:
        # Test result: failed
        msg = "Description: {} (group: {})\n".format(description, group)
        # Generate a filtered stacktrace, only showing erroneous lines in
        # student file(s).
        tb_list = tb.extract_tb(sys.exc_info()[2])
        for i in range(len(tb_list)):
            row = tb_list[i]
            # Show only the filename instead of a long absolute path.
            tb_list[i] = (os.path.basename(row[0]), row[1], row[2], row[3])
        # BUGFIX: include DTLearner.py -- this grader tests DTLearner too.
        tb_list = [row for row in tb_list
                   if row[0] in ('RTLearner.py', 'DTLearner.py', 'BagLearner.py')]
        if tb_list:
            msg += "Traceback:\n"
            msg += ''.join(tb.format_list(tb_list))  # contains newlines
        msg += "{}: {}".format(e.__class__.__name__, str(e))
        # Report the failure result to the grader, with stacktrace.
        grader.add_result(GradeResult(outcome='failed', points=points_earned, msg=msg))
        raise
    else:
        # Test result: passed (no exceptions)
        grader.add_result(GradeResult(outcome='passed', points=points_earned, msg=None))
def gen_class():
    """Return (name, source) for a learner class with a random 10-letter name.

    The generated class counts calls to __init__, addEvidence and query in
    class-level dicts keyed by str(instance), so the grader can verify that
    BagLearner constructed, trained and queried exactly 20 learners.

    NOTE(review): the indentation inside the generated source strings below
    appears to have been collapsed (likely lost in transit) -- confirm the
    emitted code is valid Python against the original file before relying
    on it.
    """
    # Build the class body line by line; '{}' is filled with the random name.
    c_def = "class {}(object):\n"
    c_def += " foo=4\n"
    c_def += " init_callcount_dict=dict()\n"
    c_def += " add_callcount_dict=dict()\n"
    c_def += " query_callcount_dict=dict()\n"
    c_def += " def __init__(self,**kwargs):\n"
    c_def += " self.ctor_args = kwargs\n"
    c_def += " self.init_callcount_dict[str(self)] = self.init_callcount_dict.get(str(self),0)+1\n"
    c_def += " if ('verbose' in self.ctor_args) and (self.ctor_args['verbose']==True):\n"
    c_def += " print('creating class')\n"
    c_def += " def addEvidence(self,trainX,trainY):\n"
    c_def += " self.trainX = trainX\n"
    c_def += " self.trainY = trainY\n"
    c_def += " self.add_callcount_dict[str(self)] = self.add_callcount_dict.get(str(self),0)+1\n"
    c_def += " if ('verbose' in self.ctor_args) and (self.ctor_args['verbose']==True):\n"
    c_def += " print('addEvidence()')\n"
    c_def += " def query(self,testX):\n"
    c_def += " rv = np.zeros(len(testX))\n"
    c_def += " rv[:] = self.trainY.mean()\n"
    c_def += " self.query_callcount_dict[str(self)] = self.query_callcount_dict.get(str(self),0)+1\n"
    c_def += " if ('verbose' in self.ctor_args) and (self.ctor_args['verbose']==True):\n"
    c_def += " print('query()')\n"
    c_def += " return rv"
    # Random 10-letter name: permute the 52 ascii letters and take the
    # first ten (depends on the np.random state seeded by the caller).
    c_name = ''.join(np.random.permutation(np.array(tuple(string.ascii_letters)))[:10].tolist())
    return c_name, c_def.format(c_name)
def fake_seed(*args):
    """No-op stand-in for np.random.seed: swallows any arguments so student
    code cannot re-seed the generator while the grader holds it fixed."""
    return None
def fake_rseed(*args):
    """No-op stand-in for python's random.seed: accepts and ignores any
    arguments so student code cannot defeat the grader's fixed seed."""
    return None
if __name__ == "__main__":
    # Allow running the grading script directly; "-s" disables pytest's
    # output capture so grader messages are shown live.
    pytest.main(["-s", __file__])

View File

@@ -0,0 +1,74 @@
"""
Test a learner. (c) 2015 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
"""
import numpy as np
import math
import LinRegLearner as lrl
import DTLearner as dtl
import sys
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python testlearner.py <filename>")
        sys.exit(1)
    # Read the data file, skipping the header row and the first (date)
    # column -- assumes an Istanbul.csv-style layout; TODO confirm for
    # other data files.
    # BUGFIX: use a context manager so the file handle is always closed.
    with open(sys.argv[1]) as inf:
        data = np.array([list(map(float, s.strip().split(',')[1:]))
                         for s in inf.readlines()[1:]])
    # Compute how much of the data is training and testing.
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows
    # Separate out training and testing data.
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]
    print(f"{testX.shape}")
    print(f"{testY.shape}")
    # Create a learner and train it.
    # learner = lrl.LinRegLearner(verbose = True) # create a LinRegLearner
    learner = dtl.DTLearner(verbose=True)  # create a DTLearner (comment fixed: was "LinRegLearner")
    learner.addEvidence(trainX, trainY)  # train it
    print(learner.author())
    # Evaluate in sample.
    predY = learner.query(trainX)  # get the predictions
    rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
    print()
    print("In sample results")
    print(f"RMSE: {rmse}")
    c = np.corrcoef(predY, y=trainY)
    print(f"corr: {c[0,1]}")
    # Evaluate out of sample.
    predY = learner.query(testX)  # get the predictions
    rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
    print()
    print("Out of sample results")
    print(f"RMSE: {rmse}")
    c = np.corrcoef(predY, y=testY)
    print(f"corr: {c[0,1]}")

Binary file not shown.