Compare commits

..

2 Commits

3 changed files with 300 additions and 216 deletions

View File

@@ -1,64 +1,108 @@
""" import numpy as np
A simple wrapper for linear regression. (c) 2015 Tucker Balch
Note, this is NOT a correct DTLearner; Replace with your own implementation.
Copyright 2018, Georgia Institute of Technology (Georgia Tech) class DTLearner:
Atlanta, Georgia 30332 LEAF = -1
All Rights Reserved NA = -1
Template code for CS 4646/7646 def __init__(self, leaf_size=1, verbose=False):
self.leaf_size = leaf_size
Georgia Tech asserts copyright ownership of this template and all derivative self.verbose = verbose
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others def author(self):
or to make it available on publicly viewable websites including repositories return 'felixm' # replace tb34 with your Georgia Tech username
such as github and gitlab. This copyright statement should not be removed
or edited. def create_node(self, factor, split_value, left, right):
return np.array([(factor, split_value, left, right), ],
We do grant permission to share solutions privately with non-students such dtype='|i4, f4, i4, i4')
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a def query_point(self, point):
GT honor code violation. node_index = 0
while self.rel_tree[node_index][0] != self.LEAF:
-----do not edit anything above this line--- node = self.rel_tree[node_index]
split_factor = node[0]
Student Name: Tucker Balch (replace with your name) split_value = node[1]
GT User ID: tb34 (replace with your User ID) if point[split_factor] <= split_value:
GT ID: 900897987 (replace with your GT ID) # Recurse into left sub-tree.
""" node_index += node[2]
else:
import numpy as np node_index += node[3]
import warnings v = self.rel_tree[node_index][1]
return v
class DTLearner(object):
def query(self, points):
def __init__(self, leaf_size=1, verbose = False): """
warnings.warn("\n\n WARNING! THIS IS NOT A CORRECT DTLearner IMPLEMENTATION! REPLACE WITH YOUR OWN CODE\n") @summary: Estimate a set of test points given the model we built.
pass # move along, these aren't the drones you're looking for @param points: should be a numpy array with each row corresponding to a specific query.
@returns the estimated values according to the saved model.
def author(self): """
return 'tb34' # replace tb34 with your Georgia Tech username def query_point(p): return self.query_point(p)
r = np.apply_along_axis(query_point, 1, points)
def addEvidence(self,dataX,dataY): return r
"""
@summary: Add training data to learner def build_tree(self, xs, y):
@param dataX: X values of data to add """
@param dataY: the Y training values @summary: Build a decision tree from the training data.
""" @param dataX: X values of data to add
@param dataY: the Y training values
# slap on 1s column so linear regression finds a constant term """
newdataX = np.ones([dataX.shape[0],dataX.shape[1]+1]) assert(xs.shape[0] == y.shape[0])
newdataX[:,0:dataX.shape[1]]=dataX assert(xs.shape[0] > 0) # If this is 0 something went wrong.
# build and save the model if xs.shape[0] <= self.leaf_size:
self.model_coefs, residuals, rank, s = np.linalg.lstsq(newdataX, dataY, rcond=None) value = np.mean(y)
return self.create_node(self.LEAF, value, self.NA, self.NA)
def query(self,points):
""" if np.all(y[0] == y):
@summary: Estimate a set of test points given the model we built. return self.create_node(self.LEAF, y[0], self.NA, self.NA)
@param points: should be a numpy array with each row corresponding to a specific query.
@returns the estimated values according to the saved model. i, split_value = self.get_i_and_split_value(xs, y)
""" select_l = xs[:, i] <= split_value
return (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1] select_r = xs[:, i] > split_value
lt = self.build_tree(xs[select_l], y[select_l])
if __name__=="__main__": rt = self.build_tree(xs[select_r], y[select_r])
print("the secret clue is 'zzyzx'") root = self.create_node(i, split_value, 1, lt.shape[0] + 1)
root = np.concatenate([root, lt, rt])
return root
def addEvidence(self, data_x, data_y):
    """
    @summary: Train the learner by building a decision tree from the data.
    @param data_x: X values of the training data; rows are samples and
        columns are features. Any array-like accepted by np.asarray works.
    @param data_y: the Y training values, one per row of data_x. Any
        array-like accepted by np.asarray works.
    """
    # Coerce once up front so nested lists work too; an existing ndarray
    # is passed through by np.asarray without being copied.
    xs = np.asarray(data_x)
    y = np.asarray(data_y)
    # The resulting node array is what query_point later walks.
    self.rel_tree = self.build_tree(xs, y)
def get_correlations(self, xs, y):
    """Rank the feature columns of xs by absolute correlation with y.

    @param xs: 2-D array; each column is one feature.
    @param y: target values, one per row of xs (assumed 1-D -- confirm
        against the caller).
    @returns a list of (abs_correlation, column_index) tuples sorted with
        the highest correlation first; equal correlations are ordered by
        descending column index.
    """
    correlations = []
    # A constant column has zero variance, so corrcoef divides 0 by 0 and
    # yields NaN. Silence that floating-point warning here and map the
    # NaN to 0.0 below: NaN compares False against everything, which
    # would otherwise leave the sort order undefined.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(xs.shape[1]):
            c = abs(np.corrcoef(xs[:, i], y=y)[0, 1])
            if np.isnan(c):
                c = 0.0
            correlations.append((c, i))
    correlations.sort(reverse=True)
    return correlations
def get_i_and_split_value(self, xs, y):
    """Choose the feature column and threshold for the next tree split.

    Features are tried in the order returned by get_correlations. The
    first one for which `xs[:, i] <= split_value` does not hold for
    every row is used.

    @param xs: 2-D array; each column is one feature.
    @param y: target values, one per row of xs.
    @returns a tuple (i, split_value): the chosen column index and the
        threshold to compare that column against.
    @raises ValueError: if no feature yields a threshold that keeps at
        least one row out of the left branch (for example when every
        column is constant, or xs has no columns).
    """
    for _, i in self.get_correlations(xs, y):
        column = xs[:, i]
        split_value = np.median(column)
        if (column <= split_value).all():
            # The median is also the maximum, so the right branch would
            # be empty. Pull the threshold down by averaging it with each
            # value that is still below it; a constant column has no such
            # value and leaves the threshold unchanged.
            for value in column:
                if value < split_value:
                    split_value = (value + split_value) / 2.0
        if not (column <= split_value).all():
            return i, split_value
    # Raised explicitly rather than asserted so the check is not stripped
    # when Python runs with -O.
    raise ValueError(
        "no feature can split the data: every candidate threshold sends "
        "all rows to the left branch")

View File

@@ -1,52 +1,88 @@
""" """
template for generating data to fool learners (c) 2016 Tucker Balch template for generating data to fool learners (c) 2016 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech) Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332 Atlanta, Georgia 30332
All Rights Reserved All Rights Reserved
Template code for CS 4646/7646 Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed such as github and gitlab. This copyright statement should not be removed
or edited. or edited.
We do grant permission to share solutions privately with non-students such We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation. GT honor code violation.
-----do not edit anything above this line--- -----do not edit anything above this line---
Student Name: Tucker Balch (replace with your name) Student Name: Tucker Balch (replace with your name)
GT User ID: tb34 (replace with your User ID) GT User ID: tb34 (replace with your User ID)
GT ID: 900897987 (replace with your GT ID) GT ID: 900897987 (replace with your GT ID)
""" """
import numpy as np import numpy as np
import math import pandas as pd
import math
# this function should return a dataset (X and Y) that will work
# better for linear regression than decision trees
def best4LinReg(seed=1489683273): def best4LinReg(seed=1489683273):
np.random.seed(seed) """
X = np.zeros((100,2)) This function should return a dataset (X and Y) that will work better for
Y = np.random.random(size = (100,))*200-100 linear regression than decision trees.
# Here's is an example of creating a Y from randomly generated
# X with multiple columns We make Y a simple linear combination of X. That will give the Linear
# Y = X[:,0] + np.sin(X[:,1]) + X[:,2]**2 + X[:,3]**3 Regression algorithm a very easy time (no RMSE at all) and beat the DT
return X, Y easily.
"""
def best4DT(seed=1489683273): np.random.seed(seed)
np.random.seed(seed) X = np.random.random(size=(100, 2)) * 200 - 100
X = np.zeros((100,2)) Y = X[:, 0] * -2 + X[:, 1] * 3
Y = np.random.random(size = (100,))*200-100 return X, Y
return X, Y
def author(): def best4DT(seed=1489683273):
return 'tb34' #Change this to your user ID """
This function should return a dataset that will work better for decision
if __name__=="__main__": trees than linear regression.
print("they call me Tim.") """
# Z = np.append(X, Y.reshape(Y.shape[0], 1), 1)
# pd.DataFrame(Z).to_csv("Z.csv", header=None, index=None)
# np.random.seed(seed)
# X = np.random.random(size=(100, 10))*1000-100
# Y = np.random.random(size=(100,))*1000-100
np.random.seed(seed)
# X_1 = np.random.random(size=(100, 1))*200-100
# X_2 = np.random.random(size=(100, 1))*200-100
# X_3 = np.random.random(size=(100, 1))*200-100
# X_4 = np.random.random(size=(100, 1))*200-100
# X = np.concatenate([X_1, X_2, X_3, X_4], 1)
# XXX: I honestly don't know how to help the DTLearner, yet.
X_1 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
X_2 = np.asarray([i for i in range(100, 1100, 10)]).reshape(100, 1)
X_3 = np.asarray([i for i in range(200, 300)]).reshape(100, 1)
X_4 = np.asarray([i for i in range(300, 400)]).reshape(100, 1)
X_5 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
X_6 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
X_7 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
X_8 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
X = np.concatenate([X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8], 1)
# Y = X[:, 0] * 2 + X[:, 1] * 3
Y = np.random.random(size=(100,)) * 200 - 100
return X, Y
def author():
    """Return the user ID of the author of this module."""
    user_id = 'felixm'
    return user_id
if __name__ == "__main__":
    # Runs only when the file is executed as a script, not on import.
    print("they call me Tim.")

View File

@@ -1,100 +1,104 @@
""" """
Test best4 data generator. (c) 2016 Tucker Balch Test best4 data generator. (c) 2016 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech) Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332 Atlanta, Georgia 30332
All Rights Reserved All Rights Reserved
Template code for CS 4646/7646 Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed such as github and gitlab. This copyright statement should not be removed
or edited. or edited.
We do grant permission to share solutions privately with non-students such We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation. GT honor code violation.
-----do not edit anything above this line--- -----do not edit anything above this line---
""" """
import numpy as np import numpy as np
import math import math
import LinRegLearner as lrl import LinRegLearner as lrl
import DTLearner as dt import DTLearner as dt
from gen_data import best4LinReg, best4DT from gen_data import best4LinReg, best4DT
# compare two learners' rmse out of sample # compare two learners' rmse out of sample
def compare_os_rmse(learner1, learner2, X, Y):
# compute how much of the data is training and testing def compare_os_rmse(learner1, learner2, X, Y):
train_rows = int(math.floor(0.6* X.shape[0]))
test_rows = X.shape[0] - train_rows # compute how much of the data is training and testing
train_rows = int(math.floor(0.6 * X.shape[0]))
# separate out training and testing data test_rows = X.shape[0] - train_rows
train = np.random.choice(X.shape[0], size=train_rows, replace=False)
test = np.setdiff1d(np.array(range(X.shape[0])), train) # separate out training and testing data
trainX = X[train, :] train = np.random.choice(X.shape[0], size=train_rows, replace=False)
trainY = Y[train] test = np.setdiff1d(np.array(range(X.shape[0])), train)
testX = X[test, :] trainX = X[train, :]
testY = Y[test] trainY = Y[train]
testX = X[test, :]
# train the learners testY = Y[test]
learner1.addEvidence(trainX, trainY) # train it
learner2.addEvidence(trainX, trainY) # train it # train the learners
learner1.addEvidence(trainX, trainY) # train it
# evaluate learner1 out of sample learner2.addEvidence(trainX, trainY) # train it
predY = learner1.query(testX) # get the predictions
rmse1 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) # evaluate learner1 out of sample
predY = learner1.query(testX) # get the predictions
# evaluate learner2 out of sample rmse1 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
predY = learner2.query(testX) # get the predictions
rmse2 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) # evaluate learner2 out of sample
predY = learner2.query(testX) # get the predictions
return rmse1, rmse2 rmse2 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
def test_code(): return rmse1, rmse2
# create two learners and get data
lrlearner = lrl.LinRegLearner(verbose = False) def test_code():
dtlearner = dt.DTLearner(verbose = False, leaf_size = 1)
X, Y = best4LinReg() # create two learners and get data
lrlearner = lrl.LinRegLearner(verbose=False)
# compare the two learners dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y) X, Y = best4LinReg()
# share results # compare the two learners
print() rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y)
print("best4LinReg() results")
print(f"RMSE LR : {rmseLR}") # share results
print(f"RMSE DT : {rmseDT}") print()
if rmseLR < 0.9 * rmseDT: print("best4LinReg() results")
print("LR < 0.9 DT: pass") print(f"RMSE LR : {rmseLR}")
else: print(f"RMSE DT : {rmseDT}")
print("LR >= 0.9 DT: fail") if rmseLR < 0.9 * rmseDT:
print print("LR < 0.9 DT: pass")
else:
# get data that is best for a random tree print("LR >= 0.9 DT: fail")
lrlearner = lrl.LinRegLearner(verbose = False) print
dtlearner = dt.DTLearner(verbose = False, leaf_size = 1)
X, Y = best4DT() # get data that is best for a random tree
lrlearner = lrl.LinRegLearner(verbose=False)
# compare the two learners dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y) X, Y = best4DT()
# share results # compare the two learners
print() rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y)
print("best4RT() results")
print(f"RMSE LR : {rmseLR}") # share results
print(f"RMSE DT : {rmseDT}") print()
if rmseDT < 0.9 * rmseLR: print("best4RT() results")
print("DT < 0.9 LR: pass") print(f"RMSE LR : {rmseLR}")
else: print(f"RMSE DT : {rmseDT}")
print("DT >= 0.9 LR: fail") if rmseDT < 0.9 * rmseLR:
print print("DT < 0.9 LR: pass")
else:
if __name__=="__main__": print("DT >= 0.9 LR: fail")
test_code() print
if __name__ == "__main__":
    # Run the learner comparison only when executed as a script.
    test_code()