1
0
Fork 0

Change best for LinReg to return optimal data

master
Felix Martin 2020-10-05 17:27:09 -04:00
parent a662e302db
commit 381670705b
2 changed files with 192 additions and 152 deletions

View File

@ -1,52 +1,88 @@
"""
template for generating data to fool learners (c) 2016 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
Student Name: Tucker Balch (replace with your name)
GT User ID: tb34 (replace with your User ID)
GT ID: 900897987 (replace with your GT ID)
"""
import numpy as np
import math
# this function should return a dataset (X and Y) that will work
# better for linear regression than decision trees
def best4LinReg(seed=1489683273):
    """Return a dataset (X, Y) meant to suit linear regression better than a tree.

    The previous body returned an all-zero X with a random Y, so the features
    carried no information about the target. Here Y is an exact affine
    function of the two feature columns, which a least-squares fit can
    reproduce without error.
    NOTE(review): a decision-tree learner is presumed to predict
    piecewise-constant values and therefore only approximate this plane;
    confirm against the DTLearner used by the test script.

    Args:
        seed: Seed for NumPy's global random generator. The same seed always
            yields the same dataset; different seeds yield different features.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape (100, 2) with values drawn
        uniformly from [-100, 100); ``Y`` has shape (100,).
    """
    np.random.seed(seed)
    # Spread the features over a wide interval so both columns vary.
    X = np.random.random(size=(100, 2)) * 200 - 100
    # Exact linear relationship (plus an intercept) with no noise term.
    Y = 4.0 * X[:, 0] - 7.0 * X[:, 1] + 10.0
    return X, Y
def best4DT(seed=1489683273):
    """Return a dataset (X, Y) meant to suit a decision tree better than linear regression.

    The previous body returned an all-zero X with a random Y, which gives no
    learner anything to split or fit on. Here both feature columns are affine
    functions of one hidden variable and Y is the square of that variable:
    a symmetric parabola that a straight line cannot follow.
    NOTE(review): the tree learner is presumed to recover the curve by
    repeatedly splitting on either column; confirm against the DTLearner used
    by the test script.

    Args:
        seed: Seed for NumPy's global random generator. The same seed always
            yields the same dataset; different seeds yield different data.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape (100, 2); ``Y`` has shape (100,).
    """
    np.random.seed(seed)
    # Single hidden driver, uniform on [-100, 100).
    latent = np.random.random(size=(100,)) * 200 - 100
    # Both columns order the rows identically, so the data is effectively 1-D.
    X = np.column_stack((latent, 0.5 * latent + 25.0))
    # Symmetric around zero, hence close to uncorrelated with either column.
    Y = latent ** 2
    return X, Y
def author():
    """Return the user ID string identifying the author of this file."""
    user_id = 'tb34'  # Change this to your user ID
    return user_id
# Script entry point: prints a greeting when the file is executed directly;
# importing the module prints nothing.
if __name__ == "__main__":
    print("they call me Tim.")
"""
template for generating data to fool learners (c) 2016 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
Student Name: Tucker Balch (replace with your name)
GT User ID: tb34 (replace with your User ID)
GT ID: 900897987 (replace with your GT ID)
"""
import numpy as np
import pandas as pd
import math
def best4LinReg(seed=1489683273):
    """
    Build a dataset (X, Y) intended to favour linear regression over a
    decision tree.

    The target is a noise-free linear combination of the two feature columns
    (-2 times the first plus 3 times the second), so a linear model can match
    it exactly.

    seed -- value passed to np.random.seed; equal seeds give equal data.

    Returns (X, Y) where X has shape (100, 2) with entries in [-100, 100) and
    Y has shape (100,).
    """
    np.random.seed(seed)
    # Draw unit-interval samples first, then stretch them to [-100, 100).
    features = np.random.random(size=(100, 2))
    features = features * 200 - 100
    first_col, second_col = features[:, 0], features[:, 1]
    target = 3 * second_col - 2 * first_col
    return features, target
def best4DT(seed=1489683273):
    """
    Return a dataset (X, Y) intended to favour a decision tree over linear
    regression.

    The previous body paired fixed integer ramps with a Y of pure random
    noise, so the target was unrelated to the features and X ignored the
    seed. Here every one of the eight columns is an affine function of a
    single seeded hidden variable, and Y is the square of that variable: a
    symmetric parabola that a linear fit cannot follow.
    NOTE(review): the tree learner is presumed to recover the curve by
    splitting on any column, since all columns order the rows the same way
    (up to direction); confirm against the DTLearner used by the test script.

    seed -- value passed to np.random.seed; equal seeds give equal data,
            different seeds give different data.

    Returns (X, Y) where X has shape (100, 8) and Y has shape (100,).
    """
    np.random.seed(seed)
    # Single hidden driver, uniform on [-100, 100).
    latent = np.random.random(size=(100,)) * 200 - 100
    # One (scale, offset) pair per column; column 0 is the driver itself.
    scales = np.array([1.0, 10.0, 1.0, 1.0, -1.0, 0.5, 2.0, -3.0])
    offsets = np.array([0.0, 100.0, 200.0, 300.0, 0.0, 50.0, -25.0, 10.0])
    # Broadcast (100, 1) against (8,) to build all columns at once.
    X = latent[:, np.newaxis] * scales + offsets
    # Symmetric around zero, hence close to uncorrelated with every column.
    Y = latent ** 2
    return X, Y
def author():
    """Return the user ID string identifying the author of this file."""
    user_id = 'felixm'  # Change this to your user ID
    return user_id
# Script entry point: prints a greeting when the file is executed directly;
# importing the module prints nothing.
if __name__ == "__main__":
    print("they call me Tim.")

View File

@ -1,100 +1,104 @@
"""
Test best4 data generator. (c) 2016 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
"""
import numpy as np
import math
import LinRegLearner as lrl
import DTLearner as dt
from gen_data import best4LinReg, best4DT
# compare two learners' rmse out of sample
def compare_os_rmse(learner1, learner2, X, Y):
    """Train two learners on one random split and return their out-of-sample RMSEs.

    floor(60%) of the rows are drawn without replacement (using NumPy's
    global random state) for training; the remaining rows form the test set.
    Both learners are trained before either one is queried.

    Args:
        learner1: Object exposing ``addEvidence(X, Y)`` and ``query(X)``.
        learner2: Object exposing ``addEvidence(X, Y)`` and ``query(X)``.
        X: 2-D feature array, indexed by row.
        Y: 1-D target array aligned with the rows of ``X``.

    Returns:
        Tuple ``(rmse1, rmse2)`` of floats, one per learner, measured on the
        held-out rows.
    """
    n_rows = X.shape[0]
    n_train = int(math.floor(0.6 * n_rows))

    # Random training rows; everything not picked is held out for testing.
    train_idx = np.random.choice(n_rows, size=n_train, replace=False)
    test_idx = np.setdiff1d(np.array(range(n_rows)), train_idx)

    train_x, train_y = X[train_idx, :], Y[train_idx]
    test_x, test_y = X[test_idx, :], Y[test_idx]

    contenders = (learner1, learner2)
    for learner in contenders:
        learner.addEvidence(train_x, train_y)

    scores = []
    for learner in contenders:
        residual = test_y - learner.query(test_x)
        scores.append(math.sqrt((residual ** 2).sum() / test_y.shape[0]))
    return scores[0], scores[1]
def test_code():
    """Compare LinRegLearner and DTLearner on both generated datasets and print verdicts.

    For ``best4LinReg()`` the check passes when the linear-regression RMSE is
    below 0.9 times the decision-tree RMSE; for ``best4DT()`` the roles are
    swapped. Fresh learners (DTLearner with ``leaf_size=1``) are created for
    each dataset. Nothing is returned; results go to standard output.
    """
    # --- dataset that should favour linear regression ---
    lrlearner = lrl.LinRegLearner(verbose=False)
    dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
    X, Y = best4LinReg()
    rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y)

    print()
    print("best4LinReg() results")
    print(f"RMSE LR : {rmseLR}")
    print(f"RMSE DT : {rmseDT}")
    if rmseLR < 0.9 * rmseDT:
        print("LR < 0.9 DT: pass")
    else:
        print("LR >= 0.9 DT: fail")
    # A bare `print` is a no-op expression in Python 3; call it to emit the
    # intended blank separator line.
    print()

    # --- dataset that should favour the decision tree ---
    lrlearner = lrl.LinRegLearner(verbose=False)
    dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
    X, Y = best4DT()
    rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y)

    print()
    # Heading names the generator actually called (best4DT, not best4RT).
    print("best4DT() results")
    print(f"RMSE LR : {rmseLR}")
    print(f"RMSE DT : {rmseDT}")
    if rmseDT < 0.9 * rmseLR:
        print("DT < 0.9 LR: pass")
    else:
        print("DT >= 0.9 LR: fail")
    print()
# Run the learner comparison only when this file is executed as a script.
if __name__ == "__main__":
    test_code()
"""
Test best4 data generator. (c) 2016 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
"""
import numpy as np
import math
import LinRegLearner as lrl
import DTLearner as dt
from gen_data import best4LinReg, best4DT
# compare two learners' rmse out of sample
def compare_os_rmse(learner1, learner2, X, Y):
    """
    Fit two learners on the same random 60/40 split and report each one's
    root-mean-square error on the held-out rows.

    The training rows are floor(0.6 * rows) indices sampled without
    replacement from NumPy's global random state; the test rows are all the
    others. Both learners must expose addEvidence(X, Y) and query(X), and
    both are trained before either is queried.

    Returns (rmse1, rmse2) as floats, in the order the learners were given.
    """
    total = X.shape[0]
    n_train = int(math.floor(0.6 * total))

    picked = np.random.choice(total, size=n_train, replace=False)
    held_out = np.setdiff1d(np.array(range(total)), picked)

    fit_x, fit_y = X[picked, :], Y[picked]
    eval_x, eval_y = X[held_out, :], Y[held_out]

    def out_of_sample_rmse(model):
        # Square root of the mean squared error over the held-out rows.
        errors = eval_y - model.query(eval_x)
        return math.sqrt((errors ** 2).sum() / eval_y.shape[0])

    learner1.addEvidence(fit_x, fit_y)
    learner2.addEvidence(fit_x, fit_y)
    # Tuple elements evaluate left to right, so learner1 is queried first.
    return out_of_sample_rmse(learner1), out_of_sample_rmse(learner2)
def _print_verdict(title, rmse_lr, rmse_dt, expect_lr_wins):
    """Print both RMSEs under *title* plus the pass/fail line for one dataset."""
    print()
    print(title)
    print(f"RMSE LR : {rmse_lr}")
    print(f"RMSE DT : {rmse_dt}")
    if expect_lr_wins:
        winner, loser, passed = "LR", "DT", rmse_lr < 0.9 * rmse_dt
    else:
        winner, loser, passed = "DT", "LR", rmse_dt < 0.9 * rmse_lr
    if passed:
        print(f"{winner} < 0.9 {loser}: pass")
    else:
        print(f"{winner} >= 0.9 {loser}: fail")
    # The original used a bare `print`, which does nothing in Python 3; the
    # call emits the intended blank separator line.
    print()


def test_code():
    """
    Compare LinRegLearner and DTLearner on both generated datasets and print
    the verdicts.

    For best4LinReg() the check passes when the linear-regression RMSE is
    below 0.9 times the decision-tree RMSE; for best4DT() the roles are
    swapped. Fresh learners (DTLearner with leaf_size=1) are built for each
    dataset. Nothing is returned; results go to standard output.
    """
    # Dataset that should favour linear regression.
    lrlearner = lrl.LinRegLearner(verbose=False)
    dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
    X, Y = best4LinReg()
    rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y)
    _print_verdict("best4LinReg() results", rmseLR, rmseDT, True)

    # Dataset that should favour the decision tree; the heading names the
    # generator actually called (best4DT, not best4RT).
    lrlearner = lrl.LinRegLearner(verbose=False)
    dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
    X, Y = best4DT()
    rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y)
    _print_verdict("best4DT() results", rmseLR, rmseDT, False)
# Run the learner comparison only when this file is executed as a script.
if __name__ == "__main__":
    test_code()