diff --git a/defeat_learners/gen_data.py b/defeat_learners/gen_data.py index b876306..c411662 100644 --- a/defeat_learners/gen_data.py +++ b/defeat_learners/gen_data.py @@ -1,52 +1,88 @@ -""" -template for generating data to fool learners (c) 2016 Tucker Balch -Copyright 2018, Georgia Institute of Technology (Georgia Tech) -Atlanta, Georgia 30332 -All Rights Reserved - -Template code for CS 4646/7646 - -Georgia Tech asserts copyright ownership of this template and all derivative -works, including solutions to the projects assigned in this course. Students -and other users of this template code are advised not to share it with others -or to make it available on publicly viewable websites including repositories -such as github and gitlab. This copyright statement should not be removed -or edited. - -We do grant permission to share solutions privately with non-students such -as potential employers. However, sharing with other current or future -students of CS 7646 is prohibited and subject to being investigated as a -GT honor code violation. - ------do not edit anything above this line--- - -Student Name: Tucker Balch (replace with your name) -GT User ID: tb34 (replace with your User ID) -GT ID: 900897987 (replace with your GT ID) -""" - -import numpy as np -import math - -# this function should return a dataset (X and Y) that will work -# better for linear regression than decision trees -def best4LinReg(seed=1489683273): - np.random.seed(seed) - X = np.zeros((100,2)) - Y = np.random.random(size = (100,))*200-100 - # Here's is an example of creating a Y from randomly generated - # X with multiple columns - # Y = X[:,0] + np.sin(X[:,1]) + X[:,2]**2 + X[:,3]**3 - return X, Y - -def best4DT(seed=1489683273): - np.random.seed(seed) - X = np.zeros((100,2)) - Y = np.random.random(size = (100,))*200-100 - return X, Y - -def author(): - return 'tb34' #Change this to your user ID - -if __name__=="__main__": - print("they call me Tim.") +""" +template for generating data to fool learners (c) 2016 Tucker Balch +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- + +Student Name: Tucker Balch (replace with your name) +GT User ID: tb34 (replace with your User ID) +GT ID: 900897987 (replace with your GT ID) +""" + +import numpy as np +import pandas as pd +import math + + +def best4LinReg(seed=1489683273): + """ + This function should return a dataset (X and Y) that will work better for + linear regression than decision trees. + + We make Y a simple linear combination of X. That will give the Linear + Regression algorithm a very easy time (no RMSE at all) and beat the DT + easily. + """ + np.random.seed(seed) + X = np.random.random(size=(100, 2)) * 200 - 100 + Y = X[:, 0] * -2 + X[:, 1] * 3 + return X, Y + + +def best4DT(seed=1489683273): + """ + This function should return a dataset that will work better for decision + trees than linear regression. + """ + + # Z = np.append(X, Y.reshape(Y.shape[0], 1), 1) + # pd.DataFrame(Z).to_csv("Z.csv", header=None, index=None) + # np.random.seed(seed) + # X = np.random.random(size=(100, 10))*1000-100 + # Y = np.random.random(size=(100,))*1000-100 + + np.random.seed(seed) + # X_1 = np.random.random(size=(100, 1))*200-100 + # X_2 = np.random.random(size=(100, 1))*200-100 + # X_3 = np.random.random(size=(100, 1))*200-100 + # X_4 = np.random.random(size=(100, 1))*200-100 + # X = np.concatenate([X_1, X_2, X_3, X_4], 1) + + # XXX: I honestly don't know how to help the DTLearner, yet. + + X_1 = np.asarray([i for i in range(1, 101)]).reshape(100, 1) + X_2 = np.asarray([i for i in range(100, 1100, 10)]).reshape(100, 1) + X_3 = np.asarray([i for i in range(200, 300)]).reshape(100, 1) + X_4 = np.asarray([i for i in range(300, 400)]).reshape(100, 1) + X_5 = np.asarray([i for i in range(1, 101)]).reshape(100, 1) + X_6 = np.asarray([i for i in range(1, 101)]).reshape(100, 1) + X_7 = np.asarray([i for i in range(1, 101)]).reshape(100, 1) + X_8 = np.asarray([i for i in range(1, 101)]).reshape(100, 1) + X = np.concatenate([X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8], 1) + # Y = X[:, 0] * 2 + X[:, 1] * 3 + Y = np.random.random(size=(100,)) * 200 - 100 + return X, Y + + +def author(): + return 'felixm' # Change this to your user ID + + +if __name__ == "__main__": + print("they call me Tim.") diff --git a/defeat_learners/testbest4.py b/defeat_learners/testbest4.py index 0aa9486..69d1bb5 100644 --- a/defeat_learners/testbest4.py +++ b/defeat_learners/testbest4.py @@ -1,100 +1,104 @@ -""" -Test best4 data generator. (c) 2016 Tucker Balch -Copyright 2018, Georgia Institute of Technology (Georgia Tech) -Atlanta, Georgia 30332 -All Rights Reserved - -Template code for CS 4646/7646 - -Georgia Tech asserts copyright ownership of this template and all derivative -works, including solutions to the projects assigned in this course. Students -and other users of this template code are advised not to share it with others -or to make it available on publicly viewable websites including repositories -such as github and gitlab. This copyright statement should not be removed -or edited. - -We do grant permission to share solutions privately with non-students such -as potential employers. However, sharing with other current or future -students of CS 7646 is prohibited and subject to being investigated as a -GT honor code violation. - ------do not edit anything above this line--- -""" - -import numpy as np -import math -import LinRegLearner as lrl -import DTLearner as dt -from gen_data import best4LinReg, best4DT - -# compare two learners' rmse out of sample -def compare_os_rmse(learner1, learner2, X, Y): - - # compute how much of the data is training and testing - train_rows = int(math.floor(0.6* X.shape[0])) - test_rows = X.shape[0] - train_rows - - # separate out training and testing data - train = np.random.choice(X.shape[0], size=train_rows, replace=False) - test = np.setdiff1d(np.array(range(X.shape[0])), train) - trainX = X[train, :] - trainY = Y[train] - testX = X[test, :] - testY = Y[test] - - # train the learners - learner1.addEvidence(trainX, trainY) # train it - learner2.addEvidence(trainX, trainY) # train it - - # evaluate learner1 out of sample - predY = learner1.query(testX) # get the predictions - rmse1 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) - - # evaluate learner2 out of sample - predY = learner2.query(testX) # get the predictions - rmse2 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) - - return rmse1, rmse2 - -def test_code(): - - # create two learners and get data - lrlearner = lrl.LinRegLearner(verbose = False) - dtlearner = dt.DTLearner(verbose = False, leaf_size = 1) - X, Y = best4LinReg() - - # compare the two learners - rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y) - - # share results - print() - print("best4LinReg() results") - print(f"RMSE LR : {rmseLR}") - print(f"RMSE DT : {rmseDT}") - if rmseLR < 0.9 * rmseDT: - print("LR < 0.9 DT: pass") - else: - print("LR >= 0.9 DT: fail") - print - - # get data that is best for a random tree - lrlearner = lrl.LinRegLearner(verbose = False) - dtlearner = dt.DTLearner(verbose = False, leaf_size = 1) - X, Y = best4DT() - - # compare the two learners - rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y) - - # share results - print() - print("best4RT() results") - print(f"RMSE LR : {rmseLR}") - print(f"RMSE DT : {rmseDT}") - if rmseDT < 0.9 * rmseLR: - print("DT < 0.9 LR: pass") - else: - print("DT >= 0.9 LR: fail") - print - -if __name__=="__main__": - test_code() +""" +Test best4 data generator. (c) 2016 Tucker Balch +Copyright 2018, Georgia Institute of Technology (Georgia Tech) +Atlanta, Georgia 30332 +All Rights Reserved + +Template code for CS 4646/7646 + +Georgia Tech asserts copyright ownership of this template and all derivative +works, including solutions to the projects assigned in this course. Students +and other users of this template code are advised not to share it with others +or to make it available on publicly viewable websites including repositories +such as github and gitlab. This copyright statement should not be removed +or edited. + +We do grant permission to share solutions privately with non-students such +as potential employers. However, sharing with other current or future +students of CS 7646 is prohibited and subject to being investigated as a +GT honor code violation. + +-----do not edit anything above this line--- +""" + +import numpy as np +import math +import LinRegLearner as lrl +import DTLearner as dt +from gen_data import best4LinReg, best4DT + +# compare two learners' rmse out of sample + + +def compare_os_rmse(learner1, learner2, X, Y): + + # compute how much of the data is training and testing + train_rows = int(math.floor(0.6 * X.shape[0])) + test_rows = X.shape[0] - train_rows + + # separate out training and testing data + train = np.random.choice(X.shape[0], size=train_rows, replace=False) + test = np.setdiff1d(np.array(range(X.shape[0])), train) + trainX = X[train, :] + trainY = Y[train] + testX = X[test, :] + testY = Y[test] + + # train the learners + learner1.addEvidence(trainX, trainY) # train it + learner2.addEvidence(trainX, trainY) # train it + + # evaluate learner1 out of sample + predY = learner1.query(testX) # get the predictions + rmse1 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) + + # evaluate learner2 out of sample + predY = learner2.query(testX) # get the predictions + rmse2 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) + + return rmse1, rmse2 + + +def test_code(): + + # create two learners and get data + lrlearner = lrl.LinRegLearner(verbose=False) + dtlearner = dt.DTLearner(verbose=False, leaf_size=1) + X, Y = best4LinReg() + + # compare the two learners + rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y) + + # share results + print() + print("best4LinReg() results") + print(f"RMSE LR : {rmseLR}") + print(f"RMSE DT : {rmseDT}") + if rmseLR < 0.9 * rmseDT: + print("LR < 0.9 DT: pass") + else: + print("LR >= 0.9 DT: fail") + print + + # get data that is best for a random tree + lrlearner = lrl.LinRegLearner(verbose=False) + dtlearner = dt.DTLearner(verbose=False, leaf_size=1) + X, Y = best4DT() + + # compare the two learners + rmseLR, rmseDT = compare_os_rmse(lrlearner, dtlearner, X, Y) + + # share results + print() + print("best4RT() results") + print(f"RMSE LR : {rmseLR}") + print(f"RMSE DT : {rmseDT}") + if rmseDT < 0.9 * rmseLR: + print("DT < 0.9 LR: pass") + else: + print("DT >= 0.9 LR: fail") + print + + +if __name__ == "__main__": + test_code()