Change best for LinReg to return optimal data

Add my DT Learner to defeat_learners assignment
2020-10-05 17:27:09 -04:00 · 2020-10-05 12:49:58 -04:00
3 changed files with 300 additions and 216 deletions
--- a/defeat_learners/DTLearner.py
+++ b/defeat_learners/DTLearner.py
@@ -1,64 +1,108 @@
 """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 A simple wrapper for linear regression.  (c) 2015 Tucker Balch  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 Note, this is NOT a correct DTLearner; Replace with your own implementation.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 Copyright 2018, Georgia Institute of Technology (Georgia Tech)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 Atlanta, Georgia 30332  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 All Rights Reserved  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 Template code for CS 4646/7646  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 Georgia Tech asserts copyright ownership of this template and all derivative  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 works, including solutions to the projects assigned in this course. Students  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 and other users of this template code are advised not to share it with others  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 or to make it available on publicly viewable websites including repositories  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 such as github and gitlab.  This copyright statement should not be removed  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 or edited.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 We do grant permission to share solutions privately with non-students such  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 as potential employers. However, sharing with other current or future  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 students of CS 7646 is prohibited and subject to being investigated as a  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 GT honor code violation.  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 -----do not edit anything above this line---  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 Student Name: Tucker Balch (replace with your name)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 GT User ID: tb34 (replace with your User ID)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 GT ID: 900897987 (replace with your GT ID)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 import numpy as np
 import warnings  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 class DTLearner(object):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-    def __init__(self, leaf_size=1, verbose = False):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+class DTLearner:
-        warnings.warn("\n\n  WARNING! THIS IS NOT A CORRECT DTLearner IMPLEMENTATION! REPLACE WITH YOUR OWN CODE\n")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    LEAF = -1
-        pass # move along, these aren't the drones you're looking for  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    NA = -1
    def __init__(self, leaf_size=1, verbose=False):
        self.leaf_size = leaf_size
        self.verbose = verbose
    def author(self):
-        return 'tb34' # replace tb34 with your Georgia Tech username  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        return 'felixm'  # replace tb34 with your Georgia Tech username
-    def addEvidence(self,dataX,dataY):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    def create_node(self, factor, split_value, left, right):
-        """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        return np.array([(factor, split_value, left, right), ],
-        @summary: Add training data to learner  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+                        dtype='|i4, f4, i4,  i4')
        @param dataX: X values of data to add  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
        @param dataY: the Y training values  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
        """  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-        # slap on 1s column so linear regression finds a constant term  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    def query_point(self, point):
-        newdataX = np.ones([dataX.shape[0],dataX.shape[1]+1])  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        node_index = 0
-        newdataX[:,0:dataX.shape[1]]=dataX  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        while self.rel_tree[node_index][0] != self.LEAF:
            node = self.rel_tree[node_index]
            split_factor = node[0]
            split_value = node[1]
            if point[split_factor] <= split_value:
                # Recurse into left sub-tree.
                node_index += node[2]
            else:
                node_index += node[3]
        v = self.rel_tree[node_index][1]
        return v
-        # build and save the model  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    def query(self, points):
        self.model_coefs, residuals, rank, s = np.linalg.lstsq(newdataX, dataY, rcond=None)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
    def query(self,points):  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
-        return (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1]  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        def query_point(p): return self.query_point(p)
        r = np.apply_along_axis(query_point, 1, points)
        return r
-if __name__=="__main__":  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    def build_tree(self, xs, y):
-    print("the secret clue is 'zzyzx'")  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+        """
        @summary: Build a decision tree from the training data.
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        assert(xs.shape[0] == y.shape[0])
        assert(xs.shape[0] > 0)  # If this is 0 something went wrong.
        if xs.shape[0] <= self.leaf_size:
            value = np.mean(y)
            return self.create_node(self.LEAF, value, self.NA, self.NA)
        if np.all(y[0] == y):
            return self.create_node(self.LEAF, y[0], self.NA, self.NA)
        i, split_value = self.get_i_and_split_value(xs, y)
        select_l = xs[:, i] <= split_value
        select_r = xs[:, i] > split_value
        lt = self.build_tree(xs[select_l], y[select_l])
        rt = self.build_tree(xs[select_r], y[select_r])
        root = self.create_node(i, split_value, 1, lt.shape[0] + 1)
        root = np.concatenate([root, lt, rt])
        return root
    def addEvidence(self, data_x, data_y):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        self.rel_tree = self.build_tree(data_x, data_y)
    def get_correlations(self, xs, y):
        """Rank the feature columns of xs by their correlation with y.

        @param xs: 2-D numpy array of features (indexed as xs[:, i])
        @param y: target values, one per row of xs
        @returns a list of 2-tuples (abs_correlation, column_index), sorted
                 by highest absolute correlation first. A column whose
                 correlation is undefined is reported with 0.0 and therefore
                 ranks last.
        """
        correlations = []
        # A zero-variance column makes corrcoef divide by zero and return NaN;
        # that is expected here, so the floating point warning is silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            for i in range(xs.shape[1]):
                c = abs(np.corrcoef(xs[:, i], y=y)[0, 1])
                # NaN is neither smaller nor larger than any number, so
                # leaving it in the tuples would make the sort order below
                # undefined. Treat "no measurable correlation" as 0.0.
                if np.isnan(c):
                    c = 0.0
                correlations.append((c, i))
        correlations.sort(reverse=True)
        return correlations
    def get_i_and_split_value(self, xs, y):
        # If all elements are true we would get one sub-tree with zero
        # elements, but we need at least one element in both trees.  We avoid
        # zero-trees in two steps. First we take the average between the median
        # value and a smaller value an use that as the new split value. If that
        # doesn't work (when all values are the same) we choose the X with the
        # next smaller correlation. We assert that not all values are
        # smaller/equal to the split value at the end.
        for _, i in self.get_correlations(xs, y):
            split_value = np.median(xs[:, i])
            select = xs[:, i] <= split_value
            if select.all():
                for value in xs[:, i]:
                    if value < split_value:
                        split_value = (value + split_value) / 2.0
                select = xs[:, i] <= split_value
            if not select.all():
                break
        assert(not select.all())
        return i, split_value
--- a/defeat_learners/gen_data.py
+++ b/defeat_learners/gen_data.py
@@ -26,27 +26,63 @@ GT ID: 900897987 (replace with your GT ID)
 """
 import numpy as np
 import pandas as pd
 import math
-# this function should return a dataset (X and Y) that will work  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+
 # better for linear regression than decision trees  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
 def best4LinReg(seed=1489683273):
    """
    This function should return a dataset (X and Y) that will work better for
    linear regression than decision trees.
    We make Y a simple linear combination of X. That will give the Linear
    Regression algorithm a very easy time (no RMSE at all) and beat the DT
    easily.
    """
    np.random.seed(seed)
-    X = np.zeros((100,2))  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    X = np.random.random(size=(100, 2)) * 200 - 100
-    Y = np.random.random(size = (100,))*200-100  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    Y = X[:, 0] * -2 + X[:, 1] * 3
    # Here's is an example of creating a Y from randomly generated  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
    # X with multiple columns  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
    # Y = X[:,0] + np.sin(X[:,1]) + X[:,2]**2 + X[:,3]**3  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
    return X, Y
 def best4DT(seed=1489683273):
    """
    This function should return a dataset that will work better for decision
    trees than linear regression.
    """
    # Z = np.append(X, Y.reshape(Y.shape[0], 1), 1)
    # pd.DataFrame(Z).to_csv("Z.csv", header=None, index=None)
    # np.random.seed(seed)
    # X = np.random.random(size=(100, 10))*1000-100
    # Y = np.random.random(size=(100,))*1000-100
    np.random.seed(seed)
-    X = np.zeros((100,2))  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    # X_1 = np.random.random(size=(100, 1))*200-100
-    Y = np.random.random(size = (100,))*200-100  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    # X_2 = np.random.random(size=(100, 1))*200-100
    # X_3 = np.random.random(size=(100, 1))*200-100
    # X_4 = np.random.random(size=(100, 1))*200-100
    # X = np.concatenate([X_1, X_2, X_3, X_4], 1)
    # XXX: I honestly don't know how to help the DTLearner, yet.
    X_1 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
    X_2 = np.asarray([i for i in range(100, 1100, 10)]).reshape(100, 1)
    X_3 = np.asarray([i for i in range(200, 300)]).reshape(100, 1)
    X_4 = np.asarray([i for i in range(300, 400)]).reshape(100, 1)
    X_5 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
    X_6 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
    X_7 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
    X_8 = np.asarray([i for i in range(1, 101)]).reshape(100, 1)
    X = np.concatenate([X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8], 1)
    # Y = X[:, 0] * 2 + X[:, 1] * 3
    Y = np.random.random(size=(100,)) * 200 - 100
    return X, Y
 def author():  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
    return 'tb34' #Change this to your user ID  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
-if __name__=="__main__":  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+def author():
    return 'felixm'  # Change this to your user ID
 if __name__ == "__main__":
    print("they call me Tim.")
--- a/defeat_learners/testbest4.py
+++ b/defeat_learners/testbest4.py
@@ -28,10 +28,12 @@ import DTLearner as dt
 from gen_data import best4LinReg, best4DT
 # compare two learners' rmse out of sample
 def compare_os_rmse(learner1, learner2, X, Y):
    # compute how much of the data is training and testing
-    train_rows = int(math.floor(0.6* X.shape[0]))  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    train_rows = int(math.floor(0.6 * X.shape[0]))
    test_rows = X.shape[0] - train_rows
    # separate out training and testing data
@@ -43,24 +45,25 @@ def compare_os_rmse(learner1, learner2, X, Y):
    testY = Y[test]
    # train the learners
-    learner1.addEvidence(trainX, trainY) # train it  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    learner1.addEvidence(trainX, trainY)  # train it
-    learner2.addEvidence(trainX, trainY) # train it  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    learner2.addEvidence(trainX, trainY)  # train it
    # evaluate learner1 out of sample
-    predY = learner1.query(testX) # get the predictions  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    predY = learner1.query(testX)  # get the predictions
    rmse1 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
    # evaluate learner2 out of sample
-    predY = learner2.query(testX) # get the predictions  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    predY = learner2.query(testX)  # get the predictions
    rmse2 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
    return rmse1, rmse2
 def test_code():
    # create two learners and get data
-    lrlearner = lrl.LinRegLearner(verbose = False)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    lrlearner = lrl.LinRegLearner(verbose=False)
-    dtlearner = dt.DTLearner(verbose = False, leaf_size = 1)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
    X, Y = best4LinReg()
    # compare the two learners
@@ -78,8 +81,8 @@ def test_code():
    print
    # get data that is best for a random tree
-    lrlearner = lrl.LinRegLearner(verbose = False)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    lrlearner = lrl.LinRegLearner(verbose=False)
-    dtlearner = dt.DTLearner(verbose = False, leaf_size = 1)  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+    dtlearner = dt.DTLearner(verbose=False, leaf_size=1)
    X, Y = best4DT()
    # compare the two learners
@@ -96,5 +99,6 @@ def test_code():
        print("DT >= 0.9 LR:  fail")
    print
-if __name__=="__main__":  		  	   		     			  		 			     			  	  		 	  	 		 			  		  			
+
 if __name__ == "__main__":
    test_code()
Author	SHA1	Message	Date
Felix Martin	381670705b	Change best for LinReg to return optimal data	2020-10-05 17:27:09 -04:00
Felix Martin	a662e302db	Add my DT Learner to defeat_learners assignment	2020-10-05 12:49:58 -04:00