From 51b2c9ceb02d3f0d2c73470207069202f66fb2bf Mon Sep 17 00:00:00 2001 From: Felix Martin Date: Wed, 23 Sep 2020 16:15:22 -0400 Subject: [PATCH] Finish first version of DTLearner. Needs testing. --- assess_learners/DTLearner.py | 78 +++++++++++++++++++++++++++------- assess_learners/testlearner.py | 9 ++-- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/assess_learners/DTLearner.py b/assess_learners/DTLearner.py index 5a310b6..11f5b54 100644 --- a/assess_learners/DTLearner.py +++ b/assess_learners/DTLearner.py @@ -13,9 +13,36 @@ class DTLearner(object): def author(self): return 'felixm' # replace tb34 with your Georgia Tech username - def create_node(self, factor: int, split: int, left: int, right: int): - return np.array((factor, split, left, right)) + def create_node(self, factor, split_value, left, right): + return np.array([[factor, split_value, left, right], ]) + def get_max_correlation(self, xs, y): + """ Return the index of the column x of xs that has the highest + absolute correlation with y. I would like to get a scalar value from + np.corrcoef instead of a matrix, so I use [0, 1] to get a scalar value + from the matrix. """ + # This should deliver the same result, but does not. I am not willing + # to investigate right now. 
+ # a = np.argmax([abs(np.corrcoef(xs[:,i], y)[0, 1]) + # for i in range(xs.shape[1])]) + i_max = 0 + corr_max = 0 + for i in range(xs.shape[1]): + corr_matrix = np.corrcoef(xs[:, i], y=y) + corr = corr_matrix[0, 1] + corr = abs(corr) + if corr > corr_max: + corr_max = corr + i_max = i + return i_max + + def make_tree_absolute(self, tree): + for i in range(tree.shape[0]): + if tree[i, 2] == self.NA: + continue + tree[i, 2] = i + tree[i, 2] + tree[i, 3] = i + tree[i, 3] + return tree def build_tree(self, xs, y): assert(xs.shape[0] == y.shape[0]) @@ -25,14 +52,24 @@ class DTLearner(object): return self.create_node(self.LEAF, y[0], self.NA, self.NA) if np.all(y[0] == y): - return self.create_node(self.LEAV, y[0], self.NA, self.NA) + return self.create_node(self.LEAF, y[0], self.NA, self.NA) - # XXX: continue here - y = np.array([y]) - correlations = np.corrcoef(xs, y, rowvar=True) - print(f"{correlations=}") + i = self.get_max_correlation(xs, y) + split_value = np.median(xs[:,i]) - return 0 + select_lt = xs[:, i] <= split_value + select_rt = xs[:, i] > split_value + # Avoid the case where all values are less than or equal to the median. 
+ if select_lt.all() or select_rt.all(): + select_lt = xs[:, i] < split_value + select_rt = xs[:, i] >= split_value + + lt = self.build_tree(xs[select_lt], y[select_lt]) + rt = self.build_tree(xs[select_rt], y[select_rt]) + root = self.create_node(i, split_value, 1, lt.shape[0] + 1) # right subtree starts after the root row and all left-subtree rows + + root = np.concatenate([root, lt, rt]) + return root def addEvidence(self, data_x, data_y): """ @@ -40,21 +77,30 @@ class DTLearner(object): @param dataX: X values of data to add @param dataY: the Y training values """ - if self.verbose: - print(data_x) - print(data_y) - self.tree = self.build_tree(data_x, data_y) + self.rel_tree = self.build_tree(data_x, data_y) + # self.abs_tree = self.make_tree_absolute(self.rel_tree) + def query_point(self, point): + node_index = 0 + while self.rel_tree[node_index, 0] != self.LEAF: + node = self.rel_tree[node_index] + split_factor = int(node[0]) + split_value = node[1] + if point[split_factor] <= split_value: + node_index += int(node[2]) + else: + node_index += int(node[3]) + return self.rel_tree[node_index, 1] - - def query(self,points): + def query(self, points): """ @summary: Estimate a set of test points given the model we built. @param points: should be a numpy array with each row corresponding to a specific query. @returns the estimated values according to the saved model. 
""" - return - # return (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1] + query_point = lambda p: self.query_point(p) + r = np.apply_along_axis(query_point, 1, points) + return r if __name__=="__main__": print("the secret clue is 'zzyzx'") diff --git a/assess_learners/testlearner.py b/assess_learners/testlearner.py index 83adb0a..f22966b 100644 --- a/assess_learners/testlearner.py +++ b/assess_learners/testlearner.py @@ -52,18 +52,15 @@ if __name__=="__main__": testX = data[train_rows:,0:-1] testY = data[train_rows:,-1] - # print(f"{testX.shape}") - # print(f"{testY.shape}") + print(f"{testX.shape}") + print(f"{testY.shape}") # create a learner and train it # learner = lrl.LinRegLearner(verbose = True) # create a LinRegLearner learner = dtl.DTLearner(verbose = True) # create a LinRegLearner - # learner.addEvidence(trainX, trainY) # train it #XXX split back into test and non-test - learner.addEvidence(data[:,0:-1], data[:,-1]) + learner.addEvidence(trainX, trainY) print(learner.author()) - sys.exit(0) - # evaluate in sample predY = learner.query(trainX) # get the predictions rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])