Fix DTLearner. The issue was that I took the length of the wrong tree (right instead of left) for the root. Also avoid code duplication via an abstract tree learner class because why not.

This commit is contained in:
2020-09-24 22:15:41 -04:00
parent 3f2d2f4df3
commit 7007bc7514
6 changed files with 145 additions and 523 deletions

View File

@@ -1,10 +1,8 @@
import numpy as np
from AbstractTreeLearner import AbstractTreeLearner
class DTLearner(object):
LEAF = -1
NA = -1
class DTLearner(AbstractTreeLearner):
def __init__(self, leaf_size = 1, verbose = False):
self.leaf_size = leaf_size
@@ -13,87 +11,28 @@ class DTLearner(object):
def author(self):
    """Return the author's Georgia Tech username."""
    # replace tb34 with your Georgia Tech username
    return 'felixm'
def create_node(self, factor, split_value, left, right):
    """Build one tree row: [factor, split_value, left_offset, right_offset]."""
    row = [factor, split_value, left, right]
    return np.array([row])
def get_correlations(self, xs, y):
    """Return a list of 2-tuples (correlation, column_index), sorted by
    highest absolute correlation with y first.

    NOTE(review): this span was garbled diff residue mixing the removed
    get_max_correlation with the added get_correlations; reconstructed the
    post-commit method.
    """
    correlations = []
    for i in range(xs.shape[1]):
        # np.corrcoef returns a 2x2 matrix; [0, 1] extracts the scalar
        # correlation between column i of xs and y.
        c = abs(np.corrcoef(xs[:, i], y=y)[0, 1])
        correlations.append((c, i))
    correlations.sort(reverse=True)
    return correlations
def get_i_and_split_value(self, xs, y):
    """Return (i, split_value): the factor column i with the highest absolute
    correlation to y whose median actually separates the rows, and the median
    of that column as the split value."""
    for _, i in self.get_correlations(xs, y):
        split_value = np.median(xs[:, i])
        select = xs[:, i] <= split_value
        # If all elements were <= the median we would get one sub-tree with
        # zero elements, but each side needs at least one row. In that case
        # fall through to the column with the next smaller correlation.
        if not select.all():
            break
    return i, split_value

def build_tree(self, xs, y):
    """Recursively build the decision tree as a numpy array whose rows are
    [factor, split_value, left_offset, right_offset], with child offsets
    relative to the row's own index. Leaf rows use factor == self.LEAF and
    store the prediction in the split_value slot.

    NOTE(review): this span was garbled diff residue; reconstructed per the
    commit, including the stated fix (root's right offset uses the LEFT
    sub-tree's length, not the right's).
    """
    assert xs.shape[0] == y.shape[0]
    assert xs.shape[0] > 0  # If this is 0 something went wrong.
    if xs.shape[0] <= self.leaf_size:
        # Few enough samples: aggregate them into a single leaf.
        return self.create_node(self.LEAF, np.median(y), self.NA, self.NA)
    if np.all(y[0] == y):
        # All labels identical: splitting further cannot help.
        return self.create_node(self.LEAF, y[0], self.NA, self.NA)
    i, split_value = self.get_i_and_split_value(xs, y)
    select_lt = xs[:, i] <= split_value
    select_rt = xs[:, i] > split_value
    lt = self.build_tree(xs[select_lt], y[select_lt])
    rt = self.build_tree(xs[select_rt], y[select_rt])
    # Left child immediately follows the root (offset 1); the right child
    # starts after the whole left sub-tree — hence lt.shape[0] + 1.
    root = self.create_node(i, split_value, 1, lt.shape[0] + 1)
    return np.concatenate([root, lt, rt])
def addEvidence(self, data_x, data_y):
    """
    @summary: Add training data to learner
    @param dataX: X values of data to add
    @param dataY: the Y training values
    """
    tree = self.build_tree(data_x, data_y)
    self.rel_tree = tree
def query_point(self, point):
    """Walk the relative-offset tree from the root down to a leaf for a
    single query point and return that leaf's prediction value."""
    idx = 0
    while True:
        factor, split_value, left, right = self.rel_tree[idx]
        if factor == self.LEAF:
            # Leaf rows store the prediction in the split_value slot.
            return split_value
        if point[int(factor)] <= split_value:
            idx += int(left)
        else:
            idx += int(right)
def query(self, points):
    """
    @summary: Estimate a set of test points given the model we built.
    @param points: should be a numpy array with each row corresponding to a specific query.
    @returns the estimated values according to the saved model.
    """
    # Apply the per-point tree walk to every row of the query matrix.
    return np.apply_along_axis(self.query_point, 1, points)
# Script entry point: prints the identification string expected by the grader.
if __name__ == "__main__":
    print("the secret clue is 'zzyzx'")