Fix DTLearner. The issue was that I took the length of the wrong subtree (right instead of left) when computing the root's offset to its right child. Also avoid code duplication via an abstract tree learner class, because why not.
This commit is contained in:
@@ -1,10 +1,8 @@
|
||||
import numpy as np
|
||||
from AbstractTreeLearner import AbstractTreeLearner
|
||||
|
||||
|
||||
class DTLearner(object):
|
||||
|
||||
LEAF = -1
|
||||
NA = -1
|
||||
class DTLearner(AbstractTreeLearner):
|
||||
|
||||
def __init__(self, leaf_size = 1, verbose = False):
|
||||
self.leaf_size = leaf_size
|
||||
@@ -13,87 +11,28 @@ class DTLearner(object):
|
||||
def author(self):
    """Return the username of the author of this learner."""
    username = 'felixm'  # replace tb34 with your Georgia Tech username
    return username
|
||||
|
||||
def create_node(self, factor, split_value, left, right):
    """Build a single tree node as a 1x4 numpy array.

    The row layout is [factor, split_value, left, right].
    """
    row = [factor, split_value, left, right]
    return np.array([row])
|
||||
|
||||
def get_max_correlation(self, xs, y):
    """Return the index of the column of xs that has the highest absolute
    correlation with y.

    np.corrcoef returns a correlation matrix; element [0, 1] is the scalar
    correlation between the column and y. Returns 0 when no column has an
    absolute correlation greater than zero.
    """
    # NOTE: np.argmax over the same values is not a drop-in replacement.
    # A likely reason: a constant column yields a NaN correlation, the strict
    # `>` below never selects NaN, but np.argmax reports the position of the
    # first NaN as the maximum.
    i_max = 0
    corr_max = 0
    for i in range(xs.shape[1]):
        corr = abs(np.corrcoef(xs[:, i], y=y)[0, 1])
        if corr > corr_max:
            corr_max = corr
            i_max = i
    return i_max


def get_correlations(self, xs, y):
    """Return a list of 2-tuples (absolute correlation, column index),
    sorted by highest correlation first.

    A column whose correlation with y is undefined (NaN, e.g. because the
    column is constant) is reported with a correlation of 0.0.
    """
    correlations = []
    # A zero-variance column makes np.corrcoef divide by zero, which produces
    # NaN and a RuntimeWarning. NaN inside the sort keys would make the
    # ordering unreliable, so map it to 0.0 and silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(xs.shape[1]):
            c = abs(np.corrcoef(xs[:, i], y=y)[0, 1])
            if np.isnan(c):
                c = 0.0
            correlations.append((c, i))
    correlations.sort(reverse=True)
    return correlations
|
||||
|
||||
def build_tree(self, xs, y):
    """Recursively build a decision tree and return it as a 2-D array.

    Each row is a node created by self.create_node with the layout
    [factor, split_value, left, right]. For inner nodes, left and right are
    row offsets relative to the node itself; leaves use self.LEAF as factor
    and self.NA for both offsets.

    xs holds one sample per row and y the matching target values.
    """
    assert xs.shape[0] == y.shape[0]
    assert xs.shape[0] > 0  # If this is 0 something went wrong.

    if xs.shape[0] <= self.leaf_size:
        return self.create_node(self.LEAF, np.median(y), self.NA, self.NA)

    if np.all(y[0] == y):
        return self.create_node(self.LEAF, y[0], self.NA, self.NA)

    i = self.get_max_correlation(xs, y)
    column = xs[:, i]
    split_value = np.median(column)

    select_lt = column <= split_value
    select_rt = column > split_value
    # Avoid the case where all values are lower than or equal to the median.
    if select_lt.all() or select_rt.all():
        select_lt = column < split_value
        select_rt = column >= split_value
        if select_lt.any():
            # Queries go left on `<=`. Store the largest value of the left
            # partition so that `<= split_value` reproduces this `<` split.
            split_value = column[select_lt].max()

    # The chosen column cannot separate the samples (e.g. it is constant),
    # so stop here instead of recursing into an empty partition.
    if not select_lt.any() or not select_rt.any():
        return self.create_node(self.LEAF, np.median(y), self.NA, self.NA)

    lt = self.build_tree(xs[select_lt], y[select_lt])
    rt = self.build_tree(xs[select_rt], y[select_rt])
    # The left subtree starts right after the root; the right subtree starts
    # after the whole left subtree, so its offset is the LEFT size plus one.
    root = self.create_node(i, split_value, 1, lt.shape[0] + 1)

    return np.concatenate([root, lt, rt])


def get_i_and_split_value(self, xs, y):
    """Return (column index, split value) for splitting xs.

    Columns are tried in the order given by self.get_correlations. The
    split value is the median of the candidate column. If no candidate can
    split the data, the last candidate tried is returned.
    """
    for _, i in self.get_correlations(xs, y):
        split_value = np.median(xs[:, i])
        # If every element is <= the median, one sub-tree would get zero
        # elements, but each needs at least one. Only accept the column if
        # that is not the case; otherwise move on to the next candidate.
        if not (xs[:, i] <= split_value).all():
            break
    return i, split_value
|
||||
|
||||
def addEvidence(self, data_x, data_y):
    """
    @summary: Train the learner by building its tree from the given data.
    @param data_x: X values of the training data
    @param data_y: the Y training values
    """
    tree = self.build_tree(data_x, data_y)
    # Stored as returned by build_tree; no conversion is applied.
    self.rel_tree = tree
|
||||
|
||||
def query_point(self, point):
    """Walk self.rel_tree from the root row and return the value stored in
    the leaf reached for the given point.

    At each inner node the point goes to the left child when its value in
    the node's split column is <= the node's split value, otherwise to the
    right child. Child positions are offsets added to the current row index.
    """
    tree = self.rel_tree
    position = 0
    while True:
        row = tree[position]
        if not (row[0] != self.LEAF):
            return tree[position, 1]
        goes_left = point[int(row[0])] <= row[1]
        offset = row[2] if goes_left else row[3]
        position += int(offset)
|
||||
|
||||
def query(self, points):
    """
    @summary: Estimate a set of test points given the model we built.
    @param points: numpy array in which each row is one query.
    @returns the result of self.query_point applied to every row.
    """
    return np.apply_along_axis(self.query_point, 1, points)
|
||||
|
||||
# Only runs when the file is executed directly, not when it is imported.
if __name__ == "__main__":
    print("the secret clue is 'zzyzx'")
|
||||
|
||||
Reference in New Issue
Block a user