Finish first version of DTLearner. Needs testing.
parent
f823029a50
commit
51b2c9ceb0
|
@ -13,9 +13,36 @@ class DTLearner(object):
|
||||||
def author(self):
|
def author(self):
|
||||||
return 'felixm' # replace tb34 with your Georgia Tech username
|
return 'felixm' # replace tb34 with your Georgia Tech username
|
||||||
|
|
||||||
def create_node(self, factor: int, split: int, left: int, right: int):
|
def create_node(self, factor, split_value, left, right):
|
||||||
return np.array((factor, split, left, right))
|
return np.array([[factor, split_value, left, right], ])
|
||||||
|
|
||||||
|
def get_max_correlation(self, xs, y):
|
||||||
|
""" Return the index of the column x of xs that has the highest
|
||||||
|
absolute correlation with y. I would like to get a scalar value from
|
||||||
|
np.corrcoef instead of a matrix, so I use [0, 1] to get a scalar value
|
||||||
|
from the matrix. """
|
||||||
|
# This should deliver the same result, but does not. I am not willing
|
||||||
|
# to investigate right now.
|
||||||
|
# a = np.argmax([abs(np.corrcoef(xs[:,i], y)[0, 1])
|
||||||
|
# for i in range(xs.shape[1])])
|
||||||
|
i_max = 0
|
||||||
|
corr_max = 0
|
||||||
|
for i in range(xs.shape[1]):
|
||||||
|
corr_matrix = np.corrcoef(xs[:, i], y=y)
|
||||||
|
corr = corr_matrix[0, 1]
|
||||||
|
corr = abs(corr)
|
||||||
|
if corr > corr_max:
|
||||||
|
corr_max = corr
|
||||||
|
i_max = i
|
||||||
|
return i_max
|
||||||
|
|
||||||
|
def make_tree_absolute(self, tree):
|
||||||
|
for i in range(tree.shape[0]):
|
||||||
|
if tree[i, 2] == self.NA:
|
||||||
|
continue
|
||||||
|
tree[i, 2] = i + tree[i, 2]
|
||||||
|
tree[i, 3] = i + tree[i, 3]
|
||||||
|
return tree
|
||||||
|
|
||||||
def build_tree(self, xs, y):
|
def build_tree(self, xs, y):
|
||||||
assert(xs.shape[0] == y.shape[0])
|
assert(xs.shape[0] == y.shape[0])
|
||||||
|
@ -25,14 +52,24 @@ class DTLearner(object):
|
||||||
return self.create_node(self.LEAF, y[0], self.NA, self.NA)
|
return self.create_node(self.LEAF, y[0], self.NA, self.NA)
|
||||||
|
|
||||||
if np.all(y[0] == y):
|
if np.all(y[0] == y):
|
||||||
return self.create_node(self.LEAV, y[0], self.NA, self.NA)
|
return self.create_node(self.LEAF, y[0], self.NA, self.NA)
|
||||||
|
|
||||||
# XXX: continue here
|
i = self.get_max_correlation(xs, y)
|
||||||
y = np.array([y])
|
split_value = np.median(xs[:,i])
|
||||||
correlations = np.corrcoef(xs, y, rowvar=True)
|
|
||||||
print(f"{correlations=}")
|
|
||||||
|
|
||||||
return 0
|
select_lt = xs[:, i] <= split_value
|
||||||
|
select_rt = xs[:, i] > split_value
|
||||||
|
# Avoid the case where all values are lower than or equal to the median.
|
||||||
|
if select_lt.all() or select_rt.all():
|
||||||
|
select_lt = xs[:, i] < split_value
|
||||||
|
select_rt = xs[:, i] >= split_value
|
||||||
|
|
||||||
|
lt = self.build_tree(xs[select_lt], y[select_lt])
|
||||||
|
rt = self.build_tree(xs[select_rt], y[select_rt])
|
||||||
|
root = self.create_node(i, split_value, 1, rt.shape[0] + 1)
|
||||||
|
|
||||||
|
root = np.concatenate([root, lt, rt])
|
||||||
|
return root
|
||||||
|
|
||||||
def addEvidence(self, data_x, data_y):
|
def addEvidence(self, data_x, data_y):
|
||||||
"""
|
"""
|
||||||
|
@ -40,21 +77,30 @@ class DTLearner(object):
|
||||||
@param dataX: X values of data to add
|
@param dataX: X values of data to add
|
||||||
@param dataY: the Y training values
|
@param dataY: the Y training values
|
||||||
"""
|
"""
|
||||||
if self.verbose:
|
self.rel_tree = self.build_tree(data_x, data_y)
|
||||||
print(data_x)
|
# self.abs_tree = self.make_tree_absolute(self.rel_tree)
|
||||||
print(data_y)
|
|
||||||
self.tree = self.build_tree(data_x, data_y)
|
|
||||||
|
|
||||||
|
def query_point(self, point):
|
||||||
|
node_index = 0
|
||||||
|
while self.rel_tree[node_index, 0] != self.LEAF:
|
||||||
|
node = self.rel_tree[node_index]
|
||||||
|
split_factor = int(node[0])
|
||||||
|
split_value = node[1]
|
||||||
|
if point[split_factor] <= split_value:
|
||||||
|
node_index += int(node[2])
|
||||||
|
else:
|
||||||
|
node_index += int(node[3])
|
||||||
|
return self.rel_tree[node_index, 1]
|
||||||
|
|
||||||
|
def query(self, points):
|
||||||
def query(self,points):
|
|
||||||
"""
|
"""
|
||||||
@summary: Estimate a set of test points given the model we built.
|
@summary: Estimate a set of test points given the model we built.
|
||||||
@param points: should be a numpy array with each row corresponding to a specific query.
|
@param points: should be a numpy array with each row corresponding to a specific query.
|
||||||
@returns the estimated values according to the saved model.
|
@returns the estimated values according to the saved model.
|
||||||
"""
|
"""
|
||||||
return
|
query_point = lambda p: self.query_point(p)
|
||||||
# return (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1]
|
r = np.apply_along_axis(query_point, 1, points)
|
||||||
|
return r
|
||||||
|
|
||||||
if __name__=="__main__":
|
if __name__=="__main__":
|
||||||
print("the secret clue is 'zzyzx'")
|
print("the secret clue is 'zzyzx'")
|
||||||
|
|
|
@ -52,18 +52,15 @@ if __name__=="__main__":
|
||||||
testX = data[train_rows:,0:-1]
|
testX = data[train_rows:,0:-1]
|
||||||
testY = data[train_rows:,-1]
|
testY = data[train_rows:,-1]
|
||||||
|
|
||||||
# print(f"{testX.shape}")
|
print(f"{testX.shape}")
|
||||||
# print(f"{testY.shape}")
|
print(f"{testY.shape}")
|
||||||
|
|
||||||
# create a learner and train it
|
# create a learner and train it
|
||||||
# learner = lrl.LinRegLearner(verbose = True) # create a LinRegLearner
|
# learner = lrl.LinRegLearner(verbose = True) # create a LinRegLearner
|
||||||
learner = dtl.DTLearner(verbose = True) # create a LinRegLearner
|
learner = dtl.DTLearner(verbose = True) # create a DTLearner
|
||||||
# learner.addEvidence(trainX, trainY) # train it #XXX split back into test and non-test
|
learner.addEvidence(trainX, trainY)
|
||||||
learner.addEvidence(data[:,0:-1], data[:,-1])
|
|
||||||
print(learner.author())
|
print(learner.author())
|
||||||
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# evaluate in sample
|
# evaluate in sample
|
||||||
predY = learner.query(trainX) # get the predictions
|
predY = learner.query(trainX) # get the predictions
|
||||||
rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
|
rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
|
||||||
|
|
Loading…
Reference in New Issue