# File: ML4T/assess_learners/DTLearner.py
# Listing metadata captured with the file (not part of the module):
#   1 / 0 / Fork 0 / 107 lines / 3.6 KiB / Python

import numpy as np
class DTLearner(object):
    """Decision-tree regression learner stored as a flat NumPy table.

    Every row of the tree is ``[factor, split_value, left, right]``:

    * Internal node: ``factor`` is the column index of the feature that is
      tested, ``split_value`` is the threshold (``<=`` goes left, ``>`` goes
      right) and ``left`` / ``right`` are row offsets relative to the node's
      own row.
    * Leaf: ``factor`` is ``LEAF``, ``split_value`` holds the predicted value
      and both offsets are ``NA``.
    """

    LEAF = -1  # Marker stored in the factor column of a leaf row.
    NA = -1  # Marker stored in the left/right columns of a leaf row.

    def __init__(self, leaf_size=1, verbose=False):
        """Store the learner configuration.

        Args:
            leaf_size: Maximum number of samples that are aggregated into a
                single leaf instead of being split further.
            verbose: Stored on the instance; not read by any method of this
                class.
        """
        self.leaf_size = leaf_size
        self.verbose = verbose
        # Set by addEvidence(); None means the learner is untrained.
        self.rel_tree = None

    def author(self):
        """Return the user name of the author of this learner."""
        return 'felixm'

    def create_node(self, factor, split_value, left, right):
        """Return a single tree row as a ``(1, 4)`` float array.

        The dtype is forced to float so that leaf rows and internal rows can
        always be concatenated without surprising dtype promotion.
        """
        return np.array([[factor, split_value, left, right]], dtype=float)

    def get_max_correlation(self, xs, y):
        """Return the index of the column of ``xs`` that correlates best with ``y``.

        "Best" means the highest absolute Pearson correlation. ``np.corrcoef``
        returns a 2x2 matrix; element ``[0, 1]`` is the scalar correlation
        between the column and ``y``.

        A constant column yields NaN. NaN never compares greater than the
        running maximum, so such columns are skipped. (A plain ``np.argmax``
        over the correlations would instead pick the NaN entry, which is why
        the explicit loop is used.) Ties and the all-NaN case resolve to the
        lowest index.
        """
        best_index = 0
        best_corr = 0.0
        # Constant columns trigger 0/0 inside corrcoef; silence the warning
        # because the resulting NaN is handled by the comparison below.
        with np.errstate(divide="ignore", invalid="ignore"):
            for index in range(xs.shape[1]):
                corr = abs(np.corrcoef(xs[:, index], y=y)[0, 1])
                if corr > best_corr:
                    best_corr = corr
                    best_index = index
        return best_index

    def make_tree_absolute(self, tree):
        """Return a copy of ``tree`` whose child offsets are absolute row indices.

        Leaf rows (left column equal to ``NA``) are left unchanged. The input
        array is not modified, so the relative tree stays usable for queries.
        """
        abs_tree = np.array(tree, dtype=float, copy=True)
        rows = np.arange(abs_tree.shape[0])
        internal = abs_tree[:, 2] != self.NA
        abs_tree[internal, 2] += rows[internal]
        abs_tree[internal, 3] += rows[internal]
        return abs_tree

    def build_tree(self, xs, y):
        """Recursively build the relative-offset tree for ``xs`` / ``y``.

        Args:
            xs: 2-D array, one sample per row and one feature per column.
            y: 1-D array with one target value per row of ``xs``.

        Returns:
            A ``(n_nodes, 4)`` float array laid out as
            ``[root, left subtree, right subtree]``.

        Raises:
            ValueError: If ``xs`` and ``y`` differ in length or are empty.
        """
        if xs.shape[0] != y.shape[0]:
            raise ValueError("xs and y must have the same number of rows")
        if xs.shape[0] == 0:
            raise ValueError("cannot build a tree from zero samples")

        # All targets identical: nothing left to learn.
        if np.all(y[0] == y):
            return self.create_node(self.LEAF, y[0], self.NA, self.NA)
        # Few enough samples: aggregate them into one leaf.
        if xs.shape[0] <= self.leaf_size:
            return self.create_node(self.LEAF, np.mean(y), self.NA, self.NA)

        i = self.get_max_correlation(xs, y)
        column = xs[:, i]
        split_value = np.median(column)
        if not (column > split_value).any():
            # The median equals the maximum, so "<= median" would send every
            # sample left. Move the threshold to the largest value below the
            # median; the partition is then still expressed with "<=", which
            # is the comparison query_point() uses.
            below = column[column < split_value]
            if below.size == 0:
                # The feature is constant (or NaN): no split is possible.
                return self.create_node(self.LEAF, np.mean(y),
                                        self.NA, self.NA)
            split_value = below.max()

        select_lt = column <= split_value
        select_rt = ~select_lt
        lt = self.build_tree(xs[select_lt], y[select_lt])
        rt = self.build_tree(xs[select_rt], y[select_rt])
        # The right subtree starts directly after the root and the whole left
        # subtree, hence the offset is the size of the LEFT subtree plus one.
        root = self.create_node(i, split_value, 1, lt.shape[0] + 1)
        return np.concatenate([root, lt, rt])

    def addEvidence(self, data_x, data_y):
        """Train the learner, replacing any previously built tree.

        Args:
            data_x: X values of the training data, one sample per row.
            data_y: Y training values, one per row of ``data_x``.
        """
        self.rel_tree = self.build_tree(np.asarray(data_x),
                                        np.asarray(data_y))

    def query_point(self, point):
        """Return the prediction for a single sample.

        Args:
            point: 1-D sequence of feature values, indexed by feature column.

        Raises:
            RuntimeError: If the learner has not been trained yet.
        """
        tree = self.rel_tree
        if tree is None:
            raise RuntimeError("addEvidence() must be called before querying")
        node_index = 0
        while tree[node_index, 0] != self.LEAF:
            factor, split_value, left, right = tree[node_index]
            if point[int(factor)] <= split_value:
                node_index += int(left)
            else:
                node_index += int(right)
        return tree[node_index, 1]

    def query(self, points):
        """Estimate a set of test points given the model that was built.

        Args:
            points: 2-D numpy array with each row corresponding to one query.

        Returns:
            1-D float array with one estimated value per row of ``points``
            (empty when ``points`` has no rows).
        """
        return np.array([self.query_point(point) for point in points],
                        dtype=float)
# Script entry point: only runs when the file is executed directly, not on import.
if __name__ == "__main__":
    print("the secret clue is 'zzyzx'")