Fix DTLearner. The issue was that I took the length of the wrong tree (right instead of left) for the root. Also avoid code duplication via an abstract tree learner class because why not.

This commit is contained in:
2020-09-24 22:15:41 -04:00
parent 3f2d2f4df3
commit 7007bc7514
6 changed files with 145 additions and 523 deletions

View File

@@ -1,10 +1,8 @@
import numpy as np
from AbstractTreeLearner import AbstractTreeLearner
class DTLearner(object):
LEAF = -1
NA = -1
class DTLearner(AbstractTreeLearner):
def __init__(self, leaf_size = 1, verbose = False):
self.leaf_size = leaf_size
@@ -13,87 +11,28 @@ class DTLearner(object):
def author(self):
    """Return the author's Georgia Tech username."""
    # replace tb34 with your Georgia Tech username
    return 'felixm'
def create_node(self, factor, split_value, left, right):
    """Build one tree row: [factor, split_value, left_offset, right_offset]."""
    row = [factor, split_value, left, right]
    return np.array([row])
def get_correlations(self, xs, y):
    """Return a list of 2-tuples (correlation, column_index), sorted by
    highest absolute correlation with y first.

    NOTE(review): this span was garbled diff residue mixing the removed
    get_max_correlation with the added get_correlations; reconstructed the
    post-commit method.
    """
    correlations = []
    for i in range(xs.shape[1]):
        # np.corrcoef returns a 2x2 matrix; [0, 1] extracts the scalar
        # correlation between column i of xs and y.
        c = abs(np.corrcoef(xs[:, i], y=y)[0, 1])
        correlations.append((c, i))
    correlations.sort(reverse=True)
    return correlations
def get_i_and_split_value(self, xs, y):
    """Return (i, split_value): the factor column i with the highest absolute
    correlation to y whose median actually separates the rows, and the median
    of that column as the split value."""
    for _, i in self.get_correlations(xs, y):
        split_value = np.median(xs[:, i])
        select = xs[:, i] <= split_value
        # If all elements were <= the median we would get one sub-tree with
        # zero elements, but each side needs at least one row. In that case
        # fall through to the column with the next smaller correlation.
        if not select.all():
            break
    return i, split_value

def build_tree(self, xs, y):
    """Recursively build the decision tree as a numpy array whose rows are
    [factor, split_value, left_offset, right_offset], with child offsets
    relative to the row's own index. Leaf rows use factor == self.LEAF and
    store the prediction in the split_value slot.

    NOTE(review): this span was garbled diff residue; reconstructed per the
    commit, including the stated fix (root's right offset uses the LEFT
    sub-tree's length, not the right's).
    """
    assert xs.shape[0] == y.shape[0]
    assert xs.shape[0] > 0  # If this is 0 something went wrong.
    if xs.shape[0] <= self.leaf_size:
        # Few enough samples: aggregate them into a single leaf.
        return self.create_node(self.LEAF, np.median(y), self.NA, self.NA)
    if np.all(y[0] == y):
        # All labels identical: splitting further cannot help.
        return self.create_node(self.LEAF, y[0], self.NA, self.NA)
    i, split_value = self.get_i_and_split_value(xs, y)
    select_lt = xs[:, i] <= split_value
    select_rt = xs[:, i] > split_value
    lt = self.build_tree(xs[select_lt], y[select_lt])
    rt = self.build_tree(xs[select_rt], y[select_rt])
    # Left child immediately follows the root (offset 1); the right child
    # starts after the whole left sub-tree — hence lt.shape[0] + 1.
    root = self.create_node(i, split_value, 1, lt.shape[0] + 1)
    return np.concatenate([root, lt, rt])
def addEvidence(self, data_x, data_y):
    """
    @summary: Add training data to learner
    @param dataX: X values of data to add
    @param dataY: the Y training values
    """
    tree = self.build_tree(data_x, data_y)
    self.rel_tree = tree
def query_point(self, point):
    """Walk the relative-offset tree from the root down to a leaf for a
    single query point and return that leaf's prediction value."""
    idx = 0
    while True:
        factor, split_value, left, right = self.rel_tree[idx]
        if factor == self.LEAF:
            # Leaf rows store the prediction in the split_value slot.
            return split_value
        if point[int(factor)] <= split_value:
            idx += int(left)
        else:
            idx += int(right)
def query(self, points):
    """
    @summary: Estimate a set of test points given the model we built.
    @param points: should be a numpy array with each row corresponding to a specific query.
    @returns the estimated values according to the saved model.
    """
    # Apply the per-point tree walk to every row of the query matrix.
    return np.apply_along_axis(self.query_point, 1, points)
# Script entry point: prints the identification string expected by the grader.
if __name__ == "__main__":
    print("the secret clue is 'zzyzx'")