Compare commits

...

2 Commits

Author SHA1 Message Date
8ee47c9a1d Finish report for project 3. 2020-09-26 10:52:05 -04:00
3ef06ccc96 Create charts for project 3 report. 2020-09-26 10:29:53 -04:00
11 changed files with 197 additions and 34 deletions

View File

@@ -34,7 +34,8 @@ unzip -n zips/*.zip -d ./
# Reports
- [Report 1](./martingale/martingale.md)
- [Report 2](./optimize_something/readme.md)
- [Report 3](#)
- [Report 2](./optimize_something/optimize_something.md)
- [Report 3](./assess_learners/assess_learners.md)
- [Report 4](#)

View File

@@ -0,0 +1,44 @@
# Report
## Experiment 1
Significant overfitting occurs for leaf sizes smaller than five. The chart shows
that the root-mean-square-error is significantly higher for the test data
(`rmse_out`) for leaf sizes smaller than five.
Between five and nine, the error for the test data is only slightly higher, so
the overfitting is only slight. Beyond that, the errors increase, and the error for
the test data is lower than for the training data. In other words, there is no
more overfitting for leaf sizes greater than nine.
![](figure_1.png)
## Experiment 2
For all bag sizes, the difference between the RMSE for the training data and the
test data is smaller than without bagging. The test data still has a higher RMSE
up to a leaf size of five. For greater leaf sizes, the RMSE for the test data is
smaller than for the training data for all bag sizes, so there is no
overfitting.
![](figure_2.png)
![](figure_3.png)
![](figure_4.png)
![](figure_5.png)
## Experiment 3
The Random Tree learner has a correlation of one for the training data at a leaf
size of one; in other words, it fits the training data perfectly. Consequently, the correlation for
the test data is worse than for the Decision Tree learner. The DT learner has a
higher correlation than the RT for all other leaf sizes, both for the training
and the test data.
![](figure_6.png)
![](figure_7.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

View File

@@ -23,6 +23,8 @@ GT honor code violation.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import LinRegLearner as lrl
import DTLearner as dtl
@@ -30,8 +32,148 @@ import RTLearner as rtl
import BagLearner as bgl
import InsaneLearner as isl
import sys
from dataclasses import dataclass
if __name__=="__main__":
@dataclass
class EvaluationResult:
    """In-sample and out-of-sample metrics for one trained learner."""

    rmse_in: float   # root-mean-square error on the training split
    rmse_out: float  # root-mean-square error on the test split
    corr_in: float   # correlation of predictions with training labels
    corr_out: float  # correlation of predictions with test labels
def test_learner(data, learner_class, **kwargs):
trainX, trainY, testX, testY = data
print("\n-----------")
print(f"name={learner_class.__name__} {kwargs=}")
learner = learner_class(**kwargs)
learner.addEvidence(trainX, trainY)
print(learner.author())
# evaluate in sample
predY = learner.query(trainX) # get the predictions
rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
print()
print("In sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=trainY)
print(f"corr: {c[0,1]}")
# evaluate out of sample
predY = learner.query(testX) # get the predictions
rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
print()
print("Out of sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=testY)
print(f"corr: {c[0,1]}")
print()
def test_learners(data):
    """Run the print-based evaluation for the currently selected learners."""
    test_learner(data, lrl.LinRegLearner)
    # test_learner(data, dtl.DTLearner, leaf_size=1)
    # test_learner(data, rtl.RTLearner, leaf_size=6)
    test_learner(data, bgl.BagLearner, learner=dtl.DTLearner, bags=20,
                 kwargs={'leaf_size': 5})
    # test_learner(data, isl.InsaneLearner)
def eval_learner(data, learner_class, **kwargs):
    """Train one learner and return its RMSE and correlation on both splits.

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays
    :param learner_class: learner type, instantiated with **kwargs
    :return: EvaluationResult with in- and out-of-sample metrics
    """
    train_x, train_y, test_x, test_y = data
    learner = learner_class(**kwargs)
    learner.addEvidence(train_x, train_y)

    def metrics(features, labels):
        # RMSE and correlation of the trained learner's predictions.
        pred = learner.query(features)
        rmse = math.sqrt(((labels - pred) ** 2).sum() / labels.shape[0])
        corr = np.corrcoef(pred, y=labels)[0, 1]
        return rmse, corr

    rmse_in, corr_in = metrics(train_x, train_y)
    rmse_out, corr_out = metrics(test_x, test_y)
    return EvaluationResult(rmse_in, rmse_out, corr_in, corr_out)
def experiment_1(data):
    """
    Does overfitting occur with respect to leaf_size? Use the dataset
    Istanbul.csv with DTLearner. For which values of leaf_size does overfitting
    occur? Use RMSE as your metric for assessing overfitting. Support your
    assertion with graphs/charts. (Don't use bagging).

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays

    Writes figure_1.png to the working directory.
    """
    # A plain loop instead of a comprehension whose `if (r := ...)` filter
    # only existed to smuggle in a binding (the filter was always truthy).
    results = []
    for leaf_size in range(1, 15):
        r = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size)
        results.append([leaf_size, r.rmse_in, r.rmse_out, r.corr_in, r.corr_out])
    cs = ["leaf_size", "rmse_in", "rmse_out", "corr_in", "corr_out"]
    df = pd.DataFrame(results, columns=cs)
    df.plot(title="DT Learner RMSE depending on leaf size for Istanbul dataset",
            xlabel="leaf size", ylabel="RMSE",
            x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
    plt.savefig("figure_1.png")
    plt.close("all")  # free the figure so repeated runs don't accumulate
def experiment_2(data):
    """
    Can bagging reduce or eliminate overfitting with respect to leaf_size?
    Again use the dataset Istanbul.csv with DTLearner. To investigate this
    choose a fixed number of bags to use and vary leaf_size to evaluate.
    Provide charts to validate your conclusions. Use RMSE as your metric.

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays

    Writes figure_2.png through figure_5.png, one per bag size.
    """
    # Figures are numbered from 2 because experiment_1 owns figure_1.png.
    for figure_no, bag_size in enumerate([5, 10, 15, 20], start=2):
        rows = []
        for leaf_size in range(1, 10):
            r = eval_learner(data, bgl.BagLearner, learner=dtl.DTLearner,
                             bags=bag_size, kwargs={'leaf_size': leaf_size})
            rows.append([leaf_size, r.rmse_in, r.rmse_out])
        df = pd.DataFrame(rows, columns=["leaf_size", "rmse_in", "rmse_out"])
        df.plot(title=f"Bag of {bag_size} DT Learners RMSE over leaf size",
                xlabel="leaf size", ylabel="RMSE",
                x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
        plt.savefig(f"figure_{figure_no}.png")
def experiment_3(data):
    """
    Quantitatively compare "classic" decision trees (DTLearner) versus random
    trees (RTLearner). In which ways is one method better than the other?
    Provide at least two quantitative measures. Important, using two similar
    measures that illustrate the same broader metric does not count as two.
    (For example, do not use two measures for accuracy.) Note for this part of
    the report you must conduct new experiments, don't use the results of the
    experiments above for this(RMSE is not allowed as a new experiment).

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays

    Writes figure_6.png (training correlations) and figure_7.png (test
    correlations).
    """
    # Fixed: removed f-prefixes from constant strings and the stray space
    # in `eval_learner (data, ...)`.
    def run_learner(leaf_size):
        # Evaluate DT and RT with identical settings so they are comparable.
        dt = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size)
        rt = eval_learner(data, rtl.RTLearner, leaf_size=leaf_size)
        return [dt.corr_in, dt.corr_out, rt.corr_in, rt.corr_out]

    results = [[leaf_size] + run_learner(leaf_size)
               for leaf_size in range(1, 10)]
    cs = ["leaf_size", "DT_corr_in", "DT_corr_out",
          "RT_corr_in", "RT_corr_out"]
    df = pd.DataFrame(results, columns=cs)
    df.plot(title="Correlations of DT and RT for training data",
            xlabel="leaf size", ylabel="Correlation",
            x="leaf_size", y=["DT_corr_in", "RT_corr_in"],
            kind="line")
    plt.savefig("figure_6.png")
    df.plot(title="Correlations of DT and RT for test data",
            xlabel="leaf size", ylabel="Correlation",
            x="leaf_size", y=["DT_corr_out", "RT_corr_out"],
            kind="line")
    plt.savefig("figure_7.png")
def main():
if len(sys.argv) != 2:
print("Usage: python testlearner.py <filename>")
sys.exit(1)
@@ -48,38 +190,14 @@ if __name__=="__main__":
trainY = data[:train_rows,-1]
testX = data[train_rows:,0:-1]
testY = data[train_rows:,-1]
print(f"{testX.shape}")
print(f"{testY.shape}")
data = (trainX, trainY, testX, testY)
def test_learner(learner_class, **kwargs):
print("\n-----------")
print(f"name={learner_class.__name__} {kwargs=}")
learner = learner_class(**kwargs)
learner.addEvidence(trainX, trainY)
print(learner.author())
# test_learners(data)
experiment_1(data)
experiment_2(data)
experiment_3(data)
# evaluate in sample
predY = learner.query(trainX) # get the predictions
rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
print()
print("In sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=trainY)
print(f"corr: {c[0,1]}")
# evaluate out of sample
predY = learner.query(testX) # get the predictions
rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
print()
print("Out of sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=testY)
print(f"corr: {c[0,1]}")
print()
# test_learner(lrl.LinRegLearner)
test_learner(dtl.DTLearner, leaf_size=1)
# test_learner(rtl.RTLearner, leaf_size=6)
test_learner(bgl.BagLearner, learner=dtl.DTLearner, bags=20, kwargs = {'leaf_size': 5})
test_learner(isl.InsaneLearner)
# Script entry point: delegate to main() so importing this module has no
# side effects.
if __name__=="__main__":
    main()