diff --git a/assess_learners/figure_1.png b/assess_learners/figure_1.png
new file mode 100644
index 0000000..9d03130
Binary files /dev/null and b/assess_learners/figure_1.png differ
diff --git a/assess_learners/figure_2.png b/assess_learners/figure_2.png
new file mode 100644
index 0000000..f5c2e5f
Binary files /dev/null and b/assess_learners/figure_2.png differ
diff --git a/assess_learners/figure_3.png b/assess_learners/figure_3.png
new file mode 100644
index 0000000..836ebb2
Binary files /dev/null and b/assess_learners/figure_3.png differ
diff --git a/assess_learners/figure_4.png b/assess_learners/figure_4.png
new file mode 100644
index 0000000..734a106
Binary files /dev/null and b/assess_learners/figure_4.png differ
diff --git a/assess_learners/figure_5.png b/assess_learners/figure_5.png
new file mode 100644
index 0000000..79a5989
Binary files /dev/null and b/assess_learners/figure_5.png differ
diff --git a/assess_learners/figure_6.png b/assess_learners/figure_6.png
new file mode 100644
index 0000000..3ea4191
Binary files /dev/null and b/assess_learners/figure_6.png differ
diff --git a/assess_learners/figure_7.png b/assess_learners/figure_7.png
new file mode 100644
index 0000000..341eb88
Binary files /dev/null and b/assess_learners/figure_7.png differ
diff --git a/assess_learners/testlearner.py b/assess_learners/testlearner.py
index bb3f027..01d08a3 100644
--- a/assess_learners/testlearner.py
+++ b/assess_learners/testlearner.py
@@ -23,6 +23,8 @@ GT honor code violation.
 """
 
 import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
 import math
 import LinRegLearner as lrl
 import DTLearner as dtl
@@ -30,8 +32,148 @@ import RTLearner as rtl
 import BagLearner as bgl
 import InsaneLearner as isl
 import sys
+from dataclasses import dataclass
 
-if __name__=="__main__":
+
+@dataclass
+class EvaluationResult:
+    rmse_in: float
+    rmse_out: float
+    corr_in: float
+    corr_out: float
+
+
+def test_learner(data, learner_class, **kwargs):
+    trainX, trainY, testX, testY = data
+    print("\n-----------")
+    print(f"name={learner_class.__name__} {kwargs=}")
+    learner = learner_class(**kwargs)
+    learner.addEvidence(trainX, trainY)
+    print(learner.author())
+
+    # evaluate in sample
+    predY = learner.query(trainX)  # get the predictions
+    rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
+    print()
+    print("In sample results")
+    print(f"RMSE: {rmse}")
+    c = np.corrcoef(predY, y=trainY)
+    print(f"corr: {c[0,1]}")
+
+    # evaluate out of sample
+    predY = learner.query(testX)  # get the predictions
+    rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
+    print()
+    print("Out of sample results")
+    print(f"RMSE: {rmse}")
+    c = np.corrcoef(predY, y=testY)
+    print(f"corr: {c[0,1]}")
+    print()
+
+
+def test_learners(data):
+    test_learner(data, lrl.LinRegLearner)
+    # test_learner(data, dtl.DTLearner, leaf_size=1)
+    # test_learner(data, rtl.RTLearner, leaf_size=6)
+    test_learner(data, bgl.BagLearner, learner=dtl.DTLearner, bags=20,
+                 kwargs={'leaf_size': 5})
+    # test_learner(data, isl.InsaneLearner)
+
+
+def eval_learner(data, learner_class, **kwargs):
+    trainX, trainY, testX, testY = data
+    learner = learner_class(**kwargs)
+    learner.addEvidence(trainX, trainY)
+
+    # evaluate in sample
+    predY = learner.query(trainX)  # get the predictions
+    rmse_in_sample = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
+    corr_in_sample = np.corrcoef(predY, y=trainY)[0,1]
+
+    # evaluate out of sample
+    predY = learner.query(testX)  # get the predictions
+    rmse_out_sample = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
+    corr_out_sample = np.corrcoef(predY, y=testY)[0,1]
+    r = EvaluationResult(rmse_in_sample, rmse_out_sample,
+                         corr_in_sample, corr_out_sample)
+    return r
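+
+
+# eval_learner condenses what test_learner prints: RMSE here is
+# sqrt(mean((y - predY) ** 2)) and corr is the Pearson correlation taken
+# from np.corrcoef. Out-of-sample RMSE rising while in-sample RMSE keeps
+# falling is the overfitting signature the experiments below look for.
+# Illustrative usage (a sketch, not called anywhere in this file):
+#
+#     r = eval_learner(data, dtl.DTLearner, leaf_size=5)
+#     print(r.rmse_in, r.rmse_out)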
+
+
+def experiment_1(data):
+    """
+    Does overfitting occur with respect to leaf_size? Use the dataset
+    Istanbul.csv with DTLearner. For which values of leaf_size does
+    overfitting occur? Use RMSE as your metric for assessing overfitting.
+    Support your assertion with graphs/charts. (Don't use bagging.)
+    """
+    results = []
+    for leaf_size in range(1, 15):
+        r = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size)
+        results.append([leaf_size, r.rmse_in, r.rmse_out,
+                        r.corr_in, r.corr_out])
+    cs = ["leaf_size", "rmse_in", "rmse_out", "corr_in", "corr_out"]
+    df = pd.DataFrame(results, columns=cs)
+    df.plot(title="DT Learner RMSE depending on leaf size for Istanbul dataset",
+            xlabel="leaf size", ylabel="RMSE",
+            x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
+    plt.savefig("figure_1.png")
+
+
+def experiment_2(data):
+    """
+    Can bagging reduce or eliminate overfitting with respect to leaf_size?
+    Again use the dataset Istanbul.csv with DTLearner. To investigate this,
+    choose a fixed number of bags to use and vary leaf_size to evaluate.
+    Provide charts to validate your conclusions. Use RMSE as your metric.
+    """
+    def run_learner(leaf_size, bag_size):
+        r = eval_learner(data, bgl.BagLearner, learner=dtl.DTLearner,
+                         bags=bag_size,
+                         kwargs={'leaf_size': leaf_size})
+        return [r.rmse_in, r.rmse_out]
+
+    for i, bag_size in enumerate([5, 10, 15, 20]):
+        results = [[leaf_size] + run_learner(leaf_size, bag_size=bag_size)
+                   for leaf_size in range(1, 10)]
+        cs = ["leaf_size", "rmse_in", "rmse_out"]
+        df = pd.DataFrame(results, columns=cs)
+        df.plot(title=f"Bag of {bag_size} DT Learners RMSE over leaf size",
+                xlabel="leaf size", ylabel="RMSE",
+                x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
+        plt.savefig(f"figure_{i + 2}.png")
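+
+
+# Reproducibility sketch: BagLearner resamples the training rows at random,
+# so figures 2-5 can differ from run to run. Seeding numpy's global RNG
+# before the experiments would pin them down, assuming BagLearner and
+# RTLearner draw from it (not verified here), e.g.:
+#
+#     np.random.seed(42)  # hypothetical fixed seed; any value works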
+ """ + + def run_learner(leaf_size): + r1 = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size) + r2 = eval_learner (data, rtl.RTLearner, leaf_size=leaf_size) + return [r1.corr_in, r1.corr_out, r2.corr_in, r2.corr_out] + + results = [[leaf_size] + run_learner(leaf_size) + for leaf_size in range(1, 10)] + cs = ["leaf_size", "DT_corr_in", "DT_corr_out", + "RT_corr_in", "RT_corr_out"] + df = pd.DataFrame(results, columns=cs) + df.plot(title=f"Correlations of DT and RT for training data", + xlabel="leaf size", ylabel="Correlation", + x="leaf_size", y=["DT_corr_in", "RT_corr_in"], + kind="line") + plt.savefig(f"figure_6.png") + df.plot(title=f"Correlations of DT and RT for test data", + xlabel="leaf size", ylabel="Correlation", + x="leaf_size", y=["DT_corr_out", "RT_corr_out"], + kind="line") + plt.savefig(f"figure_7.png") + + +def main(): if len(sys.argv) != 2: print("Usage: python testlearner.py ") sys.exit(1) @@ -48,38 +190,14 @@ if __name__=="__main__": trainY = data[:train_rows,-1] testX = data[train_rows:,0:-1] testY = data[train_rows:,-1] - print(f"{testX.shape}") - print(f"{testY.shape}") + data = (trainX, trainY, testX, testY) - def test_learner(learner_class, **kwargs): - print("\n-----------") - print(f"name={learner_class.__name__} {kwargs=}") - learner = learner_class(**kwargs) - learner.addEvidence(trainX, trainY) - print(learner.author()) + # test_learners(data) + experiment_1(data) + experiment_2(data) + experiment_3(data) - # evaluate in sample - predY = learner.query(trainX) # get the predictions - rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0]) - print() - print("In sample results") - print(f"RMSE: {rmse}") - c = np.corrcoef(predY, y=trainY) - print(f"corr: {c[0,1]}") - # evaluate out of sample - predY = learner.query(testX) # get the predictions - rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) - print() - print("Out of sample results") - print(f"RMSE: {rmse}") - c = np.corrcoef(predY, y=testY) - print(f"corr: {c[0,1]}") - print() - - # test_learner(lrl.LinRegLearner) - test_learner(dtl.DTLearner, leaf_size=1) - # test_learner(rtl.RTLearner, leaf_size=6) - test_learner(bgl.BagLearner, learner=dtl.DTLearner, bags=20, kwargs = {'leaf_size': 5}) - test_learner(isl.InsaneLearner) +if __name__=="__main__": + main()