""" Test a learner. (c) 2015 Tucker Balch Copyright 2018, Georgia Institute of Technology (Georgia Tech) Atlanta, Georgia 30332 All Rights Reserved Template code for CS 4646/7646 Georgia Tech asserts copyright ownership of this template and all derivative works, including solutions to the projects assigned in this course. Students and other users of this template code are advised not to share it with others or to make it available on publicly viewable websites including repositories such as github and gitlab. This copyright statement should not be removed or edited. We do grant permission to share solutions privately with non-students such as potential employers. However, sharing with other current or future students of CS 7646 is prohibited and subject to being investigated as a GT honor code violation. -----do not edit anything above this line--- """ import numpy as np import pandas as pd import matplotlib.pyplot as plt import math import LinRegLearner as lrl import DTLearner as dtl import RTLearner as rtl import BagLearner as bgl import InsaneLearner as isl import sys from dataclasses import dataclass @dataclass class EvaluationResult: rmse_in: float rmse_out: float corr_in: float corr_out: float def test_learner(data, learner_class, **kwargs): trainX, trainY, testX, testY = data print("\n-----------") print(f"name={learner_class.__name__} {kwargs=}") learner = learner_class(**kwargs) learner.addEvidence(trainX, trainY) print(learner.author()) # evaluate in sample predY = learner.query(trainX) # get the predictions rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0]) print() print("In sample results") print(f"RMSE: {rmse}") c = np.corrcoef(predY, y=trainY) print(f"corr: {c[0,1]}") # evaluate out of sample predY = learner.query(testX) # get the predictions rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) print() print("Out of sample results") print(f"RMSE: {rmse}") c = np.corrcoef(predY, y=testY) print(f"corr: {c[0,1]}") print() def test_learners(data): test_learner(data, lrl.LinRegLearner) # test_learner(data, dtl.DTLearner, leaf_size=1) # test_learner(data, rtl.RTLearner, leaf_size=6) test_learner(data, bgl.BagLearner, learner=dtl.DTLearner, bags=20, kwargs = {'leaf_size': 5}) # test_learner(data, isl.InsaneLearner) def eval_learner(data, learner_class, **kwargs): trainX, trainY, testX, testY = data learner = learner_class(**kwargs) learner.addEvidence(trainX, trainY) # evaluate in sample predY = learner.query(trainX) # get the predictions rmse_in_sample = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0]) corr_in_sample = np.corrcoef(predY, y=trainY)[0,1] # evaluate out of sample predY = learner.query(testX) # get the predictions rmse_out_sample = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]) corr_out_sample = np.corrcoef(predY, y=testY)[0,1] r = EvaluationResult(rmse_in_sample, rmse_out_sample, corr_in_sample, corr_out_sample) return r def experiment_1(data): """ Does overfitting occur with respect to leaf_size? Use the dataset Istanbul.csv with DTLearner. For which values of leaf_size does overfitting occur? Use RMSE as your metric for assessing overfitting. Support your assertion with graphs/charts. (Don't use bagging). """ results = [[i, r.rmse_in, r.rmse_out, r.corr_in, r.corr_out] for i in range(1, 15) if (r := eval_learner(data, dtl.DTLearner, leaf_size=i))] cs = ["leaf_size", "rmse_in", "rmse_out", "corr_in", "corr_out"] df = pd.DataFrame(results, columns=cs) df.plot(title="DT Learner RMSE depending on leaf size for Istanbul dataset", xlabel="leaf size", ylabel="RMSE", x="leaf_size", y=["rmse_in", "rmse_out"], kind="line") plt.savefig("figure_1.png") def experiment_2(data): """ Can bagging reduce or eliminate overfitting with respect to leaf_size? Again use the dataset Istanbul.csv with DTLearner. To investigate this choose a fixed number of bags to use and vary leaf_size to evaluate. Provide charts to validate your conclusions. Use RMSE as your metric. """ def run_learner(leaf_size, bag_size): r = eval_learner(data, bgl.BagLearner, learner=dtl.DTLearner, bags=bag_size, kwargs={'leaf_size': leaf_size}) return [r.rmse_in, r.rmse_out] for i, bag_size in enumerate([5, 10, 15, 20]): results = [[leaf_size] + run_learner(leaf_size, bag_size=bag_size) for leaf_size in range(1, 10)] cs = ["leaf_size", "rmse_in", "rmse_out"] df = pd.DataFrame(results, columns=cs) df.plot(title=f"Bag of {bag_size} DT Learners RMSE over leaf size", xlabel="leaf size", ylabel="RMSE", x="leaf_size", y=["rmse_in", "rmse_out"], kind="line") plt.savefig(f"figure_{i + 2}.png") def experiment_3(data): """ Quantitatively compare "classic" decision trees (DTLearner) versus random trees (RTLearner). In which ways is one method better than the other? Provide at least two quantitative measures. Important, using two similar measures that illustrate the same broader metric does not count as two. (For example, do not use two measures for accuracy.) Note for this part of the report you must conduct new experiments, don't use the results of the experiments above for this(RMSE is not allowed as a new experiment). """ def run_learner(leaf_size): r1 = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size) r2 = eval_learner (data, rtl.RTLearner, leaf_size=leaf_size) return [r1.corr_in, r1.corr_out, r2.corr_in, r2.corr_out] results = [[leaf_size] + run_learner(leaf_size) for leaf_size in range(1, 10)] cs = ["leaf_size", "DT_corr_in", "DT_corr_out", "RT_corr_in", "RT_corr_out"] df = pd.DataFrame(results, columns=cs) df.plot(title=f"Correlations of DT and RT for training data", xlabel="leaf size", ylabel="Correlation", x="leaf_size", y=["DT_corr_in", "RT_corr_in"], kind="line") plt.savefig(f"figure_6.png") df.plot(title=f"Correlations of DT and RT for test data", xlabel="leaf size", ylabel="Correlation", x="leaf_size", y=["DT_corr_out", "RT_corr_out"], kind="line") plt.savefig(f"figure_7.png") def main(): if len(sys.argv) != 2: print("Usage: python testlearner.py ") sys.exit(1) inf = open(sys.argv[1]) data = np.array([list(map(float,s.strip().split(',')[1:])) for s in inf.readlines()[1:]]) # compute how much of the data is training and testing train_rows = int(0.6* data.shape[0]) test_rows = data.shape[0] - train_rows # separate out training and testing data trainX = data[:train_rows,0:-1] trainY = data[:train_rows,-1] testX = data[train_rows:,0:-1] testY = data[train_rows:,-1] data = (trainX, trainY, testX, testY) # test_learners(data) experiment_1(data) experiment_2(data) experiment_3(data) if __name__=="__main__": main()