Create charts for project 3 report.
[7 new chart images added (28–38 KiB each); width/height metadata omitted]
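The seven figures are produced by the code below: figure_1.png by experiment_1, figure_2.png–figure_5.png by experiment_2, and figure_6.png/figure_7.png by experiment_3.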
@@ -23,6 +23,8 @@ GT honor code violation.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import LinRegLearner as lrl
import DTLearner as dtl

@@ -30,8 +32,148 @@ import RTLearner as rtl
import BagLearner as bgl
import InsaneLearner as isl
import sys

from dataclasses import dataclass

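# Container for the in-sample / out-of-sample RMSE and correlation of one learner run.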
@dataclass
class EvaluationResult:
    rmse_in: float
    rmse_out: float
    corr_in: float
    corr_out: float

def test_learner(data, learner_class, **kwargs):
    trainX, trainY, testX, testY = data
    print("\n-----------")
    print(f"name={learner_class.__name__} {kwargs=}")
    learner = learner_class(**kwargs)
    learner.addEvidence(trainX, trainY)
    print(learner.author())

    # evaluate in sample
    predY = learner.query(trainX)  # get the predictions
    rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
    print()
    print("In sample results")
    print(f"RMSE: {rmse}")
    c = np.corrcoef(predY, y=trainY)
    print(f"corr: {c[0,1]}")

    # evaluate out of sample
    predY = learner.query(testX)  # get the predictions
    rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
    print()
    print("Out of sample results")
    print(f"RMSE: {rmse}")
    c = np.corrcoef(predY, y=testY)
    print(f"corr: {c[0,1]}")
    print()

def test_learners(data):
    test_learner(data, lrl.LinRegLearner)
    # test_learner(data, dtl.DTLearner, leaf_size=1)
    # test_learner(data, rtl.RTLearner, leaf_size=6)
    test_learner(data, bgl.BagLearner, learner=dtl.DTLearner, bags=20, kwargs={'leaf_size': 5})
    # test_learner(data, isl.InsaneLearner)

def eval_learner(data, learner_class, **kwargs):
    trainX, trainY, testX, testY = data
    learner = learner_class(**kwargs)
    learner.addEvidence(trainX, trainY)

    # evaluate in sample
    predY = learner.query(trainX)  # get the predictions
    rmse_in_sample = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
    corr_in_sample = np.corrcoef(predY, y=trainY)[0,1]

    # evaluate out of sample
    predY = learner.query(testX)  # get the predictions
    rmse_out_sample = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
    corr_out_sample = np.corrcoef(predY, y=testY)[0,1]

    r = EvaluationResult(rmse_in_sample, rmse_out_sample,
                         corr_in_sample, corr_out_sample)
    return r

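# Example use (assumes `data` is the (trainX, trainY, testX, testY) tuple built in main()):
#   r = eval_learner(data, dtl.DTLearner, leaf_size=5)
#   print(r.rmse_in, r.rmse_out, r.corr_in, r.corr_out)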
def experiment_1(data):
    """
    Does overfitting occur with respect to leaf_size? Use the dataset
    Istanbul.csv with DTLearner. For which values of leaf_size does overfitting
    occur? Use RMSE as your metric for assessing overfitting. Support your
    assertion with graphs/charts. (Don't use bagging.)
    """
    # the walrus assignment only binds r for each leaf size; an EvaluationResult
    # is always truthy, so the "if" never filters out rows
    results = [[i, r.rmse_in, r.rmse_out, r.corr_in, r.corr_out]
               for i in range(1, 15)
               if (r := eval_learner(data, dtl.DTLearner, leaf_size=i))]
    cs = ["leaf_size", "rmse_in", "rmse_out", "corr_in", "corr_out"]
    df = pd.DataFrame(results, columns=cs)
    df.plot(title="DT Learner RMSE depending on leaf size for Istanbul dataset",
            xlabel="leaf size", ylabel="RMSE",
            x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
    plt.savefig("figure_1.png")

def experiment_2(data):
    """
    Can bagging reduce or eliminate overfitting with respect to leaf_size?
    Again use the dataset Istanbul.csv with DTLearner. To investigate this,
    choose a fixed number of bags to use and vary leaf_size to evaluate.
    Provide charts to validate your conclusions. Use RMSE as your metric.
    """
    def run_learner(leaf_size, bag_size):
        r = eval_learner(data, bgl.BagLearner, learner=dtl.DTLearner,
                         bags=bag_size,
                         kwargs={'leaf_size': leaf_size})
        return [r.rmse_in, r.rmse_out]

    # one chart per bag size: figure_2.png through figure_5.png
    for i, bag_size in enumerate([5, 10, 15, 20]):
        results = [[leaf_size] + run_learner(leaf_size, bag_size=bag_size)
                   for leaf_size in range(1, 10)]
        cs = ["leaf_size", "rmse_in", "rmse_out"]
        df = pd.DataFrame(results, columns=cs)
        df.plot(title=f"Bag of {bag_size} DT Learners RMSE over leaf size",
                xlabel="leaf size", ylabel="RMSE",
                x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
        plt.savefig(f"figure_{i + 2}.png")

def experiment_3(data):
    """
    Quantitatively compare "classic" decision trees (DTLearner) versus random
    trees (RTLearner). In which ways is one method better than the other?
    Provide at least two quantitative measures. Important: using two similar
    measures that illustrate the same broader metric does not count as two.
    (For example, do not use two measures for accuracy.) Note that for this
    part of the report you must conduct new experiments; don't reuse the
    results of the experiments above (RMSE is not allowed as a new metric).
    """
    def run_learner(leaf_size):
        r1 = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size)
        r2 = eval_learner(data, rtl.RTLearner, leaf_size=leaf_size)
        return [r1.corr_in, r1.corr_out, r2.corr_in, r2.corr_out]

    results = [[leaf_size] + run_learner(leaf_size)
               for leaf_size in range(1, 10)]
    cs = ["leaf_size", "DT_corr_in", "DT_corr_out",
          "RT_corr_in", "RT_corr_out"]
    df = pd.DataFrame(results, columns=cs)
    df.plot(title="Correlations of DT and RT for training data",
            xlabel="leaf size", ylabel="Correlation",
            x="leaf_size", y=["DT_corr_in", "RT_corr_in"],
            kind="line")
    plt.savefig("figure_6.png")
    df.plot(title="Correlations of DT and RT for test data",
            xlabel="leaf size", ylabel="Correlation",
            x="leaf_size", y=["DT_corr_out", "RT_corr_out"],
            kind="line")
    plt.savefig("figure_7.png")

def main():
    if len(sys.argv) != 2:
        print("Usage: python testlearner.py <filename>")
        sys.exit(1)

@@ -48,38 +190,14 @@ if __name__=="__main__":
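    # The context lines elided by this hunk presumably read the CSV named on the
    # command line into `data` and compute `train_rows` and `trainX`; a rough,
    # assumed sketch (not part of this diff) might be:
    #   data = np.genfromtxt(sys.argv[1], delimiter=',')  # Istanbul.csv may also need its header row/date column dropped
    #   train_rows = int(0.6 * data.shape[0])             # assumed 60/40 train/test split
    #   trainX = data[:train_rows, 0:-1]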
    trainY = data[:train_rows,-1]
    testX = data[train_rows:,0:-1]
    testY = data[train_rows:,-1]
    data = (trainX, trainY, testX, testY)

    # test_learners(data)
    experiment_1(data)
    experiment_2(data)
    experiment_3(data)


if __name__=="__main__":
    main()