Compare commits

...

2 Commits

Author SHA1 Message Date
8ee47c9a1d Finish report for project 3. 2020-09-26 10:52:05 -04:00
3ef06ccc96 Create charts for project 3 report. 2020-09-26 10:29:53 -04:00
11 changed files with 197 additions and 34 deletions

View File

@@ -34,7 +34,8 @@ unzip -n zips/*.zip -d ./
# Reports
- [Report 1](./martingale/martingale.md)
- [Report 2](./optimize_something/readme.md)
- [Report 3](#)
- [Report 2](./optimize_something/optimize_something.md)
- [Report 3](./assess_learners/assess_learners.md)
- [Report 4](#)

View File

@@ -0,0 +1,44 @@
# Report
## Experiment 1
Significant overfitting occurs for leaf sizes smaller than five. The chart shows
that the root-mean-square-error is significantly higher for the test data
(`rmse_out`) for leaf sizes smaller than five.
Between five and nine, the error for the test data is only slightly higher, so
the overfitting is only slight. Beyond that, the errors increase, and the error for
the test data is lower than for the training data. In other words, there is no
more overfitting for leaf sizes greater than nine.
![](figure_1.png)
## Experiment 2
For all bag sizes, the difference between the RMSE for the training data and the
test data is smaller than without bagging. The test data still has a higher RMSE
up to a leaf size of five. For greater leaf sizes, the RMSE for the test data is
smaller than for the training data for all bag sizes, so there is no
overfitting.
![](figure_2.png)
![](figure_3.png)
![](figure_4.png)
![](figure_5.png)
## Experiment 3
The Random Tree learner has a correlation of one for the training data at a leaf
size of one; in other words, it fits the training data perfectly. Consequently, the correlation for
the test data is worse than for the Decision Tree learner. The DT learner has a
higher correlation than the RT for all other leaf sizes, both for the training
and the test data.
![](figure_6.png)
![](figure_7.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

View File

@@ -23,6 +23,8 @@ GT honor code violation.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import LinRegLearner as lrl
import DTLearner as dtl
@@ -30,8 +32,148 @@ import RTLearner as rtl
import BagLearner as bgl
import InsaneLearner as isl
import sys
from dataclasses import dataclass
if __name__=="__main__":
@dataclass
class EvaluationResult:
    """In-sample and out-of-sample metrics for one trained learner."""

    rmse_in: float   # root-mean-square error on the training split
    rmse_out: float  # root-mean-square error on the test split
    corr_in: float   # correlation of predictions with training labels
    corr_out: float  # correlation of predictions with test labels
def test_learner(data, learner_class, **kwargs):
trainX, trainY, testX, testY = data
print("\n-----------")
print(f"name={learner_class.__name__} {kwargs=}")
learner = learner_class(**kwargs)
learner.addEvidence(trainX, trainY)
print(learner.author())
# evaluate in sample
predY = learner.query(trainX) # get the predictions
rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
print()
print("In sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=trainY)
print(f"corr: {c[0,1]}")
# evaluate out of sample
predY = learner.query(testX) # get the predictions
rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
print()
print("Out of sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=testY)
print(f"corr: {c[0,1]}")
print()
def test_learners(data):
    """Run the print-based evaluation for the currently selected learners."""
    test_learner(data, lrl.LinRegLearner)
    # test_learner(data, dtl.DTLearner, leaf_size=1)
    # test_learner(data, rtl.RTLearner, leaf_size=6)
    test_learner(data, bgl.BagLearner, learner=dtl.DTLearner, bags=20,
                 kwargs={'leaf_size': 5})
    # test_learner(data, isl.InsaneLearner)
def eval_learner(data, learner_class, **kwargs):
    """Train one learner and return its RMSE and correlation on both splits.

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays
    :param learner_class: learner type, instantiated with **kwargs
    :return: EvaluationResult with in- and out-of-sample metrics
    """
    train_x, train_y, test_x, test_y = data
    learner = learner_class(**kwargs)
    learner.addEvidence(train_x, train_y)

    def metrics(features, labels):
        # RMSE and correlation of the trained learner's predictions.
        pred = learner.query(features)
        rmse = math.sqrt(((labels - pred) ** 2).sum() / labels.shape[0])
        corr = np.corrcoef(pred, y=labels)[0, 1]
        return rmse, corr

    rmse_in, corr_in = metrics(train_x, train_y)
    rmse_out, corr_out = metrics(test_x, test_y)
    return EvaluationResult(rmse_in, rmse_out, corr_in, corr_out)
def experiment_1(data):
    """
    Does overfitting occur with respect to leaf_size? Use the dataset
    Istanbul.csv with DTLearner. For which values of leaf_size does overfitting
    occur? Use RMSE as your metric for assessing overfitting. Support your
    assertion with graphs/charts. (Don't use bagging).

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays

    Writes figure_1.png to the working directory.
    """
    # A plain loop instead of a comprehension whose `if (r := ...)` filter
    # only existed to smuggle in a binding (the filter was always truthy).
    results = []
    for leaf_size in range(1, 15):
        r = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size)
        results.append([leaf_size, r.rmse_in, r.rmse_out, r.corr_in, r.corr_out])
    cs = ["leaf_size", "rmse_in", "rmse_out", "corr_in", "corr_out"]
    df = pd.DataFrame(results, columns=cs)
    df.plot(title="DT Learner RMSE depending on leaf size for Istanbul dataset",
            xlabel="leaf size", ylabel="RMSE",
            x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
    plt.savefig("figure_1.png")
    plt.close("all")  # free the figure so repeated runs don't accumulate
def experiment_2(data):
    """
    Can bagging reduce or eliminate overfitting with respect to leaf_size?
    Again use the dataset Istanbul.csv with DTLearner. To investigate this
    choose a fixed number of bags to use and vary leaf_size to evaluate.
    Provide charts to validate your conclusions. Use RMSE as your metric.

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays

    Writes figure_2.png through figure_5.png, one per bag size.
    """
    # Figures are numbered from 2 because experiment_1 owns figure_1.png.
    for figure_no, bag_size in enumerate([5, 10, 15, 20], start=2):
        rows = []
        for leaf_size in range(1, 10):
            r = eval_learner(data, bgl.BagLearner, learner=dtl.DTLearner,
                             bags=bag_size, kwargs={'leaf_size': leaf_size})
            rows.append([leaf_size, r.rmse_in, r.rmse_out])
        df = pd.DataFrame(rows, columns=["leaf_size", "rmse_in", "rmse_out"])
        df.plot(title=f"Bag of {bag_size} DT Learners RMSE over leaf size",
                xlabel="leaf size", ylabel="RMSE",
                x="leaf_size", y=["rmse_in", "rmse_out"], kind="line")
        plt.savefig(f"figure_{figure_no}.png")
def experiment_3(data):
    """
    Quantitatively compare "classic" decision trees (DTLearner) versus random
    trees (RTLearner). In which ways is one method better than the other?
    Provide at least two quantitative measures. Important, using two similar
    measures that illustrate the same broader metric does not count as two.
    (For example, do not use two measures for accuracy.) Note for this part of
    the report you must conduct new experiments, don't use the results of the
    experiments above for this(RMSE is not allowed as a new experiment).

    :param data: tuple (trainX, trainY, testX, testY) of numpy arrays

    Writes figure_6.png (training correlations) and figure_7.png (test
    correlations).
    """
    # Fixed: removed f-prefixes from constant strings and the stray space
    # in `eval_learner (data, ...)`.
    def run_learner(leaf_size):
        # Evaluate DT and RT with identical settings so they are comparable.
        dt = eval_learner(data, dtl.DTLearner, leaf_size=leaf_size)
        rt = eval_learner(data, rtl.RTLearner, leaf_size=leaf_size)
        return [dt.corr_in, dt.corr_out, rt.corr_in, rt.corr_out]

    results = [[leaf_size] + run_learner(leaf_size)
               for leaf_size in range(1, 10)]
    cs = ["leaf_size", "DT_corr_in", "DT_corr_out",
          "RT_corr_in", "RT_corr_out"]
    df = pd.DataFrame(results, columns=cs)
    df.plot(title="Correlations of DT and RT for training data",
            xlabel="leaf size", ylabel="Correlation",
            x="leaf_size", y=["DT_corr_in", "RT_corr_in"],
            kind="line")
    plt.savefig("figure_6.png")
    df.plot(title="Correlations of DT and RT for test data",
            xlabel="leaf size", ylabel="Correlation",
            x="leaf_size", y=["DT_corr_out", "RT_corr_out"],
            kind="line")
    plt.savefig("figure_7.png")
def main():
if len(sys.argv) != 2:
print("Usage: python testlearner.py <filename>")
sys.exit(1)
@@ -48,38 +190,14 @@ if __name__=="__main__":
trainY = data[:train_rows,-1]
testX = data[train_rows:,0:-1]
testY = data[train_rows:,-1]
print(f"{testX.shape}")
print(f"{testY.shape}")
data = (trainX, trainY, testX, testY)
def test_learner(learner_class, **kwargs):
print("\n-----------")
print(f"name={learner_class.__name__} {kwargs=}")
learner = learner_class(**kwargs)
learner.addEvidence(trainX, trainY)
print(learner.author())
# test_learners(data)
experiment_1(data)
experiment_2(data)
experiment_3(data)
# evaluate in sample
predY = learner.query(trainX) # get the predictions
rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
print()
print("In sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=trainY)
print(f"corr: {c[0,1]}")
# evaluate out of sample
predY = learner.query(testX) # get the predictions
rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
print()
print("Out of sample results")
print(f"RMSE: {rmse}")
c = np.corrcoef(predY, y=testY)
print(f"corr: {c[0,1]}")
print()
# test_learner(lrl.LinRegLearner)
test_learner(dtl.DTLearner, leaf_size=1)
# test_learner(rtl.RTLearner, leaf_size=6)
test_learner(bgl.BagLearner, learner=dtl.DTLearner, bags=20, kwargs = {'leaf_size': 5})
test_learner(isl.InsaneLearner)
# Script entry point: delegate to main() so importing this module has no
# side effects.
if __name__=="__main__":
    main()