Add project 5 sources.
p5_classification/naiveBayes.py (new file, 175 lines)
@@ -0,0 +1,175 @@
# naiveBayes.py
# -------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).

import util
import classificationMethod
import math

class NaiveBayesClassifier(classificationMethod.ClassificationMethod):
    """
    See the project description for the specifications of the Naive Bayes classifier.

    Note that the variable 'datum' in this code refers to a counter of features
    (not to a raw samples.Datum).
    """
    def __init__(self, legalLabels):
        self.legalLabels = legalLabels
        self.type = "naivebayes"
        self.k = 1 # this is the smoothing parameter, ** use it in your train method **
        self.automaticTuning = False # Look at this flag to decide whether to choose k automatically ** use this in your train method **

    def setSmoothing(self, k):
        """
        This is used by the main method to change the smoothing parameter before training.
        Do not modify this method.
        """
        self.k = k

    def train(self, trainingData, trainingLabels, validationData, validationLabels):
        """
        Outside shell to call your method. Do not modify this method.
        """

        # might be useful in your code later...
        # this is a list of all features in the training set.
        self.features = list(set([ f for datum in trainingData for f in datum.keys() ]))

        if (self.automaticTuning):
            kgrid = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50]
        else:
            kgrid = [self.k]

        self.trainAndTune(trainingData, trainingLabels, validationData, validationLabels, kgrid)

    def trainAndTune(self, trainingData, trainingLabels, validationData, validationLabels, kgrid):
        """
        Trains the classifier by collecting counts over the training data, and
        stores the Laplace smoothed estimates so that they can be used to classify.
        Evaluate each value of k in kgrid to choose the smoothing parameter
        that gives the best accuracy on the held-out validationData.

        trainingData and validationData are lists of feature Counters. The corresponding
        label lists contain the correct label for each datum.

        To get the list of all possible features or labels, use self.features and
        self.legalLabels.
        """

        bestAccuracyCount = -1 # best accuracy so far on validation set

        # Common training - get all counts from training data
        # We only do it once, to save computation when tuning the smoothing parameter
        commonPrior = util.Counter() # probability over labels
        commonConditionalProb = util.Counter() # conditional probability of feature feat being 1,
                                               # indexed by (feat, label)
        commonCounts = util.Counter() # how many times feature feat has been seen with label y,
                                      # whether inactive or active

        for i in range(len(trainingData)):
            datum = trainingData[i]
            label = trainingLabels[i]
            commonPrior[label] += 1
            for feat, value in datum.items():
                commonCounts[(feat,label)] += 1
                if value > 0: # assume binary value
                    commonConditionalProb[(feat, label)] += 1
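
        # Note: assuming, as in this project, that every datum assigns a value
        # to every feature, commonCounts[(feat, label)] equals the number of
        # training examples carrying that label.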

        for k in kgrid: # Smoothing parameter tuning loop!
            prior = util.Counter()
            conditionalProb = util.Counter()
            counts = util.Counter()

            # get counts from common training step
            for key, val in commonPrior.items():
                prior[key] += val
            for key, val in commonCounts.items():
                counts[key] += val
            for key, val in commonConditionalProb.items():
                conditionalProb[key] += val

            # smoothing:
            for label in self.legalLabels:
                for feat in self.features:
                    conditionalProb[(feat, label)] += k
                    counts[(feat, label)] += 2*k # 2 because both value 0 and 1 are smoothed

            # normalizing:
            prior.normalize()
            for x, count in conditionalProb.items():
                conditionalProb[x] = count * 1.0 / counts[x]

            self.prior = prior
            self.conditionalProb = conditionalProb

            # evaluating performance on validation set
            predictions = self.classify(validationData)
            accuracyCount = [predictions[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)

            print "Performance on validation set for k=%f: (%.1f%%)" % (k, 100.0*accuracyCount/len(validationLabels))
            if accuracyCount > bestAccuracyCount:
                bestParams = (prior, conditionalProb, k)
                bestAccuracyCount = accuracyCount
        # end of automatic tuning loop
        self.prior, self.conditionalProb, self.k = bestParams

    def classify(self, testData):
        """
        Classify the data based on the posterior distribution over labels.

        You shouldn't modify this method.
        """
        guesses = []
        self.posteriors = [] # Log posteriors are stored for later data analysis (autograder).
        for datum in testData:
            posterior = self.calculateLogJointProbabilities(datum)
            guesses.append(posterior.argMax())
            self.posteriors.append(posterior)
        return guesses

    def calculateLogJointProbabilities(self, datum):
        """
        Returns the log-joint distribution over legal labels and the datum.
        Each log-probability should be stored in the log-joint counter, e.g.
        logJoint[3] = <Estimate of log( P(Label = 3, datum) )>

        To get the list of all possible features or labels, use self.features and
        self.legalLabels.
        """
        logJoint = util.Counter()

        for label in self.legalLabels:
            logJoint[label] = math.log(self.prior[label])
            for feat, value in datum.items():
                if value > 0:
                    logJoint[label] += math.log(self.conditionalProb[feat,label])
                else:
                    logJoint[label] += math.log(1-self.conditionalProb[feat,label])

        return logJoint

    def findHighOddsFeatures(self, label1, label2):
        """
        Returns the 100 best features for the odds ratio:
                P(feature=1 | label1)/P(feature=1 | label2)

        Note: you may find 'self.features' a useful way to loop through all possible features
        """
        featuresOdds = []

        for feat in self.features:
            featuresOdds.append((self.conditionalProb[feat, label1]/self.conditionalProb[feat, label2], feat))
        featuresOdds.sort()
        featuresOdds = [feat for val, feat in featuresOdds[-100:]]

        return featuresOdds
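
# A minimal usage sketch, added for illustration only (the original project
# drives this class from its own scripts, e.g. dataClassifier.py): toy binary
# data with two hypothetical features 'f0' and 'f1', trained with k = 1.
if __name__ == '__main__':
    def _makeDatum(activeFeats):
        # build a feature Counter with explicit 0/1 values for both features
        datum = util.Counter()
        for feat in ['f0', 'f1']:
            datum[feat] = 1 if feat in activeFeats else 0
        return datum

    trainingData = [_makeDatum(['f0']), _makeDatum(['f1'])]
    trainingLabels = [0, 1]
    validationData = [_makeDatum(['f0'])]
    validationLabels = [0]

    classifier = NaiveBayesClassifier([0, 1])
    classifier.setSmoothing(1)
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print classifier.classify([_makeDatum(['f1'])]) # expected: [1]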