# NaiveBayes.py
import math
import pprint


class NaiveBayes:
    def __init__(self):
        # Instances grouped by class label.
        self.splitedClasses = dict()
        # Per-class feature summaries: one (count, mean, stddev) tuple per feature.
        self.trainedFeature = dict()
        self.trainData = None
        self.trainTarget = None
        self.testData = None

    # Return the mean of a list.
    def __mean(self, feature):
        return sum(feature) / len(feature)

    # Return the (population) variance of a list.
    def __variance(self, feature):
        meanDifference = list()
        mean = self.__mean(feature)
        for instance in feature:
            meanDifference.append(pow((instance - mean), 2))
        return self.__mean(meanDifference)

    # Return the standard deviation of a list.
    def __standardDeviation(self, feature):
        return math.sqrt(self.__variance(feature))
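    # Worked example (illustrative values): for the list [1, 2, 3] the mean is
    # 2.0, the population variance is ((1-2)**2 + (2-2)**2 + (3-2)**2) / 3 ≈ 0.667,
    # and the standard deviation is sqrt(0.667) ≈ 0.816.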

    # Divide the training data by class.
    def __classSpliter(self):
        for index, instance in enumerate(self.trainData):
            # Create a new key in the dict if the class has not been seen yet.
            if self.trainTarget[index] not in self.splitedClasses:
                self.splitedClasses[self.trainTarget[index]] = list()
            # Add the instance to the corresponding class.
            self.splitedClasses[self.trainTarget[index]].append(instance)
        return self.splitedClasses
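    # Illustrative shape of the result (hypothetical data): with labels 0 and 1
    # and two features per instance, splitedClasses could look like
    # {0: [[1.2, 3.4], [1.0, 3.1]], 1: [[5.6, 0.2]]}.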

    # Store the training data in the Naive Bayes class.
    def __setTrainData(self, trainData, targetData):
        self.trainData = trainData
        self.trainTarget = targetData

    # Store the test data in the Naive Bayes class.
    def __setTestData(self, testData):
        self.testData = testData

    # Compute the count, mean, and standard deviation for each column (feature) of the class data.
    def __squashFeature(self, classData):
        tmp = list()
        # The built-in zip yields the data of one column (feature) at a time from the classData list.
        for feature in zip(*classData):
            tmp.append((len(feature), self.__mean(feature), self.__standardDeviation(feature)))
        return tmp
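    # Illustrative output (hypothetical data): for classData [[1.0, 3.0], [3.0, 5.0]]
    # this returns [(2, 2.0, 1.0), (2, 4.0, 1.0)], one (count, mean, stddev)
    # tuple per feature.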

    # Awful to read, I apologize for that, but it corresponds to the Gaussian (normal) distribution.
    def __gaussian(self, feature, mean, standardDeviation):
        return (1 / (math.sqrt(2 * math.pi) * standardDeviation)) * math.exp(-((feature - mean) ** 2 / (2 * standardDeviation ** 2)))
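    # In math notation, the line above computes the probability density
    #     f(x) = 1 / (sigma * sqrt(2 * pi)) * exp(-(x - mean)**2 / (2 * sigma**2))
    # where sigma is the standard deviation. For example, with mean 0 and
    # sigma 1, f(0) = 1 / sqrt(2 * pi) ≈ 0.3989.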

    # Return the probability of each class for the given instance.
    def __getPropabilities(self, testInstance):
        probabilities = dict()
        for key, value in self.trainedFeature.items():
            # Start from the class prior: class instance count / total instance count.
            probabilities[key] = value[0][0] / len(self.trainData)
            for index in range(len(value)):
                # Get the mean and standard deviation of each feature.
                _nbInstance, mean, standardDeviation = value[index]
                # Multiply by the Gaussian likelihood of each feature.
                probabilities[key] *= self.__gaussian(testInstance[index], mean, standardDeviation)
        return probabilities

    # Return, for one instance, the predicted class plus the class probabilities in a tuple.
    def __getPredictionForInstance(self, testInstance):
        classValue = None
        predictedClass = None
        # Get the class probabilities for the test instance.
        probaResults = self.__getPropabilities(testInstance)
        # The class with the highest probability is going to be our prediction.
        for key, value in probaResults.items():
            # On the first iteration, or when the probability beats the current best, keep this class.
            if classValue is None or classValue < value:
                classValue = value
                predictedClass = key
        return (predictedClass, probaResults)
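    # Illustrative return value (hypothetical numbers): (1, {0: 0.0021, 1: 0.0384})
    # means class 1 is predicted because it has the highest probability.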

    # Training method. Takes the training data and the target labels of the training data.
    def fit(self, trainData=None, trainTarget=None, displayTraining=False):
        # Store the data and labels in the Naive Bayes class.
        self.__setTrainData(trainData, trainTarget)
        # Clear state from any previous run, in case the algorithm is used in cross-validation or in a loop.
        self.trainedFeature.clear()
        self.splitedClasses.clear()
        # Get a dictionary with each distinct label as a key and the related instances as the value.
        splitedClasses = self.__classSpliter()
        # For every class, reduce the instances to the number of instances, the mean, and the standard deviation.
        for key, value in splitedClasses.items():
            self.trainedFeature[key] = self.__squashFeature(value)
        if displayTraining:
            pprint.pprint(self.trainedFeature)
        # Return self so that calls can be chained.
        return self

    # Prediction method; should be used after the fit method.
    def predict(self, testData):
        # If no training was done beforehand, return False with an error message.
        if len(self.trainedFeature) == 0:
            print('Error: no training data recorded. Please fit (train) before predicting.')
            return False
        self.__setTestData(testData)
        # For every test instance, predict the class it is closest to.
        predictions = list()
        for testInstance in self.testData:
            predictions.append(self.__getPredictionForInstance(testInstance))
        return predictions
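

# Example usage: a minimal sketch with made-up data. The feature values,
# labels, and variable names below are illustrative, not part of the original file.
if __name__ == '__main__':
    # Two numeric features per instance; class labels 0 and 1.
    trainX = [[1.0, 20.0], [1.2, 21.0], [0.8, 19.5], [5.0, 2.0], [5.5, 2.5], [4.8, 1.8]]
    trainY = [0, 0, 0, 1, 1, 1]
    testX = [[1.1, 20.5], [5.2, 2.2]]

    model = NaiveBayes()
    model.fit(trainX, trainY, displayTraining=True)
    # Each prediction is a (predictedClass, probabilities) tuple.
    for predictedClass, probabilities in model.predict(testX):
        print(predictedClass, probabilities)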