# naive_bayes.py (forked from q-viper/ML-from-Basics)
from collections import Counter
from nlp import NLP
nlp = NLP()
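
# The NLP helper lives in this repo's nlp.py and is not shown here. A minimal
# sketch of the interface this file relies on, inferred from the call sites
# below (an assumption, not the real implementation): count_vectorizer(texts)
# returns a pandas DataFrame bag of words (one row per document, one column
# per vocabulary word), while count_vectorizer(texts, train=False) returns a
# list of token lists.
#
#   import re
#   import pandas as pd
#
#   class NLP:
#       def count_vectorizer(self, texts, train=True):
#           tokenized = [re.findall(r"[a-z]+", t.lower()) for t in texts]
#           if not train:
#               return tokenized
#           vocab = sorted({w for doc in tokenized for w in doc})
#           rows = [[doc.count(w) for w in vocab] for doc in tokenized]
#           return pd.DataFrame(rows, columns=vocab)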
class NaiveBayes():
    """
    A class to perform Naive Bayes classification on text.
    Methods:
        * fit: train the model on the stored texts and labels
        * predict: return class probabilities for new text
    Use Cases:
        spam = ['win money now', 'easy moneey now', 'win the money by replying']
        ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']
        all_txt = spam + ham
        classes = [0, 0, 0, 1, 1, 1, 1, 1]
        nb = NaiveBayes(all_txt, classes)
        nb.fit()
        test = ['easy boy easy. boy you need easy money?', "it was easy game"]
        nb.predict([spam[1]])
        # {0: 0.06976744186046512, 1: 0.9302325581395349}
    """
    def __init__(self, text, label):
        self.text = text        # training documents
        self.label = label      # class label for each document
        self.cond_probs = {}    # P(word|class), filled by fit()
        self.features = []      # training vocabulary
        self.classes = []       # distinct class labels
        self.class_prob = {}    # class priors P(class)
    def fit(self, view=False):
        """
        A method to estimate P(word|class): the probability of each word
        appearing in each class. Set view=True to print the bag-of-words table.
        Example:
        --------
        spam = ['win money now', 'easy moneey now', 'win the money by replying']
        ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']
        all_txt = spam + ham
        classes = [0, 0, 0, 1, 1, 1, 1, 1]
        nb = NaiveBayes(all_txt, classes)
        nb.fit()
        Steps:
        ---------
        * Build the bag of words (BoW)
        * Count the examples in each class
        * Find the probability of each word given each class, P(w|c)
        * Store the class priors P(c) for use in predict()
        """
        text = self.text
        label = self.label
        # bag of words: one row per document, one column per vocabulary word
        bow = nlp.count_vectorizer(text)
        self.features = bow.columns.to_list()
        if view:
            print('Your BoW is:\n', bow)
        classes = label
        self.classes = list(Counter(classes).keys())
        bow['out'] = classes
        bow_class = bow.groupby(by='out')
        # number of training examples in each class
        counts = bow_class.count()
        # class priors P(c), used at prediction time
        class_prob = counts / counts.sum(axis=0)
        class_prob = dict(class_prob.mean(axis=1))
        self.class_prob = class_prob
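        # For the docstring's toy data (3 spam and 5 ham documents out of 8),
        # this works out to class_prob == {0: 3/8, 1: 5/8} == {0: 0.375, 1: 0.625}.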
        # count of each word in each class
        self.count_words_class = bow_class.sum()
        # per-document frequency of each word within its class
        prob_w_c = bow_class.sum() / counts
        # P(word|class): occurrences of the word in the class divided by the
        # total number of training documents
        prob_w_c = round(prob_w_c * counts / counts.sum(axis=0), 5)
        self.cond_probs = prob_w_c
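        # Worked example for the toy data, assuming a plain word tokenizer (the
        # exact numbers depend on nlp.count_vectorizer): 'now' occurs twice in
        # the 3 spam texts and never in ham, so with 8 documents in total
        # cond_probs['now'] == {0: 2/8 = 0.25, 1: 0.0}.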
    def classes_(self):
        """
        A method to see the per-class count of each word.
        """
        return self.count_words_class
    def predict(self, example):
        """
        A method for prediction.
        Input: list of texts.
        Output: dict mapping each class to its posterior probability.
        Example:
        ----------
        >>> test = ['easy boy easy. boy you need easy money?', "it was easy game"]
        >>> nb.predict([spam[1]])
        {0: 0.06976744186046512, 1: 0.9302325581395349}
        """
        # tokenize without refitting the vocabulary; train=False returns token
        # lists, and only the first text in `example` is scored
        txt = nlp.count_vectorizer(example, train=False)
        words = dict(Counter(txt[0]))
        vocab = self.features
        classes = self.classes
        class_prob = self.class_prob
        p = self.cond_probs
        # Unnormalised posterior score for each class, seeded with the prior
        # P(c); the numerator of Bayes' rule is built up word by word below,
        # and the denominator comes from summing the scores over all classes.
        probs = {c: class_prob[c] for c in classes}
"""
c, ~c
p(~c/w1, w2, w3) = p(w1, w2, w3 / ~c) * p(~c) / (p(w1, w2, w3/c) * p(c) + p(w1, w2, w3/~c) * p(~c))
p(c/ w1, w1, w3) = p(w1, w2, w3 / c) * p(c) / (p(w1, w2, w3/c) * p(c) + p(w1 , w2, w3 / ~c) * p(~c))
p(w1, w2, w3/c) = p(w1/c) * p(w2/c) * p(w3/c) = p(w1 and w2 and w3 / c)
"""
        for w in words:
            if w in vocab:
                for c in classes:
                    # skip zero probabilities so a single unseen word does not
                    # zero out an entire class
                    if p[w][c] != 0:
                        probs[c] *= p[w][c]
        # P(c|w) = P(w|c) * P(c) / P(w), where the evidence
        # P(w) = sum over classes of P(w|c) * P(c) is exactly the sum of the
        # per-class scores, so dividing by it makes the posteriors sum to 1
        denom = sum(probs.values())
        probs = {k: v / denom for k, v in probs.items()}
        return probs
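
# Note: predict() skips zero P(w|c) values so one unseen word cannot wipe out
# a class. A more standard alternative is Laplace (add-one) smoothing at
# training time; a minimal sketch of what fit() could compute instead (an
# assumption, not what the code above does):
#
#   word_counts = bow_class.sum()                # occurrences per class
#   totals = word_counts.sum(axis=1)             # total words per class
#   prob_w_c = (word_counts + 1).div(totals + len(self.features), axis=0)
#
# This keeps every P(w|c) strictly positive, so no skipping is needed.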
"""
### scarp code
# sum(probs.values()), probs
# example = nlp.count_vectorizer(example)
# words = example.columns.to_list()
# vocab = self.features
# classes = self.classes
# p = self.cond_probs
# probs = {}
# class_prob = self.class_prob
# for w in words:
# if w in vocab:
# for c in classes:
# if probs.get(c) != None:
# # this is actually p(class/word) * p(class)
# probs[c] *= p[w][c]
# else:
# probs[c] = p[w][c] * class_prob[c]
# return probs
"""
# Quick demo on the toy data from the class docstring.
spam = ['win money now', 'easy moneey now', 'win the money by replying']
ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']
all_txt = spam + ham
classes = [0, 0, 0, 1, 1, 1, 1, 1]
nb = NaiveBayes(all_txt, classes)
nb.fit()
# nb.cond_probs  # inspect the learned P(w|c) table
test = ['easy boy easy. boy you need easy money?', "buddy, easy money", 'win easy money by replying']
print(nb.predict([test[0]]))
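
# Score each test sentence; predict returns {class: posterior probability},
# with class 0 = spam and class 1 = ham.
for t in test:
    print(t, '->', nb.predict([t]))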