-
Notifications
You must be signed in to change notification settings - Fork 1
/
Train.py
112 lines (91 loc) · 3.16 KB
/
Train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
# encoding: utf-8
import sqlite3, string, sys, operator
def connect():
conn = sqlite3.connect('spam.db')
c = conn.cursor()
return c
def tokenize(str):
""" Split text string (comment) into words"""
return str.split()
def sanitize(wl):
"""Sanitize wordlist, i.e. decapatilize and such"""
s = []
for word in wl:
for symbol in ['.', '!', ',', '\n', '\r', '?']:
if symbol in word:
s.append(symbol)
word = word.replace(symbol, '')
s.append(word)
return s
def get_words(c, spam):
""" Generates a wordlist """
if spam:
include_spam = "1"
else:
include_spam = "0"
wordlist = []
for row in c.execute("SELECT comment FROM comments WHERE spam=?", (include_spam)):
new_words = tokenize(row[0])
wordlist += sanitize(new_words)
return wordlist
def word_nbr_map(wl):
"""Counts the occurense of every word in a wordlist"""
wf = dict()
for word in wl:
try:
wf[word] = wf[word] + 1
except KeyError:
wf[word] = 1
return wf
def spamrisk_map(spam_wc, not_spam_wc, total_wc):
""" Bayes theorem kickin' it"""
risk_map = dict()
spam_length = 0
for w, v in spam_wc.iteritems():
spam_length += v
not_spam_length = 0
for w, v in not_spam_wc.iteritems():
not_spam_length += v
total_length = not_spam_length + spam_length
for word, value in total_wc.iteritems():
if word not in spam_wc and word in not_spam_wc:
risk_map[word] = 0.01
elif word in spam_wc and word not in not_spam_wc:
risk_map[word] = 0.99
else:
g = float(not_spam_wc[word] * 2)
b = float(spam_wc[word])
risk_map[word] = ( b / spam_length ) / ( ( g / not_spam_length) +(b / spam_length) )
return risk_map
def spam_prob(comment, word_spamrisk_map):
""" Calculates the probability that the inputed comment map is spam """
sc = tokenize(comment)
l = sanitize(sc)
cost = dict()
for word in l:
if not word in word_spamrisk_map:
cost[word] = 0.4
else:
cost[word] = abs(0.5 - word_spamrisk_map[word])
sort_cost_list = sorted(cost.items(), key=lambda x: -x[1])[:15]
return reduce(operator.mul, [i[1] / (i[1] + reduce(operator.mul, [1 - i[1] for i in sort_cost_list])) for i in sort_cost_list])
def print_wc(wc, length=0):
if length == 0:
for wp in sorted(wc.items(), key=lambda x: -x[1]):
print wp
else:
for wp in sorted(wc.items(), key=lambda x: -x[1])[:length]:
print wp
if __name__ == "__main__":
c = connect()
#tokenize words and add to db
spam_wordlist = get_words(c, spam=True)
not_spam_wordlist = get_words(c, spam=False)
spam_wc = word_nbr_map(spam_wordlist)
not_spam_wc = word_nbr_map(not_spam_wordlist)
total_wc = word_nbr_map(spam_wordlist + not_spam_wordlist)
word_spamrisk_map = spamrisk_map(spam_wc, not_spam_wc, total_wc)
#print_wc(word_spamrisk_map)
if len(sys.argv) > 1:
print spam_prob(sys.argv[1], word_spamrisk_map)