forked from nico/collectiveintelligence-book
-
Notifications
You must be signed in to change notification settings - Fork 1
/
feedfilter.py
72 lines (51 loc) · 1.72 KB
/
feedfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import feedparser
import re
# Splits on runs of non-word characters.  '+' rather than '*' so the pattern
# can never match the empty string: Python 2's re.split ignores empty
# matches, but Python 3.7+ splits on them, which would chop the text into
# single characters and leave no word long enough to keep.
_WORD_SPLITTER = re.compile(r'\W+')


def _rawwords(s):
    """Return the words of s that are 3 to 19 characters long, case preserved."""
    return [w for w in _WORD_SPLITTER.split(s) if 2 < len(w) < 20]


def interestingwords(s):
    """Return the lowercased words of s that are 3 to 19 characters long.

    Words are the pieces left after splitting s on runs of non-word
    characters; shorter and longer pieces are dropped.
    """
    return [w.lower() for w in _rawwords(s)]


def entryfeatures(entry):
    """Build the list of classifier features for one feed entry.

    entry must support entry['title'], entry['summary'] and
    entry['publisher'], each yielding a string; a missing key propagates
    whatever the lookup raises (KeyError for a plain dict).

    Features produced:
      * 'Title:<word>' for every interesting word of the title
      * '<word>' for every interesting word of the summary
      * '<word> <nextword>' for every adjacent pair of summary words
      * 'Publisher:<publisher>' with the publisher kept whole
      * 'UPPERCASE' when more than 30% of the summary words are all caps

    Returns a list of the distinct feature strings.  The list is also
    printed, as the original implementation did.
    """
    features = {}

    # Title words are namespaced so they do not collide with summary words.
    for word in interestingwords(entry['title']):
        features['Title:' + word] = 1

    # Keep the original casing around: the uppercase ratio below has to be
    # measured before lowercasing, otherwise it is always zero.
    rawwords = _rawwords(entry['summary'])
    summarywords = [w.lower() for w in rawwords]

    for word in summarywords:
        features[word] = 1

    # Adjacent word pairs in the summary as features.
    for first, second in zip(summarywords, summarywords[1:]):
        features[first + ' ' + second] = 1

    # Keep the publisher as a whole rather than splitting it into words.
    features['Publisher:' + entry['publisher']] = 1

    # Virtual keyword flagging summaries with many all-caps words.  The
    # emptiness guard avoids dividing by zero when no word qualifies.
    uppercase = sum(1 for w in rawwords if w.isupper())
    if rawwords and float(uppercase) / len(rawwords) > 0.3:
        features['UPPERCASE'] = 1

    names = list(features)
    print(names)
    return names
def read(feed, classifier):
    """Interactively classify and train on every entry of a feed.

    feed is handed straight to feedparser.parse.  For each parsed entry the
    title, publisher and summary are printed, followed by the classifier's
    guess; the user is then prompted for the correct category and the
    classifier is trained with it.

    classifier must provide classify(entry) and train(entry, category).
    It receives the entry object itself, so its feature extractor has to
    accept an entry rather than plain text.
    """
    parsed = feedparser.parse(feed)
    for entry in parsed['entries']:
        # print('') instead of a bare print: it emits one empty line whether
        # print is a statement or a function.
        print('')
        print('----')
        # Fields are encoded to UTF-8 before printing, presumably so that
        # non-ASCII text does not fail on output under Python 2.
        print('Title: ' + entry['title'].encode('utf-8'))
        print('Publisher: ' + entry['publisher'].encode('utf-8'))
        print('')
        print(entry['summary'].encode('utf-8'))

        # Show the best guess, then learn from the user's answer.
        print('Guess: ' + str(classifier.classify(entry)))
        category = raw_input('Enter category: ')
        classifier.train(entry, category)
if __name__ == '__main__':
    # docclass is only needed when this file is run as a script.
    import docclass

    # Fisher classifier driven by the entry-based feature extractor defined
    # in this file (docclass.getwords is the alternative extractor).
    fisher = docclass.fisherclassifier(entryfeatures)
    fisher.setdb('python_feed.db')

    # Walk the feed, prompting for a category on every entry.
    read('python_search.xml', fisher)