forked from amgerard/twitter-political
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentiment_x.py
96 lines (83 loc) · 3.07 KB
/
sentiment_x.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#from nltk.classify import NaiveBayesClassifier
#from nltk.corpus import subjectivity
#from nltk.sentiment import SentimentAnalyzer
#from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#import nltk
import json
import re
import enchant
# NOTE: SentimentIntensityAnalyzer needs the VADER lexicon (vader_lexicon.txt);
# fetch it once with nltk.download('vader_lexicon').
#nltk.download()
# Shared en_US spell-check dictionary; get_words() keeps only the tokens for
# which its check() call succeeds.
english_words = enchant.Dict("en_US")
def makeFeatureVec(words, model, num_features):
    """Average the model vectors of the words that the model knows.

    Args:
        words: iterable of tokens to look up.
        model: object exposing ``index2word`` (its vocabulary) and
            ``model[word]`` lookup returning that word's vector
            (presumably a word2vec-style model -- confirm with the caller).
        num_features: length of the accumulator, and therefore of the
            returned vector.

    Returns:
        The element-wise mean of the vectors of every token in ``words``
        that appears in ``model.index2word``. When no token is known, an
        all-zero float32 vector of length ``num_features``.
    """
    # numpy is not imported in the visible module header, so import it here;
    # without this the function fails with NameError on its first line.
    import numpy as np

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    # Build the set once: membership tests against it are O(1).
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1.
            featureVec = np.add(featureVec, model[word])
    # Only average when something was accumulated; dividing the zero vector
    # by zero would yield NaNs (and a RuntimeWarning).
    if nwords:
        featureVec = np.divide(featureVec, nwords)
    return featureVec
def is_canidate(x):
    """Tell whether *x* begins with a candidate name.

    The check is case-sensitive and anchored at the start of the string.
    Returns the match object for 'hillary' or 'trump' (truthy) when one of
    them matches, otherwise None.
    """
    found = re.match('hillary', x)
    if found:
        return found
    return re.match('trump', x)
def get_words(mystr):
    """Return the tokens of *mystr* accepted by the ``english_words`` dictionary.

    Every character that is not a regex word character is replaced by a
    space, the result is split on whitespace, and only the tokens for which
    ``english_words.check`` is truthy are kept, in their original order.
    """
    # Raw string: in a plain literal "\w" is an invalid escape sequence.
    tokens = re.sub(r"[^\w]", " ", mystr).split()
    # NOTE: candidate names were once meant to be kept as well
    # ("or is_canidate(x)"); that extra condition is currently disabled.
    words = [x for x in tokens if english_words.check(x)]
    return words
# --- Load the tweets -------------------------------------------------------
# Each input file holds one JSON document per line. The Hillary file is read
# first, so her tweets occupy the front of ``tweets``.
HILLARY_FILE = 'data/Backup_HILLARY2016.txt'
TRUMP_FILE = 'data/TRUMP2016.txt'

tweets = []
# Number of leading entries of ``tweets`` that came from HILLARY_FILE. This
# replaces the hard-coded boundary 32929, which silently mis-attributed
# tweets whenever the data files changed.
hillary_count = 0
for filename in (HILLARY_FILE, TRUMP_FILE):
    with open(filename) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            if not line.strip():
                # json.loads raises on an empty line; skip it.
                continue
            d = json.loads(line)
            # Records without a 'text' key are ignored.
            if 'text' in d:
                tweets.append(' '.join(get_words(d['text'])))
    if filename == HILLARY_FILE:
        hillary_count = len(tweets)

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print('test: ' + str(len(tweets)))

# --- Score the tweets ------------------------------------------------------
sid = SentimentIntensityAnalyzer()

# Running sums of the 'pos' / 'neg' polarity scores per candidate.
pos_hil = 0
neg_hil = 0
pos_don = 0
neg_don = 0

cnt = 0
for tweet in tweets:
    ss = sid.polarity_scores(tweet)
    # Uncomment to inspect the individual scores of each tweet:
    #print('')
    #print(tweet)
    #for k in sorted(ss):
    #    print('{0}: {1}, '.format(k, ss[k]))
    if cnt < hillary_count:
        pos_hil = pos_hil + ss['pos']
        neg_hil = neg_hil + ss['neg']
    else:
        pos_don = pos_don + ss['pos']
        neg_don = neg_don + ss['neg']
    cnt = cnt + 1

# --- Report ----------------------------------------------------------------
print('total positive = ' + str(pos_hil + pos_don))
print('total negative = ' + str(neg_hil + neg_don))
print('hillary positive = ' + str(pos_hil))
print('hillary negative = ' + str(neg_hil))
print('donald positive = ' + str(pos_don))
print('donald negative = ' + str(neg_don))
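# Another example, disabled by wrapping it in a string literal: train and
# evaluate an NLTK Naive Bayes subjectivity classifier (it needs the imports
# that are commented out at the top of this file).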
'''
n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs+train_obj_docs
testing_docs = test_subj_docs+test_obj_docs
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
print('{0}: {1}'.format(key, value))
'''