-
Notifications
You must be signed in to change notification settings - Fork 0
/
recommendation.py
159 lines (134 loc) · 5.83 KB
/
recommendation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import os
import sys
from feature_extraction import dataloader
import random
import nltk
class activityRecommender:
    """Suggest activities by mining verbs from similar training tweets.

    A training tweet counts as "similar" to a predicted label vector when
    its own label vector has the positive-sentiment bit set and all of its
    event-label bits equal those of the vector being matched (see
    ``checkCriteria``).  The verbs of such tweets, taken from a POS-tagged
    copy of the corpus, are what the caller presents as suggestions.

    Label vectors are indexed as: sentiment 0-4, time 5-8, event 9-23.
    """

    def __init__(self, data_filename):
        """Load the labelled corpus and its POS-tagged counterpart.

        Args:
            data_filename: path passed to ``dataloader.DataLoader``.

        The tagged corpus is always read from ``data/train_tagged.csv``
        (resolved against the current working directory).  ``getVerbs``
        indexes it with the same index used for ``self.loader.corpus``, so
        the two are expected to be aligned; both lengths are printed so a
        mismatch is visible at start-up.
        """
        # Loading the data
        print('data for activityRecommender comes from file: %s' % (data_filename))
        self.loader = dataloader.DataLoader(data_filename)

        # Get Useful Constants
        self.totalNumLabels = self.loader.totalNumLabels
        self.numDataPoints = self.loader.numDataPoints
        self.sentiment_label_indices = range(0, 5)
        self.time_label_indices = range(5, 9)
        self.event_label_indices = range(9, 24)
        self.label_types = ['sentiment', 'time', 'event']

        event_label_threshold = 1.0 / 3
        self.gold_bitvectors = self.loader.extractFullLabelBitVectors(event_label_threshold)

        # Tagged tweets are read with '_' as the word/tag separator.
        candidate_corpus_path = os.path.realpath('data/train_tagged.csv')
        candidate = nltk.corpus.reader.TaggedCorpusReader(
            os.path.dirname(candidate_corpus_path),
            os.path.basename(candidate_corpus_path),
            sep='_')
        self.tagged_tweets = candidate.tagged_sents()

        # '%s %s' reproduces the single space the Python 2 print statement
        # inserted between comma-separated items, on both Python 2 and 3.
        corpusLen = len(self.loader.corpus)
        print('%s %s' % ('self.loader.corpus len = ', corpusLen))
        taggedCorpusLen = len(self.tagged_tweets)
        print('%s %s' % ('self.tagged_tweets len=', taggedCorpusLen))

    def getSimilarTweets(self, vectorToMatch, numSimilarTweetsToFind=3):
        """Return the first corpus tweets that match ``vectorToMatch``.

        Args:
            vectorToMatch: full label vector to compare against the gold
                label vectors (must be indexable at the event indices).
            numSimilarTweetsToFind: maximum number of matches to return.
                Defaults to 3, the limit that used to be hard-coded.  A
                value of zero or less yields an empty list.

        Returns:
            A list of dicts with keys ``'tweet'``, ``'labels'`` and
            ``'verbs'``, in corpus order.  Tweets that satisfy
            ``checkCriteria`` but contain no verbs are skipped and do not
            count towards the limit.
        """
        similarTweets = []
        if numSimilarTweetsToFind <= 0:
            return similarTweets

        for tweet_idx, tweet in enumerate(self.loader.corpus):
            corpusTweetLabels = self.gold_bitvectors[tweet_idx]
            if not self.checkCriteria(vectorToMatch, corpusTweetLabels):
                continue
            verbs = self.getVerbs(tweet_idx)
            if not verbs:
                # Nothing to recommend from a tweet without verbs.
                continue
            similarTweets.append({'tweet': tweet,
                                  'labels': corpusTweetLabels,
                                  'verbs': verbs})
            if len(similarTweets) >= numSimilarTweetsToFind:
                break
        return similarTweets

    def checkCriteria(self, vectorToMatch, corpusTweetLabelVector):
        """Decide whether a corpus tweet is a usable match.

        Returns True only when the corpus tweet has its positive label
        (index 3) set to 1 and every event label equals the corresponding
        entry of ``vectorToMatch``; otherwise False.
        """
        # check positive
        indexOfPositiveLabel = 3
        if corpusTweetLabelVector[indexOfPositiveLabel] != 1:
            return False
        # check if weather event conditions are similar
        return all(vectorToMatch[event_label_idx] == corpusTweetLabelVector[event_label_idx]
                   for event_label_idx in self.event_label_indices)

    def getVerbs(self, tweet_idx):
        """Return the words of tagged tweet ``tweet_idx`` whose tag starts with 'V'.

        Tokens whose tag is None are ignored.  Raises IndexError if
        ``tweet_idx`` is outside ``self.tagged_tweets``.
        """
        tagged_tweet = self.tagged_tweets[tweet_idx]
        return [word for (word, tag) in tagged_tweet
                if tag is not None and tag.startswith('V')]
# Command-line guard: the script needs the path of the file holding the
# tweets to tag as its first argument.  Without it, print the usage line
# and stop before any data is loaded.
if len(sys.argv) <= 1:
    print("Usage: recommendation.py tweetsToBeTagged_filename")
    # Exit with a non-zero status so shells and calling scripts can tell
    # that the run did not happen (a bare sys.exit() reports success).
    sys.exit(1)
def load_tweets(filename=None):
    """Read the tweets to be tagged, one per line.

    Args:
        filename: path of the text file to read.  When omitted (None), the
            first command-line argument ``sys.argv[1]`` is used, which is
            what the no-argument call in this script relies on.

    Returns:
        The list of lines in the file, each keeping its trailing newline
        (as returned by ``readlines``).

    Exits the process with status 1 after printing a message if the
    resolved filename is empty.  Raises IndexError if no filename is given
    and ``sys.argv`` has no first argument.
    """
    # Honour an explicit argument; previously it was silently ignored.
    tweetsToBeTagged_filename = sys.argv[1] if filename is None else filename
    if len(tweetsToBeTagged_filename) <= 0:
        print('tweetsToBeTagged_filename must be at least one char!')
        sys.exit(1)
    # The context manager closes the file even if reading fails.
    with open(tweetsToBeTagged_filename) as f:
        return f.readlines()
def predictTweets(tweetsToBeTagged):
    """Predict a full label bit-vector for every tweet.

    Each tweet is vectorised, then one prediction per label type is made
    and converted from a bitstring to a list of ints; the three lists are
    concatenated in the order sentiment, event, time.

    Args:
        tweetsToBeTagged: sequence of raw tweet strings.

    Returns:
        A list with one concatenated bit-vector (list) per tweet.

    NOTE(review): this function reads the module-level names
    ``trained_vectorizer``, ``classifiers`` and ``loader``, none of which
    is defined in the visible part of this file, and nothing here calls
    it.  They must be provided before the function can run.

    NOTE(review): the concatenation order here (sentiment, event, time)
    differs from the index layout used by ``activityRecommender``
    (sentiment, time, event) -- confirm which order consumers expect.
    """
    # Imported here because the file never imports ``time`` at module level.
    import time

    print('Making predictions and converting prediction bitstrings to bitvectors...')
    start_time = time.time()
    testX = trained_vectorizer.transform(tweetsToBeTagged)

    predictions_list = []
    for testX_matrixcounts in testX:
        predicted_fullbitvector = []
        for labeltype in ['sentiment', 'event', 'time']:
            # predict() returns a sequence; the single row gives one label.
            predicted_label = classifiers[labeltype].predict(testX_matrixcounts)[0]
            predicted_labeltype_bitvector = loader.bitstringToIntList(predicted_label)
            predicted_fullbitvector += predicted_labeltype_bitvector
        predictions_list.append(predicted_fullbitvector)

    elapsed_time = time.time() - start_time
    # '%s %s' keeps the space the Python 2 print statement put between items.
    print('%s %s' % ('Completed making predictions and converting prediction bitstrings to bitvectors... took ', elapsed_time))
    return predictions_list
# ---------------------------------------------------------------------------
# Driver: classify the tweets named on the command line, then print the
# predicted labels and activity suggestions for each of them.
# ---------------------------------------------------------------------------
import vectorToLabel

tweetsToBeTagged = load_tweets()
filename = 'data/train.csv'

# The structured Naive Bayes classifier produces the label vectors
# (combinedNaiveBayes.combinedNBClassifier was the earlier alternative).
import structuredNaiveBayes

snbc = structuredNaiveBayes.structuredNBClassifier(data_filename=filename, numFolds=0)
print('loaded classifier. now predicting')
predictions_list = snbc.combined_classify_tweets(tweetsToBeTagged)
print('predicted!')

converter = vectorToLabel.Converter()
recommender = activityRecommender(filename)
print('loaded recommender')

# predictions_list is indexed by the position of the tweet in the input.
for i, tweet_text in enumerate(tweetsToBeTagged):
    prediction_vec = predictions_list[i]

    print('\n*************************************')
    print('For tweet: %s' % (tweet_text))
    print('\tPredicted: %s' % (prediction_vec))

    labels = converter.convertToLabels(prediction_vec)
    for labeltype in converter.labeltypes:
        print('\tPredicted %s labels: %s' % (labeltype, labels[labeltype]))

    similarTweets = recommender.getSimilarTweets(prediction_vec)
    print('\nLooking at Similar Tweets:')
    for similarTweet in similarTweets:
        suggested_verbs = str(similarTweet['verbs'])
        source_tweet = similarTweet['tweet']
        print('\tPerhaps you should try to %s (source: %s)' % (suggested_verbs, source_tweet))