-
Notifications
You must be signed in to change notification settings - Fork 0
/
comment_sentiment.py
119 lines (108 loc) · 4.47 KB
/
comment_sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import random as rd
import time
import nltk
from nltk.tokenize import TweetTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def _first_keyword(words, folded_keywords):
    """Return the first keyword matched by any word of a sentence, or None.

    Words are scanned in sentence order and, for each word, keywords in the
    order given; comparison is case-insensitive via ``str.casefold``.
    ``folded_keywords`` is a list of ``(keyword, keyword.casefold())`` pairs.
    """
    for word in words:
        folded_word = word.casefold()
        for keyword, folded in folded_keywords:
            if folded == folded_word:
                return keyword
    return None


def get_comment_sentiment(comment, keywords, upvotes):
    """Score a comment's sentiment per keyword.

    The comment is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``TweetTokenizer``. Every sentence is scored by
    ``sentiment_scores`` and tagged with the first keyword it mentions.

    Consecutive sentences are then grouped into runs: a run starts at a
    sentence that mentions a keyword and absorbs following sentences that
    mention the same keyword or no keyword at all. Sentences that precede the
    first keyword mention are folded into the first run. Each run is reduced
    to the mean of its sentence scores, and runs sharing a keyword are
    combined by averaging their run means.

    Args:
        comment: Text of the comment.
        keywords: Iterable of keyword strings to look for.
        upvotes: Weight passed through to ``sentiment_scores``.

    Returns:
        A flat list alternating key and score,
        ``[keyword, score, keyword, score, ...]``, with keywords in order of
        first appearance in the comment. If no keyword occurs anywhere in a
        non-empty comment, the list is ``['placeholder', mean_score]``. If the
        comment contains no sentences, the list is empty.

    Side effects:
        Prints the elapsed processing time in seconds.
    """
    tokenizer = TweetTokenizer()
    token_sentences = [tokenizer.tokenize(s) for s in nltk.sent_tokenize(comment)]
    start_time = time.time()

    # Fold the keywords once instead of once per word comparison.
    folded_keywords = [(keyword, keyword.casefold()) for keyword in keywords]

    # Phase 1: (keyword-or-None, score) for every sentence. None marks a
    # sentence without a keyword, so real keywords can never be confused with
    # an in-band marker string.
    tagged = []
    for words in token_sentences:
        score = sentiment_scores(" ".join(words), upvotes)
        tagged.append((_first_keyword(words, folded_keywords), score))

    # Phase 2: collapse consecutive sentences into (key, mean score) runs.
    runs = []
    current_key, total, count = None, 0.0, 0
    for key, score in tagged:
        if key is not None:
            if current_key is None:
                # First keyword seen: it adopts the scores accumulated so far.
                current_key = key
            elif key != current_key:
                runs.append((current_key, total / count))
                current_key, total, count = key, 0.0, 0
        total += score
        count += 1
    if count:
        # count is zero only when the comment had no sentences at all;
        # guarding here avoids a ZeroDivisionError on empty input.
        runs.append((current_key, total / count))

    # Phase 3: average the run means per key, keeping first-appearance order
    # (dicts preserve insertion order).
    merged = {}
    for key, run_mean in runs:
        if key in merged:
            merged[key][0] += run_mean
            merged[key][1] += 1
        else:
            merged[key] = [run_mean, 1]

    final_scores = []
    for key, (run_total, run_count) in merged.items():
        # A None key means no keyword was found in the whole comment; it is
        # reported under the historical 'placeholder' label.
        label = 'placeholder' if key is None else key
        final_scores += (label, run_total / run_count)

    print(time.time() - start_time)
    return final_scores
# Shared analyzer instance, created lazily on first use.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def sentiment_scores(sentence, upvotes):
    """Return an upvote-weighted sentiment score for one sentence.

    The sentence is analysed with VADER's ``polarity_scores`` and the
    resulting ``pos``, ``neu`` and ``neg`` values are combined as
    ``2 * neu * upvotes + pos * upvotes - neg * upvotes``.

    Args:
        sentence: Text to analyse.
        upvotes: Numeric weight applied to every polarity component.

    Returns:
        The weighted score as computed by the formula above.
    """
    # Reuse one analyzer: this function is called once per sentence, and
    # building a new analyzer each time repeats its lexicon setup needlessly.
    polarity = _get_vader_analyzer().polarity_scores(sentence)
    pos = polarity['pos']
    neu = polarity['neu']
    neg = polarity['neg']
    # NOTE(review): the neutral share is weighted twice as heavily as the
    # positive share — kept as-is; confirm this is the intended formula.
    return (2 * neu * upvotes) + (pos * upvotes) - (neg * upvotes)
# Words that should not be picked as a title's keyword. Stored as a set so
# membership is an exact whole-word test rather than a substring search.
_COMMON_WORDS = frozenset(
    'The the This this Is is A a Of of Into into For for But but And and So so There there Through through As as Like like He he She she They they Them them It it'.split()
)


def simplify_title(titles):
    """Pick one random keyword for each title.

    Each title is split on whitespace and a word is chosen at random from
    those that are not in the common-word list. If every word of a title is
    a common word, one of the title's words is chosen at random anyway so the
    title still gets a keyword. Titles with no words at all (empty or
    whitespace-only) are skipped.

    Args:
        titles: Sequence of title strings.

    Returns:
        A tuple ``(keywords, title_references)`` where ``keywords`` is the
        list of chosen keywords and ``title_references`` is a flat list
        alternating title and keyword:
        ``[title, keyword, title, keyword, ...]``.

    Side effects:
        Prints the elapsed processing time in seconds.
    """
    keywords = []
    title_references = []
    start_time = time.time()
    for title in titles:
        words = title.split()
        if not words:
            # Nothing to choose from; previously this raised ValueError.
            continue
        candidates = [word for word in words if word not in _COMMON_WORDS]
        # Falling back to the full word list prevents the endless retry that
        # used to occur when a title consisted solely of common words.
        keyword = rd.choice(candidates or words)
        keywords.append(keyword)
        title_references += (title, keyword)
    print(time.time() - start_time)
    return keywords, title_references
def get_scores(titles, comment, upvotes):
    """Rank titles by the sentiment a comment expresses about them.

    A keyword is picked for every title with ``simplify_title``, the comment
    is scored per keyword with ``get_comment_sentiment``, and each scored
    keyword is mapped back to the title(s) it was picked from.

    Args:
        titles: Sequence of title strings.
        comment: Text of the comment.
        upvotes: Weight passed through to ``get_comment_sentiment``.

    Returns:
        A flat list alternating title and score,
        ``[title, score, title, score, ...]``, ordered by the keyword order
        returned from ``get_comment_sentiment``. Titles whose keyword does
        not occur in the comment are omitted. If several titles share a
        keyword, each of them receives that keyword's score.

    Side effects:
        Prints the elapsed processing time in seconds (in addition to what
        the helper functions print).
    """
    keys, title_refs = simplify_title(titles)
    scores = get_comment_sentiment(comment, keys, upvotes)
    start_time = time.time()

    # title_refs is a flat [title, keyword, ...] list; index it by keyword so
    # matching does not depend on the two lists lining up position by position.
    titles_by_keyword = {}
    for title, keyword in zip(title_refs[0::2], title_refs[1::2]):
        titles_by_keyword.setdefault(keyword, []).append(title)

    rankings = []
    # scores is a flat [key, score, ...] list; walk it pairwise.
    # NOTE(review): the 'placeholder' key that get_comment_sentiment returns
    # when no keyword was found is looked up like any other key, so it only
    # ranks a title whose keyword is literally 'placeholder'.
    for key, score in zip(scores[0::2], scores[1::2]):
        for title in titles_by_keyword.get(key, ()):
            rankings += (title, score)

    print(time.time() - start_time)
    return rankings