sentiment_analysis.py

import os
import re
import sys
import json
import nltk
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

# Word lists used for cleaning and for emoticon-based scoring.
stopwords = nltk.corpus.stopwords.words('portuguese')
reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()
reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()
reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

# Tokenizer and POS tagger tuned for Portuguese tweets.
tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()

json_result = []
tweet_dict = {}
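
# Processing pipeline (summary of the functions below): tweet_processor()
# cleans the raw text (text_processor), tokenizes it, normalizes the tokens
# (tokens_processor), POS-tags them, and sentiments_processor() then records
# a 'positive' / 'negative' / 'neutral' label per text in tweet_dict; save()
# persists those labels to result.json.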

def count_positive_emoticons(tokens):
    """Count how many positive emoticons from the lexicon appear in the tokens."""
    return sum(1 for emoticon in positive_emoticons if emoticon in tokens)


def count_negative_emoticons(tokens):
    """Count how many negative emoticons from the lexicon appear in the tokens."""
    return sum(1 for emoticon in negative_emoticons if emoticon in tokens)


def replace_symbols(text):
    """Delete every symbol from the symbols list found in the raw text."""
    for symbol in symbols:
        text = text.replace(symbol, "")
    return text


def replace_urls(text):
    """Strip URLs (anything starting with "http") from the text."""
    return re.sub(r"http\S+", "", text)

def replace_abbreviations(tokens):
    """Expand known abbreviations to their full forms (every occurrence)."""
    return [abbreviations.words.get(token, token) for token in tokens]


def remove_stopwords(tokens):
    """Drop Portuguese stopwords (every occurrence, not just the first)."""
    return [token for token in tokens if token not in stopwords]


def remove_symbols(tokens):
    """Drop symbol tokens; an ellipsis is kept as a blank token instead."""
    result = []
    for token in tokens:
        if token not in symbols:
            result.append(token)
        elif token == "...":
            result.append(" ")
    return result

def replace_user_mentions(text, user_mentions):
    """Remove "@screen_name" mentions listed in the tweet's entity metadata."""
    for user_mention in user_mentions:
        text = text.replace("@" + user_mention['screen_name'], "")
    return text


def text_processor(tweet):
    """Lower-case and clean the raw tweet text."""
    text = tweet['text'].lower()
    text = replace_user_mentions(text, tweet['entities']['user_mentions'])
    text = replace_urls(text)
    if text.startswith("rt"):
        # Retweets look like "rt @user: quoted text"; keep only the quoted part.
        try:
            text = replace_symbols(text.split(":", 1)[1])
        except IndexError:
            pass
    else:
        text = replace_symbols(text)
    return text.strip()

def tokens_processor(tokens):
    tokens = remove_stopwords(tokens)
    tokens = remove_symbols(tokens)
    tokens = replace_abbreviations(tokens)
    return tokens

def sentiments_processor(text, tokens, tags):
    """Label the tweet by combining SentiLex adjective polarity with emoticons."""
    adjectives = []
    adverbs = []
    for word, tag in tags:
        if tag == "ADJ":
            adjectives.append(word)
        if tag == "ADV" or tag == "ADVL+adv":
            adverbs.append(word)
    positive = 0
    negative = 0
    # A negation adverb ("não") inverts the polarity of every adjective.
    has_negative_adverbs = "não" in adverbs
    for adjective in adjectives:
        sentiments = sentiLexPairRdd.lookup(adjective)
        if len(sentiments) > 0:
            sentiment = sentiments[0]
            if has_negative_adverbs:
                if sentiment == 1:
                    negative += 1
                elif sentiment == -1:
                    positive += 1
            else:
                if sentiment == 1:
                    positive += 1
                elif sentiment == -1:
                    negative += 1
    # Emoticons count toward the score even when the tweet has no adjectives.
    positive += count_positive_emoticons(tokens)
    negative += count_negative_emoticons(tokens)
    score = 0
    total = positive + negative
    if total > 0:
        # Normalized polarity in [-1, 1].
        score = float(positive - negative) / total
    if score > 0.5:
        tweet_dict[text] = 'positive'
    elif score < -0.5:
        tweet_dict[text] = 'negative'
    else:
        tweet_dict[text] = 'neutral'
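
# Worked example (assuming 'ótimo' is tagged ADJ and maps to +1 in SentiLex,
# and ':)' appears in positive_emoticons.txt): tokens ['filme', 'ótimo', ':)']
# give positive=2, negative=0, so score = 2/2 = 1.0 > 0.5 and the tweet is
# labelled 'positive'.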

def tweet_processor(tweet):
    text = text_processor(tweet)
    tokens = tweet_tokenizer.tokenize(text)
    tokens = tokens_processor(tokens)
    tags = tagger.tag(tokens)
    sentiments_processor(text, tokens, tags)

def save():
    """Write the labelled tweets to result.json, merging with an existing file."""
    if not os.path.isfile(__output_path):
        for key in tweet_dict.keys():
            json_result.append({"text": key, "label": tweet_dict[key]})
        with open(__output_path, 'w') as json_file:
            json.dump(json_result, json_file, indent=4)
    else:
        with open(__output_path, 'r') as json_file:
            model = list(json.load(json_file))
        # Append only entries that are not already in the file.
        for key in tweet_dict.keys():
            json_object = {"text": key, "label": tweet_dict[key]}
            if json_object not in model:
                model.append(json_object)
        with open(__output_path, 'w') as json_file:
            json.dump(model, json_file, indent=4)
    # Terminate once results are persisted, as in the original script.
    sys.exit(0)
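
# A minimal driver sketch, not part of the original module. It assumes a file
# named tweets.json (hypothetical) holding a list of Twitter-API-shaped dicts,
# each with the 'text' and 'entities'/'user_mentions' fields tweet_processor expects.
if __name__ == "__main__":
    with open("tweets.json") as tweets_file:
        for tweet in json.load(tweets_file):
            tweet_processor(tweet)
    save()  # writes result.json and exits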