forked from tslmy/politeness-estimator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_emolex.py
27 lines (22 loc) · 1.71 KB
/
prepare_emolex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
## Prepare EmoLex
from os import system
from os.path import isfile
import pandas as pd
# Paths and column labels shared by every step below, defined once so the
# download, export and clean-up steps cannot drift apart.
EULA_PDF = 'NRC - Sentiment Lexicon - Research EULA Sept 2017 .pdf'
LEXICON_XLSX = 'NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
EMOTIONS = ['Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
ENGLISH_TOKEN = 'English (en)'
CHINESE_TOKEN = 'Chinese (Simplified) (zh-CN)'

# Step 1: fetch and unpack the lexicon archive. The EULA PDF is used as the
# marker that this has already been done (presumably it is extracted from the
# archive alongside the lexicon directory -- the clean-up step removes both).
if not isfile(EULA_PDF):
    system('wget http://sentiment.nrc.ca/lexicons-for-research/NRC-Emotion-Lexicon.zip')
    system('unzip NRC-Emotion-Lexicon.zip')
    system('rm NRC-Emotion-Lexicon.zip')

# The exit statuses of the shell commands above are not inspected, so verify
# the one file we actually need and fail with an actionable message instead of
# an opaque reader error.
if not isfile(LEXICON_XLSX):
    raise FileNotFoundError(
        '%s is missing; downloading or unzipping NRC-Emotion-Lexicon.zip '
        'probably failed' % LEXICON_XLSX
    )

# Step 2: open the downloaded Excel file.
original_df = pd.read_excel(LEXICON_XLSX)

# Step 3, English: written only when the CSV does not exist yet.
if not isfile('english_emolex.csv'):
    # Keep only the English tokens and the annotated emotions.
    df = original_df[[ENGLISH_TOKEN] + EMOTIONS]
    # Drop words that do not relate to any emotion.
    df = df[df[EMOTIONS].sum(axis=1) > 0]
    df.to_csv('english_emolex.csv', index=False)  # Save to file

# Step 4, Mandarin Chinese: written only when the CSV does not exist yet.
if not isfile('chinese_emolex.csv'):
    # Keep only the Chinese tokens and the annotated emotions.
    df = original_df[[CHINESE_TOKEN] + EMOTIONS]
    # The same Chinese token can appear on several rows (the original note
    # attributes this to translation imbalances); keep the first row for each.
    # The result is rebound rather than modified in place, because `df` is
    # derived from `original_df` and in-place edits on such a frame are what
    # pandas warns about.
    # NOTE(review): de-duplication runs before the emotion filter, so a token
    # whose first row has no emotion is dropped entirely -- confirm intended.
    df = df.drop_duplicates(subset=[CHINESE_TOKEN])
    # Drop words that do not relate to any emotion.
    df = df[df[EMOTIONS].sum(axis=1) > 0]
    df.to_csv('chinese_emolex.csv', index=False)  # Save to file

# Step 5: remove the unzipped files once both exports have been handled.
if isfile(EULA_PDF):
    system('rm -r NRC-Emotion-Lexicon-v0.92/ "NRC - Sentiment Lexicon - Research EULA Sept 2017 .pdf"')