Skip to content

Commit

Permalink
added code for creating new language lexicons
Browse files Browse the repository at this point in the history
  • Loading branch information
KennethEnevoldsen committed Jun 28, 2022
1 parent 2ce82d3 commit 531cc67
Showing 1 changed file with 26 additions and 0 deletions.
26 changes: 26 additions & 0 deletions dev/add_new_language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""convert into sentiment lexicon from:
https://www.kaggle.com/datasets/rtatman/sentiment-lexicons-for-81-languages/discussion/39827?resource=download
into one txt file per language, in which each positive word is rated +1 and each
negative word is rated -1.
"""
from collections import defaultdict
from pathlib import Path

def load_rated_words(archive_dir):
    """Collect rated words per language from the downloaded lexicon archive.

    Every ``sentiment-lexicons/*.txt`` file below *archive_dir* is read. The
    file stem is split on ``_``: the last part is used as the language id and
    a first part equal to ``"positive"`` gives the words a rating of +1; any
    other first part gives them -1. Empty lines are skipped.

    Args:
        archive_dir: Path to the directory containing ``sentiment-lexicons``.

    Returns:
        A ``defaultdict(list)`` mapping language id to a list of
        ``(word, rating)`` tuples.
    """
    words_by_lang = defaultdict(list)
    # Sort the matches so the word order in the output is reproducible
    # between runs (glob order is otherwise arbitrary).
    for word_file in sorted(archive_dir.glob("sentiment-lexicons/*.txt")):
        stem_parts = word_file.stem.split("_")
        lang_id = stem_parts[-1]
        rating = 1 if stem_parts[0] == "positive" else -1
        # The lexicons span many scripts; decode explicitly instead of relying
        # on the platform default. NOTE(review): assumes the files are UTF-8.
        text = word_file.read_text(encoding="utf-8")
        words_by_lang[lang_id] += [
            (word, rating) for word in text.split("\n") if word
        ]
    return words_by_lang


def write_lexicons(rated_words, lexicon_dir):
    """Write one ``<lang>_lexicon_chen_skiena_2014_v1.txt`` file per language.

    Each output line has the form ``word, rating``; lines are joined with
    ``"\\n"`` and no trailing newline is written.

    Args:
        rated_words: Mapping of language id to ``(word, rating)`` tuples.
        lexicon_dir: Existing directory (``Path``) to write the files into.
    """
    for lang_id, words in rated_words.items():
        out_file = lexicon_dir / f"{lang_id}_lexicon_chen_skiena_2014_v1.txt"
        lines = [f"{word}, {rating}" for word, rating in words]
        # Write UTF-8 explicitly so non-ASCII words survive on every platform.
        out_file.write_text("\n".join(lines), encoding="utf-8")


path = Path("/Users/au561649/Downloads/archive")
rated_words = load_rated_words(path)

lexicon_path = Path("/Users/au561649/Desktop/Github/asent/asent/lexicons")
write_lexicons(rated_words, lexicon_path)

0 comments on commit 531cc67

Please sign in to comment.