Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for stemmed sentences #16

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions afinn/afinn.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import re

from os.path import dirname, join
from nltk import PorterStemmer


LANGUAGE_TO_FILENAME = {
Expand Down Expand Up @@ -56,11 +57,13 @@ class Afinn(object):

"""

def __init__(self, language="en", emoticons=False, word_boundary=True):
def __init__(self, language="en", emoticons=False, word_boundary=True, stem=False):
"""Setup dictionary from data file.

The language parameter can be set to English (en) or Danish (da).

The stem parameter can be set to True if your input text has already been stemmed, so that the word list is stemmed to match it.

Parameters
----------
language : 'en' or 'da', optional
Expand All @@ -69,13 +72,17 @@ def __init__(self, language="en", emoticons=False, word_boundary=True):
Includes emoticons in the token list
word_boundary : bool, optional
Use word boundary match in the regular expression.
stem : bool, optional
Use a stemmed word list

"""
filename = LANGUAGE_TO_FILENAME[language]
full_filename = self.full_filename(filename)
if emoticons:
# Words
self._dict = self.read_word_file(full_filename)
if stem:
self._stem_wordlist()
regex_words = self.regex_from_tokens(
list(self._dict),
word_boundary=True, capture=False)
Expand All @@ -94,7 +101,7 @@ def __init__(self, language="en", emoticons=False, word_boundary=True):
self._setup_pattern_from_regex(regex)

else:
self.setup_from_file(full_filename, word_boundary=word_boundary)
self.setup_from_file(full_filename, word_boundary=word_boundary, stem=stem)

self._word_pattern = re.compile('\w+', flags=re.UNICODE)

Expand Down Expand Up @@ -146,7 +153,7 @@ def full_filename(self, filename):
"""
return join(self.data_dir(), filename)

def setup_from_file(self, filename, word_boundary=True):
def setup_from_file(self, filename, word_boundary=True, stem=False):
"""Setup data from data file.

Read the word file and setup the regular expression pattern for
Expand All @@ -159,6 +166,11 @@ def setup_from_file(self, filename, word_boundary=True):

"""
self._dict = self.read_word_file(filename)

# Stem the word list

if stem:
self._stem_wordlist()
self._setup_pattern_from_dict(word_boundary=word_boundary)

@staticmethod
Expand Down Expand Up @@ -233,6 +245,13 @@ def regex_from_tokens(tokens, word_boundary=True, capture=True):

return regex

def _stem_wordlist(self):
    """Re-key the word list in ``self._dict`` by Porter stem.

    Every key of ``self._dict`` is passed through
    ``PorterStemmer().stem`` and its score is stored under the
    resulting stem. The result is built in a fresh dictionary and
    ``self._dict`` is rebound to it once all words are processed.

    Several words can share one stem. Collisions are resolved as
    follows:

    * a word that is already identical to its own stem keeps its score
      for that stem, regardless of where it sits in the word list;
    * otherwise the first word in iteration order that produces the
      stem provides the score, and later words are ignored.

    """
    stemmer = PorterStemmer()
    stemmed_scores = {}
    for word, score in self._dict.items():
        stem = stemmer.stem(word)
        if stem == word:
            # The word list has an explicit entry for exactly this form,
            # so it takes precedence over scores inherited from other
            # words that merely reduce to it.
            stemmed_scores[stem] = score
        else:
            # Do not clobber a score that is already recorded for the stem.
            stemmed_scores.setdefault(stem, score)
    # Writing into a separate dict (rather than popping and re-inserting
    # into the one being walked) guarantees that an entry stored for one
    # word's stem is never picked up and moved again when that stem also
    # happens to be a key of the original word list.
    self._dict = stemmed_scores


def _setup_pattern_from_regex(self, regex):
"""Set internal variable from regex string."""
self._pattern = re.compile(regex, flags=re.UNICODE)
Expand Down