Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Feature/ignore stopwords #157

Merged
merged 9 commits into from
Sep 1, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 90 additions & 55 deletions nlpretext/basic/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import re
import unicodedata
from flashtext import KeywordProcessor
from ftfy import fix_text as _fix_text
from nlpretext._config import constants
from nlpretext.token.tokenizer import tokenize
Expand All @@ -28,11 +29,6 @@

def normalize_whitespace(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Given ``text`` str, replace one or more spacings with a single space, and
one or more linebreaks with a single newline. Also strip leading/trailing
whitespace.
Expand All @@ -51,6 +47,27 @@ def normalize_whitespace(text) -> str:
).strip()
return text


def remove_whitespace(text) -> str:
    """
    Strip ``text`` of every linebreak and spacing character, and remove
    leading/trailing whitespace.
    eg. " foo bar " -> "foobar"

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    # Delete linebreaks first, then every (non-breaking) space character.
    without_linebreaks = constants.LINEBREAK_REGEX.sub("", text)
    without_spaces = constants.NONBREAKING_SPACE_REGEX.sub("", without_linebreaks)
    return without_spaces.strip()


def lower_text(text: str):
    """
    Given ``text`` str, transform it into lowercase.

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    return text.lower()

def remove_stopwords(text: str, lang: str, custom_stopwords: list = None) -> str:

def filter_groups(token: str, ignored_stopwords: list = None) -> str:
    """
    Map ``token`` back to its original group of words: when ``token`` equals
    the whitespace-stripped concatenation of one of the groups in
    ``ignored_stopwords``, the group itself (with its spaces) is restored.

    Parameters
    ----------
    token : string
    ignored_stopwords : list of strings

    Returns
    -------
    string
    """
    if not ignored_stopwords:
        return token
    for group_of_words in ignored_stopwords:
        # A later group may match the token produced by an earlier
        # replacement, so keep scanning instead of breaking early.
        if remove_whitespace(group_of_words) == token:
            token = group_of_words
    return token


def ungroup_ignored_stopwords(tokens: list, ignored_stopwords: list = None) -> list:
    """
    Given ``tokens`` list of str and a list of groups of words that were
    concatenated into single tokens, restore every such token to its
    original ungrouped (whitespace-containing) form.

    Parameters
    ----------
    tokens : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    list of strings
    """
    ungrouped = []
    for token in tokens:
        ungrouped.append(filter_groups(token, ignored_stopwords))
    return ungrouped


def remove_stopwords(text: str, lang: str, custom_stopwords: list = None, ignored_stopwords: list = None) -> str:
    """
    Given ``text`` str, remove classic stopwords for a given language and
    custom stopwords given as a list. Words and groups of words from
    ignored_stopwords list are ignored during stopwords removal.

    Parameters
    ----------
    text : string
    lang : string
    custom_stopwords : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    string

    Raises
    -------
    ValueError
        if ``custom_stopwords`` and ``ignored_stopwords`` have common elements.
    """
    if custom_stopwords and ignored_stopwords:
        if set(custom_stopwords) & set(ignored_stopwords):
            raise ValueError("You are trying to add and remove a stopword at the same time !")
    # Copy so that extending with custom stopwords below never mutates the
    # list returned by get_stopwords (which may be shared/cached).
    stopwords = list(get_stopwords(lang))
    if ignored_stopwords:
        keyword_processor = KeywordProcessor()
        # Single words to keep need no grouping trick: just whitelist them.
        singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1]
        # Multi-word groups are temporarily collapsed into single tokens
        # (whitespace removed) so the tokenizer cannot split them.
        for group_of_words in ignored_stopwords:
            keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))
        text = keyword_processor.replace_keywords(text)
    else:
        singletons_to_keep = []
    if custom_stopwords:
        stopwords += custom_stopwords
    if lang in ["fr", "en"]:
        # Use the spaCy tokenizer for languages where a model is available.
        lang_module = {
            "fr": "fr_spacy",
            "en": "en_spacy"
        }[lang]
        tokens = tokenize(text, lang_module)
    else:
        tokens = text.split()
    tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)]
    # Restore the collapsed groups to their original spaced form.
    tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)
    return ' '.join(tokens)


def remove_eol_characters(text) -> str:
Expand All @@ -111,11 +191,6 @@ def remove_eol_characters(text) -> str:

def fix_bad_unicode(text, normalization: str = "NFC") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Fix unicode text that's "broken" using `ftfy
<http://ftfy.readthedocs.org/>`_;
this includes mojibake, HTML entities and other code cruft,
Expand Down Expand Up @@ -143,11 +218,6 @@ def fix_bad_unicode(text, normalization: str = "NFC") -> str:

def unpack_english_contractions(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace *English* contractions in ``text`` str with their unshortened
forms.
N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
Expand Down Expand Up @@ -188,11 +258,6 @@ def unpack_english_contractions(text) -> str:

def replace_urls(text, replace_with: str = "*URL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all URLs in ``text`` str with ``replace_with`` str.

Parameters
Expand All @@ -213,11 +278,6 @@ def replace_urls(text, replace_with: str = "*URL*") -> str:

def replace_emails(text, replace_with="*EMAIL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all emails in ``text`` str with ``replace_with`` str

Parameters
Expand All @@ -238,11 +298,6 @@ def replace_phone_numbers(text, country_to_detect: list,
replace_with: str = "*PHONE*",
method: str = "regex") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Inspired code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all phone numbers in ``text`` str with ``replace_with`` str

Parameters
Expand Down Expand Up @@ -279,11 +334,6 @@ def replace_phone_numbers(text, country_to_detect: list,

def replace_numbers(text, replace_with="*NUMBER*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all numbers in ``text`` str with ``replace_with`` str.

Parameters
Expand All @@ -302,11 +352,6 @@ def replace_numbers(text, replace_with="*NUMBER*") -> str:

def replace_currency_symbols(text, replace_with=None) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all currency symbols in ``text`` str with string specified by
``replace_with`` str.

Expand Down Expand Up @@ -334,11 +379,6 @@ def replace_currency_symbols(text, replace_with=None) -> str:

def remove_punct(text, marks=None) -> str:
"""
amaleelhamri marked this conversation as resolved.
Show resolved Hide resolved
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Remove punctuation from ``text`` by replacing all instances of ``marks``
with whitespace.

Expand Down Expand Up @@ -372,11 +412,6 @@ def remove_punct(text, marks=None) -> str:

def remove_accents(text, method: str = "unicode") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Remove accents from any accented unicode characters in ``text`` str,
either by transforming them into ascii equivalents or removing them
entirely.
Expand Down
20 changes: 12 additions & 8 deletions tests/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,17 +199,21 @@ def test_remove_stopwords_tokens(input_tokens, lang, expected_output):


@pytest.mark.parametrize(
    "input_text, lang, custom_stopwords, ignored_stopwords, expected_output",
    [
        # Baseline stopword removal per language (no custom/ignored lists).
        ('I like this song very much !', 'en', None, None, 'I song !'),
        ('Can I get a beer?', 'en', None, None, 'Can I beer ?'),
        ('Je vous recommande ce film !', 'fr', None, None, 'Je recommande film !'),
        ('je vous recommande ce film !', 'fr', None, None, 'recommande film !'),
        ('Quiero una cerveza, por favor.', 'es', None, None, 'Quiero cerveza, favor.'),
        # Custom stopwords extend the language defaults.
        ('je vous recommande ce film !', 'fr', ["recommande"], None, 'film !'),
        # Ignored stopwords survive removal, as single words or as groups.
        ('Quiero una cerveza, por favor.', 'es', None, ["una"], 'Quiero una cerveza, favor.'),
        ('je vous recommande ce film !', 'fr', ["recommande"], ["je vous"], 'je vous film !'),
        ('je vous recommande ce film !', 'fr', ["recommande"], ["recommande ce film"], 'recommande ce film !')
    ],
)
def test_remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords, expected_output):
    """Check stopword removal with custom and ignored stopword lists."""
    result = remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords)
    np.testing.assert_array_equal(result, expected_output)


Expand Down