Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Feature/ignore stopwords #157

Merged
merged 9 commits into from
Sep 1, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 90 additions & 55 deletions nlpretext/basic/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import re
import unicodedata
from flashtext import KeywordProcessor
from ftfy import fix_text as _fix_text
from nlpretext._config import constants
from nlpretext.token.tokenizer import tokenize
Expand All @@ -28,11 +29,6 @@

def normalize_whitespace(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Given ``text`` str, replace one or more spacings with a single space, and
one or more linebreaks with a single newline. Also strip leading/trailing
whitespace.
Expand All @@ -51,6 +47,27 @@ def normalize_whitespace(text) -> str:
).strip()
return text


def remove_whitespace(text) -> str:
    """
    Strip ``text`` of every linebreak and spacing character, and remove
    leading/trailing whitespace.
    eg. " foo bar " -> "foobar"

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    # Delete linebreaks first, then every (non-breaking) space character.
    without_linebreaks = constants.LINEBREAK_REGEX.sub("", text)
    without_spaces = constants.NONBREAKING_SPACE_REGEX.sub("", without_linebreaks)
    return without_spaces.strip()


def lower_text(text: str):
    """
    Given ``text`` str, transform it into lowercase.

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    return text.lower()

def remove_stopwords(text: str, lang: str, custom_stopwords: list = None) -> str:

def filter_groups(token: str, ignored_stopwords: list = None) -> str:
    """
    Map ``token`` back to its original group of words: when ``token`` equals
    the whitespace-stripped concatenation of one of the groups in
    ``ignored_stopwords``, the group itself (with its spaces) is restored.

    Parameters
    ----------
    token : string
    ignored_stopwords : list of strings

    Returns
    -------
    string
    """
    if not ignored_stopwords:
        return token
    for group_of_words in ignored_stopwords:
        # A later group may match the token produced by an earlier
        # replacement, so keep scanning instead of breaking early.
        if remove_whitespace(group_of_words) == token:
            token = group_of_words
    return token


def ungroup_ignored_stopwords(tokens: list, ignored_stopwords: list = None) -> list:
    """
    Given ``tokens`` list of str and a list of groups of words that were
    concatenated into single tokens, restore every such token to its
    original ungrouped (whitespace-containing) form.

    Parameters
    ----------
    tokens : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    list of strings
    """
    ungrouped = []
    for token in tokens:
        ungrouped.append(filter_groups(token, ignored_stopwords))
    return ungrouped


def remove_stopwords(text: str, lang: str, custom_stopwords: list = None, ignored_stopwords: list = None) -> str:
    """
    Given ``text`` str, remove classic stopwords for a given language and
    custom stopwords given as a list. Words and groups of words from
    ignored_stopwords list are ignored during stopwords removal.

    Parameters
    ----------
    text : string
    lang : string
    custom_stopwords : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    string

    Raises
    -------
    ValueError
        if ``custom_stopwords`` and ``ignored_stopwords`` have common elements.
    """
    if custom_stopwords and ignored_stopwords:
        if set(custom_stopwords) & set(ignored_stopwords):
            raise ValueError("You are trying to add and remove a stopword at the same time !")
    # Copy so that extending with custom stopwords below never mutates the
    # list returned by get_stopwords (which may be shared/cached).
    stopwords = list(get_stopwords(lang))
    if ignored_stopwords:
        keyword_processor = KeywordProcessor()
        # Single words to keep need no grouping trick: just whitelist them.
        singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1]
        # Multi-word groups are temporarily collapsed into single tokens
        # (whitespace removed) so the tokenizer cannot split them.
        for group_of_words in ignored_stopwords:
            keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))
        text = keyword_processor.replace_keywords(text)
    else:
        singletons_to_keep = []
    if custom_stopwords:
        stopwords += custom_stopwords
    if lang in ["fr", "en"]:
        # Use the spaCy tokenizer for languages where a model is available.
        lang_module = {
            "fr": "fr_spacy",
            "en": "en_spacy"
        }[lang]
        tokens = tokenize(text, lang_module)
    else:
        tokens = text.split()
    tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)]
    # Restore the collapsed groups to their original spaced form.
    tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)
    return ' '.join(tokens)


def remove_eol_characters(text) -> str:
Expand All @@ -111,11 +191,6 @@ def remove_eol_characters(text) -> str:

def fix_bad_unicode(text, normalization: str = "NFC") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Fix unicode text that's "broken" using `ftfy
<http://ftfy.readthedocs.org/>`_;
this includes mojibake, HTML entities and other code cruft,
Expand Down Expand Up @@ -143,11 +218,6 @@ def fix_bad_unicode(text, normalization: str = "NFC") -> str:

def unpack_english_contractions(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace *English* contractions in ``text`` str with their unshortened
forms.
N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
Expand Down Expand Up @@ -188,11 +258,6 @@ def unpack_english_contractions(text) -> str:

def replace_urls(text, replace_with: str = "*URL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all URLs in ``text`` str with ``replace_with`` str.

Parameters
Expand All @@ -213,11 +278,6 @@ def replace_urls(text, replace_with: str = "*URL*") -> str:

def replace_emails(text, replace_with="*EMAIL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all emails in ``text`` str with ``replace_with`` str

Parameters
Expand All @@ -238,11 +298,6 @@ def replace_phone_numbers(text, country_to_detect: list,
replace_with: str = "*PHONE*",
method: str = "regex") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Inspired code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all phone numbers in ``text`` str with ``replace_with`` str

Parameters
Expand Down Expand Up @@ -279,11 +334,6 @@ def replace_phone_numbers(text, country_to_detect: list,

def replace_numbers(text, replace_with="*NUMBER*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all numbers in ``text`` str with ``replace_with`` str.

Parameters
Expand All @@ -302,11 +352,6 @@ def replace_numbers(text, replace_with="*NUMBER*") -> str:

def replace_currency_symbols(text, replace_with=None) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Replace all currency symbols in ``text`` str with string specified by
``replace_with`` str.

Expand Down Expand Up @@ -334,11 +379,6 @@ def replace_currency_symbols(text, replace_with=None) -> str:

def remove_punct(text, marks=None) -> str:
"""
amaleelhamri marked this conversation as resolved.
Show resolved Hide resolved
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Remove punctuation from ``text`` by replacing all instances of ``marks``
with whitespace.

Expand Down Expand Up @@ -372,11 +412,6 @@ def remove_punct(text, marks=None) -> str:

def remove_accents(text, method: str = "unicode") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----

Remove accents from any accented unicode characters in ``text`` str,
either by transforming them into ascii equivalents or removing them
entirely.
Expand Down
20 changes: 12 additions & 8 deletions tests/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,17 +199,21 @@ def test_remove_stopwords_tokens(input_tokens, lang, expected_output):


@pytest.mark.parametrize(
    "input_text, lang, custom_stopwords, ignored_stopwords, expected_output",
    [
        # Baseline stopword removal per language (no custom/ignored lists).
        ('I like this song very much !', 'en', None, None, 'I song !'),
        ('Can I get a beer?', 'en', None, None, 'Can I beer ?'),
        ('Je vous recommande ce film !', 'fr', None, None, 'Je recommande film !'),
        ('je vous recommande ce film !', 'fr', None, None, 'recommande film !'),
        ('Quiero una cerveza, por favor.', 'es', None, None, 'Quiero cerveza, favor.'),
        # Custom stopwords extend the language defaults.
        ('je vous recommande ce film !', 'fr', ["recommande"], None, 'film !'),
        # Ignored stopwords survive removal, as single words or as groups.
        ('Quiero una cerveza, por favor.', 'es', None, ["una"], 'Quiero una cerveza, favor.'),
        ('je vous recommande ce film !', 'fr', ["recommande"], ["je vous"], 'je vous film !'),
        ('je vous recommande ce film !', 'fr', ["recommande"], ["recommande ce film"], 'recommande ce film !')
    ],
)
def test_remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords, expected_output):
    """Check stopword removal with custom and ignored stopword lists."""
    result = remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords)
    np.testing.assert_array_equal(result, expected_output)


Expand Down