Skip to content

Commit

Permalink
Merge pull request #150 from artefactory/fix/credits
Browse files Browse the repository at this point in the history
Fix/credits
  • Loading branch information
hugovasselin authored May 10, 2021
2 parents aef41e4 + 45f00dc commit ce13b04
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 39 deletions.
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ print(example)

# Make HTML documentation


In order to build the HTML Sphinx documentation, run the following command at the nlpretext root path:
`sphinx-apidoc -f nlpretext -o docs/`
This will generate the .rst files.
Expand Down Expand Up @@ -184,3 +183,19 @@ You can now open the file index.html located in the build folder.
├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g.
│ generated with `pip freeze > requirements.txt`
└── pylintrc <- The linting configuration file


# Credits

- [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions:
- `fix_bad_unicode`
- `normalize_whitespace`
- `unpack_english_contractions`
- `replace_urls`
- `replace_emails`
- `replace_numbers`
- `replace_currency_symbols`
- `remove_punct`
- `remove_accents`
- `replace_phone_numbers` *(with some modifications of our own)*

2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.2
1.0.3
1 change: 1 addition & 0 deletions nlpretext/_config/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License
"""
Collection of regular expressions and other (small, generally useful) constants.
Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy
"""
from __future__ import unicode_literals

Expand Down
50 changes: 50 additions & 0 deletions nlpretext/basic/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@

def normalize_whitespace(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Given ``text`` str, replace one or more spacings with a single space, and
one or more linebreaks with a single newline. Also strip leading/trailing
whitespace.
Expand Down Expand Up @@ -106,6 +111,11 @@ def remove_eol_characters(text) -> str:

def fix_bad_unicode(text, normalization: str = "NFC") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Fix unicode text that's "broken" using `ftfy
<http://ftfy.readthedocs.org/>`_;
this includes mojibake, HTML entities and other code cruft,
Expand Down Expand Up @@ -133,6 +143,11 @@ def fix_bad_unicode(text, normalization: str = "NFC") -> str:

def unpack_english_contractions(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace *English* contractions in ``text`` str with their unshortened
forms.
N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
Expand Down Expand Up @@ -173,6 +188,11 @@ def unpack_english_contractions(text) -> str:

def replace_urls(text, replace_with: str = "*URL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all URLs in ``text`` str with ``replace_with`` str.
Parameters
Expand All @@ -193,6 +213,11 @@ def replace_urls(text, replace_with: str = "*URL*") -> str:

def replace_emails(text, replace_with="*EMAIL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all emails in ``text`` str with ``replace_with`` str
Parameters
Expand All @@ -213,6 +238,11 @@ def replace_phone_numbers(text, country_to_detect: list,
replace_with: str = "*PHONE*",
method: str = "regex") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Inspired code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all phone numbers in ``text`` str with ``replace_with`` str
Parameters
Expand Down Expand Up @@ -249,6 +279,11 @@ def replace_phone_numbers(text, country_to_detect: list,

def replace_numbers(text, replace_with="*NUMBER*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all numbers in ``text`` str with ``replace_with`` str.
Parameters
Expand All @@ -267,6 +302,11 @@ def replace_numbers(text, replace_with="*NUMBER*") -> str:

def replace_currency_symbols(text, replace_with=None) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all currency symbols in ``text`` str with string specified by
``replace_with`` str.
Expand Down Expand Up @@ -294,6 +334,11 @@ def replace_currency_symbols(text, replace_with=None) -> str:

def remove_punct(text, marks=None) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Remove punctuation from ``text`` by replacing all instances of ``marks``
with whitespace.
Expand Down Expand Up @@ -327,6 +372,11 @@ def remove_punct(text, marks=None) -> str:

def remove_accents(text, method: str = "unicode") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Remove accents from any accented unicode characters in ``text`` str,
either by transforming them into ascii equivalents or removing them
entirely.
Expand Down
69 changes: 32 additions & 37 deletions tests/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_get_stopwords():
@pytest.mark.parametrize(
"input_tokens, lang, expected_output",
[
(['I', 'like', 'when', 'you', 'move', 'your', 'body', '!'], "en", ['I', 'move', 'body', '!'])
(['I', 'like', 'this', 'song', 'very', 'much', '!'], "en", ['I', 'song', '!'])
],
)
def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
Expand All @@ -201,7 +201,7 @@ def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
@pytest.mark.parametrize(
"input_text, lang, expected_output",
[
('I like when you move your body !', 'en', 'I move body !'),
('I like this song very much !', 'en', 'I song !'),
('Can I get a beer?', 'en', 'Can I beer ?'),
('Je vous recommande ce film !', 'fr', 'Je recommande film !'),
('je vous recommande ce film !', 'fr', 'recommande film !'),
Expand All @@ -216,7 +216,7 @@ def test_remove_stopwords_text(input_text, lang, expected_output):
@pytest.mark.parametrize(
"input_text, lang, custom_stopwords, expected_output",
[
('I like when you move your body !', 'en', ['body'], 'I move !'),
('I like this song very much !', 'en', ['song'], 'I !'),
('Je vous recommande ce film la scène de fin est géniale !', 'fr',
['film', 'scène'], 'Je recommande fin géniale !'),
],
Expand Down Expand Up @@ -249,7 +249,6 @@ def test_remove_accents():
('proportienelle', 'proportienelle'),
('Pour plus de démocratie participative', 'Pour plus de démocratie participative'),
('Transparence de la vie public', 'Transparence de la vie public'),
('18 mois de trop....ca suffit macron', '18 mois de trop....ca suffit macron'),
('Egalité devant les infractions routières', 'Egalité devant les infractions routières')],)
def test_fix_bad_unicode(input_str, expected_str):
result = fix_bad_unicode(input_str)
Expand Down Expand Up @@ -287,14 +286,13 @@ def test_unpack_english_contractions(input_str, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[(
"Wan't to contribute to Nautilus? read https://github.com/artefactory/nautilus-nlp/blob/docs/CONTRIBUTING.md"\
"Wan't to contribute to NLPretext? read https://github.com/artefactory/NLPretext/blob/master/CONTRIBUTING.md"\
" first",
"Wan't to contribute to Nautilus? read *URL* first"),
("The ip address of my VM is http://34.76.182.5:8888", "The ip address of my VM is *URL*"),
"Wan't to contribute to NLPretext? read *URL* first"),
("If you go to http://internet.org, you will find a website hosted by FB.",
"If you go to *URL*, you will find a website hosted by FB."),
("Ishttps://waaaou.com/ available?", 'Is*URL* available?'),
("mailto:hugo.vasselin@artefact.com", '*URL*')])
("Ishttps://internet.org/ available?", 'Is*URL* available?'),
("mailto:john.doe@artefact.com", '*URL*')])
def test_replace_urls(input_str, expected_str):
result = replace_urls(input_str)
np.testing.assert_equal(result, expected_str)
Expand All @@ -303,10 +301,9 @@ def test_replace_urls(input_str, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[
("my email:hugo.vasselin@artefact.com", "my email:*EMAIL*"),
("my email:john.doe@artefact.com", "my email:*EMAIL*"),
("[email protected] is a temporary email", "*EMAIL* is a temporary email"),
("our emails used to be [email protected]", "our emails used to be *EMAIL*"),
("[email protected],C ton email bb?", '*EMAIL*,C ton email bb?')
("our emails used to be [email protected]", "our emails used to be *EMAIL*")
]
)
def test_replace_emails(input_str, expected_str):
Expand All @@ -317,17 +314,17 @@ def test_replace_emails(input_str, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[
("mon 06 bb: 0625093267", "mon 06 bb: *PHONE*"),
("mon 06 bb: 06.25.09.32.67", "mon 06 bb: *PHONE*"),
("call me at +33625093267", "call me at *PHONE*"),
("call me at +33 6 25 09 32 67", "call me at *PHONE*"),
("call me at +33 625 093 267", "call me at *PHONE*"),
("if this unit test doesn't work, call 3615 and says 'ROBIN'",
"if this unit test doesn't work, call *PHONE* and says 'ROBIN'"),
('(541) 754-3010 is a US. Phone', '*PHONE* is a US. Phone'),
('+1-541-754-3010 is an international Phone', '*PHONE* is an international Phone'),
('+1-541-754-3010 Dialed in the US', '*PHONE* Dialed in the US'),
('+1-541-754-3010 Dialed from Germany', '*PHONE* Dialed from Germany')
("mon 06: 0601020304", "mon 06: *PHONE*"),
("mon 06: 06.01.02.03.04", "mon 06: *PHONE*"),
("call me at +33601020304", "call me at *PHONE*"),
("call me at +33 6 01 02 03 04", "call me at *PHONE*"),
("call me at +33 601 020 304", "call me at *PHONE*"),
("if this unit test doesn't work, call 3615 and says 'HELP'",
"if this unit test doesn't work, call *PHONE* and says 'HELP'"),
('(541) 754-0000 is a US. Phone', '*PHONE* is a US. Phone'),
('+1-541-754-0000 is an international Phone', '*PHONE* is an international Phone'),
('+1-541-754-0000 Dialed in the US', '*PHONE* Dialed in the US'),
('+1-541-754-0000 Dialed from Germany', '*PHONE* Dialed from Germany')
]
)
def test_replace_phone_numbers(input_str, expected_str):
Expand All @@ -343,9 +340,8 @@ def test_replace_phone_numbers(input_str, expected_str):
"input_str, expected_str",
[
("123, 3 petits chats", "*NUMBER*, *NUMBER* petits chats"),
("l0ve 2 twa <3", "l0ve *NUMBER* twa <*NUMBER*"),
("Give me 45bucks!", "Give me *NUMBER*bucks!"),
("call me at +33625093267", "call me at *NUMBER*")
("call me at +33601020304", "call me at *NUMBER*")
]
)
def test_replace_numbers(input_str, expected_str):
Expand Down Expand Up @@ -384,9 +380,9 @@ def test_replace_currency_symbols(input_str, param, expected_str):
("Seriously.,.", '.,;', "Seriously "),
("Seriously...", '.,;', "Seriously "),
("Seriously.!.", '.,;', "Seriously ! "),
("hugo.vasselin@artefact.com", '.,;', "hugo vasselin@artefact com"),
("hugo.vasselin@artefact.com", None, "hugo vasselin artefact com"),
("hugo-vasselin@artefact.com", None, "hugo vasselin artefact com")
("john.doe@artefact.com", '.,;', "john doe@artefact com"),
("john.doe@artefact.com", None, "john doe artefact com"),
("john-doe@artefact.com", None, "john doe artefact com")
]
)
def test_remove_punct(input_str, param, expected_str):
Expand All @@ -397,27 +393,26 @@ def test_remove_punct(input_str, param, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[
("👉👌", ""),
("👌", ""),
("🎅🏿⌚", ""),
("🥖✊💦", ""),
("🥖🍷🇫🇷", ""),
("✊", ""),
("J'espère que les 🚓 vont pas lire ce test",
"J'espère que les vont pas lire ce test"),
("J'espère que les vont pas lire ce test🚓",
"J'espère que les vont pas lire ce test")
("Save 🐼 and 🐟",
"Save and "),
]
)
def test_remove_emoji(input_str, expected_str):
result = remove_emoji(input_str)
np.testing.assert_equal(result, expected_str)
assert len(result) == len(expected_str)
assert result == expected_str


@pytest.mark.parametrize(
"input_str, expected_str",
[
("👉👌", ":backhand_index_pointing_right::OK_hand:"),
("⚽️👌", ":soccer_ball::OK_hand:"),
("🎅🏿⌚", ":Santa_Claus_dark_skin_tone::watch:"),
("🥖✊💦", ":baguette_bread::raised_fist::sweat_droplets:"),
("🥖🍷🇫🇷", ":baguette_bread::wine_glass::France:"),
("✊", ":raised_fist:")
]
)
Expand Down

0 comments on commit ce13b04

Please sign in to comment.