diff --git a/README.md b/README.md index 5d000c9..5e389b8 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,6 @@ print(example) # Make HTML documentation - In order to make the html Sphinx documentation, you need to run at the nlpretext root path: `sphinx-apidoc -f nlpretext -o docs/` This will generate the .rst files. @@ -184,3 +183,19 @@ You can now open the file index.html located in the build folder. ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. │ generated with `pip freeze > requirements.txt` └── pylintrc <- The linting configuration file + + +# Credits + +- [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions: + - `fix_bad_unicode` + - `normalize_whitespace` + - `unpack_english_contractions` + - `replace_urls` + - `replace_emails` + - `replace_numbers` + - `replace_currency_symbols` + - `remove_punct` + - `remove_accents` + - `replace_phone_numbers` *(with some modifications of our own)* + diff --git a/VERSION b/VERSION index e6d5cb8..21e8796 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0.2 \ No newline at end of file +1.0.3 diff --git a/nlpretext/_config/constants.py b/nlpretext/_config/constants.py index d7c4b20..218cc17 100644 --- a/nlpretext/_config/constants.py +++ b/nlpretext/_config/constants.py @@ -15,6 +15,7 @@ # limitations under the License """ Collection of regular expressions and other (small, generally useful) constants. +Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy """ from __future__ import unicode_literals diff --git a/nlpretext/basic/preprocess.py b/nlpretext/basic/preprocess.py index f7eedf2..3aa942a 100644 --- a/nlpretext/basic/preprocess.py +++ b/nlpretext/basic/preprocess.py @@ -28,6 +28,11 @@ def normalize_whitespace(text) -> str: """ + ---- + Copyright 2016 Chartbeat, Inc.
+ Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Given ``text`` str, replace one or more spacings with a single space, and one or more linebreaks with a single newline. Also strip leading/trailing whitespace. @@ -106,6 +111,11 @@ def remove_eol_characters(text) -> str: def fix_bad_unicode(text, normalization: str = "NFC") -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Fix unicode text that's "broken" using `ftfy `_; this includes mojibake, HTML entities and other code cruft, @@ -133,6 +143,11 @@ def fix_bad_unicode(text, normalization: str = "NFC") -> str: def unpack_english_contractions(text) -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Replace *English* contractions in ``text`` str with their unshortened forms. N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), @@ -173,6 +188,11 @@ def unpack_english_contractions(text) -> str: def replace_urls(text, replace_with: str = "*URL*") -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Replace all URLs in ``text`` str with ``replace_with`` str. Parameters @@ -193,6 +213,11 @@ def replace_urls(text, replace_with: str = "*URL*") -> str: def replace_emails(text, replace_with="*EMAIL*") -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Replace all emails in ``text`` str with ``replace_with`` str Parameters @@ -213,6 +238,11 @@ def replace_phone_numbers(text, country_to_detect: list, replace_with: str = "*PHONE*", method: str = "regex") -> str: """ + ---- + Copyright 2016 Chartbeat, Inc.
+ Inspired code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Replace all phone numbers in ``text`` str with ``replace_with`` str Parameters @@ -249,6 +279,11 @@ def replace_phone_numbers(text, country_to_detect: list, def replace_numbers(text, replace_with="*NUMBER*") -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Replace all numbers in ``text`` str with ``replace_with`` str. Parameters @@ -267,6 +302,11 @@ def replace_numbers(text, replace_with="*NUMBER*") -> str: def replace_currency_symbols(text, replace_with=None) -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str. @@ -294,6 +334,11 @@ def replace_currency_symbols(text, replace_with=None) -> str: def remove_punct(text, marks=None) -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Remove punctuation from ``text`` by replacing all instances of ``marks`` with whitespace. @@ -327,6 +372,11 @@ def remove_punct(text, marks=None) -> str: def remove_accents(text, method: str = "unicode") -> str: """ + ---- + Copyright 2016 Chartbeat, Inc. + Code from textacy: https://github.com/chartbeat-labs/textacy + ---- + Remove accents from any accented unicode characters in ``text`` str, either by transforming them into ascii equivalents or removing them entirely.
diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index a11ba16..6c9db84 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -190,7 +190,7 @@ def test_get_stopwords(): @pytest.mark.parametrize( "input_tokens, lang, expected_output", [ - (['I', 'like', 'when', 'you', 'move', 'your', 'body', '!'], "en", ['I', 'move', 'body', '!']) + (['I', 'like', 'this', 'song', 'very', 'much', '!'], "en", ['I', 'song', '!']) ], ) def test_remove_stopwords_tokens(input_tokens, lang, expected_output): @@ -201,7 +201,7 @@ def test_remove_stopwords_tokens(input_tokens, lang, expected_output): @pytest.mark.parametrize( "input_text, lang, expected_output", [ - ('I like when you move your body !', 'en', 'I move body !'), + ('I like this song very much !', 'en', 'I song !'), ('Can I get a beer?', 'en', 'Can I beer ?'), ('Je vous recommande ce film !', 'fr', 'Je recommande film !'), ('je vous recommande ce film !', 'fr', 'recommande film !'), @@ -216,7 +216,7 @@ def test_remove_stopwords_text(input_text, lang, expected_output): @pytest.mark.parametrize( "input_text, lang, custom_stopwords, expected_output", [ - ('I like when you move your body !', 'en', ['body'], 'I move !'), + ('I like this song very much !', 'en', ['song'], 'I !'), ('Je vous recommande ce film la scène de fin est géniale !', 'fr', ['film', 'scène'], 'Je recommande fin géniale !'), ], @@ -249,7 +249,6 @@ def test_remove_accents(): ('proportienelle', 'proportienelle'), ('Pour plus de démocratie participative', 'Pour plus de démocratie participative'), ('Transparence de la vie public', 'Transparence de la vie public'), - ('18 mois de trop....ca suffit macron', '18 mois de trop....ca suffit macron'), ('Egalité devant les infractions routières', 'Egalité devant les infractions routières')],) def test_fix_bad_unicode(input_str, expected_str): result = fix_bad_unicode(input_str) @@ -287,14 +286,13 @@ def test_unpack_english_contractions(input_str, expected_str):
@pytest.mark.parametrize( "input_str, expected_str", [( - "Wan't to contribute to Nautilus? read https://github.com/artefactory/nautilus-nlp/blob/docs/CONTRIBUTING.md"\ + "Wan't to contribute to NLPretext? read https://github.com/artefactory/NLPretext/blob/master/CONTRIBUTING.md"\ " first", - "Wan't to contribute to Nautilus? read *URL* first"), - ("The ip address of my VM is http://34.76.182.5:8888", "The ip address of my VM is *URL*"), + "Wan't to contribute to NLPretext? read *URL* first"), ("If you go to http://internet.org, you will find a website hosted by FB.", "If you go to *URL*, you will find a website hosted by FB."), - ("Ishttps://waaaou.com/ available?", 'Is*URL* available?'), - ("mailto:hugo.vasselin@artefact.com", '*URL*')]) + ("Ishttps://internet.org/ available?", 'Is*URL* available?'), + ("mailto:john.doe@artefact.com", '*URL*')]) def test_replace_urls(input_str, expected_str): result = replace_urls(input_str) np.testing.assert_equal(result, expected_str) @@ -303,10 +301,9 @@ def test_replace_urls(input_str, expected_str): @pytest.mark.parametrize( "input_str, expected_str", [ - ("my email:hugo.vasselin@artefact.com", "my email:*EMAIL*"), + ("my email:john.doe@artefact.com", "my email:*EMAIL*"), ("v543143@nwytg.net is a temporary email", "*EMAIL* is a temporary email"), - ("our emails used to be name.surname@artefact.is", "our emails used to be *EMAIL*"), - ("chaudasse_du_13@hotmail.fr,C ton email bb?", '*EMAIL*,C ton email bb?') + ("our emails used to be name.surname@artefact.is", "our emails used to be *EMAIL*") ] ) def test_replace_emails(input_str, expected_str): @@ -317,17 +314,17 @@ def test_replace_emails(input_str, expected_str): @pytest.mark.parametrize( "input_str, expected_str", [ - ("mon 06 bb: 0625093267", "mon 06 bb: *PHONE*"), - ("mon 06 bb: 06.25.09.32.67", "mon 06 bb: *PHONE*"), - ("call me at +33625093267", "call me at *PHONE*"), - ("call me at +33 6 25 09 32 67", "call me at *PHONE*"), - ("call me at +33 625 093 267", "call me at 
*PHONE*"), - ("if this unit test doesn't work, call 3615 and says 'ROBIN'", - "if this unit test doesn't work, call *PHONE* and says 'ROBIN'"), - ('(541) 754-3010 is a US. Phone', '*PHONE* is a US. Phone'), - ('+1-541-754-3010 is an international Phone', '*PHONE* is an international Phone'), - ('+1-541-754-3010 Dialed in the US', '*PHONE* Dialed in the US'), - ('+1-541-754-3010 Dialed from Germany', '*PHONE* Dialed from Germany') + ("mon 06: 0601020304", "mon 06: *PHONE*"), + ("mon 06: 06.01.02.03.04", "mon 06: *PHONE*"), + ("call me at +33601020304", "call me at *PHONE*"), + ("call me at +33 6 01 02 03 04", "call me at *PHONE*"), + ("call me at +33 601 020 304", "call me at *PHONE*"), + ("if this unit test doesn't work, call 3615 and says 'HELP'", + "if this unit test doesn't work, call *PHONE* and says 'HELP'"), + ('(541) 754-0000 is a US. Phone', '*PHONE* is a US. Phone'), + ('+1-541-754-0000 is an international Phone', '*PHONE* is an international Phone'), + ('+1-541-754-0000 Dialed in the US', '*PHONE* Dialed in the US'), + ('+1-541-754-0000 Dialed from Germany', '*PHONE* Dialed from Germany') ] ) def test_replace_phone_numbers(input_str, expected_str): @@ -343,9 +340,8 @@ def test_replace_phone_numbers(input_str, expected_str): "input_str, expected_str", [ ("123, 3 petits chats", "*NUMBER*, *NUMBER* petits chats"), - ("l0ve 2 twa <3", "l0ve *NUMBER* twa <*NUMBER*"), ("Give me 45bucks!", "Give me *NUMBER*bucks!"), - ("call me at +33625093267", "call me at *NUMBER*") + ("call me at +33601020304", "call me at *NUMBER*") ] ) def test_replace_numbers(input_str, expected_str): @@ -384,9 +380,9 @@ def test_replace_currency_symbols(input_str, param, expected_str): ("Seriously.,.", '.,;', "Seriously "), ("Seriously...", '.,;', "Seriously "), ("Seriously.!.", '.,;', "Seriously ! 
"), - ("hugo.vasselin@artefact.com", '.,;', "hugo vasselin@artefact com"), - ("hugo.vasselin@artefact.com", None, "hugo vasselin artefact com"), - ("hugo-vasselin@artefact.com", None, "hugo vasselin artefact com") + ("john.doe@artefact.com", '.,;', "john doe@artefact com"), + ("john.doe@artefact.com", None, "john doe artefact com"), + ("john-doe@artefact.com", None, "john doe artefact com") ] ) def test_remove_punct(input_str, param, expected_str): @@ -397,27 +393,26 @@ def test_remove_punct(input_str, param, expected_str): @pytest.mark.parametrize( "input_str, expected_str", [ - ("👉👌", ""), + ("⚽👌", ""), ("🎅🏿⌚", ""), - ("🥖✊💦", ""), + ("🥖🍷🇫🇷", ""), ("✊", ""), - ("J'espère que les 🚓 vont pas lire ce test", - "J'espère que les vont pas lire ce test"), - ("J'espère que les vont pas lire ce test🚓", - "J'espère que les vont pas lire ce test") + ("Save 🐼 and 🐟", + "Save and "), ] ) def test_remove_emoji(input_str, expected_str): result = remove_emoji(input_str) - np.testing.assert_equal(result, expected_str) + assert len(result) == len(expected_str) + assert result == expected_str @pytest.mark.parametrize( "input_str, expected_str", [ - ("👉👌", ":backhand_index_pointing_right::OK_hand:"), + ("⚽️👌", ":soccer_ball::OK_hand:"), ("🎅🏿⌚", ":Santa_Claus_dark_skin_tone::watch:"), - ("🥖✊💦", ":baguette_bread::raised_fist::sweat_droplets:"), + ("🥖🍷🇫🇷", ":baguette_bread::wine_glass::France:"), ("✊", ":raised_fist:") ] )