Skip to content

Commit

Permalink
Merge pull request #150 from artefactory/fix/credits
Browse files Browse the repository at this point in the history
Fix/credits
  • Loading branch information
hugovasselin authored May 10, 2021
2 parents aef41e4 + 45f00dc commit ce13b04
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 39 deletions.
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ print(example)

# Make HTML documentation


In order to build the HTML Sphinx documentation, run the following command at the nlpretext root path:
`sphinx-apidoc -f nlpretext -o docs/`
This will generate the .rst files.
Expand Down Expand Up @@ -184,3 +183,19 @@ You can now open the file index.html located in the build folder.
├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g.
│ generated with `pip freeze > requirements.txt`
└── pylintrc <- The linting configuration file


# Credits

- [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions:
- `fix_bad_unicode`
- `normalize_whitespace`
- `unpack_english_contractions`
- `replace_urls`
- `replace_emails`
- `replace_numbers`
- `replace_currency_symbols`
- `remove_punct`
- `remove_accents`
- `replace_phone_numbers` *(with some modifications of our own)*

2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.2
1.0.3
1 change: 1 addition & 0 deletions nlpretext/_config/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License
"""
Collection of regular expressions and other (small, generally useful) constants.
Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy
"""
from __future__ import unicode_literals

Expand Down
50 changes: 50 additions & 0 deletions nlpretext/basic/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@

def normalize_whitespace(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Given ``text`` str, replace one or more spacings with a single space, and
one or more linebreaks with a single newline. Also strip leading/trailing
whitespace.
Expand Down Expand Up @@ -106,6 +111,11 @@ def remove_eol_characters(text) -> str:

def fix_bad_unicode(text, normalization: str = "NFC") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Fix unicode text that's "broken" using `ftfy
<http://ftfy.readthedocs.org/>`_;
this includes mojibake, HTML entities and other code cruft,
Expand Down Expand Up @@ -133,6 +143,11 @@ def fix_bad_unicode(text, normalization: str = "NFC") -> str:

def unpack_english_contractions(text) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace *English* contractions in ``text`` str with their unshortened
forms.
N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
Expand Down Expand Up @@ -173,6 +188,11 @@ def unpack_english_contractions(text) -> str:

def replace_urls(text, replace_with: str = "*URL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all URLs in ``text`` str with ``replace_with`` str.
Parameters
Expand All @@ -193,6 +213,11 @@ def replace_urls(text, replace_with: str = "*URL*") -> str:

def replace_emails(text, replace_with="*EMAIL*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all emails in ``text`` str with ``replace_with`` str
Parameters
Expand All @@ -213,6 +238,11 @@ def replace_phone_numbers(text, country_to_detect: list,
replace_with: str = "*PHONE*",
method: str = "regex") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Inspired code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all phone numbers in ``text`` str with ``replace_with`` str
Parameters
Expand Down Expand Up @@ -249,6 +279,11 @@ def replace_phone_numbers(text, country_to_detect: list,

def replace_numbers(text, replace_with="*NUMBER*") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all numbers in ``text`` str with ``replace_with`` str.
Parameters
Expand All @@ -267,6 +302,11 @@ def replace_numbers(text, replace_with="*NUMBER*") -> str:

def replace_currency_symbols(text, replace_with=None) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Replace all currency symbols in ``text`` str with string specified by
``replace_with`` str.
Expand Down Expand Up @@ -294,6 +334,11 @@ def replace_currency_symbols(text, replace_with=None) -> str:

def remove_punct(text, marks=None) -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Remove punctuation from ``text`` by replacing all instances of ``marks``
with whitespace.
Expand Down Expand Up @@ -327,6 +372,11 @@ def remove_punct(text, marks=None) -> str:

def remove_accents(text, method: str = "unicode") -> str:
"""
----
Copyright 2016 Chartbeat, Inc.
Code from textacy: https://github.com/chartbeat-labs/textacy
----
Remove accents from any accented unicode characters in ``text`` str,
either by transforming them into ascii equivalents or removing them
entirely.
Expand Down
69 changes: 32 additions & 37 deletions tests/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_get_stopwords():
@pytest.mark.parametrize(
"input_tokens, lang, expected_output",
[
(['I', 'like', 'when', 'you', 'move', 'your', 'body', '!'], "en", ['I', 'move', 'body', '!'])
(['I', 'like', 'this', 'song', 'very', 'much', '!'], "en", ['I', 'song', '!'])
],
)
def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
Expand All @@ -201,7 +201,7 @@ def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
@pytest.mark.parametrize(
"input_text, lang, expected_output",
[
('I like when you move your body !', 'en', 'I move body !'),
('I like this song very much !', 'en', 'I song !'),
('Can I get a beer?', 'en', 'Can I beer ?'),
('Je vous recommande ce film !', 'fr', 'Je recommande film !'),
('je vous recommande ce film !', 'fr', 'recommande film !'),
Expand All @@ -216,7 +216,7 @@ def test_remove_stopwords_text(input_text, lang, expected_output):
@pytest.mark.parametrize(
"input_text, lang, custom_stopwords, expected_output",
[
('I like when you move your body !', 'en', ['body'], 'I move !'),
('I like this song very much !', 'en', ['song'], 'I !'),
('Je vous recommande ce film la scène de fin est géniale !', 'fr',
['film', 'scène'], 'Je recommande fin géniale !'),
],
Expand Down Expand Up @@ -249,7 +249,6 @@ def test_remove_accents():
('proportienelle', 'proportienelle'),
('Pour plus de démocratie participative', 'Pour plus de démocratie participative'),
('Transparence de la vie public', 'Transparence de la vie public'),
('18 mois de trop....ca suffit macron', '18 mois de trop....ca suffit macron'),
('Egalité devant les infractions routières', 'Egalité devant les infractions routières')],)
def test_fix_bad_unicode(input_str, expected_str):
result = fix_bad_unicode(input_str)
Expand Down Expand Up @@ -287,14 +286,13 @@ def test_unpack_english_contractions(input_str, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[(
"Wan't to contribute to Nautilus? read https://github.com/artefactory/nautilus-nlp/blob/docs/CONTRIBUTING.md"\
"Wan't to contribute to NLPretext? read https://github.com/artefactory/NLPretext/blob/master/CONTRIBUTING.md"\
" first",
"Wan't to contribute to Nautilus? read *URL* first"),
("The ip address of my VM is http://34.76.182.5:8888", "The ip address of my VM is *URL*"),
"Wan't to contribute to NLPretext? read *URL* first"),
("If you go to http://internet.org, you will find a website hosted by FB.",
"If you go to *URL*, you will find a website hosted by FB."),
("Ishttps://waaaou.com/ available?", 'Is*URL* available?'),
("mailto:hugo.vasselin@artefact.com", '*URL*')])
("Ishttps://internet.org/ available?", 'Is*URL* available?'),
("mailto:john.doe@artefact.com", '*URL*')])
def test_replace_urls(input_str, expected_str):
result = replace_urls(input_str)
np.testing.assert_equal(result, expected_str)
Expand All @@ -303,10 +301,9 @@ def test_replace_urls(input_str, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[
("my email:hugo.vasselin@artefact.com", "my email:*EMAIL*"),
("my email:john.doe@artefact.com", "my email:*EMAIL*"),
("[email protected] is a temporary email", "*EMAIL* is a temporary email"),
("our emails used to be [email protected]", "our emails used to be *EMAIL*"),
("[email protected],C ton email bb?", '*EMAIL*,C ton email bb?')
("our emails used to be [email protected]", "our emails used to be *EMAIL*")
]
)
def test_replace_emails(input_str, expected_str):
Expand All @@ -317,17 +314,17 @@ def test_replace_emails(input_str, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[
("mon 06 bb: 0625093267", "mon 06 bb: *PHONE*"),
("mon 06 bb: 06.25.09.32.67", "mon 06 bb: *PHONE*"),
("call me at +33625093267", "call me at *PHONE*"),
("call me at +33 6 25 09 32 67", "call me at *PHONE*"),
("call me at +33 625 093 267", "call me at *PHONE*"),
("if this unit test doesn't work, call 3615 and says 'ROBIN'",
"if this unit test doesn't work, call *PHONE* and says 'ROBIN'"),
('(541) 754-3010 is a US. Phone', '*PHONE* is a US. Phone'),
('+1-541-754-3010 is an international Phone', '*PHONE* is an international Phone'),
('+1-541-754-3010 Dialed in the US', '*PHONE* Dialed in the US'),
('+1-541-754-3010 Dialed from Germany', '*PHONE* Dialed from Germany')
("mon 06: 0601020304", "mon 06: *PHONE*"),
("mon 06: 06.01.02.03.04", "mon 06: *PHONE*"),
("call me at +33601020304", "call me at *PHONE*"),
("call me at +33 6 01 02 03 04", "call me at *PHONE*"),
("call me at +33 601 020 304", "call me at *PHONE*"),
("if this unit test doesn't work, call 3615 and says 'HELP'",
"if this unit test doesn't work, call *PHONE* and says 'HELP'"),
('(541) 754-0000 is a US. Phone', '*PHONE* is a US. Phone'),
('+1-541-754-0000 is an international Phone', '*PHONE* is an international Phone'),
('+1-541-754-0000 Dialed in the US', '*PHONE* Dialed in the US'),
('+1-541-754-0000 Dialed from Germany', '*PHONE* Dialed from Germany')
]
)
def test_replace_phone_numbers(input_str, expected_str):
Expand All @@ -343,9 +340,8 @@ def test_replace_phone_numbers(input_str, expected_str):
"input_str, expected_str",
[
("123, 3 petits chats", "*NUMBER*, *NUMBER* petits chats"),
("l0ve 2 twa <3", "l0ve *NUMBER* twa <*NUMBER*"),
("Give me 45bucks!", "Give me *NUMBER*bucks!"),
("call me at +33625093267", "call me at *NUMBER*")
("call me at +33601020304", "call me at *NUMBER*")
]
)
def test_replace_numbers(input_str, expected_str):
Expand Down Expand Up @@ -384,9 +380,9 @@ def test_replace_currency_symbols(input_str, param, expected_str):
("Seriously.,.", '.,;', "Seriously "),
("Seriously...", '.,;', "Seriously "),
("Seriously.!.", '.,;', "Seriously ! "),
("hugo.vasselin@artefact.com", '.,;', "hugo vasselin@artefact com"),
("hugo.vasselin@artefact.com", None, "hugo vasselin artefact com"),
("hugo-vasselin@artefact.com", None, "hugo vasselin artefact com")
("john.doe@artefact.com", '.,;', "john doe@artefact com"),
("john.doe@artefact.com", None, "john doe artefact com"),
("john-doe@artefact.com", None, "john doe artefact com")
]
)
def test_remove_punct(input_str, param, expected_str):
Expand All @@ -397,27 +393,26 @@ def test_remove_punct(input_str, param, expected_str):
@pytest.mark.parametrize(
"input_str, expected_str",
[
("👉👌", ""),
("👌", ""),
("🎅🏿⌚", ""),
("🥖✊💦", ""),
("🥖🍷🇫🇷", ""),
("✊", ""),
("J'espère que les 🚓 vont pas lire ce test",
"J'espère que les vont pas lire ce test"),
("J'espère que les vont pas lire ce test🚓",
"J'espère que les vont pas lire ce test")
("Save 🐼 and 🐟",
"Save and "),
]
)
def test_remove_emoji(input_str, expected_str):
result = remove_emoji(input_str)
np.testing.assert_equal(result, expected_str)
assert len(result) == len(expected_str)
assert result == expected_str


@pytest.mark.parametrize(
"input_str, expected_str",
[
("👉👌", ":backhand_index_pointing_right::OK_hand:"),
("⚽️👌", ":soccer_ball::OK_hand:"),
("🎅🏿⌚", ":Santa_Claus_dark_skin_tone::watch:"),
("🥖✊💦", ":baguette_bread::raised_fist::sweat_droplets:"),
("🥖🍷🇫🇷", ":baguette_bread::wine_glass::France:"),
("✊", ":raised_fist:")
]
)
Expand Down

0 comments on commit ce13b04

Please sign in to comment.