Feature/cd #142

Merged · 16 commits · Feb 16, 2021
Changes from 12 commits
@@ -38,7 +38,9 @@ jobs:
      - name: Install requirements
        run: |
          python -m pip install --upgrade pip
-         pip install -r requirements.txt
+         pip install -r requirements_dev.txt
+         pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
+         pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
      - name: Run pylint
        run: |
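The spaCy model tarballs now install in the workflow step itself rather than through requirements.txt. A minimal sketch to confirm both pinned models resolve after this step — standard spaCy loading API; the printed versions are simply the expected pins:

```python
import spacy

# Both models are pinned for spaCy 2.3.x; spacy.load raises OSError if either is missing.
for model in ("en_core_web_sm", "fr_core_news_sm"):
    nlp = spacy.load(model)
    print(model, nlp.meta["version"])  # expect 2.3.1 and 2.3.0
```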
2 changes: 1 addition & 1 deletion README.md
@@ -88,7 +88,7 @@ print(text)
# "dinner life recommend"
```

-Take a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/feature/readme/nlpretext) in the ```preprocess.py``` scripts in the different folders: basic, social, token.
+Take a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/master) in the ```preprocess.py``` scripts in the different folders: basic, social, token.
Contributor review comment: Just change the link so it points to the nlpretext folder rather than the repo root: https://github.com/artefactory/NLPretext/tree/master/nlpretext/
# Individual Functions
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-0.9.1
+1.0.0
File renamed without changes.
2 changes: 1 addition & 1 deletion nlpretext/preprocessor.py
@@ -5,7 +5,7 @@

from nlpretext.social.preprocess import (
    remove_html_tags, remove_mentions, remove_emoji, remove_hashtag)
-from nlpretext.classic.preprocess import normalize_whitespace, remove_eol_characters, fix_bad_unicode
+from nlpretext.basic.preprocess import normalize_whitespace, remove_eol_characters, fix_bad_unicode


class Preprocessor():
2 changes: 1 addition & 1 deletion nlpretext/social/preprocess.py
@@ -21,7 +21,7 @@

import emoji as _emoji
from nlpretext._config import constants
-from nlpretext.classic.preprocess import normalize_whitespace
+from nlpretext.basic.preprocess import normalize_whitespace


def remove_mentions(text) -> str:
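Both modules now import from nlpretext.basic.preprocess instead of nlpretext.classic.preprocess. A minimal sketch of the renamed import path in use, combining the two functions visible in these hunks — the sample string and exact output are illustrative:

```python
from nlpretext.basic.preprocess import normalize_whitespace
from nlpretext.social.preprocess import remove_mentions

raw = "@bob   thanks for the   tip"
text = remove_mentions(raw)        # drop the @-handle
text = normalize_whitespace(text)  # collapse leftover runs of spaces
print(text)                        # e.g. "thanks for the tip"
```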
13 changes: 11 additions & 2 deletions nlpretext/token/preprocess.py
@@ -20,16 +20,22 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import re
+from nlpretext._utils.stopwords import get_stopwords


-def remove_stopwords(tokens, stopwords: list) -> str:
+def remove_stopwords(tokens: list, lang: str, custom_stopwords: list = None) -> str:
    """
    Remove stopwords from a text.
    eg. 'I like when you move your body !' -> 'I move body !'

    Parameters
    ----------
-    stopwords : list of stopwords to remove
+    tokens: list(str)
+        list of tokens
+    lang: str
+        language iso code (e.g : "en")
+    custom_stopwords : list(str)|None
+        list of custom stopwords to add. None by default

    Returns
    -------
@@ -41,6 +47,9 @@ def remove_stopwords(tokens, stopwords: list) -> str:
    ValueError
        When inputs is not a list
    """
+    stopwords = get_stopwords(lang)
+    if custom_stopwords:
+        stopwords += custom_stopwords
    tokens = [word for word in tokens if word not in stopwords]
    return tokens

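The reworked remove_stopwords resolves the stopword list from a language code and optionally extends it. A short usage sketch built from the docstring's own example; the custom-stopword call and its exact output are illustrative:

```python
from nlpretext.token.preprocess import remove_stopwords

tokens = ['I', 'like', 'when', 'you', 'move', 'your', 'body', '!']

# Built-in English stopwords only, as in the docstring example.
print(remove_stopwords(tokens, lang="en"))
# ['I', 'move', 'body', '!']

# Hypothetical custom list stacked on top of the built-in one.
print(remove_stopwords(tokens, lang="en", custom_stopwords=["body"]))
# e.g. ['I', 'move', '!']
```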
15 changes: 0 additions & 15 deletions requirements.txt
@@ -1,15 +1,3 @@
-# local package
-#-e .
-
-# external requirements
-coverage
-pillow
-pytest==6.1.1
-pytest-cov==2.10.1
-python-dotenv>=0.5.1
-Sphinx
-sphinx_rtd_theme
-
#library requirements
chardet==3.0.4
emoji>=0.5.2
@@ -24,8 +12,5 @@ pylint==2.4.4
regex==2019.8.19
sacremoses==0.0.13
scikit_learn==0.23.2
-setuptools==40.8.0
spacy==2.3.4
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
stop_words==2018.7.23
8 changes: 8 additions & 0 deletions requirements_dev.txt
@@ -0,0 +1,8 @@
+coverage==5.3
+pytest==6.1.1
+pytest-cov==2.10.1
+python-dotenv>=0.5.1
+Sphinx==3.2.1
+sphinx_rtd_theme==0.5.0
+setuptools==40.8.0
+-r requirements.txt
29 changes: 10 additions & 19 deletions setup.py
@@ -15,38 +15,29 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from setuptools import find_packages, setup
import setuptools
import setuptools.command.install
from pathlib import Path

class PostInstallCommand(setuptools.command.install.install):
    """Post-installation command."""
    def run(self):
        setuptools.command.install.install.run(self)
        try:
            import spacy
            spacy.cli.validate()
        except ModuleNotFoundError:
            pass


with open(Path(__file__).resolve().parent.joinpath('VERSION'), 'r') as fh:
    version = fh.read()
setup(

with open("requirements.txt", "r") as fr:
    requirements = [req for req in fr.read().splitlines() if not req.startswith("#")]

setuptools.setup(
    name='nlpretext',
    packages=find_packages(),
    packages=setuptools.find_packages(),
    scripts=["VERSION", "requirements.txt"],
    version=version,
    description='All the goto functions you need to handle NLP use-cases',
    author='Artefact',
    license='MIT',
    url='https://github.com/artefactory/nautilus-nlp',
    url='https://github.com/artefactory/NLPretext',
    install_requires=requirements,
    classifiers=[
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    cmdclass={
        'install': PostInstallCommand,
    },
)
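install_requires is now derived from requirements.txt at build time with a one-line filter. A self-contained sketch of what that comprehension keeps and drops — the sample input string is made up for illustration:

```python
# Mirrors the comprehension in setup.py: keep every line that is not a comment.
sample = "#library requirements\nchardet==3.0.4\nemoji>=0.5.2\nspacy==2.3.4"
requirements = [req for req in sample.splitlines() if not req.startswith("#")]
print(requirements)  # ['chardet==3.0.4', 'emoji>=0.5.2', 'spacy==2.3.4']
```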
13 changes: 6 additions & 7 deletions tests/test_preprocessor.py
@@ -17,14 +17,14 @@
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import pytest
import numpy as np
-from nlpretext.classic.preprocess import (
+from nlpretext.basic.preprocess import (
    normalize_whitespace, remove_eol_characters, fix_bad_unicode,
    unpack_english_contractions, replace_urls, replace_emails,
    replace_phone_numbers, replace_numbers, replace_currency_symbols,
    remove_punct, remove_accents, remove_multiple_spaces_and_strip_text,
    filter_non_latin_characters
)
-from nlpretext.classic.preprocess import (
+from nlpretext.basic.preprocess import (
    remove_stopwords as remove_stopwords_text
)
from nlpretext.social.preprocess import (
@@ -188,14 +188,13 @@ def test_get_stopwords():


@pytest.mark.parametrize(
"input_tokens, expected_output",
"input_tokens, lang, expected_output",
[
(['I', 'like', 'when', 'you', 'move', 'your', 'body', '!'], ['I', 'move', 'body', '!'])
(['I', 'like', 'when', 'you', 'move', 'your', 'body', '!'], "en", ['I', 'move', 'body', '!'])
],
)
def test_remove_stopwords_tokens(input_tokens, expected_output):
stopwords = get_stopwords('en')
result = remove_stopwords_token(input_tokens, stopwords)
def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
result = remove_stopwords_token(input_tokens, lang)
np.testing.assert_array_equal(result, expected_output)


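Because the parametrization now carries the language code, extra languages can be covered without precomputing stopword lists in the test body. A hypothetical sketch of such an extension — the import alias matches the test file's naming, and the French case assumes get_stopwords("fr") covers 'je' and 'une':

```python
import numpy as np
import pytest
from nlpretext.token.preprocess import remove_stopwords as remove_stopwords_token

@pytest.mark.parametrize(
    "input_tokens, lang, expected_output",
    [
        (['I', 'like', 'when', 'you', 'move', 'your', 'body', '!'], "en", ['I', 'move', 'body', '!']),
        # Hypothetical French case; exact coverage depends on the fr stopword list.
        (['je', 'mange', 'une', 'pomme'], "fr", ['mange', 'pomme']),
    ],
)
def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
    result = remove_stopwords_token(input_tokens, lang)
    np.testing.assert_array_equal(result, expected_output)
```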