ci: switch to ruff linter
julesbertrand committed Nov 9, 2023
1 parent ec1f2a0 commit 23d14b8
Showing 26 changed files with 137 additions and 548 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -40,7 +40,7 @@ jobs:

steps:
- uses: actions/checkout@v2

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -75,9 +75,9 @@ jobs:
run: |
STRICT=1 make check-safety
- name: Run style checks
- name: Lint and format
run: |
STRICT=1 make check-style
make format-code
- name: Run tests
run: |
1 change: 1 addition & 0 deletions .gitignore
@@ -171,6 +171,7 @@ coverage.xml
*.cover
.hypothesis/
.pytest_cache/
.ruff_cache/

# Translations
*.mo
27 changes: 14 additions & 13 deletions .pre-commit-config.yaml
@@ -4,13 +4,14 @@ default_language_version:

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
rev: v4.5.0
hooks:
- id: check-yaml
stages: [commit, push]
- id: end-of-file-fixer
stages: [commit, push]
exclude: ".ipynb"
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-toml
- id: check-json
- id: check-added-large-files

- repo: local
hooks:
@@ -22,7 +23,7 @@ repos:
stages: [commit, push]
- id: pyupgrade
name: pyupgrade
entry: poetry run pyupgrade --py37-plus
entry: poetry run pyupgrade --py38-plus
types: [python]
language: system
stages: [commit, push]
@@ -32,19 +33,19 @@ repos:
types: [python]
language: system
stages: [commit, push]
- id: ruff
        name: ruff
entry: poetry run ruff check --config pyproject.toml
types: [python]
language: system
stages: [commit, push]
- id: mypy
name: mypy
entry: poetry run mypy
require_serial: true
types: [python]
language: system
stages: [push]
- id: darglint
name: darglint
entry: poetry run darglint -v 2
types: [python]
language: system
stages: [push]
- id: gitleaks
name: gitleaks
entry: make gitleaks
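The new `ruff` pre-commit hook above is invoked with `--config pyproject.toml`. The ruff section of that file is not part of this diff; as a sketch only, a minimal configuration could look like the following (rule selection, line length, and target version are illustrative assumptions, not the project's actual settings):

```toml
# Hypothetical [tool.ruff] section -- the project's real settings live in
# pyproject.toml and are not shown in this commit.
[tool.ruff]
line-length = 100
target-version = "py38"
# E/W: pycodestyle, F: pyflakes, I: import sorting (replaces a separate isort run)
select = ["E", "W", "F", "I"]
```

A single `[tool.ruff]` table like this is what lets one hook replace the several per-tool style checks that the Makefile previously wired together.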
17 changes: 5 additions & 12 deletions Makefile
@@ -87,7 +87,7 @@ download-poetry:

.PHONY: install
install:
poetry env use python3.8
poetry env use python3.10
poetry lock -n
poetry install -n
ifneq ($(NO_PRE_COMMIT), 1)
@@ -96,7 +96,7 @@ endif

.PHONY: check-safety
check-safety:
poetry check$(POETRY_COMMAND_FLAG) && \
@poetry check$(POETRY_COMMAND_FLAG) && \
poetry run pip check$(PIP_COMMAND_FLAG) && \
poetry run safety check --full-report$(SAFETY_COMMAND_FLAG) && \
poetry run bandit -r nlpretext/$(BANDIT_COMMAND_FLAG)
@@ -106,23 +106,16 @@ gitleaks:
commits="$$(git rev-list --ancestry-path $$(git rev-parse $$(git branch -r --sort=committerdate | tail -1))..$$(git rev-parse HEAD))"; \
if [ "$${commits}" != "" ]; then docker run --rm -v $$(pwd):/code/ zricethezav/gitleaks --path=/code/ -v --commits=$$(echo $${commits} | paste -s -d, -)$(SECRETS_COMMAND_FLAG); fi;

.PHONY: check-style
check-style:
poetry run black --config pyproject.toml --diff --check ./$(BLACK_COMMAND_FLAG) && \
poetry run darglint -v 2 **/*.py$(DARGLINT_COMMAND_FLAG) && \
poetry run isort --settings-path pyproject.toml --check-only **/*.py$(ISORT_COMMAND_FLAG) && \
poetry run mypy --config-file setup.cfg nlpretext tests/**.py$(MYPY_COMMAND_FLAG)

.PHONY: format-code
format-code:
poetry run pre-commit run
@poetry run pre-commit run --all

.PHONY: test
test:
poetry run pytest
@poetry run pytest

.PHONY: lint
lint: test check-safety check-style
lint: check-safety format-code test

# Example: make docker VERSION=latest
# Example: make docker IMAGE=some_name VERSION=1.0.4
8 changes: 4 additions & 4 deletions README.md
@@ -25,14 +25,14 @@ All the goto functions you need to handle NLP use-cases, integrated in NLPretext
# TL;DR


> *Working on an NLP project and tired of always looking for the same silly preprocessing functions on the web?* :tired_face:
> *Need to efficiently extract email addresses from a document? Hashtags from tweets? Remove accents from a French post?* :disappointed_relieved:

**NLPretext got you covered!** :rocket:

NLPretext packages in a **unique** library all the text **preprocessing** functions you need to **ease** your NLP project.


:mag: Quickly explore below our preprocessing pipelines and individual functions reference.
@@ -215,7 +215,7 @@ print(example)

## Data augmentation <a name="data_augmentation"></a>

The augmentation module helps you to **generate new texts** based on your given examples by modifying some words in the initial ones and to **keep associated entities unchanged**, if any, in the case of **NER tasks**. If you want words other than entities to remain unchanged, you can specify it within the `stopwords` argument. Modifications depend on the chosen method, the ones currently supported by the module are **substitutions with synonyms** using Wordnet or BERT from the [`nlpaug`](https://github.com/makcedward/nlpaug) library.

```python
from nlpretext.augmentation.text_augmentation import augment_text
@@ -292,7 +292,7 @@ This project is licensed under the terms of the `Apache Software License 2.0` li
│   └── _utils <- Where preprocessing utils scripts lives
├── tests <- Where the tests lives
├── pyproject.toml <- Package configuration
├── poetry.lock
└── setup.cfg <- Configuration for plugins and other utils

# Credits
2 changes: 1 addition & 1 deletion datasets/external/get_stanfordtweets.sh
@@ -15,6 +15,6 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#!/bin/bash
wget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip trainingandtestdata.zip
mkdir -p tweets_sentiment && cp trainingandtestdata.zip tweets_sentiment && cd tweets_sentiment && unzip trainingandtestdata.zip
2 changes: 1 addition & 1 deletion docs/source/tutorials/basic_notebook.ipynb
@@ -118,4 +118,4 @@
},
"nbformat": 4,
"nbformat_minor": 1
}
2 changes: 1 addition & 1 deletion nlpretext/__init__.py
@@ -19,7 +19,7 @@
# mypy: disable-error-code="attr-defined"
# mypy: disable-error-code="assignment"

"""All the goto functions you need to handle NLP use-cases, integrated in NLPretext"""
"""All the goto functions you need to handle NLP use-cases, integrated in NLPretext."""

from importlib.metadata import PackageNotFoundError, version

4 changes: 2 additions & 2 deletions nlpretext/_config/constants.py
@@ -16,7 +16,7 @@

"""
Collection of regular expressions and other (small, generally useful) constants.
Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy
Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy.
"""
import re
import sys
@@ -136,7 +136,7 @@
flags=re.IGNORECASE | re.UNICODE,
)
PHONE_REGEX = re.compile(
r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))" # noqa: E501
)
NUMBERS_REGEX = re.compile(
r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|"
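The `PHONE_REGEX` above (now tagged `# noqa: E501` so ruff's line-length rule ignores it) can be exercised in isolation; a quick sketch, with an arbitrary example string:

```python
import re

# PHONE_REGEX as defined in nlpretext/_config/constants.py (shown in the diff above),
# split across two raw strings purely for readability here.
PHONE_REGEX = re.compile(
    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})"
    r"(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
)

match = PHONE_REGEX.search("Call us at +1 (555) 123-4567 ext. 89 for details.")
print(match.group(0) if match else None)  # → "+1 (555) 123-4567 ext. 89"
```

The optional groups cover a country prefix, an area code, the local number, and an extension, so the same pattern matches formats like `555-123-4567` or `(555) 123 4567 x42` as well.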
2 changes: 1 addition & 1 deletion nlpretext/_utils/daskloader.py
@@ -1,5 +1,5 @@
# mypy: disable-error-code="attr-defined"
from typing import Any, List, Union
from typing import List, Union

import dask.bag as db
import dask.dataframe as dd
6 changes: 3 additions & 3 deletions nlpretext/_utils/file_loader.py
@@ -25,7 +25,7 @@

def detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100) -> str:
"""
Predict a file's encoding using chardet
Predict a file's encoding using chardet.
Parameters
----------
@@ -50,7 +50,7 @@ def detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100)

def check_text_file_format(filepath: Union[str, List[str]]) -> str:
"""
Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt
Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt.
Parameters
----------
@@ -71,7 +71,7 @@ def check_text_file_format(filepath: Union[str, List[str]]) -> str:
raise ValueError(f"Multiple file formats found in file path list: {format_list}")
if None in format_re_list:
raise ValueError(
"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted"
"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted" # noqa: E501
)
file_format = format_list[0]
return file_format
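The diff only touches the docstring and error message of `check_text_file_format`, and the body is truncated by the viewer. As an illustration of the behaviour those messages describe, here is a hedged re-sketch (the real implementation may differ, e.g. it appears to use regex matching and could return the format without the leading dot):

```python
import os
from typing import List, Union

ACCEPTED_FORMATS = (".csv", ".json", ".parquet", ".txt")


def detect_format(filepath: Union[str, List[str]]) -> str:
    """Illustrative sketch, not nlpretext's actual code: return the single
    extension shared by all paths, or raise ValueError as described above."""
    paths = [filepath] if isinstance(filepath, str) else filepath
    formats = {os.path.splitext(p)[1] for p in paths}
    if any(fmt not in ACCEPTED_FORMATS for fmt in formats):
        raise ValueError(
            "Unrecognized format among specified files,"
            " only .csv, .json, .parquet and .txt accepted"
        )
    if len(formats) > 1:
        raise ValueError(f"Multiple file formats found in file path list: {sorted(formats)}")
    return formats.pop()


print(detect_format(["a.csv", "b.csv"]))  # → .csv
```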
6 changes: 3 additions & 3 deletions nlpretext/_utils/phone_number.py
@@ -24,7 +24,7 @@
def find_phone_numbers(string: str, region_code: Optional[str] = None) -> List[str]:
"""
Python port of Google's libphonenumber.
    https://github.com/daviddrysdale/python-phonenumbers
    https://github.com/daviddrysdale/python-phonenumbers.
Parameters
----------
@@ -78,7 +78,7 @@ def extract_phone_numbers(text: str, countrylist: List[Optional[str]]) -> List[s
class PhoneParser:
"""
Python port of Google's libphonenumber.
    https://github.com/daviddrysdale/python-phonenumbers
    https://github.com/daviddrysdale/python-phonenumbers.
"""

def __init__(self):
@@ -98,7 +98,7 @@ def parse_number(
self, text: str, region_code: Optional[str] = None
) -> Optional[_phonenumbers.PhoneNumber]:
"""
Extract phone number from text
Extract phone number from text.
Parameters
----------
8 changes: 3 additions & 5 deletions nlpretext/_utils/stopwords.py
@@ -22,8 +22,7 @@


def get_stopwords(lang: str = "en") -> List[str]:
"""
Inputs a language code, returns a list of stopwords for the specified language
"""Input a language code, returns a list of stopwords for the specified language.
Parameters
----------
@@ -61,9 +60,8 @@ def get_stopwords(lang: str = "en") -> List[str]:
stopwords += custom_stopwords[lang]
else:
raise ValueError(
"Language not available yet or incorrect country code. Supported languages: {}".format(
supported_lang
)
"Language not available yet or incorrect country code."
f" Supported languages: {supported_lang}"
)
else:
raise ValueError('Please input a valid country code, in 2 letters. Eg. "us" for USA. ')
15 changes: 9 additions & 6 deletions nlpretext/augmentation/text_augmentation.py
@@ -7,11 +7,11 @@
import nlpaug.augmenter.word as naw


class CouldNotAugment(ValueError):
class CouldNotAugment(ValueError): # noqa: D101
pass


class UnavailableAugmenter(ValueError):
class UnavailableAugmenter(ValueError): # noqa: D101
pass


@@ -66,6 +66,7 @@ def process_entities_and_text(
"""
Given a list of initial entities, verify that they have not been altered by
the data augmentation operation and are still in the augmented text.
Parameters
----------
entities: list
Expand All @@ -85,6 +86,7 @@ def process_entities_and_text(
initial text
augmented_text: str
new text resulting of data augmentation operation
Returns
-------
Augmented text and entities with their updated position in augmented text
@@ -167,7 +169,7 @@ def get_augmented_entities(
sentence_augmented: str, entities: List[Tuple[str, Any]]
) -> List[Dict[str, Any]]:
"""
Get entities with updated positions (start and end) in augmented text
Get entities with updated positions (start and end) in augmented text.
Parameters
----------
@@ -209,7 +211,7 @@ get_augmented_entities(

def clean_sentence_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Paired entities check to remove nested entities, the longest entity is kept
Paired entities check to remove nested entities, the longest entity is kept.
Parameters
----------
@@ -253,7 +255,7 @@ def check_interval_included(
element1: Dict[str, Any], element2: Dict[str, Any]
) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
"""
Comparison of two entities on start and end positions to find if they are nested
Comparison of two entities on start and end positions to find if they are nested.
Parameters
----------
Expand All @@ -269,7 +271,8 @@ def check_interval_included(
Returns
-------
If there is an entity to remove among the two returns a tuple (element to remove, element to keep)
If there is an entity to remove among the two returns a tuple
(element to remove, element to keep).
If not, returns None
"""
if (
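The body of `check_interval_included` is cut off by the diff viewer. A minimal sketch of the nested-span comparison its docstring describes (the `start`/`end` field names are assumptions drawn from the surrounding code; the real function may apply extra criteria):

```python
from typing import Any, Dict, Optional, Tuple


def check_interval_included(
    element1: Dict[str, Any], element2: Dict[str, Any]
) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
    """Illustrative sketch: if one entity's [start, end] span is nested inside
    the other's, return (entity to remove, entity to keep); otherwise None."""
    if element1["start"] >= element2["start"] and element1["end"] <= element2["end"]:
        return element1, element2  # element1 nested in element2: keep the longer one
    if element2["start"] >= element1["start"] and element2["end"] <= element1["end"]:
        return element2, element1
    return None
```

Used pairwise over a sentence's entities, such a check implements the "remove nested entities, the longest entity is kept" rule that `clean_sentence_entities` documents.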