Commit b877269: improve and fix linting

PhilipMay committed Jan 2, 2024
1 parent e11b0ee

Showing 8 changed files with 44 additions and 26 deletions.

docs/source/conf.py (2 additions, 0 deletions)

@@ -2,6 +2,8 @@
 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
+# ruff: noqa: INP001
+
 """Configuration for the Sphinx documentation builder."""
 
 # Configuration file for the Sphinx documentation builder.
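
The added `# ruff: noqa: INP001` is ruff's file-level suppression form: it silences INP001 (file is part of an implicit namespace package, i.e. a directory without `__init__.py`) for the whole module, which fits `conf.py` since a Sphinx docs folder is intentionally not a package. A minimal sketch contrasting the two suppression forms this commit uses (illustrative code, not from the repository):

```python
# File-level suppression: one comment exempts the whole module from the rule.
# ruff: noqa: INP001

# Line-level suppression: exempts only the findings on this one line.
print("hello")  # noqa: T201
```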

pyproject.toml (16 additions, 0 deletions)

@@ -124,6 +124,11 @@ ignore = [
     "ERA", # eradicate - https://docs.astral.sh/ruff/rules/#eradicate-era
     "ANN", # flake8-annotations - https://docs.astral.sh/ruff/rules/#flake8-annotations-ann
     "FA", # flake8-future-annotations - https://docs.astral.sh/ruff/rules/#flake8-future-annotations-fa
+    "EM", # flake8-errmsg - https://docs.astral.sh/ruff/rules/#flake8-errmsg-em
+    "PTH", # flake8-use-pathlib - https://docs.astral.sh/ruff/rules/#flake8-use-pathlib-pth
+    "FBT", # flake8-boolean-trap - https://docs.astral.sh/ruff/rules/#flake8-boolean-trap-fbt
+    "TD", # flake8-todos - https://docs.astral.sh/ruff/rules/#flake8-todos-td
+    "SLF", # flake8-self - https://docs.astral.sh/ruff/rules/#flake8-self-slf
     "D107", # Missing docstring in __init__
     "D410", # Missing blank line after section ("{name}")
     "D411", # Missing blank line before section ("{name}")
@@ -133,6 +138,17 @@
     "S101", # Use of `assert` detected
     "PLR2004", # Magic value used in comparison
     "B011", # Do not `assert False`
+    "RET505", # Unnecessary `else` after `return` statement
+    "TRY003", # Avoid specifying long messages outside the exception class
+    "RET504", # Unnecessary assignment before `return` statement
+    "T201", # `print` found
+    "RET507", # Unnecessary `else` after `continue` statement
+    "PT015", # Assertion always fails, replace with `pytest.fail()`
+    "UP015", # Unnecessary open mode parameters
+    "FIX002", # Line contains TODO, consider resolving the issue
+    "PT011", # `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception
+    "PT001", # Use `@pytest.fixture()` over `@pytest.fixture`
+    "RUF015", # Prefer `next(iter(sentences))` over single element slice
 ]
 
 [tool.ruff.per-file-ignores]
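
Most of the newly ignored codes whitelist idioms this codebase uses deliberately. For example, EM (flake8-errmsg) and TRY003 both fire on the common pattern of raising an exception with an inline message string; a short illustration (the function and message are invented for the example, not repository code):

```python
def check_max_token(max_token: int) -> None:
    """Reject non-positive token budgets."""
    if max_token < 1:
        # EM101: the exception takes a string literal instead of a pre-assigned variable
        # TRY003: a long message is specified outside the exception class
        raise ValueError("max_token must be a positive integer")
```

Ignoring both rules keeps such one-line raises as they are.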

tests/ori_data_loader.py (4 additions, 4 deletions)

@@ -52,7 +52,7 @@ def load_colon_data() -> Tuple[pd.Series, pd.DataFrame]:
         try:
             i = int(line)
             label.append(0 if i > 0 else 1)
-        except:  # noqa: S110, E722
+        except:  # noqa: S110, E722, PERF203
             pass
 
     assert len(label) == 62
@@ -62,7 +62,7 @@ def load_colon_data() -> Tuple[pd.Series, pd.DataFrame]:
     # generate feature names
     column_names = []
     for column_name in data_df.columns:
-        column_names.append("gene_" + str(column_name))
+        column_names.append("gene_" + str(column_name))  # noqa: PERF401
 
     data_df.columns = column_names

@@ -80,7 +80,7 @@ def load_prostate_data() -> Tuple[pd.Series, pd.DataFrame]:
     Returns:
         Tuple containing labels and data.
     """
-    df = pd.read_csv("https://web.stanford.edu/~hastie/CASI_files/DATA/prostmat.csv")
+    df = pd.read_csv("https://web.stanford.edu/~hastie/CASI_files/DATA/prostmat.csv")  # noqa: PD901
     data = df.T
 
     # labels
@@ -108,7 +108,7 @@ def load_leukemia_data() -> Tuple[pd.Series, pd.DataFrame]:
     Returns:
         Tuple containing labels and data.
     """
-    df = pd.read_csv("https://web.stanford.edu/~hastie/CASI_files/DATA/leukemia_big.csv")
+    df = pd.read_csv("https://web.stanford.edu/~hastie/CASI_files/DATA/leukemia_big.csv")  # noqa: PD901
     data = df.T
 
     # labels
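
In `load_colon_data` above, the commit suppresses PERF401 rather than rewriting the loop. For comparison, the list comprehension that ruff's PERF401 asks for looks roughly like this (a sketch with a stand-in DataFrame, not a change this commit makes):

```python
import pandas as pd

data_df = pd.DataFrame([[1, 2], [3, 4]])  # stand-in for the loaded expression data

# The comprehension form PERF401 suggests, replacing the loop-and-append:
column_names = ["gene_" + str(column_name) for column_name in data_df.columns]
data_df.columns = column_names
```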

tests/test_md.py (3 additions, 3 deletions)

@@ -49,7 +49,7 @@ def test_chunk_md():
     assert result[2] == "### Headline 3 / 1\n\n#### Headline 4 / 1\n\nContent."
 
 
-def test_MdTextSplitter_call():
+def test_MdTextSplitter_call():  # noqa: N802
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     text_merger = MdTextSplitter(
         max_token=15,
@@ -63,7 +63,7 @@ def test_MdTextSplitter_call():
     assert merged_md[1] == "### Headline 3 / 1\n\n#### Headline 4 / 1\n\nContent."
 
 
-def test_MdTextSplitter_call_no_merge():
+def test_MdTextSplitter_call_no_merge():  # noqa: N802
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     text_merger = MdTextSplitter(
         max_token=1,
@@ -78,7 +78,7 @@ def test_MdTextSplitter_call_no_merge():
     assert merged_md[2] == "### Headline 3 / 1\n\n#### Headline 4 / 1\n\nContent."
 
 
-def test_MdTextSplitter_call_all_merge():
+def test_MdTextSplitter_call_all_merge():  # noqa: N802
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     text_merger = MdTextSplitter(
         max_token=1000,
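
The three tests above pin down `MdTextSplitter`'s token budget behaviour: `max_token=1` merges nothing, `15` merges only small neighbours, and `1000` merges everything. A rough sketch of that kind of budgeted neighbour merging (illustrative only, assuming a greedy strategy; not mltb2's actual implementation):

```python
from typing import Callable, List


def merge_chunks(chunks: List[str], count_tokens: Callable[[str], int], max_token: int) -> List[str]:
    """Greedily merge neighbouring chunks while the merged text stays within max_token."""
    if not chunks:
        return []
    merged = [chunks[0]]
    for chunk in chunks[1:]:
        candidate = merged[-1] + "\n\n" + chunk
        if count_tokens(candidate) <= max_token:
            merged[-1] = candidate  # small neighbours collapse into one chunk
        else:
            merged.append(chunk)  # budget exceeded: start a new chunk
    return merged
```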

tests/test_openai.py (2 additions, 2 deletions)

@@ -6,14 +6,14 @@
 from mltb2.openai import OpenAiTokenCounter
 
 
-def test_OpenAiTokenCounter_call_string():
+def test_OpenAiTokenCounter_call_string():  # noqa: N802
     token_counter = OpenAiTokenCounter("gpt-4")
     token_count = token_counter("Das ist ein Text.")
 
     assert token_count == 5
 
 
-def test_OpenAiTokenCounter_call_list():
+def test_OpenAiTokenCounter_call_list():  # noqa: N802
     token_counter = OpenAiTokenCounter("gpt-4")
     token_count = token_counter(["Das ist ein Text.", "Das ist ein anderer Text."])
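
The bulk of this commit is the recurring `# noqa: N802` marker: pep8-naming's N802 rule (function name should be lowercase) fires on test names that embed the CamelCase name of the class under test. The sketch below shows the trade-off; the lowercase rename is hypothetical, not something this commit does:

```python
# Keeping the class name recognisable in the test name needs a suppression:
def test_OpenAiTokenCounter_call_string():  # noqa: N802
    ...


# A fully lowercase name would satisfy N802 without one (hypothetical rename):
def test_openai_token_counter_call_string():
    ...
```

An alternative would be ignoring N802 for the whole `tests/` tree in the `[tool.ruff.per-file-ignores]` table touched above.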

tests/test_somajo.py (12 additions, 12 deletions)

@@ -17,7 +17,7 @@
 )
 
 
-def test_SoMaJoSentenceSplitter_call() -> None:
+def test_SoMaJoSentenceSplitter_call() -> None:  # noqa: N802
     """Test ``SoMaJoSentenceSplitter.call`` happy case."""
     splitter = SoMaJoSentenceSplitter("de_CMC")
     text = "Das ist der erste Satz. Das ist der 2. Satz."
@@ -28,7 +28,7 @@ def test_SoMaJoSentenceSplitter_call() -> None:
     assert sentences[1] == "Das ist der 2. Satz."
 
 
-def test_SoMaJoSentenceSplitter_call_space_and_linebreak() -> None:
+def test_SoMaJoSentenceSplitter_call_space_and_linebreak() -> None:  # noqa: N802
     """Test ``SoMaJoSentenceSplitter.call`` with space an line break."""
     splitter = SoMaJoSentenceSplitter("de_CMC")
     text = " Das ist der erste Satz. \n Das ist der 2. \n Satz. "
@@ -39,7 +39,7 @@ def test_SoMaJoSentenceSplitter_call_space_and_linebreak() -> None:
     assert sentences[1] == "Das ist der 2. Satz."
 
 
-def test_JaccardSimilarity_call():
+def test_JaccardSimilarity_call():  # noqa: N802
     text1 = "Das ist ein deutscher Text."
     text2 = "Das ist ein anderer Text."
     jaccard_similarity = JaccardSimilarity("de_CMC")
@@ -52,15 +52,15 @@ def test_JaccardSimilarity_call():
     assert result2 > 0.0
 
 
-def test_JaccardSimilarity_call_same():
+def test_JaccardSimilarity_call_same():  # noqa: N802
     text = "Das ist ein deutscher Text."
     jaccard_similarity = JaccardSimilarity("de_CMC")
     result = jaccard_similarity(text, text)
 
     assert isclose(result, 1.0)
 
 
-def test_JaccardSimilarity_call_no_overlap():
+def test_JaccardSimilarity_call_no_overlap():  # noqa: N802
     text1 = "Das ist ein deutscher Text."
     text2 = "Vollkommen anders!"
     jaccard_similarity = JaccardSimilarity("de_CMC")
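
The `JaccardSimilarity` tests pin the boundary behaviour: identical texts score 1.0 (above) and fully disjoint texts score 0.0 (asserted just below). A rough sketch of the measure over token sets (illustrative, not mltb2's implementation):

```python
def jaccard_similarity(tokens_a: set, tokens_b: set) -> float:
    """|intersection| / |union|: 1.0 for identical sets, 0.0 for disjoint ones."""
    if not tokens_a and not tokens_b:
        return 1.0  # edge case: treat two empty token sets as identical
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
```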

@@ -69,7 +69,7 @@ def test_JaccardSimilarity_call_no_overlap():
     assert isclose(result, 0.0)
 
 
-def test_TokenExtractor_extract_url_set_with_str():
+def test_TokenExtractor_extract_url_set_with_str():  # noqa: N802
     url1 = "http://may.la"
     url2 = "github.com"
     text_with_url = f"{url1} Das ist ein Text. {url2} Er enthält eine URL."
@@ -80,7 +80,7 @@ def test_TokenExtractor_extract_url_set_with_str():
     assert url2 in result
 
 
-def test_TokenExtractor_extract_url_set_with_list():
+def test_TokenExtractor_extract_url_set_with_list():  # noqa: N802
     url1 = "http://may.la"
     url2 = "github.com"
     text_with_url = [f"{url1} Das ist ein Text.", f"{url2} Er enthält eine URL."]
@@ -91,7 +91,7 @@ def test_TokenExtractor_extract_url_set_with_list():
     assert url2 in result
 
 
-def test_TokenExtractor_extract_url_set_no_url():
+def test_TokenExtractor_extract_url_set_no_url():  # noqa: N802
     text_with_url = "Das ist ein Text. Er enthält keine URLs."
     token_extractor = TokenExtractor("de_CMC")
     result = token_extractor.extract_url_set(text_with_url)
@@ -137,7 +137,7 @@ def test_detokenize():
     assert result == "Das ist ein Satz."
 
 
-def test_UrlSwapper_swap_urls():
+def test_UrlSwapper_swap_urls():  # noqa: N802
     token_extractor = TokenExtractor("de_CMC")
     url_swapper = UrlSwapper(token_extractor)
     text_with_url = "This is a text with URL: http://may.la."
@@ -153,7 +153,7 @@ def test_UrlSwapper_swap_urls():
         "2 MD URL s: [Philip May](http://may.la). [other link](https://github.com/telekom/mltb2#installation)",
     ],
 )
-def test_UrlSwapper__is_reversible(text_with_url: str):
+def test_UrlSwapper__is_reversible(text_with_url: str):  # noqa: N802
     token_extractor = TokenExtractor("de_CMC")
     url_swapper = UrlSwapper(token_extractor)
     text_with_reverse_swapped_url, no_reverse_swap_urls = url_swapper.reverse_swap_urls(
@@ -163,7 +163,7 @@ def test_UrlSwapper__is_reversible(text_with_url: str):
     assert len(no_reverse_swap_urls) == 0
 
 
-def test_UrlSwapper__no_reverse_swap_urls():
+def test_UrlSwapper__no_reverse_swap_urls():  # noqa: N802
     token_extractor = TokenExtractor("de_CMC")
     url_swapper = UrlSwapper(token_extractor)
     text_with_url = "This is a text with URL: http://may.la."
@@ -176,7 +176,7 @@ def test_UrlSwapper__no_reverse_swap_urls():
 
 
 # see https://github.com/telekom/mltb2/issues/94
-def test_UrlSwapper__markdown_bug():
+def test_UrlSwapper__markdown_bug():  # noqa: N802
     token_extractor = TokenExtractor("de_CMC")
     url_swapper = UrlSwapper(token_extractor)
     text_with_url = "This is a MD link: [https://something-1.com](https://something-2.com)."
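
The `UrlSwapper` tests assert a round trip: URLs swapped out of a text and reverse-swapped back must reproduce the original, with `no_reverse_swap_urls` collecting anything that could not be restored (the markdown bug tracked in issue #94 above). A rough sketch of one way such a reversible swap can work; the placeholder scheme and class are assumptions, not mltb2's implementation:

```python
class SimpleUrlSwapper:
    """Replace each URL with a unique placeholder and remember the mapping."""

    def __init__(self) -> None:
        self._url_by_placeholder: dict = {}

    def swap_urls(self, text: str, urls: list) -> str:
        for i, url in enumerate(urls):
            placeholder = f"https://url-placeholder-{i}.example"
            self._url_by_placeholder[placeholder] = url
            text = text.replace(url, placeholder)
        return text

    def reverse_swap_urls(self, text: str):
        no_reverse_swap_urls = set()
        for placeholder, url in self._url_by_placeholder.items():
            if placeholder in text:
                text = text.replace(placeholder, url)
            else:
                no_reverse_swap_urls.add(url)  # placeholder lost, e.g. mangled by markdown rendering
        return text, no_reverse_swap_urls
```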

tests/test_somajo_transformers.py (3 additions, 3 deletions)

@@ -11,7 +11,7 @@
 from mltb2.transformers import TransformersTokenCounter
 
 
-def test_TextSplitter_call():
+def test_TextSplitter_call():  # noqa: N802
     somajo_sentence_splitter = SoMaJoSentenceSplitter("de_CMC")
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     text_splitter = TextSplitter(
@@ -29,7 +29,7 @@ def test_TextSplitter_call():
     assert split_text[2] == "Satz 4 ist das."
 
 
-def test_TextSplitter_call_sentence_too_long_exception():
+def test_TextSplitter_call_sentence_too_long_exception():  # noqa: N802
     somajo_sentence_splitter = SoMaJoSentenceSplitter("de_CMC")
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     text_splitter = TextSplitter(
@@ -43,7 +43,7 @@ def test_TextSplitter_call_sentence_too_long_exception():
         text_splitter(text)
 
 
-def test_TextSplitter_call_sentence_too_long_no_exception():
+def test_TextSplitter_call_sentence_too_long_no_exception():  # noqa: N802
     somajo_sentence_splitter = SoMaJoSentenceSplitter("de_CMC")
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     text_splitter = TextSplitter(
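
These tests fix the `TextSplitter` contract: sentences are grouped into chunks whose combined token count stays within `max_token`, and a single sentence longer than the budget raises an exception unless that check is relaxed. A rough sketch of the grouping loop (illustrative, not mltb2's code):

```python
from typing import Callable, List


def split_into_chunks(sentences: List[str], count_tokens: Callable[[str], int], max_token: int) -> List[str]:
    """Group sentences into chunks whose token counts stay within max_token."""
    chunks: List[str] = []
    current: List[str] = []
    current_tokens = 0
    for sentence in sentences:
        n = count_tokens(sentence)
        if n > max_token:
            raise ValueError("sentence is longer than max_token")  # cf. the exception test above
        if current_tokens + n > max_token:  # budget full: close the current chunk
            chunks.append(" ".join(current))
            current, current_tokens = [], 0
        current.append(sentence)
        current_tokens += n
    if current:
        chunks.append(" ".join(current))
    return chunks
```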

tests/test_transformers.py (2 additions, 2 deletions)

@@ -6,14 +6,14 @@
 from mltb2.transformers import TransformersTokenCounter
 
 
-def test_TransformersTokenCounter_call_string():
+def test_TransformersTokenCounter_call_string():  # noqa: N802
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     token_count = transformers_token_counter("Das ist ein Text.")
 
     assert token_count == 5
 
 
-def test_TransformersTokenCounter_call_list():
+def test_TransformersTokenCounter_call_list():  # noqa: N802
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     token_count = transformers_token_counter(["Das ist ein Text.", "Das ist ein anderer Text."])
