From a63fe54163792e7d79056baa8d61df00a55b88d8 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 09:44:53 +0100 Subject: [PATCH 1/7] Add hypothesis to test dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4e854ce..c88ae37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,6 +105,7 @@ mdformat-footnote = "*" [tool.poetry.group.test.dependencies] pytest = "*" pytest-rerunfailures = "*" +hypothesis = "*" [tool.poetry.group.doc.dependencies] sphinx = "*" From af91d272a8b207fe009103239cef8e29e88fde43 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 10:08:20 +0100 Subject: [PATCH 2/7] Add hypothesis tests for text module --- tests/test_text.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index 93b8bf8..5e370c8 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,6 +6,8 @@ from math import isclose import pytest +from hypothesis import given, settings +from hypothesis.strategies import text from mltb2.text import ( INVISIBLE_CHARACTERS, @@ -21,6 +23,30 @@ ) +@settings(max_examples=1000) +@given(text()) +def test_remove_and_detect_invisible_characters_hypothesis(text: str): + result = remove_invisible_characters(text) + assert isinstance(result, str) + if has_invisible_characters(text): + assert len(result) < len(text) + else: + assert len(result) == len(text) + + +@settings(max_examples=1000) +@given(text()) +def test_replace_and_detect_special_whitespaces_hypothesis(text: str): + result = replace_special_whitespaces(text) + assert isinstance(result, str) + text_whitespace_count = text.count(" ") + result_whitespace_count = result.count(" ") + if has_special_whitespaces(text): + assert text_whitespace_count < result_whitespace_count + else: + assert text_whitespace_count == result_whitespace_count + + def test_remove_invisible_characters(): text = "Hello\u200bWorld\u00ad!" result = remove_invisible_characters(text) From 17749670947638fb9aef8668f4359836b2f4a8cc Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 10:09:18 +0100 Subject: [PATCH 3/7] Add test cases for TransformersTokenCounter --- tests/test_transformers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_transformers.py b/tests/test_transformers.py index 25edd1d..f8c6078 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -3,9 +3,29 @@ # which is available at https://opensource.org/licenses/MIT +import pytest +from hypothesis import given, settings +from hypothesis.strategies import text + from mltb2.transformers import TransformersTokenCounter +@pytest.fixture(scope="module") +def deepset_gbert_base_token_counter(): + return TransformersTokenCounter("deepset/gbert-base") + + +@settings(max_examples=1000, deadline=None) +@given(text=text()) +def test_TransformersTokenCounter_hypothesis( # noqa: N802 + text: str, deepset_gbert_base_token_counter: TransformersTokenCounter +): + token_count = deepset_gbert_base_token_counter(text) + + assert isinstance(token_count, int) + assert token_count >= 0 + + def test_TransformersTokenCounter_call_string(): # noqa: N802 transformers_token_counter = TransformersTokenCounter("deepset/gbert-base") token_count = transformers_token_counter("Das ist ein Text.") From f5e0814a47cf363184828007048310232ac59f9f Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 10:13:38 +0100 Subject: [PATCH 4/7] Add hypothesis test for replacing multiple whitespaces --- tests/test_text.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index 5e370c8..88e79b8 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -47,6 +47,16 @@ def test_replace_and_detect_special_whitespaces_hypothesis(text: str): assert text_whitespace_count == result_whitespace_count +@settings(max_examples=1000) +@given(text()) +def test_replace_multiple_whitespaces_hypothesis(text: str): + result = replace_multiple_whitespaces(text) + text_whitespace_count = text.count(" ") + result_whitespace_count = result.count(" ") + assert len(result) <= len(text) + assert text_whitespace_count <= result_whitespace_count + + def test_remove_invisible_characters(): text = "Hello\u200bWorld\u00ad!" result = remove_invisible_characters(text) From 18c1246449ed942e9710e52d6deafd2a51ef14b9 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 10:14:33 +0100 Subject: [PATCH 5/7] Update copyright year in test_text.py --- tests/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index 88e79b8..578dd03 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Philip May +# Copyright (c) 2023-2024 Philip May # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT From 5d91372a471237d20100fc98eb1f9e8ffea9056c Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 10:15:35 +0100 Subject: [PATCH 6/7] Add return type annotation to deepset_gbert_base_token_counter function --- tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_transformers.py b/tests/test_transformers.py index f8c6078..6a7fc20 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -11,7 +11,7 @@ @pytest.fixture(scope="module") -def deepset_gbert_base_token_counter(): +def deepset_gbert_base_token_counter() -> TransformersTokenCounter: return TransformersTokenCounter("deepset/gbert-base") From 77503b73266b77774cc402959018a311cf874520 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Thu, 4 Jan 2024 11:04:32 +0100 Subject: [PATCH 7/7] Fix whitespace count comparison in test_replace_multiple_whitespaces_hypothesis() --- tests/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index 578dd03..9c1d949 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -54,7 +54,7 @@ def test_replace_multiple_whitespaces_hypothesis(text: str): text_whitespace_count = text.count(" ") result_whitespace_count = result.count(" ") assert len(result) <= len(text) - assert text_whitespace_count <= result_whitespace_count + assert result_whitespace_count <= text_whitespace_count def test_remove_invisible_characters():