telekom · PhilipMay · Jan 4, 2024 · Jan 4, 2024 · Jan 4, 2024 · Jan 4, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -105,6 +105,7 @@ mdformat-footnote = "*"
 [tool.poetry.group.test.dependencies]
 pytest = "*"
 pytest-rerunfailures = "*"
+hypothesis = "*"
 
 [tool.poetry.group.doc.dependencies]
 sphinx = "*"

diff --git a/tests/test_text.py b/tests/test_text.py
@@ -1,11 +1,13 @@
-# Copyright (c) 2023 Philip May
+# Copyright (c) 2023-2024 Philip May
 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
 from collections import Counter, defaultdict
 from math import isclose
 
 import pytest
+from hypothesis import given, settings
+from hypothesis.strategies import text
 
 from mltb2.text import (
     INVISIBLE_CHARACTERS,
@@ -21,6 +23,40 @@
 )
 
 
+@settings(max_examples=1000)
+@given(text())
+def test_remove_and_detect_invisible_characters_hypothesis(text: str):
+    """Removal must shorten the text exactly when invisible characters are detected."""
+    cleaned = remove_invisible_characters(text)
+    assert isinstance(cleaned, str)
+    # number of characters dropped by the removal step
+    removed = len(text) - len(cleaned)
+    assert (removed > 0) if has_invisible_characters(text) else (removed == 0)
+
+
+@settings(max_examples=1000)
+@given(text())
+def test_replace_and_detect_special_whitespaces_hypothesis(text: str):
+    """Replacement must add plain spaces exactly when special whitespaces are detected."""
+    replaced = replace_special_whitespaces(text)
+    assert isinstance(replaced, str)
+    spaces_before, spaces_after = text.count(" "), replaced.count(" ")
+    if not has_special_whitespaces(text):
+        assert spaces_before == spaces_after
+    else:
+        assert spaces_before < spaces_after
+
+
+@settings(max_examples=1000)
+@given(text())
+def test_replace_multiple_whitespaces_hypothesis(text: str):
+    """The output of replace_multiple_whitespaces must not be longer or contain more spaces."""
+    collapsed = replace_multiple_whitespaces(text)
+    # neither the overall length nor the number of plain spaces may grow
+    assert len(text) >= len(collapsed)
+    assert text.count(" ") >= collapsed.count(" ")
+
+
 def test_remove_invisible_characters():
     text = "Hello\u200bWorld\u00ad!"
     result = remove_invisible_characters(text)

diff --git a/tests/test_transformers.py b/tests/test_transformers.py
@@ -3,9 +3,29 @@
 # which is available at https://opensource.org/licenses/MIT
 
 
+import pytest
+from hypothesis import given, settings
+from hypothesis.strategies import text
+
 from mltb2.transformers import TransformersTokenCounter
 
 
+@pytest.fixture(scope="module")
+def deepset_gbert_base_token_counter() -> TransformersTokenCounter:
+    return TransformersTokenCounter("deepset/gbert-base")  # module scope: one counter shared by this file's tests
+
+
+@settings(max_examples=1000, deadline=None)
+@given(text=text())
+def test_TransformersTokenCounter_hypothesis(  # noqa: N802
+    text: str, deepset_gbert_base_token_counter: TransformersTokenCounter
+):
+    """Counting the tokens of arbitrary text must yield a non-negative integer."""
+    num_tokens = deepset_gbert_base_token_counter(text)
+    assert isinstance(num_tokens, int)
+    assert 0 <= num_tokens
+
+
 def test_TransformersTokenCounter_call_string():  # noqa: N802
     transformers_token_counter = TransformersTokenCounter("deepset/gbert-base")
     token_count = transformers_token_counter("Das ist ein Text.")