Merge pull request #3711 from Cheukting/codec

HypothesisWorks · Sep 1, 2023 · 3936211 · 3936211
2 parents 0560fce + 2b2de29
commit 3936211
Show file tree

Hide file tree

Showing 4 changed files with 91 additions and 7 deletions.
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,7 @@
+RELEASE_TYPE: minor
+
+Adds a new ``codec=`` option in :func:`~hypothesis.strategies.characters`, making it
+convenient to produce only characters which can be encoded as ``ascii`` or ``utf-8``
+bytestrings.
+
+Support for other codecs will be added in a future release.
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py
@@ -8,6 +8,7 @@
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
 # obtain one at https://mozilla.org/MPL/2.0/.
 
+import codecs
 import enum
 import math
 import operator
@@ -27,11 +28,13 @@
     Any,
     AnyStr,
     Callable,
+    Collection,
     Dict,
     FrozenSet,
     Hashable,
     Iterable,
     List,
+    Literal,
     Optional,
     Pattern,
     Protocol,
@@ -523,12 +526,13 @@ def dictionaries(
 @defines_strategy(force_reusable_values=True)
 def characters(
     *,
-    whitelist_categories: Optional[Sequence[str]] = None,
-    blacklist_categories: Optional[Sequence[str]] = None,
-    blacklist_characters: Optional[Sequence[str]] = None,
+    whitelist_categories: Optional[Collection[str]] = None,
+    blacklist_categories: Optional[Collection[str]] = None,
+    blacklist_characters: Optional[Collection[str]] = None,
     min_codepoint: Optional[int] = None,
     max_codepoint: Optional[int] = None,
-    whitelist_characters: Optional[Sequence[str]] = None,
+    whitelist_characters: Optional[Collection[str]] = None,
+    codec: Optional[Literal["ascii", "utf-8"]] = None,
 ) -> SearchStrategy[str]:
     r"""Generates characters, length-one :class:`python:str`\ ings,
     following specified filtering rules.
@@ -549,6 +553,12 @@ def characters(
       that list will be not be produced. Any overlap between
       ``whitelist_characters`` and ``blacklist_characters`` will raise an
       exception.
+    - If ``codec`` is specified, only characters in certain `codec encodings`_
+      will be produced. Currently only ``ascii`` and ``utf-8`` are supported.
+      ``whitelist_characters`` which cannot be encoded using this codec will
+      raise an exception.  If non-encodable codepoints or categories are
+      explicitly allowed, the ``codec`` argument will exclude them without
+      raising an exception.
 
     The ``_codepoint`` arguments must be integers between zero and
     :obj:`python:sys.maxunicode`.  The ``_characters`` arguments must be
@@ -562,6 +572,7 @@ def characters(
     for characters in any punctuation category.
 
     .. _general category: https://wikipedia.org/wiki/Unicode_character_property
+    .. _codec encodings: https://docs.python.org/3/library/codecs.html#encodings-and-unicode
 
     Examples from this strategy shrink towards the codepoint for ``'0'``,
     or the first allowable codepoint after it if ``'0'`` is excluded.
@@ -575,6 +586,7 @@ def characters(
         and whitelist_categories is None
         and blacklist_categories is None
         and whitelist_characters is not None
+        and codec is None
     ):
         raise InvalidArgument(
             "Nothing is excluded by other arguments, so passing only "
@@ -613,6 +625,42 @@ def characters(
             f"{whitelist_categories=} and {blacklist_categories=}"
         )
 
+    if codec is not None:
+        try:
+            codecs.lookup(codec)
+        except LookupError:
+            raise InvalidArgument(f"{codec=} is not valid on this system") from None
+        except TypeError:
+            raise InvalidArgument(f"{codec=} is not a valid codec") from None
+
+        for char in whitelist_characters:
+            try:
+                char.encode(encoding=codec, errors="strict")
+            except UnicodeEncodeError:
+                raise InvalidArgument(
+                    f"Character {char!r} in {whitelist_characters=} "
+                    f"cannot be encoded with {codec=}"
+                ) from None
+
+        # ascii and utf-8 are sufficiently common that we have faster special handling
+        if codec == "ascii":
+            if (max_codepoint is None) or (max_codepoint > 127):
+                max_codepoint = 127
+        elif codec == "utf-8":
+            if whitelist_categories is not None:
+                whitelist_categories = tuple(
+                    c for c in whitelist_categories if c != "Cs"
+                )
+            if blacklist_categories is None:
+                blacklist_categories = ("Cs",)
+            elif "Cs" not in blacklist_categories:
+                blacklist_categories = tuple(blacklist_categories) + ("Cs",)
+        else:
+            # TODO: handle all other codecs.  We'll probably want to do this inside
+            #       `OneCharStringStrategy`, by checking which intervals are supported,
+            #       caching that, and taking the intersection of their intervals.
+            raise InvalidArgument(f"{codec=} must be one of 'ascii', 'utf-8', or None")
+
     return OneCharStringStrategy(
         whitelist_categories=whitelist_categories,
         blacklist_categories=blacklist_categories,
@@ -639,9 +687,7 @@ def _check_is_single_character(c):
 @cacheable
 @defines_strategy(force_reusable_values=True)
 def text(
-    alphabet: Union[Sequence[str], SearchStrategy[str]] = characters(
-        blacklist_categories=("Cs",)
-    ),
+    alphabet: Union[Collection[str], SearchStrategy[str]] = characters(codec="utf-8"),
     *,
     min_size: int = 0,
     max_size: Optional[int] = None,

diff --git a/hypothesis-python/tests/cover/test_core.py b/hypothesis-python/tests/cover/test_core.py
@@ -143,3 +143,29 @@ def empty_db(_):
 def test_non_executed_tests_raise_skipped(test_fn):
     with pytest.raises(unittest.SkipTest):
         test_fn()
+
+
+@pytest.mark.parametrize(
+    "codec, max_codepoint, blacklist_categories, whitelist_categories",
+    [
+        ("ascii", None, None, None),
+        ("ascii", 128, None, None),
+        ("ascii", 100, None, None),
+        ("utf-8", None, None, None),
+        ("utf-8", None, ["Cs"], None),
+        ("utf-8", None, ["N"], None),
+        ("utf-8", None, None, ["N"]),
+    ],
+)
+@given(s.data())
+def test_characters_codec(
+    codec, max_codepoint, blacklist_categories, whitelist_categories, data
+):
+    strategy = s.characters(
+        codec=codec,
+        max_codepoint=max_codepoint,
+        blacklist_categories=blacklist_categories,
+        whitelist_categories=whitelist_categories,
+    )
+    example = data.draw(strategy)
+    assert example.encode(encoding=codec).decode(encoding=codec) == example
diff --git a/hypothesis-python/tests/cover/test_direct_strategies.py b/hypothesis-python/tests/cover/test_direct_strategies.py
@@ -180,6 +180,11 @@ def fn_ktest(*fnkwargs):
     (ds.characters, {"max_codepoint": "1"}),
     (ds.characters, {"whitelist_categories": []}),
     (ds.characters, {"whitelist_categories": ["Nd"], "blacklist_categories": ["Nd"]}),
+    (ds.characters, {"codec": 100}),
+    (ds.characters, {"codec": "cp861"}),  # not yet implemented
+    (ds.characters, {"codec": "this is not a valid codec name"}),
+    (ds.characters, {"codec": "ascii", "whitelist_characters": "é"}),
+    (ds.characters, {"codec": "utf-8", "whitelist_categories": "Cs"}),
     (ds.slices, {"size": None}),
     (ds.slices, {"size": "chips"}),
     (ds.slices, {"size": -1}),