Skip to content

Commit

Permalink
Merge pull request #3711 from Cheukting/codec
Browse files Browse the repository at this point in the history
  • Loading branch information
Zac-HD authored Sep 1, 2023
2 parents 0560fce + 2b2de29 commit 3936211
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 7 deletions.
7 changes: 7 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
RELEASE_TYPE: minor

Adds a new ``codec=`` option in :func:`~hypothesis.strategies.characters`, making it
convenient to produce only characters which can be encoded as ``ascii`` or ``utf-8``
bytestrings.

Support for other codecs will be added in a future release.
60 changes: 53 additions & 7 deletions hypothesis-python/src/hypothesis/strategies/_internal/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import codecs
import enum
import math
import operator
Expand All @@ -27,11 +28,13 @@
Any,
AnyStr,
Callable,
Collection,
Dict,
FrozenSet,
Hashable,
Iterable,
List,
Literal,
Optional,
Pattern,
Protocol,
Expand Down Expand Up @@ -523,12 +526,13 @@ def dictionaries(
@defines_strategy(force_reusable_values=True)
def characters(
*,
whitelist_categories: Optional[Sequence[str]] = None,
blacklist_categories: Optional[Sequence[str]] = None,
blacklist_characters: Optional[Sequence[str]] = None,
whitelist_categories: Optional[Collection[str]] = None,
blacklist_categories: Optional[Collection[str]] = None,
blacklist_characters: Optional[Collection[str]] = None,
min_codepoint: Optional[int] = None,
max_codepoint: Optional[int] = None,
whitelist_characters: Optional[Sequence[str]] = None,
whitelist_characters: Optional[Collection[str]] = None,
codec: Optional[Literal["ascii", "utf-8"]] = None,
) -> SearchStrategy[str]:
r"""Generates characters, length-one :class:`python:str`\ ings,
following specified filtering rules.
Expand All @@ -549,6 +553,12 @@ def characters(
that list will not be produced. Any overlap between
``whitelist_characters`` and ``blacklist_characters`` will raise an
exception.
- If ``codec`` is specified, only characters which can be encoded with that
codec (see `codec encodings`_) will be produced. Currently only ``ascii`` and ``utf-8`` are supported.
``whitelist_characters`` which cannot be encoded using this codec will
raise an exception. If non-encodable codepoints or categories are
explicitly allowed, the ``codec`` argument will exclude them without
raising an exception.
The ``_codepoint`` arguments must be integers between zero and
:obj:`python:sys.maxunicode`. The ``_characters`` arguments must be
Expand All @@ -562,6 +572,7 @@ def characters(
for characters in any punctuation category.
.. _general category: https://wikipedia.org/wiki/Unicode_character_property
.. _codec encodings: https://docs.python.org/3/library/codecs.html#encodings-and-unicode
Examples from this strategy shrink towards the codepoint for ``'0'``,
or the first allowable codepoint after it if ``'0'`` is excluded.
Expand All @@ -575,6 +586,7 @@ def characters(
and whitelist_categories is None
and blacklist_categories is None
and whitelist_characters is not None
and codec is None
):
raise InvalidArgument(
"Nothing is excluded by other arguments, so passing only "
Expand Down Expand Up @@ -613,6 +625,42 @@ def characters(
f"{whitelist_categories=} and {blacklist_categories=}"
)

if codec is not None:
try:
codecs.lookup(codec)
except LookupError:
raise InvalidArgument(f"{codec=} is not valid on this system") from None
except TypeError:
raise InvalidArgument(f"{codec=} is not a valid codec") from None

for char in whitelist_characters:
try:
char.encode(encoding=codec, errors="strict")
except UnicodeEncodeError:
raise InvalidArgument(
f"Character {char!r} in {whitelist_characters=} "
f"cannot be encoded with {codec=}"
) from None

# ascii and utf-8 are sufficiently common that we have faster special handling
if codec == "ascii":
if (max_codepoint is None) or (max_codepoint > 127):
max_codepoint = 127
elif codec == "utf-8":
if whitelist_categories is not None:
whitelist_categories = tuple(
c for c in whitelist_categories if c != "Cs"
)
if blacklist_categories is None:
blacklist_categories = ("Cs",)
elif "Cs" not in blacklist_categories:
blacklist_categories = tuple(blacklist_categories) + ("Cs",)
else:
# TODO: handle all other codecs. We'll probably want to do this inside
# `OneCharStringStrategy`, by checking which intervals are supported,
# caching that, and taking the intersection of their intervals.
raise InvalidArgument(f"{codec=} must be one of 'ascii', 'utf-8', or None")

return OneCharStringStrategy(
whitelist_categories=whitelist_categories,
blacklist_categories=blacklist_categories,
Expand All @@ -639,9 +687,7 @@ def _check_is_single_character(c):
@cacheable
@defines_strategy(force_reusable_values=True)
def text(
alphabet: Union[Sequence[str], SearchStrategy[str]] = characters(
blacklist_categories=("Cs",)
),
alphabet: Union[Collection[str], SearchStrategy[str]] = characters(codec="utf-8"),
*,
min_size: int = 0,
max_size: Optional[int] = None,
Expand Down
26 changes: 26 additions & 0 deletions hypothesis-python/tests/cover/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,29 @@ def empty_db(_):
def test_non_executed_tests_raise_skipped(test_fn):
with pytest.raises(unittest.SkipTest):
test_fn()


# Round-trip test for the ``codec=`` argument of ``s.characters``: for each
# parametrized combination of arguments, a drawn character must survive
# ``encode`` followed by ``decode`` with the same codec unchanged.
@pytest.mark.parametrize(
"codec, max_codepoint, blacklist_categories, whitelist_categories",
[
("ascii", None, None, None),
# 128 > 127: exercises the ascii branch that caps max_codepoint at 127.
("ascii", 128, None, None),
# 100 <= 127: the caller's tighter max_codepoint is passed with codec="ascii".
("ascii", 100, None, None),
("utf-8", None, None, None),
# "Cs" is already in the caller's blacklist.
("utf-8", None, ["Cs"], None),
# A blacklist that does not mention "Cs".
("utf-8", None, ["N"], None),
# A whitelist that does not mention "Cs".
("utf-8", None, None, ["N"]),
],
)
@given(s.data())
def test_characters_codec(
codec, max_codepoint, blacklist_categories, whitelist_categories, data
):
# Build the strategy under test from the parametrized arguments.
strategy = s.characters(
codec=codec,
max_codepoint=max_codepoint,
blacklist_categories=blacklist_categories,
whitelist_categories=whitelist_categories,
)
example = data.draw(strategy)
# ``str.encode`` defaults to errors="strict", so a character the codec cannot
# represent raises here instead of being silently replaced.
assert example.encode(encoding=codec).decode(encoding=codec) == example
5 changes: 5 additions & 0 deletions hypothesis-python/tests/cover/test_direct_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,11 @@ def fn_ktest(*fnkwargs):
(ds.characters, {"max_codepoint": "1"}),
(ds.characters, {"whitelist_categories": []}),
(ds.characters, {"whitelist_categories": ["Nd"], "blacklist_categories": ["Nd"]}),
(ds.characters, {"codec": 100}),
(ds.characters, {"codec": "cp861"}), # not yet implemented
(ds.characters, {"codec": "this is not a valid codec name"}),
(ds.characters, {"codec": "ascii", "whitelist_characters": "é"}),
(ds.characters, {"codec": "utf-8", "whitelist_categories": "Cs"}),
(ds.slices, {"size": None}),
(ds.slices, {"size": "chips"}),
(ds.slices, {"size": -1}),
Expand Down

0 comments on commit 3936211

Please sign in to comment.