Skip to content

Commit

Permalink
refactor: Reduce SchemaValidationError traceback length (#3530)
Browse files Browse the repository at this point in the history
  • Loading branch information
dangotbanned authored Aug 31, 2024
1 parent de58ec8 commit 98d9fb8
Show file tree
Hide file tree
Showing 3 changed files with 324 additions and 267 deletions.
213 changes: 120 additions & 93 deletions altair/utils/schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from altair import vegalite

if TYPE_CHECKING:
from types import ModuleType
from typing import ClassVar

from referencing import Registry
Expand All @@ -57,6 +58,7 @@
from typing import Never, Self
else:
from typing_extensions import Never, Self
_OptionalModule: TypeAlias = "ModuleType | None"

ValidationErrorList: TypeAlias = List[jsonschema.exceptions.ValidationError]
GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList]
Expand Down Expand Up @@ -559,9 +561,25 @@ def _resolve_references(


class SchemaValidationError(jsonschema.ValidationError):
"""A wrapper for jsonschema.ValidationError with friendlier traceback."""

def __init__(self, obj: SchemaBase, err: jsonschema.ValidationError) -> None:
"""
A wrapper for ``jsonschema.ValidationError`` with friendlier traceback.
Parameters
----------
obj
The instance that failed ``self.validate(...)``.
err
The original ``ValidationError``.
Notes
-----
We do not raise `from err` as else the resulting traceback is very long
as it contains part of the Vega-Lite schema.
It would also first show the less helpful `ValidationError` instead of
the more user friendly `SchemaValidationError`.
"""
super().__init__(**err._contents())
self.obj = obj
self._errors: GroupedValidationErrors = getattr(
Expand Down Expand Up @@ -991,88 +1009,45 @@ def to_dict(
Parameters
----------
validate : bool, optional
If True (default), then validate the output dictionary
against the schema.
If True (default), then validate the result against the schema.
ignore : list[str], optional
A list of keys to ignore. It is usually not needed
to specify this argument as a user.
A list of keys to ignore.
context : dict[str, Any], optional
A context dictionary. It is usually not needed
to specify this argument as a user.
Notes
-----
Technical: The ignore parameter will *not* be passed to child to_dict
function calls.
Returns
-------
dict
The dictionary representation of this object
A context dictionary.
Raises
------
SchemaValidationError :
if validate=True and the dict does not conform to the schema
If ``validate`` and the result does not conform to the schema.
Notes
-----
- ``ignore``, ``context`` are usually not needed to be specified as a user.
- *Technical*: ``ignore`` will **not** be passed to child :meth:`.to_dict()`.
"""
if context is None:
context = {}
if ignore is None:
ignore = []
# The following return the package only if it has already been
# imported - otherwise they return None. This is useful for
# isinstance checks - for example, if pandas has not been imported,
# then an object is definitely not a `pandas.Timestamp`.
pd_opt = sys.modules.get("pandas")
np_opt = sys.modules.get("numpy")
context = context or {}
ignore = ignore or []
opts = _get_optional_modules(np_opt="numpy", pd_opt="pandas")

if self._args and not self._kwds:
result = _todict(
self._args[0], context=context, np_opt=np_opt, pd_opt=pd_opt
)
kwds = self._args[0]
elif not self._args:
kwds = self._kwds.copy()
# parsed_shorthand is added by FieldChannelMixin.
# It's used below to replace shorthand with its long form equivalent
# parsed_shorthand is removed from context if it exists so that it is
# not passed to child to_dict function calls
parsed_shorthand = context.pop("parsed_shorthand", {})
# Prevent that pandas categorical data is automatically sorted
# when a non-ordinal data type is specifed manually
# or if the encoding channel does not support sorting
if "sort" in parsed_shorthand and (
"sort" not in kwds or kwds["type"] not in {"ordinal", Undefined}
):
parsed_shorthand.pop("sort")

kwds.update(
{
k: v
for k, v in parsed_shorthand.items()
if kwds.get(k, Undefined) is Undefined
}
)
kwds = {
k: v for k, v in kwds.items() if k not in {*list(ignore), "shorthand"}
}
if "mark" in kwds and isinstance(kwds["mark"], str):
kwds["mark"] = {"type": kwds["mark"]}
result = _todict(kwds, context=context, np_opt=np_opt, pd_opt=pd_opt)
exclude = {*ignore, "shorthand"}
if parsed := context.pop("parsed_shorthand", None):
kwds = _replace_parsed_shorthand(parsed, kwds)
kwds = {k: v for k, v in kwds.items() if k not in exclude}
if (mark := kwds.get("mark")) and isinstance(mark, str):
kwds["mark"] = {"type": mark}
else:
msg = (
f"{self.__class__} instance has both a value and properties : "
"cannot serialize to dict"
)
msg = f"{type(self)} instance has both a value and properties : cannot serialize to dict"
raise ValueError(msg)
result = _todict(kwds, context=context, **opts)
if validate:
# NOTE: Don't raise `from err`, see `SchemaValidationError` doc
try:
self.validate(result)
except jsonschema.ValidationError as err:
# We do not raise `from err` as else the resulting
# traceback is very long as it contains part
# of the Vega-Lite schema. It would also first
# show the less helpful ValidationError instead of
# the more user friendly SchemaValidationError
raise SchemaValidationError(self, err) from None
return result

Expand All @@ -1092,30 +1067,27 @@ def to_json(
Parameters
----------
validate : bool, optional
If True (default), then validate the output dictionary
against the schema.
If True (default), then validate the result against the schema.
indent : int, optional
The number of spaces of indentation to use. The default is 2.
sort_keys : bool, optional
If True (default), sort keys in the output.
ignore : list[str], optional
A list of keys to ignore. It is usually not needed
to specify this argument as a user.
A list of keys to ignore.
context : dict[str, Any], optional
A context dictionary. It is usually not needed
to specify this argument as a user.
A context dictionary.
**kwargs
Additional keyword arguments are passed to ``json.dumps()``
Raises
------
SchemaValidationError :
If ``validate`` and the result does not conform to the schema.
Notes
-----
Technical: The ignore parameter will *not* be passed to child to_dict
function calls.
Returns
-------
str
The JSON specification of the chart object.
- ``ignore``, ``context`` are usually not needed to be specified as a user.
- *Technical*: ``ignore`` will **not** be passed to child :meth:`.to_dict()`.
"""
if ignore is None:
ignore = []
Expand Down Expand Up @@ -1143,15 +1115,10 @@ def from_dict(
validate : boolean
If True (default), then validate the input against the schema.
Returns
-------
obj : Schema object
The wrapped schema
Raises
------
jsonschema.ValidationError :
if validate=True and dct does not conform to the schema
If ``validate`` and ``dct`` does not conform to the schema
"""
if validate:
cls.validate(dct)
Expand Down Expand Up @@ -1214,13 +1181,8 @@ def validate_property(
cls, name: str, value: Any, schema: dict[str, Any] | None = None
) -> None:
"""Validate a property against property schema in the context of the rootschema."""
# The following return the package only if it has already been
# imported - otherwise they return None. This is useful for
# isinstance checks - for example, if pandas has not been imported,
# then an object is definitely not a `pandas.Timestamp`.
pd_opt = sys.modules.get("pandas")
np_opt = sys.modules.get("numpy")
value = _todict(value, context={}, np_opt=np_opt, pd_opt=pd_opt)
opts = _get_optional_modules(np_opt="numpy", pd_opt="pandas")
value = _todict(value, context={}, **opts)
props = cls.resolve_references(schema or cls._schema).get("properties", {})
validate_jsonschema(
value, props.get(name, {}), rootschema=cls._rootschema or cls._schema
Expand All @@ -1230,6 +1192,71 @@ def __dir__(self) -> list[str]:
return sorted(chain(super().__dir__(), self._kwds))


def _get_optional_modules(**modules: str) -> dict[str, _OptionalModule]:
"""
Returns packages only if they have already been imported - otherwise they return `None`.
This is useful for `isinstance` checks.
For example, if `pandas` has not been imported, then an object is
definitely not a `pandas.Timestamp`.
Parameters
----------
**modules
Keyword-only binding from `{alias: module_name}`.
Examples
--------
>>> import pandas as pd # doctest: +SKIP
>>> import polars as pl # doctest: +SKIP
>>> from altair.utils.schemapi import _get_optional_modules # doctest: +SKIP
>>>
>>> _get_optional_modules(pd="pandas", pl="polars", ibis="ibis") # doctest: +SKIP
{
"pd": <module 'pandas' from '...'>,
"pl": <module 'polars' from '...'>,
"ibis": None,
}
If the user later imports ``ibis``, it would appear in subsequent calls.
>>> import ibis # doctest: +SKIP
>>>
>>> _get_optional_modules(ibis="ibis") # doctest: +SKIP
{
"ibis": <module 'ibis' from '...'>,
}
"""
return {k: sys.modules.get(v) for k, v in modules.items()}


def _replace_parsed_shorthand(
parsed_shorthand: dict[str, Any], kwds: dict[str, Any]
) -> dict[str, Any]:
"""
`parsed_shorthand` is added by `FieldChannelMixin`.
It's used below to replace shorthand with its long form equivalent
`parsed_shorthand` is removed from `context` if it exists so that it is
not passed to child `to_dict` function calls.
"""
# Prevent that pandas categorical data is automatically sorted
# when a non-ordinal data type is specifed manually
# or if the encoding channel does not support sorting
if "sort" in parsed_shorthand and (
"sort" not in kwds or kwds["type"] not in {"ordinal", Undefined}
):
parsed_shorthand.pop("sort")

kwds.update(
(k, v)
for k, v in parsed_shorthand.items()
if kwds.get(k, Undefined) is Undefined
)
return kwds


TSchemaBase = TypeVar("TSchemaBase", bound=SchemaBase)

_CopyImpl = TypeVar("_CopyImpl", SchemaBase, Dict[Any, Any], List[Any])
Expand Down
Loading

0 comments on commit 98d9fb8

Please sign in to comment.