Skip to content

Commit

Permalink
hash get_object_state, use dir for native types, fix __slots__ (#208)
Browse files Browse the repository at this point in the history
Fix #207

See #207 for some discussion.

Note that this slightly changes the behavior, but only for cases which were basically broken before.

E.g., for `functools.partial`, before this change, none of the relevant args (`func`, `args`, `keywords`) were ever part of the hash. So this change should not really break anything.

The same is also true for cases where there was both a `__dict__` and `__slots__`. With Python <=3.10, it would have used only the `__dict__`, and very likely, that was just empty, and does not contain any of the relevant things (which are very likely all in the `__slots__`), so it was totally broken and basically unusable. With Python >=3.11, there is a default `__getstate__` now, which would already do the correct thing, at least for "normal" (non-native) types, i.e. merge `__dict__` and `__slots__`, i.e. our existing `get_object_state` was correct for those cases. With this change, Python <=3.10 is also correct now.

In any case, we check through `dir` now, and we check for member descriptors, which should also cover such native types (like `functools.partial`).
  • Loading branch information
albertz authored Oct 11, 2024
1 parent c7de85e commit 9e597db
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 8 deletions.
73 changes: 65 additions & 8 deletions sisyphus/hash.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import enum
import hashlib
from inspect import isclass, isfunction
import pathlib
from inspect import isclass, isfunction, ismemberdescriptor


def md5(obj):
Expand Down Expand Up @@ -43,6 +44,13 @@ def get_object_state(obj):
Comment: Maybe obj.__reduce__() is a better idea? is it stable for hashing?
"""

if isinstance(obj, pathlib.PurePath):
# pathlib paths have a somewhat technical internal state
# ('_drv', '_root', '_parts', '_str', '_hash', '_pparts', '_cached_cparts'),
# so we don't want to rely on this, but instead just use the string representation as state.
# https://github.com/rwth-i6/sisyphus/pull/208#issuecomment-2405560718
return str(obj)

if hasattr(obj, "__getnewargs_ex__"):
args = obj.__getnewargs_ex__()
elif hasattr(obj, "__getnewargs__"):
Expand All @@ -52,15 +60,19 @@ def get_object_state(obj):

if hasattr(obj, "__sis_state__"):
state = obj.__sis_state__()
elif hasattr(obj, "__getstate__"):
# Note: Since Python 3.11, there is a default object.__getstate__.
# However, this default object.__getstate__ is not correct for some native types, e.g. _functools.partial.
# https://github.com/rwth-i6/sisyphus/issues/207
# https://github.com/python/cpython/issues/125094
# Thus, only use __getstate__ if it is not the default object.__getstate__.
elif hasattr(obj, "__getstate__") and obj.__class__.__getstate__ is not getattr(object, "__getstate__", None):
state = obj.__getstate__()
elif hasattr(obj, "__dict__"):
state = obj.__dict__
elif hasattr(obj, "__slots__"):
state = {k: getattr(obj, k) for k in obj.__slots__ if hasattr(obj, k)}
else:
assert args is not None, "Failed to get object state of: %s" % repr(obj)
state = None
state = _getmembers(obj)
if not state and not hasattr(obj, "__dict__") and not hasattr(obj, "__slots__"):
# Keep compatibility with old behavior.
assert args is not None, "Failed to get object state of: %s" % repr(obj)
state = None

if isinstance(obj, enum.Enum):
assert isinstance(state, dict)
Expand Down Expand Up @@ -129,3 +141,48 @@ def _obj_type_qualname(obj) -> bytes:
# In Python >=3.11, keep hash same as in Python <=3.10, https://github.com/rwth-i6/sisyphus/issues/188
return b"EnumMeta"
return type(obj).__qualname__.encode()


def _getmembers(obj):
res = {}
if hasattr(obj, "__dict__"):
res.update(obj.__dict__)
if hasattr(obj, "__slots__"):
for key in obj.__slots__:
try:
res[key] = getattr(obj, key)
except AttributeError:
pass
# Note, there are cases where `__dict__` or `__slots__` don't contain all attributes,
# e.g. for some native types, e.g. _functools.partial.
# (https://github.com/rwth-i6/sisyphus/issues/207)
# `dir()` usually still lists those attributes.
# However, to keep the behavior as before, we only want to return the object attributes here,
# not the class attributes.
cls_dict = {}
for cls in reversed(type(obj).__mro__):
if getattr(cls, "__dict__", None):
cls_dict.update(cls.__dict__)
for key in dir(obj):
if key.startswith("__"):
continue
if key in res:
continue
# Get class attribute first, to maybe skip descriptors.
if key in cls_dict:
cls_value = cls_dict[key]
if hasattr(cls_value, "__get__"): # descriptor
# descriptor are e.g. properties, bound methods, etc. We don't want to have those.
# But member descriptors are usually for slots (even for native types without __slots__),
# so that is why we keep them.
if not ismemberdescriptor(cls_value):
continue
try:
value = getattr(obj, key)
except AttributeError:
# dir might not be reliable. just skip this
continue
if key in cls_dict and cls_dict[key] is value:
continue # this is a class attribute
res[key] = value
return res
21 changes: 21 additions & 0 deletions tests/hash_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,27 @@ def test_enum(self):
+ b" (tuple, (str, '_name_'), (str, 'Entry1')), (tuple, (str, '_value_'), (int, 1))))",
)

def test_functools_partial(self):
from functools import partial

obj = partial(int, 42)
self.assertEqual(
sis_hash_helper(obj),
(
b"(partial, (dict,"
b" (tuple, (str, 'args'), (tuple, (int, 42))),"
b" (tuple, (str, 'func'), (type,"
b" (tuple, (str, 'builtins'), (str, 'int')))),"
b" (tuple, (str, 'keywords'), (dict))))"
),
)

def test_pathlib_Path(self):
from pathlib import Path

obj = Path("/etc/passwd")
self.assertEqual(sis_hash_helper(obj), b"(PosixPath, (str, '/etc/passwd'))")


if __name__ == "__main__":
unittest.main()

0 comments on commit 9e597db

Please sign in to comment.