Skip to content

Commit

Permalink
Merge pull request #366 from davidhassell/dask-hash
Browse files Browse the repository at this point in the history
dask: `Data.__hash__`
  • Loading branch information
davidhassell authored Apr 1, 2022
2 parents 530ca16 + 8299db6 commit 906e437
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 99 deletions.
54 changes: 1 addition & 53 deletions cf/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from ..functions import chunksize as cf_chunksize
from ..functions import default_netCDF_fillvals
from ..functions import fm_threshold as cf_fm_threshold
from ..functions import free_memory, hash_array
from ..functions import free_memory
from ..functions import inspect as cf_inspect
from ..functions import log_level, parse_indices, pathjoin
from ..functions import rtol as cf_rtol
Expand Down Expand Up @@ -812,58 +812,6 @@ def __data__(self):
"""Returns a new reference to self."""
return self

def __hash__(self):
    """The built-in function `hash`.

    Generating the hash temporarily realizes the entire array in
    memory, which may not be possible for large arrays.

    The hash value is dependent on the data-type and shape of the
    data array. If the array is a masked array then the hash value
    is independent of the fill value and of data array values
    underlying any masked elements.

    The hash value may be different if regenerated after the data
    array has been changed in place.

    The hash value is not guaranteed to be portable across versions
    of Python, numpy and cf.

    :Returns:

        `int`
            The hash value, as returned by `hash_array` applied to
            the realized array.

    **Examples:**

    >>> print(d.array)
    [[0 1 2 3]]
    >>> hash(d)
    -8125230271916303273

    >>> d[0, 1] = numpy.ma.masked
    >>> print(d.array)
    [[0 -- 2 3]]
    >>> hash(d)
    791917586613573563

    >>> d.hardmask = False
    >>> d[0, 1] = 999
    >>> d[0, 1] = numpy.ma.masked
    >>> hash(d)
    791917586613573563

    >>> d.squeeze()
    >>> print(d.array)
    [0 -- 2 3]
    >>> hash(d)
    -7007538450787927902

    >>> d.dtype = float
    >>> print(d.array)
    [0.0 -- 2.0 3.0]
    >>> hash(d)
    -4816859207969696442

    """
    # `self.array` brings the whole array into memory; all hashing
    # logic is delegated to `hash_array`.
    return hash_array(self.array)

@daskified(_DASKIFIED_VERBOSE)
def __float__(self):
"""Called to implement the built-in function `float`
Expand Down
62 changes: 62 additions & 0 deletions cf/data/mixin/deprecations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,68 @@
class DataClassDeprecationsMixin:
"""Deprecated attributes and methods for the Data class."""

def __hash__(self):
    """The built-in function `hash`.

    Deprecated at version TODODASK. Consider using the
    `cf.hash_array` function instead.

    NOTE(review): this method only calls
    `_DEPRECATION_ERROR_METHOD` and has no explicit ``return``;
    presumably that helper raises, so no hash value is produced any
    more -- confirm against the helper. The description and
    examples below describe hashing of the data array as it was
    documented before the deprecation.

    Generating the hash temporarily realizes the entire array in
    memory, which may not be possible for large arrays.

    The hash value is dependent on the data-type and shape of the
    data array. If the array is a masked array then the hash value
    is independent of the fill value and of data array values
    underlying any masked elements.

    The hash value may be different if regenerated after the data
    array has been changed in place.

    The hash value is not guaranteed to be portable across versions
    of Python, numpy and cf.

    :Returns:

        `int`
            The hash value.

    **Examples**

    >>> print(d.array)
    [[0 1 2 3]]
    >>> hash(d)
    -8125230271916303273

    >>> d[0, 1] = numpy.ma.masked
    >>> print(d.array)
    [[0 -- 2 3]]
    >>> hash(d)
    791917586613573563

    >>> d.hardmask = False
    >>> d[0, 1] = 999
    >>> d[0, 1] = numpy.ma.masked
    >>> hash(d)
    791917586613573563

    >>> d.squeeze()
    >>> print(d.array)
    [0 -- 2 3]
    >>> hash(d)
    -7007538450787927902

    >>> d.dtype = float
    >>> print(d.array)
    [0.0 -- 2.0 3.0]
    >>> hash(d)
    -4816859207969696442

    """
    # Report the deprecation; the message points users at the
    # module-level replacement function.
    _DEPRECATION_ERROR_METHOD(
        self,
        "__hash__",
        message="Consider using 'cf.hash_array' on the underlying "
        "array instead.",
        version="TODODASK",
        removed_at="5.0.0",
    )

@property
def Data(self):
"""Deprecated at version 3.0.0, use attribute `data` instead."""
Expand Down
86 changes: 40 additions & 46 deletions cf/functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import atexit
import csv
import ctypes.util
import hashlib
import importlib
import os
import platform
Expand All @@ -10,8 +11,7 @@
import urllib.parse
import warnings
from collections.abc import Iterable
from hashlib import md5 as hashlib_md5
from marshal import dumps as marshal_dumps
from marshal import dumps
from math import ceil as math_ceil
from numbers import Integral
from os import getpid, listdir, mkdir
Expand All @@ -23,14 +23,12 @@
from os.path import relpath as _os_path_relpath

import cfdm

# import cPickle
import netCDF4
import numpy as np
from numpy import __file__ as _numpy__file__
from numpy import __version__ as _numpy__version__
from numpy import all as _numpy_all
from numpy import allclose as _x_numpy_allclose
from numpy import ascontiguousarray as _numpy_ascontiguousarray
from numpy import isclose as _x_numpy_isclose
from numpy import shape as _numpy_shape
from numpy import take as _numpy_take
Expand Down Expand Up @@ -2611,80 +2609,76 @@ def pathjoin(path1, path2):
return _os_path_join(path1, path2)


def hash_array(array, algorithm=hashlib.sha1):
    """Return a hash value of a numpy array.

    The hash value is dependent on the data type and the shape of the
    array. If the array is a masked array then the hash value is
    independent of the fill value and of data array values underlying
    any masked elements.

    The hash value is not guaranteed to be portable across versions of
    Python, numpy and cf. In addition, the digest is finally passed
    through the built-in `hash`, which Python salts per process for
    `bytes` objects unless ``PYTHONHASHSEED`` is fixed, so values are
    only comparable within a single interpreter session and the
    numbers shown in the examples are illustrative.

    :Parameters:

        array: `numpy.ndarray`
            The numpy array to be hashed. May be a masked array.

        algorithm: `hashlib` constructor function
            Constructor function for the desired hash algorithm,
            e.g. `hashlib.md5`, `hashlib.sha256`, etc.

            .. versionadded:: TODODASK

    :Returns:

        `int`
            The hash value.

    **Examples**

    >>> a = np.array([[0, 1, 2, 3]])
    >>> cf.hash_array(a)
    -5620332080097671134

    >>> a = np.ma.array([[0, 1, 2, 3]], mask=[[0, 1, 0, 0]])
    >>> cf.hash_array(a)
    8372868545804866378

    >>> a[0, 1] = 999
    >>> a[0, 1] = np.ma.masked
    >>> print(a)
    [[0 -- 2 3]]
    >>> print(a.data)
    [[  0 999   2   3]]
    >>> cf.hash_array(a)
    8372868545804866378

    >>> a = a.astype(float)
    >>> cf.hash_array(a)
    5950106833921144220

    """
    h = algorithm()

    # Data type and shape are part of the hash, so arrays holding the
    # same bytes with a different dtype or shape hash differently.
    h.update(dumps(array.dtype.name))
    h.update(dumps(array.shape))

    if np.ma.isMA(array):
        if np.ma.is_masked(array):
            # At least one element is masked: hash the mask itself,
            # then hash the data with masked elements overwritten by
            # the dtype's default fill value, so that neither the
            # array's own fill value nor the values hidden under the
            # mask influence the result.
            mask = array.mask
            if not mask.flags.c_contiguous:
                # hashlib needs a C-contiguous buffer
                mask = np.ascontiguousarray(mask)

            h.update(mask)

            # Work on a copy so the caller's fill value is untouched
            array = array.copy()
            array.set_fill_value()
            array = array.filled()
        else:
            # Masked-array type with nothing masked: hash the plain
            # underlying data
            array = array.data

    if not array.flags.c_contiguous:
        # hashlib needs a C-contiguous buffer
        array = np.ascontiguousarray(array)

    h.update(array)

    return hash(h.digest())

Expand Down
19 changes: 19 additions & 0 deletions cf/test/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import sys
import unittest

import numpy as np

faulthandler.enable() # to debug seg faults and timeouts

import cf
Expand Down Expand Up @@ -312,6 +314,23 @@ def test_environment(self):
]:
self.assertIn(component, ep)

def test_hash_array(self):
    """Test `cf.hash_array` on a non-contiguous masked array."""
    import hashlib

    # Transposing makes both the data and the mask non-C-contiguous,
    # which exercises the contiguity handling inside cf.hash_array.
    a = np.ma.array([[0, 1, 2, 3], [0, 1, 2, 3]])
    a[0, 0] = np.ma.masked
    a = a.transpose()

    self.assertFalse(a.flags.c_contiguous)
    self.assertFalse(a.mask.flags.c_contiguous)

    h = cf.hash_array(a)
    self.assertIsInstance(h, int)

    # A different algorithm gives a different hash
    self.assertNotEqual(cf.hash_array(a, algorithm=hashlib.sha256), h)

    # The hash is independent of the fill value
    a.set_fill_value(a.fill_value + 1)
    self.assertEqual(cf.hash_array(a), h)

    # The hash is independent of the value underlying a masked
    # element
    b = a.copy()
    b.data[0, 0] = 999
    self.assertEqual(cf.hash_array(b), h)

    # The hash depends on the data type
    self.assertNotEqual(cf.hash_array(a.astype(float)), h)


if __name__ == "__main__":
print("Run date:", datetime.datetime.now())
Expand Down

0 comments on commit 906e437

Please sign in to comment.