diff --git a/cf/data/data.py b/cf/data/data.py index c93d48f7d4..45e7f9422a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -40,7 +40,7 @@ from ..functions import chunksize as cf_chunksize from ..functions import default_netCDF_fillvals from ..functions import fm_threshold as cf_fm_threshold -from ..functions import free_memory, hash_array +from ..functions import free_memory from ..functions import inspect as cf_inspect from ..functions import log_level, parse_indices, pathjoin from ..functions import rtol as cf_rtol @@ -812,58 +812,6 @@ def __data__(self): """Returns a new reference to self.""" return self - def __hash__(self): - """The built-in function `hash` - - Generating the hash temporarily realizes the entire array in - memory, which may not be possible for large arrays. - - The hash value is dependent on the data-type and shape of the data - array. If the array is a masked array then the hash value is - independent of the fill value and of data array values underlying - any masked elements. - - The hash value may be different if regenerated after the data - array has been changed in place. - - The hash value is not guaranteed to be portable across versions of - Python, numpy and cf. - - :Returns: - - `int` - The hash value. 
- - **Examples:** - - >>> print(d.array) - [[0 1 2 3]] - >>> d.hash() - -8125230271916303273 - >>> d[1, 0] = numpy.ma.masked - >>> print(d.array) - [[0 -- 2 3]] - >>> hash(d) - 791917586613573563 - >>> d.hardmask = False - >>> d[0, 1] = 999 - >>> d[0, 1] = numpy.ma.masked - >>> d.hash() - 791917586613573563 - >>> d.squeeze() - >>> print(d.array) - [0 -- 2 3] - >>> hash(d) - -7007538450787927902 - >>> d.dtype = float - >>> print(d.array) - [0.0 -- 2.0 3.0] - >>> hash(d) - -4816859207969696442 - - """ - return hash_array(self.array) - @daskified(_DASKIFIED_VERBOSE) def __float__(self): """Called to implement the built-in function `float` diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 205e016333..3d4327c672 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -7,6 +7,68 @@ class DataClassDeprecationsMixin: """Deprecated attributes and methods for the Data class.""" + def __hash__(self): + """The built-in function `hash`. + + Deprecated at version TODODASK. Consider using the + `cf.hash_array` function instead. + + Generating the hash temporarily realizes the entire array in + memory, which may not be possible for large arrays. + + The hash value is dependent on the data-type and shape of the data + array. If the array is a masked array then the hash value is + independent of the fill value and of data array values underlying + any masked elements. + + The hash value may be different if regenerated after the data + array has been changed in place. + + The hash value is not guaranteed to be portable across versions of + Python, numpy and cf. + + :Returns: + + `int` + The hash value. 
+ + **Examples** + + >>> print(d.array) + [[0 1 2 3]] + >>> d.hash() + -8125230271916303273 + >>> d[0, 1] = numpy.ma.masked + >>> print(d.array) + [[0 -- 2 3]] + >>> hash(d) + 791917586613573563 + >>> d.hardmask = False + >>> d[0, 1] = 999 + >>> d[0, 1] = numpy.ma.masked + >>> d.hash() + 791917586613573563 + >>> d.squeeze() + >>> print(d.array) + [0 -- 2 3] + >>> hash(d) + -7007538450787927902 + >>> d.dtype = float + >>> print(d.array) + [0.0 -- 2.0 3.0] + >>> hash(d) + -4816859207969696442 + + """ + _DEPRECATION_ERROR_METHOD( + self, + "__hash__", + message="Consider using 'cf.hash_array' on the underlying " + "array instead.", + version="TODODASK", + removed_at="5.0.0", + ) + @property def Data(self): """Deprecated at version 3.0.0, use attribute `data` instead.""" diff --git a/cf/functions.py b/cf/functions.py index 070f30f84e..d6a76cfc0e 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1,6 +1,7 @@ import atexit import csv import ctypes.util +import hashlib import importlib import os import platform @@ -10,8 +11,7 @@ import urllib.parse import warnings from collections.abc import Iterable -from hashlib import md5 as hashlib_md5 -from marshal import dumps as marshal_dumps +from marshal import dumps from math import ceil as math_ceil from numbers import Integral from os import getpid, listdir, mkdir @@ -23,14 +23,12 @@ from os.path import relpath as _os_path_relpath import cfdm - -# import cPickle import netCDF4 +import numpy as np from numpy import __file__ as _numpy__file__ from numpy import __version__ as _numpy__version__ from numpy import all as _numpy_all from numpy import allclose as _x_numpy_allclose -from numpy import ascontiguousarray as _numpy_ascontiguousarray from numpy import isclose as _x_numpy_isclose from numpy import shape as _numpy_shape from numpy import take as _numpy_take @@ -2611,69 +2609,66 @@ def pathjoin(path1, path2): return _os_path_join(path1, path2) -def hash_array(array): - """Return the hash value of a numpy array.
+def hash_array(array, algorithm=hashlib.sha1): + """Return a hash value of a numpy array. - The hash value is dependent on the data type, shape of the data + The hash value is dependent on the data type and the shape of the array. If the array is a masked array then the hash value is independent of the fill value and of data array values underlying any masked elements. - The hash value is not guaranteed to be portable across versions of - Python, numpy and cf. - :Parameters: array: `numpy.ndarray` The numpy array to be hashed. May be a masked array. + algorithm: `hashlib` constructor function + Constructor function for the desired hash algorithm, + e.g. `hashlib.md5`, `hashlib.sha256`, etc. + + .. versionadded:: TODODASK + :Returns: `int` The hash value. - **Examples:** + **Examples** + + >>> a = np.array([[0, 1, 2, 3]]) + >>> cf.hash_array(a) + -5620332080097671134 - >>> print(array) - [[0 1 2 3]] + >>> a = np.ma.array([[0, 1, 2, 3]], mask=[[0, 1, 0, 0]]) - >>> cf.hash_array(array) + >>> cf.hash_array(a) - -8125230271916303273 - >>> array[1, 0] = numpy.ma.masked - >>> print(array) + 8372868545804866378 + + >>> a[0, 1] = 999 + >>> a[0, 1] = np.ma.masked + >>> print(a) [[0 -- 2 3]] - >>> cf.hash_array(array) - 791917586613573563 - >>> array.hardmask = False - >>> array[0, 1] = 999 - >>> array[0, 1] = numpy.ma.masked - >>> cf.hash_array(array) - 791917586613573563 - >>> array.squeeze() - >>> print(array) - [0 -- 2 3] - >>> cf.hash_array(array) - -7007538450787927902 - >>> array.dtype = float - >>> print(array) - [0.0 -- 2.0 3.0] - >>> cf.hash_array(array) - -4816859207969696442 + >>> print(a.data) + [[ 0 999 2 3]] + >>> cf.hash_array(a) + 8372868545804866378 - """ - h = hashlib_md5() + >>> a = a.astype(float) + >>> cf.hash_array(a) + 5950106833921144220 - h_update = h.update + """ + h = algorithm() - h_update(marshal_dumps(array.dtype.name)) - h_update(marshal_dumps(array.shape)) + h.update(dumps(array.dtype.name)) + h.update(dumps(array.shape)) - if _numpy_ma_isMA(array): - if
_numpy_ma_is_masked(array): + if np.ma.isMA(array): + if np.ma.is_masked(array): mask = array.mask if not mask.flags.c_contiguous: - mask = _numpy_ascontiguousarray(mask) + mask = np.ascontiguousarray(mask) - h_update(mask) + h.update(mask) array = array.copy() array.set_fill_value() array = array.filled() @@ -2681,10 +2676,9 @@ def hash_array(array): array = array.data if not array.flags.c_contiguous: - # array = array.copy() - array = _numpy_ascontiguousarray(array) + array = np.ascontiguousarray(array) - h_update(array) + h.update(array) return hash(h.digest()) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 98fb157c1d..0ba1db53b0 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -5,6 +5,8 @@ import sys import unittest +import numpy as np + faulthandler.enable() # to debug seg faults and timeouts import cf @@ -312,6 +314,23 @@ def test_environment(self): ]: self.assertIn(component, ep) + def test_hash_array(self): + import hashlib + + a = np.ma.array([[0, 1, 2, 3], [0, 1, 2, 3]]) + a[0, 0] = np.ma.masked + a = a.transpose() + + self.assertFalse(a.flags.c_contiguous) + self.assertFalse(a.mask.flags.c_contiguous) + + h = cf.hash_array(a) + self.assertIsInstance(h, int) + self.assertNotEqual(cf.hash_array(a, algorithm=hashlib.sha256), h) + + a.set_fill_value(a.fill_value + 1) + self.assertEqual(cf.hash_array(a), h) + if __name__ == "__main__": print("Run date:", datetime.datetime.now())