Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

correctly encode/decode _FillValues/missing_values/dtypes for packed data #8713

Merged
merged 19 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ Bug fixes
lead to integer overflow or unsafe conversion from floating point to integer
values (:issue:`8542`, :pull:`8575`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- CF conform handling of `_FillValue`/`missing_value` and `dtype` in
`CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`,
:issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
74 changes: 62 additions & 12 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,10 @@ def encode(self, variable: Variable, name: T_Name = None):
data = duck_array_ops.fillna(data, fill_value)

if mv_exists:
# Use _FillValue if available to align missing_value to prevent issues
# when decoding
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and not fv_exists:
if is_time_like and data.dtype.kind in "iu":
Expand Down Expand Up @@ -322,7 +324,13 @@ def decode(self, variable: Variable, name: T_Name = None):
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

if encoded_fill_values:
transform = partial(
Expand All @@ -347,20 +355,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
return data


def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
def _choose_float_dtype(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function checks for the most appropriate dtype to use when encoding/decoding in CFScaleOffsetCoder.

dtype: np.dtype, mapping: MutableMapping
) -> type[np.floating[Any]]:
"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
# check scale/offset first to derive wanted float dtype
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
scale_factor = mapping.get("scale_factor")
add_offset = mapping.get("add_offset")
if scale_factor is not None or add_offset is not None:
# get the type from scale_factor/add_offset to determine
# the needed floating point type
if scale_factor is not None:
scale_type = type(scale_factor)
if add_offset is not None:
offset_type = type(add_offset)
# CF conforming, both scale_factor and add_offset are given and
# of same floating point type (float32/64)
if (
add_offset is not None
and scale_factor is not None
and offset_type == scale_type
and scale_type in [np.float32, np.float64]
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
):
# in case of int32 -> we need upcast to float64
# due to precision issues
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
return np.float64
return np.dtype(scale_type).type
# Not CF conforming and add_offset given:
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if add_offset is not None:
return np.float64
# return dtype depending on given scale_factor
return np.dtype(scale_type).type
# If no scale_factor or add_offset is given, use some general rules.
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
return np.float32
# float32 can exactly represent all integers up to 24 bits
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
return np.float32
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
return np.float64
Expand All @@ -377,7 +416,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
# if we have a _FillValue/missing_value we do not want to cast now
# but leave that to CFMaskCoder
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(data.dtype, encoding)
# but still we need a copy to prevent changing the original data
data = duck_array_ops.astype(data, dtype=dtype, copy=True)
if "add_offset" in encoding:
data -= pop_to(encoding, attrs, "add_offset", name=name)
Expand All @@ -393,11 +437,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
add_offset = np.asarray(add_offset).item()
# if we have a _FillValue/missing_value we already have the wanted
# floating point dtype here (via CFMaskCoder), so no check is necessary
# only check in other cases
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(dtype, encoding)

transform = partial(
_scale_offset_decoding,
scale_factor=scale_factor,
Expand Down
75 changes: 42 additions & 33 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,96 +142,100 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
)


def create_masked_and_scaled_data() -> Dataset:
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
def create_masked_and_scaled_data(dtype: np.dtype) -> Dataset:
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
encoding = {
"_FillValue": -1,
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
"dtype": "i2",
}
return Dataset({"x": ("t", x, {}, encoding)})


def create_encoded_masked_and_scaled_data() -> Dataset:
attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset:
attributes = {
"_FillValue": -1,
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
return Dataset(
{"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
)


def create_unsigned_masked_scaled_data() -> Dataset:
def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
encoding = {
"_FillValue": 255,
"_Unsigned": "true",
"dtype": "i1",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
return Dataset({"x": ("t", x, {}, encoding)})


def create_encoded_unsigned_masked_scaled_data() -> Dataset:
def create_encoded_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {
"_FillValue": -1,
"_Unsigned": "true",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
# Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
return Dataset({"x": ("t", sb, attributes)})


def create_bad_unsigned_masked_scaled_data() -> Dataset:
def create_bad_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
encoding = {
"_FillValue": 255,
"_Unsigned": True,
"dtype": "i1",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
return Dataset({"x": ("t", x, {}, encoding)})


def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {
"_FillValue": -1,
"_Unsigned": True,
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
return Dataset({"x": ("t", sb, attributes)})


def create_signed_masked_scaled_data() -> Dataset:
def create_signed_masked_scaled_data(dtype: np.dtype) -> Dataset:
encoding = {
"_FillValue": -127,
"_Unsigned": "false",
"dtype": "i1",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
return Dataset({"x": ("t", x, {}, encoding)})


def create_encoded_signed_masked_scaled_data() -> Dataset:
def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset:
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {
"_FillValue": -127,
"_Unsigned": "false",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype.type(10),
"scale_factor": dtype.type(0.1),
}
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([-110, 1, 127, -127], dtype="i1")
Expand Down Expand Up @@ -889,10 +893,12 @@ def test_roundtrip_empty_vlen_string_array(self) -> None:
(create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
],
)
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
decoded = decoded_fn()
encoded = encoded_fn()

@pytest.mark.parametrize("dtype", [np.dtype("float64"), np.dtype("float32")])
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
if hasattr(self, "zarr_version") and dtype == np.float32:
pytest.skip("float32 will be treated as float64 in zarr")
decoded = decoded_fn(dtype)
encoded = encoded_fn(dtype)
with self.roundtrip(decoded) as actual:
for k in decoded.variables:
assert decoded.variables[k].dtype == actual.variables[k].dtype
Expand All @@ -912,7 +918,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:

# make sure roundtrip encoding didn't change the
# original dataset.
assert_allclose(encoded, encoded_fn(), decode_bytes=False)
assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)

with self.roundtrip(encoded) as actual:
for k in decoded.variables:
Expand Down Expand Up @@ -1645,6 +1651,7 @@ def test_mask_and_scale(self) -> None:
v.add_offset = 10
v.scale_factor = 0.1
v[:] = np.array([-1, -1, 0, 1, 2])
dtype = type(v.scale_factor)

# first make sure netCDF4 reads the masked and scaled data
# correctly
Expand All @@ -1653,11 +1660,13 @@ def test_mask_and_scale(self) -> None:
[-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]
)
actual = nc.variables["x"][:]
print(actual.dtype, expected.dtype)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
assert_array_equal(expected, actual)

# now check xarray
with open_dataset(tmp_file) as ds:
expected = create_masked_and_scaled_data()
expected = create_masked_and_scaled_data(np.dtype(dtype))
print(ds.variables["x"].dtype, expected.variables["x"].dtype)
assert_identical(expected, ds)

def test_0dimensional_variable(self) -> None:
Expand Down
15 changes: 8 additions & 7 deletions xarray/tests/test_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,8 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None
assert encoded.dtype == encoded.attrs["missing_value"].dtype
assert encoded.dtype == encoded.attrs["_FillValue"].dtype

with pytest.warns(variables.SerializationWarning):
roundtripped = decode_cf_variable("foo", encoded)
assert_identical(roundtripped, original)
roundtripped = decode_cf_variable("foo", encoded)
assert_identical(roundtripped, original)


def test_CFMaskCoder_missing_value() -> None:
Expand Down Expand Up @@ -96,16 +95,18 @@ def test_coder_roundtrip() -> None:


@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split())
def test_scaling_converts_to_float32(dtype) -> None:
@pytest.mark.parametrize("dtype2", "f4 f8".split())
def test_scaling_converts_to_float(dtype: str, dtype2: str) -> None:
dt = np.dtype(dtype2)
original = xr.Variable(
("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10)
("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=dt.type(10))
)
coder = variables.CFScaleOffsetCoder()
encoded = coder.encode(original)
assert encoded.dtype == np.float32
assert encoded.dtype == dt
roundtripped = coder.decode(encoded)
assert_identical(original, roundtripped)
assert roundtripped.dtype == np.float32
assert roundtripped.dtype == dt


@pytest.mark.parametrize("scale_factor", (10, [10]))
Expand Down
Loading