Adding in support for datetime64('NaT') (#258)
* Speed up datetime to cdf time
* Added support to convert nan to fillvals in xarray_to_cdf
bryan-harter authored May 10, 2024
1 parent d94252c commit 6ec09a7
Showing 3 changed files with 161 additions and 54 deletions.
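
For context, a minimal round-trip sketch of the headline change ("demo.cdf" and the variable names are hypothetical; nan_to_fillval is the option this commit adds, the other keyword arguments are from the pre-existing cdflib API as I understand it):

    import numpy as np
    import xarray as xr
    from cdflib.xarray import cdf_to_xarray, xarray_to_cdf

    # A variable named "epoch" is cast to CDF_TIME_TT2000; with nan_to_fillval=True
    # (the new default) the NaT entry is written out as the standard CDF fill value.
    ds = xr.Dataset({"epoch": ("record0", np.array(["2024-05-10", "NaT"], dtype="datetime64[ns]"))})
    xarray_to_cdf(ds, "demo.cdf", istp=False, nan_to_fillval=True)

    # Reading back as datetimes restores the NaT.
    ds2 = cdf_to_xarray("demo.cdf", to_datetime=True, fillval_to_nan=True)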
72 changes: 52 additions & 20 deletions cdflib/epochs.py
@@ -154,6 +154,7 @@ def breakdown(epochs: epoch_types) -> np.ndarray:

@staticmethod
def _compose_date(
nat_positions: npt.NDArray,
years: npt.NDArray,
months: npt.NDArray,
days: npt.NDArray,
@@ -174,18 +175,29 @@ def _compose_date(
vals = (v for v in (years, months, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) if v is not None)

arrays: List[npt.NDArray[np.datetime64]] = [np.array(v, dtype=t) for t, v in zip(types, vals)]
return np.array(sum(arrays))
total_datetime = np.array(sum(arrays))
total_datetime = np.where(nat_positions, np.datetime64("NaT"), total_datetime)
return total_datetime

@classmethod
def to_datetime(cls, cdf_time: epoch_types) -> npt.NDArray[np.datetime64]:
"""
Converts CDF epoch argument to numpy.datetime64. This method
converts a scalar, or array-like. Precision is only kept to the
nearest nanosecond.
Converts CDF epoch argument to numpy.datetime64.
Parameters:
    cdf_time: NumPy scalar or array to convert. np.int64 values are treated as CDF_TIME_TT2000, np.complex128 as CDF_EPOCH16, and floats as CDF_EPOCH.
Notes:
    Because of datetime64 limitations, CDF_EPOCH16 precision is only kept to the nearest nanosecond.
"""
times = cls.breakdown(cdf_time)
times = np.atleast_2d(times)
return cls._compose_date(*times.T[:9]).astype("datetime64[ns]")

fillval_locations = np.all((times[:, 0:7] == [9999, 12, 31, 23, 59, 59, 999]), axis=1)
padval_locations = np.all((times[:, 0:7] == [0, 1, 1, 0, 0, 0, 0]), axis=1)
nan_locations = np.logical_or(fillval_locations, padval_locations)
return cls._compose_date(nan_locations, *times.T[:9]).astype("datetime64[ns]")
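
A quick sketch of the effect (assuming this commit's behavior): the TT2000 fill value now surfaces as NaT instead of the 9999-12-31 fill date.

    import numpy as np
    from cdflib.epochs import CDFepoch

    ok = CDFepoch.compute_tt2000([2024, 5, 10, 0, 0, 0, 0, 0, 0])
    out = CDFepoch.to_datetime(np.array([ok, -9223372036854775808], dtype=np.int64))
    # out[0] -> numpy.datetime64('2024-05-10T00:00:00.000000000')
    # out[1] -> numpy.datetime64('NaT')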

@staticmethod
def unixtime(cdf_time: npt.ArrayLike) -> Union[float, npt.NDArray]:
@@ -455,18 +467,26 @@ def breakdown_tt2000(tt2000: cdf_tt2000_type) -> np.ndarray:
"""
Breaks down the epoch(s) into UTC components.
For CDF_EPOCH:
they are 7 date/time components: year, month, day,
hour, minute, second, and millisecond
For CDF_EPOCH16:
they are 10 date/time components: year, month, day,
hour, minute, second, and millisecond, microsecond,
nanosecond, and picosecond.
For TT2000:
they are 9 date/time components: year, month, day,
hour, minute, second, millisecond, microsecond,
nanosecond.
Calculate date and time from cdf_time_tt2000 integers.
Parameters
----------
tt2000 : array-like
    Single, list, tuple, or np.array of tt2000 values
Returns
-------
components : ndarray
List or array of date and time values. The last axis contains
(in order): year, month, day, hour, minute, second, millisecond,
microsecond, and nanosecond
Notes
-----
If a bad epoch is supplied, a fill date of 9999-12-31 23:59:59 and 999 ms, 999 us, and
999 ns is returned.
"""

new_tt2000 = np.atleast_1d(tt2000).astype(np.longlong)
count = len(new_tt2000)
toutcs = np.zeros((9, count), dtype=int)
Expand Down Expand Up @@ -576,7 +596,14 @@ def breakdown_tt2000(tt2000: cdf_tt2000_type) -> np.ndarray:
toutcs[7, :] = ma1
toutcs[8, :] = na1

return np.squeeze(toutcs.T)
# Check standard fill and pad values
cdf_epoch_time_tt2000 = np.atleast_2d(toutcs.T)
fillval_locations = np.all(cdf_epoch_time_tt2000 == [1707, 9, 22, 12, 12, 10, 961, 224, 192], axis=1)
cdf_epoch_time_tt2000[fillval_locations] = [9999, 12, 31, 23, 59, 59, 999, 999, 999]
padval_locations = np.all(cdf_epoch_time_tt2000 == [1707, 9, 22, 12, 12, 10, 961, 224, 193], axis=1)
cdf_epoch_time_tt2000[padval_locations] = [0, 1, 1, 0, 0, 0, 0, 0, 0]

return np.squeeze(cdf_epoch_time_tt2000)
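
With the remapping above, the raw 1707-09-22 breakdowns of the TT2000 fill and pad values come back as the standard fill and pad dates (a sketch of the assumed behavior, reusing CDFepoch from the snippet above):

    CDFepoch.breakdown_tt2000(-9223372036854775808)  # fill -> [9999, 12, 31, 23, 59, 59, 999, 999, 999]
    CDFepoch.breakdown_tt2000(-9223372036854775807)  # pad  -> [0, 1, 1, 0, 0, 0, 0, 0, 0]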

@staticmethod
def compute_tt2000(datetimes: npt.ArrayLike) -> Union[int, npt.NDArray[np.int64]]:
@@ -1187,7 +1214,10 @@ def breakdown_epoch16(epochs: cdf_epoch16_type) -> npt.NDArray:
components = np.full(shape=cshape, fill_value=[9999, 12, 31, 23, 59, 59, 999, 999, 999, 999])
for i, epoch16 in enumerate(new_epochs):
# Ignore fill values
if (epoch16.real != -1.0e31) or (epoch16.imag != -1.0e31):
if ((epoch16.real != -1.0e31) or (epoch16.imag != -1.0e31)) and not np.isnan(epoch16):
    if (epoch16.real == -1.0e30) or (epoch16.imag == -1.0e30):
components[i] = [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
continue
esec = -epoch16.real if epoch16.real < 0.0 else epoch16.real
efra = -epoch16.imag if epoch16.imag < 0.0 else epoch16.imag

@@ -1518,8 +1548,8 @@ def breakdown_epoch(epochs: cdf_epoch_type) -> np.ndarray:
cshape.append(7)
components = np.full(shape=cshape, fill_value=[9999, 12, 31, 23, 59, 59, 999])
for i, epoch in enumerate(new_epochs):
# Ignore fill values
if epoch != -1.0e31:
# Ignore fill values and NaNs
if (epoch != -1.0e31) and not np.isnan(epoch):
esec = -epoch / 1000.0 if epoch < 0.0 else epoch / 1000.0
date_time = CDFepoch._calc_from_julian(esec, 0.0)

@@ -1530,6 +1560,8 @@ def breakdown_epoch(epochs: cdf_epoch_type) -> np.ndarray:
components = date_time[..., :7]
else:
components[i] = date_time[..., :7]
elif epoch == 0:
components[i] = [0, 1, 1, 0, 0, 0, 0]

return np.squeeze(components)

10 changes: 8 additions & 2 deletions cdflib/xarray/cdf_to_xarray.py
@@ -303,10 +303,16 @@ def _convert_fillvals_to_nan(var_data: npt.NDArray, var_atts: Dict[str, Any], va
):
if new_data.size > 1:
if new_data[new_data == var_atts["FILLVAL"]].size != 0:
new_data[new_data == var_atts["FILLVAL"]] = np.nan
if new_data.dtype.type == np.datetime64:
new_data[new_data == var_atts["FILLVAL"]] = np.datetime64("nat")
else:
new_data[new_data == var_atts["FILLVAL"]] = np.nan
else:
if new_data == var_atts["FILLVAL"]:
new_data = np.array(np.nan)
if new_data.dtype.type == np.datetime64:
new_data[new_data == var_atts["FILLVAL"]] = np.array(np.datetime64("nat"))
else:
new_data[new_data == var_atts["FILLVAL"]] = np.array(np.nan)
return new_data
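
The datetime64 branch exists because a float NaN cannot be assigned into a datetime64 array; the masked assignment has to use NaT instead. A self-contained sketch of the mechanics (note datetime64[ms], since the 9999-12-31 fill date overflows datetime64[ns]):

    import numpy as np

    data = np.array(["2024-05-10", "9999-12-31T23:59:59.999"], dtype="datetime64[ms]")
    fillval = np.datetime64("9999-12-31T23:59:59.999", "ms")
    data[data == fillval] = np.datetime64("nat")
    # data -> ['2024-05-10T00:00:00.000', 'NaT']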


133 changes: 101 additions & 32 deletions cdflib/xarray/xarray_to_cdf.py
@@ -60,14 +60,14 @@
11: np.uint8(255),
12: np.uint16(65535),
14: np.uint32(4294967295),
21: np.float32(-1e30),
22: np.float64(-1e30),
31: np.float64(-1e30),
32: np.complex128(complex(-1e30, -1e30)),
21: np.float32(-1e31),
22: np.float64(-1e31),
31: np.float64(-1e31),
32: np.complex128(complex(-1e31, -1e31)),
33: np.datetime64(-9223372036854775808, "ns"),
41: np.int8(-128),
44: np.float32(-1e30),
45: np.float64(-1e30),
44: np.float32(-1e31),
45: np.float64(-1e31),
51: np.str_(" "),
52: np.str_(" "),
}
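
A side note on the table above: the CDF_TIME_TT2000 entry (type 33) is the int64 minimum, which numpy already uses as its NaT sentinel, so for TT2000 the fill value and NaT are literally the same bit pattern:

    import numpy as np

    np.datetime64(-9223372036854775808, "ns")  # numpy.datetime64('NaT')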
@@ -126,7 +126,7 @@ def _is_istp_epoch_variable(var_name: str) -> Union[re.Match, None]:
return epoch_regex_1.match(var_name.lower()) or epoch_regex_2.match(var_name.lower())


def _dtype_to_cdf_type(var: xr.Dataset, terminate_on_warning: bool = False) -> Tuple[int, int]:
def _dtype_to_cdf_type(var: xr.DataArray, terminate_on_warning: bool = False) -> Tuple[int, int]:
# Determines which CDF types to cast the xarray.Dataset to

# Set some defaults
@@ -140,7 +140,7 @@ def _dtype_to_cdf_type(var: xr.Dataset, terminate_on_warning: bool = False) -> T
return STRINGS_TO_DATATYPES[cdf_data_type], element_size

# Everything named "epoch" should be cast to a CDF_TIME_TT2000
if _is_istp_epoch_variable(var.name.lower()):
if _is_istp_epoch_variable(var.name.lower()): # type: ignore
return STRINGS_TO_DATATYPES["CDF_TIME_TT2000"], element_size

numpy_data_type = var.dtype
@@ -191,14 +191,52 @@ def _dtype_to_cdf_type(var: xr.Dataset, terminate_on_warning: bool = False) -> T
return STRINGS_TO_DATATYPES[cdf_data_type], element_size


def _dtype_to_fillval(var: xr.Dataset, terminate_on_warning: bool = False) -> Union[np.number, np.str_, np.datetime64, object]:
def _dtype_to_fillval(var: xr.DataArray, terminate_on_warning: bool = False) -> Union[np.number, np.str_, np.datetime64]:
datatype, _ = _dtype_to_cdf_type(var, terminate_on_warning=terminate_on_warning)
if datatype in DATATYPE_FILLVALS:
return DATATYPE_FILLVALS[datatype]
return DATATYPE_FILLVALS[datatype] # type: ignore
else:
return np.str_(" ")


def _convert_nans_to_fillval(var_data: xr.Dataset, terminate_on_warning: bool = False) -> xr.Dataset:
    new_data = var_data

    # Data variables and coordinates get identical treatment
    for var_name in list(new_data.data_vars) + list(new_data.coords):
        data_array = new_data[var_name]
        fill_value = _dtype_to_fillval(data_array)
        # Leave datetime64 NaT values alone; they are converted to fill
        # values later, in _datetime_to_cdf_time
        if fill_value.dtype.type != np.datetime64:
            try:
                new_data[var_name] = new_data[var_name].fillna(fill_value)
            except Exception:
                pass
        # Replace NaNs in numeric attributes as well. nan_to_num's second
        # positional argument is "copy", so the replacement value must be
        # passed via the "nan" keyword.
        var_att_dict = {}
        for att in data_array.attrs:
            try:
                var_att_dict[att] = np.nan_to_num(data_array.attrs[att], nan=-1e31)
            except Exception:
                var_att_dict[att] = data_array.attrs[att]
        new_data[var_name].attrs = var_att_dict

    return new_data
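
A minimal sketch of what this helper does to a non-datetime variable (hypothetical variable name; assumes the CDF_DOUBLE fill value of -1e31 from the table above):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"flux": ("record0", np.array([1.0, np.nan, 3.0]))})
    out = _convert_nans_to_fillval(ds)
    # out["flux"].values -> array([ 1.e+00, -1.e+31,  3.e+00])

datetime64 variables are deliberately skipped here; their NaT entries are handled later in _datetime_to_cdf_time.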


def _verify_depend_dimensions(
dataset: xr.Dataset,
dimension_number: int,
@@ -684,13 +722,16 @@ def _unixtime_to_cdf_time(unixtime_data: xr.DataArray, cdf_epoch: bool = False,


def _datetime_to_cdf_time(
datetime_array: xr.DataArray, cdf_epoch: bool = False, cdf_epoch16: bool = False, attribute_name: str = ""
datetime_array: xr.DataArray,
cdf_epoch: bool = False,
cdf_epoch16: bool = False,
attribute_name: str = "",
) -> object:
if attribute_name:
datetime_data = datetime_array.attrs[attribute_name]
else:
datetime_data = datetime_array.data
datetime64_data = np.array(datetime_data, dtype="datetime64[ns]")
datetime64_data = np.atleast_1d(np.array(datetime_data, dtype="datetime64[ns]"))
cdf_epoch = False
cdf_epoch16 = False
if "CDF_DATA_TYPE" in datetime_array.attrs:
Expand All @@ -704,29 +745,52 @@ def _datetime_to_cdf_time(
else:
dtype_for_time_data = np.int64 # type: ignore

cdf_time_data = np.array([], dtype=dtype_for_time_data)
for dd in datetime64_data:
year = dd.astype("datetime64[Y]").astype(int) + 1970
month = dd.astype("datetime64[M]").astype(int) % 12 + 1
years = datetime64_data.astype("datetime64[Y]").astype("int64") + 1970
months = datetime64_data.astype("datetime64[M]").astype("int64") % 12 + 1
days = np.zeros(len(datetime64_data), dtype=np.int64)
for i in range(len(datetime64_data)):
    # NaT entries are mapped to fill values in the conversion loop below,
    # so skip the day-of-month arithmetic for them
    if np.isnat(datetime64_data[i]):
        continue
    day = ((datetime64_data[i] - np.datetime64(f"{years[i]}-{months[i]:02d}", "M")) / 86400000000000).astype("int64") + 1
    days[i] = day.item()
hours = datetime64_data.astype("datetime64[h]").astype("int64") % 24
minutes = datetime64_data.astype("datetime64[m]").astype("int64") % 60
seconds = datetime64_data.astype("datetime64[s]").astype("int64") % 60
milliseconds = datetime64_data.astype("datetime64[ms]").astype("int64") % 1000
microseconds = datetime64_data.astype("datetime64[us]").astype("int64") % 1000
nanoseconds = datetime64_data.astype("datetime64[ns]").astype("int64") % 1000
picoseconds = 0

cdf_time_data = np.zeros(len(datetime64_data), dtype=dtype_for_time_data)
for i in range(len(datetime64_data)):
dd_to_convert = [
dd.astype("datetime64[Y]").astype("int64") + 1970,
dd.astype("datetime64[M]").astype("int64") % 12 + 1,
((dd - np.datetime64(f"{year}-{month:02d}", "M")) / 86400000000000).astype("int64") + 1,
dd.astype("datetime64[h]").astype("int64") % 24,
dd.astype("datetime64[m]").astype("int64") % 60,
dd.astype("datetime64[s]").astype("int64") % 60,
dd.astype("datetime64[ms]").astype("int64") % 1000,
dd.astype("datetime64[us]").astype("int64") % 1000,
dd.astype("datetime64[ns]").astype("int64") % 1000,
0,
years[i],
months[i],
days[i],
hours[i],
minutes[i],
seconds[i],
milliseconds[i],
microseconds[i],
nanoseconds[i],
picoseconds,
]
if cdf_epoch16:
converted_data = cdfepoch.compute(dd_to_convert)
elif cdf_epoch:
converted_data = cdfepoch.compute(dd_to_convert[0:7])
if np.isnat(datetime64_data[i]):
if cdf_epoch16:
cdf_time_data[i] = np.complex128(complex(-1e30, -1e30))
elif cdf_epoch:
cdf_time_data[i] = np.float64(-1e30)
else:
cdf_time_data[i] = np.int64(-9223372036854775808)
else:
converted_data = cdfepoch.compute(dd_to_convert[0:9])
cdf_time_data = np.append(cdf_time_data, converted_data)
if cdf_epoch16:
converted_data = cdfepoch.compute(dd_to_convert)
elif cdf_epoch:
converted_data = cdfepoch.compute(dd_to_convert[0:7])
else:
converted_data = cdfepoch.compute(dd_to_convert[0:9])
cdf_time_data[i] = converted_data

return cdf_time_data
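
The vectorized breakdown above is nothing more than dtype downcasts plus modular arithmetic; for a single timestamp the components fall out like this (a standalone sketch):

    import numpy as np

    dd = np.array(["2024-05-10T12:34:56.789"], dtype="datetime64[ns]")
    dd.astype("datetime64[Y]").astype("int64") + 1970    # [2024]  years
    dd.astype("datetime64[M]").astype("int64") % 12 + 1  # [5]     months
    dd.astype("datetime64[h]").astype("int64") % 24      # [12]    hours
    dd.astype("datetime64[ms]").astype("int64") % 1000   # [789]   milliseconds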


Expand All @@ -739,6 +803,7 @@ def xarray_to_cdf(
auto_fix_depends: bool = True,
record_dimensions: List[str] = ["record0"],
compression: int = 0,
nan_to_fillval: bool = True,
from_unixtime: bool = False,
from_datetime: bool = False,
unixtime_to_cdftt2000: bool = False,
Expand All @@ -757,6 +822,7 @@ def xarray_to_cdf(
auto_fix_depends (bool, optional): Whether or not to automatically add dependencies
record_dimensions (list of str, optional): If the code cannot determine which dimensions should be made into CDF records, you may provide a list of them here
compression (int, optional): The level of compression to gzip the data in the variables. Default is no compression, standard is 6.
nan_to_fillval (bool, optional): Convert all np.nan and np.datetime64('NaT') to the standard CDF FILLVALs.
from_datetime (bool, optional, deprecated): Same as the datetime_to_cdftt2000 option
from_unixtime (bool, optional, deprecated): Same as the unixtime_to_cdftt2000 option
datetime_to_cdftt2000 (bool, optional, deprecated): Whether or not to convert variables named "epoch" or "epoch_X" to CDF_TT2000 from datetime objects
Expand Down Expand Up @@ -922,6 +988,9 @@ def xarray_to_cdf(
# Make a deep copy of the data before continuing
dataset = xarray_dataset.copy()

if nan_to_fillval:
_convert_nans_to_fillval(dataset)

if istp:
# This creates a list of suspected or confirmed label variables
_label_checker(dataset, terminate_on_warning)
