Adding in support for datetime64('NaT') (#258)
* Speed up datetime to cdf time
* Added support to convert nan to fillvals in xarray_to_cdf
bryan-harter authored May 10, 2024
1 parent d94252c commit 6ec09a7
Showing 3 changed files with 161 additions and 54 deletions.
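
For context, a minimal round-trip sketch of the headline change ("demo.cdf" and the variable names are hypothetical; nan_to_fillval is the option this commit adds, the other keyword arguments are from the pre-existing cdflib API as I understand it):

    import numpy as np
    import xarray as xr
    from cdflib.xarray import cdf_to_xarray, xarray_to_cdf

    # A variable named "epoch" is cast to CDF_TIME_TT2000; with nan_to_fillval=True
    # (the new default) the NaT entry is written out as the standard CDF fill value.
    ds = xr.Dataset({"epoch": ("record0", np.array(["2024-05-10", "NaT"], dtype="datetime64[ns]"))})
    xarray_to_cdf(ds, "demo.cdf", istp=False, nan_to_fillval=True)

    # Reading back as datetimes restores the NaT.
    ds2 = cdf_to_xarray("demo.cdf", to_datetime=True, fillval_to_nan=True)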
72 changes: 52 additions & 20 deletions cdflib/epochs.py
@@ -154,6 +154,7 @@ def breakdown(epochs: epoch_types) -> np.ndarray:

@staticmethod
def _compose_date(
nat_positions: npt.NDArray,
years: npt.NDArray,
months: npt.NDArray,
days: npt.NDArray,
@@ -174,18 +175,29 @@ def _compose_date(
vals = (v for v in (years, months, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) if v is not None)

arrays: List[npt.NDArray[np.datetime64]] = [np.array(v, dtype=t) for t, v in zip(types, vals)]
return np.array(sum(arrays))
total_datetime = np.array(sum(arrays))
total_datetime = np.where(nat_positions, np.datetime64("NaT"), total_datetime)
return total_datetime

@classmethod
def to_datetime(cls, cdf_time: epoch_types) -> npt.NDArray[np.datetime64]:
"""
Converts CDF epoch argument to numpy.datetime64. This method
converts a scalar, or array-like. Precision is only kept to the
nearest nanosecond.
Converts CDF epoch argument to numpy.datetime64.
Parameters:
    cdf_time: NumPy scalar or array to convert. np.int64 values are treated as CDF_TIME_TT2000, np.complex128 as CDF_EPOCH16, and floats as CDF_EPOCH.
Notes:
    Because of datetime64 limitations, CDF_EPOCH16 precision is only kept to the nearest nanosecond.
"""
times = cls.breakdown(cdf_time)
times = np.atleast_2d(times)
return cls._compose_date(*times.T[:9]).astype("datetime64[ns]")

fillval_locations = np.all((times[:, 0:7] == [9999, 12, 31, 23, 59, 59, 999]), axis=1)
padval_locations = np.all((times[:, 0:7] == [0, 1, 1, 0, 0, 0, 0]), axis=1)
nan_locations = np.logical_or(fillval_locations, padval_locations)
return cls._compose_date(nan_locations, *times.T[:9]).astype("datetime64[ns]")
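
A quick sketch of the effect (assuming this commit's behavior): the TT2000 fill value now surfaces as NaT instead of the 9999-12-31 fill date.

    import numpy as np
    from cdflib.epochs import CDFepoch

    ok = CDFepoch.compute_tt2000([2024, 5, 10, 0, 0, 0, 0, 0, 0])
    out = CDFepoch.to_datetime(np.array([ok, -9223372036854775808], dtype=np.int64))
    # out[0] -> numpy.datetime64('2024-05-10T00:00:00.000000000')
    # out[1] -> numpy.datetime64('NaT')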

@staticmethod
def unixtime(cdf_time: npt.ArrayLike) -> Union[float, npt.NDArray]:
@@ -455,18 +467,26 @@ def breakdown_tt2000(tt2000: cdf_tt2000_type) -> np.ndarray:
"""
Breaks down the epoch(s) into UTC components.
For CDF_EPOCH:
they are 7 date/time components: year, month, day,
hour, minute, second, and millisecond
For CDF_EPOCH16:
they are 10 date/time components: year, month, day,
hour, minute, second, and millisecond, microsecond,
nanosecond, and picosecond.
For TT2000:
they are 9 date/time components: year, month, day,
hour, minute, second, millisecond, microsecond,
nanosecond.
Calculate date and time from cdf_time_tt2000 integers.
Parameters
----------
tt2000 : array-like
    Single, list, tuple, or np.array of tt2000 values
Returns
-------
components : ndarray
List or array of date and time values. The last axis contains
(in order): year, month, day, hour, minute, second, millisecond,
microsecond, and nanosecond
Notes
-----
If a bad epoch is supplied, a fill date of 9999-12-31 23:59:59 and 999 ms, 999 us, and
999 ns is returned.
"""

new_tt2000 = np.atleast_1d(tt2000).astype(np.longlong)
count = len(new_tt2000)
toutcs = np.zeros((9, count), dtype=int)
Expand Down Expand Up @@ -576,7 +596,14 @@ def breakdown_tt2000(tt2000: cdf_tt2000_type) -> np.ndarray:
toutcs[7, :] = ma1
toutcs[8, :] = na1

return np.squeeze(toutcs.T)
# Check standard fill and pad values
cdf_epoch_time_tt2000 = np.atleast_2d(toutcs.T)
fillval_locations = np.all(cdf_epoch_time_tt2000 == [1707, 9, 22, 12, 12, 10, 961, 224, 192], axis=1)
cdf_epoch_time_tt2000[fillval_locations] = [9999, 12, 31, 23, 59, 59, 999, 999, 999]
padval_locations = np.all(cdf_epoch_time_tt2000 == [1707, 9, 22, 12, 12, 10, 961, 224, 193], axis=1)
cdf_epoch_time_tt2000[padval_locations] = [0, 1, 1, 0, 0, 0, 0, 0, 0]

return np.squeeze(cdf_epoch_time_tt2000)
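
With the remapping above, the raw 1707-09-22 breakdowns of the TT2000 fill and pad values come back as the standard fill and pad dates (a sketch of the assumed behavior, reusing CDFepoch from the snippet above):

    CDFepoch.breakdown_tt2000(-9223372036854775808)  # fill -> [9999, 12, 31, 23, 59, 59, 999, 999, 999]
    CDFepoch.breakdown_tt2000(-9223372036854775807)  # pad  -> [0, 1, 1, 0, 0, 0, 0, 0, 0]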

@staticmethod
def compute_tt2000(datetimes: npt.ArrayLike) -> Union[int, npt.NDArray[np.int64]]:
@@ -1187,7 +1214,10 @@ def breakdown_epoch16(epochs: cdf_epoch16_type) -> npt.NDArray:
components = np.full(shape=cshape, fill_value=[9999, 12, 31, 23, 59, 59, 999, 999, 999, 999])
for i, epoch16 in enumerate(new_epochs):
# Ignore fill values
if (epoch16.real != -1.0e31) or (epoch16.imag != -1.0e31):
if ((epoch16.real != -1.0e31) or (epoch16.imag != -1.0e31)) and not np.isnan(epoch16):
    if (epoch16.real == -1.0e30) or (epoch16.imag == -1.0e30):
components[i] = [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
continue
esec = -epoch16.real if epoch16.real < 0.0 else epoch16.real
efra = -epoch16.imag if epoch16.imag < 0.0 else epoch16.imag

@@ -1518,8 +1548,8 @@ def breakdown_epoch(epochs: cdf_epoch_type) -> np.ndarray:
cshape.append(7)
components = np.full(shape=cshape, fill_value=[9999, 12, 31, 23, 59, 59, 999])
for i, epoch in enumerate(new_epochs):
# Ignore fill values
if epoch != -1.0e31:
# Ignore fill values and NaNs
if (epoch != -1.0e31) and not np.isnan(epoch):
esec = -epoch / 1000.0 if epoch < 0.0 else epoch / 1000.0
date_time = CDFepoch._calc_from_julian(esec, 0.0)

@@ -1530,6 +1560,8 @@ def breakdown_epoch(epochs: cdf_epoch_type) -> np.ndarray:
components = date_time[..., :7]
else:
components[i] = date_time[..., :7]
elif epoch == 0:
components[i] = [0, 1, 1, 0, 0, 0, 0]

return np.squeeze(components)

10 changes: 8 additions & 2 deletions cdflib/xarray/cdf_to_xarray.py
@@ -303,10 +303,16 @@ def _convert_fillvals_to_nan(var_data: npt.NDArray, var_atts: Dict[str, Any], va
):
if new_data.size > 1:
if new_data[new_data == var_atts["FILLVAL"]].size != 0:
new_data[new_data == var_atts["FILLVAL"]] = np.nan
if new_data.dtype.type == np.datetime64:
new_data[new_data == var_atts["FILLVAL"]] = np.datetime64("nat")
else:
new_data[new_data == var_atts["FILLVAL"]] = np.nan
else:
if new_data == var_atts["FILLVAL"]:
new_data = np.array(np.nan)
if new_data.dtype.type == np.datetime64:
new_data[new_data == var_atts["FILLVAL"]] = np.array(np.datetime64("nat"))
else:
new_data[new_data == var_atts["FILLVAL"]] = np.array(np.nan)
return new_data
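
The datetime64 branch exists because a float NaN cannot be assigned into a datetime64 array; the masked assignment has to use NaT instead. A self-contained sketch of the mechanics (note datetime64[ms], since the 9999-12-31 fill date overflows datetime64[ns]):

    import numpy as np

    data = np.array(["2024-05-10", "9999-12-31T23:59:59.999"], dtype="datetime64[ms]")
    fillval = np.datetime64("9999-12-31T23:59:59.999", "ms")
    data[data == fillval] = np.datetime64("nat")
    # data -> ['2024-05-10T00:00:00.000', 'NaT']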


133 changes: 101 additions & 32 deletions cdflib/xarray/xarray_to_cdf.py
@@ -60,14 +60,14 @@
11: np.uint8(255),
12: np.uint16(65535),
14: np.uint32(4294967295),
21: np.float32(-1e30),
22: np.float64(-1e30),
31: np.float64(-1e30),
32: np.complex128(complex(-1e30, -1e30)),
21: np.float32(-1e31),
22: np.float64(-1e31),
31: np.float64(-1e31),
32: np.complex128(complex(-1e31, -1e31)),
33: np.datetime64(-9223372036854775808, "ns"),
41: np.int8(-128),
44: np.float32(-1e30),
45: np.float64(-1e30),
44: np.float32(-1e31),
45: np.float64(-1e31),
51: np.str_(" "),
52: np.str_(" "),
}
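
A side note on the table above: the CDF_TIME_TT2000 entry (type 33) is the int64 minimum, which numpy already uses as its NaT sentinel, so for TT2000 the fill value and NaT are literally the same bit pattern:

    import numpy as np

    np.datetime64(-9223372036854775808, "ns")  # numpy.datetime64('NaT')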
@@ -126,7 +126,7 @@ def _is_istp_epoch_variable(var_name: str) -> Union[re.Match, None]:
return epoch_regex_1.match(var_name.lower()) or epoch_regex_2.match(var_name.lower())


def _dtype_to_cdf_type(var: xr.Dataset, terminate_on_warning: bool = False) -> Tuple[int, int]:
def _dtype_to_cdf_type(var: xr.DataArray, terminate_on_warning: bool = False) -> Tuple[int, int]:
# Determines which CDF types to cast the xarray.Dataset to

# Set some defaults
@@ -140,7 +140,7 @@ def _dtype_to_cdf_type(var: xr.Dataset, terminate_on_warning: bool = False) -> T
return STRINGS_TO_DATATYPES[cdf_data_type], element_size

# Everything named "epoch" should be cast to a CDF_TIME_TT2000
if _is_istp_epoch_variable(var.name.lower()):
if _is_istp_epoch_variable(var.name.lower()): # type: ignore
return STRINGS_TO_DATATYPES["CDF_TIME_TT2000"], element_size

numpy_data_type = var.dtype
@@ -191,14 +191,52 @@ def _dtype_to_cdf_type(var: xr.Dataset, terminate_on_warning: bool = False) -> T
return STRINGS_TO_DATATYPES[cdf_data_type], element_size


def _dtype_to_fillval(var: xr.Dataset, terminate_on_warning: bool = False) -> Union[np.number, np.str_, np.datetime64, object]:
def _dtype_to_fillval(var: xr.DataArray, terminate_on_warning: bool = False) -> Union[np.number, np.str_, np.datetime64]:
datatype, _ = _dtype_to_cdf_type(var, terminate_on_warning=terminate_on_warning)
if datatype in DATATYPE_FILLVALS:
return DATATYPE_FILLVALS[datatype]
return DATATYPE_FILLVALS[datatype] # type: ignore
else:
return np.str_(" ")


def _convert_nans_to_fillval(var_data: xr.Dataset, terminate_on_warning: bool = False) -> xr.Dataset:
    new_data = var_data

    # Data variables and coordinates get identical treatment
    for var_name in list(new_data.data_vars) + list(new_data.coords):
        data_array = new_data[var_name]
        fill_value = _dtype_to_fillval(data_array)
        # Leave datetime64 NaT values alone; they are converted to fill
        # values later, in _datetime_to_cdf_time
        if fill_value.dtype.type != np.datetime64:
            try:
                new_data[var_name] = new_data[var_name].fillna(fill_value)
            except Exception:
                pass
        # Replace NaNs in numeric attributes as well. nan_to_num's second
        # positional argument is "copy", so the replacement value must be
        # passed via the "nan" keyword.
        var_att_dict = {}
        for att in data_array.attrs:
            try:
                var_att_dict[att] = np.nan_to_num(data_array.attrs[att], nan=-1e31)
            except Exception:
                var_att_dict[att] = data_array.attrs[att]
        new_data[var_name].attrs = var_att_dict

    return new_data
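
A minimal sketch of what this helper does to a non-datetime variable (hypothetical variable name; assumes the CDF_DOUBLE fill value of -1e31 from the table above):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"flux": ("record0", np.array([1.0, np.nan, 3.0]))})
    out = _convert_nans_to_fillval(ds)
    # out["flux"].values -> array([ 1.e+00, -1.e+31,  3.e+00])

datetime64 variables are deliberately skipped here; their NaT entries are handled later in _datetime_to_cdf_time.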


def _verify_depend_dimensions(
dataset: xr.Dataset,
dimension_number: int,
@@ -684,13 +722,16 @@ def _unixtime_to_cdf_time(unixtime_data: xr.DataArray, cdf_epoch: bool = False,


def _datetime_to_cdf_time(
datetime_array: xr.DataArray, cdf_epoch: bool = False, cdf_epoch16: bool = False, attribute_name: str = ""
datetime_array: xr.DataArray,
cdf_epoch: bool = False,
cdf_epoch16: bool = False,
attribute_name: str = "",
) -> object:
if attribute_name:
datetime_data = datetime_array.attrs[attribute_name]
else:
datetime_data = datetime_array.data
datetime64_data = np.array(datetime_data, dtype="datetime64[ns]")
datetime64_data = np.atleast_1d(np.array(datetime_data, dtype="datetime64[ns]"))
cdf_epoch = False
cdf_epoch16 = False
if "CDF_DATA_TYPE" in datetime_array.attrs:
Expand All @@ -704,29 +745,52 @@ def _datetime_to_cdf_time(
else:
dtype_for_time_data = np.int64 # type: ignore

cdf_time_data = np.array([], dtype=dtype_for_time_data)
for dd in datetime64_data:
year = dd.astype("datetime64[Y]").astype(int) + 1970
month = dd.astype("datetime64[M]").astype(int) % 12 + 1
years = datetime64_data.astype("datetime64[Y]").astype("int64") + 1970
months = datetime64_data.astype("datetime64[M]").astype("int64") % 12 + 1
days = np.zeros(len(datetime64_data), dtype=np.int64)
for i in range(len(datetime64_data)):
    # NaT entries are mapped to fill values in the conversion loop below,
    # so skip the day-of-month arithmetic for them
    if np.isnat(datetime64_data[i]):
        continue
    day = ((datetime64_data[i] - np.datetime64(f"{years[i]}-{months[i]:02d}", "M")) / 86400000000000).astype("int64") + 1
    days[i] = day.item()
hours = datetime64_data.astype("datetime64[h]").astype("int64") % 24
minutes = datetime64_data.astype("datetime64[m]").astype("int64") % 60
seconds = datetime64_data.astype("datetime64[s]").astype("int64") % 60
milliseconds = datetime64_data.astype("datetime64[ms]").astype("int64") % 1000
microseconds = datetime64_data.astype("datetime64[us]").astype("int64") % 1000
nanoseconds = datetime64_data.astype("datetime64[ns]").astype("int64") % 1000
picoseconds = 0

cdf_time_data = np.zeros(len(datetime64_data), dtype=dtype_for_time_data)
for i in range(len(datetime64_data)):
dd_to_convert = [
dd.astype("datetime64[Y]").astype("int64") + 1970,
dd.astype("datetime64[M]").astype("int64") % 12 + 1,
((dd - np.datetime64(f"{year}-{month:02d}", "M")) / 86400000000000).astype("int64") + 1,
dd.astype("datetime64[h]").astype("int64") % 24,
dd.astype("datetime64[m]").astype("int64") % 60,
dd.astype("datetime64[s]").astype("int64") % 60,
dd.astype("datetime64[ms]").astype("int64") % 1000,
dd.astype("datetime64[us]").astype("int64") % 1000,
dd.astype("datetime64[ns]").astype("int64") % 1000,
0,
years[i],
months[i],
days[i],
hours[i],
minutes[i],
seconds[i],
milliseconds[i],
microseconds[i],
nanoseconds[i],
picoseconds,
]
if cdf_epoch16:
converted_data = cdfepoch.compute(dd_to_convert)
elif cdf_epoch:
converted_data = cdfepoch.compute(dd_to_convert[0:7])
if np.isnat(datetime64_data[i]):
if cdf_epoch16:
cdf_time_data[i] = np.complex128(complex(-1e30, -1e30))
elif cdf_epoch:
cdf_time_data[i] = np.float64(-1e30)
else:
cdf_time_data[i] = np.int64(-9223372036854775808)
else:
converted_data = cdfepoch.compute(dd_to_convert[0:9])
cdf_time_data = np.append(cdf_time_data, converted_data)
if cdf_epoch16:
converted_data = cdfepoch.compute(dd_to_convert)
elif cdf_epoch:
converted_data = cdfepoch.compute(dd_to_convert[0:7])
else:
converted_data = cdfepoch.compute(dd_to_convert[0:9])
cdf_time_data[i] = converted_data

return cdf_time_data
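
The vectorized breakdown above is nothing more than dtype downcasts plus modular arithmetic; for a single timestamp the components fall out like this (a standalone sketch):

    import numpy as np

    dd = np.array(["2024-05-10T12:34:56.789"], dtype="datetime64[ns]")
    dd.astype("datetime64[Y]").astype("int64") + 1970    # [2024]  years
    dd.astype("datetime64[M]").astype("int64") % 12 + 1  # [5]     months
    dd.astype("datetime64[h]").astype("int64") % 24      # [12]    hours
    dd.astype("datetime64[ms]").astype("int64") % 1000   # [789]   milliseconds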


Expand All @@ -739,6 +803,7 @@ def xarray_to_cdf(
auto_fix_depends: bool = True,
record_dimensions: List[str] = ["record0"],
compression: int = 0,
nan_to_fillval: bool = True,
from_unixtime: bool = False,
from_datetime: bool = False,
unixtime_to_cdftt2000: bool = False,
Expand All @@ -757,6 +822,7 @@ def xarray_to_cdf(
auto_fix_depends (bool, optional): Whether or not to automatically add dependencies
record_dimensions (list of str, optional): If the code cannot determine which dimensions should be made into CDF records, you may provide a list of them here
compression (int, optional): The level of compression to gzip the data in the variables. Default is no compression, standard is 6.
nan_to_fillval (bool, optional): Convert all np.nan and np.datetime64('NaT') to the standard CDF FILLVALs.
from_datetime (bool, optional, deprecated): Same as the datetime_to_cdftt2000 option
from_unixtime (bool, optional, deprecated): Same as the unixtime_to_cdftt2000 option
datetime_to_cdftt2000 (bool, optional, deprecated): Whether or not to convert variables named "epoch" or "epoch_X" to CDF_TT2000 from datetime objects
Expand Down Expand Up @@ -922,6 +988,9 @@ def xarray_to_cdf(
# Make a deep copy of the data before continuing
dataset = xarray_dataset.copy()

if nan_to_fillval:
_convert_nans_to_fillval(dataset)

if istp:
# This creates a list of suspected or confirmed label variables
_label_checker(dataset, terminate_on_warning)
