968 increase minimal xugrid version to speed up xu.open_dataset (#984)
* replace ds[varn] with ds.variables[varn] where possible; indexing the frozen ds.variables mapping returns the underlying Variable directly instead of constructing a DataArray, which is slow for datasets with many variables (illustrated in the sketch below)

* updated minimal xugrid version

* updated whatsnew
veenstrajelmer authored Sep 3, 2024
1 parent 87ff0e1 commit fa8c25b
Showing 5 changed files with 19 additions and 17 deletions.
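
For context, a minimal sketch of the access pattern the commit message describes. This is illustrative only and not taken from the repository; the dataset, variable names and sizes are synthetic.

```python
# Sketch: why ds.variables[varn] is cheaper than ds[varn] for metadata-only checks.
# ds[varn] constructs a new DataArray (including coordinate lookup) on every access,
# while ds.variables[varn] returns the already-existing low-level Variable object.
import numpy as np
import xarray as xr

# synthetic dataset with many variables (names and sizes are made up)
ds = xr.Dataset({f"var{i}": ("x", np.zeros(10)) for i in range(2000)})

dims_slow = [ds[varn].dims for varn in ds.variables]            # builds 2000 DataArrays
dims_fast = [ds.variables[varn].dims for varn in ds.variables]  # plain dict lookups

assert dims_slow == dims_fast  # same result, the fast path skips DataArray construction
```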
7 changes: 4 additions & 3 deletions dfm_tools/interpolate_grid2bnd.py
@@ -478,7 +478,7 @@ def interp_uds_to_plipoints(uds:xu.UgridDataset, gdf:geopandas.GeoDataFrame) ->
     # TODO: revert after fixing https://github.com/Deltares/xugrid/issues/274
     vars_without_facedim = []
     for varn in uds.variables:
-        if facedim not in uds[varn].dims:
+        if facedim not in uds.variables[varn].dims:
             vars_without_facedim.append(varn)
     uds_face = uds.drop(vars_without_facedim)

@@ -488,8 +488,9 @@ def interp_uds_to_plipoints(uds:xu.UgridDataset, gdf:geopandas.GeoDataFrame) ->
     # re-add removed variables again, sometimes important for e.g. depth
     # TODO: remove after fixing https://github.com/Deltares/xugrid/issues/274
     for varn in vars_without_facedim:
-        if edgedim not in uds[varn].dims and nodedim not in uds[varn].dims:
-            ds[varn] = uds[varn]
+        vardims = uds.variables[varn].dims
+        if edgedim not in vardims and nodedim not in vardims:
+            ds[varn] = uds.variables[varn]

     # rename station dimname and varname (is index, are both mesh2d_nFaces to start with)
     ds = ds.rename({facedim:dimn_point}) # rename mesh2d_nFaces to plipoints
6 changes: 3 additions & 3 deletions dfm_tools/xarray_helpers.py
@@ -111,8 +111,8 @@ def preprocess_ERA5(ds):
     # Prevent writing to (incorrectly scaled) int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239)
     # By dropping scaling/offset encoding and converting to float32 (will result in a larger dataset)
     # ERA5 datasets retrieved with the new CDS-beta are zipped float32 instead of scaled int, so this is only needed for backwards compatibility with old files.
-    for var in ds.data_vars:
-        if not set(['dtype','scale_factor','add_offset']).issubset(ds[var].encoding.keys()):
+    for var in ds.data_vars.keys():
+        if not set(['dtype','scale_factor','add_offset']).issubset(ds.variables[var].encoding.keys()):
             continue
         # the _FillValue will still be -32767 (int default), but this is no issue for float32
         ds[var].encoding.pop('scale_factor')
@@ -315,7 +315,7 @@ def Dataset_varswithdim(ds,dimname): #TODO: dit zit ook in xugrid, wordt nu gebr

     varlist_keep = []
     for varname in ds.variables.keys():
-        if dimname in ds[varname].dims:
+        if dimname in ds.variables[varname].dims:
             varlist_keep.append(varname)
     ds = ds[varlist_keep]

18 changes: 9 additions & 9 deletions dfm_tools/xugrid_helpers.py
@@ -71,7 +71,7 @@ def remove_ghostcells(uds, fname): #TODO: remove ghostcells from output or align

     #drop ghostcells
     part_domainno_fromfname = int(part_domainno_fromfname)
-    da_domainno = uds[varn_domain]
+    da_domainno = uds.variables[varn_domain]
     idx = np.flatnonzero(da_domainno == part_domainno_fromfname)
     uds = uds.isel({uds.grid.face_dimension:idx})
     return uds
@@ -118,11 +118,11 @@ def decode_default_fillvals(ds):
     # TODO: this function can be removed when xarray does it automatically: https://github.com/Deltares/dfm_tools/issues/490

     nfillattrs_added = 0
-    for varn in ds.variables:
+    for varn in ds.variables.keys():
         # TODO: possible to get always_mask boolean with `netCDF4.Dataset(file_nc).variables[varn].always_mask`, but this seems to be always True for FM mapfiles
-        if '_FillValue' in ds[varn].encoding:
+        if '_FillValue' in ds.variables[varn].encoding:
             continue
-        dtype_str = ds[varn].dtype.str[1:]
+        dtype_str = ds.variables[varn].dtype.str[1:]
         if dtype_str not in default_fillvals.keys():
             continue
         varn_fillval = default_fillvals[dtype_str]
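
For context on the hunk above (not part of the commit): `default_fillvals` is the `netCDF4.default_fillvals` dict, keyed by dtype strings such as 'f8', and `dtype.str[1:]` strips the leading byte-order character so the dtype can be used as that key. A minimal sketch:

```python
# Illustration only: how the dtype-string lookup in decode_default_fillvals works.
import numpy as np
import xarray as xr
from netCDF4 import default_fillvals

da = xr.DataArray(np.zeros(3, dtype="float64"))
dtype_str = da.dtype.str[1:]                   # '<f8' -> 'f8' (drop the byte-order character)
print(dtype_str, default_fillvals[dtype_str])  # f8 9.969209968386869e+36 (netCDF default for float64)
```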
@@ -144,9 +144,9 @@ def remove_nan_fillvalue_attrs(ds : (xr.Dataset, xu.UgridDataset)):
         ds = ds.obj

     count = 0
-    for varn in ds.variables:
-        if '_FillValue' in ds[varn].encoding:
-            if np.isnan(ds[varn].encoding['_FillValue']):
+    for varn in ds.variables.keys():
+        if '_FillValue' in ds.variables[varn].encoding:
+            if np.isnan(ds.variables[varn].encoding['_FillValue']):
                 ds[varn].encoding.pop('_FillValue')
                 count += 1
     if count > 0:
@@ -299,9 +299,9 @@ def open_dataset_curvilinear(file_nc,

     print('>> getting vertices from ds: ',end='')
     dtstart = dt.datetime.now()
-    vertices_longitude = ds[varn_vert_lon].to_numpy()
+    vertices_longitude = ds.variables[varn_vert_lon].to_numpy()
     vertices_longitude = vertices_longitude.reshape(-1,vertices_longitude.shape[-1])
-    vertices_latitude = ds[varn_vert_lat].to_numpy()
+    vertices_latitude = ds.variables[varn_vert_lat].to_numpy()
     vertices_latitude = vertices_latitude.reshape(-1,vertices_latitude.shape[-1])
     print(f'{(dt.datetime.now()-dtstart).total_seconds():.2f} sec')

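As an aside on the reshape pattern in the hunk above (illustrative only; the array shapes below are hypothetical): `reshape(-1, arr.shape[-1])` collapses all leading dimensions into one while keeping the last (vertex/corner) dimension, so curvilinear bounds of shape (ny, nx, 4) become a flat (ny*nx, 4) list of cell vertices.

```python
# Sketch of the reshape used for the vertex arrays above (shapes are made up).
import numpy as np

vertices = np.arange(2 * 3 * 4).reshape(2, 3, 4)   # e.g. (ny, nx, 4) corner coordinates
flat = vertices.reshape(-1, vertices.shape[-1])    # -> (6, 4): one row of 4 corners per cell
print(flat.shape)  # (6, 4)
```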
1 change: 1 addition & 0 deletions docs/whats-new.md
@@ -8,6 +8,7 @@
 - update to cdsapi 0.7.2 and properly catching error for dummy dataset in [#972](https://github.com/Deltares/dfm_tools/pull/972)
 - deprecated `dfmt.open_dataset_extra()` (partly replaced by `dfmt.open_prepare_dataset()`) in [#974](https://github.com/Deltares/dfm_tools/pull/974)
 - improved nan-conversion in `dfmt.forcinglike_to_Dataset()` in [#982](https://github.com/Deltares/dfm_tools/pull/982)
+- improved performance of `dfmt.open_partitioned_dataset()` for datasets with many variables in [#984](https://github.com/Deltares/dfm_tools/pull/984)


 ## 0.25.0 (2024-08-16)
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -36,8 +36,8 @@ dependencies = [
     "netcdf4>=1.5.4",
     #bottleneck<1.3.3 pip install fails in py39
     "bottleneck>=1.3.3",
-    #xugrid<0.11.2 sometimes fails on merged chunks that are inconsistent
-    "xugrid>=0.11.2",
+    #xugrid<0.12.0 has sub-optimal performance because of accessing dataarrays of variables
+    "xugrid>=0.12.0",
     #cdsapi<0.7.2 has different error upon dummy dataset
     "cdsapi>=0.7.2",
     #pydap<3.4.0 is from May 2017 and does not support newer python versions
