diff --git a/doc/conf.py b/doc/conf.py index 14b28b4e471..f8368ec105e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -34,7 +34,7 @@ subprocess.run(["conda", "list"]) else: print("pip environment:") - subprocess.run(["pip", "list"]) + subprocess.run([sys.executable, "-m", "pip", "list"]) print(f"xarray: {xarray.__version__}, {xarray.__file__}") diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index db612d74859..068edba1e64 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -12,6 +12,36 @@ Weather and climate data .. _Climate and Forecast (CF) conventions: http://cfconventions.org +.. _cf_variables: + +Related Variables +----------------- + +Several CF variable attributes contain lists of other variables +associated with the variable with the attribute. A few of these are +now parsed by XArray, with the attribute value popped to encoding on +read and the variables in that value interpreted as non-dimension +coordinates: + +- ``coordinates`` +- ``bounds`` +- ``grid_mapping`` +- ``climatology`` +- ``geometry`` +- ``node_coordinates`` +- ``node_count`` +- ``part_node_count`` +- ``interior_ring`` +- ``cell_measures`` +- ``formula_terms`` + +This decoding is controlled by the ``decode_coords`` kwarg to +:py:func:`open_dataset` and :py:func:`open_mfdataset`. + +The CF attribute ``ancillary_variables`` was not included in the list +due to the variables listed there being associated primarily with the +variable with the attribute, rather than with the dimensions. + .. _metpy_accessor: CF-compliant coordinate variables diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b06003b630..5f65839cce6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,6 +39,13 @@ Breaking changes always be set such that ``int64`` values can be used. In the past, no units finer than "seconds" were chosen, which would sometimes mean that ``float64`` values were required, which would lead to inaccurate I/O round-trips. 
+- Variables referred to in attributes like ``bounds`` and ``grid_mapping`` + can be set as coordinate variables. These attributes + are moved to :py:attr:`DataArray.encoding` from + :py:attr:`DataArray.attrs`. This behaviour is controlled by the + ``decode_coords`` kwarg to :py:func:`open_dataset` and + :py:func:`open_mfdataset`. The full list of decoded attributes is in + :ref:`weather-climate` (:pull:`2844`, :issue:`3689`) - remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`). By `Aureliana Barghini `_. @@ -340,7 +347,6 @@ New Features - Expose ``use_cftime`` option in :py:func:`~xarray.open_zarr` (:issue:`2886`, :pull:`3229`) By `Samnan Rahee `_ and `Anderson Banihirwe `_. - Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 81314588784..659ae12a2c9 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -354,9 +354,14 @@ def open_dataset( form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ "pseudonetcdf", "zarr"}, optional Engine to use when reading files. If not provided, the default engine @@ -613,9 +618,14 @@ def open_dataarray( form string arrays. 
Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib"}, \ optional Engine to use when reading files. If not provided, the default engine diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index d31fc9ea773..de1b3e1bb29 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -195,10 +195,14 @@ def open_dataset( removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. This keyword may not be supported by all the backends. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. This keyword may not be supported by all the - backends. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. drop_variables: str or iterable, optional A variable or list of variables to exclude from the dataset parsing. 
This may be useful to drop variables with problems or diff --git a/xarray/conventions.py b/xarray/conventions.py index 93e765e5622..bb0cc5cd338 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -11,6 +11,23 @@ from .core.pycompat import is_duck_dask_array from .core.variable import IndexVariable, Variable, as_variable +CF_RELATED_DATA = ( + "bounds", + "grid_mapping", + "climatology", + "geometry", + "node_coordinates", + "node_count", + "part_node_count", + "interior_ring", + "cell_measures", + "formula_terms", +) +CF_RELATED_DATA_NEEDS_PARSING = ( + "cell_measures", + "formula_terms", +) + class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): """Decode arrays on the fly from non-native to native endianness @@ -256,6 +273,9 @@ def encode_cf_variable(var, needs_copy=True, name=None): var = maybe_default_fill_value(var) var = maybe_encode_bools(var) var = ensure_dtype_not_object(var, name=name) + + for attr_name in CF_RELATED_DATA: + pop_to(var.encoding, var.attrs, attr_name) return var @@ -499,7 +519,7 @@ def stackable(dim): use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - if decode_coords: + if decode_coords in [True, "coordinates", "all"]: var_attrs = new_vars[k].attrs if "coordinates" in var_attrs: coord_str = var_attrs["coordinates"] @@ -509,6 +529,38 @@ def stackable(dim): del var_attrs["coordinates"] coord_names.update(var_coord_names) + if decode_coords == "all": + for attr_name in CF_RELATED_DATA: + if attr_name in var_attrs: + attr_val = var_attrs[attr_name] + if attr_name not in CF_RELATED_DATA_NEEDS_PARSING: + var_names = attr_val.split() + else: + roles_and_names = [ + role_or_name + for part in attr_val.split(":") + for role_or_name in part.split() + ] + if len(roles_and_names) % 2 == 1: + warnings.warn( + f"Attribute {attr_name:s} malformed", stacklevel=5 + ) + var_names = roles_and_names[1::2] + if all(var_name in variables for var_name in var_names): + new_vars[k].encoding[attr_name] = attr_val + 
coord_names.update(var_names) + else: + referenced_vars_not_in_variables = [ + proj_name + for proj_name in var_names + if proj_name not in variables + ] + warnings.warn( + f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}", + stacklevel=5, + ) + del var_attrs[attr_name] + if decode_coords and "coordinates" in attributes: attributes = dict(attributes) coord_names.update(attributes.pop("coordinates").split()) @@ -542,9 +594,14 @@ def decode_cf( decode_times : bool, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. - decode_coords : bool, optional - Use the 'coordinates' attribute on variable (or the dataset itself) to - identify coordinates. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. drop_variables : str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. 
This may be useful to drop variables with problems or @@ -664,6 +721,7 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): global_coordinates = non_dim_coord_names.copy() variable_coordinates = defaultdict(set) + not_technically_coordinates = set() for coord_name in non_dim_coord_names: target_dims = variables[coord_name].dims for k, v in variables.items(): @@ -674,6 +732,13 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): ): variable_coordinates[k].add(coord_name) + if any( + attr_name in v.encoding and coord_name in v.encoding.get(attr_name) + for attr_name in CF_RELATED_DATA + ): + not_technically_coordinates.add(coord_name) + global_coordinates.discard(coord_name) + variables = {k: v.copy(deep=False) for k, v in variables.items()} # keep track of variable names written to file under the "coordinates" attributes @@ -691,7 +756,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): # we get support for attrs["coordinates"] for free. 
coords_str = pop_to(encoding, attrs, "coordinates") if not coords_str and variable_coordinates[name]: - attrs["coordinates"] = " ".join(map(str, variable_coordinates[name])) + attrs["coordinates"] = " ".join( + str(coord_name) + for coord_name in variable_coordinates[name] + if coord_name not in not_technically_coordinates + ) if "coordinates" in attrs: written_coords.update(attrs["coordinates"].split()) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3750c0715ae..f2e00ad5622 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -55,6 +55,7 @@ requires_cftime, requires_dask, requires_h5netcdf, + requires_iris, requires_netCDF4, requires_pseudonetcdf, requires_pydap, @@ -857,6 +858,118 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn): assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) + @staticmethod + def _create_cf_dataset(): + original = Dataset( + dict( + variable=( + ("ln_p", "latitude", "longitude"), + np.arange(8, dtype="f4").reshape(2, 2, 2), + {"ancillary_variables": "std_devs det_lim"}, + ), + std_devs=( + ("ln_p", "latitude", "longitude"), + np.arange(0.1, 0.9, 0.1).reshape(2, 2, 2), + {"standard_name": "standard_error"}, + ), + det_lim=( + (), + 0.1, + {"standard_name": "detection_minimum"}, + ), + ), + dict( + latitude=("latitude", [0, 1], {"units": "degrees_north"}), + longitude=("longitude", [0, 1], {"units": "degrees_east"}), + latlon=((), -1, {"grid_mapping_name": "latitude_longitude"}), + latitude_bnds=(("latitude", "bnds2"), [[0, 1], [1, 2]]), + longitude_bnds=(("longitude", "bnds2"), [[0, 1], [1, 2]]), + areas=( + ("latitude", "longitude"), + [[1, 1], [1, 1]], + {"units": "degree^2"}, + ), + ln_p=( + "ln_p", + [1.0, 0.5], + { + "standard_name": "atmosphere_ln_pressure_coordinate", + "computed_standard_name": "air_pressure", + }, + ), + P0=((), 1013.25, {"units": "hPa"}), + ), + ) + 
original["variable"].encoding.update( + {"cell_measures": "area: areas", "grid_mapping": "latlon"}, + ) + original.coords["latitude"].encoding.update( + dict(grid_mapping="latlon", bounds="latitude_bnds") + ) + original.coords["longitude"].encoding.update( + dict(grid_mapping="latlon", bounds="longitude_bnds") + ) + original.coords["ln_p"].encoding.update({"formula_terms": "p0: P0 lev : ln_p"}) + return original + + def test_grid_mapping_and_bounds_are_not_coordinates_in_file(self): + original = self._create_cf_dataset() + with create_tmp_file() as tmp_file: + original.to_netcdf(tmp_file) + with open_dataset(tmp_file, decode_coords=False) as ds: + assert ds.coords["latitude"].attrs["bounds"] == "latitude_bnds" + assert ds.coords["longitude"].attrs["bounds"] == "longitude_bnds" + assert "latlon" not in ds["variable"].attrs["coordinates"] + assert "coordinates" not in ds.attrs + + def test_coordinate_variables_after_dataset_roundtrip(self): + original = self._create_cf_dataset() + with self.roundtrip(original, open_kwargs={"decode_coords": "all"}) as actual: + assert_identical(actual, original) + + with self.roundtrip(original) as actual: + expected = original.reset_coords( + ["latitude_bnds", "longitude_bnds", "areas", "P0", "latlon"] + ) + # equal checks that coords and data_vars are equal which + # should be enough + # identical would require resetting a number of attributes + # skip that. + assert_equal(actual, expected) + + def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip(self): + original = self._create_cf_dataset() + # The DataArray roundtrip should have the same warnings as the + # Dataset, but we already tested for those, so just go for the + # new warnings. It would appear that there is no way to tell + # pytest "This warning and also this warning should both be + # present". + # xarray/tests/test_conventions.py::TestCFEncodedDataStore + # needs the to_dataset. The other backends should be fine + # without it. 
+ with pytest.warns( + UserWarning, + match=( + r"Variable\(s\) referenced in bounds not in variables: " + r"\['l(at|ong)itude_bnds'\]" + ), + ): + with self.roundtrip( + original["variable"].to_dataset(), open_kwargs={"decode_coords": "all"} + ) as actual: + assert_identical(actual, original["variable"].to_dataset()) + + @requires_iris + def test_coordinate_variables_after_iris_roundtrip(self): + original = self._create_cf_dataset() + iris_cube = original["variable"].to_iris() + actual = DataArray.from_iris(iris_cube) + # Bounds will be missing (xfail) + del original.coords["latitude_bnds"], original.coords["longitude_bnds"] + # Ancillary vars will be missing + # Those are data_vars, and will be dropped when grabbing the variable + assert_identical(actual, original["variable"]) + def test_coordinates_encoding(self): def equals_latlon(obj): return obj == "lat lon" or obj == "lon lat"