Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor CFWriter utility into CF directory #2524

Merged
merged 42 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
5ac92a7
Refactor area-related functions
ghiggi Jun 27, 2023
4b19566
Refactor attrs-related functions
ghiggi Jun 27, 2023
6c27d86
Fix datetime import
ghiggi Jun 27, 2023
59f73ea
Replace deprecated CFWriter.da2cf with make_cf_dataarray
ghiggi Jun 27, 2023
396700f
Refactor Dataset encodings
ghiggi Jun 27, 2023
f664c60
Refactor CF-compliant DataArray creation
ghiggi Jun 27, 2023
3953319
Refactor CF-compliant Datasets creation
ghiggi Jun 27, 2023
064558d
Fix changed imports
ghiggi Jun 27, 2023
5762950
Fix all writers tests
ghiggi Jun 27, 2023
dabaa44
Add logging on top-file
ghiggi Jun 27, 2023
ea12d9b
Update satpy/tests/writer_tests/cf_tests/test_dataaarray.py
ghiggi Jun 28, 2023
109a80e
Merge branch 'main' into feature-cf-dataset
ghiggi Oct 10, 2023
eb0d6e7
Merge branch 'feature-cf-dataset' of github.com:ghiggi/satpy into fea…
ghiggi Oct 10, 2023
fe43fc8
Dummy changes
ghiggi Oct 10, 2023
c37fcb7
Set private functions
ghiggi Oct 10, 2023
22a8d09
Reorg files
ghiggi Oct 10, 2023
e3df20e
Unest cf directories
ghiggi Oct 10, 2023
bf33719
Fix imports error
ghiggi Oct 10, 2023
0dae774
Rename functions and refactor
ghiggi Oct 10, 2023
57afb82
Fix cf_writer module path
ghiggi Oct 10, 2023
1575849
Solve conflicts and merge main
ghiggi Oct 11, 2023
9ef2af9
Avoid modification to CHANGELOG
ghiggi Nov 14, 2023
e96f0de
Rename _encode* functions
ghiggi Nov 14, 2023
48df162
Update xarray version
ghiggi Nov 14, 2023
9372e32
Merge main
ghiggi Nov 14, 2023
14b1f06
Set default epoch argument to None
ghiggi Nov 14, 2023
ec5f8fc
Reduce future risk of circular imports
ghiggi Nov 14, 2023
9a8810f
Fix tests
ghiggi Nov 14, 2023
09325c2
Fix docstrings
ghiggi Nov 16, 2023
c42d1ed
Move EPOCH to satpy.cf.coords
ghiggi Nov 16, 2023
ba09f18
Simplify functions for CodeScene happiness
ghiggi Nov 16, 2023
83e815d
Cleanup CF attrs functions
djhoese Nov 16, 2023
cadcfef
Remove commented out tests
djhoese Nov 16, 2023
ceabff9
Refactor attribute handling
djhoese Nov 16, 2023
bf681f2
Rename dataarray to data_arr and add type annotations
djhoese Nov 16, 2023
055cbef
Reduce code complexity
djhoese Nov 17, 2023
2a65eea
Refactor CF area tests
djhoese Nov 17, 2023
63e8407
Refactor CF area tests a little more
djhoese Nov 17, 2023
5042202
Fix sphinx docstring error in make_cf_data_array
djhoese Nov 17, 2023
cc366c0
Add py.typed file so users get type information in their IDE
djhoese Nov 17, 2023
44c3c29
Merge branch 'main' into feature-cf-dataset
mraspaud Nov 28, 2023
b4e8fa5
Fix style
mraspaud Nov 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ include satpy/version.py
include pyproject.toml
include setup.py
include setup.cfg
include satpy/py.typed
global-exclude *.py[cod]
63 changes: 25 additions & 38 deletions satpy/_scene_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,48 +52,35 @@
If Scene DataArrays are on different areas, currently it fails, although
in future we might return a DataTree object, grouped by area.

Parameters
----------
scn: satpy.Scene
Satpy Scene.
datasets (iterable):
List of Satpy Scene datasets to include in the output xr.Dataset.
Elements can be string name, a wavelength as a number, a DataID,
or DataQuery object.
If None (the default), it include all loaded Scene datasets.
header_attrs:
Global attributes of the output xr.Dataset.
epoch (str):
Reference time for encoding the time coordinates (if available).
Example format: "seconds since 1970-01-01 00:00:00".
If None, the default reference time is retrieved using "from satpy.cf_writer import EPOCH"
flatten_attrs (bool):
If True, flatten dict-type attributes.
exclude_attrs (list):
List of xr.DataArray attribute names to be excluded.
include_lonlats (bool):
If True, it includes 'latitude' and 'longitude' coordinates.
If the 'area' attribute is a SwathDefinition, it always includes
latitude and longitude coordinates.
pretty (bool):
Don't modify coordinate names, if possible. Makes the file prettier,
but possibly less consistent.
include_orig_name (bool).
Include the original dataset name as a variable attribute in the xr.Dataset.
numeric_name_prefix (str):
Prefix to add the each variable with name starting with a digit.
Use '' or None to leave this out.
Args:
scn (satpy.Scene): Satpy Scene.
datasets (iterable, optional): List of Satpy Scene datasets to include in
the output xr.Dataset. Elements can be string name, a wavelength as a
number, a DataID, or DataQuery object. If None (the default), it
includes all loaded Scene datasets.
header_attrs: Global attributes of the output xr.Dataset.
epoch (str, optional): Reference time for encoding the time coordinates
(if available). Format example: "seconds since 1970-01-01 00:00:00".
If None, the default reference time is retrieved using
"from satpy.cf_writer import EPOCH".
flatten_attrs (bool, optional): If True, flatten dict-type attributes.
exclude_attrs (list, optional): List of xr.DataArray attribute names to
be excluded.
include_lonlats (bool, optional): If True, includes 'latitude' and
'longitude' coordinates. If the 'area' attribute is a SwathDefinition,
it always includes latitude and longitude coordinates.
pretty (bool, optional): Don't modify coordinate names, if possible. Makes
the file prettier, but possibly less consistent.
include_orig_name (bool, optional): Include the original dataset name as a
variable attribute in the xr.Dataset.
numeric_name_prefix (str, optional): Prefix to add to each variable with
name starting with a digit. Use '' or None to leave this out.

Returns:
-------
ds, xr.Dataset
A CF-compliant xr.Dataset
xr.Dataset: A CF-compliant xr.Dataset

"""
from satpy.writers.cf_writer import EPOCH, collect_cf_datasets

if epoch is None:
epoch = EPOCH
from satpy.cf.datasets import collect_cf_datasets

Check notice on line 83 in satpy/_scene_converters.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

✅ No longer an issue: Large Method

to_xarray is no longer above the threshold for lines of code

# Get list of DataArrays
if datasets is None:
Expand Down
2 changes: 0 additions & 2 deletions satpy/writers/cf/__init__.py → satpy/cf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Code for generation of CF-compliant datasets."""
78 changes: 78 additions & 0 deletions satpy/cf/area.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) 2017-2023 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""CF processing of pyresample area information."""
import logging

import xarray as xr
from packaging.version import Version
from pyresample.geometry import AreaDefinition, SwathDefinition

logger = logging.getLogger(__name__)


def _add_lonlat_coords(data_arr: xr.DataArray) -> xr.DataArray:
    """Return a copy of *data_arr* with 'longitude' and 'latitude' coordinates added.

    The coordinates are computed from the pyresample area stored in
    ``data_arr.attrs["area"]`` and share the DataArray's (y, x) layout.
    """
    data_arr = data_arr.copy()
    area = data_arr.attrs["area"]
    # Select one element along every non-spatial dimension so that the chunk
    # layout passed to get_lonlats matches a plain (y, x) array.
    non_spatial = {dim: 0 for dim in data_arr.dims if dim not in ("x", "y")}
    chunks = getattr(data_arr.isel(**non_spatial), "chunks", None)
    lons, lats = area.get_lonlats(chunks=chunks)
    coord_specs = (
        ("longitude", lons, "degrees_east"),
        ("latitude", lats, "degrees_north"),
    )
    for coord_name, values, units in coord_specs:
        data_arr[coord_name] = xr.DataArray(
            values,
            dims=["y", "x"],
            attrs={"name": coord_name,
                   "standard_name": coord_name,
                   "units": units},
            name=coord_name,
        )
    return data_arr


def _create_grid_mapping(area):
    """Create the grid mapping instance for `area`.

    Returns:
        tuple: The grid mapping variable name (the area id) and a dict of
        CF grid-mapping attributes produced by pyproj.
    """
    import pyproj

    if Version(pyproj.__version__) < Version("2.4.1"):
        # technically 2.2, but important bug fixes in 2.4.1
        raise ImportError("'cf' writer requires pyproj 2.4.1 or greater")
    # Let pyproj do the heavy lifting (pyproj 2.0+ required).
    cf_attrs = area.crs.to_cf()
    return area.area_id, cf_attrs


def _add_grid_mapping(data_arr: xr.DataArray) -> tuple[xr.DataArray, xr.DataArray]:
    """Convert the area of *data_arr* to a CF grid mapping.

    Returns a copy of the DataArray with a ``grid_mapping`` attribute set,
    together with the scalar grid-mapping variable it refers to.
    """
    data_arr = data_arr.copy()
    grid_mapping_name, grid_mapping_attrs = _create_grid_mapping(data_arr.attrs["area"])
    data_arr.attrs["grid_mapping"] = grid_mapping_name
    grid_mapping_var = xr.DataArray(0, attrs=grid_mapping_attrs, name=grid_mapping_name)
    return data_arr, grid_mapping_var


def area2cf(data_arr: xr.DataArray, include_lonlats: bool = False, got_lonlats: bool = False) -> list[xr.DataArray]:
    """Convert the area of a DataArray to a CF grid mapping and/or lon/lat coordinates.

    Returns a list containing the (possibly augmented) DataArray, preceded by
    the grid-mapping variable when the area is an AreaDefinition.
    """
    area = data_arr.attrs["area"]
    # Swath data always needs explicit longitude/latitude coordinates.
    need_lonlats = include_lonlats or isinstance(area, SwathDefinition)
    if need_lonlats and not got_lonlats:
        data_arr = _add_lonlat_coords(data_arr)
    result = []
    if isinstance(area, AreaDefinition):
        data_arr, grid_mapping_var = _add_grid_mapping(data_arr)
        result.append(grid_mapping_var)
    result.append(data_arr)
    return result
231 changes: 231 additions & 0 deletions satpy/cf/attrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# Copyright (c) 2017-2023 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""CF processing of attributes."""
from __future__ import annotations

import datetime
import json
import logging
from collections import OrderedDict

import numpy as np
import xarray as xr

from satpy.writers.utils import flatten_dict

logger = logging.getLogger(__name__)


class AttributeEncoder(json.JSONEncoder):
    """JSON encoder for dataset attributes."""

    def default(self, obj):
        """Return a json-serializable object for *obj*.

        Dictionaries, lists/tuples and multi-dimensional arrays are encoded
        recursively in order to facilitate decoding.
        """
        if isinstance(obj, dict):
            return {key: self.default(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple, np.ndarray)):
            return [self.default(element) for element in obj]
        return self._encode(obj)

    def _encode(self, obj):
        """Encode the given object as a json-serializable datatype."""
        # bool must be handled before int because bool is an int subclass.
        if isinstance(obj, (bool, np.bool_)):
            return str(obj).lower()
        if isinstance(obj, (int, float, str)):
            return obj
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.void):
            return tuple(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Fall back to the string representation for anything else.
        return str(obj)


def _encode_numpy_array(obj):
    """Encode numpy array as a netCDF4 serializable datatype.

    Raises:
        ValueError: If the array is a record array or has more than one dimension.
    """
    from satpy.writers.cf_writer import NC4_DTYPES

    # Record arrays and multi-dimensional arrays cannot be represented as
    # netCDF attributes; only plain 1-d arrays are supported.
    if obj.dtype.fields or len(obj.shape) > 1:
        raise ValueError("Only a 1D numpy array can be encoded as netCDF attribute.")
    if obj.dtype in NC4_DTYPES:
        return obj
    if obj.dtype == np.bool_:
        # Boolean arrays are not supported, convert to array of strings.
        return [s.lower() for s in obj.astype(str)]
    return obj.tolist()

Check warning on line 81 in satpy/cf/attrs.py

View check run for this annotation

Codecov / codecov/patch

satpy/cf/attrs.py#L81

Added line #L81 was not covered by tests


def _encode_object(obj):
"""Try to encode `obj` as a netCDF/Zarr compatible datatype which most closely resembles the object's nature.

Raises:
ValueError if no such datatype could be found
"""
is_nonbool_int = isinstance(obj, int) and not isinstance(obj, (bool, np.bool_))
is_encode_type = isinstance(obj, (float, str, np.integer, np.floating))
if is_nonbool_int or is_encode_type:
return obj
elif isinstance(obj, np.ndarray):
return _encode_numpy_array(obj)
raise ValueError("Unable to encode")


def _try_decode_object(obj):
"""Try to decode byte string."""
try:
decoded = obj.decode()
except AttributeError:
decoded = obj
return decoded


def _encode_python_objects(obj):
    """Try to find the datatype which most closely resembles the object's nature.

    If on failure, encode as a string. Plain lists are encoded recursively.
    """
    is_flat_sequence = isinstance(obj, (list, tuple)) and not any(
        isinstance(item, (list, tuple)) for item in obj
    )
    if is_flat_sequence:
        return [_encode_to_cf(item) for item in obj]
    try:
        return _encode_object(obj)
    except ValueError:
        # Last resort: serialize to JSON (decoding bytes first) and strip the
        # surrounding quotes so plain strings stay plain.
        decoded = _try_decode_object(obj)
        return json.dumps(decoded, cls=AttributeEncoder).strip('"')


def _encode_to_cf(obj):
    """Encode the given object as a netcdf compatible datatype."""
    # Objects that know how to serialize themselves (duck-typed via .to_cf())
    # take precedence; everything else goes through the generic encoder.
    try:
        return obj.to_cf()
    except AttributeError:
        return _encode_python_objects(obj)


def _encode_nc_attrs(attrs):
    """Encode dataset attributes in a netcdf compatible datatype.

    Args:
        attrs (dict): Attributes to be encoded.

    Returns:
        dict: Encoded (and sorted) attributes; None-valued entries are dropped.

    """
    return OrderedDict(
        (key, _encode_to_cf(value))
        for key, value in sorted(attrs.items())
        if value is not None
    )


def preprocess_attrs(
    data_arr: xr.DataArray,
    flatten_attrs: bool,
    exclude_attrs: list[str] | None
) -> xr.DataArray:
    """Preprocess DataArray attributes to be written into CF-compliant netCDF/Zarr.

    Drops internal/None attributes, stringifies ancillary variables and
    prerequisites, optionally flattens nested attributes and finally encodes
    everything to netCDF-compatible types. The DataArray is modified in place
    and returned.
    """
    _drop_attrs(data_arr, exclude_attrs)
    _add_ancillary_variables_attrs(data_arr)
    _format_prerequisites_attrs(data_arr)

    attrs = data_arr.attrs
    # CF wants a descriptive name; fall back to the variable name if neither
    # long_name nor standard_name is present.
    if not ("long_name" in attrs or "standard_name" in attrs):
        attrs["long_name"] = data_arr.name

    if flatten_attrs:
        attrs = flatten_dict(attrs)

    data_arr.attrs = _encode_nc_attrs(attrs)
    return data_arr


def _drop_attrs(
    data_arr: xr.DataArray,
    user_excluded_attrs: list[str] | None
) -> None:
    """Remove undesirable attributes from *data_arr* in place."""
    excluded = list(user_excluded_attrs) if user_excluded_attrs else []
    excluded.extend(_get_satpy_attrs(data_arr))
    excluded.extend(_get_none_attrs(data_arr))
    # The 'area' attribute is handled separately via grid mappings/lonlats.
    excluded.append("area")
    for attr_name in excluded:
        data_arr.attrs.pop(attr_name, None)


def _get_satpy_attrs(data_arr: xr.DataArray) -> list[str]:
"""Remove _satpy attribute."""
return [key for key in data_arr.attrs if key.startswith("_satpy")] + ["_last_resampler"]


def _get_none_attrs(data_arr: xr.DataArray) -> list[str]:
"""Remove attribute keys with None value."""
return [attr_name for attr_name, attr_val in data_arr.attrs.items() if attr_val is None]


def _add_ancillary_variables_attrs(data_arr: xr.DataArray) -> None:
"""Replace ancillary_variables DataArray with a list of their name."""
list_ancillary_variable_names = [da_ancillary.attrs["name"]
for da_ancillary in data_arr.attrs.get("ancillary_variables", [])]
if list_ancillary_variable_names:
data_arr.attrs["ancillary_variables"] = " ".join(list_ancillary_variable_names)
else:
data_arr.attrs.pop("ancillary_variables", None)


def _format_prerequisites_attrs(data_arr: xr.DataArray) -> None:
"""Reformat prerequisites attribute value to string."""
if "prerequisites" in data_arr.attrs:
data_arr.attrs["prerequisites"] = [np.bytes_(str(prereq)) for prereq in data_arr.attrs["prerequisites"]]


def _add_history(attrs):
"""Add 'history' attribute to dictionary."""
_history_create = "Created by pytroll/satpy on {}".format(datetime.datetime.utcnow())
if "history" in attrs:
if isinstance(attrs["history"], list):
attrs["history"] = "".join(attrs["history"])
attrs["history"] += "\n" + _history_create
else:
attrs["history"] = _history_create
return attrs


def preprocess_header_attrs(header_attrs, flatten_attrs=False):
    """Prepare file header attributes.

    Optionally flattens nested attributes, encodes them to netCDF-compatible
    types and appends a 'history' entry. Returns the resulting dictionary.
    """
    if header_attrs is None:
        header_attrs = {}
    else:
        if flatten_attrs:
            header_attrs = flatten_dict(header_attrs)
        header_attrs = _encode_nc_attrs(header_attrs)  # OrderedDict
    return _add_history(header_attrs)
Loading
Loading