Merge branch 'main' into pre-commit-ci-update-config

scverse · Dec 16, 2024 · f09cab8 · f09cab8
2 parents e61acc3 + 803a66e
commit f09cab8
Show file tree

Hide file tree

Showing 20 changed files with 267 additions and 194 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -46,8 +46,8 @@ If applicable, add screenshots to help explain your problem.
 
 **Desktop (optional):**
 
--   OS: [e.g. macOS, Windows, Linux]
--   Version [e.g. 22]
+- OS: [e.g. macOS, Windows, Linux]
+- Version [e.g. 22]
 
 **Additional context**
 Add any other context about the problem here.
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/README.md b/README.md
@@ -11,9 +11,9 @@
 
 SpatialData is a data framework that comprises a FAIR storage format and a collection of python libraries for performant access, alignment, and processing of uni- and multi-modal spatial omics datasets. This repository contains the core spatialdata library. See the links below to learn more about other packages in the SpatialData ecosystem.
 
--   [spatialdata-io](https://github.com/scverse/spatialdata-io): load data from common spatial omics technologies into spatialdata.
--   [spatialdata-plot](https://github.com/scverse/spatialdata-plot): Static plotting library for spatialdata.
--   [napari-spatialdata](https://github.com/scverse/napari-spatialdata): napari plugin for interactive exploration and annotation of spatial data.
+- [spatialdata-io](https://github.com/scverse/spatialdata-io): load data from common spatial omics technologies into spatialdata.
+- [spatialdata-plot](https://github.com/scverse/spatialdata-plot): Static plotting library for spatialdata.
+- [napari-spatialdata](https://github.com/scverse/napari-spatialdata): napari plugin for interactive exploration and annotation of spatial data.
 
 [//]: # "numfocus-fiscal-sponsor-attribution"
 
@@ -32,16 +32,16 @@ The spatialdata project also received support by the Chan Zuckerberg Initiative.
 
 ![SpatialDataOverview](https://github.com/scverse/spatialdata/assets/1120672/cb91071f-12a7-4b8e-9430-2b3a0f65e52f)
 
--   **The library is currently under review.** We expect there to be changes as the community provides feedback. We have an announcement channel for communicating these changes, please see the contact section below.
--   The SpatialData storage format is built on top of the [OME-NGFF](https://ngff.openmicroscopy.org/latest/) specification.
+- **The library is currently under review.** We expect there to be changes as the community provides feedback. We have an announcement channel for communicating these changes, please see the contact section below.
+- The SpatialData storage format is built on top of the [OME-NGFF](https://ngff.openmicroscopy.org/latest/) specification.
 
 ## Getting started
 
 Please refer to the [documentation][link-docs]. In particular:
 
--   [API documentation][link-api].
--   [Design doc][link-design-doc].
--   [Example notebooks][link-notebooks].
+- [API documentation][link-api].
+- [Design doc][link-design-doc].
+- [Example notebooks][link-notebooks].
 
 Another useful resource to get started is the source code of the [`spatialdata-io`](https://github.com/scverse/spatialdata-io) package, which shows examples of how to read data from common technologies.
 
@@ -61,20 +61,20 @@ mamba install -c conda-forge spatialdata napari-spatialdata spatialdata-io spati
 
 ## Limitations
 
--   Code only manually tested for Windows machines. Currently the framework is being developed using Linux, macOS and Windows machines, but it is automatically tested only for Linux and macOS machines.
+- Code only manually tested for Windows machines. Currently the framework is being developed using Linux, macOS and Windows machines, but it is automatically tested only for Linux and macOS machines.
 
 ## Contact
 
 To get involved in the discussion, or if you need help to get started, you are welcome to use the following options.
 
--   <ins>Chat</ins> via [`scverse` Zulip](https://scverse.zulipchat.com/#narrow/stream/315824-spatial) (public or 1 to 1).
--   <ins>Forum post</ins> in the [scverse discourse forum](https://discourse.scverse.org/).
--   <ins>Bug report/feature request</ins> via the [GitHub issue tracker][issue-tracker].
--   <ins>Zoom call</ins> as part of the SpatialData Community Meetings, held every 2 weeks on Thursday, [schedule here](https://hackmd.io/enWU826vRai-JYaL7TZaSw).
+- <ins>Chat</ins> via [`scverse` Zulip](https://scverse.zulipchat.com/#narrow/stream/315824-spatial) (public or 1 to 1).
+- <ins>Forum post</ins> in the [scverse discourse forum](https://discourse.scverse.org/).
+- <ins>Bug report/feature request</ins> via the [GitHub issue tracker][issue-tracker].
+- <ins>Zoom call</ins> as part of the SpatialData Community Meetings, held every 2 weeks on Thursday, [schedule here](https://hackmd.io/enWU826vRai-JYaL7TZaSw).
 
 Finally, especially relevant for developers that are building a library upon `spatialdata`, please follow this channel for:
 
--   <ins>Announcements</ins> on new features and important changes [Zulip](https://imagesc.zulipchat.com/#narrow/stream/329057-scverse/topic/spatialdata.20announcements).
+- <ins>Announcements</ins> on new features and important changes [Zulip](https://imagesc.zulipchat.com/#narrow/stream/329057-scverse/topic/spatialdata.20announcements).
 
 ## Citation
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ dependencies = [
     "anndata>=0.9.1",
     "click",
     "dask-image",
-    "dask>=2024.4.1",
+    "dask>=2024.4.1,<=2024.11.2",
     "fsspec",
     "geopandas>=0.14",
     "multiscale_spatial_image>=2.0.2",

diff --git a/src/spatialdata/_core/_elements.py b/src/spatialdata/_core/_elements.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from collections import UserDict
-from collections.abc import Iterable
+from collections.abc import Iterable, KeysView, ValuesView
 from typing import Any
 from warnings import warn
 
@@ -56,6 +56,14 @@ def __delitem__(self, key: str) -> None:
         self._shared_keys.remove(key)
         super().__delitem__(key)
 
+    def keys(self) -> KeysView[str]:
+        """Return the keys of the Elements."""
+        return self.data.keys()
+
+    def values(self) -> ValuesView[Any]:
+        """Return the values of the Elements."""
+        return self.data.values()
+
 
 class Images(Elements):
     def __setitem__(self, key: str, value: Raster_T) -> None:

diff --git a/src/spatialdata/_core/operations/rasterize.py b/src/spatialdata/_core/operations/rasterize.py
@@ -217,9 +217,9 @@ def rasterize(
         The table optionally containing the `value_key` and the name of the table in the returned `SpatialData` object.
         Must be `None` when `data` is a `SpatialData` object, otherwise it assumes the default value of `'table'`.
     return_regions_as_labels
-        By default, single-scale images of shape `(c, y, x)` are returned. If `True`, returns labels and shapes as
-        labels of shape `(y, x)` as opposed to an image of shape `(c, y, x)`. Points and images are always returned
-        as images, and multiscale raster data is always returned as single-scale data.
+        By default, single-scale images of shape `(c, y, x)` are returned. If `True`, returns labels, shapes and points
+        as labels of shape `(y, x)` as opposed to an image of shape `(c, y, x)`. Images are always returned as images,
+        and multiscale raster data is always returned as single-scale data.
     agg_func
         Available only when rasterizing points and shapes. A reduction function from datashader (its name, or a
         `Callable`). See the notes for more details on the default behavior.
@@ -234,6 +234,11 @@ def rasterize(
     into a `DataArray` (not a `DataTree`). So if a `SpatialData` object with elements is passed, a `SpatialData` object
     with single-scale images and labels will be returned.
 
+    When `return_regions_as_labels` is `True`, the returned `DataArray` object will have an attribute called
+    `label_index_to_category` that maps the label index to the category name. You can access it via
+    `returned_data.attrs["label_index_to_category"]`. The returned labels will start from 1 (0 is reserved for the
+    background), and will be contiguous.
+
     Notes
     -----
     For images and labels, the parameters `value_key`, `table_name`, `agg_func`, and `return_single_channel` are not
@@ -587,7 +592,7 @@ def rasterize_images_labels(
     )
     assert isinstance(transformed_dask, DaskArray)
     channels = xdata.coords["c"].values if schema in (Image2DModel, Image3DModel) else None
-    transformed_data = schema.parse(transformed_dask, dims=xdata.dims, c_coords=channels)  # type: ignore[call-arg,arg-type]
+    transformed_data = schema.parse(transformed_dask, dims=xdata.dims, c_coords=channels)  # type: ignore[call-arg]
 
     if target_coordinate_system != "global":
         remove_transformation(transformed_data, "global")
@@ -650,7 +655,7 @@ def rasterize_shapes_points(
     if value_key is not None:
         kwargs = {"sdata": sdata, "element_name": element_name} if element_name is not None else {"element": data}
         data[VALUES_COLUMN] = get_values(value_key, table_name=table_name, **kwargs).iloc[:, 0]  # type: ignore[arg-type, union-attr]
-    elif isinstance(data, GeoDataFrame):
+    elif isinstance(data, GeoDataFrame) or isinstance(data, DaskDataFrame) and return_regions_as_labels is True:
         value_key = VALUES_COLUMN
         data[VALUES_COLUMN] = data.index.astype("category")
     else:
@@ -706,6 +711,14 @@ def rasterize_shapes_points(
     agg = agg.fillna(0)
 
     if return_regions_as_labels:
+        if label_index_to_category is not None:
+            max_label = next(iter(reversed(label_index_to_category.keys())))
+        else:
+            max_label = int(agg.max().values)
+        max_uint16 = np.iinfo(np.uint16).max
+        if max_label > max_uint16:
+            raise ValueError(f"Maximum label index is {max_label}. Values higher than {max_uint16} are not supported.")
+        agg = agg.astype(np.uint16)
         return Labels2DModel.parse(agg, transformations=transformations)
 
     agg = agg.expand_dims(dim={"c": 1}).transpose("c", "y", "x")

diff --git a/src/spatialdata/_core/operations/transform.py b/src/spatialdata/_core/operations/transform.py
@@ -52,7 +52,7 @@ def _transform_raster(
     c_shape: tuple[int, ...]
     c_shape = (data.shape[0],) if "c" in axes else ()
     new_spatial_shape = tuple(
-        int(np.max(new_v[:, i]) - np.min(new_v[:, i])) for i in range(len(c_shape), n_spatial_dims + len(c_shape))  # type: ignore[operator]
+        int(np.max(new_v[:, i]) - np.min(new_v[:, i])) for i in range(len(c_shape), n_spatial_dims + len(c_shape))
     )
     output_shape = c_shape + new_spatial_shape
     translation_vector = np.min(new_v[:, :-1], axis=0)
@@ -86,8 +86,8 @@ def _transform_raster(
             # min_y_inverse = np.min(new_v_inverse[:, 1])
 
             if "c" in axes:
-                plt.imshow(da.moveaxis(transformed_dask, 0, 2), origin="lower", alpha=0.5)  # type: ignore[attr-defined]
-                plt.imshow(da.moveaxis(im, 0, 2), origin="lower", alpha=0.5)  # type: ignore[attr-defined]
+                plt.imshow(da.moveaxis(transformed_dask, 0, 2), origin="lower", alpha=0.5)
+                plt.imshow(da.moveaxis(im, 0, 2), origin="lower", alpha=0.5)
             else:
                 plt.imshow(transformed_dask, origin="lower", alpha=0.5)
                 plt.imshow(im, origin="lower", alpha=0.5)
@@ -322,7 +322,7 @@ def _(
     )
     c_coords = data.indexes["c"].values if "c" in data.indexes else None
     # mypy thinks that schema could be ShapesModel, PointsModel, ...
-    transformed_data = schema.parse(transformed_dask, dims=axes, c_coords=c_coords)  # type: ignore[call-arg,arg-type]
+    transformed_data = schema.parse(transformed_dask, dims=axes, c_coords=c_coords)  # type: ignore[call-arg]
     assert isinstance(transformed_data, DataArray)
     old_transformations = get_transformation(data, get_all=True)
     assert isinstance(old_transformations, dict)
@@ -448,7 +448,7 @@ def _(
     for ax in axes:
         indices = xtransformed["dim"] == ax
         new_ax = xtransformed[:, indices]
-        transformed[ax] = new_ax.data.flatten()  # type: ignore[attr-defined]
+        transformed[ax] = new_ax.data.flatten()
 
     old_transformations = get_transformation(data, get_all=True)
     assert isinstance(old_transformations, dict)
@@ -481,9 +481,9 @@ def _(
     )
     # TODO: nitpick, mypy expects a list of literals and here we have a list of strings.
     # I ignored but we may want to fix this
-    affine = transformation.to_affine(axes, axes)  # type: ignore[arg-type]
+    affine = transformation.to_affine(axes, axes)
     matrix = affine.matrix
-    shapely_notation = matrix[:-1, :-1].ravel().tolist() + matrix[:-1, -1].tolist()
+    shapely_notation = matrix[:-1, :-1].ravel().tolist() + matrix[:-1, -1].tolist()  # type: ignore[operator]
     transformed_geometry = data.geometry.affine_transform(shapely_notation)
     transformed_data = data.copy(deep=True)
     transformed_data.attrs[TRANSFORM_KEY] = {DEFAULT_COORDINATE_SYSTEM: Identity()}

diff --git a/src/spatialdata/_core/operations/vectorize.py b/src/spatialdata/_core/operations/vectorize.py
@@ -160,6 +160,13 @@ def to_polygons(data: SpatialElement, buffer_resolution: int | None = None) -> G
     """
     Convert a set of geometries (2D labels, 2D shapes) to approximated 2D polygons/multipolygons.
 
+    For optimal performance when converting rasters (:class:`xarray.DataArray` or :class:`datatree.DataTree`)
+    to polygons, it is recommended to configure `Dask` to use 'processes' rather than 'threads'.
+    For example, you can set this configuration with:
+
+    >>> import dask
+    >>> dask.config.set(scheduler='processes')
+
     Parameters
     ----------
     data
@@ -194,23 +201,22 @@ def _(
     else:
         element_single_scale = element
 
-    gdf_chunks = []
     chunk_sizes = element_single_scale.data.chunks
 
-    def _vectorize_chunk(chunk: np.ndarray, yoff: int, xoff: int) -> None:  # type: ignore[type-arg]
+    def _vectorize_chunk(chunk: np.ndarray, yoff: int, xoff: int) -> GeoDataFrame:  # type: ignore[type-arg]
         gdf = _vectorize_mask(chunk)
         gdf["chunk-location"] = f"({yoff}, {xoff})"
         gdf.geometry = gdf.translate(xoff, yoff)
-        gdf_chunks.append(gdf)
+        return gdf
 
     tasks = [
         dask.delayed(_vectorize_chunk)(chunk, sum(chunk_sizes[0][:iy]), sum(chunk_sizes[1][:ix]))
         for iy, row in enumerate(element_single_scale.data.to_delayed())
         for ix, chunk in enumerate(row)
     ]
-    dask.compute(tasks)
 
-    gdf = pd.concat(gdf_chunks)
+    results = dask.compute(*tasks)
+    gdf = pd.concat(results)
     gdf = GeoDataFrame([_dissolve_on_overlaps(*item) for item in gdf.groupby("label")], columns=["label", "geometry"])
     gdf.index = gdf["label"]
 

diff --git a/src/spatialdata/_core/query/relational_query.py b/src/spatialdata/_core/query/relational_query.py
@@ -214,7 +214,7 @@ def _filter_table_by_elements(
             # some instances have not a corresponding row in the table
             instances = np.setdiff1d(instances, n0)
         assert np.sum(to_keep) == len(instances)
-        assert sorted(set(instances.tolist())) == sorted(set(table.obs[instance_key].tolist()))
+        assert sorted(set(instances.tolist())) == sorted(set(table.obs[instance_key].tolist()))  # type: ignore[type-var]
         table_df = pd.DataFrame({instance_key: table.obs[instance_key], "position": np.arange(len(instances))})
         merged = pd.merge(table_df, pd.DataFrame(index=instances), left_on=instance_key, right_index=True, how="right")
         matched_positions = merged["position"].to_numpy()
@@ -467,7 +467,11 @@ def _left_join_spatialelement_table(
                 )
                 continue
 
-    joined_indices = joined_indices.dropna() if joined_indices is not None else None
+    if joined_indices is not None:
+        joined_indices = joined_indices.dropna()
+        # if nan were present, the dtype would have been changed to float
+        if joined_indices.dtype == float:
+            joined_indices = joined_indices.astype(int)
     joined_table = table[joined_indices, :].copy() if joined_indices is not None else None
     _inplace_fix_subset_categorical_obs(subset_adata=joined_table, original_adata=table)
 

diff --git a/src/spatialdata/_core/query/spatial_query.py b/src/spatialdata/_core/query/spatial_query.py
@@ -700,8 +700,8 @@ def _(
             bounding_box_mask = _bounding_box_mask_points(
                 points=points_query_coordinate_system,
                 axes=axes,
-                min_coordinate=min_c,
-                max_coordinate=max_c,
+                min_coordinate=min_c,  # type: ignore[arg-type]
+                max_coordinate=max_c,  # type: ignore[arg-type]
             )
             if len(bounding_box_mask) == 1:
                 bounding_box_mask = bounding_box_mask[0]

diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py
@@ -1656,11 +1656,12 @@ def tables(self) -> Tables:
         return self._tables
 
     @tables.setter
-    def tables(self, shapes: dict[str, GeoDataFrame]) -> None:
+    def tables(self, tables: dict[str, AnnData]) -> None:
         """Set tables."""
         self._shared_keys = self._shared_keys - set(self._tables.keys())
         self._tables = Tables(shared_keys=self._shared_keys)
-        for k, v in shapes.items():
+        for k, v in tables.items():
+            TableModel().validate(v)
             self._tables[k] = v
 
     @property

diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 import zarr
-from dask.dataframe import DataFrame as DaskDataFrame  # type: ignore[attr-defined]
+from dask.dataframe import DataFrame as DaskDataFrame
 from dask.dataframe import read_parquet
 from ome_zarr.format import Format
 

diff --git a/src/spatialdata/_types.py b/src/spatialdata/_types.py
@@ -1,18 +1,14 @@
+from typing import Any
+
 import numpy as np
 from xarray import DataArray, DataTree
 
 __all__ = ["ArrayLike", "ColorLike", "DTypeLike", "Raster_T"]
 
-try:
-    from numpy.typing import DTypeLike, NDArray
-
-    ArrayLike = NDArray[np.float64]
-    IntArrayLike = NDArray[np.int64]  # or any np.integer
+from numpy.typing import DTypeLike, NDArray
 
-except (ImportError, TypeError):
-    ArrayLike = np.ndarray  # type: ignore[misc]
-    IntArrayLike = np.ndarray  # type: ignore[misc]
-    DTypeLike = np.dtype  # type: ignore[misc, assignment]
+ArrayLike = NDArray[np.floating[Any]]
+IntArrayLike = NDArray[np.integer[Any]]
 
 Raster_T = DataArray | DataTree
 ColorLike = tuple[float, ...] | str
diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py
@@ -80,7 +80,7 @@ def _compute_paddings(data: DataArray, axis: str) -> tuple[int, int]:
         others = list(data.dims)
         others.remove(axis)
         # mypy (luca's pycharm config) can't see the isclose method of dask array
-        s = da.isclose(data.sum(dim=others), 0)  # type: ignore[attr-defined]
+        s = da.isclose(data.sum(dim=others), 0)
         # TODO: rewrite this to use dask array; can't get it to work with it
         x = s.compute()
         non_zero = np.where(x == 0)[0]

diff --git a/src/spatialdata/dataloader/datasets.py b/src/spatialdata/dataloader/datasets.py
@@ -144,7 +144,7 @@ def __init__(
                 **dict(rasterize_kwargs),
             )
             if rasterize
-            else bounding_box_query  # type: ignore[assignment]
+            else bounding_box_query
         )
         self._return = self._get_return(return_annotations, table_name)
         self.transform = transform

diff --git a/src/spatialdata/datasets.py b/src/spatialdata/datasets.py
@@ -182,7 +182,7 @@ def _image_blobs(
         masks = []
         for i in range(n_channels):
             mask = self._generate_blobs(length=length, seed=i)
-            mask = (mask - mask.min()) / np.ptp(mask)  # type: ignore[attr-defined]
+            mask = (mask - mask.min()) / np.ptp(mask)
             masks.append(mask)
 
         x = np.stack(masks, axis=0)