Test joins with string indices and instance id (#485)

* test join strings
* fix dtype aggregate

---------

Co-authored-by: Luca Marconato <[email protected]>
scverse · Mar 14, 2024 · a2970d3 · a2970d3
1 parent 09e339e
commit a2970d3
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 29 deletions.
diff --git a/src/spatialdata/_core/operations/aggregate.py b/src/spatialdata/_core/operations/aggregate.py
@@ -243,7 +243,14 @@ def _create_sdata_from_table_and_shapes(
 ) -> SpatialData:
     from spatialdata._core._deepcopy import deepcopy as _deepcopy
 
-    table.obs[instance_key] = table.obs_names.copy()
+    shapes_index_dtype = shapes.index.dtype if isinstance(shapes, GeoDataFrame) else shapes.dtype
+    try:
+        table.obs[instance_key] = table.obs_names.copy().astype(shapes_index_dtype)
+    except ValueError as err:
+        raise TypeError(
+            f"Instance key column dtype in table resulting from aggregation cannot be cast to the dtype of"
+            f"element {shapes_name}.index"
+        ) from err
     table.obs[region_key] = shapes_name
     table = TableModel.parse(table, region=shapes_name, region_key=region_key, instance_key=instance_key)
 

diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py
@@ -199,6 +199,13 @@ def validate_table_in_spatialdata(self, table: AnnData) -> None:
                     else:
                         dtype = element.index.dtype
                     if dtype != table.obs[instance_key].dtype:
+                        if dtype == str or table.obs[instance_key].dtype == str:
+                            raise TypeError(
+                                f"Table instance_key column ({instance_key}) has a dtype "
+                                f"({table.obs[instance_key].dtype}) that does not match the dtype of the indices of "
+                                f"the annotated element ({dtype})."
+                            )
+
                         warnings.warn(
                             (
                                 f"Table instance_key column ({instance_key}) has a dtype "

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
@@ -20,7 +20,6 @@
 from multiscale_spatial_image.multiscale_spatial_image import MultiscaleSpatialImage
 from multiscale_spatial_image.to_multiscale.to_multiscale import Methods
 from pandas import CategoricalDtype
-from pandas.errors import IntCastingNaNError
 from shapely._geometry import GeometryType
 from shapely.geometry import MultiPolygon, Point, Polygon
 from shapely.geometry.collection import GeometryCollection
@@ -795,6 +794,11 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None:
             raise ValueError(f"`{attr[self.REGION_KEY_KEY]}` not found in `adata.obs`.")
         if attr[self.INSTANCE_KEY] not in data.obs:
             raise ValueError(f"`{attr[self.INSTANCE_KEY]}` not found in `adata.obs`.")
+        if (dtype := data.obs[attr[self.INSTANCE_KEY]].dtype) not in [np.int16, np.int32, np.int64, str]:
+            raise TypeError(
+                f"Only np.int16, np.int32, np.int64 or string allowed as dtype for "
+                f"instance_key column in obs. Dtype found to be {dtype}"
+            )
         expected_regions = attr[self.REGION_KEY] if isinstance(attr[self.REGION_KEY], list) else [attr[self.REGION_KEY]]
         found_regions = data.obs[attr[self.REGION_KEY_KEY]].unique().tolist()
         if len(set(expected_regions).symmetric_difference(set(found_regions))) > 0:
@@ -881,14 +885,6 @@ def parse(
             adata.obs[region_key] = pd.Categorical(adata.obs[region_key])
         if instance_key is None:
             raise ValueError("`instance_key` must be provided.")
-        if adata.obs[instance_key].dtype != int:
-            try:
-                warnings.warn(
-                    f"Converting `{cls.INSTANCE_KEY}: {instance_key}` to integer dtype.", UserWarning, stacklevel=2
-                )
-                adata.obs[instance_key] = adata.obs[instance_key].astype(int)
-            except IntCastingNaNError as exc:
-                raise ValueError("Values within table.obs[] must be able to be coerced to int dtype.") from exc
 
         grouped = adata.obs.groupby(region_key, observed=True)
         grouped_size = grouped.size()
@@ -901,6 +897,7 @@ def parse(
 
         attr = {"region": region, "region_key": region_key, "instance_key": instance_key}
         adata.uns[cls.ATTRS_KEY] = attr
+        cls().validate(adata)
         return adata
 
 

diff --git a/tests/core/operations/test_spatialdata_operations.py b/tests/core/operations/test_spatialdata_operations.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import math
-import warnings
 
 import numpy as np
 import pytest
@@ -419,10 +418,7 @@ def test_validate_table_in_spatialdata(full_sdata):
     region, region_key, _ = get_table_keys(table)
     assert region == "labels2d"
 
-    # no warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("error")
-        full_sdata.validate_table_in_spatialdata(table)
+    full_sdata.validate_table_in_spatialdata(table)
 
     # dtype mismatch
     full_sdata.labels["labels2d"] = Labels2DModel.parse(full_sdata.labels["labels2d"].astype("int16"))
@@ -437,10 +433,7 @@ def test_validate_table_in_spatialdata(full_sdata):
     table.obs[region_key] = "points_0"
     full_sdata.set_table_annotates_spatialelement("table", region="points_0")
 
-    # no warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("error")
-        full_sdata.validate_table_in_spatialdata(table)
+    full_sdata.validate_table_in_spatialdata(table)
 
     # dtype mismatch
     full_sdata.points["points_0"].index = full_sdata.points["points_0"].index.astype("int16")

diff --git a/tests/core/query/test_relational_query.py b/tests/core/query/test_relational_query.py
@@ -22,6 +22,37 @@ def test_match_table_to_element(sdata_query_aggregation):
     # TODO: add tests for labels
 
 
+def test_join_using_string_instance_id_and_index(sdata_query_aggregation):
+    sdata_query_aggregation["table"].obs["instance_id"] = [
+        f"string_{i}" for i in sdata_query_aggregation["table"].obs["instance_id"]
+    ]
+    sdata_query_aggregation["values_circles"].index = pd.Index(
+        [f"string_{i}" for i in sdata_query_aggregation["values_circles"].index]
+    )
+    sdata_query_aggregation["values_polygons"].index = pd.Index(
+        [f"string_{i}" for i in sdata_query_aggregation["values_polygons"].index]
+    )
+
+    sdata_query_aggregation["values_polygons"] = sdata_query_aggregation["values_polygons"][:5]
+    sdata_query_aggregation["values_circles"] = sdata_query_aggregation["values_circles"][:5]
+
+    element_dict, table = join_sdata_spatialelement_table(
+        sdata_query_aggregation, ["values_circles", "values_polygons"], "table", "inner"
+    )
+    # Note that we started with 21 n_obs.
+    assert table.n_obs == 10
+
+    element_dict, table = join_sdata_spatialelement_table(
+        sdata_query_aggregation, ["values_circles", "values_polygons"], "table", "right_exclusive"
+    )
+    assert table.n_obs == 11
+
+    element_dict, table = join_sdata_spatialelement_table(
+        sdata_query_aggregation, ["values_circles", "values_polygons"], "table", "right"
+    )
+    assert table.n_obs == 21
+
+
 def test_left_inner_right_exclusive_join(sdata_query_aggregation):
     element_dict, table = join_sdata_spatialelement_table(
         sdata_query_aggregation, "values_polygons", "table", "right_exclusive"

diff --git a/tests/models/test_models.py b/tests/models/test_models.py
@@ -318,6 +318,14 @@ def test_table_model(
         region: str | np.ndarray,
     ) -> None:
         region_key = "reg"
+        obs = pd.DataFrame(
+            RNG.choice(np.arange(0, 100, dtype=float), size=(10, 3), replace=False), columns=["A", "B", "C"]
+        )
+        obs[region_key] = region
+        adata = AnnData(RNG.normal(size=(10, 2)), obs=obs)
+        with pytest.raises(TypeError, match="Only np.int16"):
+            model.parse(adata, region=region, region_key=region_key, instance_key="A")
+
         obs = pd.DataFrame(RNG.choice(np.arange(0, 100), size=(10, 3), replace=False), columns=["A", "B", "C"])
         obs[region_key] = region
         adata = AnnData(RNG.normal(size=(10, 2)), obs=obs)
@@ -332,16 +340,6 @@ def test_table_model(
         assert TableModel.REGION_KEY_KEY in table.uns[TableModel.ATTRS_KEY]
         assert table.uns[TableModel.ATTRS_KEY][TableModel.REGION_KEY] == region
 
-        obs["A"] = obs["A"].astype(str)
-        adata = AnnData(RNG.normal(size=(10, 2)), obs=obs)
-        with pytest.warns(UserWarning, match="Converting"):
-            model.parse(adata, region=region, region_key=region_key, instance_key="A")
-
-        obs["A"] = pd.Series(len([chr(ord("a") + i) for i in range(10)]))
-        adata = AnnData(RNG.normal(size=(10, 2)), obs=obs)
-        with pytest.raises(ValueError, match="Values within"):
-            model.parse(adata, region=region, region_key=region_key, instance_key="A")
-
     @pytest.mark.parametrize("model", [TableModel])
     @pytest.mark.parametrize("region", [["sample_1"] * 5 + ["sample_2"] * 5])
     def test_table_instance_key_values_not_unique(self, model: TableModel, region: str | np.ndarray):