From 076ae6d1be7eedbd593aaf61945ae0c87fce1f23 Mon Sep 17 00:00:00 2001 From: Julia Dark <24235303+jp-dark@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:39:24 -0500 Subject: [PATCH] [python] Export full experiment to SpatialData (#3409) Add a `to_spatial_data` function to `io.spatial` that exports a full SOMA `Experiment` to SpatialData. --- .../src/tiledbsoma/io/spatial/__init__.py | 3 +- .../src/tiledbsoma/io/spatial/outgest.py | 224 ++++++++++++- .../python/tests/test_basic_spatialdata_io.py | 298 ++++++++++++++++++ 3 files changed, 521 insertions(+), 4 deletions(-) create mode 100644 apis/python/tests/test_basic_spatialdata_io.py diff --git a/apis/python/src/tiledbsoma/io/spatial/__init__.py b/apis/python/src/tiledbsoma/io/spatial/__init__.py index af2c4a92d3..fa1af654bd 100644 --- a/apis/python/src/tiledbsoma/io/spatial/__init__.py +++ b/apis/python/src/tiledbsoma/io/spatial/__init__.py @@ -5,5 +5,6 @@ """ from .ingest import VisiumPaths, from_visium +from .outgest import to_spatial_data -__all__ = ["from_visium", "VisiumPaths"] +__all__ = ["to_spatial_data", "from_visium", "VisiumPaths"] diff --git a/apis/python/src/tiledbsoma/io/spatial/outgest.py b/apis/python/src/tiledbsoma/io/spatial/outgest.py index 7b23ccd7a1..02d5376fa5 100644 --- a/apis/python/src/tiledbsoma/io/spatial/outgest.py +++ b/apis/python/src/tiledbsoma/io/spatial/outgest.py @@ -3,7 +3,7 @@ # # Licensed under the MIT License. import warnings -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence, Tuple, Union import pandas as pd import somacore @@ -26,8 +26,10 @@ warnings.warn("Experimental spatial outgestor requires the geopandas package.") raise err -from ... import MultiscaleImage, PointCloudDataFrame -from ..._constants import SOMA_JOINID +from ... import Experiment, MultiscaleImage, PointCloudDataFrame, Scene +from ..._constants import SOMA_JOINID, SPATIAL_DISCLAIMER +from ..._spatial_util import transform_from_json +from .. import to_anndata from ._xarray_backend import dense_nd_array_to_data_array, images_to_datatree if TYPE_CHECKING: @@ -406,3 +408,219 @@ def to_spatial_data_multiscale_image( ) return images_to_datatree(image_data_arrays) + + +def _get_transform_from_collection( + key: str, metadata: Mapping[str, Any] +) -> Optional[somacore.CoordinateTransform]: + transform_key = f"soma_scene_registry_{key}" + if transform_key in metadata: + transform_json = metadata[transform_key] + return transform_from_json(transform_json) + return None + + +def _add_scene_to_spatial_data( + sdata: sd.SpatialData, + scene_id: str, + scene: Scene, + *, + obs_id_name: str, + var_id_name: str, + measurement_names: Optional[Sequence[str]] = None, +) -> None: + """Adds items from a Scene to a SpatialData object. + + Args: + sdata: SpatialData object to update. + scene_id: Name of the scene that will be added to the SpatialData object. + scene: The scene that is being added to the SpatialData object. + obs_id_name: Name to use for the ``soma_joinid`` in ``obsl``. + var_id_name: Name to use fo the ``soma_joinid in ``varl``. + measurement_names: The names of measurements to export. If ``None``, all + measurements are included. Defaults to ``None``. + + """ + # Cannot have spatial data if no coordinate space. + if scene.coordinate_space is None: + return + + # Get the map from Scene dimension names to SpatialData dimension names. + input_axis_names = scene.coordinate_space.axis_names + _, scene_dim_map = _convert_axis_names(input_axis_names, input_axis_names) + + # Export obsl data to SpatialData. + if "obsl" in scene: + for key, df in scene.obsl.items(): + output_key = f"{scene_id}_{key}" + transform = _get_transform_from_collection(key, scene.obsl.metadata) + if isinstance(df, PointCloudDataFrame): + if "soma_geometry" in df.metadata: + sdata.shapes[output_key] = to_spatial_data_shapes( + df, + key=output_key, + scene_id=scene_id, + scene_dim_map=scene_dim_map, + transform=transform, + soma_joinid_name=obs_id_name, + ) + else: + sdata.points[output_key] = to_spatial_data_points( + df, + key=output_key, + scene_id=scene_id, + scene_dim_map=scene_dim_map, + transform=transform, + soma_joinid_name=obs_id_name, + ) + else: + warnings.warn( + f"Skipping obsl[{key}] in Scene {scene_id}; unexpected datatype" + f" {type(df).__name__}." + ) + + # Export varl data to SpatialData. + if "varl" in scene: + for measurement_name, subcoll in scene.varl.items(): + if ( + measurement_names is not None + and measurement_name not in measurement_names + ): + continue + for key, df in subcoll.items(): + output_key = f"{scene_id}_{measurement_name}_{key}" + transform = _get_transform_from_collection(key, subcoll.metadata) + if isinstance(df, PointCloudDataFrame): + if "soma_geometry" in df.metadata: + sdata.shapes[output_key] = to_spatial_data_shapes( + df, + key=output_key, + scene_id=scene_id, + scene_dim_map=scene_dim_map, + transform=transform, + soma_joinid_name=var_id_name, + ) + else: + sdata.points[output_key] = to_spatial_data_points( + df, + key=output_key, + scene_id=scene_id, + scene_dim_map=scene_dim_map, + transform=transform, + soma_joinid_name=var_id_name, + ) + else: + warnings.warn( + f"Skipping varl[{measurement_name}][{key}] in Scene " + f"{scene_id}; unexpected datatype {type(df).__name__}." + ) + + # Export img data to SpatialData. + if "img" in scene: + for key, image in scene.img.items(): + output_key = f"{scene_id}_{key}" + transform = _get_transform_from_collection(key, scene.img.metadata) + if not isinstance(image, MultiscaleImage): + warnings.warn( # type: ignore[unreachable] + f"Skipping img[{image}] in Scene {scene_id}; unexpected " + f"datatype {type(image).__name__}." + ) + if image.level_count == 1: + sdata.images[output_key] = to_spatial_data_image( + image, + 0, + key=output_key, + scene_id=scene_id, + scene_dim_map=scene_dim_map, + transform=transform, + ) + else: + sdata.images[f"{scene_id}_{key}"] = to_spatial_data_multiscale_image( + image, + key=output_key, + scene_id=scene_id, + scene_dim_map=scene_dim_map, + transform=transform, + ) + + +def to_spatial_data( + experiment: Experiment, + *, + measurement_names: Optional[Sequence[str]] = None, + scene_names: Optional[Sequence[str]] = None, + obs_id_name: str = "obs_id", + var_id_name: str = "var_id", + table_kwargs: Optional[Mapping[str, Dict[str, Any]]] = None, +) -> sd.SpatialData: + """Converts the experiment group to SpatialData format. + + Args: + experiment: + measurement_names: The names of measurements to export. If ``None``, all + measurements are included. Defaults to ``None``. + scene_names: The names of the scenes to export. If ``None``, all scenes are + included. Defaults to ``None``. + obs_id_name: Column name to use for ``obs`` dataframes. Defaults to + ``"obs_id"``. + var_id_name: Column name to use for ``var`` dataframes. Defaults to + ``"var_id|``. + table_kwargs: Optional mapping from measurment name to keyword arguments to + pass to table conversions. See :method:`to_anndata` for possible keyword + arguments. + """ + warnings.warn(SPATIAL_DISCLAIMER) + + # Read non-spatial data into Anndata tables. + if "ms" in experiment: + if measurement_names is None: + ms_keys = tuple(experiment.ms.keys()) + else: + ms_keys = tuple(measurement_names) + if table_kwargs is None: + table_kwargs = {} + tables = { + measurement_name: ( + to_anndata( + experiment, + measurement_name, + obs_id_name=obs_id_name, + var_id_name=var_id_name, + **table_kwargs[measurement_name], + ) + if measurement_name in table_kwargs + else to_anndata( + experiment, + measurement_name, + obs_id_name=obs_id_name, + var_id_name=var_id_name, + ) + ) + for measurement_name in ms_keys + } + else: + tables = {} + + sdata = sd.SpatialData(tables=tables) + + # If no spatial data, return just the tables. + if "spatial" not in experiment: + return sdata + + if scene_names is None: + scene_names = tuple(experiment.spatial.keys()) + else: + scene_names = tuple(scene_names) + + for scene_id in scene_names: + scene = experiment.spatial[scene_id] + _add_scene_to_spatial_data( + sdata=sdata, + scene_id=scene_id, + scene=scene, + obs_id_name=obs_id_name, + var_id_name=var_id_name, + measurement_names=measurement_names, + ) + + return sdata diff --git a/apis/python/tests/test_basic_spatialdata_io.py b/apis/python/tests/test_basic_spatialdata_io.py new file mode 100644 index 0000000000..0b1ff95554 --- /dev/null +++ b/apis/python/tests/test_basic_spatialdata_io.py @@ -0,0 +1,298 @@ +from urllib.parse import urljoin + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import tiledbsoma as soma +from tiledbsoma import _factory + +spatial_outgest = pytest.importorskip("tiledbsoma.io.spatial.outgest") +sd = pytest.importorskip("spatialdata") +gpd = pytest.importorskip("geopandas") +shapely = pytest.importorskip("shapely") + + +@pytest.fixture(scope="module") +def sample_2d_data(): + return [ + np.random.randint(0, 255, size=(3, 64, 64), dtype=np.uint8), + np.random.randint(0, 255, size=(3, 32, 32), dtype=np.uint8), + np.random.randint(0, 255, size=(3, 16, 16), dtype=np.uint8), + np.random.randint(0, 255, size=(3, 8, 8), dtype=np.uint8), + ] + + +@pytest.fixture(scope="module") +def experiment_with_single_scene(tmp_path_factory, sample_2d_data) -> soma.Experiment: + uri = tmp_path_factory.mktemp("experiment_with_spatial_data").as_uri() + with soma.Experiment.create(uri) as exp: + assert exp.uri == uri + # Create spatial folder. + exp.add_new_collection("spatial") + + # Create scene 1. + scene1_uri = urljoin(exp.spatial.uri, "scene1") + exp.spatial["scene1"] = soma.Scene.create(scene1_uri) + scene1 = exp.spatial["scene1"] + assert scene1_uri == scene1.uri + scene1.coordinate_space = soma.CoordinateSpace.from_axis_names( + ["x_scene1", "y_scene1"] + ) + scene1.add_new_collection("obsl") + scene1.add_new_collection("varl") + scene1.varl.add_new_collection("RNA") + scene1.add_new_collection("img") + + # Add point cloud with shape to scene 1 obsl. + points1 = scene1.add_new_point_cloud_dataframe( + "points1", + "obsl", + transform=soma.UniformScaleTransform( + ("x_scene1", "y_scene1"), ("x", "y"), 2.0 + ), + schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]), + domain=[[0, 1], [0, 1]], + ) + points1.write( + pa.Table.from_pydict( + { + "soma_joinid": np.arange(4), + "x": np.array([0, 0, 0.5, 0.5]), + "y": np.array([0, 0.5, 0, 0.5]), + } + ) + ) + points1.metadata["soma_geometry"] = 1.0 + points1.metadata["soma_geometry_type"] = "radius" + + # Add point cloud wihtout shape to scene 1 obsl + points3 = scene1.add_new_point_cloud_dataframe( + "points3", + "obsl", + transform=soma.UniformScaleTransform( + ("x_scene1", "y_scene1"), ("x", "y"), 4.0 + ), + schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]), + domain=[[-1, 0], [-1, 0]], + ) + points3.write( + pa.Table.from_pydict( + { + "soma_joinid": np.arange(4), + "x": np.array([0, 0, -0.5, -0.5]), + "y": np.array([0, -0.5, 0, -0.5]), + } + ) + ) + + # Add point cloud without shape to scene 1 varl. + points2 = scene1.add_new_point_cloud_dataframe( + "points2", + ["varl", "RNA"], + transform=soma.UniformScaleTransform( + ("x_scene1", "y_scene1"), ("x", "y"), -1.0 + ), + schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]), + domain=[[-1, 0], [-1, 0]], + ) + points2.write( + pa.Table.from_pydict( + { + "soma_joinid": np.arange(4), + "x": np.array([0, 0, -0.5, -0.5]), + "y": np.array([0, -0.5, 0, -0.5]), + } + ) + ) + + # Add point cloud with shape to scene 1 varl. + points4 = scene1.add_new_point_cloud_dataframe( + "points4", + ["varl", "RNA"], + transform=soma.UniformScaleTransform( + ("x_scene1", "y_scene1"), ("x", "y"), 0.25 + ), + schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]), + domain=[[0, 1], [0, 1]], + ) + points4.write( + pa.Table.from_pydict( + { + "soma_joinid": np.arange(4), + "x": np.array([0, 0, 0.5, 0.5]), + "y": np.array([0, 0.5, 0, 0.5]), + } + ) + ) + points4.metadata["soma_geometry"] = 2.0 + points4.metadata["soma_geometry_type"] = "radius" + + # Add multiscale image with a single image. + with scene1.add_new_multiscale_image( + "image1", + "img", + type=pa.uint8(), + level_shape=(3, 64, 64), + transform=soma.UniformScaleTransform( + ("x_scene1", "y_scene1"), ("x", "y"), 0.5 + ), + ) as image1: + coords = (slice(None), slice(None), slice(None)) + l0 = image1["level0"] + l0.write(coords, pa.Tensor.from_numpy(sample_2d_data[0])) + + # Add multiscale image with multiple resolutions. + with scene1.add_new_multiscale_image( + "image2", + "img", + type=pa.uint8(), + level_key="fullres", + level_shape=(3, 32, 32), + transform=soma.UniformScaleTransform( + ("x_scene1", "y_scene1"), ("x", "y"), 0.5 + ), + ) as image2: + coords = (slice(None), slice(None), slice(None)) + fullres = image2["fullres"] + fullres.write(coords, pa.Tensor.from_numpy(sample_2d_data[1])) + hires = image2.add_new_level("hires", shape=(3, 16, 16)) + hires.write(coords, pa.Tensor.from_numpy(sample_2d_data[2])) + lowres = image2.add_new_level("lowres", shape=(3, 8, 8)) + lowres.write(coords, pa.Tensor.from_numpy(sample_2d_data[3])) + scene1.close() + + return soma.Experiment.open(uri, mode="r") + + +def test_outgest_no_spatial(tmp_path, conftest_pbmc_small): + # Create the SOMA Experiment. + output_path = urljoin(f"{tmp_path.as_uri()}/", "outgest_no_spatial") + soma.io.from_anndata(output_path, conftest_pbmc_small, measurement_name="RNA") + + # Read full experiment into SpatialData. + with _factory.open(output_path) as exp: + sdata = spatial_outgest.to_spatial_data(exp) + + # Check the number of assets (exactly 1 table) is as expected. + assert len(sdata.tables) == 2 + assert len(sdata.points) == 0 + assert len(sdata.shapes) == 0 + assert len(sdata.images) == 0 + + # Check the values of the anndata table. + rna = sdata.tables["RNA"] + assert rna.obs.shape == conftest_pbmc_small.obs.shape + assert rna.var.shape == conftest_pbmc_small.var.shape + assert rna.X.shape == conftest_pbmc_small.X.shape + + for key in conftest_pbmc_small.obsm.keys(): + assert rna.obsm[key].shape == conftest_pbmc_small.obsm[key].shape + for key in conftest_pbmc_small.varm.keys(): + assert rna.varm[key].shape == conftest_pbmc_small.varm[key].shape + for key in conftest_pbmc_small.obsp.keys(): + assert rna.obsp[key].shape == conftest_pbmc_small.obsp[key].shape + for key in conftest_pbmc_small.varp.keys(): + assert rna.varp[key].shape == conftest_pbmc_small.varp[key].shape + + # Check the values of the anndata table. + raw = sdata.tables["raw"] + assert raw.var.shape == conftest_pbmc_small.raw.var.shape + assert raw.X.shape == conftest_pbmc_small.raw.shape + + +def test_outgest_spatial_only(experiment_with_single_scene, sample_2d_data): + # Export to SpatialData. + sdata = spatial_outgest.to_spatial_data(experiment_with_single_scene) + + # Check the number of assets is correct. + assert len(sdata.tables) == 0 + assert len(sdata.images) == 2 + assert len(sdata.shapes) == 2 + assert len(sdata.points) == 2 + + # Check image1. + image1 = sdata.images["scene1_image1"] + assert isinstance(image1, sd.models.models.DataArray) # Verify single scale image. + assert image1.attrs["transform"] == { + "scene1": sd.transformations.Scale([2, 2], ("x", "y")) + } + image_data = image1.data.compute() + np.testing.assert_equal(image_data, sample_2d_data[0]) + + # Check image2. + image2 = sdata.images["scene1_image2"] + assert isinstance(image2, sd.models.models.DataTree) # Verify mulitscale image. + for index in range(3): + image_level = image2[f"scale{index}"]["image"] + image_data = image_level.data.compute() + np.testing.assert_equal(image_data, sample_2d_data[index + 1]) + scale = 2 ** (index + 1) + assert image_level.attrs["transform"] == { + "scene1": sd.transformations.Scale([scale, scale], ("x", "y")) + } + + # Check points1. + points1 = sdata.shapes["scene1_points1"] + points1_expected = gpd.GeoDataFrame.from_dict( + { + "obs_id": np.arange(4), + "radius": np.ones((4,), dtype=np.float64), + "geometry": shapely.points( + [[0, 0], [0, 0.5], [0.5, 0], [0.5, 0.5]] + ).tolist(), + } + ) + assert all(points1 == points1_expected) + assert points1.attrs["transform"] == { + "scene1": sd.transformations.Scale([0.5, 0.5], ("x", "y")) + } + + # Check points2. + points2 = sdata.points["scene1_RNA_points2"] + points2_expected = pd.DataFrame.from_dict( + { + "x": np.array([0, 0, -0.5, -0.5]), + "y": np.array([0, -0.5, 0, -0.5]), + "var_id": np.arange(4), + } + ) + points2_data = points2.compute() + print(points2_data) + assert all(points2_data == points2_expected) + assert points2.attrs["transform"] == { + "scene1": sd.transformations.Scale([-1.0, -1.0], ("x", "y")) + } + + # Check points3. + points3 = sdata.points["scene1_points3"] + points3_expected = pd.DataFrame.from_dict( + { + "x": np.array([0, 0, -0.5, -0.5]), + "y": np.array([0, -0.5, 0, -0.5]), + "obs_id": np.arange(4), + } + ) + points3_data = points3.compute() + print(points3_data) + assert all(points3_data == points3_expected) + assert points3.attrs["transform"] == { + "scene1": sd.transformations.Scale([0.25, 0.25], ("x", "y")) + } + + # Check points4. + points4 = sdata.shapes["scene1_RNA_points4"] + points4_expected = gpd.GeoDataFrame.from_dict( + { + "var_id": np.arange(4), + "radius": np.ones((4,), dtype=np.float64), + "geometry": shapely.points( + [[0, 0], [0, 0.5], [0.5, 0], [0.5, 0.5]] + ).tolist(), + } + ) + assert all(points4 == points4_expected) + assert points4.attrs["transform"] == { + "scene1": sd.transformations.Scale([4.0, 4.0], ("x", "y")) + }