single-cell-data · jp-dark · Aug 22, 2024 · Jun 5, 2024 · May 30, 2024 · Aug 19, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
   # Remove this once we can specify a recent pyarrow.
   "pyarrow-hotfix",
   "scipy",
+  "shapely",
   "typing-extensions>=4.1", # For LiteralString (py3.11)
 ]
 requires-python = "~=3.8"
@@ -62,5 +63,5 @@ python_version = 3.8
 
 [[tool.mypy.overrides]]
 # These dependencies do not currently have canonical type stubs.
-module = ["anndata", "pyarrow", "pyarrow.compute", "pyarrow_hotfix", "scipy"]
+module = ["anndata", "pyarrow", "pyarrow.compute", "pyarrow_hotfix", "scipy", "shapely"]
 ignore_missing_imports = true
diff --git a/python-spec/requirements-py3.10.txt b/python-spec/requirements-py3.10.txt
@@ -14,6 +14,7 @@ pyarrow-hotfix==0.6
 python-dateutil==2.9.0.post0
 pytz==2024.1
 scipy==1.13.1
+shapely==2.0.4
 six==1.16.0
 typing_extensions==4.12.2
 tzdata==2024.1
diff --git a/python-spec/requirements-py3.11.txt b/python-spec/requirements-py3.11.txt
@@ -12,6 +12,7 @@ pyarrow==16.1.0
 pyarrow-hotfix==0.6
 python-dateutil==2.9.0.post0
 pytz==2024.1
+shapely==2.0.4
 scipy==1.13.1
 six==1.16.0
 typing_extensions==4.12.2

diff --git a/python-spec/requirements-py3.12.txt b/python-spec/requirements-py3.12.txt
@@ -14,6 +14,7 @@ python-dateutil==2.9.0.post0
 pytz==2024.1
 scipy==1.13.1
 setuptools==70.0.0
+shapely==2.0.4
 six==1.16.0
 typing_extensions==4.12.2
 tzdata==2024.1

diff --git a/python-spec/requirements-py3.8-lint.txt b/python-spec/requirements-py3.8-lint.txt
@@ -29,6 +29,7 @@ pytz==2024.1
 PyYAML==6.0.1
 ruff==0.4.9
 scipy==1.10.1
+shapely==2.0.4
 six==1.16.0
 tomli==2.0.1
 types-pytz==2024.1.0.20240417

diff --git a/python-spec/requirements-py3.8.txt b/python-spec/requirements-py3.8.txt
@@ -13,6 +13,7 @@ pyarrow-hotfix==0.6
 python-dateutil==2.9.0.post0
 pytz==2024.1
 scipy==1.10.1
+shapely==2.0.4
 six==1.16.0
 typing_extensions==4.12.2
 tzdata==2024.1

diff --git a/python-spec/requirements-py3.9.txt b/python-spec/requirements-py3.9.txt
@@ -18,6 +18,7 @@ pytz==2024.1
 rsa==4.7.2
 s3transfer==0.6.0
 scipy==1.13.1
+shapely==2.0.4
 six==1.16.0
 typing_extensions==4.12.2
 tzdata==2024.1
diff --git a/python-spec/src/somacore/__init__.py b/python-spec/src/somacore/__init__.py
@@ -21,10 +21,13 @@
 from .coordinates import CoordinateTransform
 from .data import DataFrame
 from .data import DenseNDArray
+from .data import GeometryDataFrame
 from .data import NDArray
+from .data import PointCloud
 from .data import ReadIter
 from .data import SparseNDArray
 from .data import SparseRead
+from .data import SpatialDataFrame
 from .experiment import Experiment
 from .images import Image2DCollection
 from .images import ImageCollection
@@ -63,6 +66,9 @@
     "Scene",
     "ImageCollection",
     "Image2DCollection",
+    "SpatialDataFrame",
+    "PointCloud",
+    "GeometryDataFrame",
     "BatchSize",
     "IOfN",
     "ResultOrder",

diff --git a/python-spec/src/somacore/data.py b/python-spec/src/somacore/data.py
@@ -495,6 +495,252 @@ def nnz(self) -> int:
         raise NotImplementedError()
 
 
+class SpatialDataFrame(base.SOMAObject, metaclass=abc.ABCMeta):
+    """A multi-column table with spatial indexing and a user-defined schema.
+
+    Lifecycle: experimental
+    """
+
+    __slots__ = ()
+
+    # Data operations
+
+    # TODO: Update this to take spatial input and return output with
+    # coords, etc.
+    @abc.abstractmethod
+    def read(
+        self,
+        coords: options.SparseDFCoords = (),
+        column_names: Optional[Sequence[str]] = None,
+        *,
+        batch_size: options.BatchSize = options.BatchSize(),
+        partitions: Optional[options.ReadPartitions] = None,
+        result_order: options.ResultOrderStr = _RO_AUTO,
+        value_filter: Optional[str] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> "ReadIter[pa.Table]":
+        """Reads a user-defined slice of data into Arrow tables.
+
+        Args:
+            coords: for each index dimension, which rows to read.
+                Defaults to ``()``, meaning no constraint -- all IDs.
+            column_names: the named columns to read and return.
+                Defaults to ``None``, meaning no constraint -- all column names.
+            partitions: If present, specifies that this is part of
+                a partitioned read, and which part of the data to include.
+            result_order: the order to return results, specified as a
+                :class:`~options.ResultOrder` or its string value.
+            value_filter: an optional value filter to apply to the results.
+                The default of ``None`` represents no filter. Value filter
+                syntax is implementation-defined; see the documentation
+                for the particular SOMA implementation for details.
+        Returns:
+            A :class:`ReadIter` of :class:`pa.Table`s.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def write(
+        self,
+        values: Union[pa.RecordBatch, pa.Table],
+        *,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> Self:
+        """Writes the data from an Arrow table to the persistent object.
+
+        As duplicate index values are not allowed, index values already present
+        in the object are overwritten and new index values are added.
+
+        Args:
+            values: An Arrow table containing all columns, including
+                the index columns. The schema for the values must match
+                the schema for the ``SpatialDataFrame``.
+
+        Returns: ``self``, to enable method chaining.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    # Metadata operations
+
+    @property
+    @abc.abstractmethod
+    def schema(self) -> pa.Schema:
+        """The schema of the data in this dataframe.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @property
+    @abc.abstractmethod
+    def index_column_names(self) -> Tuple[str, ...]:
+        """The names of the index (dimension) columns.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @property
+    @abc.abstractmethod
+    def axis_names(self) -> Tuple[str, ...]:
+        """The names of the axes of the coordinate space the data is defined on.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @property
+    @abc.abstractmethod
+    def domain(self) -> Tuple[Tuple[Any, Any], ...]:
+        """The allowable range of values in each index column.
+
+        Returns: a tuple of minimum and maximum values, inclusive,
+            storable on each index column of the dataframe.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+
+class PointCloud(SpatialDataFrame, metaclass=abc.ABCMeta):
+    """A multi-column table with point data and a user-defined schema.
+
+    Lifecycle: experimental
+    """
+
+    __slots__ = ()
+    soma_type: Final = "SOMAPointCloud"  # type: ignore[misc]
+
+    @classmethod
+    @abc.abstractmethod
+    def create(
+        cls,
+        uri: str,
+        *,
+        schema: pa.Schema,
+        index_column_names: Sequence[str] = (options.SOMA_JOINID, "x", "y"),
+        axis_names: Sequence[str] = ("x", "y"),
+        domain: Optional[Sequence[Optional[Tuple[Any, Any]]]] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+        context: Optional[Any] = None,
+    ) -> Self:
+        """Creates a new ``PointCloud`` at the given URI.
+
+        The schema of the created point cloud will include a column named
+        ``soma_joinid`` of type ``pyarrow.int64``, with negative values disallowed, and
+        at least one axis with numeric type.  If a ``soma_joinid`` column is
+        present in the provided schema, it must be of the correct type.  If the
+        ``soma_joinid`` column is not provided, one will be added. The ``soma_joinid``
+        may be an index column. The axis columns must be index columns.
+
+        Args:
+            uri: The URI where the dataframe will be created.
+
+            schema: Arrow schema defining the per-column schema. This schema
+                must define all columns, including columns to be named as index
+                columns.  If the schema includes types unsupported by the SOMA
+                implementation, an error will be raised.
+
+            index_column_names: A list of column names to use as user-defined index
+                columns (e.g., ``['x', 'y']``). All named columns must exist in the
+                schema, and at least one index column name is required.
+
+            axis_names: An ordered list of axis column names that
+                correspond to the names of the axes of the coordinate space the
+                points are defined on.
+
+            domain: An optional sequence of tuples specifying the domain of each
+                index column. Each tuple should be a pair consisting of the minimum
+                and maximum values storable in the index column. If omitted entirely,
+                or if ``None`` in a given dimension, the corresponding index-column
+                domain will use the minimum and maximum possible values for the
+                column's datatype.  This makes a point cloud dataframe growable.
+
+        Returns:
+            The newly created point cloud, opened for writing.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+
+class GeometryDataFrame(SpatialDataFrame, metaclass=abc.ABCMeta):
+    """A multi-column table of geometries with spatial indexing and a user-defined
+    schema.
+
+    Lifecycle: experimental
+    """
+
+    __slots__ = ()
+    soma_type: Final = "SOMAGeometryDataFrame"  # type: ignore[misc]
+
+    # Lifecycle
+
+    @classmethod
+    @abc.abstractmethod
+    def create(
+        cls,
+        uri: str,
+        *,
+        schema: pa.Schema,
+        index_column_names: Sequence[str] = (
+            options.SOMA_JOINID,
+            options.SOMA_GEOMETRY,
+        ),
+        axis_names: Sequence[str] = ("x", "y"),
+        domain: Optional[Sequence[Optional[Tuple[Any, Any]]]] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+        context: Optional[Any] = None,
+    ) -> Self:
+        """Creates a new ``GeometryDataFrame`` at the given URI.
+
+        The schema of the created geometry dataframe will include a column named
+        ``soma_joinid`` of type ``pyarrow.int64``, with negative values
+        disallowed, and a column named ``soma_geometry`` of type ``pyarrow.binary``
+        or ``pyarrow.large_binary``.  If a ``soma_joinid`` or ``soma_geometry``
+        column is present in the provided schema, it must be of the correct type.
+        If either the ``soma_joinid`` or ``soma_geometry`` column is not provided,
+        it will be added. The ``soma_joinid`` may be an index column. The
+        ``soma_geometry`` column must be an index column.
+
+        Args:
+            uri: The URI where the dataframe will be created.
+
+            schema: Arrow schema defining the per-column schema. This schema
+                must define all columns, including columns to be named as index
+                columns.  If the schema includes types unsupported by the SOMA
+                implementation, an error will be raised.
+
+            index_column_names: A list of column names to use as user-defined
+                index columns (e.g., ``['cell_type', 'tissue_type']``).
+                All named columns must exist in the schema, and at least one
+                index column name is required.
+
+            axis_names: An ordered list of axis column names that
+                correspond to the names of the axes of the coordinate space the
+                geometries are defined on.
+
+            domain: An optional sequence of tuples specifying the domain of each
+                index column. Two tuples must be provided for the ``soma_geometry``
+                column which store the width followed by the height. Each tuple should
+                be a pair consisting of the minimum and maximum values storable in the
+                index column. If omitted entirely, or if ``None`` in a given dimension,
+                the corresponding index-column domain will use the minimum and maximum
+                possible values for the column's datatype.  This makes a dataframe
+                growable.
+
+        Returns:
+            The newly created geometry dataframe, opened for writing.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+
 #
 # Read types
 #

diff --git a/python-spec/src/somacore/ephemeral/collections.py b/python-spec/src/somacore/ephemeral/collections.py
@@ -134,7 +134,9 @@ class Collection(  # type: ignore[misc]  # __eq__ false positive
 """The loosest possible constraint of the abstract Measurement type."""
 
 _BasicAbstractScene = scene.Scene[
-    data.DataFrame, images.ImageCollection, base.SOMAObject
+    collection.Collection[data.SpatialDataFrame],
+    images.ImageCollection,
+    base.SOMAObject,
 ]
 """The loosest possible constraint of the abstract Scene type."""
 

diff --git a/python-spec/src/somacore/options.py b/python-spec/src/somacore/options.py
@@ -11,13 +11,17 @@
 import numpy as np
 import numpy.typing as npt
 import pyarrow as pa
+import shapely
 from typing_extensions import Final, Literal
 
 from . import types
 
 SOMA_JOINID: Final = "soma_joinid"
 """Global constant for the SOMA join ID."""
 
+SOMA_GEOMETRY: Final = "soma_geometry"
+"""Global constant for the SOMA spatial geometry column name."""
+
 OpenMode = Literal["r", "w"]
 """How to open a SOMA object: read or write."""
 
@@ -179,3 +183,8 @@ class ResultOrder(enum.Enum):
 """A single coordinate range for one dimension of a sparse nd-array."""
 SparseNDCoords = Sequence[SparseNDCoord]
 """A sequence of coordinate ranges for reading sparse ndarrays."""
+
+"""A single coordinate range for one dimension of a sparse dataframe."""
+
+GeometryDFCoords = Sequence[Union[SparseNDCoord, shapely.GeometryType]]
+"""A sequence of coordinate ranges for reading geometry dataframes."""