Add first pass at geometry dataframe specification

single-cell-data · Aug 16, 2024 · f4d198b · f4d198b
1 parent ecfdf34
commit f4d198b
Show file tree

Hide file tree

Showing 2 changed files with 214 additions and 0 deletions.
diff --git a/python-spec/src/somacore/data.py b/python-spec/src/somacore/data.py
@@ -215,6 +215,211 @@ def domain(self) -> Tuple[Tuple[Any, Any], ...]:
         raise NotImplementedError()
 
 
+class GeometryDataFrame(base.SOMAObject, metaclass=abc.ABCMeta):
+    """A multi-column table with a user-defined schema.
+
+    Lifecycle: experimental
+    """
+
+    __slots__ = ()
+    soma_type: Final = "SOMAGeometryDataFrame"  # type: ignore[misc]
+
+    # Lifecycle
+
+    # TODO: Things to consider
+    # 1. Need to specify types for (x, y) envelope coordinate bounding box
+    # 2. Need way to specify max possibly domain for (x, y)
+    # 3. Need a way to enforce max possible domain or message the it's for
+    # the envelope?
+    @classmethod
+    @abc.abstractmethod
+    def create(
+        cls,
+        uri: str,
+        *,
+        schema: pa.Schema,
+        index_column_names: Sequence[str] = (
+            options.SOMA_JOINID,
+            options.SOMA_GEOMETRY,
+        ),
+        domain: Optional[Sequence[Optional[Tuple[Any, Any]]]] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+        context: Optional[Any] = None,
+    ) -> Self:
+        """Creates a new ``GeometryDataFrame`` at the given URI.
+
+        The schema of the created geoemetry dataframe will include a column named
+        ``soma_joinid`` of type ``pyarrow.int64``, with negative values
+        disallowed, and a column named ``soma_geometry of type ``pyarrow.binary`` or
+        ``pyarrow.large_binary``.  If a ``soma_joinid`` column or ``soma_geometry``
+        are present in the provided schema, they must be of the correct type.  If
+        either the ``soma_joinid`` column or ``soma_geometry`` column are not provided,
+        one will be added. The ``soma_joinid`` may be an index column. The
+        ``soma_geometry`` column must be an index column.
+
+        Args:
+            uri: The URI where the dataframe will be created.
+
+            schema: Arrow schema defining the per-column schema. This schema
+                must define all columns, including columns to be named as index
+                columns.  If the schema includes types unsupported by the SOMA
+                implementation, an error will be raised.
+
+            index_column_names: A list of column names to use as user-defined
+                index columns (e.g., ``['cell_type', 'tissue_type']``).
+                All named columns must exist in the schema, and at least one
+                index column name is required.
+
+            domain: An optional sequence of tuples specifying the domain of each
+                index column. Two tuples must be provided for the ``soma_geometry``
+                column which store the width followed by the height. Each tuple should
+                be a pair consisting of the minimum and maximum values storable in the
+                index column. If omitted entirely, or if ``None`` in a given dimension,
+                the corresponding index-column domain will use the minimum and maximum
+                possible values for the column's datatype.  This makes a dataframe
+                growable.
+
+        Returns:
+            The newly created geometry dataframe, opened for writing.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    # Data operations
+
+    @abc.abstractmethod
+    def read(
+        self,
+        coords: options.SparseDFCoords = (),
+        column_names: Optional[Sequence[str]] = None,
+        *,
+        batch_size: options.BatchSize = options.BatchSize(),
+        partitions: Optional[options.ReadPartitions] = None,
+        result_order: options.ResultOrderStr = _RO_AUTO,
+        value_filter: Optional[str] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> "ReadIter[pa.Table]":
+        """Reads a user-defined slice of data into Arrow tables.
+
+        Args:
+            coords: for each index dimension, which rows to read.
+                Defaults to ``()``, meaning no constraint -- all IDs.
+            column_names: the named columns to read and return.
+                Defaults to ``None``, meaning no constraint -- all column names.
+            partitions: If present, specifies that this is part of
+                a partitioned read, and which part of the data to include.
+            result_order: the order to return results, specified as a
+                :class:`~options.ResultOrder` or its string value.
+            value_filter: an optional value filter to apply to the results.
+                The default of ``None`` represents no filter. Value filter
+                syntax is implementation-defined; see the documentation
+                for the particular SOMA implementation for details.
+        Returns:
+            A :class:`ReadIter` of :class:`pa.Table`s.
+
+
+        **Indexing:**
+
+        Indexing is performed on a per-column basis for each indexed column.
+        To specify dimensions:
+
+        - A sequence of coordinates is accepted, one per indexed dimension.
+        - The sequence length must be less than or equal to the number of
+          indexed dimensions.
+        - If the sequence is shorter than the number of indexed coordinates,
+          then no constraint (i.e. ``None``) is used for the remaining
+          indexed dimensions.
+        - If providing a constraint on the ``soma_geometry``, it must either be
+          ``None`` (to specify no constraint) or a ``shapely.GeometryType``
+          object.
+        - Specifying an empty sequence (e.g. ``()``, the default) represents
+          no constraints over any dimension, returning the entire dataset.
+
+        Each dimension other than the ``soma_geometry`` dimension may be indexed
+        as follows:
+
+        - ``None`` or ``slice(None)`` places no constraint on the dimension.
+        - Coordinates can be specified as a scalar value, a Python sequence
+          (``list``, ``tuple``, etc.), a NumPy ndarray, an Arrow array, or
+          similar objects (as defined by ``SparseDFCoords``).
+        - Slices specify a closed range: ``slice(2, 4)`` includes both 2 and 4.
+          Slice *steps* may not be used: ``slice(10, 20, 2)`` is invalid.
+          ``slice(None)`` places no constraint on the dimension. Half-specified
+          slices like ``slice(None, 99)`` and ``slice(5, None)`` specify
+          all indices up to and including the value, and all indices
+          starting from and including the value.
+        - Negative values in indices and slices are treated as raw domain values
+          and not as indices relative to the end, unlike traditional Python
+          sequence indexing. For instance, ``slice(-10, 3)`` indicates the range
+          from −10 to 3 on the given dimension.
+
+        The ``soma_geometry`` dimension may be indexed as follows:
+
+        - ``None`` places no constraint on the dimension.
+        - A query on all intersecting geomemtries can be specified with a
+          ``shapely.GeometryType`` object.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def write(
+        self,
+        values: Union[pa.RecordBatch, pa.Table],
+        *,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> Self:
+        """Writes the data from an Arrow table to the persistent object.
+
+        As duplicate index values are not allowed, index values already present
+        in the object are overwritten and new index values are added.
+
+        Args:
+            values: An Arrow table containing all columns, including
+                the index columns. The schema for the values must match
+                the schema for the ``DataFrame``.
+
+        Returns: ``self``, to enable method chaining.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    # Metadata operations
+
+    @property
+    @abc.abstractmethod
+    def schema(self) -> pa.Schema:
+        """The schema of the data in this dataframe.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @property
+    @abc.abstractmethod
+    def index_column_names(self) -> Tuple[str, ...]:
+        """The names of the index (dimension) columns.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+    @property
+    @abc.abstractmethod
+    def domain(self) -> Tuple[Tuple[Any, Any], ...]:
+        """The allowable range of values in each index column.
+
+        Returns: a tuple of minimum and maximum values, inclusive,
+            storable on each index column of the dataframe.
+
+        Lifecycle: experimental
+        """
+        raise NotImplementedError()
+
+
 class NDArray(base.SOMAObject, metaclass=abc.ABCMeta):
     """Common behaviors of N-dimensional arrays of a single primitive type."""
 

diff --git a/python-spec/src/somacore/options.py b/python-spec/src/somacore/options.py
@@ -11,13 +11,17 @@
 import numpy as np
 import numpy.typing as npt
 import pyarrow as pa
+import shapely
 from typing_extensions import Final, Literal
 
 from . import types
 
 SOMA_JOINID: Final = "soma_joinid"
 """Global constant for the SOMA join ID."""
 
+SOMA_GEOMETRY: Final = "soma_geometry"
+"""Global constant for SOMA spatial geometry type."""
+
 OpenMode = Literal["r", "w"]
 """How to open a SOMA object: read or write."""
 
@@ -179,3 +183,8 @@ class ResultOrder(enum.Enum):
 """A single coordinate range for one dimension of a sparse nd-array."""
 SparseNDCoords = Sequence[SparseNDCoord]
 """A sequence of coordinate ranges for reading sparse ndarrays."""
+
+"""A single coordinate range for one dimension of a sparse dataframe."""
+
+GeometryDFCoords = Sequence[Union[SparseNDCoord, shapely.GeometryType]]
+"""A sequence of coordinate ranges for reading dense dataframes."""