Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] Update TileDB-SOMA to use the somacore release with spatial datatypes #3078

Merged
merged 11 commits into from
Sep 27, 2024
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ repos:
# Pandas 2.x types (e.g. `pd.Series[Any]`). See `_types.py` or https://github.com/single-cell-data/TileDB-SOMA/issues/2839
# for more info.
- "pandas-stubs>=2"
- "somacore==1.0.17"
- "somacore==1.0.18"
- types-setuptools
args: ["--config-file=apis/python/pyproject.toml", "apis/python/src", "apis/python/devtools"]
pass_filenames: false
2 changes: 1 addition & 1 deletion apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def run(self):
"scanpy>=1.9.2",
"scipy",
# Note: the somacore version is in .pre-commit-config.yaml too
"somacore==1.0.17",
"somacore==1.0.18",
"tiledb~=0.32.0",
"typing-extensions", # Note "-" even though `import typing_extensions`
],
Expand Down
6 changes: 6 additions & 0 deletions apis/python/src/tiledbsoma/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
"""

# Name of the join-id column.
SOMA_JOINID = "soma_joinid"
# Name of the geometry column used by spatial dataframes.
SOMA_GEOMETRY = "soma_geometry"
# Metadata key under which the SOMA object type is recorded.
SOMA_OBJECT_TYPE_METADATA_KEY = "soma_object_type"
# Metadata key under which the encoding version is recorded.
SOMA_ENCODING_VERSION_METADATA_KEY = "soma_encoding_version"
# Encoding version value.
SOMA_ENCODING_VERSION = "1"

# Warning text shown to users of the experimental spatial types.
SPATIAL_DISCLAIMER = (
    "Support for spatial types is experimental. "
    "Changes to both the API and data storage "
    "may not be backwards compatible."
)
6 changes: 6 additions & 0 deletions apis/python/src/tiledbsoma/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ._dataframe import DataFrame
from ._indexer import IntIndexer
from ._measurement import Measurement
from ._scene import Scene
from ._soma_object import AnySOMAObject


Expand All @@ -24,6 +25,7 @@ class Experiment( # type: ignore[misc] # __eq__ false positive
experiment.Experiment[ # type: ignore[type-var]
DataFrame,
Collection[Measurement],
Collection[Scene],
AnySOMAObject,
],
):
Expand All @@ -43,6 +45,8 @@ class Experiment( # type: ignore[misc] # __eq__ false positive
defined in this dataframe.
ms (Collection):
A collection of named measurements.
spatial (Collection):
A collection of spatial scenes.

Example:
>>> import tiledbsoma
Expand All @@ -69,6 +73,8 @@ class Experiment( # type: ignore[misc] # __eq__ false positive
_subclass_constrained_soma_types = {
"obs": ("SOMADataFrame",),
"ms": ("SOMACollection",),
"spatial": ("SOMACollection",),
"obs_spatial_presence": ("SOMADataFrame",),
}

def axis_query( # type: ignore
Expand Down
260 changes: 260 additions & 0 deletions apis/python/src/tiledbsoma/_geometry_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
# Copyright (c) 2024 The Chan Zuckerberg Initiative Foundation
# Copyright (c) 2024 TileDB, Inc.
#
# Licensed under the MIT License.
"""
Implementation of a SOMA Geometry DataFrame
"""

import warnings
from typing import Any, Optional, Sequence, Tuple, Union

import pyarrow as pa
import somacore
from somacore import CoordinateSpace, CoordinateTransform, options
from typing_extensions import Self

from ._constants import SOMA_GEOMETRY, SOMA_JOINID, SPATIAL_DISCLAIMER
from ._dataframe import Domain
from ._read_iters import TableReadIter
from ._types import OpenTimestamp
from .options import SOMATileDBContext

# Shared default for the ``batch_size`` parameter of the read methods.
# NOTE(review): presumably a no-argument ``BatchSize()`` means "do not batch"
# (the name and the read docstrings suggest so) -- confirm against somacore.
_UNBATCHED = options.BatchSize()


class GeometryDataFrame(somacore.GeometryDataFrame):
    """A specialized SOMA object for storing complex geometries with spatial indexing.

    The ``GeometryDataFrame`` class is designed to store and manage geometric shapes
    such as polygons, lines, and multipoints, along with additional columns for
    associated attributes.

    Note:
        This class is currently an interface stub: :meth:`create` emits the
        experimental-feature warning and then raises
        :class:`NotImplementedError`, and every other method and property
        raises :class:`NotImplementedError` as well.

    Lifecycle:
        Experimental.
    """

    __slots__ = ()

    # Lifecycle

    @classmethod
    def create(
        cls,
        uri: str,
        *,
        schema: pa.Schema,
        index_column_names: Sequence[str] = (SOMA_JOINID, SOMA_GEOMETRY),
        axis_names: Sequence[str] = ("x", "y"),
        domain: Optional[Domain] = None,
        platform_config: Optional[options.PlatformConfig] = None,
        context: Optional[SOMATileDBContext] = None,
        tiledb_timestamp: Optional[OpenTimestamp] = None,
    ) -> Self:
        """Creates a new ``GeometryDataFrame`` at the given URI.

        The schema of the created geometry dataframe will include a column named
        ``soma_joinid`` of type ``pyarrow.int64``, with negative values
        disallowed, and a column named ``soma_geometry`` of type
        ``pyarrow.binary`` or ``pyarrow.large_binary``. If a ``soma_joinid``
        column or ``soma_geometry`` column is present in the provided schema, it
        must be of the correct type. If either the ``soma_joinid`` column or the
        ``soma_geometry`` column is not provided, one will be added. The
        ``soma_joinid`` may be an index column. The ``soma_geometry`` column
        must be an index column.

        Args:
            uri: The URI where the dataframe will be created.
            schema: Arrow schema defining the per-column schema. This schema
                must define all columns, including columns to be named as index
                columns. If the schema includes types unsupported by the SOMA
                implementation, a ValueError will be raised.
            index_column_names: A list of column names to use as user-defined
                index columns (e.g., ``['cell_type', 'tissue_type']``).
                All named columns must exist in the schema, and at least one
                index column name is required. Defaults to the
                ``soma_joinid`` and ``soma_geometry`` columns.
            axis_names: An ordered list of axis column names that correspond to the
                names of the axes of the coordinate space the geometries are defined
                on. Defaults to ``("x", "y")``.
            domain: An optional sequence of tuples specifying the domain of each
                index column. Two tuples must be provided for the ``soma_geometry``
                column which store the width followed by the height. Each tuple should
                be a pair consisting of the minimum and maximum values storable in the
                index column. If omitted entirely, or if ``None`` in a given dimension,
                the corresponding index-column domain will use the minimum and maximum
                possible values for the column's datatype. This makes a dataframe
                growable.
            platform_config: Optional platform-specific configuration.
                Not used yet.
            context: Optional ``SOMATileDBContext``. Not used yet.
            tiledb_timestamp: Optional ``OpenTimestamp``. Not used yet.

        Returns:
            The newly created geometry dataframe, opened for writing.

        Raises:
            NotImplementedError: Always; creation is not implemented yet.

        Lifecycle:
            Experimental.
        """
        # stacklevel=2 attributes the warning to the caller's line rather than
        # to this module, so users can see which of their calls triggered it.
        warnings.warn(SPATIAL_DISCLAIMER, stacklevel=2)
        raise NotImplementedError()

    # Data operations

    def read(
        self,
        coords: options.SparseDFCoords = (),
        column_names: Optional[Sequence[str]] = None,
        *,
        batch_size: options.BatchSize = _UNBATCHED,
        partitions: Optional[options.ReadPartitions] = None,
        result_order: options.ResultOrderStr = options.ResultOrder.AUTO,
        value_filter: Optional[str] = None,
        platform_config: Optional[options.PlatformConfig] = None,
    ) -> TableReadIter:
        """Reads a user-defined slice of data into Arrow tables.

        Args:
            coords: for each index dimension, which rows to read.
                Defaults to ``()``, meaning no constraint -- all IDs.
            column_names: the named columns to read and return.
                Defaults to ``None``, meaning no constraint -- all column names.
            batch_size: The size of batched reads.
                Defaults to `unbatched`.
            partitions: If present, specifies that this is part of
                a partitioned read, and which part of the data to include.
            result_order: the order to return results, specified as a
                :class:`~options.ResultOrder` or its string value.
            value_filter: an optional value filter to apply to the results.
                The default of ``None`` represents no filter. Value filter
                syntax is implementation-defined; see the documentation
                for the particular SOMA implementation for details.
            platform_config: Optional platform-specific configuration.
                Not used yet.

        Returns:
            A :class:`ReadIter` of :class:`pa.Table`s.

        Raises:
            NotImplementedError: Always; reading is not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    def read_spatial_region(
        self,
        region: Optional[options.SpatialRegion] = None,
        column_names: Optional[Sequence[str]] = None,
        *,
        region_transform: Optional[CoordinateTransform] = None,
        region_coord_space: Optional[CoordinateSpace] = None,
        batch_size: options.BatchSize = _UNBATCHED,
        partitions: Optional[options.ReadPartitions] = None,
        result_order: options.ResultOrderStr = options.ResultOrder.AUTO,
        value_filter: Optional[str] = None,
        platform_config: Optional[options.PlatformConfig] = None,
    ) -> somacore.SpatialRead[somacore.ReadIter[pa.Table]]:
        """Reads data intersecting a user-defined region of space into a
        :class:`SpatialRead` with data in Arrow tables.

        Args:
            region: The region to query. May be a box in the form
                [x_min, y_min, x_max, y_max] (for 2D images), a box in the form
                [x_min, y_min, z_min, x_max, y_max, z_max] (for 3D images), or
                a shapely Geometry.
            column_names: The named columns to read and return.
                Defaults to ``None``, meaning no constraint -- all column names.
            region_transform: An optional coordinate transform from the read
                region to the coordinate system of the spatial dataframe.
                Defaults to ``None``, meaning an identity transform.
            region_coord_space: An optional coordinate space for the region
                being read. Defaults to ``None``, coordinate space will be
                inferred from transform.
            batch_size: The size of batched reads.
                Defaults to `unbatched`.
            partitions: If present, specifies that this is part of a partitioned
                read, and which part of the data to include.
            result_order: the order to return results, specified as a
                :class:`~options.ResultOrder` or its string value.
            value_filter: an optional value filter to apply to the results.
                The default of ``None`` represents no filter. Value filter
                syntax is implementation-defined; see the documentation
                for the particular SOMA implementation for details.
            platform_config: Optional platform-specific configuration.
                Not used yet.

        Returns:
            A :class:`SpatialRead` with :class:`ReadIter` of :class:`pa.Table`s data.

        Raises:
            NotImplementedError: Always; spatial reads are not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    def write(
        self,
        values: Union[pa.RecordBatch, pa.Table],
        *,
        platform_config: Optional[options.PlatformConfig] = None,
    ) -> Self:
        """Writes the data from an Arrow table to the persistent object.

        As duplicate index values are not allowed, index values already present
        in the object are overwritten and new index values are added.

        Args:
            values: An Arrow table containing all columns, including
                the index columns. The schema for the values must match
                the schema for the ``GeometryDataFrame``.
            platform_config: Optional platform-specific configuration.
                Not used yet.

        Returns: ``self``, to enable method chaining.

        Raises:
            NotImplementedError: Always; writing is not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    # Metadata operations

    @property
    def schema(self) -> pa.Schema:
        """The schema of the data in this dataframe.

        Raises:
            NotImplementedError: Always; not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    @property
    def index_column_names(self) -> Tuple[str, ...]:
        """The names of the index (dimension) columns.

        Raises:
            NotImplementedError: Always; not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    @property
    def axis_names(self) -> Tuple[str, ...]:
        """The names of the axes of the coordinate space the data is defined on.

        Raises:
            NotImplementedError: Always; not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    @property
    def coordinate_space(self) -> Optional[CoordinateSpace]:
        """Coordinate space for this geometry dataframe.

        Raises:
            NotImplementedError: Always; not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    @coordinate_space.setter
    def coordinate_space(self, value: CoordinateSpace) -> None:
        """Sets the coordinate space for this geometry dataframe.

        Raises:
            NotImplementedError: Always; not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()

    @property
    def domain(self) -> Tuple[Tuple[Any, Any], ...]:
        """The allowable range of values in each index column.

        Returns: a tuple of minimum and maximum values, inclusive,
            storable on each index column of the dataframe.

        Raises:
            NotImplementedError: Always; not implemented yet.

        Lifecycle:
            Experimental.
        """
        raise NotImplementedError()
1 change: 1 addition & 0 deletions apis/python/src/tiledbsoma/_measurement.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,5 @@ class Measurement( # type: ignore[misc] # __eq__ false positive
"obsp": ("SOMACollection",),
"varm": ("SOMACollection",),
"varp": ("SOMACollection",),
"var_spatial_presence": ("SOMADataFrame",),
}
Loading
Loading