From c9bb1f78b4a5216ceb79f28c25a6ef7dbefd239a Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 21 Jun 2022 12:38:40 -0400 Subject: [PATCH] Predicates for is-SOMA and is-SOMA-collection (#176) --- apis/python/src/tiledbsc/tiledb_array.py | 2 +- apis/python/src/tiledbsc/tiledb_group.py | 2 +- apis/python/src/tiledbsc/util.py | 76 +++++++++++++++++++++- apis/python/src/tiledbsc/util_tiledb.py | 5 -- apis/python/tests/test_basic_anndata_io.py | 25 +++---- apis/python/tests/test_soco_ops.py | 12 +++- 6 files changed, 96 insertions(+), 26 deletions(-) diff --git a/apis/python/src/tiledbsc/tiledb_array.py b/apis/python/src/tiledbsc/tiledb_array.py index ce616cfac7..e051622612 100644 --- a/apis/python/src/tiledbsc/tiledb_array.py +++ b/apis/python/src/tiledbsc/tiledb_array.py @@ -113,7 +113,7 @@ def _set_soma_object_type_metadata(self) -> None: """ with self._open("w") as A: A.meta[ - tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY + tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY ] = self.__class__.__name__ def show_metadata(self, recursively=True, indent=""): diff --git a/apis/python/src/tiledbsc/tiledb_group.py b/apis/python/src/tiledbsc/tiledb_group.py index c09a35c016..bd3c48674e 100644 --- a/apis/python/src/tiledbsc/tiledb_group.py +++ b/apis/python/src/tiledbsc/tiledb_group.py @@ -79,7 +79,7 @@ def _set_soma_object_type_metadata(self): """ with self._open("w") as G: G.meta[ - tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY + tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY ] = self.__class__.__name__ def _set_soma_object_type_metadata_recursively(self): diff --git a/apis/python/src/tiledbsc/util.py b/apis/python/src/tiledbsc/util.py index 4b8ee0ae74..e3ec33be95 100644 --- a/apis/python/src/tiledbsc/util.py +++ b/apis/python/src/tiledbsc/util.py @@ -9,10 +9,84 @@ import time from typing import Optional, List, Union +# This is for group/array metadata we write, to help nested-structured traversals (especially those +# that start at the SOMACollection level) 
confidently navigate with a minimum of introspection on +# group contents. +SOMA_OBJECT_TYPE_METADATA_KEY = "soma_object_type" + # ---------------------------------------------------------------- -def is_local_path(path: str) -> bool: +def is_soma(uri: str, ctx: Optional[tiledb.Ctx] = None) -> bool: + """ + Tells whether the URI points to a SOMA or not. """ + # A SOMA is a TileDB group, but TileDB uses groups for many things including SOMACollections and + # SOMA elements such as obs and var. If this is not a group, we say it is not a SOMA; if it is, + # we ask more questions. + if tiledb.object_type(uri, ctx=ctx) != "group": + return False + + # We can check object-type metadata, but it's possible the URI was created using an + # early-access/beta-level version of our code that wasn't yet writing object-type metadata. If + # we find object-type metadata saying this TileDB group is a SOMA, we say so; if not, we ask + # more questions. + with tiledb.Group(uri, mode="r", ctx=ctx) as G: + if SOMA_OBJECT_TYPE_METADATA_KEY in G.meta: + # Really `tiledbsc.SOMA.__name__`, but prevent a circular package import, so `"SOMA"` + return G.meta[SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMA" + + # At this point this path could be a SOMACollection, SOMA, or maybe SOMA element + # (or some manually created TileDB group). + if "obs" in G and "var" in G: + return True + + # There is a chance that: + # o this is a TileDB group; + # o it's intended to hold a SOMA but it hasn't been populated yet; + # o it was created before object-type metadata was put in place; + # o it's remained unpopulated this whole time. + # Here we say this is not a SOMA, and accept the small risk of false negative. + return False + +# ---------------------------------------------------------------- +def is_soma_collection(uri: str, ctx: Optional[tiledb.Ctx] = None) -> bool: + """ + Tells whether the URI points to a SOMACollection or not. 
+ """ + # A SOMACollection is a TileDB group, but TileDB uses groups for many things including SOMAs and + # SOMA elements such as obs and var. If this is not a group, we say it is not a SOMACollection; + # if it is, we ask more questions. + if tiledb.object_type(uri, ctx=ctx) != "group": + return False + + # We can check object-type metadata, but it's possible the URI was created using an + # early-access/beta-level version of our code that wasn't yet writing object-type metadata. If + # we find object-type metadata saying this TileDB group is a SOMACollection, we say so; if not, we ask + # more questions. + with tiledb.Group(uri, mode="r", ctx=ctx) as G: + if SOMA_OBJECT_TYPE_METADATA_KEY in G.meta: + # Really `tiledbsc.SOMACollection.__name__`, but prevent a circular package import, so `"SOMACollection"` + return G.meta[SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMACollection" + + # At this point this path could be a SOMACollection, SOMA, or maybe SOMA element + # (or some manually created TileDB group). + if "obs" in G and "var" in G: + # Very slight chance of false negative if someone created and populated a SOMACollection + # using beta-level code, and added a SOMA named 'obs' and another SOMA named 'var'. + return False + + # There is a chance that: + # o this is a TileDB group; + # o it's intended to hold a SOMACollection but it hasn't been populated yet; + # o it was created before object-type metadata was put in place; + # o it's remained unpopulated this whole time. + # Here we say this is a SOMACollection, and accept the small risk of false positive. + return True + + +# ---------------------------------------------------------------- +def is_local_path(path: str) -> bool: + """ Returns information about start time of an event. Nominally float seconds since the epoch, but articulated here as being compatible with the format_elapsed function. 
""" diff --git a/apis/python/src/tiledbsc/util_tiledb.py b/apis/python/src/tiledbsc/util_tiledb.py index 95342ab2f1..760ba4136d 100644 --- a/apis/python/src/tiledbsc/util_tiledb.py +++ b/apis/python/src/tiledbsc/util_tiledb.py @@ -4,11 +4,6 @@ import tiledb from typing import Optional -# This is for group/array metadata we write, to help nested-structured traversals (especially those -# that start at the SOMACollection level) confidently navigate with a minimum of introspection on -# group contents. -SOMA_OBJECT_TYPE_METADATA_KEY = "soma_object_type" - # ================================================================ def show_single_cell_group(soma_uri: str, ctx: Optional[tiledb.Ctx] = None): """ diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 80ca61f2f2..7a09b03b4b 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -52,7 +52,7 @@ def test_import_anndata(adata): # raw/varm/PCs with tiledb.Group(output_path) as G: - assert G.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMA" + assert G.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMA" # Check X/data (dense) with tiledb.open(os.path.join(output_path, "X", "data")) as A: @@ -60,9 +60,7 @@ def test_import_anndata(adata): keys = list(df.keys()) assert keys == ["value", "obs_id", "var_id"] assert A.ndim == 2 - assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] == "AssayMatrix" - ) + assert A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "AssayMatrix" # Convenience accessors assert soma.X["data"].shape() == soma.X.data.shape() @@ -72,17 +70,14 @@ def test_import_anndata(adata): assert df.columns.to_list() == ["obs_id", "var_id", "value"] # verify sparsity of raw data assert df.shape[0] == orig.raw.X.nnz - assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] == "AssayMatrix" - ) + assert A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] 
== "AssayMatrix" # Check obs with tiledb.open(os.path.join(output_path, "obs")) as A: df = A.df[:] assert df.columns.to_list() == orig.obs_keys() assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] - == "AnnotationDataFrame" + A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "AnnotationDataFrame" ) assert sorted(soma.obs.ids()) == sorted(list(orig.obs_names)) # Convenience accessors @@ -95,8 +90,7 @@ def test_import_anndata(adata): df = A.df[:] assert df.columns.to_list() == orig.var_keys() assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] - == "AnnotationDataFrame" + A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "AnnotationDataFrame" ) assert sorted(soma.var.ids()) == sorted(list(orig.var_names)) # Convenience accessors @@ -113,7 +107,7 @@ def test_import_anndata(adata): assert df.shape[0] == orig.obsm[key].shape[0] assert soma.obsm[key].shape() == orig.obsm[key].shape assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] + A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "AnnotationMatrix" ) # Convenience accessors: soma.obsm.X_pca <-> soma.obsm['X_pca'] @@ -127,7 +121,7 @@ def test_import_anndata(adata): assert df.shape[0] == orig.varm[key].shape[0] assert soma.varm[key].shape() == orig.varm[key].shape assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] + A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "AnnotationMatrix" ) # Convenience accessors: @@ -147,10 +141,7 @@ def test_import_anndata(adata): shape = soma.obsp[key].df().shape assert shape[0] == orig.obsp[key].nnz assert shape[1] == 1 - assert ( - A.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] - == "AssayMatrix" - ) + assert A.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "AssayMatrix" # Convenience accessors: for key in soma.obsp.keys(): assert getattr(soma.obsp, key).shape() == soma.obsp[key].shape() diff --git a/apis/python/tests/test_soco_ops.py b/apis/python/tests/test_soco_ops.py 
index 345ede46b5..92845af2bd 100644 --- a/apis/python/tests/test_soco_ops.py +++ b/apis/python/tests/test_soco_ops.py @@ -2,6 +2,7 @@ import tiledb import tiledbsc import tiledbsc.io +import tiledbsc.util import pytest import tempfile @@ -29,7 +30,7 @@ def test_import_anndata(tmp_path): soco = tiledbsc.SOMACollection(soco_dir) with tiledb.Group(soma1_dir) as G: - assert G.meta[tiledbsc.util_tiledb.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMA" + assert G.meta[tiledbsc.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMA" soco.create_unless_exists() assert len(soco._get_member_names()) == 0 @@ -43,3 +44,12 @@ def test_import_anndata(tmp_path): assert len(soco._get_member_names()) == 1 soco.remove(soma2) assert len(soco._get_member_names()) == 0 + + assert tiledbsc.util.is_soma(soma1.uri) + assert tiledbsc.util.is_soma(soma2.uri) + assert not tiledbsc.util.is_soma(soma1.obs.uri) + assert not tiledbsc.util.is_soma(soma2.var.uri) + + assert not tiledbsc.util.is_soma_collection(soma2.var.uri) + assert not tiledbsc.util.is_soma_collection(soma2.uri) + assert tiledbsc.util.is_soma_collection(soco.uri)