Skip to content

Commit

Permalink
unit-test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Oct 16, 2023
1 parent 4c1d18d commit 3065656
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 7 deletions.
94 changes: 87 additions & 7 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,16 @@
_NDArr = TypeVar("_NDArr", bound=NDArray)
_TDBO = TypeVar("_TDBO", bound=TileDBObject[RawHandle])

# Arrays of strings from AnnData's uns are stored in SOMA as SOMADataFrame,
# since SOMA ND arrays are necessarily arrays *of numbers*. This is okay since
# the one and only job of SOMA uns is to faithfully ingest from AnnData and
# outgest back. These are parameters common to ingest and outgest of these.
#
# Metadata key marking a SOMADataFrame as the round-trip encoding of a string array.
_UNS_OUTGEST_HINT_KEY = "soma_uns_outgest_hint"
# Hint values: the original uns value was a 1-D or 2-D ndarray of strings.
_UNS_OUTGEST_HINT_1D = "array_1d"
_UNS_OUTGEST_HINT_2D = "array_2d"
# Column name used when a 1-D string array is encoded as a one-column dataframe.
_UNS_OUTGEST_COLUMN_NAME_1D = "values"
# Per-column name prefix ("values_0", "values_1", ...) for 2-D string arrays.
_UNS_OUTGEST_COLUMN_PREFIX_2D = "values_"


# ----------------------------------------------------------------
class IngestionParams:
Expand Down Expand Up @@ -2120,15 +2130,17 @@ def _ingest_uns_node(
return

if isinstance(value, pd.DataFrame):
num_cols = value.shape[1]
####num_cols = value.shape[1]
num_rows = value.shape[0]
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
axis_mapping=AxisIDMapping.identity(num_cols),
####axis_mapping=AxisIDMapping.identity(num_cols),
axis_mapping=AxisIDMapping.identity(num_rows),
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
return
Expand Down Expand Up @@ -2226,10 +2238,15 @@ def _ingest_uns_1d_string_array(
) -> None:
"""Helper for ``_ingest_uns_string_array``"""
n = len(value)
# An array like ["a", "b", "c"] becomes a DataFrame like
# soma_joinid value
# 0 a
# 1 b
# 2 c
df = pd.DataFrame(
data={
"soma_joinid": np.arange(n, dtype=np.int64),
"values": [str(e) if e else "" for e in value],
SOMA_JOINID: np.arange(n, dtype=np.int64),
_UNS_OUTGEST_COLUMN_NAME_1D: [str(e) if e else "" for e in value],
}
)
df.set_index("soma_joinid", inplace=True)
Expand All @@ -2244,6 +2261,8 @@ def _ingest_uns_1d_string_array(
context=context,
) as soma_df:
_maybe_set(coll, key, soma_df, use_relative_uri=use_relative_uri)
# TODO: comment
soma_df.metadata[_UNS_OUTGEST_HINT_KEY] = _UNS_OUTGEST_HINT_1D


def _ingest_uns_2d_string_array(
Expand All @@ -2262,6 +2281,10 @@ def _ingest_uns_2d_string_array(
back out the way it came in."""
num_rows, num_cols = value.shape
data: Dict[str, Any] = {"soma_joinid": np.arange(num_rows, dtype=np.int64)}
# An array like [["a", "b", "c"], ["d", "e", "f"]] becomes a DataFrame like
# soma_joinid values_0 values_1 values_2
# 0 a b c
# 1 d e f
for j in range(num_cols):
column_name = f"values_{j}"
data[column_name] = [str(e) if e else "" for e in value[:, j]]
Expand All @@ -2278,6 +2301,8 @@ def _ingest_uns_2d_string_array(
context=context,
) as soma_df:
_maybe_set(coll, key, soma_df, use_relative_uri=use_relative_uri)
# TODO: comment
soma_df.metadata[_UNS_OUTGEST_HINT_KEY] = _UNS_OUTGEST_HINT_2D


def _ingest_uns_ndarray(
Expand Down Expand Up @@ -2588,17 +2613,72 @@ def _extract_uns(
This is a helper function for ``to_anndata`` of ``uns`` elements.
"""

extracted = {}
extracted: Dict[str, Any] = {}
for key, element in collection.items():
if isinstance(element, Collection):
extracted[key] = _extract_uns(element)
elif isinstance(element, DataFrame):
extracted[key] = element.read().concat().to_pandas()
# TODO: back to 1D/2D if this was from-string above
hint = element.metadata.get(_UNS_OUTGEST_HINT_KEY)
pdf = element.read().concat().to_pandas()
if hint is None:
extracted[key] = pdf
elif hint == _UNS_OUTGEST_HINT_1D:
extracted[key] = _outgest_uns_1d_string_array(pdf, element.uri)
elif hint == _UNS_OUTGEST_HINT_2D:
extracted[key] = _outgest_uns_2d_string_array(pdf, element.uri)
else:
msg = (
f"Warning: uns {collection.uri}[{key!r}] has "
+ "{_UNS_OUTGEST_HINT_KEY} as unrecognized {hint}: leaving this as Pandas DataFrame"
)
logging.log_io_same(msg)
extracted[key] = pdf
elif isinstance(element, SparseNDArray):
extracted[key] = element.read().tables().concat().to_pandas()
elif isinstance(element, DenseNDArray):
extracted[key] = element.read().to_numpy()
else:
print("SKIPPING", element.soma_type)

# Primitives got set on the SOMA-experiment uns metadata.
for key, value in collection.metadata.items():
if not key.startswith("soma_"):
extracted[key] = value

return extracted


def _outgest_uns_1d_string_array(pdf: pd.DataFrame, uri_for_logging: str) -> NPNDArray:
    """Helper for ``_extract_uns``: converts a SOMADataFrame, written by
    ``_ingest_uns_1d_string_array``, back to a 1-D NumPy array of strings.

    An array like ``["a", "b", "c"]`` had become a DataFrame like::

        soma_joinid value
        0           a
        1           b
        2           c

    Raises ``SOMAError`` if the dataframe does not have exactly the two
    expected columns (``soma_joinid`` plus the single values column).
    """
    num_cols = pdf.shape[1]
    if num_cols != 2:
        raise SOMAError(f"Expected 2 columns in {uri_for_logging}; got {num_cols}")
    for column_name in [SOMA_JOINID, _UNS_OUTGEST_COLUMN_NAME_1D]:
        if column_name not in pdf:
            raise SOMAError(f"Expected {column_name} column in {uri_for_logging}")
    return np.asarray(list(pdf[_UNS_OUTGEST_COLUMN_NAME_1D]))


def _outgest_uns_2d_string_array(pdf: pd.DataFrame, uri_for_logging: str) -> NPNDArray:
    """Helper for ``_extract_uns``: converts a SOMADataFrame, written by
    ``_ingest_uns_2d_string_array``, back to a 2-D NumPy array of strings.

    An array like ``[["a", "b", "c"], ["d", "e", "f"]]`` had become a
    DataFrame like::

        soma_joinid values_0 values_1 values_2
        0           a        b        c
        1           d        e        f

    Raises ``SOMAError`` if the dataframe lacks the ``soma_joinid`` column or
    any of the expected ``values_<j>`` columns.
    """
    num_cols = pdf.shape[1]
    # At minimum soma_joinid plus one values_<j> column.
    if num_cols < 2:
        raise SOMAError(f"Expected at least 2 columns in {uri_for_logging}; got {num_cols}")
    if SOMA_JOINID not in pdf:
        raise SOMAError(f"Expected {SOMA_JOINID} column in {uri_for_logging}")
    num_cols -= 1  # don't count the soma_joinid column
    columns = []
    for j in range(num_cols):
        column_name = _UNS_OUTGEST_COLUMN_PREFIX_2D + str(j)
        if column_name not in pdf:
            raise SOMAError(f"Expected {column_name} column in {uri_for_logging}")
        columns.append(list(pdf[column_name]))
    # Columns were gathered column-major; transpose back to row-major.
    return np.asarray(columns).transpose()
71 changes: 71 additions & 0 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,3 +831,74 @@ def test_id_names(tmp_path, obs_id_name, var_id_name, indexify_obs, indexify_var
)
assert list(bdata.obs.index) == list(soma_obs[obs_id_name])
assert list(bdata.var.index) == list(soma_var[var_id_name])


def test_uns_io(tmp_path):
    """Round-trip ``uns`` through SOMA — ingest from AnnData, outgest back —
    and verify every supported value kind survives unchanged."""
    # Minimal experiment scaffolding: 3 obs x 2 var with an all-zero X.
    obs = pd.DataFrame(
        data={"obs_id": np.asarray(["a", "b", "c"])},
        index=np.arange(3).astype(str),
    )
    var = pd.DataFrame(
        data={"var_id": np.asarray(["x", "y"])},
        index=np.arange(2).astype(str),
    )
    X = np.zeros([3, 2])

    uns = {
        # Scalars are stored in SOMA as metadata
        "int_scalar": 7,
        "float_scalar": 8.5,
        "string_scalar": "hello",
        # DataFrames are stored in SOMA as SOMADataFrame
        "pd_df_indexed": pd.DataFrame(
            data={"column_1": np.asarray(["d", "e", "f"])},
            index=np.arange(3).astype(str),
        ),
        "pd_df_nonindexed": pd.DataFrame(
            data={"column_1": np.asarray(["g", "h", "i"])},
        ),
        # Numeric ndarrays are stored in SOMA as SOMA ND arrays
        "np_ndarray_1d": np.asarray([1, 2, 3]),
        "np_ndarray_2d": np.asarray([[1, 2, 3], [4, 5, 6]]),
        # Nested dicts are stored in SOMA as a SOMACollection
        "strings": {
            # String ndarrays are stored in SOMA as SOMADataFrame, since SOMA
            # ND arrays hold numbers only. This is okay since the one and only
            # job of SOMA uns is to faithfully ingest from AnnData and outgest
            # back.
            "string_np_ndarray_1d": np.asarray(["j", "k", "l"]),
            "string_np_ndarray_2d": np.asarray([["m", "n", "o"], ["p", "q", "r"]]),
        },
    }
    adata = anndata.AnnData(obs=obs, var=var, X=X, uns=uns, dtype=X.dtype)

    soma_uri = tmp_path.as_posix()
    tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name="RNA")

    with tiledbsoma.Experiment.open(soma_uri) as exp:
        bdata = tiledbsoma.io.to_anndata(exp, measurement_name="RNA")

    # Keystroke-savers
    uns_in = adata.uns
    uns_out = bdata.uns

    # Scalars round-trip via SOMA metadata.
    for scalar_key in ["int_scalar", "float_scalar", "string_scalar"]:
        assert uns_in[scalar_key] == uns_out[scalar_key]

    # DataFrames round-trip via SOMADataFrame.
    for df_key in ["pd_df_indexed", "pd_df_nonindexed"]:
        assert all(uns_in[df_key]["column_1"] == uns_out[df_key]["column_1"])

    # Numeric ndarrays round-trip via SOMA ND arrays.
    for nd_key in ["np_ndarray_1d", "np_ndarray_2d"]:
        assert (uns_in[nd_key] == uns_out[nd_key]).all()

    # String ndarrays round-trip via hint-annotated SOMADataFrames.
    strings_in = uns_in["strings"]
    strings_out = uns_out["strings"]
    for str_key in ["string_np_ndarray_1d", "string_np_ndarray_2d"]:
        assert (strings_in[str_key] == strings_out[str_key]).all()

0 comments on commit 3065656

Please sign in to comment.