Skip to content

Commit

Permalink
[python] Ingest 2D uns string arrays (e.g. color labels) (#1763)
Browse files Browse the repository at this point in the history
* [python] Ingest 2D uns string arrays (e.g. color labels)

* unit-test case

* unit-test case

* comments

* code-review feedback
  • Loading branch information
johnkerl authored Oct 9, 2023
1 parent 066fe36 commit 8d6b1c7
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 10 deletions.
67 changes: 64 additions & 3 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2190,16 +2190,42 @@ def _ingest_uns_string_array(
However, ``SOMADataFrame`` _requires_ that soma_joinid be present, either
as an index column, or as a data column. The former is less confusing.
"""
if len(value.shape) != 1:

if len(value.shape) == 1:
helper = _ingest_uns_1d_string_array
elif len(value.shape) == 2:
helper = _ingest_uns_2d_string_array
else:
msg = (
f"Skipped {coll.uri}[{key!r}]"
f" (uns object): string-array is not one-dimensional"
f" (uns object): string array is neither one-dimensional nor two-dimensional"
)
logging.log_io(msg, msg)
return

helper(
coll=coll,
key=key,
value=value,
platform_config=platform_config,
context=context,
use_relative_uri=use_relative_uri,
ingestion_params=ingestion_params,
)


def _ingest_uns_1d_string_array(
coll: AnyTileDBCollection,
key: str,
value: NPNDArray,
platform_config: Optional[PlatformConfig],
context: Optional[SOMATileDBContext],
*,
use_relative_uri: Optional[bool],
ingestion_params: IngestionParams,
) -> None:
"""Helper for ``_ingest_uns_string_array``"""
n = len(value)
df_uri = _util.uri_joinpath(coll.uri, key)
df = pd.DataFrame(
data={
"soma_joinid": np.arange(n, dtype=np.int64),
Expand All @@ -2208,6 +2234,41 @@ def _ingest_uns_string_array(
)
df.set_index("soma_joinid", inplace=True)

df_uri = _util.uri_joinpath(coll.uri, key)
with _write_dataframe_impl(
df,
df_uri,
None,
ingestion_params=ingestion_params,
platform_config=platform_config,
context=context,
) as soma_df:
_maybe_set(coll, key, soma_df, use_relative_uri=use_relative_uri)


def _ingest_uns_2d_string_array(
coll: AnyTileDBCollection,
key: str,
value: NPNDArray,
platform_config: Optional[PlatformConfig],
context: Optional[SOMATileDBContext],
*,
use_relative_uri: Optional[bool],
ingestion_params: IngestionParams,
) -> None:
"""Helper for ``_ingest_uns_string_array``. Even if the 2D array is 1xN or Nx1, we
must nonetheless keep this as 2D rather than flattening to length-N 1D. That's because
this ``uns`` data is solely of interest for AnnData ingest/outgest, and it must go
back out the way it came in."""
num_rows, num_cols = value.shape
data: Dict[str, Any] = {"soma_joinid": np.arange(num_rows, dtype=np.int64)}
for j in range(num_cols):
column_name = f"values_{j}"
data[column_name] = [str(e) if e else "" for e in value[:, j]]
df = pd.DataFrame(data=data)
df.set_index("soma_joinid", inplace=True)

df_uri = _util.uri_joinpath(coll.uri, key)
with _write_dataframe_impl(
df,
df_uri,
Expand Down
Binary file modified apis/python/testdata/pbmc3k.h5ad
Binary file not shown.
22 changes: 15 additions & 7 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ def h5ad_file_with_obsm_holes(request):


@pytest.fixture
def h5ad_file_uns_string_array(request):
# This has uns["louvain_colors"] with dtype.char == "U"
def h5ad_file_uns_string_arrays(request):
# This has uns["louvain_colors"] with dtype.char == "U".
# It also has uns["more_colors"] in the form '[[...]]', as often occurs in the wild.
input_path = HERE.parent / "testdata/pbmc3k.h5ad"
return input_path

Expand Down Expand Up @@ -397,23 +398,30 @@ def test_ingest_uns(tmp_path: pathlib.Path, h5ad_file_extended):
assert np.array_equal(got_pca_variance, original.uns["pca"]["variance"])


def test_ingest_uns_string_array(h5ad_file_uns_string_array):
def test_ingest_uns_string_arrays(h5ad_file_uns_string_arrays):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name

tiledbsoma.io.from_h5ad(
output_path,
h5ad_file_uns_string_array.as_posix(),
h5ad_file_uns_string_arrays.as_posix(),
measurement_name="RNA",
)

with tiledbsoma.Experiment.open(output_path) as exp:
with tiledbsoma.DataFrame.open(
exp.ms["RNA"]["uns"]["louvain_colors"].uri
) as df:
contents = df.read().concat()["values"]
assert len(contents) == 8
assert contents[0].as_py() == "#1f77b4"
contents = df.read().concat()
assert contents.shape == (8, 2)
assert len(contents["values"]) == 8
assert contents["values"][0].as_py() == "#1f77b4"

with tiledbsoma.DataFrame.open(exp.ms["RNA"]["uns"]["more_colors"].uri) as df:
contents = df.read().concat()
assert contents.shape == (8, 2)
assert len(contents["values_0"]) == 8
assert contents["values_0"][0].as_py() == "#1f77b4"


def test_add_matrix_to_collection(adata):
Expand Down

0 comments on commit 8d6b1c7

Please sign in to comment.