Skip to content

Commit

Permalink
[python] update_obs/update_var with original or readback source
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 26, 2023
1 parent 503ddd1 commit 5e6cade
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 18 deletions.
6 changes: 5 additions & 1 deletion apis/python/src/tiledbsoma/io/_registration/signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ def _string_dict_from_pandas_dataframe(
df = df.head(1) # since reset_index can be expensive on full data
if df.index.name is None or df.index.name == "index":
df.reset_index(inplace=True)
df.rename(columns={"index": default_index_name}, inplace=True)
if default_index_name in df:
if "index" in df:
df.drop(columns=["index"], inplace=True)
else:
df.rename(columns={"index": default_index_name}, inplace=True)
else:
df.reset_index(inplace=True)

Expand Down
7 changes: 5 additions & 2 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,8 +1131,11 @@ def _write_dataframe(

df.reset_index(inplace=True)
if id_column_name is not None:
df.rename(columns={"index": id_column_name}, inplace=True)
id_column_name = "index"
if id_column_name in df:
if "index" in df:
df.drop(columns=["index"], inplace=True)
else:
df.rename(columns={"index": id_column_name}, inplace=True)

df[SOMA_JOINID] = np.asarray(axis_mapping.data)

Expand Down
156 changes: 156 additions & 0 deletions apis/python/tests/test_registration_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,162 @@ def soma1(tmp_path, h5ad1):
return uri


# Each parametrized case is a dict with these keys:
#   do_set_index                   -- when true, the test calls df.set_index(index_name_to_set)
#   index_name_to_set              -- the column moved into the Pandas index (None when unused)
#   do_rename_axis                 -- when true, the test calls df.rename_axis(axis_name_to_set)
#   axis_name_to_set               -- the new index name; None leaves the index unnamed
#   registration_index_column_name -- second argument passed to the signature helper
#   expected_signature             -- the dict the signature helper is expected to return
# The cases come in two groups of seven: first with "obs_id" as the registration
# index column name, then the mirror-image cases with "alt_id".
@pytest.mark.parametrize(
    "args",
    [
        # SOMA ID column is to be obs_id, and it is the Pandas index named "obs_id"
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # SOMA ID column is to be obs_id, and it is the Pandas index named "index"
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # SOMA ID column is to be obs_id, and it is the Pandas unnamed index
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # SOMA ID column is to be obs_id, and the Pandas index is named something
        # else ("alt_id"); obs_id remains an ordinary column
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # SOMA ID column is to be obs_id, and the Pandas index (holding the alt_id
        # values) is unnamed; only obs_id is expected in the signature
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string"},
        },
        # SOMA ID column is to be obs_id, and the Pandas index (holding the alt_id
        # values) is named "index"; only obs_id is expected in the signature
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string"},
        },
        # SOMA ID column is to be obs_id, and the Pandas index is the implicit
        # integer index (set_index is never called)
        {
            "do_set_index": False,
            "index_name_to_set": None,
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # SOMA ID column is to be alt_id, and it is the Pandas index named "alt_id"
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # SOMA ID column is to be alt_id, and it is the Pandas index named "index"
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # SOMA ID column is to be alt_id, and it is the Pandas unnamed index
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # SOMA ID column is to be alt_id, and the Pandas index is named something
        # else ("obs_id"); alt_id remains an ordinary column
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # SOMA ID column is to be alt_id, and the Pandas index (holding the obs_id
        # values) is unnamed; only alt_id is expected in the signature
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string"},
        },
        # SOMA ID column is to be alt_id, and the Pandas index (holding the obs_id
        # values) is named "index"; only alt_id is expected in the signature
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string"},
        },
        # SOMA ID column is to be alt_id, and the Pandas index is the implicit
        # integer index (set_index is never called)
        {
            "do_set_index": False,
            "index_name_to_set": None,
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
    ],
)
def test_pandas_indexing(args):
    """
    Check the registration signature for every supported index layout.

    The index-column name for registration can take a variety of forms:
    the ID column may be the Pandas index under its own name, under the
    name "index", or unnamed; or it may be an ordinary column while the
    Pandas index is something else. This test exercises all of them.

    For each case, a three-row dataframe with ``soma_joinid``, ``alt_id``,
    and ``obs_id`` columns is built, its index is optionally set and/or
    renamed as the case dictates, and the result of
    ``_string_dict_from_pandas_dataframe`` is compared with the case's
    ``expected_signature``.
    """

    # Start from a frame in which both candidate ID columns are ordinary columns.
    # NOTE(review): no expected signature lists soma_joinid, so the helper
    # presumably omits it -- confirm against the helper's implementation.
    df = pd.DataFrame(
        data={
            "soma_joinid": np.arange(3, dtype=np.int64),
            "alt_id": ["A", "C", "G"],
            "obs_id": ["AT", "CT", "GT"],
        }
    )
    # Optionally move one of the ID columns into the Pandas index ...
    if args["do_set_index"]:
        df.set_index(args["index_name_to_set"], inplace=True)
    # ... and optionally rename that index (to "index", or to None for unnamed).
    if args["do_rename_axis"]:
        df.rename_axis(args["axis_name_to_set"], inplace=True)

    actual_signature = registration.signatures._string_dict_from_pandas_dataframe(
        df,
        args["registration_index_column_name"],
    )
    assert actual_signature == args["expected_signature"]


def test_axis_mappings(anndata1):
mapping = registration.AxisIDMapping.identity(10)
assert mapping.data == tuple(range(10))
Expand Down
61 changes: 46 additions & 15 deletions apis/python/tests/test_update_dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def adata(h5ad_file):
return anndata.read_h5ad(h5ad_file)


def test_no_change(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_no_change(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
Expand All @@ -35,9 +36,16 @@ def test_no_change(adata):
o1 = exp.obs.schema
v1 = exp.ms["RNA"].var.schema

if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

with tiledbsoma.Experiment.open(output_path, "w") as exp:
tiledbsoma.io.update_obs(exp, adata.obs)
tiledbsoma.io.update_var(exp, adata.var, "RNA")
tiledbsoma.io.update_obs(exp, new_obs)
tiledbsoma.io.update_var(exp, new_var, "RNA")

with tiledbsoma.Experiment.open(output_path) as exp:
o2 = exp.obs.schema
Expand All @@ -47,16 +55,21 @@ def test_no_change(adata):
assert v1 == v2


def test_add(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_add(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

with tiledbsoma.Experiment.open(output_path) as exp:
exp.ms["RNA"].var.schema

new_obs = adata.obs
new_var = adata.var
if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

new_obs["is_g1"] = new_obs["groups"] == "g1"
new_obs["seq"] = np.arange(new_obs.shape[0], dtype=np.int32)
Expand All @@ -76,16 +89,21 @@ def test_add(adata):
assert v2.field("vst.mean.sq").type == pa.float64()


def test_drop(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_drop(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

with tiledbsoma.Experiment.open(output_path) as exp:
exp.ms["RNA"].var.schema

new_obs = adata.obs
new_var = adata.var
if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

del new_obs["groups"]
del new_var["vst.mean"]
Expand All @@ -104,7 +122,8 @@ def test_drop(adata):
v2.field("vst.mean")


def test_change(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_change(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
Expand All @@ -113,8 +132,12 @@ def test_change(adata):
o1 = exp.obs.schema
v1 = exp.ms["RNA"].var.schema

new_obs = adata.obs
new_var = adata.var
if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

new_obs["groups"] = np.arange(new_obs.shape[0], dtype=np.int16)
new_var["vst.mean"] = np.arange(new_var.shape[0], dtype=np.int32)
Expand All @@ -133,8 +156,9 @@ def test_change(adata):
assert v1 == v2


@pytest.mark.parametrize("readback", [False, True])
@pytest.mark.parametrize("shift_and_exc", [[0, None], [1, ValueError]])
def test_change_counts(adata, shift_and_exc):
def test_change_counts(adata, readback, shift_and_exc):
shift, exc = shift_and_exc
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
Expand All @@ -144,8 +168,15 @@ def test_change_counts(adata, shift_and_exc):
o1 = exp.obs.schema
v1 = exp.ms["RNA"].var.schema

old_nobs = len(adata.obs)
old_nvar = len(adata.var)
if readback:
old_obs = exp.obs.read().concat().to_pandas()
old_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
old_obs = adata.obs
old_var = adata.var

old_nobs = len(old_obs)
old_nvar = len(old_var)

new_nobs = old_nobs + shift
new_nvar = old_nvar + shift
Expand Down

0 comments on commit 5e6cade

Please sign in to comment.