Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.5] [python] update_obs/update_var with original or readback source #1725

Merged
merged 1 commit into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion apis/python/src/tiledbsoma/io/_registration/signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ def _string_dict_from_pandas_dataframe(
df = df.head(1) # since reset_index can be expensive on full data
if df.index.name is None or df.index.name == "index":
df.reset_index(inplace=True)
df.rename(columns={"index": default_index_name}, inplace=True)
if default_index_name in df:
if "index" in df:
df.drop(columns=["index"], inplace=True)
else:
df.rename(columns={"index": default_index_name}, inplace=True)
else:
df.reset_index(inplace=True)

Expand Down
7 changes: 5 additions & 2 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,8 +1131,11 @@ def _write_dataframe(

df.reset_index(inplace=True)
if id_column_name is not None:
df.rename(columns={"index": id_column_name}, inplace=True)
id_column_name = "index"
if id_column_name in df:
if "index" in df:
df.drop(columns=["index"], inplace=True)
else:
df.rename(columns={"index": id_column_name}, inplace=True)

df[SOMA_JOINID] = np.asarray(axis_mapping.data)

Expand Down
156 changes: 156 additions & 0 deletions apis/python/tests/test_registration_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,162 @@ def soma1(tmp_path, h5ad1):
return uri


@pytest.mark.parametrize(
    "args",
    [
        # --- Registration column: obs_id ---
        # obs_id is the Pandas index and the index keeps the name "obs_id".
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # obs_id is the Pandas index, but the index is renamed to "index".
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # obs_id is the Pandas index, but the index name is cleared (None).
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # obs_id is an ordinary column; the index is alt_id, named "alt_id".
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # obs_id is an ordinary column; the index holds the alt_id values but
        # its name is cleared, so only obs_id is expected in the signature.
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string"},
        },
        # obs_id is an ordinary column; the index holds the alt_id values but
        # is renamed to "index", so only obs_id is expected in the signature.
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "obs_id",
            "expected_signature": {"obs_id": "string"},
        },
        # obs_id is an ordinary column; no index is set, so the dataframe
        # keeps the default integer index that Pandas creates.
        {
            "do_set_index": False,
            "index_name_to_set": None,
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "obs_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # --- Registration column: alt_id ---
        # alt_id is the Pandas index and the index keeps the name "alt_id".
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # alt_id is the Pandas index, but the index is renamed to "index".
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # alt_id is the Pandas index, but the index name is cleared (None).
        {
            "do_set_index": True,
            "index_name_to_set": "alt_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
        # alt_id is an ordinary column; the index is obs_id, named "obs_id".
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"obs_id": "string", "alt_id": "string"},
        },
        # alt_id is an ordinary column; the index holds the obs_id values but
        # its name is cleared, so only alt_id is expected in the signature.
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string"},
        },
        # alt_id is an ordinary column; the index holds the obs_id values but
        # is renamed to "index", so only alt_id is expected in the signature.
        {
            "do_set_index": True,
            "index_name_to_set": "obs_id",
            "do_rename_axis": True,
            "axis_name_to_set": "index",
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string"},
        },
        # alt_id is an ordinary column; no index is set, so the dataframe
        # keeps the default integer index that Pandas creates.
        {
            "do_set_index": False,
            "index_name_to_set": None,
            "do_rename_axis": False,
            "axis_name_to_set": None,
            "registration_index_column_name": "alt_id",
            "expected_signature": {"alt_id": "string", "obs_id": "string"},
        },
    ],
)
def test_pandas_indexing(args):
    """
    Check the registration signature across the supported index layouts.

    Each parametrized case says whether to move a column into the Pandas
    index, whether to rename (or un-name) that index afterwards, which
    column name is handed to registration as the index column, and the
    signature dict that is expected back.
    """

    frame = pd.DataFrame(
        data={
            "soma_joinid": np.arange(3, dtype=np.int64),
            "alt_id": ["A", "C", "G"],
            "obs_id": ["AT", "CT", "GT"],
        }
    )

    # Shape the index as the case requests before computing the signature.
    if args["do_set_index"]:
        frame = frame.set_index(args["index_name_to_set"])
    if args["do_rename_axis"]:
        # A target name of None leaves the index unnamed.
        frame = frame.rename_axis(args["axis_name_to_set"])

    registration_column = args["registration_index_column_name"]
    observed = registration.signatures._string_dict_from_pandas_dataframe(
        frame,
        registration_column,
    )
    assert observed == args["expected_signature"]


def test_axis_mappings(anndata1):
mapping = registration.AxisIDMapping.identity(10)
assert mapping.data == tuple(range(10))
Expand Down
61 changes: 46 additions & 15 deletions apis/python/tests/test_update_dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def adata(h5ad_file):
return anndata.read_h5ad(h5ad_file)


def test_no_change(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_no_change(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
Expand All @@ -35,9 +36,16 @@ def test_no_change(adata):
o1 = exp.obs.schema
v1 = exp.ms["RNA"].var.schema

if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

with tiledbsoma.Experiment.open(output_path, "w") as exp:
tiledbsoma.io.update_obs(exp, adata.obs)
tiledbsoma.io.update_var(exp, adata.var, "RNA")
tiledbsoma.io.update_obs(exp, new_obs)
tiledbsoma.io.update_var(exp, new_var, "RNA")

with tiledbsoma.Experiment.open(output_path) as exp:
o2 = exp.obs.schema
Expand All @@ -47,16 +55,21 @@ def test_no_change(adata):
assert v1 == v2


def test_add(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_add(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

with tiledbsoma.Experiment.open(output_path) as exp:
exp.ms["RNA"].var.schema

new_obs = adata.obs
new_var = adata.var
if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

new_obs["is_g1"] = new_obs["groups"] == "g1"
new_obs["seq"] = np.arange(new_obs.shape[0], dtype=np.int32)
Expand All @@ -76,16 +89,21 @@ def test_add(adata):
assert v2.field("vst.mean.sq").type == pa.float64()


def test_drop(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_drop(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

with tiledbsoma.Experiment.open(output_path) as exp:
exp.ms["RNA"].var.schema

new_obs = adata.obs
new_var = adata.var
if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

del new_obs["groups"]
del new_var["vst.mean"]
Expand All @@ -104,7 +122,8 @@ def test_drop(adata):
v2.field("vst.mean")


def test_change(adata):
@pytest.mark.parametrize("readback", [False, True])
def test_change(adata, readback):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
Expand All @@ -113,8 +132,12 @@ def test_change(adata):
o1 = exp.obs.schema
v1 = exp.ms["RNA"].var.schema

new_obs = adata.obs
new_var = adata.var
if readback:
new_obs = exp.obs.read().concat().to_pandas()
new_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
new_obs = adata.obs
new_var = adata.var

new_obs["groups"] = np.arange(new_obs.shape[0], dtype=np.int16)
new_var["vst.mean"] = np.arange(new_var.shape[0], dtype=np.int32)
Expand All @@ -133,8 +156,9 @@ def test_change(adata):
assert v1 == v2


@pytest.mark.parametrize("readback", [False, True])
@pytest.mark.parametrize("shift_and_exc", [[0, None], [1, ValueError]])
def test_change_counts(adata, shift_and_exc):
def test_change_counts(adata, readback, shift_and_exc):
shift, exc = shift_and_exc
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
Expand All @@ -144,8 +168,15 @@ def test_change_counts(adata, shift_and_exc):
o1 = exp.obs.schema
v1 = exp.ms["RNA"].var.schema

old_nobs = len(adata.obs)
old_nvar = len(adata.var)
if readback:
old_obs = exp.obs.read().concat().to_pandas()
old_var = exp.ms["RNA"].var.read().concat().to_pandas()
else:
old_obs = adata.obs
old_var = adata.var

old_nobs = len(old_obs)
old_nvar = len(old_var)

new_nobs = old_nobs + shift
new_nvar = old_nvar + shift
Expand Down
Loading