@pytest.mark.parametrize(
    "args",
    [
        # --- Registration index column requested: "obs_id" ---
        # obs_id is the Pandas index and keeps its own name.
        dict(
            do_set_index=True,
            index_name_to_set="obs_id",
            do_rename_axis=False,
            axis_name_to_set=None,
            registration_index_column_name="obs_id",
            expected_signature={"obs_id": "string", "alt_id": "string"},
        ),
        # obs_id is the Pandas index, but the index has been renamed "index".
        dict(
            do_set_index=True,
            index_name_to_set="obs_id",
            do_rename_axis=True,
            axis_name_to_set="index",
            registration_index_column_name="obs_id",
            expected_signature={"obs_id": "string", "alt_id": "string"},
        ),
        # obs_id is the Pandas index, but the index name has been cleared.
        dict(
            do_set_index=True,
            index_name_to_set="obs_id",
            do_rename_axis=True,
            axis_name_to_set=None,
            registration_index_column_name="obs_id",
            expected_signature={"obs_id": "string", "alt_id": "string"},
        ),
        # alt_id is the Pandas index (under its own name); obs_id is a column.
        dict(
            do_set_index=True,
            index_name_to_set="alt_id",
            do_rename_axis=False,
            axis_name_to_set=None,
            registration_index_column_name="obs_id",
            expected_signature={"alt_id": "string", "obs_id": "string"},
        ),
        # alt_id is the Pandas index with its name cleared; obs_id is a column.
        dict(
            do_set_index=True,
            index_name_to_set="alt_id",
            do_rename_axis=True,
            axis_name_to_set=None,
            registration_index_column_name="obs_id",
            expected_signature={"obs_id": "string"},
        ),
        # alt_id is the Pandas index renamed "index"; obs_id is a column.
        dict(
            do_set_index=True,
            index_name_to_set="alt_id",
            do_rename_axis=True,
            axis_name_to_set="index",
            registration_index_column_name="obs_id",
            expected_signature={"obs_id": "string"},
        ),
        # No set_index call: the frame keeps the index it was constructed with.
        dict(
            do_set_index=False,
            index_name_to_set=None,
            do_rename_axis=False,
            axis_name_to_set=None,
            registration_index_column_name="obs_id",
            expected_signature={"alt_id": "string", "obs_id": "string"},
        ),
        # --- Registration index column requested: "alt_id" ---
        # alt_id is the Pandas index and keeps its own name.
        dict(
            do_set_index=True,
            index_name_to_set="alt_id",
            do_rename_axis=False,
            axis_name_to_set=None,
            registration_index_column_name="alt_id",
            expected_signature={"alt_id": "string", "obs_id": "string"},
        ),
        # alt_id is the Pandas index, but the index has been renamed "index".
        dict(
            do_set_index=True,
            index_name_to_set="alt_id",
            do_rename_axis=True,
            axis_name_to_set="index",
            registration_index_column_name="alt_id",
            expected_signature={"alt_id": "string", "obs_id": "string"},
        ),
        # alt_id is the Pandas index, but the index name has been cleared.
        dict(
            do_set_index=True,
            index_name_to_set="alt_id",
            do_rename_axis=True,
            axis_name_to_set=None,
            registration_index_column_name="alt_id",
            expected_signature={"alt_id": "string", "obs_id": "string"},
        ),
        # obs_id is the Pandas index (under its own name); alt_id is a column.
        dict(
            do_set_index=True,
            index_name_to_set="obs_id",
            do_rename_axis=False,
            axis_name_to_set=None,
            registration_index_column_name="alt_id",
            expected_signature={"obs_id": "string", "alt_id": "string"},
        ),
        # obs_id is the Pandas index with its name cleared; alt_id is a column.
        dict(
            do_set_index=True,
            index_name_to_set="obs_id",
            do_rename_axis=True,
            axis_name_to_set=None,
            registration_index_column_name="alt_id",
            expected_signature={"alt_id": "string"},
        ),
        # obs_id is the Pandas index renamed "index"; alt_id is a column.
        dict(
            do_set_index=True,
            index_name_to_set="obs_id",
            do_rename_axis=True,
            axis_name_to_set="index",
            registration_index_column_name="alt_id",
            expected_signature={"alt_id": "string"},
        ),
        # No set_index call: the frame keeps the index it was constructed with.
        dict(
            do_set_index=False,
            index_name_to_set=None,
            do_rename_axis=False,
            axis_name_to_set=None,
            registration_index_column_name="alt_id",
            expected_signature={"alt_id": "string", "obs_id": "string"},
        ),
    ],
)
def test_pandas_indexing(args):
    """Check the registration signature for each Pandas index layout.

    Each case builds the same three-row frame (columns ``soma_joinid``,
    ``alt_id``, ``obs_id``), optionally promotes one column to the index,
    optionally renames (or un-names) that index, and then compares the
    result of ``_string_dict_from_pandas_dataframe`` with the expected
    column-name-to-type mapping for the requested index column name.
    """
    frame = pd.DataFrame(
        {
            "soma_joinid": np.arange(3, dtype=np.int64),
            "alt_id": ["A", "C", "G"],
            "obs_id": ["AT", "CT", "GT"],
        }
    )

    # Shape the index as the case prescribes before computing the signature.
    if args["do_set_index"]:
        frame = frame.set_index(args["index_name_to_set"])
    if args["do_rename_axis"]:
        frame = frame.rename_axis(args["axis_name_to_set"])

    compute_signature = registration.signatures._string_dict_from_pandas_dataframe
    observed = compute_signature(frame, args["registration_index_column_name"])

    assert observed == args["expected_signature"]
test_no_change(adata): assert v1 == v2 -def test_add(adata): +@pytest.mark.parametrize("readback", [False, True]) +def test_add(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") @@ -55,8 +64,12 @@ def test_add(adata): with tiledbsoma.Experiment.open(output_path) as exp: exp.ms["RNA"].var.schema - new_obs = adata.obs - new_var = adata.var + if readback: + new_obs = exp.obs.read().concat().to_pandas() + new_var = exp.ms["RNA"].var.read().concat().to_pandas() + else: + new_obs = adata.obs + new_var = adata.var new_obs["is_g1"] = new_obs["groups"] == "g1" new_obs["seq"] = np.arange(new_obs.shape[0], dtype=np.int32) @@ -76,7 +89,8 @@ def test_add(adata): assert v2.field("vst.mean.sq").type == pa.float64() -def test_drop(adata): +@pytest.mark.parametrize("readback", [False, True]) +def test_drop(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") @@ -84,8 +98,12 @@ def test_drop(adata): with tiledbsoma.Experiment.open(output_path) as exp: exp.ms["RNA"].var.schema - new_obs = adata.obs - new_var = adata.var + if readback: + new_obs = exp.obs.read().concat().to_pandas() + new_var = exp.ms["RNA"].var.read().concat().to_pandas() + else: + new_obs = adata.obs + new_var = adata.var del new_obs["groups"] del new_var["vst.mean"] @@ -104,7 +122,8 @@ def test_drop(adata): v2.field("vst.mean") -def test_change(adata): +@pytest.mark.parametrize("readback", [False, True]) +def test_change(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") @@ -113,8 +132,12 @@ def test_change(adata): o1 = exp.obs.schema v1 = exp.ms["RNA"].var.schema - new_obs = adata.obs - new_var = adata.var + if readback: + new_obs = exp.obs.read().concat().to_pandas() + new_var = 
exp.ms["RNA"].var.read().concat().to_pandas() + else: + new_obs = adata.obs + new_var = adata.var new_obs["groups"] = np.arange(new_obs.shape[0], dtype=np.int16) new_var["vst.mean"] = np.arange(new_var.shape[0], dtype=np.int32) @@ -133,8 +156,9 @@ def test_change(adata): assert v1 == v2 +@pytest.mark.parametrize("readback", [False, True]) @pytest.mark.parametrize("shift_and_exc", [[0, None], [1, ValueError]]) -def test_change_counts(adata, shift_and_exc): +def test_change_counts(adata, readback, shift_and_exc): shift, exc = shift_and_exc tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name @@ -144,8 +168,15 @@ def test_change_counts(adata, shift_and_exc): o1 = exp.obs.schema v1 = exp.ms["RNA"].var.schema - old_nobs = len(adata.obs) - old_nvar = len(adata.var) + if readback: + old_obs = exp.obs.read().concat().to_pandas() + old_var = exp.ms["RNA"].var.read().concat().to_pandas() + else: + old_obs = adata.obs + old_var = adata.var + + old_nobs = len(old_obs) + old_nvar = len(old_var) new_nobs = old_nobs + shift new_nvar = old_nvar + shift