[python] update_obs/update_var with original or readback source

single-cell-data · Sep 26, 2023 · 5e6cade · 5e6cade
1 parent 503ddd1
commit 5e6cade
Show file tree

Hide file tree

Showing 4 changed files with 212 additions and 18 deletions.
diff --git a/apis/python/src/tiledbsoma/io/_registration/signatures.py b/apis/python/src/tiledbsoma/io/_registration/signatures.py
@@ -70,7 +70,11 @@ def _string_dict_from_pandas_dataframe(
     df = df.head(1)  # since reset_index can be expensive on full data
     if df.index.name is None or df.index.name == "index":
         df.reset_index(inplace=True)
-        df.rename(columns={"index": default_index_name}, inplace=True)
+        if default_index_name in df:
+            if "index" in df:
+                df.drop(columns=["index"], inplace=True)
+        else:
+            df.rename(columns={"index": default_index_name}, inplace=True)
     else:
         df.reset_index(inplace=True)
 

diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py
@@ -1131,8 +1131,11 @@ def _write_dataframe(
 
     df.reset_index(inplace=True)
     if id_column_name is not None:
-        df.rename(columns={"index": id_column_name}, inplace=True)
-        id_column_name = "index"
+        if id_column_name in df:
+            if "index" in df:
+                df.drop(columns=["index"], inplace=True)
+        else:
+            df.rename(columns={"index": id_column_name}, inplace=True)
 
     df[SOMA_JOINID] = np.asarray(axis_mapping.data)
 

diff --git a/apis/python/tests/test_registration_mappings.py b/apis/python/tests/test_registration_mappings.py
@@ -156,6 +156,162 @@ def soma1(tmp_path, h5ad1):
     return uri
 
 
+@pytest.mark.parametrize(
+    "args",
+    [
+        # SOMA ID column is to be obs_id, and it is the Pandas index named "obs_id"
+        {
+            "do_set_index": True,
+            "index_name_to_set": "obs_id",
+            "do_rename_axis": False,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"obs_id": "string", "alt_id": "string"},
+        },
+        # SOMA ID column is to be obs_id, and it is the Pandas index named "index"
+        {
+            "do_set_index": True,
+            "index_name_to_set": "obs_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": "index",
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"obs_id": "string", "alt_id": "string"},
+        },
+        # SOMA ID column is to be obs_id, and it is the Pandas unnamed index
+        {
+            "do_set_index": True,
+            "index_name_to_set": "obs_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"obs_id": "string", "alt_id": "string"},
+        },
+        # SOMA ID column is to be obs_id, and the Pandas index is named something else
+        {
+            "do_set_index": True,
+            "index_name_to_set": "alt_id",
+            "do_rename_axis": False,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"alt_id": "string", "obs_id": "string"},
+        },
+        # SOMA ID column is to be obs_id, and the Pandas index is unnamed
+        {
+            "do_set_index": True,
+            "index_name_to_set": "alt_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"obs_id": "string"},
+        },
+        # SOMA ID column is to be obs_id, and the Pandas index is named "index"
+        {
+            "do_set_index": True,
+            "index_name_to_set": "alt_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": "index",
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"obs_id": "string"},
+        },
+        # SOMA ID column is to be obs_id, and the Pandas index is implicitized integers
+        {
+            "do_set_index": False,
+            "index_name_to_set": None,
+            "do_rename_axis": False,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "obs_id",
+            "expected_signature": {"alt_id": "string", "obs_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and it is the Pandas index named "alt_id"
+        {
+            "do_set_index": True,
+            "index_name_to_set": "alt_id",
+            "do_rename_axis": False,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"alt_id": "string", "obs_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and it is the Pandas index named "index"
+        {
+            "do_set_index": True,
+            "index_name_to_set": "alt_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": "index",
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"alt_id": "string", "obs_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and it is the Pandas unnamed index
+        {
+            "do_set_index": True,
+            "index_name_to_set": "alt_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"alt_id": "string", "obs_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and the Pandas index is named something else
+        {
+            "do_set_index": True,
+            "index_name_to_set": "obs_id",
+            "do_rename_axis": False,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"obs_id": "string", "alt_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and the Pandas index is unnamed
+        {
+            "do_set_index": True,
+            "index_name_to_set": "obs_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"alt_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and the Pandas index is named "index"
+        {
+            "do_set_index": True,
+            "index_name_to_set": "obs_id",
+            "do_rename_axis": True,
+            "axis_name_to_set": "index",
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"alt_id": "string"},
+        },
+        # SOMA ID column is to be alt_id, and the Pandas index is implicitized integers
+        {
+            "do_set_index": False,
+            "index_name_to_set": None,
+            "do_rename_axis": False,
+            "axis_name_to_set": None,
+            "registration_index_column_name": "alt_id",
+            "expected_signature": {"alt_id": "string", "obs_id": "string"},
+        },
+    ],
+)
+def test_pandas_indexing(args):
+    """
+    The index-column name for registration can take a variety of forms.
+    This test exercises all of them.
+    """
+
+    df = pd.DataFrame(
+        data={
+            "soma_joinid": np.arange(3, dtype=np.int64),
+            "alt_id": ["A", "C", "G"],
+            "obs_id": ["AT", "CT", "GT"],
+        }
+    )
+    if args["do_set_index"]:
+        df.set_index(args["index_name_to_set"], inplace=True)
+    if args["do_rename_axis"]:
+        df.rename_axis(args["axis_name_to_set"], inplace=True)
+
+    actual_signature = registration.signatures._string_dict_from_pandas_dataframe(
+        df,
+        args["registration_index_column_name"],
+    )
+    assert actual_signature == args["expected_signature"]
+
+
 def test_axis_mappings(anndata1):
     mapping = registration.AxisIDMapping.identity(10)
     assert mapping.data == tuple(range(10))

diff --git a/apis/python/tests/test_update_dataframes.py b/apis/python/tests/test_update_dataframes.py
@@ -26,7 +26,8 @@ def adata(h5ad_file):
     return anndata.read_h5ad(h5ad_file)
 
 
-def test_no_change(adata):
+@pytest.mark.parametrize("readback", [False, True])
+def test_no_change(adata, readback):
     tempdir = tempfile.TemporaryDirectory()
     output_path = tempdir.name
     tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
@@ -35,9 +36,16 @@ def test_no_change(adata):
         o1 = exp.obs.schema
         v1 = exp.ms["RNA"].var.schema
 
+        if readback:
+            new_obs = exp.obs.read().concat().to_pandas()
+            new_var = exp.ms["RNA"].var.read().concat().to_pandas()
+        else:
+            new_obs = adata.obs
+            new_var = adata.var
+
     with tiledbsoma.Experiment.open(output_path, "w") as exp:
-        tiledbsoma.io.update_obs(exp, adata.obs)
-        tiledbsoma.io.update_var(exp, adata.var, "RNA")
+        tiledbsoma.io.update_obs(exp, new_obs)
+        tiledbsoma.io.update_var(exp, new_var, "RNA")
 
     with tiledbsoma.Experiment.open(output_path) as exp:
         o2 = exp.obs.schema
@@ -47,16 +55,21 @@ def test_no_change(adata):
     assert v1 == v2
 
 
-def test_add(adata):
+@pytest.mark.parametrize("readback", [False, True])
+def test_add(adata, readback):
     tempdir = tempfile.TemporaryDirectory()
     output_path = tempdir.name
     tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
 
     with tiledbsoma.Experiment.open(output_path) as exp:
         exp.ms["RNA"].var.schema
 
-    new_obs = adata.obs
-    new_var = adata.var
+        if readback:
+            new_obs = exp.obs.read().concat().to_pandas()
+            new_var = exp.ms["RNA"].var.read().concat().to_pandas()
+        else:
+            new_obs = adata.obs
+            new_var = adata.var
 
     new_obs["is_g1"] = new_obs["groups"] == "g1"
     new_obs["seq"] = np.arange(new_obs.shape[0], dtype=np.int32)
@@ -76,16 +89,21 @@ def test_add(adata):
     assert v2.field("vst.mean.sq").type == pa.float64()
 
 
-def test_drop(adata):
+@pytest.mark.parametrize("readback", [False, True])
+def test_drop(adata, readback):
     tempdir = tempfile.TemporaryDirectory()
     output_path = tempdir.name
     tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
 
     with tiledbsoma.Experiment.open(output_path) as exp:
         exp.ms["RNA"].var.schema
 
-    new_obs = adata.obs
-    new_var = adata.var
+        if readback:
+            new_obs = exp.obs.read().concat().to_pandas()
+            new_var = exp.ms["RNA"].var.read().concat().to_pandas()
+        else:
+            new_obs = adata.obs
+            new_var = adata.var
 
     del new_obs["groups"]
     del new_var["vst.mean"]
@@ -104,7 +122,8 @@ def test_drop(adata):
         v2.field("vst.mean")
 
 
-def test_change(adata):
+@pytest.mark.parametrize("readback", [False, True])
+def test_change(adata, readback):
     tempdir = tempfile.TemporaryDirectory()
     output_path = tempdir.name
     tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")
@@ -113,8 +132,12 @@ def test_change(adata):
         o1 = exp.obs.schema
         v1 = exp.ms["RNA"].var.schema
 
-    new_obs = adata.obs
-    new_var = adata.var
+        if readback:
+            new_obs = exp.obs.read().concat().to_pandas()
+            new_var = exp.ms["RNA"].var.read().concat().to_pandas()
+        else:
+            new_obs = adata.obs
+            new_var = adata.var
 
     new_obs["groups"] = np.arange(new_obs.shape[0], dtype=np.int16)
     new_var["vst.mean"] = np.arange(new_var.shape[0], dtype=np.int32)
@@ -133,8 +156,9 @@ def test_change(adata):
     assert v1 == v2
 
 
+@pytest.mark.parametrize("readback", [False, True])
 @pytest.mark.parametrize("shift_and_exc", [[0, None], [1, ValueError]])
-def test_change_counts(adata, shift_and_exc):
+def test_change_counts(adata, readback, shift_and_exc):
     shift, exc = shift_and_exc
     tempdir = tempfile.TemporaryDirectory()
     output_path = tempdir.name
@@ -144,8 +168,15 @@ def test_change_counts(adata, shift_and_exc):
         o1 = exp.obs.schema
         v1 = exp.ms["RNA"].var.schema
 
-    old_nobs = len(adata.obs)
-    old_nvar = len(adata.var)
+        if readback:
+            old_obs = exp.obs.read().concat().to_pandas()
+            old_var = exp.ms["RNA"].var.read().concat().to_pandas()
+        else:
+            old_obs = adata.obs
+            old_var = adata.var
+
+    old_nobs = len(old_obs)
+    old_nvar = len(old_var)
 
     new_nobs = old_nobs + shift
     new_nvar = old_nvar + shift