fix duplicate rownames introduced in cellphonedb v5 (#48)

* related to ktplots#89
* Update plot_cpdb_chord.py
* Update plot_cpdb_chord.py
* Update support.py
* Update plot_cpdb_chord.py
* Update plot_cpdb_chord.py
* Update plot_cpdb_chord.py
* Update plot_cpdb_chord.py
* Update plot_cpdb_chord.py
* Update plot_cpdb_chord.py
* Update pyproject.toml
* add a new option to toggle whether to keep the id_cp_interaction value when plotting
* update notebooks
zktuong · Dec 6, 2023 · dac474c · dac474c
1 parent 797548d
commit dac474c
Show file tree

Hide file tree

Showing 7 changed files with 184 additions and 83 deletions.
diff --git a/docs/notebooks/tutorial.ipynb b/docs/notebooks/tutorial.ipynb
diff --git a/docs/notebooks/tutorial_v5.ipynb b/docs/notebooks/tutorial_v5.ipynb
diff --git a/ktplotspy/plot/plot_cpdb.py b/ktplotspy/plot/plot_cpdb.py
@@ -80,6 +80,7 @@ def plot_cpdb(
     scale_alpha_by_interaction_scores: bool = False,
     scale_alpha_by_cellsign: bool = False,
     filter_by_cellsign: bool = False,
+    keep_id_cp_interaction: bool = False,
 ) -> Union[ggplot, pd.DataFrame]:
     """Plotting CellPhoneDB results as a dot plot.
 
@@ -157,6 +158,8 @@ def plot_cpdb(
         Whether or not to filter the transparency of interactions by the cellsign.
     filter_by_cellsign: bool, optional
         Filter out interactions with a 0 value cellsign.
+    keep_id_cp_interaction: bool, optional
+        Whether to keep the original `id_cp_interaction` value when plotting.
     Returns
     -------
     Union[ggplot, pd.DataFrame]
@@ -192,7 +195,7 @@ def plot_cpdb(
         tmp = means_mat.melt(id_vars=means_mat.columns[:col_start])
         direc, classif, is_int = {}, {}, {}
         for _, r in tmp.iterrows():
-            key = r.interacting_pair.replace("_", "-") + DEFAULT_SEP * 3 + r.variable
+            key = r.id_cp_interaction + DEFAULT_SEP * 3 + r.interacting_pair.replace("_", "-") + DEFAULT_SEP * 3 + r.variable
             direc[key] = r.directionality
             classif[key] = r.classification
             is_int[key] = r.is_integrin
@@ -384,6 +387,11 @@ def plot_cpdb(
     if return_table:
         return df
     else:
+        # change the labelling of interaction_group
+        if keep_id_cp_interaction:
+            df.interaction_group = [re.sub(DEFAULT_SEP * 3, "_", c) for c in df.interaction_group]
+        else:
+            df.interaction_group = [c.split(DEFAULT_SEP * 3)[1] for c in df.interaction_group]
         # set global figure size
         options.figure_size = figsize
 

diff --git a/ktplotspy/plot/plot_cpdb_chord.py b/ktplotspy/plot/plot_cpdb_chord.py
@@ -115,13 +115,18 @@ def plot_cpdb_chord(
     # do some name wrangling
     subset_clusters = list(set(flatten([x.split("-") for x in lr_interactions.celltype_group])))
     adata_subset = adata[adata.obs[celltype_key].isin(subset_clusters)].copy()
-    interactions = means[["interacting_pair", "gene_a", "gene_b", "partner_a", "partner_b", "receptor_a", "receptor_b"]].copy()
-    interactions["converted"] = [re.sub("-", " ", x) for x in interactions.interacting_pair]
-    interactions["converted"] = [re.sub("_", "-", x) for x in interactions.interacting_pair]
+    interactions = means[
+        ["id_cp_interaction", "interacting_pair", "gene_a", "gene_b", "partner_a", "partner_b", "receptor_a", "receptor_b"]
+    ].copy()
+    interactions["use_interaction_name"] = [
+        x + DEFAULT_SEP * 3 + y for x, y in zip(interactions.id_cp_interaction, interactions.interacting_pair)
+    ]
+    # interactions["converted"] = [re.sub("-", " ", x) for x in interactions.use_interaction_name]
+    interactions["converted"] = [re.sub("_", "-", x) for x in interactions.use_interaction_name]
     lr_interactions["barcode"] = [a + DEFAULT_SEP + b for a, b in zip(lr_interactions.celltype_group, lr_interactions.interaction_group)]
     interactions_subset = interactions[interactions["converted"].isin(list(lr_interactions.interaction_group))].copy()
     # handle complexes gently
-    tm0 = {kx: rx.split("_") for kx, rx in interactions_subset.interacting_pair.items()}
+    tm0 = {kx: rx.split("_") for kx, rx in interactions_subset.use_interaction_name.items()}
     if any([len(x) > 2 for x in tm0.values()]):
         complex_id, simple_id = [], []
         for i, j in tm0.items():
@@ -156,6 +161,7 @@ def plot_cpdb_chord(
     else:
         tm0 = pd.DataFrame(tm0).T
         tm0.columns = ["id_a", "id_b"]
+        tm0.id_a = [x.split(DEFAULT_SEP * 3)[1] for x in tm0.id_a]
         interactions_subset = pd.concat([interactions_subset, tm0], axis=1)
 
     # keep only useful genes
@@ -275,7 +281,7 @@ def plot_cpdb_chord(
             end_size = 1 if end_size < 1 else end_size
             source = (j["producer"], j["start"] - 1, start_size, raxis_range[0] - size)
             destination = (j["receiver"], j["end"] - 1, end_size, raxis_range[0] - size)
-            circle.chord_plot(source, destination, edge_col_dict[lr])
+            circle.chord_plot(source, destination, edge_col_dict[lr] if lr in edge_col_dict else "#f7f7f700")
 
     custom_lines = [Line2D([0], [0], color=val, lw=4) for val in edge_col_dict.values()]
     circle.figure.legend(custom_lines, edge_col_dict.keys(), **legend_params)

diff --git a/ktplotspy/utils/support.py b/ktplotspy/utils/support.py
@@ -233,40 +233,40 @@ def prep_table(data: pd.DataFrame) -> pd.DataFrame:
         Table ready for further analysis.
     """
     dat = data.copy()
-    dat.index = make_unique(dat.interacting_pair)
+    dat.index = [x + DEFAULT_SEP * 3 + y for x, y in zip(dat.id_cp_interaction, dat.interacting_pair)]
     dat.columns = [re.sub("\\|", DEFAULT_SEP, col) for col in dat.columns]
     dat.index = [re.sub("_", "-", row) for row in dat.index]
     dat.index = [re.sub("[.]", " ", row) for row in dat.index]
 
     return dat
 
 
-def make_unique(seq: pd.Series) -> List:
-    """Make unique names.
-
-    Parameters
-    ----------
-    seq : pd.Series
-        Series to convert to unique.
-
-    Returns
-    -------
-    List
-        List of unique names.
-    """
-    seq = list(seq)
-    not_unique = [k for k, v in Counter(seq).items() if v > 1]  # so we have: ['name', 'zip']
-    # suffix generator dict - e.g., {'name': <my_gen>, 'zip': <my_gen>}
-    suff_gens = dict(zip(not_unique, tee(count(1), len(not_unique))))
-    for idx, s in enumerate(seq):
-        try:
-            suffix = "_" + str(next(suff_gens[s]))
-        except KeyError:
-            # s was unique
-            continue
-        else:
-            seq[idx] += suffix
-    return seq
+# def make_unique(seq: pd.Series) -> List:
+#     """Make unique names.
+
+#     Parameters
+#     ----------
+#     seq : pd.Series
+#         Series to convert to unique.
+
+#     Returns
+#     -------
+#     List
+#         List of unique names.
+#     """
+#     seq = list(seq)
+#     not_unique = [k for k, v in Counter(seq).items() if v > 1]  # so we have: ['name', 'zip']
+#     # suffix generator dict - e.g., {'name': <my_gen>, 'zip': <my_gen>}
+#     suff_gens = dict(zip(not_unique, tee(count(1), len(not_unique))))
+#     for idx, s in enumerate(seq):
+#         try:
+#             suffix = "_" + str(next(suff_gens[s]))
+#         except KeyError:
+#             # s was unique
+#             continue
+#         else:
+#             seq[idx] += suffix
+#     return seq
 
 
 def sub_pattern(cell_type: str, pattern: str) -> str:
@@ -663,20 +663,23 @@ def generate_df(
     out = []
     for _, (px, rx) in cell_type_grid.iterrows():
         for _, (
+            ici,
             ip,
             ga,
             gb,
             pa,
             pb,
             ra,
             rb,
+            ui,
             cp,
             ia,
             ib,
         ) in interactions_subset.iterrows():
             if ra:
                 if rb:
                     _out = [
+                        ici,
                         ia,
                         ib,
                         ra,
@@ -692,6 +695,7 @@ def generate_df(
                     ]
                 else:
                     _out = [
+                        ici,
                         ia,
                         ib,
                         ra,
@@ -708,6 +712,7 @@ def generate_df(
             else:
                 if rb:
                     _out = [
+                        ici,
                         ia,
                         ib,
                         ra,
@@ -723,6 +728,7 @@ def generate_df(
                     ]
                 else:  # pragma: no cover
                     _out = [
+                        ici,
                         ia,
                         ib,
                         ra,
@@ -740,6 +746,7 @@ def generate_df(
                 pd.DataFrame(
                     _out,
                     index=[
+                        "id_cp_interaction",
                         "ligand",
                         "receptor",
                         "receptor_a",
@@ -763,13 +770,14 @@ def generate_df(
     _df = _df.reset_index(drop=True)
     for i, j in _df.iterrows():
         if (j["receptor_b"]) and not (j["receptor_a"]):
-            lg, rc = j["receptor"], j["ligand"]
+            ici, lg, rc = j["id_cp_interaction"], j["receptor"], j["ligand"]
             con_pair = lg + "-" + rc
             ra, rb = j["receptor_b"], j["receptor_a"]
             px, rx = j["receiver"], j["producer"]
             pre, prf = j["receiver_expression"], j["receiver_fraction"]
             rce, rcf = j["producer_expression"], j["producer_fraction"]
             tos, frs = j["from"], j["to"]
+            _df.at[i, "id_cp_interaction"] = ici
             _df.at[i, "ligand"] = lg
             _df.at[i, "receptor"] = rc
             _df.at[i, "converted_pair"] = con_pair
@@ -783,4 +791,8 @@ def generate_df(
             _df.at[i, "receiver_fraction"] = rcf
             _df.at[i, "from"] = frs
             _df.at[i, "to"] = tos
+        else:
+            ici, lg, rc = j["id_cp_interaction"], j["ligand"], j["receptor"]
+            con_pair = rc + "-" + lg
+            _df.at[i, "converted_pair"] = con_pair
     return _df
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ktplotspy"
-version = "0.2.0"
+version = "0.2.1"
 description = "Python library for plotting Cellphonedb results. Ported from ktplots R package."
 authors = ["Kelvin Tuong <[email protected]>"]
 license = "MIT"

diff --git a/tests/test_plot_cpdb.py b/tests/test_plot_cpdb.py
@@ -23,6 +23,22 @@ def test_plot_cpdb(mock_show, adata, means, pvals):
     g
 
 
+@patch("matplotlib.pyplot.show")
+@pytest.mark.usefixtures("adata", "means", "pvals")
+def test_plot_cpdb_keep_id(mock_show, adata, means, pvals):
+    g = plot_cpdb(
+        adata=adata,
+        cell_type1="B cell",
+        cell_type2="CD4T cell",
+        means=means,
+        pvals=pvals,
+        celltype_key="celltype",
+        genes=["CXCL13", "CD274", "CXCR5"],
+        keep_id_cp_interaction=True,
+    )
+    g
+
+
 @patch("matplotlib.pyplot.show")
 @pytest.mark.usefixtures("adata", "means", "pvals")
 def test_plot_cpdb_title(mock_show, adata, means, pvals):