Revert "issue237"

This reverts commit f9a4f2e.
NCI-CGR · Apr 12, 2024 · 39ce769 · 39ce769
1 parent f9a4f2e
commit 39ce769
Show file tree

Hide file tree

Showing 10 changed files with 144 additions and 116 deletions.
diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py
@@ -1,15 +1,7 @@
 import pandas as pd
 
 CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
-CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"]  # red  # blue  # gray  #gold
-
-# Assign labels to colors for plotting consistency
-CASE_CONTROL_LABEL_COLORS = {
-    "Case": CASE_CONTROL_COLORS[0],
-    "Control": CASE_CONTROL_COLORS[1],
-    "QC": CASE_CONTROL_COLORS[2],
-    "Unknown": CASE_CONTROL_COLORS[3],
-}
+CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"]  # red  # blue  # gray  # green
 
 SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])
 

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
@@ -31,11 +31,7 @@ def main(sample_qc: Path, outfile: Path):
 
 
 def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return (
-        sample_qc_table.read(sample_qc)
-        .query("is_subject_representative")
-        .dropna(subset=["EUR", "AFR", "ASN"])
-    )
+    return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
 
 
 def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
@@ -46,51 +42,24 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
     fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
     fig.set_size_inches(6, 5)
 
-    # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
+    # Plot cases and controls separately
     case = sample.query("case_control == 'Case'")
     if case.shape[0] > 0:
         case_color = CASE_CONTROL_COLORS[0]
         tax.scatter(
-            case[["EUR", "AFR", "ASN"]].values,
-            color=case_color,
-            label="Case",
-            zorder=4,
-            **style_defaults
+            case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
         )
 
     control = sample.query("case_control == 'Control'")
     if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]  # blue
+        control_color = CASE_CONTROL_COLORS[1]
         tax.scatter(
             control[["EUR", "AFR", "ASN"]].values,
             color=control_color,
             label="Control",
             **style_defaults
         )
 
-    # Issue 237: Add samples if they are neither case or control.
-    project_qc = sample.query("case_control == 'QC'")
-    if project_qc.shape[0] > 0:
-        project_qc_color = CASE_CONTROL_COLORS[2]  # Yellow
-        tax.scatter(
-            project_qc[["EUR", "AFR", "ASN"]].values,
-            color=project_qc_color,
-            label="QC",
-            **style_defaults
-        )
-
-    unknown = sample.query(
-        "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
-    )
-    if unknown.shape[0] > 0:
-        unknown_color = CASE_CONTROL_COLORS[3]  # Gray
-        tax.scatter(
-            unknown[["EUR", "AFR", "ASN"]].values,
-            color=unknown_color,
-            label="Unknown",
-            **style_defaults
-        )
-
     # Add plot elements
     multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
     tax.boundary(linewidth=0.5)

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
@@ -0,0 +1,95 @@
+"""
+plot_ancestry.py
+----------------
+
+This script plots the triangle plot of ancestries from GRAF ancestry
+estimates.
+
+Output:
+    ``sample_level/ancestry.png``
+
+"""
+import os
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+import seaborn as sns
+import ternary
+import typer
+
+from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
+from cgr_gwas_qc.workflow.scripts import sample_qc_table
+
+app = typer.Typer(add_completion=False)
+
+
+@app.command()
+def main(sample_qc: Path, outfile: Path):
+    sample = load_sample_data(sample_qc)
+    plot(sample, outfile)
+
+
+def load_sample_data(sample_qc: Path) -> pd.DataFrame:
+    return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"])
+
+
+def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
+    sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
+
+    # Create plots
+    style_defaults = dict(linewidth=0, alpha=0.8, s=8)
+    fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
+    fig.set_size_inches(6, 5)
+
+    # Plot cases and controls separately
+    case = sample.query("case_control == 'Case'")
+    if case.shape[0] > 0:
+        case_color = CASE_CONTROL_COLORS[0]
+        tax.scatter(
+            case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults
+        )
+
+    control = sample.query("case_control == 'Control'")
+    if control.shape[0] > 0:
+        control_color = CASE_CONTROL_COLORS[1]
+        tax.scatter(
+            control[["E(%)", "F(%)", "A(%)"]].values,
+            color=control_color,
+            label="Control",
+            **style_defaults
+        )
+
+    # Add plot elements
+    multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
+    tax.boundary(linewidth=0.5)
+    tax.gridlines(multiple=multiple, color="gray")
+    tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f")
+
+    # Set Axis labels
+    label_defaults = dict(fontsize=12, offset=0.14)
+    tax.left_axis_label("Asian", **label_defaults)
+    tax.right_axis_label("African", **label_defaults)
+    tax.bottom_axis_label("European", **label_defaults)
+
+    # Add legend
+    tax.legend(title="case_control")
+
+    # Clean-up plot
+    tax.set_background_color(color="white")
+    tax.clear_matplotlib_ticks()
+    tax.get_axes().axis("off")  # removes outer square axes
+
+    # Save if given an outfile
+    if outfile:
+        tax.savefig(outfile)
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals():
+        defaults = {}
+        defaults.update({"sample_qc": Path(snakemake.input[0])})  # type: ignore # noqa
+        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
+        main(**defaults)
+    else:
+        app()
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
@@ -31,6 +31,7 @@
 
 @app.command()
 def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):
+
     df = (
         read_het(het)
         .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
@@ -49,20 +50,13 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
 def plot(df: pd.DataFrame, population: str, threshold: float):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.scatterplot(
         x="x_label",
         y="F",
         data=df,
         hue="case_control",
-        palette=CASE_CONTROL_LABEL_COLORS,
+        palette=COLORS,
         ax=ax,
         alpha=0.8,
         linewidth=0,
@@ -73,7 +67,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
     ax.set_xlabel("Subjects sorted by F")
     ax.set_ylabel("F")
     ax.set_ylim(_get_ylim(df.F, threshold))
-    ax.set_title(f"{population} Heterozygosity F Coefficient")
+    ax.set_title(f"{population} Homozygosity F Coefficient")
 
     # Move legend
     plt.legend(loc="upper left")

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
@@ -88,16 +88,9 @@ def plot_panel(
     )
 
     # Set basic defaults so I don't have to repeat myself
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
     style_defaults = dict(linewidth=0, alpha=0.8, s=5)
     sample_defaults = {
-        **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
+        **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
         **style_defaults,
     }
     snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
@@ -18,11 +18,10 @@
 import seaborn as sns
 import typer
 
-from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
-from cgr_gwas_qc.workflow.scripts import sample_qc_table
-
 # import snakemake
 
+from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
+from cgr_gwas_qc.workflow.scripts import sample_qc_table
 
 app = typer.Typer(add_completion=False)
 
@@ -31,7 +30,7 @@
 def main(sample_qc: Path, outfile: Path, xchr: str):
     sample = load_sample_data(sample_qc)
     xchr = str(snakemake.params)  # type: ignore # noqa
-    plot(sample, xchr, outfile)
+    plot(sample, outfile, xchr)
 
 
 """
@@ -68,23 +67,16 @@ def _update_categories(sr: pd.DataFrame):
     return sr
 
 
-def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
+def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
     # Create plots
     style_defaults = dict(linewidth=0, alpha=0.8, s=2)
     defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.boxplot(ax=ax, showfliers=False, **defaults)
     sns.stripplot(
-        ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
+        ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
     )
 
     # Make boxplot black and white
@@ -95,13 +87,13 @@ def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None)
     # ax.set_xlabel("Reported Sex")
     ax.set_ylabel("ChrX Inbreeding Coeff")
 
-    xchr_bool = xchr.strip().lower() == "true"
-    print(type(xchr_bool), " ", xchr_bool)
-    if xchr_bool:
-        print("sex chr included", xchr_bool)
+    xchr = xchr.strip().lower() == "true"
+    print(type(xchr), " ", xchr)
+    if xchr:
+        print("sex chr included", xchr)
         ax.set_xlabel("Reported Sex")
     else:
-        print("No sex chromosome ", xchr_bool)
+        print("No sex chromosome ", xchr)
         ax.set_xlabel("No sex chromosome \nSkipping sex condordace")
 
     # Add line at 0.5

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
@@ -50,14 +50,7 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
 def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
-    g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
+    g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
     g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
     g.map_diag(sns.kdeplot)
     g.add_legend(

diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,9 +192,7 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc,
-        remove_contam,
-        remove_rep_discordant,
+        sample_qc, remove_contam, remove_rep_discordant,
     )
     save(sample_qc, outfile)
 
@@ -322,9 +320,12 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
     .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
 
     """
+
     return (
         pd.read_csv(file_name, sep="\t")
-        .rename({"Subject": "Sample_ID"}, axis=1)
+        .assign(
+            Sample_ID=lambda x: x["Subject"].astype(str)
+        )  # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character
         .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
         .assign(AFR=lambda x: x["P_f (%)"] / 100)
         .assign(EUR=lambda x: x["P_e (%)"] / 100)
@@ -401,8 +402,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs,
-            columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -445,16 +445,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame,
-    remove_contam: bool,
-    remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc,
-        remove_contam,
-        remove_rep_discordant,
+        sample_qc, remove_contam, remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -500,9 +496,7 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame,
-    remove_contam: bool,
-    remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.