Skip to content

Commit

Permalink
Revert "issue237"
Browse files Browse the repository at this point in the history
This reverts commit f9a4f2e.
  • Loading branch information
Kevin Liao committed Apr 12, 2024
1 parent f9a4f2e commit 39ce769
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 116 deletions.
10 changes: 1 addition & 9 deletions src/cgr_gwas_qc/reporting/constants.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,7 @@
import pandas as pd

CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold

# Assign labels to colors for plotting consistency
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}
CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green

SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])

Expand Down
39 changes: 4 additions & 35 deletions src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@ def main(sample_qc: Path, outfile: Path):


def load_sample_data(sample_qc: Path) -> pd.DataFrame:
return (
sample_qc_table.read(sample_qc)
.query("is_subject_representative")
.dropna(subset=["EUR", "AFR", "ASN"])
)
return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])


def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
Expand All @@ -46,51 +42,24 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
fig, tax = ternary.figure(scale=1) # Set scale 0 to 1
fig.set_size_inches(6, 5)

# Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
# Plot cases and controls separately
case = sample.query("case_control == 'Case'")
if case.shape[0] > 0:
case_color = CASE_CONTROL_COLORS[0]
tax.scatter(
case[["EUR", "AFR", "ASN"]].values,
color=case_color,
label="Case",
zorder=4,
**style_defaults
case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
)

control = sample.query("case_control == 'Control'")
if control.shape[0] > 0:
control_color = CASE_CONTROL_COLORS[1] # blue
control_color = CASE_CONTROL_COLORS[1]
tax.scatter(
control[["EUR", "AFR", "ASN"]].values,
color=control_color,
label="Control",
**style_defaults
)

# Issue 237: Add samples if they are neither case or control.
project_qc = sample.query("case_control == 'QC'")
if project_qc.shape[0] > 0:
project_qc_color = CASE_CONTROL_COLORS[2] # Yellow
tax.scatter(
project_qc[["EUR", "AFR", "ASN"]].values,
color=project_qc_color,
label="QC",
**style_defaults
)

unknown = sample.query(
"case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
)
if unknown.shape[0] > 0:
unknown_color = CASE_CONTROL_COLORS[3] # Gray
tax.scatter(
unknown[["EUR", "AFR", "ASN"]].values,
color=unknown_color,
label="Unknown",
**style_defaults
)

# Add plot elements
multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments
tax.boundary(linewidth=0.5)
Expand Down
95 changes: 95 additions & 0 deletions src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
plot_ancestry_grafpop.py
------------------------
This script plots the triangle plot of ancestries from GRAF ancestry
estimates.
Output:
``sample_level/ancestry.png``
"""
import os
from pathlib import Path
from typing import Optional

import pandas as pd
import seaborn as sns
import ternary
import typer

from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
from cgr_gwas_qc.workflow.scripts import sample_qc_table

app = typer.Typer(add_completion=False)


@app.command()
def main(sample_qc: Path, outfile: Path):
    """Command-line entry point: load the sample QC table and plot it.

    Args:
        sample_qc: Path to the sample QC table, passed to ``load_sample_data``.
        outfile: Path the figure is saved to, passed to ``plot``.
    """
    plot(load_sample_data(sample_qc), outfile)


def load_sample_data(sample_qc: Path) -> pd.DataFrame:
    """Read the sample QC table, keeping only rows with complete ancestry values.

    Any row with a missing value in ``E(%)``, ``F(%)`` or ``A(%)`` is dropped.

    Args:
        sample_qc: Path handed to ``sample_qc_table.read``.

    Returns:
        The table without the rows that lack an ancestry estimate.
    """
    ancestry_columns = ["E(%)", "F(%)", "A(%)"]
    table = sample_qc_table.read(sample_qc)
    return table.dropna(subset=ancestry_columns)


def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
    """Draw the ancestry triangle (ternary) plot, coloured by case/control status.

    Samples whose ``case_control`` is ``Case`` or ``Control`` are drawn as two
    separate scatter layers; other values of ``case_control`` are not plotted.

    Args:
        sample: Table with a ``case_control`` column and the ancestry columns
            ``E(%)``, ``F(%)`` and ``A(%)``.
        outfile: Where to save the figure. Nothing is written when it is falsy.
    """
    # Use seaborn's "paper" context to get sane plot defaults for a paper.
    sns.set_context("paper")

    point_style = dict(linewidth=0, alpha=0.8, s=8)
    ancestry_columns = ["E(%)", "F(%)", "A(%)"]

    # The triangle axes run from 0 to 1.
    # NOTE(review): the column names end in "(%)" while the scale is 1 --
    # confirm the values are fractions and not percentages.
    fig, tax = ternary.figure(scale=1)
    fig.set_size_inches(6, 5)

    # One scatter layer per group: (legend label, row filter, colour index).
    # The colour is looked up only when the group actually has rows.
    groups = (
        ("Case", "case_control == 'Case'", 0),
        ("Control", "case_control == 'Control'", 1),
    )
    for group_label, group_query, color_index in groups:
        subset = sample.query(group_query)
        if len(subset) == 0:
            continue
        tax.scatter(
            subset[ancestry_columns].values,
            color=CASE_CONTROL_COLORS[color_index],
            label=group_label,
            **point_style
        )

    # Frame, grid lines and ticks in 0.1 increments.
    tick_step = 0.1
    tax.boundary(linewidth=0.5)
    tax.gridlines(multiple=tick_step, color="gray")
    tax.ticks(axis="lbr", linewidth=1, multiple=tick_step, offset=0.02, tick_formats="%.1f")

    # Axis labels.
    axis_label_style = dict(fontsize=12, offset=0.14)
    tax.left_axis_label("Asian", **axis_label_style)
    tax.right_axis_label("African", **axis_label_style)
    tax.bottom_axis_label("European", **axis_label_style)

    tax.legend(title="case_control")

    # Clean up: white background, no matplotlib ticks, no outer square axes.
    tax.set_background_color(color="white")
    tax.clear_matplotlib_ticks()
    tax.get_axes().axis("off")

    if outfile:
        tax.savefig(outfile)


if __name__ == "__main__":
    # Without an injected ``snakemake`` object, fall back to the typer CLI;
    # otherwise take the input and output paths from the snakemake rule.
    if "snakemake" not in locals():
        app()
    else:
        main(
            sample_qc=Path(snakemake.input[0]),  # type: ignore # noqa
            outfile=Path(snakemake.output[0]),  # type: ignore # noqa
        )
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

@app.command()
def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):

df = (
read_het(het)
.join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
Expand All @@ -49,20 +50,13 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
def plot(df: pd.DataFrame, population: str, threshold: float):
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
x="x_label",
y="F",
data=df,
hue="case_control",
palette=CASE_CONTROL_LABEL_COLORS,
palette=COLORS,
ax=ax,
alpha=0.8,
linewidth=0,
Expand All @@ -73,7 +67,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
ax.set_xlabel("Subjects sorted by F")
ax.set_ylabel("F")
ax.set_ylim(_get_ylim(df.F, threshold))
ax.set_title(f"{population} Heterozygosity F Coefficient")
ax.set_title(f"{population} Homozygosity F Coefficient")

# Move legend
plt.legend(loc="upper left")
Expand Down
9 changes: 1 addition & 8 deletions src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,9 @@ def plot_panel(
)

# Set basic defaults so I don't have to repeat myself
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

style_defaults = dict(linewidth=0, alpha=0.8, s=5)
sample_defaults = {
**dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
**dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
**style_defaults,
}
snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}
Expand Down
28 changes: 10 additions & 18 deletions src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@
import seaborn as sns
import typer

from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
from cgr_gwas_qc.workflow.scripts import sample_qc_table

# import snakemake

from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
from cgr_gwas_qc.workflow.scripts import sample_qc_table

app = typer.Typer(add_completion=False)

Expand All @@ -31,7 +30,7 @@
def main(sample_qc: Path, outfile: Path, xchr: str):
sample = load_sample_data(sample_qc)
xchr = str(snakemake.params) # type: ignore # noqa
plot(sample, xchr, outfile)
plot(sample, outfile, xchr)


"""
Expand Down Expand Up @@ -68,23 +67,16 @@ def _update_categories(sr: pd.DataFrame):
return sr


def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

# Create plots
style_defaults = dict(linewidth=0, alpha=0.8, s=2)
defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(ax=ax, showfliers=False, **defaults)
sns.stripplot(
ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
)

# Make boxplot black and white
Expand All @@ -95,13 +87,13 @@ def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None)
# ax.set_xlabel("Reported Sex")
ax.set_ylabel("ChrX Inbreeding Coeff")

xchr_bool = xchr.strip().lower() == "true"
print(type(xchr_bool), " ", xchr_bool)
if xchr_bool:
print("sex chr included", xchr_bool)
xchr = xchr.strip().lower() == "true"
print(type(xchr), " ", xchr)
if xchr:
print("sex chr included", xchr)
ax.set_xlabel("Reported Sex")
else:
print("No sex chromosome ", xchr_bool)
print("No sex chromosome ", xchr)
ax.set_xlabel("No sex chromosome \nSkipping sex condordace")

# Add line at 0.5
Expand Down
9 changes: 1 addition & 8 deletions src/cgr_gwas_qc/workflow/scripts/plot_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,7 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
g.map_diag(sns.kdeplot)
g.add_legend(
Expand Down
24 changes: 9 additions & 15 deletions src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,7 @@ def main(
)

add_qc_columns(
sample_qc,
remove_contam,
remove_rep_discordant,
sample_qc, remove_contam, remove_rep_discordant,
)
save(sample_qc, outfile)

Expand Down Expand Up @@ -322,9 +320,12 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
.. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
"""

return (
pd.read_csv(file_name, sep="\t")
.rename({"Subject": "Sample_ID"}, axis=1)
.assign(
Sample_ID=lambda x: x["Subject"].astype(str)
) # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character
.assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
.assign(AFR=lambda x: x["P_f (%)"] / 100)
.assign(EUR=lambda x: x["P_e (%)"] / 100)
Expand Down Expand Up @@ -401,8 +402,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram

if file_name is None:
return pd.DataFrame(
index=Sample_IDs,
columns=["Contamination_Rate", "is_contaminated"],
index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})

return (
Expand Down Expand Up @@ -445,16 +445,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie


def add_qc_columns(
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
) -> pd.DataFrame:
add_call_rate_flags(sample_qc)
_add_identifiler(sample_qc)
_add_analytic_exclusion(
sample_qc,
remove_contam,
remove_rep_discordant,
sample_qc, remove_contam, remove_rep_discordant,
)
_add_subject_representative(sample_qc)
_add_subject_dropped_from_study(sample_qc)
Expand Down Expand Up @@ -500,9 +496,7 @@ def reason_string(row: pd.Series) -> str:


def _add_analytic_exclusion(
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
) -> pd.DataFrame:
"""Adds a flag to remove samples based on provided conditions.
Expand Down
Loading

0 comments on commit 39ce769

Please sign in to comment.