diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py index a9715531..88144661 100644 --- a/src/cgr_gwas_qc/reporting/constants.py +++ b/src/cgr_gwas_qc/reporting/constants.py @@ -1,15 +1,7 @@ import pandas as pd CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"]) -CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold - -# Assign labels to colors for plotting consistency -CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], -} +CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"]) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py index 374459c9..4327333a 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py @@ -31,11 +31,7 @@ def main(sample_qc: Path, outfile: Path): def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return ( - sample_qc_table.read(sample_qc) - .query("is_subject_representative") - .dropna(subset=["EUR", "AFR", "ASN"]) - ) + return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"]) def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): @@ -46,21 +42,17 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 fig.set_size_inches(6, 5) - # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible + # Plot cases and controls separately case = sample.query("case_control == 'Case'") if case.shape[0] > 0: case_color = CASE_CONTROL_COLORS[0] tax.scatter( - case[["EUR", "AFR", "ASN"]].values, - color=case_color, - label="Case", - zorder=4, - **style_defaults + case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults ) control = sample.query("case_control == 'Control'") if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] # blue + control_color = CASE_CONTROL_COLORS[1] tax.scatter( control[["EUR", "AFR", "ASN"]].values, color=control_color, @@ -68,29 +60,6 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): **style_defaults ) - # Issue 237: Add samples if they are neither case or control. - project_qc = sample.query("case_control == 'QC'") - if project_qc.shape[0] > 0: - project_qc_color = CASE_CONTROL_COLORS[2] # Yellow - tax.scatter( - project_qc[["EUR", "AFR", "ASN"]].values, - color=project_qc_color, - label="QC", - **style_defaults - ) - - unknown = sample.query( - "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'" - ) - if unknown.shape[0] > 0: - unknown_color = CASE_CONTROL_COLORS[3] # Gray - tax.scatter( - unknown[["EUR", "AFR", "ASN"]].values, - color=unknown_color, - label="Unknown", - **style_defaults - ) - # Add plot elements multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments tax.boundary(linewidth=0.5) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py new file mode 100644 index 00000000..2a0d7242 --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py @@ -0,0 +1,95 @@ +""" +plot_ancestry.py +---------------- + +This script plots the triangle plot of ancestries from GRAF ancestry +estimates. + +Output: + ``sample_level/ancestry.png`` + +""" +import os +from pathlib import Path +from typing import Optional + +import pandas as pd +import seaborn as sns +import ternary +import typer + +from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS +from cgr_gwas_qc.workflow.scripts import sample_qc_table + +app = typer.Typer(add_completion=False) + + +@app.command() +def main(sample_qc: Path, outfile: Path): + sample = load_sample_data(sample_qc) + plot(sample, outfile) + + +def load_sample_data(sample_qc: Path) -> pd.DataFrame: + return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"]) + + +def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): + sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + + # Create plots + style_defaults = dict(linewidth=0, alpha=0.8, s=8) + fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 + fig.set_size_inches(6, 5) + + # Plot cases and controls separately + case = sample.query("case_control == 'Case'") + if case.shape[0] > 0: + case_color = CASE_CONTROL_COLORS[0] + tax.scatter( + case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults + ) + + control = sample.query("case_control == 'Control'") + if control.shape[0] > 0: + control_color = CASE_CONTROL_COLORS[1] + tax.scatter( + control[["E(%)", "F(%)", "A(%)"]].values, + color=control_color, + label="Control", + **style_defaults + ) + + # Add plot elements + multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments + tax.boundary(linewidth=0.5) + tax.gridlines(multiple=multiple, color="gray") + tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f") + + # Set Axis labels + label_defaults = dict(fontsize=12, offset=0.14) + tax.left_axis_label("Asian", **label_defaults) + tax.right_axis_label("African", **label_defaults) + tax.bottom_axis_label("European", **label_defaults) + + # Add legend + tax.legend(title="case_control") + + # Clean-up plot + tax.set_background_color(color="white") + tax.clear_matplotlib_ticks() + tax.get_axes().axis("off") # removes outer square axes + + # Save if given an outfile + if outfile: + tax.savefig(outfile) + + +if __name__ == "__main__": + if "snakemake" in locals(): + defaults = {} + defaults.update({"sample_qc": Path(snakemake.input[0])}) # type: ignore # noqa + defaults.update({"outfile": Path(snakemake.output[0])}) # type: ignore # noqa + main(**defaults) + else: + app() diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py index caf3dda5..72eacc11 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py @@ -31,6 +31,7 @@ @app.command() def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path): + df = ( read_het(het) .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left") @@ -49,20 +50,13 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: def plot(df: pd.DataFrame, population: str, threshold: float): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - fig, ax = plt.subplots(figsize=(6, 6)) sns.scatterplot( x="x_label", y="F", data=df, hue="case_control", - palette=CASE_CONTROL_LABEL_COLORS, + palette=COLORS, ax=ax, alpha=0.8, linewidth=0, @@ -73,7 +67,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float): ax.set_xlabel("Subjects sorted by F") ax.set_ylabel("F") ax.set_ylim(_get_ylim(df.F, threshold)) - ax.set_title(f"{population} Heterozygosity F Coefficient") + ax.set_title(f"{population} Homozygosity F Coefficient") # Move legend plt.legend(loc="upper left") diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py index 3fe23a48..c248e8e0 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py @@ -88,16 +88,9 @@ def plot_panel( ) # Set basic defaults so I don't have to repeat myself - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - style_defaults = dict(linewidth=0, alpha=0.8, s=5) sample_defaults = { - **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample), + **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample), **style_defaults, } snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults} diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py index ecfa444e..97c4b30a 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py @@ -18,11 +18,10 @@ import seaborn as sns import typer -from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS -from cgr_gwas_qc.workflow.scripts import sample_qc_table - # import snakemake +from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS +from cgr_gwas_qc.workflow.scripts import sample_qc_table app = typer.Typer(add_completion=False) @@ -31,7 +30,7 @@ def main(sample_qc: Path, outfile: Path, xchr: str): sample = load_sample_data(sample_qc) xchr = str(snakemake.params) # type: ignore # noqa - plot(sample, xchr, outfile) + plot(sample, outfile, xchr) """ @@ -68,23 +67,16 @@ def _update_categories(sr: pd.DataFrame): return sr -def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None): +def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - # Create plots style_defaults = dict(linewidth=0, alpha=0.8, s=2) defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample) fig, ax = plt.subplots(figsize=(6, 6)) sns.boxplot(ax=ax, showfliers=False, **defaults) sns.stripplot( - ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults + ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults ) # Make boxplot black and white @@ -95,13 +87,13 @@ def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None) # ax.set_xlabel("Reported Sex") ax.set_ylabel("ChrX Inbreeding Coeff") - xchr_bool = xchr.strip().lower() == "true" - print(type(xchr_bool), " ", xchr_bool) - if xchr_bool: - print("sex chr included", xchr_bool) + xchr = xchr.strip().lower() == "true" + print(type(xchr), " ", xchr) + if xchr: + print("sex chr included", xchr) ax.set_xlabel("Reported Sex") else: - print("No sex chromosome ", xchr_bool) + print("No sex chromosome ", xchr) ax.set_xlabel("No sex chromosome \nSkipping sex condordace") # Add line at 0.5 diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py index 60b5f2ba..868d79b9 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py @@ -50,14 +50,7 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path): def plot(df: pd.DataFrame, population: str) -> sns.PairGrid: sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - - g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True) + g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True) g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0) g.map_diag(sns.kdeplot) g.add_legend( diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index c121fb3a..ee39a2e5 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,9 +192,7 @@ def main( ) add_qc_columns( - sample_qc, - remove_contam, - remove_rep_discordant, + sample_qc, remove_contam, remove_rep_discordant, ) save(sample_qc, outfile) @@ -322,9 +320,12 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame: .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/ """ + return ( pd.read_csv(file_name, sep="\t") - .rename({"Subject": "Sample_ID"}, axis=1) + .assign( + Sample_ID=lambda x: x["Subject"].astype(str) + ) # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_")) .assign(AFR=lambda x: x["P_f (%)"] / 100) .assign(EUR=lambda x: x["P_e (%)"] / 100) @@ -401,8 +402,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, - columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -445,16 +445,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, - remove_contam: bool, - remove_rep_discordant: bool, + sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, - remove_contam, - remove_rep_discordant, + sample_qc, remove_contam, remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -500,9 +496,7 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, - remove_contam: bool, - remove_rep_discordant: bool, + sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions. diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk index c5bd18ae..3c0cd1ff 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk @@ -2,6 +2,10 @@ from cgr_gwas_qc import load_config cfg = load_config() +PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250} +BIG_TIME = {1: 8, 2: 24, 3: 48} + + use_contamination = ( cfg.config.user_files.idat_pattern and cfg.config.user_files.gtc_pattern @@ -11,8 +15,6 @@ use_contamination = ( localrules: all_sample_qc, - sample_concordance_plink, - sample_concordance_summary, split_sample_concordance, snp_qc_table, sample_level_sexcheck, @@ -63,17 +65,14 @@ module thousand_genomes: config: {} - module plink: snakefile: cfg.modules("plink") - module graf: snakefile: cfg.modules("graf_v2.4") - module grafpop: snakefile: cfg.modules("grafpop") @@ -317,6 +316,10 @@ rule sample_concordance_plink: pi_hat_threshold=cfg.config.software_params.pi_hat_threshold, output: "sample_level/concordance/plink.csv", + resources: + mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], + time_hr=lambda wildcards, attempt: BIG_TIME[attempt], + script: "../scripts/concordance_table.py" @@ -383,8 +386,9 @@ rule sample_concordance_summary: output: "sample_level/concordance/summary.csv", resources: - mem_mb=lambda wildcards, attempt: 1024 * 2 * attempt, - time_hr=lambda wildcards, attempt: 2 * attempt, + mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], + time_hr=lambda wildcards, attempt: BIG_TIME[attempt], + script: "../scripts/sample_concordance.py" @@ -409,9 +413,9 @@ use rule grafpop_populations from grafpop as graf_populations with: bed=rules.snp_call_rate_filter_2.output.bed, bim=rules.snp_call_rate_filter_2.output.bim, fam=rules.snp_call_rate_filter_2.output.fam, - # bed="sample_level/call_rate_2/samples.bed", - # bim="sample_level/call_rate_2/samples.bim", - # fam="sample_level/call_rate_2/samples.fam", + #bed="sample_level/call_rate_2/samples.bed", + #bim="sample_level/call_rate_2/samples.bim", + #fam="sample_level/call_rate_2/samples.fam", output: "sample_level/ancestry/grafpop_populations.txt", log: @@ -450,7 +454,6 @@ rule snp_qc_table: sex_chr_included = cfg.config.workflow_params.sex_chr_included if sex_chr_included: print("sex_chr_included ", sex_chr_included) - use rule sexcheck from plink as sample_level_sexcheck with: input: bed=rules.snp_call_rate_filter_1.output.bed, @@ -460,10 +463,8 @@ if sex_chr_included: out_prefix="sample_level/call_rate_1/samples", output: "sample_level/call_rate_1/samples.sexcheck", - else: print("sex_chr_included ", sex_chr_included) - rule sample_level_sexcheck: input: bed=rules.snp_call_rate_filter_1.output.bed, @@ -477,7 +478,6 @@ else: echo "FID IID PEDSEX SNPSEX STATUS F" >> sample_level/call_rate_1/samples.sexcheck """ - def _contam(wildcards): uf, wf = cfg.config.user_files, cfg.config.workflow_params if use_contamination: diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk index 0ddf5cd0..397ce871 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk @@ -8,6 +8,9 @@ import shutil cfg = load_config() +PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250} +BIG_TIME = {1: 8, 2: 24, 3: 48} + localrules: all_subject_qc, @@ -274,6 +277,9 @@ use rule genome from plink as population_level_ibd with: out_prefix="subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd", output: "subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd.genome", + resources: + mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], + time_hr=lambda wildcards, attempt: BIG_TIME[attempt], rule population_level_concordance_plink: