Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add patient or environmental mode flag #420

Merged
merged 12 commits into from
Jan 3, 2022
3 changes: 3 additions & 0 deletions .tests/config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
pepfile: config/pep/config.yaml

# execution mode. Can be either "patient" or "environment"
mode: patient

# for testing, uncomment to limit to n strain genomes
testing:
limit-strain-genomes: 5
Expand Down
3 changes: 3 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
pepfile: config/pep/config.yaml

# execution mode. Can be either "patient" or "environment"
mode: patient

# virus genome to use as reference. Must be a NCBI accession
virus-reference-genome:
- NC_045512.2
Expand Down
15 changes: 15 additions & 0 deletions docs/user-guide/configuration.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
# Configuration

## Execution Mode

Accepted values: `patient`, `environment`. Defaults to `patient`.

Defines the execution mode of UnCoVar.

When the mode is set to `patient`, the sample is assumed come be from a single
host organism and contains only one strain of SARS-CoV-2. The parts of the
workflow for reconstructing the SARS-CoV-2 strain genome are activated.

If the mode is set to `environment`, the sample is assumed to be from the
environment (e.g. wastewater) and to contain different SARS-CoV-2 strains.
The parts of the workflow responsible for creating and analysing individual
genomes (e.g. assembly, lineage calling via Pangolin) are disabled.

## Adapters

There are three ways to transfer adapter sequences to UnCoVar to remove them
Expand Down
20 changes: 1 addition & 19 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,7 @@ rule save_latest_run:

checkpoint all:
input:
expand(
[
"results/{date}/qc/multiqc.html",
"results/reports/{date}.zip",
"results/{date}/tables/assembly_comparison.tsv",
],
date=get_all_run_dates(),
),
expand(
"results/{date}/plots/all.{mode}-strain.strains.kallisto.svg",
date=get_all_run_dates(),
mode=["major", "any"],
),
zip_expand(
"results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf",
zip_wildcard_1=get_dates(),
zip_wildcard_2=get_samples(),
expand_wildcard=config["variant-calling"]["filters"],
),
get_input_by_mode,
output:
touch(
expand(
Expand Down
42 changes: 42 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -1543,6 +1543,48 @@ def get_aggregated_pangolin_calls(wildcards, return_list="paths"):
return expanded_patterns


def get_checked_mode():
mode = config["mode"]
if mode == "patient" or mode == "environment":
return mode

raise TypeError(f'Mode {mode} not recognized. Can be "patient" or "environment".')


def get_input_by_mode(wildcard):
paths = [
expand(
"results/{mode}-reports/{date}.zip",
mode=config["mode"],
date=get_all_run_dates(),
),
expand(
"results/{date}/plots/all.{mode}-strain.strains.kallisto.svg",
date=get_all_run_dates(),
mode=["major", "any"],
),
zip_expand(
"results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf",
zip_wildcard_1=get_dates(),
zip_wildcard_2=get_samples(),
expand_wildcard=config["variant-calling"]["filters"],
),
]

if config["mode"] == "patient":
paths += [
expand(
[
"results/{date}/qc/multiqc.html",
"results/{date}/tables/assembly_comparison.tsv",
],
date=get_all_run_dates(),
),
]

return sum(paths, [])


def get_pangolin_for_report(wildcards):
paths = []

Expand Down
69 changes: 60 additions & 9 deletions workflow/rules/generate_output.smk
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ rule high_quality_genomes_report:
"../scripts/generate-high-quality-report.py"


rule overview_table_csv:
rule overview_table_patient_csv:
input:
reads_raw=get_raw_reads_counts,
reads_trimmed=get_trimmed_reads_counts,
Expand All @@ -142,27 +142,49 @@ rule overview_table_csv:
# as input file for the rule. Please add the output of the respective checkpoint to the rule inputs.
_=get_checkpoints_for_overview_table,
output:
qc_data="results/{date}/tables/overview.csv",
qc_data="results/{date}/tables/patient-overview.csv",
params:
assembly_used=lambda wildcards: get_assemblies_for_submission(
wildcards, "all samples"
),
mth=config.get("mth"),
samples=lambda wildcards: get_samples_for_date(wildcards.date),
mode=config["mode"],
log:
"logs/{date}/overview-table.log",
"logs/{date}/patient-overview-table.log",
conda:
"../envs/pysam.yaml"
script:
"../scripts/generate-overview-table.py"


use rule overview_table_patient_csv as overview_table_environment_csv with:
input:
reads_raw=get_raw_reads_counts,
reads_trimmed=get_trimmed_reads_counts,
kraken=get_kraken_output,
reads_used_for_assembly=expand_samples_for_date(
"results/{{date}}/tables/read_pair_counts/{sample}.txt",
),
bcf=expand_samples_for_date(
"results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
),
output:
qc_data="results/{date}/tables/environment-overview.csv",
params:
mth=config.get("mth"),
samples=lambda wildcards: get_samples_for_date(wildcards.date),
mode=config["mode"],
log:
"logs/{date}/environment-overview-table.log",


rule overview_table_html:
input:
"results/{date}/tables/overview.csv",
"results/{date}/tables/{execution_mode}-overview.csv",
output:
report(
directory("results/{date}/overview/"),
directory("results/{date}/{execution_mode}/overview/"),
htmlindex="index.html",
caption="../report/qc-report.rst",
category="1. Overview",
Expand All @@ -172,7 +194,7 @@ rule overview_table_html:
formatter=get_resource("report-table-formatter.js"),
pin_until="Sample",
log:
"logs/{date}/qc_report_html.log",
"logs/{date}/{execution_mode}-qc-report-html.log",
conda:
"../envs/rbt.yaml"
shell:
Expand Down Expand Up @@ -323,10 +345,13 @@ rule pangolin_call_overview_html:
"rbt csv-report {input} --pin-until {params.pin_until} {output} > {log} 2>&1"


rule snakemake_reports:
rule snakemake_reports_patient:
input:
# 1. Overview
"results/{date}/overview/",
expand(
"results/{{date}}/{execution_mode}/overview/",
execution_mode=get_checked_mode(),
),
"results/{date}/plots/lineages-over-time.svg",
expand(
"results/{{date}}/tables/variants-{ORFNAME}-over-time.csv",
Expand Down Expand Up @@ -368,7 +393,7 @@ rule snakemake_reports:
"results/high-quality-genomes/{date}.fasta",
"results/high-quality-genomes/{date}.csv",
output:
"results/reports/{date}.zip",
"results/patient-reports/{date}.zip",
params:
for_testing=get_if_testing("--snakefile ../workflow/Snakefile"),
conda:
Expand All @@ -379,3 +404,29 @@ rule snakemake_reports:
"snakemake --nolock --report-stylesheet resources/custom-stylesheet.css {input} "
"--report {output} {params.for_testing} "
"> {log} 2>&1"


use rule snakemake_reports_patient as snakemake_reports_environment with:
input:
# 1. Overview
expand(
"results/{{date}}/{execution_mode}/overview/",
execution_mode=get_checked_mode(),
),
"results/{date}/plots/all.major-strain.strains.kallisto.svg",
expand_samples_for_date(
["results/{{date}}/plots/strain-calls/{sample}.strains.kallisto.svg"]
),
# 2. Variant Call Details
lambda wildcards: expand(
"results/{{date}}/vcf-report/{target}.{filter}",
target=get_samples_for_date(wildcards.date) + ["all"],
filter=config["variant-calling"]["filters"],
),
# 3. Sequencing Details
"results/{date}/qc/laboratory/multiqc.html",
"results/{date}/plots/coverage-reference-genome.svg",
output:
"results/environment-reports/{date}.zip",
log:
"logs/snakemake_reports/{date}.log",
133 changes: 72 additions & 61 deletions workflow/scripts/generate-overview-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ def iter_with_samples(inputfiles):
return zip(snakemake.params.samples, inputfiles)


def is_patient_report():
return snakemake.params.mode == "patient"


data = pd.DataFrame(index=snakemake.params.samples)


Expand Down Expand Up @@ -104,63 +108,65 @@ def register_contig_lengths(assemblies, name):
data.loc[sample, name] = max(len(contig.sequence) for contig in infile)


# add lengths of Initial contigs
register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)")

# add lengths of polished contigs
register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)")

# add lengths of pseudo assembly
register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)")

# add lengths of Consensus assembly
register_contig_lengths(snakemake.input.consensus_contigs, "Consensus Sequence (bp)")


# add type of assembly use:
for ele in snakemake.params.assembly_used:
sample, used = ele.split(",")
if "pseudo" == used:
data.loc[sample, "Best Quality"] = "Pseudo"
elif "normal" == used:
data.loc[sample, "Best Quality"] = "De Novo"
elif "consensus" == used:
data.loc[sample, "Best Quality"] = "Consensus"
elif "not-accepted" == used:
data.loc[sample, "Best Quality"] = "-"

# add pangolin results
for sample, file in iter_with_samples(snakemake.input.pangolin):
pangolin_results = pd.read_csv(file)
assert (
pangolin_results.shape[0] == 1
), "unexpected number of rows (>1) in pangolin results"
lineage = pangolin_results.loc[0, "lineage"]
scorpio = pangolin_results.loc[0, "scorpio_call"]
if lineage == "None":
pangolin_call = "no strain called"
else:
# TODO parse scorpio output
# match = re.match(
# "((?P<varcount>\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)",
# pangolin_results.fillna("").loc[0, "note"].strip(),
# )
# assert (
# match is not None
# ), "unexpected pangolin note, please update above regular expression"
# varcount = match.group("varcount") or ""
# if varcount:
# varcount = f" ({varcount})"
# pangolin_call = f"{lineage}{varcount}"
pangolin_call = f"{lineage}"
data.loc[sample, "Pango Lineage"] = pangolin_call
if scorpio == "None":
scorpio_call = "-"
else:
scorpio_call = f"{scorpio}"
data.loc[sample, "WHO Label"] = scorpio_call
data["WHO Label"].fillna("-", inplace=True)
data["WHO Label"].replace({"nan": "-"}, inplace=True)
if is_patient_report():
# add lengths of Initial contigs
register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)")

# add lengths of polished contigs
register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)")

# add lengths of pseudo assembly
register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)")

# add lengths of Consensus assembly
register_contig_lengths(
snakemake.input.consensus_contigs, "Consensus Sequence (bp)"
)

# add type of assembly use:
for ele in snakemake.params.assembly_used:
sample, used = ele.split(",")
if "pseudo" == used:
data.loc[sample, "Best Quality"] = "Pseudo"
elif "normal" == used:
data.loc[sample, "Best Quality"] = "De Novo"
elif "consensus" == used:
data.loc[sample, "Best Quality"] = "Consensus"
elif "not-accepted" == used:
data.loc[sample, "Best Quality"] = "-"

# add pangolin results
for sample, file in iter_with_samples(snakemake.input.pangolin):
pangolin_results = pd.read_csv(file)
assert (
pangolin_results.shape[0] == 1
), "unexpected number of rows (>1) in pangolin results"
lineage = pangolin_results.loc[0, "lineage"]
scorpio = pangolin_results.loc[0, "scorpio_call"]
if lineage == "None":
pangolin_call = "no strain called"
else:
# TODO parse scorpio output
# match = re.match(
# "((?P<varcount>\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)",
# pangolin_results.fillna("").loc[0, "note"].strip(),
# )
# assert (
# match is not None
# ), "unexpected pangolin note, please update above regular expression"
# varcount = match.group("varcount") or ""
# if varcount:
# varcount = f" ({varcount})"
# pangolin_call = f"{lineage}{varcount}"
pangolin_call = f"{lineage}"
data.loc[sample, "Pango Lineage"] = pangolin_call
if scorpio == "None":
scorpio_call = "-"
else:
scorpio_call = f"{scorpio}"
data.loc[sample, "WHO Label"] = scorpio_call
data["WHO Label"].fillna("-", inplace=True)
data["WHO Label"].replace({"nan": "-"}, inplace=True)


# add variant calls
Expand Down Expand Up @@ -240,12 +246,17 @@ def fmt_variants(variants):
"Raw Reads (#)",
"Trimmed Reads (#)",
"Filtered Reads (#)",
"Largest Contig (bp)",
"De Novo Sequence (bp)",
"Pseudo Sequence (bp)",
"Consensus Sequence (bp)",
]

if is_patient_report():
int_cols += [
"Largest Contig (bp)",
"De Novo Sequence (bp)",
"Pseudo Sequence (bp)",
"Consensus Sequence (bp)",
]


data[int_cols] = data[int_cols].fillna("0").applymap(lambda x: "{0:,}".format(int(x)))
data = data.loc[:, (data != "0").any(axis=0)]
data.index.name = "Sample"
Expand Down