feat: add patient or environmental mode flag (#420)
* add new environment report

* add check

* add missing wildcards

* modify input of rule

* fix typo

* add docs

* fmt

* Update docs/user-guide/configuration.md

Co-authored-by: Alexander Thomas <[email protected]>

Co-authored-by: Alexander Thomas <[email protected]>
thomasbtf and alethomas authored Jan 3, 2022
1 parent 077b80d commit 9ac2002
Showing 7 changed files with 196 additions and 89 deletions.
3 changes: 3 additions & 0 deletions .tests/config/config.yaml
@@ -1,5 +1,8 @@
pepfile: config/pep/config.yaml

# execution mode. Can be either "patient" or "environment"
mode: patient

# for testing, uncomment to limit to n strain genomes
testing:
limit-strain-genomes: 5
3 changes: 3 additions & 0 deletions config/config.yaml
@@ -1,5 +1,8 @@
pepfile: config/pep/config.yaml

# execution mode. Can be either "patient" or "environment"
mode: patient

# virus genome to use as reference. Must be a NCBI accession
virus-reference-genome:
- NC_045512.2
15 changes: 15 additions & 0 deletions docs/user-guide/configuration.md
@@ -1,5 +1,20 @@
# Configuration

## Execution Mode

Accepted values: `patient`, `environment`. Defaults to `patient`.

Defines the execution mode of UnCoVar.

When the mode is set to `patient`, the sample is assumed to come from a single
host organism and to contain only one strain of SARS-CoV-2. The parts of the
workflow for reconstructing the SARS-CoV-2 strain genome are activated.

If the mode is set to `environment`, the sample is assumed to be from the
environment (e.g. wastewater) and to contain different SARS-CoV-2 strains.
The parts of the workflow responsible for creating and analysing individual
genomes (e.g. assembly, lineage calling via Pangolin) are disabled.

## Adapters

There are three ways to transfer adapter sequences to UnCoVar to remove them
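
For orientation, the Execution Mode section above maps directly onto a validation helper added to workflow/rules/common.smk in this commit (shown in full further down). A condensed, illustrative paraphrase, with an in-line config dict standing in for the parsed config/config.yaml:

    # Condensed paraphrase of the mode check added in this commit; illustrative only.
    config = {"mode": "patient"}  # stand-in for the value read from config/config.yaml

    def get_checked_mode():
        """Return the configured mode, failing early on anything unexpected."""
        mode = config["mode"]
        if mode in ("patient", "environment"):
            return mode
        raise TypeError(f'Mode {mode} not recognized. Can be "patient" or "environment".')

    print(get_checked_mode())  # -> "patient"; an unknown value aborts the run instead
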
20 changes: 1 addition & 19 deletions workflow/Snakefile
@@ -79,25 +79,7 @@ rule save_latest_run:

checkpoint all:
input:
expand(
[
"results/{date}/qc/multiqc.html",
"results/reports/{date}.zip",
"results/{date}/tables/assembly_comparison.tsv",
],
date=get_all_run_dates(),
),
expand(
"results/{date}/plots/all.{mode}-strain.strains.kallisto.svg",
date=get_all_run_dates(),
mode=["major", "any"],
),
zip_expand(
"results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf",
zip_wildcard_1=get_dates(),
zip_wildcard_2=get_samples(),
expand_wildcard=config["variant-calling"]["filters"],
),
get_input_by_mode,
output:
touch(
expand(
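
The change above turns the hard-coded target list of the `all` checkpoint into a call to get_input_by_mode, an ordinary Snakemake input function: it receives the wildcards object and returns the list of paths the run should produce, so mode-specific targets are requested only when the configured mode calls for them. A generic sketch of that pattern, with placeholder paths rather than the workflow's real ones:

    # Generic sketch of a mode-dependent input function; paths and dates are placeholders.
    def request_targets(wildcards):
        targets = ["results/patient-reports/2022-01-03.zip"]
        if config["mode"] == "patient":
            # patient-only extras, e.g. the MultiQC report
            targets.append("results/2022-01-03/qc/multiqc.html")
        return targets


    rule all:
        input:
            request_targets,
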
42 changes: 42 additions & 0 deletions workflow/rules/common.smk
@@ -1543,6 +1543,48 @@ def get_aggregated_pangolin_calls(wildcards, return_list="paths"):
return expanded_patterns


def get_checked_mode():
mode = config["mode"]
if mode == "patient" or mode == "environment":
return mode

raise TypeError(f'Mode {mode} not recognized. Can be "patient" or "environment".')


def get_input_by_mode(wildcard):
paths = [
expand(
"results/{mode}-reports/{date}.zip",
mode=config["mode"],
date=get_all_run_dates(),
),
expand(
"results/{date}/plots/all.{mode}-strain.strains.kallisto.svg",
date=get_all_run_dates(),
mode=["major", "any"],
),
zip_expand(
"results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf",
zip_wildcard_1=get_dates(),
zip_wildcard_2=get_samples(),
expand_wildcard=config["variant-calling"]["filters"],
),
]

if config["mode"] == "patient":
paths += [
expand(
[
"results/{date}/qc/multiqc.html",
"results/{date}/tables/assembly_comparison.tsv",
],
date=get_all_run_dates(),
),
]

return sum(paths, [])


def get_pangolin_for_report(wildcards):
paths = []

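
One small detail in get_input_by_mode above: expand() and zip_expand() each return a list, so paths is a list of lists, and sum(paths, []) flattens it into the single flat list of targets that Snakemake expects. A toy illustration with made-up paths:

    # Toy illustration of the sum(paths, []) flattening idiom; paths are made up.
    paths = [["reports/d1.zip", "reports/d2.zip"], ["d1/strains.svg"], ["d1/s1.bcf"]]
    flat = sum(paths, [])
    print(flat)  # ['reports/d1.zip', 'reports/d2.zip', 'd1/strains.svg', 'd1/s1.bcf']
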
69 changes: 60 additions & 9 deletions workflow/rules/generate_output.smk
@@ -118,7 +118,7 @@ rule high_quality_genomes_report:
"../scripts/generate-high-quality-report.py"


rule overview_table_csv:
rule overview_table_patient_csv:
input:
reads_raw=get_raw_reads_counts,
reads_trimmed=get_trimmed_reads_counts,
@@ -142,27 +142,49 @@ rule overview_table_csv:
# as input file for the rule. Please add the output of the respective checkpoint to the rule inputs.
_=get_checkpoints_for_overview_table,
output:
qc_data="results/{date}/tables/overview.csv",
qc_data="results/{date}/tables/patient-overview.csv",
params:
assembly_used=lambda wildcards: get_assemblies_for_submission(
wildcards, "all samples"
),
mth=config.get("mth"),
samples=lambda wildcards: get_samples_for_date(wildcards.date),
mode=config["mode"],
log:
"logs/{date}/overview-table.log",
"logs/{date}/patient-overview-table.log",
conda:
"../envs/pysam.yaml"
script:
"../scripts/generate-overview-table.py"


use rule overview_table_patient_csv as overview_table_environment_csv with:
input:
reads_raw=get_raw_reads_counts,
reads_trimmed=get_trimmed_reads_counts,
kraken=get_kraken_output,
reads_used_for_assembly=expand_samples_for_date(
"results/{{date}}/tables/read_pair_counts/{sample}.txt",
),
bcf=expand_samples_for_date(
"results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
),
output:
qc_data="results/{date}/tables/environment-overview.csv",
params:
mth=config.get("mth"),
samples=lambda wildcards: get_samples_for_date(wildcards.date),
mode=config["mode"],
log:
"logs/{date}/environment-overview-table.log",


rule overview_table_html:
input:
"results/{date}/tables/overview.csv",
"results/{date}/tables/{execution_mode}-overview.csv",
output:
report(
directory("results/{date}/overview/"),
directory("results/{date}/{execution_mode}/overview/"),
htmlindex="index.html",
caption="../report/qc-report.rst",
category="1. Overview",
@@ -172,7 +194,7 @@ rule overview_table_html:
formatter=get_resource("report-table-formatter.js"),
pin_until="Sample",
log:
"logs/{date}/qc_report_html.log",
"logs/{date}/{execution_mode}-qc-report-html.log",
conda:
"../envs/rbt.yaml"
shell:
@@ -323,10 +345,13 @@ rule pangolin_call_overview_html:
"rbt csv-report {input} --pin-until {params.pin_until} {output} > {log} 2>&1"


rule snakemake_reports:
rule snakemake_reports_patient:
input:
# 1. Overview
"results/{date}/overview/",
expand(
"results/{{date}}/{execution_mode}/overview/",
execution_mode=get_checked_mode(),
),
"results/{date}/plots/lineages-over-time.svg",
expand(
"results/{{date}}/tables/variants-{ORFNAME}-over-time.csv",
@@ -368,7 +393,7 @@ rule snakemake_reports:
"results/high-quality-genomes/{date}.fasta",
"results/high-quality-genomes/{date}.csv",
output:
"results/reports/{date}.zip",
"results/patient-reports/{date}.zip",
params:
for_testing=get_if_testing("--snakefile ../workflow/Snakefile"),
conda:
@@ -379,3 +404,29 @@
"snakemake --nolock --report-stylesheet resources/custom-stylesheet.css {input} "
"--report {output} {params.for_testing} "
"> {log} 2>&1"


use rule snakemake_reports_patient as snakemake_reports_environment with:
input:
# 1. Overview
expand(
"results/{{date}}/{execution_mode}/overview/",
execution_mode=get_checked_mode(),
),
"results/{date}/plots/all.major-strain.strains.kallisto.svg",
expand_samples_for_date(
["results/{{date}}/plots/strain-calls/{sample}.strains.kallisto.svg"]
),
# 2. Variant Call Details
lambda wildcards: expand(
"results/{{date}}/vcf-report/{target}.{filter}",
target=get_samples_for_date(wildcards.date) + ["all"],
filter=config["variant-calling"]["filters"],
),
# 3. Sequencing Details
"results/{date}/qc/laboratory/multiqc.html",
"results/{date}/plots/coverage-reference-genome.svg",
output:
"results/environment-reports/{date}.zip",
log:
"logs/snakemake_reports/{date}.log",
133 changes: 72 additions & 61 deletions workflow/scripts/generate-overview-table.py
@@ -19,6 +19,10 @@ def iter_with_samples(inputfiles):
return zip(snakemake.params.samples, inputfiles)


def is_patient_report():
return snakemake.params.mode == "patient"


data = pd.DataFrame(index=snakemake.params.samples)


@@ -104,63 +108,65 @@ def register_contig_lengths(assemblies, name):
data.loc[sample, name] = max(len(contig.sequence) for contig in infile)


# add lengths of Initial contigs
register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)")

# add lengths of polished contigs
register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)")

# add lengths of pseudo assembly
register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)")

# add lengths of Consensus assembly
register_contig_lengths(snakemake.input.consensus_contigs, "Consensus Sequence (bp)")


# add type of assembly use:
for ele in snakemake.params.assembly_used:
sample, used = ele.split(",")
if "pseudo" == used:
data.loc[sample, "Best Quality"] = "Pseudo"
elif "normal" == used:
data.loc[sample, "Best Quality"] = "De Novo"
elif "consensus" == used:
data.loc[sample, "Best Quality"] = "Consensus"
elif "not-accepted" == used:
data.loc[sample, "Best Quality"] = "-"

# add pangolin results
for sample, file in iter_with_samples(snakemake.input.pangolin):
pangolin_results = pd.read_csv(file)
assert (
pangolin_results.shape[0] == 1
), "unexpected number of rows (>1) in pangolin results"
lineage = pangolin_results.loc[0, "lineage"]
scorpio = pangolin_results.loc[0, "scorpio_call"]
if lineage == "None":
pangolin_call = "no strain called"
else:
# TODO parse scorpio output
# match = re.match(
# "((?P<varcount>\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)",
# pangolin_results.fillna("").loc[0, "note"].strip(),
# )
# assert (
# match is not None
# ), "unexpected pangolin note, please update above regular expression"
# varcount = match.group("varcount") or ""
# if varcount:
# varcount = f" ({varcount})"
# pangolin_call = f"{lineage}{varcount}"
pangolin_call = f"{lineage}"
data.loc[sample, "Pango Lineage"] = pangolin_call
if scorpio == "None":
scorpio_call = "-"
else:
scorpio_call = f"{scorpio}"
data.loc[sample, "WHO Label"] = scorpio_call
data["WHO Label"].fillna("-", inplace=True)
data["WHO Label"].replace({"nan": "-"}, inplace=True)
if is_patient_report():
# add lengths of Initial contigs
register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)")

# add lengths of polished contigs
register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)")

# add lengths of pseudo assembly
register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)")

# add lengths of Consensus assembly
register_contig_lengths(
snakemake.input.consensus_contigs, "Consensus Sequence (bp)"
)

# add type of assembly use:
for ele in snakemake.params.assembly_used:
sample, used = ele.split(",")
if "pseudo" == used:
data.loc[sample, "Best Quality"] = "Pseudo"
elif "normal" == used:
data.loc[sample, "Best Quality"] = "De Novo"
elif "consensus" == used:
data.loc[sample, "Best Quality"] = "Consensus"
elif "not-accepted" == used:
data.loc[sample, "Best Quality"] = "-"

# add pangolin results
for sample, file in iter_with_samples(snakemake.input.pangolin):
pangolin_results = pd.read_csv(file)
assert (
pangolin_results.shape[0] == 1
), "unexpected number of rows (>1) in pangolin results"
lineage = pangolin_results.loc[0, "lineage"]
scorpio = pangolin_results.loc[0, "scorpio_call"]
if lineage == "None":
pangolin_call = "no strain called"
else:
# TODO parse scorpio output
# match = re.match(
# "((?P<varcount>\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)",
# pangolin_results.fillna("").loc[0, "note"].strip(),
# )
# assert (
# match is not None
# ), "unexpected pangolin note, please update above regular expression"
# varcount = match.group("varcount") or ""
# if varcount:
# varcount = f" ({varcount})"
# pangolin_call = f"{lineage}{varcount}"
pangolin_call = f"{lineage}"
data.loc[sample, "Pango Lineage"] = pangolin_call
if scorpio == "None":
scorpio_call = "-"
else:
scorpio_call = f"{scorpio}"
data.loc[sample, "WHO Label"] = scorpio_call
data["WHO Label"].fillna("-", inplace=True)
data["WHO Label"].replace({"nan": "-"}, inplace=True)


# add variant calls
@@ -240,12 +246,17 @@ def fmt_variants(variants):
"Raw Reads (#)",
"Trimmed Reads (#)",
"Filtered Reads (#)",
"Largest Contig (bp)",
"De Novo Sequence (bp)",
"Pseudo Sequence (bp)",
"Consensus Sequence (bp)",
]

if is_patient_report():
int_cols += [
"Largest Contig (bp)",
"De Novo Sequence (bp)",
"Pseudo Sequence (bp)",
"Consensus Sequence (bp)",
]


data[int_cols] = data[int_cols].fillna("0").applymap(lambda x: "{0:,}".format(int(x)))
data = data.loc[:, (data != "0").any(axis=0)]
data.index.name = "Sample"
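
The closing lines of the script (visible at the end of this hunk) thousands-format the integer columns and then drop every column whose values are all "0", so empty metrics never clutter the final overview table. A toy pandas illustration of that idiom, with made-up data:

    import pandas as pd

    # Made-up data: the all-zero column should vanish from the formatted table.
    data = pd.DataFrame(
        {"Raw Reads (#)": [123456, 7890], "Filtered Reads (#)": [0, 0]},
        index=["sample1", "sample2"],
    )
    int_cols = ["Raw Reads (#)", "Filtered Reads (#)"]

    data[int_cols] = data[int_cols].fillna("0").applymap(lambda x: "{0:,}".format(int(x)))
    data = data.loc[:, (data != "0").any(axis=0)]  # keeps only columns with a non-zero entry
    print(data)  # "Raw Reads (#)" remains, rendered as "123,456" and "7,890"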