diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml index e0afa29c0..24aaa2d61 100644 --- a/.tests/config/config.yaml +++ b/.tests/config/config.yaml @@ -1,5 +1,8 @@ pepfile: config/pep/config.yaml +# execution mode. Can be either "patient" or "environment" +mode: patient + # for testing, uncomment to limit to n strain genomes testing: limit-strain-genomes: 5 diff --git a/config/config.yaml b/config/config.yaml index bdf2e9608..ae6c6d16e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,5 +1,8 @@ pepfile: config/pep/config.yaml +# execution mode. Can be either "patient" or "environment" +mode: patient + # virus genome to use as reference. Must be a NCBI accession virus-reference-genome: - NC_045512.2 diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md index 74c3ebd6b..de1c612f9 100644 --- a/docs/user-guide/configuration.md +++ b/docs/user-guide/configuration.md @@ -1,5 +1,20 @@ # Configuration +## Execution Mode + +Accepted values: `patient`, `environment`. Defaults to `patient`. + +Defines the execution mode of UnCoVar. + +When the mode is set to `patient`, the sample is assumed to be from a single +host organism and to contain only one strain of SARS-CoV-2. The parts of the +workflow for reconstructing the SARS-CoV-2 strain genome are activated. + +If the mode is set to `environment`, the sample is assumed to be from the +environment (e.g. wastewater) and to contain different SARS-CoV-2 strains. +The parts of the workflow responsible for creating and analysing individual +genomes (e.g. assembly, lineage calling via Pangolin) are disabled. 
+ ## Adapters There are three ways to transfer adapter sequences to UnCoVar to remove them diff --git a/workflow/Snakefile b/workflow/Snakefile index 11c4bdc1f..f9b5bf3c4 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -79,25 +79,7 @@ rule save_latest_run: checkpoint all: input: - expand( - [ - "results/{date}/qc/multiqc.html", - "results/reports/{date}.zip", - "results/{date}/tables/assembly_comparison.tsv", - ], - date=get_all_run_dates(), - ), - expand( - "results/{date}/plots/all.{mode}-strain.strains.kallisto.svg", - date=get_all_run_dates(), - mode=["major", "any"], - ), - zip_expand( - "results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf", - zip_wildcard_1=get_dates(), - zip_wildcard_2=get_samples(), - expand_wildcard=config["variant-calling"]["filters"], - ), + get_input_by_mode, output: touch( expand( diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4d2980fe9..8c8db7f7b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1543,6 +1543,48 @@ def get_aggregated_pangolin_calls(wildcards, return_list="paths"): return expanded_patterns +def get_checked_mode(): + mode = config["mode"] + if mode == "patient" or mode == "environment": + return mode + + raise TypeError(f'Mode {mode} not recognized. 
Can be "patient" or "environment".') + + +def get_input_by_mode(wildcard): + paths = [ + expand( + "results/{mode}-reports/{date}.zip", + mode=config["mode"], + date=get_all_run_dates(), + ), + expand( + "results/{date}/plots/all.{mode}-strain.strains.kallisto.svg", + date=get_all_run_dates(), + mode=["major", "any"], + ), + zip_expand( + "results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf", + zip_wildcard_1=get_dates(), + zip_wildcard_2=get_samples(), + expand_wildcard=config["variant-calling"]["filters"], + ), + ] + + if config["mode"] == "patient": + paths += [ + expand( + [ + "results/{date}/qc/multiqc.html", + "results/{date}/tables/assembly_comparison.tsv", + ], + date=get_all_run_dates(), + ), + ] + + return sum(paths, []) + + def get_pangolin_for_report(wildcards): paths = [] diff --git a/workflow/rules/generate_output.smk b/workflow/rules/generate_output.smk index 71e0260a2..1e9023d69 100644 --- a/workflow/rules/generate_output.smk +++ b/workflow/rules/generate_output.smk @@ -118,7 +118,7 @@ rule high_quality_genomes_report: "../scripts/generate-high-quality-report.py" -rule overview_table_csv: +rule overview_table_patient_csv: input: reads_raw=get_raw_reads_counts, reads_trimmed=get_trimmed_reads_counts, @@ -142,27 +142,49 @@ rule overview_table_csv: # as input file for the rule. Please add the output of the respective checkpoint to the rule inputs. 
_=get_checkpoints_for_overview_table, output: - qc_data="results/{date}/tables/overview.csv", + qc_data="results/{date}/tables/patient-overview.csv", params: assembly_used=lambda wildcards: get_assemblies_for_submission( wildcards, "all samples" ), mth=config.get("mth"), samples=lambda wildcards: get_samples_for_date(wildcards.date), + mode=config["mode"], log: - "logs/{date}/overview-table.log", + "logs/{date}/patient-overview-table.log", conda: "../envs/pysam.yaml" script: "../scripts/generate-overview-table.py" +use rule overview_table_patient_csv as overview_table_environment_csv with: + input: + reads_raw=get_raw_reads_counts, + reads_trimmed=get_trimmed_reads_counts, + kraken=get_kraken_output, + reads_used_for_assembly=expand_samples_for_date( + "results/{{date}}/tables/read_pair_counts/{sample}.txt", + ), + bcf=expand_samples_for_date( + "results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf", + ), + output: + qc_data="results/{date}/tables/environment-overview.csv", + params: + mth=config.get("mth"), + samples=lambda wildcards: get_samples_for_date(wildcards.date), + mode=config["mode"], + log: + "logs/{date}/environment-overview-table.log", + + rule overview_table_html: input: - "results/{date}/tables/overview.csv", + "results/{date}/tables/{execution_mode}-overview.csv", output: report( - directory("results/{date}/overview/"), + directory("results/{date}/{execution_mode}/overview/"), htmlindex="index.html", caption="../report/qc-report.rst", category="1. Overview", @@ -172,7 +194,7 @@ rule overview_table_html: formatter=get_resource("report-table-formatter.js"), pin_until="Sample", log: - "logs/{date}/qc_report_html.log", + "logs/{date}/{execution_mode}-qc-report-html.log", conda: "../envs/rbt.yaml" shell: @@ -323,10 +345,13 @@ rule pangolin_call_overview_html: "rbt csv-report {input} --pin-until {params.pin_until} {output} > {log} 2>&1" -rule snakemake_reports: +rule snakemake_reports_patient: input: # 1. 
Overview - "results/{date}/overview/", + expand( + "results/{{date}}/{execution_mode}/overview/", + execution_mode=get_checked_mode(), + ), "results/{date}/plots/lineages-over-time.svg", expand( "results/{{date}}/tables/variants-{ORFNAME}-over-time.csv", @@ -368,7 +393,7 @@ rule snakemake_reports: "results/high-quality-genomes/{date}.fasta", "results/high-quality-genomes/{date}.csv", output: - "results/reports/{date}.zip", + "results/patient-reports/{date}.zip", params: for_testing=get_if_testing("--snakefile ../workflow/Snakefile"), conda: @@ -379,3 +404,29 @@ rule snakemake_reports: "snakemake --nolock --report-stylesheet resources/custom-stylesheet.css {input} " "--report {output} {params.for_testing} " "> {log} 2>&1" + + +use rule snakemake_reports_patient as snakemake_reports_environment with: + input: + # 1. Overview + expand( + "results/{{date}}/{execution_mode}/overview/", + execution_mode=get_checked_mode(), + ), + "results/{date}/plots/all.major-strain.strains.kallisto.svg", + expand_samples_for_date( + ["results/{{date}}/plots/strain-calls/{sample}.strains.kallisto.svg"] + ), + # 2. Variant Call Details + lambda wildcards: expand( + "results/{{date}}/vcf-report/{target}.{filter}", + target=get_samples_for_date(wildcards.date) + ["all"], + filter=config["variant-calling"]["filters"], + ), + # 3. 
Sequencing Details + "results/{date}/qc/laboratory/multiqc.html", + "results/{date}/plots/coverage-reference-genome.svg", + output: + "results/environment-reports/{date}.zip", + log: + "logs/snakemake_reports/{date}.log", diff --git a/workflow/scripts/generate-overview-table.py b/workflow/scripts/generate-overview-table.py index 34d046843..28683493c 100644 --- a/workflow/scripts/generate-overview-table.py +++ b/workflow/scripts/generate-overview-table.py @@ -19,6 +19,10 @@ def iter_with_samples(inputfiles): return zip(snakemake.params.samples, inputfiles) +def is_patient_report(): + return snakemake.params.mode == "patient" + + data = pd.DataFrame(index=snakemake.params.samples) @@ -104,63 +108,65 @@ def register_contig_lengths(assemblies, name): data.loc[sample, name] = max(len(contig.sequence) for contig in infile) -# add lengths of Initial contigs -register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)") - -# add lengths of polished contigs -register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)") - -# add lengths of pseudo assembly -register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)") - -# add lengths of Consensus assembly -register_contig_lengths(snakemake.input.consensus_contigs, "Consensus Sequence (bp)") - - -# add type of assembly use: -for ele in snakemake.params.assembly_used: - sample, used = ele.split(",") - if "pseudo" == used: - data.loc[sample, "Best Quality"] = "Pseudo" - elif "normal" == used: - data.loc[sample, "Best Quality"] = "De Novo" - elif "consensus" == used: - data.loc[sample, "Best Quality"] = "Consensus" - elif "not-accepted" == used: - data.loc[sample, "Best Quality"] = "-" - -# add pangolin results -for sample, file in iter_with_samples(snakemake.input.pangolin): - pangolin_results = pd.read_csv(file) - assert ( - pangolin_results.shape[0] == 1 - ), "unexpected number of rows (>1) in pangolin results" - lineage = pangolin_results.loc[0, "lineage"] - scorpio = 
pangolin_results.loc[0, "scorpio_call"] - if lineage == "None": - pangolin_call = "no strain called" - else: - # TODO parse scorpio output - # match = re.match( - # "((?P\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)", - # pangolin_results.fillna("").loc[0, "note"].strip(), - # ) - # assert ( - # match is not None - # ), "unexpected pangolin note, please update above regular expression" - # varcount = match.group("varcount") or "" - # if varcount: - # varcount = f" ({varcount})" - # pangolin_call = f"{lineage}{varcount}" - pangolin_call = f"{lineage}" - data.loc[sample, "Pango Lineage"] = pangolin_call - if scorpio == "None": - scorpio_call = "-" - else: - scorpio_call = f"{scorpio}" - data.loc[sample, "WHO Label"] = scorpio_call - data["WHO Label"].fillna("-", inplace=True) - data["WHO Label"].replace({"nan": "-"}, inplace=True) +if is_patient_report(): + # add lengths of Initial contigs + register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)") + + # add lengths of polished contigs + register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)") + + # add lengths of pseudo assembly + register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)") + + # add lengths of Consensus assembly + register_contig_lengths( + snakemake.input.consensus_contigs, "Consensus Sequence (bp)" + ) + + # add type of assembly use: + for ele in snakemake.params.assembly_used: + sample, used = ele.split(",") + if "pseudo" == used: + data.loc[sample, "Best Quality"] = "Pseudo" + elif "normal" == used: + data.loc[sample, "Best Quality"] = "De Novo" + elif "consensus" == used: + data.loc[sample, "Best Quality"] = "Consensus" + elif "not-accepted" == used: + data.loc[sample, "Best Quality"] = "-" + + # add pangolin results + for sample, file in iter_with_samples(snakemake.input.pangolin): + pangolin_results = pd.read_csv(file) + assert ( + pangolin_results.shape[0] == 1 + ), "unexpected number of rows (>1) in pangolin results" + lineage = 
pangolin_results.loc[0, "lineage"] + scorpio = pangolin_results.loc[0, "scorpio_call"] + if lineage == "None": + pangolin_call = "no strain called" + else: + # TODO parse scorpio output + # match = re.match( + # "((?P\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)", + # pangolin_results.fillna("").loc[0, "note"].strip(), + # ) + # assert ( + # match is not None + # ), "unexpected pangolin note, please update above regular expression" + # varcount = match.group("varcount") or "" + # if varcount: + # varcount = f" ({varcount})" + # pangolin_call = f"{lineage}{varcount}" + pangolin_call = f"{lineage}" + data.loc[sample, "Pango Lineage"] = pangolin_call + if scorpio == "None": + scorpio_call = "-" + else: + scorpio_call = f"{scorpio}" + data.loc[sample, "WHO Label"] = scorpio_call + data["WHO Label"].fillna("-", inplace=True) + data["WHO Label"].replace({"nan": "-"}, inplace=True) # add variant calls @@ -240,12 +246,17 @@ def fmt_variants(variants): "Raw Reads (#)", "Trimmed Reads (#)", "Filtered Reads (#)", - "Largest Contig (bp)", - "De Novo Sequence (bp)", - "Pseudo Sequence (bp)", - "Consensus Sequence (bp)", ] +if is_patient_report(): + int_cols += [ + "Largest Contig (bp)", + "De Novo Sequence (bp)", + "Pseudo Sequence (bp)", + "Consensus Sequence (bp)", + ] + + data[int_cols] = data[int_cols].fillna("0").applymap(lambda x: "{0:,}".format(int(x))) data = data.loc[:, (data != "0").any(axis=0)] data.index.name = "Sample"