diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile index 37c8217da..330476306 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile @@ -306,3 +306,40 @@ rule somatic_targeted_seq_cnv_calling_sequenza_report: **wf.get_log_file("sequenza", "report"), wrapper: wf.wrapper_path("sequenza/report") + + +# Run PureCN ------------------------------------------------------ + + +rule somatic_targeted_seq_cnv_calling_purecn_coverage: + input: + unpack(wf.get_input_files("purecn", "coverage")), + output: + **wf.get_output_files("purecn", "coverage"), + threads: wf.get_resource("purecn", "coverage", "threads") + resources: + time=wf.get_resource("purecn", "coverage", "time"), + memory=wf.get_resource("purecn", "coverage", "memory"), + partition=wf.get_resource("purecn", "coverage", "partition"), + tmpdir=wf.get_resource("purecn", "coverage", "tmpdir"), + log: + **wf.get_log_file("purecn", "coverage"), + wrapper: + wf.wrapper_path("purecn/coverage") + + +rule somatic_targeted_seq_cnv_calling_purecn_run: + input: + unpack(wf.get_input_files("purecn", "run")), + output: + **wf.get_output_files("purecn", "run"), + threads: wf.get_resource("purecn", "run", "threads") + resources: + time=wf.get_resource("purecn", "run", "time"), + memory=wf.get_resource("purecn", "run", "memory"), + partition=wf.get_resource("purecn", "run", "partition"), + tmpdir=wf.get_resource("purecn", "run", "tmpdir"), + log: + **wf.get_log_file("purecn", "run"), + wrapper: + wf.wrapper_path("purecn/run") diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 484247adc..91d708894 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -192,6 +192,24 @@ features: EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75 prefix: '' nThread: 8 + purecn: + genome_name: "unknown" # Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3 + enrichment_kit_name: "unknown" # For filename only... + mappability: "" # GRCh38: /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw + reptiming: "" # Nothing for GRCh38 + seed: 1234567 + extra_commands: # Recommended extra arguments for PureCN, extra_arguments: [] to clear them all + model: betabin + "fun-segmentation": PSCBS + "post-optimize": "" # post-optimize is a flag + # A PureCN panel of normals is required, with the container, the intervals & the PON rds file + path_container: REQUIRED # ../panel_of_normals/work/containers/out/purecn.simg + path_intervals: REQUIRED # ../panel_of_normals/output/purecn/out/_.list + path_panel_of_normals: REQUIRED # ../panel_of_normals/output/bwa.purecn/out/bwa.purecn.panel_of_normals.rds + path_mapping_bias: REQUIRED # ../panel_of_normals/output/bwa.purecn/out/bwa.purecn.mapping_bias.rds + # IMPORTANT NOTE: Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true" + somatic_variant_caller: "mutect2" + path_somatic_variants: ../somatic_variant_calling_for_purecn cnvetti_on_target: path_target_regions: REQUIRED # REQUIRED cnvetti_off_target: @@ -237,6 +255,18 @@ def get_normal_lib_name(self, wildcards): pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name] return pair.normal_sample.dna_ngs_library.name + @staticmethod + @dictify + def _get_log_file_from_prefix(prefix): + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + class CnvettiStepPartBase(SomaticTargetedSeqCnvCallingStepPart): """Perform somatic targeted CNV calling using CNVetti; shared code. @@ -373,19 +403,13 @@ def check_config(self): "Path to target regions is missing for {}".format(self.name), ) - @dictify def _get_log_file(self, action): """Return path to log file for the given action""" # Validate action self._validate_action(action) name_pattern = self.name_pattern.format(action=action) - key_ext = ( - ("log", ".log"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - for key, ext in key_ext: - yield key, os.path.join("work", name_pattern, "log", name_pattern + ext) + prefix = os.path.join("work", name_pattern, "log", name_pattern) + return self._get_log_file_from_prefix(prefix) def get_resource_usage(self, action): """Get Resource Usage @@ -537,37 +561,128 @@ def get_params(self, action): def _get_params_report(self, wildcards): return wildcards["library_name"] - @dictify def get_log_file(self, action): """Return dict of log files.""" + # Validate action + self._validate_action(action) if action == "install": prefix = "work/R_packages/log/sequenza" elif action == "gcreference": prefix = "work/static_data/log/sequenza.{length}".format( length=self.config["sequenza"]["length"], ) - elif action == "run": - prefix = ( - "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.run" - ) - elif action == "report": - prefix = ( - "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.report" - ) else: - raise UnsupportedActionException( - "Action '{action}' is not supported. Valid options: {valid}".format( - action=action, valid=", ".join(self.actions) + name_pattern = "{mapper}.sequenza.{library_name}" + prefix = os.path.join("work", name_pattern, "log", name_pattern + "." + action) + return self._get_log_file_from_prefix(prefix) + + +class PureCNStepPart(SomaticTargetedSeqCnvCallingStepPart): + """Perform somatic targeted CNV calling using PureCN""" + + #: Step name + name = "purecn" + + #: Class available actions + actions = ("coverage", "run") + + resource_usage = { + "coverage": ResourceUsage( + threads=1, + time="04:00:00", # 4 hours + memory="24G", + ), + "run": ResourceUsage( + threads=4, + time="24:00:00", # 4 hours + memory="96G", + ), + } + + def get_input_files(self, action): + """Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + action_mapping = { + "coverage": self._get_input_files_coverage, + "run": self._get_input_files_run, + } + return action_mapping[action] + + @dictify + def _get_input_files_run(self, wildcards): + name_pattern = "{mapper}.purecn.{library_name}".format(**wildcards) + yield "tumor", os.path.join( + "work", + name_pattern, + "out", + "{mapper}.{library_name}_coverage_loess.txt.gz".format(**wildcards), + ) + name_pattern = "{mapper}.{caller}.{library_name}".format( + caller=self.config["purecn"]["somatic_variant_caller"], + **wildcards, + ) + base_path = os.path.join("output", name_pattern, "out", name_pattern + ".full.vcf.gz") + variant_calling = self.parent.sub_workflows["somatic_variants"] + yield "vcf", variant_calling(base_path) + + @dictify + def _get_input_files_coverage(self, wildcards): + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + name_pattern = "{mapper}.{library_name}".format(**wildcards) + base_path = os.path.join("output", name_pattern, "out", name_pattern) + yield "bam", ngs_mapping(base_path + ".bam") + yield "bai", ngs_mapping(base_path + ".bam.bai") + + def get_output_files(self, action): + """Return output paths, dependent on rule""" + # Validate action + self._validate_action(action) + name_pattern = "{mapper}.purecn.{library_name}" + prefix = os.path.join("work", name_pattern, "out", "{library_name}") + action_mapping = { + "coverage": { + "coverage": os.path.join( + "work", name_pattern, "out", "{mapper}.{library_name}_coverage_loess.txt.gz" ) - ) - key_ext = ( - ("log", ".log"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), + }, + "run": { + "segments": prefix + "_dnacopy.seg", + "ploidy": prefix + ".csv", + "pvalues": prefix + "_amplification_pvalues.csv", + "vcf": prefix + ".vcf.gz", + "vcf_tbi": prefix + ".vcf.gz.tbi", + "loh": prefix + "_loh.csv", + "segments_md5": prefix + "_dnacopy.seg.md5", + "ploidy_md5": prefix + ".csv.md5", + "pvalues_md5": prefix + "_amplification_pvalues.csv.md5", + "vcf_md5": prefix + ".vcf.gz.md5", + "vcf_tbi_md5": prefix + ".vcf.gz.tbi.md5", + "loh_md5": prefix + "_loh.csv.md5", + }, + } + return action_mapping[action] + + def get_log_file(self, action): + """Return dict of log files.""" + # Validate action + self._validate_action(action) + + name_pattern = "{mapper}.purecn.{library_name}" + prefix = os.path.join("work", name_pattern, "log", name_pattern + "." + action) + return self._get_log_file_from_prefix(prefix) + + def check_config(self): + if self.name not in self.config["tools"]: + return # skip check + self.parent.ensure_w_config( + ("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_panel_of_normals"), + "Path to the PureCN panel of normal is missing", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_mapping_bias"), + "Path to the PureCN mapping bias file is missing (created in the panel_of_normals step)", ) - for key, ext in key_ext: - yield key, prefix + ext - yield key + "_md5", prefix + ext + ".md5" class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): @@ -837,16 +952,7 @@ def get_log_file(self, action): "work/{{mapper}}.cnvkit.{{library_name}}/log/" "{{mapper}}.cnvkit.{action}.{{library_name}}" ).format(action=action) - key_ext = ( - ("log", ".log"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - log_files = {} - for key, ext in key_ext: - log_files[key] = prefix + ext - log_files[key + "_md5"] = prefix + ext + ".md5" - return log_files + return self._get_log_file_from_prefix(prefix) class CopywriterStepPart(SomaticTargetedSeqCnvCallingStepPart): @@ -975,29 +1081,18 @@ def check_config(self): "Path to target regions is missing", ) - @dictify - def _get_log_file(self, action): + def get_log_file(self, action): """Return path to log file for the given action""" # Validate action self._validate_action(action) - key_ext = ( - ("log", ".log"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - if action == "prepare": log_file = "work/copywriter.prepare/log/snakemake.log" - yield "log", log_file - yield "log_md5", log_file + ".md5" - elif action in ("call", "run"): - tpl = ( - "work/{mapper}.copywriter.{library_name}/log/{mapper}.copywriter.{library_name}." - + action - ) - for key, ext in key_ext: - yield key, tpl + ext + return {"log": log_file, "log_md5": log_file + ".md5"} + else: + name_pattern = "{mapper}.copywriter.{library_name}" + prefix = os.path.join("work", name_pattern, "log", name_pattern + "." + action) + return self._get_log_file_from_prefix(prefix) class SomaticTargetedSeqCnvCallingWorkflow(BaseStep): @@ -1035,11 +1130,18 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) CnvKitStepPart, CopywriterStepPart, SequenzaStepPart, + PureCNStepPart, LinkOutStepPart, ) ) # Initialize sub-workflows self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + if "purecn" in self.config["tools"]: + self.register_sub_workflow( + "somatic_variant_calling", + self.config["purecn"]["path_somatic_variants"], + "somatic_variants", + ) @listify def get_result_files(self): @@ -1047,6 +1149,7 @@ def get_result_files(self): tool_actions = { "cnvkit": ["fix", "postprocess", "report", "export"], "sequenza": ("run",), + "purecn": ("run",), "copywriter": ("call",), "cnvetti_on_target": ("coverage", "segment", "postprocess"), "cnvetti_off_target": ("coverage", "segment", "postprocess"), diff --git a/snappy_wrappers/wrappers/purecn/coverage/wrapper.py b/snappy_wrappers/wrappers/purecn/coverage/wrapper.py index bbdd029e5..20ae81288 100644 --- a/snappy_wrappers/wrappers/purecn/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/purecn/coverage/wrapper.py @@ -11,22 +11,45 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["purecn"] -shell.executable("/bin/bash") +if "container" in snakemake.input.keys() and snakemake.input.container: + container = snakemake.input.container +elif "path_container" in config.keys() and config["path_container"]: + container = config["path_container"] +else: + raise Exception("No path to PureCN container") + +if "intervals" in snakemake.input.keys() and snakemake.input.intervals: + intervals = snakemake.input.intervals +elif "path_intervals" in config.keys() and config["path_intervals"]: + intervals = config["path_intervals"] +else: + raise Exception("No path to PureCN intervals") # Prepare files and directories that must be accessible by the container +files_to_bind = { + "bam": snakemake.input.bam, +} +if "intervals" not in snakemake.input.keys(): + files_to_bind["intervals"] = intervals + +# Replace with full absolute paths +files_to_bind = {k: os.path.realpath(v) for k, v in files_to_bind.items()} +# Directories that mut be bound +dirs_to_bind = {k: os.path.dirname(v) for k, v in files_to_bind.items()} +# List of unique directories to bind: on cluster: -> from container: /bindings/d) +bound_dirs = {e[1]: e[0] for e in enumerate(list(set(dirs_to_bind.values())))} +# Binding command +bindings = " ".join(["-B {}:/bindings/d{}:ro".format(k, v) for k, v in bound_dirs.items()]) +# Path to files from the container bound_files = { - "bam": os.path.realpath(snakemake.input.bam), + k: "/bindings/d{}/{}".format(bound_dirs[dirs_to_bind[k]], os.path.basename(v)) + for k, v in files_to_bind.items() } -keys = list(bound_files.keys()) -bindings = [] -for i in range(len(keys)): - k = keys[i] - if bound_files[k]: - # Binding directory to /bindings/d - bindings.append(" -B {}:/bindings/d{}:ro".format(os.path.dirname(bound_files[k]), i)) - bound_files[k] = "/bindings/d{}/{}".format(i, os.path.basename(bound_files[k])) -bindings = " ".join(bindings) +if "intervals" in bound_files.keys(): + intervals = bound_files["intervals"] + +shell.executable("/bin/bash") shell( r""" @@ -58,10 +81,10 @@ --seed {config[seed]} \ --out-dir $(dirname {snakemake.output.coverage}) \ --bam {bound_files[bam]} \ - --intervals {snakemake.input.intervals} + --intervals {intervals} " mkdir -p $(dirname {snakemake.output.coverage}) -apptainer exec --home $PWD {bindings} {snakemake.input.container} $cmd +apptainer exec --home $PWD {bindings} {container} $cmd pushd $(dirname {snakemake.output.coverage}) md5sum $(basename {snakemake.output.coverage}) > $(basename {snakemake.output.coverage}).md5 diff --git a/snappy_wrappers/wrappers/purecn/environment.yaml b/snappy_wrappers/wrappers/purecn/environment.yaml index af82768c9..514a8220e 100644 --- a/snappy_wrappers/wrappers/purecn/environment.yaml +++ b/snappy_wrappers/wrappers/purecn/environment.yaml @@ -3,7 +3,3 @@ channels: - bioconda dependencies: - htslib - - bioconductor-org.Hs.eg.db - - bioconductor-purecn - - r-optparse - - r-pscbs diff --git a/snappy_wrappers/wrappers/purecn/run/environment.yaml b/snappy_wrappers/wrappers/purecn/run/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/purecn/run/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/purecn/run/wrapper.py b/snappy_wrappers/wrappers/purecn/run/wrapper.py new file mode 100644 index 000000000..0db05d037 --- /dev/null +++ b/snappy_wrappers/wrappers/purecn/run/wrapper.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- +"""CUBI+Snakemake wrapper code for computing CNV using PureCN +""" + +import os + +from snakemake import shell + +__author__ = "Eric Blanc " + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["purecn"] + +# WARNING- these extra commands cannot contain file paths +extra_commands = " ".join( + [ + " --{}={}".format(k, v) if v else " --{}".format(k) + for k, v in config["extra_commands"].items() + ] +) + +# List files that must be accessible from the container +files_to_bind = { + "vcf": snakemake.input.vcf, + "mapping_bias": config["path_mapping_bias"], + "normaldb": config["path_panel_of_normals"], + "intervals": config["path_intervals"], +} +if "snp_blacklist" in config.keys() and config["snp_blacklist"]: + files_to_bind["snp-blacklist"] = config["snp_blacklist"] +if "segments" in snakemake.input.keys() and snakemake.input.segments: + files_to_bind["seg-file"] = snakemake.input.segments +if "log2" in snakemake.input.keys() and snakemake.input.log2: + files_to_bind["log-ratio-file"] = snakemake.input.log2 + +# TODO: Put the following in a function (decide where...) +# Replace with full absolute paths +files_to_bind = {k: os.path.realpath(v) for k, v in files_to_bind.items()} +# Directories that mut be bound +dirs_to_bind = {k: os.path.dirname(v) for k, v in files_to_bind.items()} +# List of unique directories to bind: on cluster: -> from container: /bindings/d) +bound_dirs = {e[1]: e[0] for e in enumerate(list(set(dirs_to_bind.values())))} +# Binding command +bindings = " ".join(["-B {}:/bindings/d{}:ro".format(k, v) for k, v in bound_dirs.items()]) +# Path to files from the container +bound_files = { + k: "/bindings/d{}/{}".format(bound_dirs[dirs_to_bind[k]], os.path.basename(v)) + for k, v in files_to_bind.items() +} + +if "snp-blacklist" in bound_files.keys(): + extra_commands += " --snp-blacklist={}".format(bound_files["snp-blacklist"]) +if "seg-file" in bound_files.keys(): + extra_commands += " --seg-file={}".format(bound_files["seg-file"]) +if "log-ratio-file" in bound_files.keys(): + extra_commands += " --log-ratio-file={}".format(bound_files["log-ratio-file"]) + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Also pipe everything to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Setup auto-cleaned tmpdir +export tmpdir=$(mktemp -d) +trap "rm -rf $tmpdir" EXIT + +# Compute md5 checksum +md5() {{ + fn=$1 + d=$(dirname $fn) + f=$(basename $fn) + pushd $d 1> /dev/null 2>&1 + checksum=$(md5sum $f) + popd 1> /dev/null 2>&1 + echo "$checksum" +}} + +outdir=$(dirname {snakemake.output.segments}) +mkdir -p $outdir + +# Run PureCN with a panel of normals +cmd="/usr/local/bin/Rscript PureCN.R \ + --sampleid {snakemake.wildcards[library_name]} \ + --tumor {snakemake.input.tumor} \ + --vcf {bound_files[vcf]} \ + --mapping-bias-file {bound_files[mapping_bias]} \ + --normaldb {bound_files[normaldb]} \ + --intervals {bound_files[intervals]} \ + --genome {config[genome_name]} \ + --out $outdir --out-vcf --force \ + --seed {config[seed]} --parallel --cores={snakemake.threads} \ + {extra_commands} +" +apptainer exec --home $PWD {bindings} {config[path_container]} $cmd + +md5 {snakemake.output.segments} > {snakemake.output.segments_md5} +md5 {snakemake.output.ploidy} > {snakemake.output.ploidy_md5} +md5 {snakemake.output.pvalues} > {snakemake.output.pvalues_md5} +md5 {snakemake.output.vcf} > {snakemake.output.vcf_md5} +md5 {snakemake.output.vcf_tbi} > {snakemake.output.vcf_tbi_md5} +md5 {snakemake.output.loh} > {snakemake.output.loh_md5} +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index 527556a40..6704fbfcd 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -47,10 +47,16 @@ def minimal_config(): - cnvkit - copywriter - sequenza + - purecn cnvkit: path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + purecn: + path_container: /path/to/purecn/container + path_intervals: /path/to/interval/list + path_panel_of_normals: /path/to/purecn/pon + path_mapping_bias: /path/to/mapping/bias data_sets: first_batch: @@ -80,7 +86,10 @@ def somatic_targeted_seq_cnv_calling_workflow( patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here - dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variants": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + } # Construct the workflow object return SomaticTargetedSeqCnvCallingWorkflow( dummy_workflow, @@ -926,6 +935,91 @@ def test_sequenza_step_part_get_resource_usage_call(somatic_targeted_seq_cnv_cal assert actual == expected, msg_error +# Tests for PureCNStepPart ---------------------------------------------------------------------- + + +def test_purecn_step_part_get_input_files_coverage(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_input_files() - action 'coverage'""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = { + "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", + } + actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("purecn", "coverage")( + wildcards + ) + assert actual == expected + + +def test_purecn_step_part_get_output_files_coverage(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_output_files() - action 'coverage'""" + expected = { + "coverage": "work/{mapper}.purecn.{library_name}/out/{mapper}.{library_name}_coverage_loess.txt.gz" + } + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("purecn", "coverage") + assert actual == expected + + +def test_purecn_step_part_get_log_file_coverage(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_log_file() - action 'coverage'""" + base_name = "work/{mapper}.purecn.{library_name}/log/{mapper}.purecn.{library_name}.coverage" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("purecn", "coverage") + assert actual == expected + + +def test_purecn_step_part_get_input_files_run(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_input_files() - action 'run'""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = { + "tumor": "work/bwa.purecn.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1_coverage_loess.txt.gz", + "vcf": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.full.vcf.gz", + } + actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("purecn", "run")(wildcards) + assert actual == expected + + +def test_purecn_step_part_get_output_files_run(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_output_files() - action 'run'""" + base_name = "work/{mapper}.purecn.{library_name}/out/{library_name}" + expected = { + "segments": base_name + "_dnacopy.seg", + "ploidy": base_name + ".csv", + "pvalues": base_name + "_amplification_pvalues.csv", + "vcf": base_name + ".vcf.gz", + "vcf_tbi": base_name + ".vcf.gz.tbi", + "loh": base_name + "_loh.csv", + } + expected = {**expected, **{k + "_md5": v + ".md5" for k, v in expected.items()}} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("purecn", "run") + assert actual == expected + + +def test_purecn_step_part_get_log_file_run(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_log_file() - action 'run'""" + base_name = "work/{mapper}.purecn.{library_name}/log/{mapper}.purecn.{library_name}.run" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("purecn", "run") + assert actual == expected + + +def test_purecn_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): + """Tests PureCNStepPart.get_resource_usage()""" + # Define expected + expected_dicts = { + "coverage": {"threads": 1, "time": "04:00:00", "memory": "24G", "partition": "medium"}, + "run": {"threads": 4, "time": "24:00:00", "memory": "96G", "partition": "medium"}, + } + # Evaluate + for action, resources in expected_dicts.items(): + for resource, expected in resources.items(): + msg_error = f"Assertion error for resource '{resource}' in '{action}'." + actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( + "purecn", action, resource + ) + assert actual == expected, msg_error + + # Tests for SomaticTargetedSeqCnvCallingWorkflow -------------------------------------------------- @@ -938,6 +1032,7 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call "cnvkit", "copywriter", "link_out", + "purecn", "sequenza", ] actual = list(sorted(somatic_targeted_seq_cnv_calling_workflow.sub_steps.keys())) @@ -1050,6 +1145,21 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call "segments.txt.md5", ) ] + # purecn + tpl = "output/bwa.purecn.P00{i}-T{t}-DNA1-WGS1/out/P00{i}-T{t}-DNA1-WGS1{ext}{checksum}" + expected += [ + tpl.format(i=i, t=t, ext=ext, checksum=checksum) + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + ".csv", + ".vcf.gz", + ".vcf.gz.tbi", + "_dnacopy.seg", + "_amplification_pvalues.csv", + "_loh.csv", + ) + for checksum in ("", ".md5") + ] # sequenza tpl = ( "output/bwa.sequenza.P00{i}-T{t}-DNA1-WGS1/out/"