From bc4c55b2b095a9a8c792cf937efd6fda87d8e09e Mon Sep 17 00:00:00 2001 From: ericblanc20 Date: Mon, 23 Jan 2023 09:44:02 +0100 Subject: [PATCH] feat!: 237 revive cnvkit (#339) feat!: revives cnvkit implementation * Reference creation moved to panel_of_normals step * Added support for creating a reference based on whole genome data * Support for Copy Number Alterations using cnvkit for somatic_targeted_seq_cnv_calling & somatic_wgs_cnv_calling * Closes #236 & closes #339 BREAKING CHANGE: plot generation for the control_freec tool in the somatic_wgs_cnv_calling step is broken, and sample selection for the mutect2 panel_of_normals generation can now be based on a user-generated file. --- docs/step/panel_of_normals.rst | 7 + .../workflows/panel_of_normals/Snakefile | 114 ++++ .../workflows/panel_of_normals/__init__.py | 491 +++++++++++++++--- .../__init__.py | 377 +++++++------- .../cnvkit.rules | 72 +-- .../somatic_wgs_cnv_calling/Snakefile | 16 +- .../somatic_wgs_cnv_calling/__init__.py | 367 ++++++++++++- .../somatic_wgs_cnv_calling/cnvkit.rules | 110 ++++ .../wrappers/cnvkit/access/wrapper.py | 26 +- .../wrappers/cnvkit/antitarget/wrapper.py | 41 +- .../wrappers/cnvkit/call/wrapper.py | 40 +- .../wrappers/cnvkit/coverage/wrapper.py | 68 ++- .../wrappers/cnvkit/environment.yaml | 2 +- .../wrappers/cnvkit/export/wrapper.py | 12 +- .../wrappers/cnvkit/fix/wrapper.py | 36 +- .../wrappers/cnvkit/plot/wrapper.py | 141 +++-- .../wrappers/cnvkit/reference/wrapper.py | 63 ++- .../wrappers/cnvkit/report/wrapper.py | 204 ++++++-- .../wrappers/cnvkit/segment/wrapper.py | 32 +- .../wrappers/cnvkit/target/wrapper.py | 78 ++- .../test_workflows_panel_of_normals.py | 369 ++++++++++++- .../test_workflows_panel_of_normals_wgs.py | 380 ++++++++++++++ ...kflows_somatic_targeted_seq_cnv_calling.py | 372 ++++--------- .../test_workflows_somatic_wgs_cnv_calling.py | 381 ++++++++++++-- 24 files changed, 2945 insertions(+), 854 deletions(-) create mode 100644 docs/step/panel_of_normals.rst create mode 100644 snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules create mode 100644 tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py diff --git a/docs/step/panel_of_normals.rst b/docs/step/panel_of_normals.rst new file mode 100644 index 000000000..9e95a62e8 --- /dev/null +++ b/docs/step/panel_of_normals.rst @@ -0,0 +1,7 @@ +.. _step_panel_of_normals: +========================================================== +Creation of panel of normals for somatic SNV & CNV calling +========================================================== +.. 
automodule:: snappy_pipeline.workflows.panel_of_normals diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index c944deec5..427ed25d0 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -97,3 +97,117 @@ rule panel_of_normals_mutect2_create_panel: **wf.get_log_file("mutect2", "create_panel"), wrapper: wf.wrapper_path("mutect2/create_panel") + + +# Panel of normals (cnvkit) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Write out access file (if required, must be run prior to the cnvkit panel of normals) + + +rule panel_of_normals_access_run: + output: + **wf.get_output_files("access", "run"), + resources: + time=wf.get_resource("access", "run", "time"), + memory=wf.get_resource("access", "run", "memory"), + partition=wf.get_resource("access", "run", "partition"), + log: + **wf.get_log_file("access", "run"), + wrapper: + wf.wrapper_path("cnvkit/access") + + +# Write out the normals-only results for the normals -------------------------- + + +rule panel_of_normals_cnvkit_target: + input: + unpack(wf.get_input_files("cnvkit", "target")), + output: + **wf.get_output_files("cnvkit", "target"), + threads: wf.get_resource("cnvkit", "target", "threads") + resources: + time=wf.get_resource("cnvkit", "target", "time"), + memory=wf.get_resource("cnvkit", "target", "memory"), + partition=wf.get_resource("cnvkit", "target", "partition"), + log: + **wf.get_log_file("cnvkit", "target"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "target"), + wrapper: + wf.wrapper_path("cnvkit/target") + + +rule panel_of_normals_cnvkit_antitarget: + input: + unpack(wf.get_input_files("cnvkit", "antitarget")), + output: + **wf.get_output_files("cnvkit", "antitarget"), + threads: wf.get_resource("cnvkit", "antitarget", "threads") + resources: + time=wf.get_resource("cnvkit", "antitarget", "time"), + memory=wf.get_resource("cnvkit", "antitarget", "memory"), + partition=wf.get_resource("cnvkit", "antitarget", "partition"), + log: + **wf.get_log_file("cnvkit", "antitarget"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "antitarget"), + wrapper: + wf.wrapper_path("cnvkit/antitarget") + + +rule panel_of_normals_cnvkit_coverage: + input: + unpack(wf.get_input_files("cnvkit", "coverage")), + output: + **wf.get_output_files("cnvkit", "coverage"), + threads: wf.get_resource("cnvkit", "coverage", "threads") + resources: + time=wf.get_resource("cnvkit", "coverage", "time"), + memory=wf.get_resource("cnvkit", "coverage", "memory"), + partition=wf.get_resource("cnvkit", "coverage", "partition"), + log: + **wf.get_log_file("cnvkit", "coverage"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "coverage"), + wrapper: + wf.wrapper_path("cnvkit/coverage") + + +# Create the panel of normals ------------------------------------------------- + + +rule panel_of_normals_cnvkit_create_panel: + input: + unpack(wf.get_input_files("cnvkit", "create_panel")), + output: + **wf.get_output_files("cnvkit", "create_panel"), + threads: wf.get_resource("cnvkit", "create_panel", "threads") + resources: + time=wf.get_resource("cnvkit", "create_panel", "time"), + memory=wf.get_resource("cnvkit", "create_panel", "memory"), + partition=wf.get_resource("cnvkit", "create_panel", "partition"), + log: + **wf.get_log_file("cnvkit", "create_panel"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "create_panel"), + wrapper: + 
wf.wrapper_path("cnvkit/reference") + + +rule panel_of_normals_cnvkit_report: + input: + unpack(wf.get_input_files("cnvkit", "report")), + output: + **wf.get_output_files("cnvkit", "report"), + threads: wf.get_resource("cnvkit", "report", "threads") + resources: + time=wf.get_resource("cnvkit", "report", "time"), + memory=wf.get_resource("cnvkit", "report", "memory"), + partition=wf.get_resource("cnvkit", "report", "partition"), + log: + **wf.get_log_file("cnvkit", "report"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "report"), + wrapper: + wf.wrapper_path("cnvkit/report") diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index a06ffa832..714d28d3e 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -16,6 +16,11 @@ The somatic variant calling step uses Snakemake sub workflows for using the result of the ``ngs_mapping`` step. +By default, all normal DNA samples in the ``ngs_mapping`` step are using to create the panel of normals. +However, the user can select of subset of those using the ``path_normals_list`` configuration option +(which can be different for the different tools). +In this case, the libraries listed in the file will be used, **even if they are not flagged as corresponding to normal samples**. + =========== Step Output =========== @@ -28,6 +33,26 @@ vcf files for each normal) are kept in the ``work`` directory. This enables the augmentation of the panel by new files when they become available. +================================ +Notes on the ``cnvkit`` workflow +================================ + +``cnvkit`` is a set of tools originally designed to call somatic copy number alterations from exome data. +Its design is modular, which enables its use for whole genome and amplicon data. + +``cnvkit`` provides a tool to encapsulate common practice workflows (``batch``), depending on the type of data, and on the availability of optional inputs. +The current implementation recapitulates the common practice, while still dispaching computations on multiple cluster nodes. + +For exome and whole genome data, the ``cnvkit`` `documentation `_ +recommends the creation of a panel of normal (called ``reference``). +The actual workflow to generate this reference is slightly different between exome and whole genome data, +and also changes depending whether an accessibility file is provided by the user or not. + +Therefore, the ``cnvkit`` tool to generate such accessibility file is implemented as a separate tool. +If a user wants to create this accessibility file with ``cnvkit`` tools, then she must first run the ``access`` tool. +Only after it has been created can she use it to generate the panel of normals. +For that, she will need to modify the configuration file, adding ``cnvkit`` in the list of tools, and setting the ``access`` parameter to the output of the ``access`` tool. + ===================== Default Configuration ===================== @@ -41,18 +66,32 @@ ===================================== - Panel of normal for ``mutect2`` somatic variant caller +- Panel of normal for ``cvnkit`` somatic Copy Number Alterations caller + +``access`` is used to create a genome accessibility file that can be used for ``cnvkit`` panel of normals creation. +Its output (``output/cnvkit.access/out/cnvkit.access.bed``) is optional, but its presence impacts of the way the target and antitarget regions are computed in whole genome mode. 
+ +In a nutshell, for exome data, the accessibility file is only used to create antitarget regions. +For genome data, it is used by the ``autobin`` tool to compute the average target size used during target regions creation. +If it is present, the target size is computed in amplicon mode, and when it is absent, +an accessibility file is created with default settings, which value is used by ``autobin`` is whole genome mode. + +This follows the internal ``batch`` code of ``cnvkit``. ======= Reports ======= -Currently, no reports are generated. -""" +Report tables can be found in the ``output/{mapper}.cnvkit/report`` directory. +Two tables are produced, grouping results for all normal samples together: -import os +- ``metrics.txt``: coverage metrics over target and antitarget regions. +- ``sex.txt``: prediction of the donor's gender based on the coverage of chromosome X & Y target and antitarget regions. + +The cnvkit authors recommend to check these reports to ensure that all data is suitable for panel of normal creation. +""" from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions -from snakemake.io import expand from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -65,14 +104,8 @@ __author__ = "Manuel Holtgrewe " -#: Extensions of files to create as main payload -EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") - -#: Names of the files to create for the extension -EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") - #: Names of the tools that might use panel of normals -TOOLS = ["mutect2"] +TOOLS = ["mutect2", "cnvkit", "access"] #: Default configuration for the somatic_variant_calling schema DEFAULT_CONFIG = r""" @@ -81,13 +114,6 @@ panel_of_normals: tools: ['mutect2'] # REQUIRED - available: 'mutect2' path_ngs_mapping: ../ngs_mapping # REQUIRED - ignore_chroms: # patterns of chromosome names to ignore - - NC_007605 # herpes virus - - hs37d5 # GRCh37 decoy - - chrEBV # Eppstein-Barr Virus - - '*_decoy' # decoy contig - - 'HLA-*' # HLA genes - - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 # Configuration for mutect2 mutect2: path_normals_list: null # Optional file listing libraries to include in panel @@ -108,6 +134,35 @@ job_mult_time: 1 # running time multiplier merge_mult_memory: 1 # memory multiplier for merging merge_mult_time: 1 # running time multiplier for merging + ignore_chroms: # patterns of chromosome names to ignore + - NC_007605 # herpes virus + - hs37d5 # GRCh37 decoy + - chrEBV # Eppstein-Barr Virus + - '*_decoy' # decoy contig + - 'HLA-*' # HLA genes + - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 + cnvkit: + path_normals_list: "" # Optional file listing libraries to include in panel + path_target_regions: "" # Bed files of targetted regions (Missing when creating a panel of normals for WGS data) + access: "" # Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run) + annotate: "" # [target] Optional targets annotations + target_avg_size: 0 # [target] Average size of split target bins (0: use default value) + bp_per_bin: 50000 # [autobin] Expected base per bin + split: True # [target] Split large intervals into smaller ones + antitarget_avg_size: 0 # [antitarget] Average size of antitarget bins (0: use default value) + min_size: 0 # [antitarget] Min size of antitarget bins (0: use default value) + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: 
False # [coverage] Alternative couting algorithm + min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering + gender: "" # [reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing + male_reference: False # [reference & sex] Create male reference + gc_correction: True # [reference] Use GC correction + edge_correction: True # [reference] Use edge correction + rmask_correction: True # [reference] Use rmask correction + drop_low_coverage: False # [metrics] Drop very-low-coverage bins before calculations + access: # Creates access file for cnvkit, based on genomic sequence & excluded regions (optionally) + exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) + min_gap_size: 0 # [access] Minimum gap size between accessible sequence regions (0: use default value) """ @@ -124,7 +179,7 @@ class PanelOfNormalsStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) # Build shortcut from cancer bio sample name to matched cancer sample - self.normal_libraries = self._get_normal_libraries() + self.normal_libraries = list(self._get_normal_libraries()) def _get_normal_libraries(self): for sheet in self.parent.shortcut_sheets: @@ -208,7 +263,7 @@ def _get_input_files_prepare_panel(self, wildcards): def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for merging individual results & panel creation""" paths = [] - tpl = "work/{mapper}.{tool}.prepare_panel/out/{normal_library}.vcf.gz" + tpl = "work/{mapper}.{tool}/out/{mapper}.{tool}.{normal_library}.prepare.vcf.gz" for normal in self.normal_libraries: paths.append(tpl.format(normal_library=normal, tool=self.name, **wildcards)) return {"normals": paths} @@ -224,8 +279,8 @@ def get_output_files(self, action): "vcf_tbi_md5": "vcf.gz.tbi.md5", } tpls = { - "prepare_panel": "work/{{mapper}}.{tool}.prepare_panel/out/{{normal_library}}.{ext}", - "create_panel": "work/{{mapper}}.{tool}.create_panel/out/{{mapper}}.{tool}.panel_of_normals.{ext}", + "prepare_panel": "work/{{mapper}}.{tool}/out/{{mapper}}.{tool}.{{normal_library}}.prepare.{ext}", + "create_panel": "work/{{mapper}}.{tool}/out/{{mapper}}.{tool}.panel_of_normals.{ext}", } for key, ext in ext_dict.items(): yield key, tpls[action].format(tool=self.name, ext=ext) @@ -243,13 +298,308 @@ def get_log_file(self, action): "log_md5": "log.md5", } tpls = { - "prepare_panel": "work/{{mapper}}.{tool}.prepare_panel/log/{{normal_library}}.{ext}", - "create_panel": "work/{{mapper}}.{tool}.create_panel/log/{{mapper}}.{tool}.panel_of_normals.{ext}", + "prepare_panel": "work/{{mapper}}.{tool}/log/{{mapper}}.{tool}.{{normal_library}}.prepare.{ext}", + "create_panel": "work/{{mapper}}.{tool}/log/{{mapper}}.{tool}.panel_of_normals.{ext}", } for key, ext in ext_dict.items(): yield key, tpls[action].format(tool=self.name, ext=ext) +class CnvkitStepPart(PanelOfNormalsStepPart): + """Somatic variant calling with MuTect 2""" + + #: Step name + name = "cnvkit" + + #: Class available actions + actions = ( + "target", + "antitarget", + "coverage", + "create_panel", + "report", + ) + + #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). 
+ resource_usage_dict = { + "target": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="8G", + ), + "antitarget": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="8G", + ), + "coverage": ResourceUsage( + threads=8, + time="02:00:00", # 2 hours + memory="16G", + ), + "create_panel": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), + "report": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), + } + + def __init__(self, parent): + super().__init__(parent) + if self.config["cnvkit"]["path_normals_list"]: + self.normal_libraries = [] + with open(self.config["cnvkit"]["path_normals_list"], "rt") as f: + for line in f: + self.normal_libraries.append(line.strip()) + self.is_wgs = self.config["cnvkit"]["path_target_regions"] == "" + + def check_config(self): + if self.name not in self.config["tools"]: + return # cnvkit not enabled, skip + self.parent.ensure_w_config( + ("static_data_config", "reference", "path"), + "Path to reference FASTA not configured but required for %s" % (self.name,), + ) + + def get_args(self, action): + self._validate_action(action) + if self.is_wgs: + method = "wgs" + else: + method = "hybrid" + return {"method": method, "flat": (len(self.normal_libraries) == 0)} + + def get_resource_usage(self, action): + """Get Resource Usage + + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + + :return: Returns ResourceUsage for step. + """ + # Validate action + self._validate_action(action) + return self.resource_usage_dict.get(action) + + def get_input_files(self, action): + """Return input files for cnvkit panel of normals creation""" + # Validate action + self._validate_action(action) + mapping = { + "target": self._get_input_files_target, + "antitarget": self._get_input_files_antitarget, + "coverage": self._get_input_files_coverage, + "create_panel": self._get_input_files_create_panel, + "report": self._get_input_files_report, + "access": None, + } + return mapping[action] + + def _get_input_files_target(self, wildcards): + """Helper wrapper function to estimate target average size in wgs mode""" + if not self.is_wgs: + return {} + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + bams = [ + ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) + for x in self.normal_libraries + ] + bais = [x + ".bai" for x in bams] + input_files = {"bams": bams, "bais": bais} + return input_files + + def _get_input_files_antitarget(self, wildcards): + """Helper wrapper function for computing antitarget locations""" + if self.is_wgs: + return {} + return { + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + } + + def _get_input_files_coverage(self, wildcards): + """Helper wrapper function for computing coverage""" + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + bam = ngs_mapping(tpl.format(**wildcards)) + return { + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format( + **wildcards + ), + "bam": bam, + "bai": bam + ".bai", + } + + def _get_input_files_create_panel(self, wildcards): + """Helper wrapper function for computing panel of normals""" + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" + 
targets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + ] + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + antitargets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + ] + return { + "target": targets + if targets + else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + "antitarget": antitargets + if antitargets + else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format(**wildcards), + } + + def _get_input_files_report(self, wildcards): + """Helper wrapper function for the panel of normals report""" + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" + targets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + ] + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + antitargets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + ] + return { + "target": targets, + "antitarget": antitargets, + } + + def get_output_files(self, action): + """Return panel of normal files""" + if action == "target": + return self._get_output_files_target() + elif action == "antitarget": + return self._get_output_files_antitarget() + elif action == "coverage": + return self._get_output_files_coverage() + elif action == "create_panel": + return self._get_output_files_create_panel() + elif action == "report": + return self._get_output_files_report() + elif action == "access": + return self._get_output_files_access() + else: + self._validate_action(action) + + def _get_output_files_target(self): + return { + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", + } + + def _get_output_files_antitarget(self): + return { + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", + } + + def _get_output_files_coverage(self): + return { + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", + } + + def _get_output_files_create_panel(self): + return { + "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", + } + + def _get_output_files_report(self): + return { + "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", + "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", + "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", + "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", + } + + def _get_output_files_access(self): + return { + "access": "work/cnvkit.access/out/cnvkit.access.bed", + "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", + } + + @dictify + def get_log_file(self, action): + """Return panel of normal files""" + ext_dict = { + "conda_list": "conda_list.txt", + "conda_list_md5": "conda_list.txt.md5", + "conda_info": "conda_info.txt", + "conda_info_md5": "conda_info.txt.md5", 
+ "log": "log", + "log_md5": "log.md5", + } + if action == "target": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.target.{ext}" + elif action == "antitarget": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.antitarget.{ext}" + elif action == "coverage": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.{{normal_library}}.coverage.{ext}" + elif action == "create_panel": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.panel_of_normals.{ext}" + elif action == "report": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.report.{ext}" + elif action == "access": + tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" + else: + self._validate_action(action) + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + + +class AccessStepPart(PanelOfNormalsStepPart): + """Utility to create access file for cnvkit""" + + name = "access" + actions = ("run",) + + def get_resource_usage(self, action): + # Validate action + self._validate_action(action) + return ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="8G", + ) + + def get_input_files(self, action): + # Validate action + self._validate_action(action) + return None + + def get_output_files(self, action): + # Validate action + self._validate_action(action) + tpl = "work/cnvkit.access/out/cnvkit.access.bed" + return {"access": tpl, "access_md5": tpl + ".md5"} + + @dictify + def get_log_file(self, action): + """Return log files""" + # Validate action + self._validate_action(action) + ext_dict = { + "conda_list": "conda_list.txt", + "conda_list_md5": "conda_list.txt.md5", + "conda_info": "conda_info.txt", + "conda_info_md5": "conda_info.txt.md5", + "log": "log", + "log_md5": "log.md5", + } + tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + + class PanelOfNormalsWorkflow(BaseStep): """Creates a panel of normals""" @@ -281,6 +631,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) self.register_sub_step_classes( ( Mutect2StepPart, + CnvkitStepPart, + AccessStepPart, LinkOutStepPart, ) ) @@ -293,45 +645,64 @@ def get_result_files(self): We will process all NGS libraries of all bio samples in all sample sheets. 
""" - # Panel of normals - files = [] - - # yield from expand( - files.extend( - expand( - os.path.join( - "output", - "{mapper}.{caller}.create_panel", - "out", - "{mapper}.{caller}.panel_of_normals" + "{ext}", + result_files = [] + + log_ext_list = [ + "log", + "log.md5", + "conda_list.txt", + "conda_list.txt.md5", + "conda_info.txt", + "conda_info.txt.md5", + ] + + if "mutect2" in set(self.config["tools"]) & set(TOOLS): + tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" + ext_list = ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") + result_files.extend(self._expand_result_files(tpl, ext_list)) + tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals.{ext}" + result_files.extend(self._expand_result_files(tpl, log_ext_list)) + + if "cnvkit" in set(self.config["tools"]) & set(TOOLS): + tpls = [ + ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.target.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + ( + "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", + ("cnn", "cnn.md5"), ), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - caller=set(self.config["tools"]) & set(TOOLS), - ext=EXT_VALUES, - ) - ) - # yield from expand( - files.extend( - expand( - os.path.join( - "output", - "{mapper}.{caller}.create_panel", - "log", - "{mapper}.{caller}.panel_of_normals" + "{ext}", + ( + "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", + ("tsv", "tsv.md5"), ), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - caller=set(self.config["tools"]) & set(TOOLS), - ext=( - ".log", - ".log.md5", - ".conda_info.txt", - ".conda_info.txt.md5", - ".conda_list.txt", - ".conda_list.txt.md5", + ( + "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", + ("tsv", "tsv.md5"), ), - ) - ) - return files + ] + for tpl, ext_list in tpls: + result_files.extend(self._expand_result_files(tpl, ext_list)) + tpls = [ + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.target.{ext}", + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget.{ext}", + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.report.{ext}", + ] + for tpl in tpls: + result_files.extend(self._expand_result_files(tpl, log_ext_list)) + + if "access" in set(self.config["tools"]) & set(TOOLS): + tpl = "output/cnvkit.access/out/cnvkit.access.bed" + result_files.extend([tpl + md5 for md5 in ("", ".md5")]) + tpl = "output/cnvkit.access/log/cnvkit.access.{ext}" + result_files.extend(self._expand_result_files(tpl, log_ext_list)) + + return result_files + + def _expand_result_files(self, tpl, ext_list): + for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: + for ext in ext_list: + yield tpl.format(mapper=mapper, ext=ext) def check_config(self): """Check that the path to the NGS mapping is present""" diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 3245f07be..4e017c21f 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -29,52 +29,56 @@ of this tool. In the future, this section will contain "common" output and tool-specific output sub sections. 
-- ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}/out/`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.bed`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.seg`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.vcf.gz`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.vcf.gz.tbi`` -- ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}/out`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.diagram.pdf`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.scatter.pdf`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.heatmap.pdf`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.heatmap.chr1.pdf`` +- ``{mapper}.cnvkit.{lib_name}-{lib_pk}/out/`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.bed`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.seg`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.vcf.gz`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.vcf.gz.tbi`` +- ``{mapper}.cnvkit.{lib_name}-{lib_pk}/report`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.diagram.pdf`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.scatter.pdf`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.heatmap.pdf`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.heatmap.chr1.pdf`` - ... - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.scatter.chrX.pdf`` -- ``{mapper}.cnvkit.report.{lib_name}-{lib_pk}/out`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.breaks.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.gainloss.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.gender.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.metrics.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.segmetrics.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.scatter.chrX.pdf`` +- ``{mapper}.cnvkit.{lib_name}-{lib_pk}/report`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.breaks.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.genemetrics.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.gender.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.metrics.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.segmetrics.txt`` For example: :: output/ - |-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007 + |-- bwa.cnvkit.P001-T1-DNA1-WES1-000007 | `-- out - | |-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007.bed - | |-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007.seg - | `-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007.vcf - |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016 - | `-- out - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.diagram.pdf - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.heatmap.pdf - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.scatter.pdf - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.heatmap.chr1.pdf + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-000007.bed + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-000007.seg + | `-- bwa.cnvkit.P001-T1-DNA1-WES1-000007.vcf + |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016 + | `-- report + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.diagram.pdf + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.heatmap.pdf + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.scatter.pdf + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.heatmap.chr1.pdf | |-- ... 
- | `-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.scatter.chrX.pdf - |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016 - | `-- out - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.breaks.txt - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.gainloss.txt - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.gender.txt - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.metrics.txt - | `-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.segmetrics.txt + | `-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.scatter.chrX.pdf + |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016 + | `-- report + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.breaks.txt + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.genemetrics.txt + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.gender.txt + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.metrics.txt + | `-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.segmetrics.txt [...] +Note that tool ``cnvetti`` doesn't follow the snappy convention above: +the tool name is followed by an underscore & the action, where the action is one of ``coverage``, ``segment`` and ``postprocess``. +For example, the output directory would contain a directory named ``bwa.cnvetti_coverage.P002-T1-DNA1-WES1-000016``. + ===================== Default Configuration ===================== @@ -92,7 +96,7 @@ """ from collections import OrderedDict -import itertools +from itertools import chain import os import os.path import sys @@ -119,15 +123,44 @@ tools: ['cnvkit'] # REQUIRED - available: 'cnvkit', 'copywriter', 'cnvetti_on_target' and 'cnvetti_off_target' path_ngs_mapping: ../ngs_mapping # REQUIRED cnvkit: - path_target_regions: REQUIRED # REQUIRED - seg_method: haar - seg_threshold: 0.0001 + path_target: REQUIRED # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: REQUIRED # Usually ../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: REQUIRED # Usually ../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: False # [coverage] Alternative couting algorithm + gc_correction: True # [fix] Use GC correction + edge_correction: True # [fix] Use edge correction + rmask_correction: True # [fix] Use rmask correction # BCBIO uses # seg_method: haar # seg_threshold: 0.0001 # -- OR # seg_method: cbs # seg_threshold: 0.000001 + segmentation_method: cbs # [segment] One of cbs, flasso, haar, hmm, hmm-tumor, hmm-germline, none + segmentation_threshold: 0.000001 # [segment] Significance threshold (hmm methods: smoothing window size) + drop_low_coverage: False # [segment, call, genemetrics] Drop very low coverage bins + drop_outliers: 10 # [segment] Drop outlier bins (0 for no outlier filtering) + smooth_cbs: True # [segment] Additional smoothing of CBS segmentation (WARNING- not the default value) + center: "" # [call] Either one of mean, median, mode, biweight, or a constant log2 ratio value. + filter: ampdel # [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), "" for no filtering + calling_method: threshold # [call] One of threshold, clonal, none + call_thresholds: "-1.1,-0.25,0.2,0.7" # [call] Thresholds for calling integer copy number + ploidy: 2 # [call] Ploidy of sample cells + purity: 0 # [call] Estimated tumor cell fraction (0 for discarding tumor cell purity) + gender: "" # [call, diagram] Specify the chromosomal sex of all given samples as male or female. 
Guess when missing + male_reference: False # [call, diagram] Create male reference + diagram_threshold: 0.5 # [diagram] Copy number change threshold to label genes + diagram_min_probes: 3 # [diagram] Min number of covered probes to label genes + shift_xy: True # [diagram] Shift X & Y chromosomes according to sample sex + breaks_min_probes: 1 # [breaks] Min number of covered probes for a break inside the gene + genemetrics_min_probes: 3 # [genemetrics] Min number of covered probes to consider a gene + genemetrics_threshold: 0.2 # [genemetrics] Min abs log2 change to consider a gene + genemetrics_alpha: 0.05 # [genemetrics] Significance cutoff + genemetrics_bootstrap: 100 # [genemetrics] Number of bootstraps + segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff + segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps + smooth_bootstrap: False # [segmetrics] Smooth bootstrap results copywriter: path_target_regions: REQUIRED # REQUIRED bin_size: 20000 # TODO: make actually configurable @@ -394,14 +427,10 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): #: Class available actions actions = ( - "access", - "target", - "antitarget", "coverage", - "reference", "fix", - "call", "segment", + "call", "export", "plot", "report", @@ -411,12 +440,17 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): resource_usage_dict = { "plot": ResourceUsage( threads=1, - time="1-00:00:00", # 1 day + time="08:00:00", # 1 day memory=f"{30 * 1024}M", ), + "coverage": ResourceUsage( + threads=8, + time="08:00:00", # 8 hours + memory=f"{16 * 1024}M", + ), "default": ResourceUsage( threads=1, - time="1-00:00:00", # 1 day + time="03:59:59", # 1 day memory=f"{int(7.5 * 1024)}M", ), } @@ -429,20 +463,24 @@ def check_config(self): if "cnvkit" not in self.config["tools"]: return # cnvkit not enabled, skip self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_target_regions"), + ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_target"), "Path to target regions is missing for cnvkit", ) + self.parent.ensure_w_config( + ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_antitarget"), + "Path to antitarget regions is missing for cnvkit", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_panel_of_normals"), + "Path to panel of normals (reference) is missing for cnvkit", + ) def get_input_files(self, action): """Return input paths input function, dependent on rule""" # Validate action self._validate_action(action) method_mapping = { - "access": None, - "target": self._get_input_files_target, - "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, - "reference": self._get_input_files_reference, "call": self._get_input_files_call, "fix": self._get_input_files_fix, "segment": self._get_input_files_segment, @@ -452,232 +490,173 @@ def get_input_files(self, action): } return method_mapping[action] - @staticmethod - def _get_input_files_target(_): - input_files = {"access": "work/cnvkit.access/out/access.bed"} - return input_files - - @staticmethod - def _get_input_files_antitarget(_): - input_files = { - "access": "work/cnvkit.access/out/access.bed", - "target": "work/cnvkit.target/out/target.bed", - } - return input_files - def _get_input_files_coverage(self, wildcards): - input_files = { - "target": "work/cnvkit.target/out/target.bed", - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", - } # BAM/BAI file ngs_mapping 
= self.parent.sub_workflows["ngs_mapping"] base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files["bam"] = ngs_mapping(base_path + ".bam") - input_files["bai"] = ngs_mapping(base_path + ".bam.bai") - return input_files - - def _get_input_files_reference(self, wildcards): input_files = { - "target": "work/cnvkit.target/out/target.bed", - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", + "bam": ngs_mapping(base_path + ".bam"), + "bai": ngs_mapping(base_path + ".bam.bai"), } - # BAM/BAI file - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( - normal_library=self.get_normal_lib_name(wildcards), **wildcards - ) - input_files["bam"] = ngs_mapping(base_path + ".bam") - input_files["bai"] = ngs_mapping(base_path + ".bam.bai") return input_files @staticmethod def _get_input_files_fix(wildcards): - tpl_base = "{mapper}.cnvkit.{substep}.{library_name}" - tpl = "work/" + tpl_base + "/out/" + tpl_base + ".cnn" - input_files = {"ref": tpl.format(substep="reference", **wildcards)} + tpl_base = "{mapper}.cnvkit.{library_name}" tpl = "work/" + tpl_base + "/out/" + tpl_base + ".{target}coverage.cnn" - for target in ("target", "antitarget"): - input_files[target] = tpl.format(target=target, substep="coverage", **wildcards) + input_files = { + "target": tpl.format(target="target", **wildcards), + "antitarget": tpl.format(target="antitarget", **wildcards), + } return input_files @staticmethod def _get_input_files_segment(wildcards): - cnr_pattern = ( - "work/{mapper}.cnvkit.fix.{library_name}/out/{mapper}.cnvkit.fix.{library_name}.cnr" - ) + cnr_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" input_files = {"cnr": cnr_pattern.format(**wildcards)} return input_files @staticmethod def _get_input_files_call(wildcards): segment_pattern = ( - "work/{mapper}.cnvkit.segment.{library_name}/out/" - "{mapper}.cnvkit.segment.{library_name}.cns" + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" ) input_files = {"segment": segment_pattern.format(**wildcards)} return input_files @staticmethod def _get_input_files_export(wildcards): - cns_pattern = ( - "work/{mapper}.cnvkit.call.{library_name}/out/{mapper}.cnvkit.call.{library_name}.cns" - ) + cns_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" input_files = {"cns": cns_pattern.format(**wildcards)} return input_files @staticmethod def _get_input_files_plot(wildcards): - tpl = ( - "work/{mapper}.cnvkit.{substep}.{library_name}/out/" - "{mapper}.cnvkit.{substep}.{library_name}.{ext}" - ) + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" input_files = { - "cnr": tpl.format(substep="fix", ext="cnr", **wildcards), - "cns": tpl.format(substep="segment", ext="cns", **wildcards), + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), } return input_files def _get_input_files_report(self, wildcards): - return self._get_input_files_plot(wildcards) + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + input_files = { + "target": tpl.format(ext="targetcoverage.cnn", **wildcards), + "antitarget": tpl.format(ext="antitargetcoverage.cnn", **wildcards), + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), + } + return input_files def get_output_files(self, action): """Return output 
files for the given action""" - # Validate action - self._validate_action(action) - method_mapping = { - "access": self._get_output_files_access, - "target": self._get_output_files_target, - "antitarget": self._get_output_files_antitarget, - "coverage": self._get_output_files_coverage, - "reference": self._get_output_files_reference, - "fix": self._get_output_files_fix, - "call": self._get_output_files_call, - "segment": self._get_output_files_segment, - "export": self._get_output_files_export, - "plot": self._get_output_files_plot, - "report": self._get_output_files_report, - } - return method_mapping[action]() - - @staticmethod - def _get_output_files_access(): - return "work/cnvkit.access/out/access.bed" - - @staticmethod - def _get_output_files_target(): - return "work/cnvkit.target/out/target.bed" - - @staticmethod - def _get_output_files_antitarget(): - return "work/cnvkit.antitarget/out/antitarget.bed" + if action == "coverage": + return self._get_output_files_coverage() + elif action == "fix": + return self._get_output_files_fix() + elif action == "segment": + return self._get_output_files_segment() + elif action == "call": + return self._get_output_files_call() + elif action == "export": + return self._get_output_files_export() + elif action == "plot": + return self._get_output_files_plot() + elif action == "report": + return self._get_output_files_report() + else: + self._validate_action(action) @staticmethod def _get_output_files_coverage(): - name_pattern = "{mapper}.cnvkit.coverage.{library_name}" + name_pattern = "{mapper}.cnvkit.{library_name}" output_files = {} for target in ("target", "antitarget"): output_files[target] = os.path.join( "work", name_pattern, "out", name_pattern + ".{}coverage.cnn".format(target) ) + output_files[target + "_md5"] = output_files[target] + ".md5" return output_files - @staticmethod - def _get_output_files_reference(): - name_pattern = "{mapper}.cnvkit.reference.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnn") - return tpl - @staticmethod def _get_output_files_fix(): - name_pattern = "{mapper}.cnvkit.fix.{library_name}" + name_pattern = "{mapper}.cnvkit.{library_name}" tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnr") - return tpl + return {"ratios": tpl, "ratios_md5": tpl + ".md5"} @staticmethod def _get_output_files_segment(): - name_pattern = "{mapper}.cnvkit.segment.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") - return tpl + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".segment.cns") + return {"segments": tpl, "segments_md5": tpl + ".md5"} @staticmethod def _get_output_files_call(): - name_pattern = "{mapper}.cnvkit.call.{library_name}" + name_pattern = "{mapper}.cnvkit.{library_name}" tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") - return tpl + return {"calls": tpl, "calls_md5": tpl + ".md5"} @dictify def _get_output_files_plot(self): - plots = ("scatter", "diagram", "heatmap") - chroms = list(itertools.chain(range(1, 23), ["X", "Y"])) + plots = (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + chrom_plots = (("heatmap", "pdf"), ("scatter", "png")) + chroms = list(chain(range(1, 23), ["X", "Y"])) + output_files = {} # Yield file name pairs for global plots tpl = ( - "work/{mapper}.cnvkit.plot.{library_name}/out/" - "{mapper}.cnvkit.plot.{library_name}.{plot}.pdf" - ) - yield from ( - (plot, tpl.format(plot=plot, 
**format_id("mapper", "library_name"))) for plot in plots + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" ) + for plot, ext in plots: + output_files[plot] = tpl.format(plot=plot, ext=ext) + output_files[plot + "_md5"] = output_files[plot] + ".md5" # Yield file name pairs for the chromosome-wise plots - chrom_plots = ("scatter", "heatmap") tpl_chrom = ( - "work/{mapper}.cnvkit.plot.{library_name}/out/" - "{mapper}.cnvkit.plot.{library_name}.{plot}.chr{chrom}.pdf" - ) - yield from ( - ( - "{plot}_chr{chrom}".format(plot=plot, chrom=chrom), - tpl_chrom.format(plot=plot, chrom=chrom, **format_id("mapper", "library_name")), - ) - for plot in chrom_plots - for chrom in chroms + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" ) + for plot, ext in chrom_plots: + for chrom in chroms: + key = "{plot}_chr{chrom}".format(plot=plot, chrom=chrom) + output_files[key] = tpl_chrom.format(plot=plot, ext=ext, chrom=chrom) + output_files[key + "_md5"] = output_files[key] + ".md5" + return output_files @staticmethod def _get_output_files_export(): - keys = ("bed", "seg", "vcf", "vcf_tbi") - exts = ("bed", "seg", "vcf.gz", "vcf.gz.tbi") - name_pattern = "{mapper}.cnvkit.export.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".{ext}") + exports = (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")) output_files = {} - for key, ext in zip(keys, exts): - output_files[key] = tpl.format(ext=ext, **format_id("mapper", "library_name")) + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/out/" + "{{mapper}}.cnvkit.{{library_name}}.{ext}" + ) + for export, ext in exports: + output_files[export] = tpl.format(export=export, ext=ext) + output_files[export + "_md5"] = output_files[export] + ".md5" return output_files @dictify def _get_output_files_report(self): - reports = ("breaks", "gainloss", "gender", "metrics", "segmetrics") + reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + output_files = {} tpl = ( - "work/{mapper}.cnvkit.report.{library_name}/out/" - "{mapper}.cnvkit.report.{library_name}.{report}.txt" - ) - yield from ( - (report, tpl.format(report=report, **format_id("mapper", "library_name"))) - for report in reports + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{report}.txt" ) + for report in reports: + output_files[report] = tpl.format(report=report) + output_files[report + "_md5"] = output_files[report] + ".md5" + return output_files - @staticmethod - def get_log_file(action): + def get_log_file(self, action): """Return path to log file for the given action""" - if action in ("access", "target", "antitarget"): - prefix = "work/cnvkit.{action}/log/cnvkit.{action}" - elif action in ( - "coverage", - "reference", - "fix", - "call", - "segment", - "export", - "plot", - "report", - ): - prefix = ( - "work/{{mapper}}.cnvkit.{action}.{{library_name}}/log/" - "{{mapper}}.cnvkit.{action}.{{library_name}}" - ) - else: - raise ValueError("Unknown action {}".format(action)) - prefix = prefix.format(action=action) + # Validate action + self._validate_action(action) + prefix = ( + "work/{{mapper}}.cnvkit.{{library_name}}/log/" + "{{mapper}}.cnvkit.{action}.{{library_name}}" + ).format(action=action) key_ext = ( ("log", ".log"), ("conda_info", ".conda_info.txt"), @@ -701,6 +680,8 @@ def get_resource_usage(self, action): self._validate_action(action) if action == "plot": return 
self.resource_usage_dict.get("plot") + elif action == "coverage": + return self.resource_usage_dict.get("coverage") else: return self.resource_usage_dict.get("default") @@ -908,7 +889,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) def get_result_files(self): """Return list of result files for the somatic targeted sequencing CNV calling step""" tool_actions = { - "cnvkit": ("call", "report", "export", "plot"), # ("report", "export", "plot"), + "cnvkit": ("fix", "call", "report", "export", "plot"), "copywriter": ("call",), "cnvetti_on_target": ("coverage", "segment", "postprocess"), "cnvetti_off_target": ("coverage", "segment", "postprocess"), @@ -946,7 +927,7 @@ def get_result_files(self): yield f.replace("work/", "output/") def check_config(self): - """Check that the necessary globalc onfiguration is present""" + """Check that the necessary global configuration is present""" self.ensure_w_config( ("step_config", "somatic_targeted_seq_cnv_calling", "path_ngs_mapping"), ( diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules index c01f4c198..4014660c1 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules @@ -1,52 +1,3 @@ -rule somatic_accessed_seq_cnv_calling_cnvkit_access: - output: - wf.get_output_files("cnvkit", "access"), - threads: wf.get_resource("cnvkit", "access", "threads") - resources: - time=wf.get_resource("cnvkit", "access", "time"), - memory=wf.get_resource("cnvkit", "access", "memory"), - partition=wf.get_resource("cnvkit", "access", "partition"), - tmpdir=wf.get_resource("cnvkit", "access", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "access"), - wrapper: - wf.wrapper_path("cnvkit/access") - - -rule somatic_targeted_seq_cnv_calling_cnvkit_target: - input: - unpack(wf.get_input_files("cnvkit", "target")), - output: - wf.get_output_files("cnvkit", "target"), - threads: wf.get_resource("cnvkit", "target", "threads") - resources: - time=wf.get_resource("cnvkit", "target", "time"), - memory=wf.get_resource("cnvkit", "target", "memory"), - partition=wf.get_resource("cnvkit", "target", "partition"), - tmpdir=wf.get_resource("cnvkit", "target", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "target"), - wrapper: - wf.wrapper_path("cnvkit/target") - - -rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: - input: - unpack(wf.get_input_files("cnvkit", "antitarget")), - output: - wf.get_output_files("cnvkit", "antitarget"), - threads: wf.get_resource("cnvkit", "antitarget", "threads") - resources: - time=wf.get_resource("cnvkit", "antitarget", "time"), - memory=wf.get_resource("cnvkit", "antitarget", "memory"), - partition=wf.get_resource("cnvkit", "antitarget", "partition"), - tmpdir=wf.get_resource("cnvkit", "antitarget", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "antitarget"), - wrapper: - wf.wrapper_path("cnvkit/antitarget") - - rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: input: unpack(wf.get_input_files("cnvkit", "coverage")), @@ -64,28 +15,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: wf.wrapper_path("cnvkit/coverage") -rule somatic_targeted_seq_cnv_calling_cnvkit_reference: - input: - unpack(wf.get_input_files("cnvkit", "reference")), - output: - wf.get_output_files("cnvkit", "reference"), - threads: wf.get_resource("cnvkit", "reference", "threads") - resources: - 
time=wf.get_resource("cnvkit", "reference", "time"), - memory=wf.get_resource("cnvkit", "reference", "memory"), - partition=wf.get_resource("cnvkit", "reference", "partition"), - tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "reference"), - wrapper: - wf.wrapper_path("cnvkit/reference") - - rule somatic_targeted_seq_cnv_calling_cnvkit_fix: input: unpack(wf.get_input_files("cnvkit", "fix")), output: - wf.get_output_files("cnvkit", "fix"), + **wf.get_output_files("cnvkit", "fix"), threads: wf.get_resource("cnvkit", "fix", "threads") resources: time=wf.get_resource("cnvkit", "fix", "time"), @@ -102,7 +36,7 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_segment: input: unpack(wf.get_input_files("cnvkit", "segment")), output: - wf.get_output_files("cnvkit", "segment"), + **wf.get_output_files("cnvkit", "segment"), threads: wf.get_resource("cnvkit", "segment", "threads") resources: time=wf.get_resource("cnvkit", "segment", "time"), @@ -119,7 +53,7 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_call: input: unpack(wf.get_input_files("cnvkit", "call")), output: - wf.get_output_files("cnvkit", "call"), + **wf.get_output_files("cnvkit", "call"), threads: wf.get_resource("cnvkit", "call", "threads") resources: time=wf.get_resource("cnvkit", "call", "time"), diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile index a4e9d2dae..ea996d6ee 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile @@ -194,18 +194,4 @@ rule somatic_wgs_cnv_calling_control_freec_plot: # Run cnvkit ------------------------------------------------ -rule somatic_wgs_cnv_calling_cnvkit_run: - input: - unpack(wf.get_input_files("cnvkit", "run")), - output: - **wf.get_output_files("cnvkit", "run"), - threads: wf.get_resource("cnvkit", "run", "threads") - resources: - time=wf.get_resource("cnvkit", "run", "time"), - memory=wf.get_resource("cnvkit", "run", "memory"), - partition=wf.get_resource("cnvkit", "run", "partition"), - tmpdir=wf.get_resource("cnvkit", "run", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "run"), - wrapper: - wf.wrapper_path("cnvkit/wgs") +include: "cnvkit.rules" diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py index 9793a8e55..6cbfdf4e2 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py @@ -137,7 +137,44 @@ tx_obj: TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene bs_obj: BSgenome.Hsapiens.1000genomes.hs37d5::hs37d5 cnvkit: - path_annotate_refflat: REQUIRED #REQUIRED + path_target: REQUIRED # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: REQUIRED # Usually ../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: REQUIRED # Usually ../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: False # [coverage] Alternative couting algorithm + gc_correction: True # [fix] Use GC correction + edge_correction: True # [fix] Use edge correction + rmask_correction: True # [fix] Use rmask correction + # BCBIO uses + # seg_method: haar + # seg_threshold: 0.0001 + # -- OR + # seg_method: cbs + # 
seg_threshold: 0.000001 + segmentation_method: cbs # [segment] One of cbs, flasso, haar, hmm, hmm-tumor, hmm-germline, none + segmentation_threshold: 0.000001 # [segment] Significance threshold (hmm methods: smoothing window size) + drop_low_coverage: False # [segment, call, genemetrics] Drop very low coverage bins + drop_outliers: 10 # [segment] Drop outlier bins (0 for no outlier filtering) + smooth_cbs: True # [segment] Additional smoothing of CBS segmentation (WARNING- not the default value) + center: "" # [call] Either one of mean, median, mode, biweight, or a constant log2 ratio value. + filter: ampdel # [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), "" for no filtering + calling_method: threshold # [call] One of threshold, clonal, none + call_thresholds: "-1.1,-0.25,0.2,0.7" # [call] Thresholds for calling integer copy number + ploidy: 2 # [call] Ploidy of sample cells + purity: 0 # [call] Estimated tumor cell fraction (0 for discarding tumor cell purity) + gender: "" # [call, diagram] Specify the chromosomal sex of all given samples as male or female. Guess when missing + male_reference: False # [call, diagram] Create male reference + diagram_threshold: 0.5 # [diagram] Copy number change threshold to label genes + diagram_min_probes: 3 # [diagram] Min number of covered probes to label genes + shift_xy: True # [diagram] Shift X & Y chromosomes according to sample sex + breaks_min_probes: 1 # [breaks] Min number of covered probes for a break inside the gene + genemetrics_min_probes: 3 # [genemetrics] Min number of covered probes to consider a gene + genemetrics_threshold: 0.2 # [genemetrics] Min abs log2 change to consider a gene + genemetrics_alpha: 0.05 # [genemetrics] Significance cutoff + genemetrics_bootstrap: 100 # [genemetrics] Number of bootstraps + segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff + segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps + smooth_bootstrap: False # [segmetrics] Smooth bootstrap results """ @@ -406,19 +443,243 @@ class CnvkitSomaticWgsStepPart(SomaticWgsCnvCallingStepPart): name = "cnvkit" #: Class available actions - actions = ("run",) + actions = ( + "coverage", + "fix", + "segment", + "call", + "export", + "plot", + "report", + ) + + #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). 
+ resource_usage_dict = { + "plot": ResourceUsage( + threads=1, + time="1-00:00:00", # 1 day + memory=f"{30 * 1024}M", + ), + "coverage": ResourceUsage( + threads=8, + time="1-00:00:00", # 1 day + memory=f"{16 * 1024}M", + ), + "default": ResourceUsage( + threads=1, + time="1-00:00:00", # 1 day + memory=f"{int(7.5 * 1024)}M", + ), + } + + def __init__(self, parent): + super().__init__(parent) + + def check_config(self): + """Check configuration for cnvkit""" + if "cnvkit" not in self.config["tools"]: + return # cnvkit not enabled, skip + self.parent.ensure_w_config( + ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_target"), + "Path to target regions is missing for cnvkit", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_antitarget"), + "Path to antitarget regions is missing for cnvkit", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_panel_of_normals"), + "Path to panel of normals (reference) is missing for cnvkit", + ) + + def get_input_files(self, action): + """Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + method_mapping = { + "coverage": self._get_input_files_coverage, + "call": self._get_input_files_call, + "fix": self._get_input_files_fix, + "segment": self._get_input_files_segment, + "export": self._get_input_files_export, + "plot": self._get_input_files_plot, + "report": self._get_input_files_report, + } + return method_mapping[action] + + def _get_input_files_coverage(self, wildcards): + # BAM/BAI file + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) + input_files = { + "bam": ngs_mapping(base_path + ".bam"), + "bai": ngs_mapping(base_path + ".bam.bai"), + } + return input_files + + @staticmethod + def _get_input_files_fix(wildcards): + tpl_base = "{mapper}.cnvkit.{library_name}" + tpl = "work/" + tpl_base + "/out/" + tpl_base + ".{target}coverage.cnn" + input_files = { + "target": tpl.format(target="target", **wildcards), + "antitarget": tpl.format(target="antitarget", **wildcards), + } + return input_files + + @staticmethod + def _get_input_files_segment(wildcards): + cnr_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" + input_files = {"cnr": cnr_pattern.format(**wildcards)} + return input_files + + @staticmethod + def _get_input_files_call(wildcards): + segment_pattern = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" + ) + input_files = {"segment": segment_pattern.format(**wildcards)} + return input_files + + @staticmethod + def _get_input_files_export(wildcards): + cns_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" + input_files = {"cns": cns_pattern.format(**wildcards)} + return input_files + + @staticmethod + def _get_input_files_plot(wildcards): + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + input_files = { + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), + } + return input_files + + def _get_input_files_report(self, wildcards): + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + input_files = { + "target": tpl.format(ext="targetcoverage.cnn", **wildcards), + "antitarget": tpl.format(ext="antitargetcoverage.cnn", **wildcards), + "cnr": tpl.format(ext="cnr", 
**wildcards), + "cns": tpl.format(ext="cns", **wildcards), + } + return input_files def get_output_files(self, action): + """Return output files for the given action""" # Validate action self._validate_action(action) - return { - "segment": self.base_path_out.format(var_caller=self.name, ext=".cns"), - "segment_md5": self.base_path_out.format(var_caller=self.name, ext=".cns.md5"), - "bins": self.base_path_out.format(var_caller=self.name, ext=".cnr"), - "bins_md5": self.base_path_out.format(var_caller=self.name, ext=".cnr.md5"), - "scatter": self.base_path_out.format(var_caller=self.name, ext=".scatter.png"), - "scatter_md5": self.base_path_out.format(var_caller=self.name, ext=".scatter.png.md5"), + method_mapping = { + "coverage": self._get_output_files_coverage, + "fix": self._get_output_files_fix, + "call": self._get_output_files_call, + "segment": self._get_output_files_segment, + "export": self._get_output_files_export, + "plot": self._get_output_files_plot, + "report": self._get_output_files_report, } + return method_mapping[action]() + + @staticmethod + def _get_output_files_coverage(): + name_pattern = "{mapper}.cnvkit.{library_name}" + output_files = {} + for target in ("target", "antitarget"): + output_files[target] = os.path.join( + "work", name_pattern, "out", name_pattern + ".{}coverage.cnn".format(target) + ) + output_files[target + "_md5"] = output_files[target] + ".md5" + return output_files + + @staticmethod + def _get_output_files_fix(): + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnr") + return {"ratios": tpl, "ratios_md5": tpl + ".md5"} + + @staticmethod + def _get_output_files_segment(): + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".segment.cns") + return {"segments": tpl, "segments_md5": tpl + ".md5"} + + @staticmethod + def _get_output_files_call(): + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") + return {"calls": tpl, "calls_md5": tpl + ".md5"} + + @dictify + def _get_output_files_plot(self): + plots = (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + chrom_plots = (("heatmap", "pdf"), ("scatter", "png")) + chroms = list(chain(range(1, 23), ["X", "Y"])) + output_files = {} + # Yield file name pairs for global plots + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" + ) + for plot, ext in plots: + output_files[plot] = tpl.format(plot=plot, ext=ext) + output_files[plot + "_md5"] = output_files[plot] + ".md5" + # Yield file name pairs for the chromosome-wise plots + tpl_chrom = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" + ) + for plot, ext in chrom_plots: + for chrom in chroms: + key = "{plot}_chr{chrom}".format(plot=plot, chrom=chrom) + output_files[key] = tpl_chrom.format(plot=plot, ext=ext, chrom=chrom) + output_files[key + "_md5"] = output_files[key] + ".md5" + return output_files + + @staticmethod + def _get_output_files_export(): + exports = (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")) + output_files = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/out/" + "{{mapper}}.cnvkit.{{library_name}}.{ext}" + ) + for export, ext in exports: + output_files[export] = tpl.format(export=export, ext=ext) + output_files[export + "_md5"] = output_files[export] + ".md5" + return 
output_files + + @dictify + def _get_output_files_report(self): + reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + output_files = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{report}.txt" + ) + for report in reports: + output_files[report] = tpl.format(report=report) + output_files[report + "_md5"] = output_files[report] + ".md5" + return output_files + + def get_log_file(self, action): + """Return path to log file for the given action""" + # Validate action + self._validate_action(action) + prefix = ( + "work/{{mapper}}.cnvkit.{{library_name}}/log/" + "{{mapper}}.cnvkit.{action}.{{library_name}}" + ).format(action=action) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + log_files = {} + for key, ext in key_ext: + log_files[key] = prefix + ext + log_files[key + "_md5"] = prefix + ext + ".md5" + return log_files def get_resource_usage(self, action): """Get Resource Usage @@ -430,11 +691,12 @@ def get_resource_usage(self, action): """ # Validate action self._validate_action(action) - return ResourceUsage( - threads=4, - time="1-16:00:00", # 1 day and 16 hours - memory=f"{10 * 1024 * 4}M", - ) + if action == "plot": + return self.resource_usage_dict.get("plot") + elif action == "coverage": + return self.resource_usage_dict.get("coverage") + else: + return self.resource_usage_dict.get("default") class ControlFreecSomaticWgsStepPart(SomaticWgsCnvCallingStepPart): @@ -624,11 +886,38 @@ def get_result_files(self): ) # Plots for cnvetti if "cnvkit" in self.config["tools"]: + exts = (".cnr", ".cns", ".bed", ".seg", ".vcf.gz", ".vcf.gz.tbi") yield from self._yield_result_files( tpl, mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], caller="cnvkit", - ext=[".cnr", ".cnr.md5", ".cns", ".cns.md5", ".scatter.png", ".scatter.png.md5"], + ext=exts, + ) + yield from self._yield_result_files( + tpl, + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + caller="cnvkit", + ext=[ext + ".md5" for ext in exts], + ) + reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + yield from self._yield_report_files( + ( + "output/{mapper}.{caller}.{cancer_library.name}/report/" + "{mapper}.{caller}.{cancer_library.name}.{ext}" + ), + [(report, "txt", False) for report in reports], + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + caller="cnvkit", + ) + plots = (("diagram", "pdf", False), ("heatmap", "pdf", True), ("scatter", "png", True)) + yield from self._yield_report_files( + ( + "output/{mapper}.{caller}.{cancer_library.name}/report/" + "{mapper}.{caller}.{cancer_library.name}.{ext}" + ), + plots, + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + caller="cnvkit", ) if "cnvetti" in bcf_tools: for sheet in filter(is_not_background, self.shortcut_sheets): @@ -675,6 +964,50 @@ def _yield_result_files(self, tpl, **kwargs): tpl, cancer_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs ) + def _yield_report_files(self, tpl, exts, **kwargs): + """Build report paths from path template and extension list""" + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) # pragma: no cover + print( + 
msg.format(sample_pair.tumor_sample.name), file=sys.stderr + ) # pragma: no cover + continue # pragma: no cover + for plot, ext, chrom in exts: + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + "." + ext, + **kwargs, + ) + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + "." + ext + ".md5", + **kwargs, + ) + if chrom: + for c in map(str, chain(range(1, 23), ("X", "Y"))): + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + ".chr" + c + "." + ext, + **kwargs, + ) + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + ".chr" + c + "." + ext + ".md5", + **kwargs, + ) + def check_config(self): """Check that the necessary configuration is available for the step""" self.ensure_w_config( @@ -688,7 +1021,3 @@ def check_config(self): "WGS CNV calling" ), ) - self.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "path_somatic_variant_calling"), - "Somatic (small) variant calling tool not configured for somatic WGS CNV calling", - ) diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules new file mode 100644 index 000000000..5e91bd261 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules @@ -0,0 +1,110 @@ +rule somatic_wgs_cnv_calling_cnvkit_coverage: + input: + unpack(wf.get_input_files("cnvkit", "coverage")), + output: + **wf.get_output_files("cnvkit", "coverage"), + threads: wf.get_resource("cnvkit", "coverage", "threads") + resources: + time=wf.get_resource("cnvkit", "coverage", "time"), + memory=wf.get_resource("cnvkit", "coverage", "memory"), + partition=wf.get_resource("cnvkit", "coverage", "partition"), + log: + **wf.get_log_file("cnvkit", "coverage"), + wrapper: + wf.wrapper_path("cnvkit/coverage") + + +rule somatic_wgs_cnv_calling_cnvkit_fix: + input: + unpack(wf.get_input_files("cnvkit", "fix")), + output: + **wf.get_output_files("cnvkit", "fix"), + threads: wf.get_resource("cnvkit", "fix", "threads") + resources: + time=wf.get_resource("cnvkit", "fix", "time"), + memory=wf.get_resource("cnvkit", "fix", "memory"), + partition=wf.get_resource("cnvkit", "fix", "partition"), + log: + **wf.get_log_file("cnvkit", "fix"), + wrapper: + wf.wrapper_path("cnvkit/fix") + + +rule somatic_wgs_cnv_calling_cnvkit_segment: + input: + unpack(wf.get_input_files("cnvkit", "segment")), + output: + **wf.get_output_files("cnvkit", "segment"), + threads: wf.get_resource("cnvkit", "segment", "threads") + resources: + time=wf.get_resource("cnvkit", "segment", "time"), + memory=wf.get_resource("cnvkit", "segment", "memory"), + partition=wf.get_resource("cnvkit", "segment", "partition"), + log: + **wf.get_log_file("cnvkit", "segment"), + wrapper: + wf.wrapper_path("cnvkit/segment") + + +rule somatic_wgs_cnv_calling_cnvkit_call: + input: + unpack(wf.get_input_files("cnvkit", "call")), + output: + **wf.get_output_files("cnvkit", "call"), + threads: wf.get_resource("cnvkit", "call", "threads") + resources: + time=wf.get_resource("cnvkit", "call", "time"), + memory=wf.get_resource("cnvkit", "call", "memory"), + partition=wf.get_resource("cnvkit", "call", "partition"), + log: + **wf.get_log_file("cnvkit", "call"), + wrapper: + wf.wrapper_path("cnvkit/call") + + +rule somatic_wgs_cnv_calling_cnvkit_plot: + input: + unpack(wf.get_input_files("cnvkit", "plot")), + output: + 
**wf.get_output_files("cnvkit", "plot"), + threads: wf.get_resource("cnvkit", "plot", "threads") + resources: + time=wf.get_resource("cnvkit", "plot", "time"), + memory=wf.get_resource("cnvkit", "plot", "memory"), + partition=wf.get_resource("cnvkit", "plot", "partition"), + log: + **wf.get_log_file("cnvkit", "plot"), + wrapper: + wf.wrapper_path("cnvkit/plot") + + +rule somatic_wgs_cnv_calling_cnvkit_export: + input: + unpack(wf.get_input_files("cnvkit", "export")), + output: + **wf.get_output_files("cnvkit", "export"), + threads: wf.get_resource("cnvkit", "export", "threads") + resources: + time=wf.get_resource("cnvkit", "export", "time"), + memory=wf.get_resource("cnvkit", "export", "memory"), + partition=wf.get_resource("cnvkit", "export", "partition"), + log: + **wf.get_log_file("cnvkit", "export"), + wrapper: + wf.wrapper_path("cnvkit/export") + + +rule somatic_wgs_cnv_calling_cnvkit_report: + input: + unpack(wf.get_input_files("cnvkit", "report")), + output: + **wf.get_output_files("cnvkit", "report"), + threads: wf.get_resource("cnvkit", "report", "threads") + resources: + time=wf.get_resource("cnvkit", "report", "time"), + memory=wf.get_resource("cnvkit", "report", "memory"), + partition=wf.get_resource("cnvkit", "report", "partition"), + log: + **wf.get_log_file("cnvkit", "report"), + wrapper: + wf.wrapper_path("cnvkit/report") diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 522beecc4..c72460394 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py access """ from snakemake.shell import shell @@ -7,6 +7,10 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["access"] + +exclude = " --exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" + shell( r""" # Also pipe everything to log file @@ -26,17 +30,29 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- cnvkit.py access \ - -o {snakemake.output} \ + -o {snakemake.output.access} \ + $(if [[ {config[min_gap_size]} -gt 0 ]]; then \ + echo --min-gap-size {config[min_gap_size]} + fi) \ + {exclude} \ {snakemake.config[static_data_config][reference][path]} - -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") +fn=$(basename "{snakemake.output.access}") +d=$(dirname "{snakemake.output.access}") pushd $d md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. 
+shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 126364ad6..2154efe06 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py antitarget """ from snakemake.shell import shell @@ -7,6 +7,11 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +target = snakemake.input.get("target", "") + shell( r""" # Also pipe everything to log file @@ -26,17 +31,39 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- -cnvkit.py antitarget \ - -o {snakemake.output} \ - -g {snakemake.input.access} \ - {snakemake.input.target} +if [[ -n "{config[path_target_regions]}" ]] +then + cnvkit.py antitarget \ + --output {snakemake.output.antitarget} \ + $(if [[ -n "{config[access]}" ]]; then \ + echo --access {config[access]} + fi) \ + $(if [[ {config[antitarget_avg_size]} -gt 0 ]]; then \ + echo --avg-size {config[antitarget_avg_size]} + fi) \ + $(if [[ {config[min_size]} -gt 0 ]]; then \ + echo --min-size {config[min_size]} + fi) \ + {target} +else + touch {snakemake.output.antitarget} +fi -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") +fn=$(basename "{snakemake.output.antitarget}") +d=$(dirname "{snakemake.output.antitarget}") pushd $d md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. 
+shell(
+    r"""
+md5sum {snakemake.log.log} >{snakemake.log.log_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
index c935ebff2..1630d2422 100644
--- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
@@ -7,6 +7,19 @@
 __author__ = "Manuel Holtgrewe"
 __email__ = "manuel.holtgrewe@bih-charite.de"

+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["cnvkit"]
+
+center = config["center"]
+if center:
+    if center in ("mean", "median", "mode", "biweight"):
+        center = " --center " + center
+    else:
+        center = " --center-at " + center
+
+gender = " --gender {}".format(config["gender"]) if config["gender"] else ""
+male = " --male-reference" if config["male_reference"] else ""
+
 shell(
     r"""
 # Also pipe everything to log file
@@ -26,14 +39,35 @@
 md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
 md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}

+set -x
+
 # -----------------------------------------------------------------------------

-cnvkit.py call --filter ampdel {snakemake.input} -o {snakemake.output}
+cnvkit.py call \
+    --output {snakemake.output.calls} \
+    --method {config[calling_method]} \
+    --thresholds={config[call_thresholds]} \
+    $(if [[ "{config[filter]}" ]]; then \
+        echo --filter {config[filter]}
+    fi) \
+    {center} {gender} {male} \
+    --ploidy {config[ploidy]} \
+    $(if [[ {config[purity]} -gt 0 ]]; then \
+        echo --purity {config[purity]}
+    fi) \
+    {snakemake.input}

-d=$(dirname "{snakemake.output}")
+d=$(dirname "{snakemake.output.calls}")
 pushd $d
-fn=$(basename "{snakemake.output}")
+fn=$(basename "{snakemake.output.calls}")
 md5sum $fn > $fn.md5
 popd
 """
 )
+
+# Compute MD5 sums of logs.
+shell(
+    r"""
+md5sum {snakemake.log.log} >{snakemake.log.log_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
index fc294b99b..47fa87535 100644
--- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Wrapper vor cnvkit.py coverage
+"""Wrapper for cnvkit.py coverage
 """

 from snakemake.shell import shell
@@ -7,6 +7,26 @@
 __author__ = "Manuel Holtgrewe"
 __email__ = "manuel.holtgrewe@bih-charite.de"

+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["cnvkit"]
+
+# During panel_of_normals step, the target regions are created by the target substep.
+# During somatic CNV calling (both exome & wgs), the target regions are obtained from the configuration +if "target" in snakemake.input.keys(): + target = snakemake.input.target +elif "path_target" in config.keys(): + target = config["path_target"] +else: + raise Exception("Unsupported naming") + +# Same for antitarget regions +if "antitarget" in snakemake.input.keys(): + antitarget = snakemake.input.antitarget +elif "path_antitarget" in config.keys(): + antitarget = config["path_antitarget"] +else: + raise Exception("Unsupported naming") + shell( r""" # Also pipe everything to log file @@ -26,17 +46,45 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + +# Function definitions --------------------------------------------------------- + +coverage() +{{ + cnvkit.py coverage \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + --min-mapq {config[min_mapq]} \ + --processes {snakemake.threads} \ + {snakemake.input.bam} \ + --output $2 $1 +}} + +md5() +{{ + set -x + + fn=$1 + f=$(basename $fn) + d=$(dirname $fn) + pushd $d + md5sum $f > $f.md5 + popd +}} + # ----------------------------------------------------------------------------- -cnvkit.py coverage {snakemake.input.bam} {snakemake.input.target} -o {snakemake.output.target} -cnvkit.py coverage {snakemake.input.bam} {snakemake.input.antitarget} -o {snakemake.output.antitarget} +coverage {target} {snakemake.output.target} +md5 {snakemake.output.target} + +coverage {antitarget} {snakemake.output.antitarget} +md5 {snakemake.output.antitarget} +""" +) -d=$(dirname "{snakemake.output.target}") -pushd $d -fn=$(basename "{snakemake.output.target}") -md5sum $fn > $fn.md5 -fn=$(basename "{snakemake.output.antitarget}") -md5sum $fn > $fn.md5 -popd +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/environment.yaml b/snappy_wrappers/wrappers/cnvkit/environment.yaml index 3e2333136..5b6fd5393 100644 --- a/snappy_wrappers/wrappers/cnvkit/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/environment.yaml @@ -3,6 +3,6 @@ channels: - bioconda - r dependencies: -- python =3.6 +- python >3.7 - cnvkit - htslib diff --git a/snappy_wrappers/wrappers/cnvkit/export/wrapper.py b/snappy_wrappers/wrappers/cnvkit/export/wrapper.py index 8a6cc0deb..38b513d12 100644 --- a/snappy_wrappers/wrappers/cnvkit/export/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/export/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py export +"""Wrapper for cnvkit.py export """ from snakemake.shell import shell @@ -26,6 +26,8 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + export TMPDIR=$(mktemp -d) cnvkit.py export bed {snakemake.input} -o {snakemake.output.bed} @@ -46,6 +48,14 @@ md5sum $fn > $fn.md5 fn=$(basename "{snakemake.output.vcf}") md5sum $fn > $fn.md5 +md5sum $fn.tbi > $fn.tbi.md5 popd """ ) + +# Compute MD5 sums of logs. 
+shell(
+    r"""
+md5sum {snakemake.log.log} >{snakemake.log.log_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py
index 3b4391d24..f3e3cde0f 100644
--- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Wrapper vor cnvkit.py fix
+"""Wrapper for cnvkit.py fix
 """

 from snakemake.shell import shell
@@ -7,6 +7,22 @@
 __author__ = "Manuel Holtgrewe"
 __email__ = "manuel.holtgrewe@bih-charite.de"

+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["cnvkit"]
+
+if "ref" in snakemake.input.keys():
+    ref = snakemake.input.ref
+elif "path_panel_of_normals" in config.keys():
+    ref = config["path_panel_of_normals"]
+else:
+    raise Exception("Unsupported naming")
+
+gender = " --gender {}".format(config["gender"]) if config["gender"] else ""
+male = " --male-reference" if config["male_reference"] else ""
+no_gc = " --no-gc" if not config["gc_correction"] else ""
+no_edge = " --no-edge" if not config["edge_correction"] else ""
+no_rmask = " --no-rmask" if not config["rmask_correction"] else ""
+
 shell(
     r"""
 # Also pipe everything to log file
@@ -26,18 +42,28 @@
 md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
 md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}

+set -x
+
 # -----------------------------------------------------------------------------

 cnvkit.py fix \
+    --output {snakemake.output.ratios} \
+    {gender} {male} {no_gc} {no_edge} {no_rmask} \
     {snakemake.input.target} \
     {snakemake.input.antitarget} \
-    {snakemake.input.ref} \
-    -o {snakemake.output}
+    {ref}

-d=$(dirname "{snakemake.output}")
+d=$(dirname "{snakemake.output.ratios}")
 pushd $d
-fn=$(basename "{snakemake.output}")
+fn=$(basename "{snakemake.output.ratios}")
 md5sum $fn > $fn.md5
 popd
 """
 )
+
+# Compute MD5 sums of logs.
+shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py index fffa95c46..83f52375c 100644 --- a/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py plot +"""Wrapper for cnvkit.py plot """ from snakemake.shell import shell @@ -7,19 +7,24 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" -mapper = snakemake.wildcards.mapper -if "library_name" in snakemake.wildcards.keys(): - libname = snakemake.wildcards.library_name - tool_and_mapper = mapper + ".cnvkit.plot" - make_per_chr_plots = "true" - opt_filetype = "pdf" -elif "cancer_library" in snakemake.wildcards.keys(): - libname = snakemake.wildcards.cancer_library - tool_and_mapper = mapper + ".control_freec" - make_per_chr_plots = "false" - opt_filetype = "png" -else: - raise Exception("Unsupported naming") +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" + +heatmaps = [ + snakemake.output.get(x) + for x in filter( + lambda x: x.startswith("heatmap_chr") and not x.endswith("_md5"), snakemake.output.keys() + ) +] +scatters = [ + snakemake.output.get(x) + for x in filter( + lambda x: x.startswith("scatter_chr") and not x.endswith("_md5"), snakemake.output.keys() + ) +] shell( r""" @@ -40,42 +45,94 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} -# ----------------------------------------------------------------------------- - -unset DISPLAY +set -x -d="work/{tool_and_mapper}.{libname}/out" +# ----------------------------------------------------------------------------- -fn="{tool_and_mapper}.{libname}.diagram.pdf" -cnvkit.py diagram -s {snakemake.input.cns} {snakemake.input.cnr} -o $d/$fn -pushd $d ; md5sum $fn > $fn.md5 ; popd +md5() +{{ + d=$(dirname $1) + f=$(basename $1) + pushd $d + md5sum $f > $f.md5 + popd +}} -fn="{tool_and_mapper}.{libname}.scatter.{opt_filetype}" -cnvkit.py scatter -s {snakemake.input.cns} {snakemake.input.cnr} -o $d/$fn -pushd $d ; md5sum $fn > $fn.md5 ; popd +# ----------------------------------------------------------------------------- -fn="{tool_and_mapper}.{libname}.heatmap.{opt_filetype}" -cnvkit.py heatmap {snakemake.input.cnr} -o $d/$fn -pushd $d ; md5sum $fn > $fn.md5 ; popd +unset DISPLAY -if [ "{make_per_chr_plots}" = true ] ; then - chroms=$(tail -n +2 "{snakemake.input.cnr}" | cut -f 1 | sort | uniq | grep -E "^(chr)?([0-9]+|X|Y)") +if [[ -n "{snakemake.output.diagram}" ]] +then + cnvkit.py diagram \ + --output {snakemake.output.diagram} \ + --segment {snakemake.input.cns} \ + {gender} {male} \ + --threshold {config[diagram_threshold]} --min-probes {config[diagram_min_probes]} \ + $(if [[ "{config[shift_xy]}" = "False" ]]; then \ + echo --no-shift-xy + fi) \ + {snakemake.input.cnr} +else + touch {snakemake.output.diagram} +fi +md5 {snakemake.output.diagram} - for chr in $chroms ; do - c=$(echo "$chr" | sed -e "s/^chr//") - fn="{tool_and_mapper}.{libname}.scatter.chr${{c}}.{opt_filetype}" - cnvkit.py scatter -s {snakemake.input.cns} --chromosome $chr {snakemake.input.cnr} -o $d/$fn - pushd $d ; md5sum $fn > $fn.md5 ; popd - done +if [[ -n 
"{snakemake.output.scatter}" ]] +then + cnvkit.py scatter \ + --output {snakemake.output.scatter} \ + --segment {snakemake.input.cns} \ + {gender} {male} \ + {snakemake.input.cnr} +else + touch {snakemake.output.scatter} +fi +md5 {snakemake.output.scatter} - for chr in $chroms ; do - c=$(echo "$chr" | sed -e "s/^chr//") - fn="{tool_and_mapper}.{libname}.heatmap.chr${{c}}.{opt_filetype}" - cnvkit.py heatmap --chromosome $chr {snakemake.input.cnr} -o $d/$fn - pushd $d ; md5sum $fn > $fn.md5 ; popd - done +if [[ -n "{snakemake.output.heatmap}" ]] +then + cnvkit.py heatmap \ + --output {snakemake.output.heatmap} \ + {snakemake.input.cnr} else - echo "Not creating per-chromosome heatmap or scatter plots." + touch {snakemake.output.heatmap} fi +md5 {snakemake.output.heatmap} + +for heatmap in {heatmaps} +do + if [[ -n "$heatmap" ]] + then + cnvkit.py heatmap \ + --output $heatmap \ + {snakemake.input.cnr} + else + touch $heatmap + fi + md5 $heatmap +done + +for scatter in {scatters} +do + if [[ -n "$scatter" ]] + then + cnvkit.py scatter \ + --output $scatter \ + --segment {snakemake.input.cns} \ + {gender} {male} \ + {snakemake.input.cnr} + else + touch $scatter + fi + md5 $scatter +done +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 9ee0f97c8..987987df2 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py reference """ from snakemake.shell import shell @@ -10,6 +10,21 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +# NOTE: snakemake.input.target and snakemake.input.antitarget contain +# the output of target & antitarget substeps when there is no bam files +# the bam files lists when the list of normals is not empty + +cluster = ( + " --cluster --min-cluster-size {}".format(config["min_cluster_size"]) + if config["min_cluster_size"] > 0 + else "" +) +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" +no_gc = " --no-gc" if not config["gc_correction"] else "" +no_edge = " --no-edge" if not config["edge_correction"] or not config["path_target_regions"] else "" +no_rmask = " --no-rmask" if not config["rmask_correction"] else "" + shell( r""" # Also pipe everything to log file @@ -29,40 +44,36 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- -if [[ "{config[normals_reference]}" == "" ]] +if [[ "{snakemake.params.args[flat]}" = "True" ]] then - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - mkdir $TMPDIR/out - - cnvkit.py coverage \ - -o $TMPDIR/out/target.cnn \ - {snakemake.input.bam} \ - {snakemake.input.target} - - cnvkit.py coverage \ - -o $TMPDIR/out/antitarget.cnn \ - {snakemake.input.bam} \ - {snakemake.input.antitarget} - cnvkit.py reference \ - -o {snakemake.output} \ - -f {snakemake.config[static_data_config][reference][path]} \ - -t $TMPDIR/out/target.cnn \ - -a $TMPDIR/out/antitarget.cnn - - rm -rf $TMPDIR + --output {snakemake.output.panel} \ + --fasta 
{snakemake.config[static_data_config][reference][path]} \
+        {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \
+        --targets {snakemake.input.target} --antitargets {snakemake.input.antitarget}
 else
-    r=$(realpath --relative-to=$d {config[normals_reference]})
-    ln -s $r {snakemake.output}
+    cnvkit.py reference \
+        --output {snakemake.output.panel} \
+        --fasta {snakemake.config[static_data_config][reference][path]} \
+        {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \
+        {snakemake.input.target} {snakemake.input.antitarget}
 fi

-fn=$(basename "{snakemake.output}")
-d=$(dirname "{snakemake.output}")
+fn=$(basename "{snakemake.output.panel}")
+d=$(dirname "{snakemake.output.panel}")
 pushd $d
 md5sum $fn > $fn.md5
 popd
 """
 )
+
+# Compute MD5 sums of logs.
+shell(
+    r"""
+md5sum {snakemake.log.log} >{snakemake.log.log_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/cnvkit/report/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/wrapper.py
index 29d619e50..010b588b7 100644
--- a/snappy_wrappers/wrappers/cnvkit/report/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/report/wrapper.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Wrapper vor cnvkit.py report
+"""Wrapper for cnvkit.py report
 """

 from snakemake.shell import shell
@@ -7,6 +7,38 @@
 __author__ = "Manuel Holtgrewe"
 __email__ = "manuel.holtgrewe@bih-charite.de"

+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["cnvkit"]
+
+for param in (
+    "breaks_min_probes",
+    "genemetrics_min_probes",
+    "genemetrics_threshold",
+    "drop_low_coverage",
+    "genemetrics_alpha",
+    "genemetrics_bootstrap",
+    "segmetrics_alpha",
+    "segmetrics_bootstrap",
+    "smooth_bootstrap",
+):
+    if param not in config.keys():
+        config[param] = ""
+
+gender = " --gender {}".format(config["gender"]) if config["gender"] else ""
+male = " --male-reference" if config["male_reference"] else ""
+
+input_target = snakemake.input.get("target", "")
+input_antitarget = snakemake.input.get("antitarget", "")
+input_cnr = snakemake.input.get("cnr", "")
+input_cns = snakemake.input.get("cns", "")
+
+output_breaks = snakemake.output.get("breaks", "")
+output_genemetrics = snakemake.output.get("genemetrics", "")
+
+output_segmetrics = snakemake.output.get("segmetrics", "")
+output_sex = snakemake.output.get("sex", "")
+output_metrics = snakemake.output.get("metrics", "")
+
 shell(
     r"""
 # Also pipe everything to log file
@@ -26,47 +58,137 @@
 md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
 md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}

+set -x
+
 # -----------------------------------------------------------------------------

-cnvkit.py breaks {snakemake.input.cnr} {snakemake.input.cns} \
-> {snakemake.output.breaks}
-
-cnvkit.py gainloss {snakemake.input.cnr} -s {snakemake.input.cns} \
-> {snakemake.output.gainloss}
-
-cnvkit.py gender {snakemake.input.cnr} \
-> {snakemake.output.gender}
-
-cnvkit.py metrics {snakemake.input.cnr} -s {snakemake.input.cns} \
-> {snakemake.output.metrics}
-
-cnvkit.py segmetrics {snakemake.input.cnr} \
-    --mean \
-    --median \
-    --mode \
-    --stdev \
-    --sem \
-    --mad \
-    --mse \
-    --iqr \
-    --bivar \
-    --ci \
-    --pi \
-    -s {snakemake.input.cns} \
-    -o {snakemake.output.segmetrics}
-
-d=$(dirname "{snakemake.output.breaks}")
-pushd $d
-fn=$(basename "{snakemake.output.breaks}")
-md5sum $fn > $fn.md5
-fn=$(basename "{snakemake.output.gainloss}")
-md5sum $fn > $fn.md5
-fn=$(basename "{snakemake.output.gender}")
-md5sum $fn > $fn.md5
-fn=$(basename "{snakemake.output.metrics}")
-md5sum $fn > $fn.md5
-fn=$(basename "{snakemake.output.segmetrics}")
-md5sum $fn > $fn.md5
-popd
+md5()
+{{
+    d=$(dirname $1)
+    f=$(basename $1)
+    pushd $d
+    md5sum $f > $f.md5
+    popd
+}}
+
+# -----------------------------------------------------------------------------
+
+if [[ -n "{output_breaks}" ]]
+then
+    if [[ -n "{input_cnr}" ]] && [[ -n "{input_cns}" ]]
+    then
+        cnvkit.py breaks \
+            --output {output_breaks} \
+            --min-probes {config[breaks_min_probes]} \
+            {input_cnr} {input_cns}
+    else
+        touch {output_breaks}
+    fi
+    md5 {output_breaks}
+fi
+
+if [[ -n "{output_genemetrics}" ]]
+then
+    if [[ -n "{input_cnr}" ]] && [[ -n "{input_cns}" ]]
+    then
+        cnvkit.py genemetrics \
+            --output {output_genemetrics} \
+            --segment {input_cns} \
+            --min-probes {config[genemetrics_min_probes]} --threshold {config[genemetrics_threshold]} \
+            $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \
+                echo --drop-low-coverage
+            fi) \
+            {gender} {male} \
+            --mean --median --mode --ttest --stdev --sem --mad --mse --iqr --bivar --ci --pi \
+            --alpha {config[genemetrics_alpha]} --bootstrap {config[genemetrics_bootstrap]} \
+            {input_cnr}
+    else
+        touch {output_genemetrics}
+    fi
+    md5 {output_genemetrics}
+fi
+
+if [[ -n "{output_segmetrics}" ]]
+then
+    if [[ -n "{input_cnr}" ]] && [[ -n "{input_cns}" ]]
+    then
+        cnvkit.py segmetrics \
+            --output {output_segmetrics} \
+            --segment {input_cns} \
+            --mean --median --mode --t-test --stdev --sem --mad --mse --iqr --bivar --ci --pi \
+            --alpha {config[segmetrics_alpha]} --bootstrap {config[segmetrics_bootstrap]} \
+            $(if [[ "{config[smooth_bootstrap]}" = "True" ]]; then \
+                echo --smooth-bootstrap
+            fi) \
+            $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \
+                echo --drop-low-coverage
+            fi) \
+            {input_cnr}
+    else
+        touch {output_segmetrics}
+    fi
+    md5 {output_segmetrics}
+fi
+
+if [[ -n "{output_sex}" ]]
+then
+    if [[ -n "{input_target}" || -n "{input_antitarget}" || -n "{input_cnr}" || -n "{input_cns}" ]]
+    then
+        cnvkit.py sex \
+            --output {output_sex} \
+            $(if [[ -n "{male}" ]]; then \
+                echo --male-reference
+            fi) \
+            $(if [[ -n "{input_target}" ]]; then \
+                echo {input_target}
+            fi) \
+            $(if [[ -n "{input_antitarget}" ]]; then \
+                echo {input_antitarget}
+            fi) \
+            $(if [[ -n "{input_cnr}" ]]; then \
+                echo {input_cnr}
+            fi) \
+            $(if [[ -n "{input_cns}" ]]; then \
+                echo {input_cns}
+            fi)
+    else
+        touch {output_sex}
+    fi
+    md5 {output_sex}
+fi
+
+if [[ -n "{output_metrics}" ]]
+then
+    if [[ -n "{input_target}" || -n "{input_antitarget}" || -n "{input_cnr}" || -n "{input_cns}" ]]
+    then
+        cnvkit.py metrics \
+            --output {output_metrics} \
+            $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \
+                echo --drop-low-coverage
+            fi) \
+            $(if [[ -n "{input_target}" ]]; then \
+                echo {input_target}
+            fi) \
+            $(if [[ -n "{input_antitarget}" ]]; then \
+                echo {input_antitarget}
+            fi) \
+            $(if [[ -n "{input_cnr}" ]]; then \
+                echo {input_cnr}
+            fi) \
+            $(if [[ -n "{input_cns}" ]]; then \
+                echo --segments {input_cns}
+            fi)
+    else
+        touch {output_metrics}
+    fi
+    md5 {output_metrics}
+fi
+"""
+)
+
+# Compute MD5 sums of logs.
+shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 99b1acdce..201ebb2dd 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -10,6 +10,15 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +method = config["segmentation_method"] +if method == "cbs" and config["smooth_cbs"]: + method += " --smooth-cbs" + +if float(config["segmentation_threshold"]) > 0: + threshold = " --threshold " + str(config["segmentation_threshold"]) +else: + threshold = "" + shell( r""" # Also pipe everything to log file @@ -29,16 +38,31 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- cnvkit.py segment \ - -m {config[seg_method]} \ - {snakemake.input} -o {snakemake.output} + --output {snakemake.output.segments} \ + --method {method} \ + $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \ + echo --drop-low-coverage + fi) \ + {threshold} \ + --drop-outliers {config[drop_outliers]} \ + {snakemake.input} -d=$(dirname "{snakemake.output}") +d=$(dirname "{snakemake.output.segments}") pushd $d -fn=$(basename "{snakemake.output}") +fn=$(basename "{snakemake.output.segments}") md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index 95ab0d6e9..f15c0bcf0 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py target """ from snakemake.shell import shell @@ -10,6 +10,8 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +bams = " ".join(snakemake.input.get("bams", [""])) + shell( r""" # Also pipe everything to log file @@ -29,23 +31,79 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} -export TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" EXIT +set -x + +# ----------------------------------------------------------------------------- + +access() +{{ + cnvkit.py access \ + -o $tmpdir/access.bed \ + {snakemake.config[static_data_config][reference][path]} +}} # ----------------------------------------------------------------------------- -zcat {config[path_target_regions]} > $TMPDIR/regions.bed +target="{config[path_target_regions]}" +target_avg_size={config[target_avg_size]} + +if [[ -z "$target" ]] && [[ $target_avg_size -eq 0 ]] +then + tmpdir=$(mktemp -d) + + if [[ -n "{bams}" ]] + then + access + cnvkit.py autobin --method wgs \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + --access $tmpdir/access.bed \ + --bp-per-bin {config[bp_per_bin]} \ + --target-output-bed $tmpdir/target.bed --antitarget-output-bed $tmpdir/antitarget.bed \ + {bams} > $tmpdir/autobin.txt + target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) + + if [[ -z "{config[access]}" ]] + then + target=$tmpdir/access.bed + else + target="{config[access]}" + fi + else + if [[ -z 
"{config[access]}" ]] + then + access + target=$tmpdir/access.bed + else + target="{config[access]}" + fi + target_avg_size=5000 + fi +fi cnvkit.py target \ - --short-names \ - --split \ - -o {snakemake.output} \ - $TMPDIR/regions.bed + --output {snakemake.output.target} \ + $(if [[ -n "{config[annotate]}" ]]; then \ + echo --short-names --annotate {config[annotate]} + fi) \ + $(if [[ "{config[split]}" = "True" ]]; then \ + echo --split + fi) \ + $(if [[ $target_avg_size -gt 0 ]]; then \ + echo --avg-size $target_avg_size + fi) \ + $target -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") +fn=$(basename "{snakemake.output.target}") +d=$(dirname "{snakemake.output.target}") pushd $d md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index ebb50ebe0..76835beb4 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -38,9 +38,14 @@ def minimal_config(): path_index: /path/to/bwa/index.fa panel_of_normals: - tools: ['mutect2'] + tools: ['mutect2', 'cnvkit', 'access'] mutect2: germline_resource: /path/to/germline_resource.vcf + path_normals_list: "" + cnvkit: + path_excluded_regions: "" + path_target_regions: /path/to/regions.bed # WGS mode + path_normals_list: "" data_sets: first_batch: @@ -109,8 +114,8 @@ def test_mutect2_step_part_get_input_files_create_panel(panel_of_normals_workflo ) expected = { "normals": [ - "work/bwa.mutect2.prepare_panel/out/P001-N1-DNA1-WGS1.vcf.gz", - "work/bwa.mutect2.prepare_panel/out/P002-N1-DNA1-WGS1.vcf.gz", + "work/bwa.mutect2/out/bwa.mutect2.P001-N1-DNA1-WGS1.prepare.vcf.gz", + "work/bwa.mutect2/out/bwa.mutect2.P002-N1-DNA1-WGS1.prepare.vcf.gz", ], } actual = panel_of_normals_workflow.get_input_files("mutect2", "create_panel")(wildcards) @@ -119,13 +124,11 @@ def test_mutect2_step_part_get_input_files_create_panel(panel_of_normals_workflo def test_mutect2_step_part_get_output_files_prepare_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_output_files_prepare_panel()""" - # TODO: Potential extension error in output files, `vcf.tbi.gz` instead of `vcf.gz.tbi`. 
- # TODO: If confirmed, create expected using `get_expected_output_vcf_files_dict()` expected = { - "vcf": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz", - "vcf_md5": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.md5", - "vcf_tbi": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.tbi", - "vcf_tbi_md5": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.tbi.md5", + "vcf": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz", + "vcf_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.md5", + "vcf_tbi": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi", + "vcf_tbi_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi.md5", } actual = panel_of_normals_workflow.get_output_files("mutect2", "prepare_panel") assert actual == expected @@ -133,7 +136,7 @@ def test_mutect2_step_part_get_output_files_prepare_panel(panel_of_normals_workf def test_mutect2_step_part_get_output_files_create_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_output_files_create_panel()""" - base_name_out = "work/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals" + base_name_out = "work/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals" expected = get_expected_output_vcf_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_output_files("mutect2", "create_panel") assert actual == expected @@ -141,7 +144,7 @@ def test_mutect2_step_part_get_output_files_create_panel(panel_of_normals_workfl def test_mutect2_step_part_get_log_file_prepare_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_log_files_prepare_panel()""" - base_name_out = "work/{mapper}.mutect2.prepare_panel/log/{normal_library}" + base_name_out = "work/{mapper}.mutect2/log/{mapper}.mutect2.{normal_library}.prepare" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("mutect2", "prepare_panel") assert actual == expected @@ -149,7 +152,7 @@ def test_mutect2_step_part_get_log_file_prepare_panel(panel_of_normals_workflow) def test_mutect2_step_part_get_log_file_create_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals" + base_name_out = "work/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("mutect2", "create_panel") assert actual == expected @@ -170,7 +173,6 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "8G", "partition": "medium", } - run_expected_dict = {"threads": 1, "time": "01:00:00", "memory": "2G", "partition": "medium"} # Evaluate action `create_panel` for resource, expected in create_panel_expected_dict.items(): @@ -185,37 +187,350 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): assert actual == expected, msg_error +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) + assert actual == {} + + +def 
test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_antitarget()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    expected = {
+        "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed",
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_coverage()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+            "normal_library": "P001-N1-DNA1-WGS1",
+        }
+    )
+    expected = {
+        "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
+        "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai",
+        "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed",
+        "antitarget": "work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed",
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_create_panel()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    expected = {
+        "target": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn",
+        ],
+        "antitarget": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn",
+        ],
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_report()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    expected = {
+        "target": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn",
+        ],
+        "antitarget": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn",
+        ],
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_target()"""
+    expected = {
+        "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed",
+        "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "target")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_antitarget()"""
+    expected = {
+        "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed",
+        "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_coverage()"""
+    expected = {
+        "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn",
+        "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5",
+        "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn",
+        "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_create_panel()"""
+    expected = {
+        "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn",
+        "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_report()"""
+    expected = {
+        "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv",
+        "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5",
+        "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv",
+        "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "report")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_target()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "target")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_antitarget()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_coverage()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_create_panel()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_report()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "report")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart.get_resource_usage()"""
+    # Define expected: default defined in workflows.abstract
+    target_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "8G",
+        "partition": "medium",
+    }
+    antitarget_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "8G",
+        "partition": "medium",
+    }
+    coverage_expected_dict = {
+        "threads": 8,
+        "time": "02:00:00",
+        "memory": "16G",
+        "partition": "medium",
+    }
+    reference_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "16G",
+        "partition": "medium",
+    }
+    report_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "16G",
+        "partition": "medium",
+    }
+
+    # Evaluate action `target`
+    for resource, expected in target_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'target'."
+        actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)
+        assert actual == expected, msg_error
+
+    # Evaluate action `antitarget`
+    for resource, expected in antitarget_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'."
+        actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)
+        assert actual == expected, msg_error
+
+    # Evaluate action `coverage`
+    for resource, expected in coverage_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'coverage'."
+        actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)
+        assert actual == expected, msg_error
+
+    # Evaluate action `create_panel`
+    for resource, expected in reference_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'."
+        actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)
+        assert actual == expected, msg_error
+
+    # Evaluate action `report`
+    for resource, expected in report_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'report'."
+        actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)
+        assert actual == expected, msg_error
+
+
+# Tests for AccessStepPart -------------------------------------------------------------------------
+
+
+def test_access_step_part_get_input_files_run(panel_of_normals_workflow):
+    """Tests AccessStepPart._get_input_files_run()"""
+    assert panel_of_normals_workflow.get_input_files("access", "run") is None
+
+
+def test_access_step_part_get_output_files_run(panel_of_normals_workflow):
+    """Tests AccessStepPart._get_output_files_run()"""
+    expected = {
+        "access": "work/cnvkit.access/out/cnvkit.access.bed",
+        "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("access", "run")
+    assert actual == expected
+
+
+def test_access_step_part_get_log_file_run(panel_of_normals_workflow):
+    """Tests AccessStepPart._get_log_file_run()"""
+    expected = get_expected_log_files_dict(base_out="work/cnvkit.access/log/cnvkit.access")
+    actual = panel_of_normals_workflow.get_log_file("access", "run")
+    assert actual == expected
+
+
+def test_access_step_part_get_resource_usage(panel_of_normals_workflow):
+    """Tests AccessStepPart.get_resource_usage()"""
+    # Define expected: default defined in workflows.abstract
+    expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "8G",
+        "partition": "medium",
+    }
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'run'."
+ actual = panel_of_normals_workflow.get_resource("access", "run", resource) + assert actual == expected, msg_error + + # PanelOfNormalsWorkflow -------------------------------------------------------------------------- def test_panel_of_normals_workflow(panel_of_normals_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["link_out", "mutect2"] + expected = ["access", "cnvkit", "link_out", "mutect2"] actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) assert actual == expected # Check result file construction - tpl = "output/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals.{ext}" - expected = [ + expected = [] + tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" + expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") for mapper in ("bwa",) ] # add log files - tpl = "output/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals.{ext}" + tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals" + for mapper in ("bwa",): + expected += get_expected_log_files_dict(base_out=tpl.format(mapper=mapper)).values() + + # Now for basic cnvkit files (panel of normal only) + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" expected += [ - tpl.format(mapper=mapper, ext=ext) - for ext in ( - "conda_info.txt", - "conda_list.txt", - "log", - "conda_info.txt.md5", - "conda_list.txt.md5", - "log.md5", - ) - for mapper in ("bwa",) + tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) ] + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("target", "antitarget"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("bed", "bed.md5") + for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("sex", "metrics"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("tsv", "tsv.md5") + for mapper in ("bwa",) + ] + # add log files + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" + for substep in ("target", "antitarget", "panel_of_normals", "report"): + for mapper in ("bwa",): + expected += get_expected_log_files_dict( + base_out=tpl.format(mapper=mapper, substep=substep) + ).values() + + # Access + tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" + expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] + expected += get_expected_log_files_dict( + base_out="output/cnvkit.access/log/cnvkit.access" + ).values() + expected = list(sorted(expected)) actual = list(sorted(panel_of_normals_workflow.get_result_files())) assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py new file mode 100644 index 000000000..2d785cb67 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py @@ -0,0 +1,380 @@ +# -*- coding: utf-8 -*- +"""Tests for the panel_of_normals workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.panel_of_normals import PanelOfNormalsWorkflow + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + + +@pytest.fixture(scope="module") # otherwise: 
performance issues +def minimal_config(): + """Return YAML parsing result for (cancer) configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + compute_coverage_bed: true + path_target_regions: /path/to/regions.bed + bwa: + path_index: /path/to/bwa/index.fa + + panel_of_normals: + tools: ['cnvkit'] + cnvkit: + path_excluded_regions: "" + path_target_regions: "" # WGS mode + path_normals_list: "" + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def panel_of_normals_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} + # Construct the workflow object + return PanelOfNormalsWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "bams": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", + ], + "bais": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_antitarget()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) + assert actual == {} + + +def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "normal_library": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", + "antitarget": "work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) + assert actual == expected + + +def 
test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_create_panel()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    expected = {
+        "target": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn",
+        ],
+        "antitarget": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn",
+        ],
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_report()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    expected = {
+        "target": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn",
+        ],
+        "antitarget": [
+            "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn",
+            "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn",
+        ],
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_target()"""
+    expected = {
+        "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed",
+        "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "target")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_antitarget()"""
+    expected = {
+        "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed",
+        "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_coverage()"""
+    expected = {
+        "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn",
+        "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5",
+        "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn",
+        "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_create_panel()"""
+    expected = {
+        "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn",
+        "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_output_files_report()"""
+    expected = {
+        "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv",
+        "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5",
+        "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv",
+        "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5",
+    }
+    actual = panel_of_normals_workflow.get_output_files("cnvkit", "report")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_target()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "target")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_antitarget()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_coverage()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_create_panel()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_log_files_report()"""
+    base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    actual = panel_of_normals_workflow.get_log_file("cnvkit", "report")
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow):
+    """Tests CnvkitStepPart.get_resource_usage()"""
+    # Define expected: default defined in workflows.abstract
+    target_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "8G",
+        "partition": "medium",
+    }
+    antitarget_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "8G",
+        "partition": "medium",
+    }
+    coverage_expected_dict = {
+        "threads": 8,
+        "time": "02:00:00",
+        "memory": "16G",
+        "partition": "medium",
+    }
+    reference_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "16G",
+        "partition": "medium",
+    }
+    report_expected_dict = {
+        "threads": 2,
+        "time": "02:00:00",
+        "memory": "16G",
+        "partition": "medium",
+    }
+
+    # Evaluate action `target`
+    for resource, expected in target_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'target'."
+        actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)
+        assert actual == expected, msg_error
+
+    # Evaluate action `antitarget`
+    for resource, expected in antitarget_expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'."
+ actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource) + assert actual == expected, msg_error + + # Evaluate action `coverage` + for resource, expected in coverage_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource) + assert actual == expected, msg_error + + # Evaluate action `create_panel` + for resource, expected in reference_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) + assert actual == expected, msg_error + + # Evaluate action `report` + for resource, expected in report_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'report'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource) + assert actual == expected, msg_error + + +# PanelOfNormalsWorkflow -------------------------------------------------------------------------- + + +def test_panel_of_normals_workflow(panel_of_normals_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["access", "cnvkit", "link_out", "mutect2"] + actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) + assert actual == expected + expected = [] + + # Now for basic cnvkit files (panel of normal only) + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" + expected += [ + tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("target", "antitarget"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("bed", "bed.md5") + for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("sex", "metrics"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("tsv", "tsv.md5") + for mapper in ("bwa",) + ] + # add log files + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" + for substep in ("target", "antitarget", "panel_of_normals", "report"): + for mapper in ("bwa",): + expected += get_expected_log_files_dict( + base_out=tpl.format(mapper=mapper, substep=substep) + ).values() + + expected = list(sorted(expected)) + actual = list(sorted(panel_of_normals_workflow.get_result_files())) + assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index 4fae436e5..dd66c9deb 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -46,6 +46,10 @@ def minimal_config(): - cnvetti_on_target - cnvkit - copywriter + cnvkit: + path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn data_sets: first_batch: @@ -270,119 +274,6 @@ def test_cnvetti_on_target_step_part_get_resource_usage(somatic_targeted_seq_cnv assert actual == expected, msg_error -# Tests for 
CnvKitStepPart (access) --------------------------------------------------------------- - - -def test_cnvkit_access_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "access") - assert actual is None - - -def test_cnvkit_access_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/cnvkit.access/out/access.bed" - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "access") == expected - ) - - -def test_cnvkit_access_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - expected = get_expected_log_files_dict(base_out="work/cnvkit.access/log/cnvkit.access") - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "access") - assert actual == expected - - -def test_cnvkit_access_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'access'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "access", resource - ) - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (target) --------------------------------------------------------------- - - -def test_cnvkit_target_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "target")( - wildcards - ) - assert actual == {"access": "work/cnvkit.access/out/access.bed"} - - -def test_cnvkit_target_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/cnvkit.target/out/target.bed" - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "target") == expected - ) - - -def test_cnvkit_target_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - expected = get_expected_log_files_dict(base_out="work/cnvkit.target/log/cnvkit.target") - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_target_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'target'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "target", resource - ) - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (antitarget) ----------------------------------------------------------- - - -def test_cnvkit_antitarget_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "access": "work/cnvkit.access/out/access.bed", - "target": "work/cnvkit.target/out/target.bed", - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "antitarget")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_antitarget_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/cnvkit.antitarget/out/antitarget.bed" - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "antitarget") - == expected - ) - - -def test_cnvkit_antitarget_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - expected = get_expected_log_files_dict(base_out="work/cnvkit.antitarget/log/cnvkit.antitarget") - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_antitarget_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'antitarget'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "antitarget", resource - ) - assert actual == expected, msg_error - - # Tests for CnvKitStepPart (coverage) ------------------------------------------------------------- @@ -391,10 +282,8 @@ def test_cnvkit_coverage_step_part_get_input_files(somatic_targeted_seq_cnv_call fromdict={"mapper": "bwa", "target": "target", "library_name": "P001-T1-DNA1-WGS1"} ) expected = { - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", - "target": "work/cnvkit.target/out/target.bed", } actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "coverage")( wildcards @@ -404,12 +293,12 @@ def test_cnvkit_coverage_step_part_get_input_files(somatic_targeted_seq_cnv_call def test_cnvkit_coverage_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.coverage.{library_name}/out/{mapper}.cnvkit.coverage.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" expected = { "target": base_name_out + ".targetcoverage.cnn", + "target_md5": base_name_out + ".targetcoverage.cnn.md5", "antitarget": base_name_out + ".antitargetcoverage.cnn", + "antitarget_md5": base_name_out + ".antitargetcoverage.cnn.md5", } # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "coverage") @@ -419,7 +308,7 @@ def test_cnvkit_coverage_step_part_get_output_files(somatic_targeted_seq_cnv_cal def test_cnvkit_coverage_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): base_file_name = ( - 
"work/{mapper}.cnvkit.coverage.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" + "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" ) expected = get_expected_log_files_dict(base_out=base_file_name) actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "coverage") @@ -429,7 +318,7 @@ def test_cnvkit_coverage_step_part_get_log_file(somatic_targeted_seq_cnv_calling def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'coverage'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 8, "time": "08:00:00", "memory": "16384M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -439,70 +328,14 @@ def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling assert actual == expected, msg_error -# Tests for CnvKitStepPart (reference) ------------------------------------------------------------ - - -def test_cnvkit_reference_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "reference")( - wildcards - ) - expected = { - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "target": "work/cnvkit.target/out/target.bed", - } - assert actual == expected - - -def test_cnvkit_reference_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = ( - "work/{mapper}.cnvkit.reference.{library_name}/out/" - "{mapper}.cnvkit.reference.{library_name}.cnn" - ) - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "reference") - assert actual == expected - - -def test_cnvkit_reference_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.reference.{library_name}/log/{mapper}.cnvkit.reference.{library_name}" - ) - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "reference") - assert actual == expected - - -def test_cnvkit_reference_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'reference'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "reference", resource - ) - assert actual == expected, msg_error - - # Tests for CnvKitStepPart (fix) ------------------------------------------------------------------ def test_cnvkit_fix_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - coverage_base_out = ( - "work/bwa.cnvkit.coverage.P001-T1-DNA1-WGS1/out/bwa.cnvkit.coverage.P001-T1-DNA1-WGS1" - ) - reference_base_out = ( - "work/bwa.cnvkit.reference.P001-T1-DNA1-WGS1/out/bwa.cnvkit.reference.P001-T1-DNA1-WGS1" - ) + coverage_base_out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1" expected = { "antitarget": coverage_base_out + ".antitargetcoverage.cnn", - "ref": reference_base_out + ".cnn", "target": coverage_base_out + ".targetcoverage.cnn", } # Get actual @@ -512,13 +345,14 @@ def test_cnvkit_fix_step_part_get_input_files(somatic_targeted_seq_cnv_calling_w def test_cnvkit_fix_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/{mapper}.cnvkit.fix.{library_name}/out/{mapper}.cnvkit.fix.{library_name}.cnr" + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" + expected = {"ratios": base_name_out, "ratios_md5": base_name_out + ".md5"} assert somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "fix") == expected def test_cnvkit_fix_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = "work/{mapper}.cnvkit.fix.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "fix") @@ -528,7 +362,7 @@ def test_cnvkit_fix_step_part_get_log_file(somatic_targeted_seq_cnv_calling_work def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
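The coverage and fix hunks above encode the central renaming in this change: every per-library cnvkit file now lives under a single `work/{mapper}.cnvkit.{library_name}/` directory and filename prefix, instead of one `{mapper}.cnvkit.<action>.{library_name}` directory per action. An illustrative sketch (plain Python, not part of the patch) of how that shared prefix resolves for one tumor library from the test sheet:

# Illustration only: expands the shared cnvkit output prefix used in the expectations above.
mapper = "bwa"
library_name = "P001-T1-DNA1-WGS1"  # tumor library from the cancer test sheet
prefix = f"work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}"
files = {
    "target": prefix + ".targetcoverage.cnn",          # written by the `coverage` action
    "antitarget": prefix + ".antitargetcoverage.cnn",  # written by the `coverage` action
    "ratios": prefix + ".cnr",                         # written by the `fix` action
}
assert files["ratios"] == (
    "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"
)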
@@ -541,9 +375,7 @@ def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_work def test_cnvkit_segment_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "cnr": "work/bwa.cnvkit.fix.P001-T1-DNA1-WGS1/out/bwa.cnvkit.fix.P001-T1-DNA1-WGS1.cnr" - } + expected = {"cnr": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"} actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "segment")( wildcards ) @@ -551,30 +383,27 @@ def test_cnvkit_segment_step_part_get_input_files(somatic_targeted_seq_cnv_calli def test_cnvkit_segment_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = ( - "work/{mapper}.cnvkit.segment.{library_name}/out/{mapper}.cnvkit.segment.{library_name}.cns" - ) - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "segment") == expected + base_name_out = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" ) + expected = {"segments": base_name_out, "segments_md5": base_name_out + ".md5"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "segment") + assert actual == expected def test_cnvkit_segment_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.segment.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "segment") - assert actual == expected def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
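Nearly all log-file expectations in these tests delegate to the `get_expected_log_files_dict` helper imported from `.common`. The sketch below approximates what that helper is assumed to return, inferred from the log extensions visible elsewhere in this patch (`log`, `conda_info.txt`, `conda_list.txt`, each with an `.md5` companion); the key names are assumptions and the real helper may differ in detail:

def sketch_expected_log_files_dict(base_out: str) -> dict:
    """Approximation of the shared test helper; keys and extensions are assumed."""
    expected = {}
    for key, ext in (
        ("log", ".log"),
        ("conda_info", ".conda_info.txt"),
        ("conda_list", ".conda_list.txt"),
    ):
        expected[key] = base_out + ext
        expected[key + "_md5"] = base_out + ext + ".md5"
    return expected

# Example: the segment log files expected above.
print(sketch_expected_log_files_dict(
    "work/bwa.cnvkit.P001-T1-DNA1-WGS1/log/bwa.cnvkit.segment.P001-T1-DNA1-WGS1"
))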
@@ -589,9 +418,7 @@ def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_ def test_cnvkit_call_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - segment_file = ( - "work/bwa.cnvkit.segment.P001-T1-DNA1-WGS1/out/bwa.cnvkit.segment.P001-T1-DNA1-WGS1.cns" - ) + segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" expected = {"segment": segment_file} # Get actual wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) @@ -600,17 +427,15 @@ def test_cnvkit_call_step_part_get_input_files(somatic_targeted_seq_cnv_calling_ def test_cnvkit_call_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = ( - "work/{mapper}.cnvkit.call.{library_name}/out/{mapper}.cnvkit.call.{library_name}.cns" - ) - assert somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "call") == expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" + expected = {"calls": base_name_out, "calls_md5": base_name_out + ".md5"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "call") + assert actual == expected def test_cnvkit_call_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.call.{library_name}/log/{mapper}.cnvkit.call.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.call.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "call") @@ -620,7 +445,7 @@ def test_cnvkit_call_step_part_get_log_file(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
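Taken together, the fix, segment and call hunks assign one file suffix per cnvkit action inside the shared output directory; in particular, raw segmentation now lands in `*.segment.cns` while the `call` action keeps the plain `*.cns` name. A compact sketch of that mapping as the tests encode it:

# Per-action output suffixes, as encoded by the expectations above (illustration only).
SUFFIX_BY_ACTION = {
    "coverage": [".targetcoverage.cnn", ".antitargetcoverage.cnn"],
    "fix": [".cnr"],
    "segment": [".segment.cns"],
    "call": [".cns"],
}

def outputs_for(prefix: str, action: str) -> list:
    return [prefix + suffix for suffix in SUFFIX_BY_ACTION[action]]

print(outputs_for("bwa.cnvkit.P001-T1-DNA1-WGS1", "segment"))
# -> ['bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns']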
@@ -633,10 +458,8 @@ def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_plot_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - cnr_file = "work/bwa.cnvkit.fix.P001-T1-DNA1-WGS1/out/bwa.cnvkit.fix.P001-T1-DNA1-WGS1.cnr" - cns_file = ( - "work/bwa.cnvkit.segment.P001-T1-DNA1-WGS1/out/bwa.cnvkit.segment.P001-T1-DNA1-WGS1.cns" - ) + cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" + cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns" expected = { "cnr": cnr_file, "cns": cns_file, @@ -649,17 +472,23 @@ def test_cnvkit_plot_step_part_get_input_files(somatic_targeted_seq_cnv_calling_ def test_cnvkit_plot_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_out = "work/{mapper}.cnvkit.plot.{library_name}/out/{mapper}.cnvkit.plot.{library_name}" - expected = { - "diagram": base_out + ".diagram.pdf", - "heatmap": base_out + ".heatmap.pdf", - "scatter": base_out + ".scatter.pdf", - } - for chrom in chain(range(1, 23), ("X", "Y")): - for diagram in ("heatmap", "scatter"): - tpl = base_out + ".%s.chr%s.pdf" - expected["{}_chr{}".format(diagram, chrom)] = tpl % (diagram, chrom) - + expected = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" + ) + for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")): + expected[plot] = tpl.format(plot=plot, ext=ext) + expected[plot + "_md5"] = expected[plot] + ".md5" + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" + ) + for plot, ext in (("heatmap", "pdf"), ("scatter", "png")): + for chrom in chain(range(1, 23), ("X", "Y")): + key = "{plot}_chr{chrom}".format(plot=plot, chrom=str(chrom)) + expected[key] = tpl.format(plot=plot, ext=ext, chrom=str(chrom)) + expected[key + "_md5"] = expected[key] + ".md5" # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "plot") assert actual == expected @@ -668,7 +497,7 @@ def test_cnvkit_plot_step_part_get_output_files(somatic_targeted_seq_cnv_calling def test_cnvkit_plot_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected expected = get_expected_log_files_dict( - base_out="work/{mapper}.cnvkit.plot.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" + base_out="work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" ) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "plot") @@ -678,7 +507,7 @@ def test_cnvkit_plot_step_part_get_log_file(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "30720M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "08:00:00", "memory": "30720M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
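The plot expectations above rely on a small `str.format` detail: doubled braces such as `{{mapper}}` survive the `.format(...)` call as single-brace Snakemake wildcards, while `{plot}`, `{chrom}` and `{ext}` are substituted immediately. A minimal demonstration:

tpl = (
    "work/{{mapper}}.cnvkit.{{library_name}}/report/"
    "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}"
)
resolved = tpl.format(plot="scatter", chrom="X", ext="png")
assert resolved == (
    "work/{mapper}.cnvkit.{library_name}/report/"
    "{mapper}.cnvkit.{library_name}.scatter.chrX.png"
)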
@@ -691,9 +520,7 @@ def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_export_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "cns": "work/bwa.cnvkit.call.P001-T1-DNA1-WGS1/out/bwa.cnvkit.call.P001-T1-DNA1-WGS1.cns" - } + expected = {"cns": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"} actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "export")( wildcards ) @@ -702,15 +529,11 @@ def test_cnvkit_export_step_part_get_input_files(somatic_targeted_seq_cnv_callin def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.export.{library_name}/out/{mapper}.cnvkit.export.{library_name}" - ) - expected = { - "bed": base_name_out + ".bed", - "seg": base_name_out + ".seg", - "vcf_tbi": base_name_out + ".vcf.gz.tbi", - "vcf": base_name_out + ".vcf.gz", - } + expected = {} + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" + for key, ext in (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")): + expected[key] = base_name_out + "." + ext + expected[key + "_md5"] = expected[key] + ".md5" # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "export") assert actual == expected @@ -718,9 +541,7 @@ def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calli def test_cnvkit_export_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.export.{library_name}/log/{mapper}.cnvkit.export.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.export.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "export") @@ -730,7 +551,7 @@ def test_cnvkit_export_step_part_get_log_file(somatic_targeted_seq_cnv_calling_w def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
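The export and plot expectations repeat the same pattern of pairing every output with an `.md5` checksum entry. A tiny illustrative helper capturing that pattern (this helper does not exist in the test code; it only restates the convention):

def with_md5(files: dict) -> dict:
    """Return a copy of `files` with a '<key>_md5' companion per entry (illustrative only)."""
    out = dict(files)
    for key, path in files.items():
        out[key + "_md5"] = path + ".md5"
    return out

base = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}"
print(with_md5({"bed": base + ".bed", "vcf": base + ".vcf.gz"}))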
@@ -745,13 +566,19 @@ def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_w def test_cnvkit_report_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - cnr_file = "work/bwa.cnvkit.fix.P001-T1-DNA1-WGS1/out/bwa.cnvkit.fix.P001-T1-DNA1-WGS1.cnr" - cns_file = ( - "work/bwa.cnvkit.segment.P001-T1-DNA1-WGS1/out/bwa.cnvkit.segment.P001-T1-DNA1-WGS1.cns" + cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" + cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns" + target_file = ( + "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn" + ) + antitarget_file = ( + "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.antitargetcoverage.cnn" ) expected = { "cnr": cnr_file, "cns": cns_file, + "target": target_file, + "antitarget": antitarget_file, } # Get actual wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) @@ -763,16 +590,11 @@ def test_cnvkit_report_step_part_get_input_files(somatic_targeted_seq_cnv_callin def test_cnvkit_report_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.report.{library_name}/out/{mapper}.cnvkit.report.{library_name}" - ) - expected = { - "breaks": base_name_out + ".breaks.txt", - "gainloss": base_name_out + ".gainloss.txt", - "gender": base_name_out + ".gender.txt", - "metrics": base_name_out + ".metrics.txt", - "segmetrics": base_name_out + ".segmetrics.txt", - } + expected = {} + base_name_out = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}" + for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics"): + expected[report] = base_name_out + "." + report + ".txt" + expected[report + "_md5"] = expected[report] + ".md5" # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "report") assert actual == expected @@ -780,9 +602,7 @@ def test_cnvkit_report_step_part_get_output_files(somatic_targeted_seq_cnv_calli def test_cnvkit_report_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.report.{library_name}/log/{mapper}.cnvkit.report.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.report.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "report") @@ -792,7 +612,7 @@ def test_cnvkit_report_step_part_get_log_file(somatic_targeted_seq_cnv_calling_w def test_cnvkit_report_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
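The report hunk above replaces the old `gainloss` and `gender` outputs with `genemetrics` and `sex`, in line with the report renames in more recent CNVkit releases; the old-to-new pairing below is inferred from the removed and added lines. A short sketch of the report outputs the tests now expect:

RENAMED_REPORTS = {"gainloss": "genemetrics", "gender": "sex"}  # old -> new, inferred from the diff

reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics")
base = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}"
expected = {}
for report in reports:
    expected[report] = f"{base}.{report}.txt"
    expected[report + "_md5"] = expected[report] + ".md5"
assert "gainloss" not in expected and expected["sex"].endswith(".sex.txt")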
@@ -995,49 +815,43 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call ) ] # cnvkit - tpl = ( - "output/bwa.cnvkit.call.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.call.P00{i}-T{t}-DNA1-WGS1.{ext}" - ) + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" expected += [ - tpl.format(i=i, t=t, ext=ext) for i, t in ((1, 1), (2, 1), (2, 2)) for ext in ("cns",) - ] - tpl = ( - "output/bwa.cnvkit.export.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.export.P00{i}-T{t}-DNA1-WGS1.{ext}" - ) - expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, ext=ext, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("bed", "seg", "vcf.gz", "vcf.gz.tbi") + for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") + for md5 in ("", ".md5") ] tpl = ( - "output/bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1.{ext}" + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" ) expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("diagram.pdf", "heatmap.pdf", "scatter.pdf") + for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + for md5 in ("", ".md5") ] tpl = ( - "output/bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1.{diagram}.chr{chrom}.pdf" + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" ) expected += [ - tpl.format(i=i, t=t, diagram=diagram, chrom=chrom) + tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for diagram in ("heatmap", "scatter") + for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) for chrom in chain(range(1, 23), ("X", "Y")) + for md5 in ("", ".md5") ] tpl = ( - "output/bwa.cnvkit.report.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.report.P00{i}-T{t}-DNA1-WGS1.{ext}" + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" ) expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, report=report, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("breaks.txt", "gainloss.txt", "gender.txt", "metrics.txt", "segmetrics.txt") + for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + for md5 in ("", ".md5") ] # copywriter tpl = ( diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py index 5e00fcf45..7bec82eaa 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py @@ -2,6 +2,7 @@ """Tests for the somatic_wgs_cnv_calling workflow module code""" +from itertools import chain import textwrap import pytest @@ -55,6 +56,11 @@ def minimal_config(): reference: /path/to/reference.fasta filter_bed: /path/to/filter.bed genome_folder: /path/to/genome/folder + cnvkit: + path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + data_sets: first_batch: @@ -230,53 +236,333 @@ def 
test_control_freec_step_part_get_resource(somatic_wgs_cnv_calling_workflow): assert actual == expected, msg_error -# Tests for CnvkitSomaticWgsStepPart --------------------------------------------------------------- +# Tests for CnvKitStepPart (coverage) ------------------------------------------------------------- -def test_cnvkit_somatic_wgs_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_input_files()""" - wildcards = Wildcards(fromdict={"mapper": "bwa", "cancer_library": "P001-T1-DNA1-WGS1"}) +def test_cnvkit_coverage_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + wildcards = Wildcards( + fromdict={"mapper": "bwa", "target": "target", "library_name": "P001-T1-DNA1-WGS1"} + ) expected = { - "normal_bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "normal_bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "tumor_bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", - "tumor_bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", + "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", } - actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "run")(wildcards) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_somatic_wgs_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_output_files()""" - base_name = "work/{mapper}.cnvkit.{cancer_library}/out/" +def test_cnvkit_coverage_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" expected = { - "segment": base_name + "{mapper}.cnvkit.{cancer_library}.cns", - "segment_md5": base_name + "{mapper}.cnvkit.{cancer_library}.cns.md5", - "bins": base_name + "{mapper}.cnvkit.{cancer_library}.cnr", - "bins_md5": base_name + "{mapper}.cnvkit.{cancer_library}.cnr.md5", - "scatter": base_name + "{mapper}.cnvkit.{cancer_library}.scatter.png", - "scatter_md5": base_name + "{mapper}.cnvkit.{cancer_library}.scatter.png.md5", + "target": base_name_out + ".targetcoverage.cnn", + "target_md5": base_name_out + ".targetcoverage.cnn.md5", + "antitarget": base_name_out + ".antitargetcoverage.cnn", + "antitarget_md5": base_name_out + ".antitargetcoverage.cnn.md5", } - actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "run") + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "coverage") + assert actual == expected -def test_cnvkit_somatic_wgs_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_log_file()""" - base_name = "work/{mapper}.cnvkit.{cancer_library}/log/{mapper}.cnvkit.{cancer_library}" - expected = get_expected_log_files_dict(base_out=base_name) - actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "run") +def test_cnvkit_coverage_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + base_file_name = ( + "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" + ) + expected = get_expected_log_files_dict(base_out=base_file_name) + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + +def 
test_cnvkit_coverage_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'coverage'""" + # Define expected + expected_dict = {"threads": 8, "time": "1-00:00:00", "memory": "16384M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "coverage", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (fix) ------------------------------------------------------------------ + + +def test_cnvkit_fix_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + # Define expected + coverage_base_out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1" + expected = { + "antitarget": coverage_base_out + ".antitargetcoverage.cnn", + "target": coverage_base_out + ".targetcoverage.cnn", + } + # Get actual + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards) + assert actual == expected + + +def test_cnvkit_fix_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" + expected = {"ratios": base_name_out, "ratios_md5": base_name_out + ".md5"} + assert somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "fix") == expected + + +def test_cnvkit_fix_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "fix") + assert actual == expected + + +def test_cnvkit_fix_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." 
+ actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "fix", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (segment) -------------------------------------------------------------- + + +def test_cnvkit_segment_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = {"cnr": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"} + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_resource(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_resource()""" +def test_cnvkit_segment_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + base_name_out = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" + ) + expected = {"segments": base_name_out, "segments_md5": base_name_out + ".md5"} + assert somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "segment") == expected + + +def test_cnvkit_segment_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): # Define expected - expected_dict = {"threads": 4, "time": "1-16:00:00", "memory": "40960M", "partition": "medium"} + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "segment") + assert actual == expected + + +def test_cnvkit_segment_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
-        actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "run", resource)
+        actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "segment", resource)
+        assert actual == expected, msg_error
+
+
+# Tests for CnvKitStepPart (call) -----------------------------------------------------------------
+
+
+def test_cnvkit_call_step_part_get_input_files(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns"
+    expected = {"segment": segment_file}
+    # Get actual
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
+    actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_call_step_part_get_output_files(somatic_wgs_cnv_calling_workflow):
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns"
+    expected = {"calls": base_name_out, "calls_md5": base_name_out + ".md5"}
+    assert somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "call") == expected
+
+
+def test_cnvkit_call_step_part_get_log_file(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.call.{library_name}"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "call")
+    assert actual == expected
+
+
+def test_cnvkit_call_step_part_get_resource(somatic_wgs_cnv_calling_workflow):
+    """Tests CnvKitStepPart.get_resource_usage() - action 'call'"""
+    # Define expected
+    expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "call", resource)
+        assert actual == expected, msg_error
+
+
+# Tests for CnvKitStepPart (plot) -----------------------------------------------------------------
+
+
+def test_cnvkit_plot_step_part_get_input_files(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"
+    cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"
+    expected = {
+        "cnr": cnr_file,
+        "cns": cns_file,
+    }
+    # Get actual
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
+    actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "plot")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_plot_step_part_get_output_files(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    expected = {}
+    tpl = (
+        "work/{{mapper}}.cnvkit.{{library_name}}/report/"
+        "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}"
+    )
+    for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")):
+        expected[plot] = tpl.format(plot=plot, ext=ext)
+        expected[plot + "_md5"] = expected[plot] + ".md5"
+    tpl = (
+        "work/{{mapper}}.cnvkit.{{library_name}}/report/"
+        "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}"
+    )
+    for plot, ext in (("heatmap", "pdf"), ("scatter", "png")):
+        for chrom in chain(range(1, 23), ("X", "Y")):
+            key = "{plot}_chr{chrom}".format(plot=plot, chrom=str(chrom))
+            expected[key] = tpl.format(plot=plot, ext=ext, chrom=str(chrom))
+            expected[key + "_md5"] = expected[key] + ".md5"
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "plot")
+    assert actual == expected
+
+
+def test_cnvkit_plot_step_part_get_log_file(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    expected = get_expected_log_files_dict(
+        base_out="work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.plot.{library_name}"
+    )
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "plot")
+    assert actual == expected
+
+
+def test_cnvkit_plot_step_part_get_resource(somatic_wgs_cnv_calling_workflow):
+    """Tests CnvKitStepPart.get_resource_usage() - action 'plot'"""
+    # Define expected
+    expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "30720M", "partition": "medium"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "plot", resource)
+        assert actual == expected, msg_error
+
+
+# Tests for CnvKitStepPart (export) ---------------------------------------------------------------
+
+
+def test_cnvkit_export_step_part_get_input_files(somatic_wgs_cnv_calling_workflow):
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
+    expected = {"cns": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"}
+    actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "export")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_export_step_part_get_output_files(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    expected = {}
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}"
+    for key, ext in (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")):
+        expected[key] = base_name_out + "." + ext
+        expected[key + "_md5"] = expected[key] + ".md5"
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "export")
+    assert actual == expected
+
+
+def test_cnvkit_export_step_part_get_log_file(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.export.{library_name}"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "export")
+    assert actual == expected
+
+
+def test_cnvkit_export_step_part_get_resource(somatic_wgs_cnv_calling_workflow):
+    """Tests CnvKitStepPart.get_resource_usage() - action 'export'"""
+    # Define expected
+    expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "export", resource)
+        assert actual == expected, msg_error
+
+
+# Tests for CnvKitStepPart (report) ---------------------------------------------------------------
+
+
+def test_cnvkit_report_step_part_get_input_files(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"
+    cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"
+    target_file = (
+        "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn"
+    )
+    antitarget_file = (
+        "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.antitargetcoverage.cnn"
+    )
+    expected = {
+        "cnr": cnr_file,
+        "cns": cns_file,
+        "target": target_file,
+        "antitarget": antitarget_file,
+    }
+    # Get actual
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
+    actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "report")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_report_step_part_get_output_files(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    expected = {}
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}"
+    for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics"):
+        expected[report] = base_name_out + "." + report + ".txt"
+        expected[report + "_md5"] = expected[report] + ".md5"
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "report")
+    assert actual == expected
+
+
+def test_cnvkit_report_step_part_get_log_file(somatic_wgs_cnv_calling_workflow):
+    # Define expected
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.report.{library_name}"
+    expected = get_expected_log_files_dict(base_out=base_name_out)
+    # Get actual
+    actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "report")
+    assert actual == expected
+
+
+def test_cnvkit_report_step_part_get_resource(somatic_wgs_cnv_calling_workflow):
+    """Tests CnvKitStepPart.get_resource_usage() - action 'report'"""
+    # Define expected
+    expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "report", resource)
         assert actual == expected, msg_error
@@ -471,12 +757,43 @@ def test_somatic_cnv_calling_workflow(somatic_wgs_cnv_calling_workflow):
         for cnv_caller in ("control_freec",)
     ]
     # -- add files from cnvkit
+    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}"
     expected += [
-        tpl.format(mapper=mapper, cnv_caller=cnv_caller, i=i, t=t, ext=ext)
+        tpl.format(i=i, t=t, ext=ext, md5=md5)
         for i, t in ((1, 1), (2, 1), (2, 2))
-        for ext in ("cnr", "cnr.md5", "cns", "cns.md5", "scatter.png", "scatter.png.md5")
-        for mapper in ("bwa",)
-        for cnv_caller in ("cnvkit",)
+        for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi")
+        for md5 in ("", ".md5")
+    ]
+    tpl = (
+        "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/"
+        "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}"
+    )
+    expected += [
+        tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png"))
+        for md5 in ("", ".md5")
+    ]
+    tpl = (
+        "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/"
+        "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}"
+    )
+    expected += [
+        tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for plot, ext in (("heatmap", "pdf"), ("scatter", "png"))
+        for chrom in chain(range(1, 23), ("X", "Y"))
+        for md5 in ("", ".md5")
+    ]
+    tpl = (
+        "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/"
+        "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}"
+    )
+    expected += [
+        tpl.format(i=i, t=t, report=report, md5=md5)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics")
+        for md5 in ("", ".md5")
     ]
     # Perform the comparison
     expected = list(sorted(expected))
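The expected-output checks above all follow the same pattern: expand a path template over every combination of sample, file extension, and optional ".md5" suffix, then sort the resulting list before comparing it with what the workflow reports. A minimal standalone sketch of that pattern, using only the standard library; the template string, the value tuples, and the faked `actual` list below are illustrative and not taken from the pipeline:

# Sketch of the expected-vs-actual comparison style used in the tests above (illustrative values only).
tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}"
expected = [
    tpl.format(i=i, t=t, ext=ext, md5=md5)
    for i, t in ((1, 1), (2, 1), (2, 2))
    for ext in ("cnr", "cns")
    for md5 in ("", ".md5")
]
# In the real test the actual list comes from the workflow fixture; it is faked here
# (same paths, different order) so the sketch runs on its own.
actual = list(reversed(expected))
assert sorted(expected) == sorted(actual)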