From 2b45596efadfb9dcc64f2f2812deb96106d432bf Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 18 Jan 2023 13:36:09 +0100 Subject: [PATCH] fix: re-enabling and fixing Popdel --- .../workflows/common/sv_calling.py | 10 +- .../workflows/sv_calling_wgs/Snakefile | 203 +++++++++--------- .../workflows/sv_calling_wgs/__init__.py | 63 ++++-- .../wrappers/popdel/call/wrapper.py | 11 +- 4 files changed, 162 insertions(+), 125 deletions(-) diff --git a/snappy_pipeline/workflows/common/sv_calling.py b/snappy_pipeline/workflows/common/sv_calling.py index 3fd6183c9..88199f685 100644 --- a/snappy_pipeline/workflows/common/sv_calling.py +++ b/snappy_pipeline/workflows/common/sv_calling.py @@ -50,10 +50,14 @@ def get_log_file(self, action: typing.Optional[str] = None): """Return dict of log files in the "log" directory""" _ = action if action and action != self.actions[-1]: - infix = f"{self.name}_{action}" + token = f"{self.name}_{action}" else: - infix = self.name - prefix = f"work/{{mapper}}.{infix}.{{library_name}}/log/{{mapper}}.{infix}.{{library_name}}.sv_calling" + token = self.name + if hasattr(self, f"_get_log_file_infix_{action}"): + infix = getattr(self, f"_get_log_file_infix_{action}")() + else: + infix = f"{{mapper}}.{token}.{{library_name}}" + prefix = f"work/{infix}/log/{infix}.sv_calling" key_ext = ( ("log", ".log"), ("conda_info", ".conda_info.txt"), diff --git a/snappy_pipeline/workflows/sv_calling_wgs/Snakefile b/snappy_pipeline/workflows/sv_calling_wgs/Snakefile index af67ca8a4..572df7540 100644 --- a/snappy_pipeline/workflows/sv_calling_wgs/Snakefile +++ b/snappy_pipeline/workflows/sv_calling_wgs/Snakefile @@ -9,12 +9,13 @@ from snappy_pipeline.workflows.sv_calling_wgs import SvCallingWgsWorkflow __author__ = "Manuel Holtgrewe " -# Configuration =============================================================== +# Regular expression for wildcard constraints +RE_BETWEEN_DOTS = r"[^.\.]+" +# Configuration =============================================================== configfile: "config.yaml" - # Expand "$ref" JSON pointers in configuration (also works for YAML) config, lookup_paths, config_paths = expand_ref("config.yaml", config) @@ -200,91 +201,93 @@ rule sv_calling_wgs_delly2_merge_genotypes: # wf.wrapper_path("sniffles2/germline/snf_to_vcf") -## PopDel Steps ---------------------------------------------------------------- -# -# -# ruleorder: sv_calling_wgs_popdel_reorder_vcf > sv_calling_wgs_popdel_concat_calls > sv_calling_wgs_popdel_call > sv_calling_wgs_popdel_profile -# -# -# rule sv_calling_wgs_popdel_profile: -# input: -# unpack(wf.get_input_files("popdel", "profile")), -# output: -# **wf.get_output_files("popdel", "profile"), -# threads: wf.get_resource("popdel", "profile", "threads") -# resources: -# time=wf.get_resource("popdel", "profile", "time"), -# memory=wf.get_resource("popdel", "profile", "memory"), -# partition=wf.get_resource("popdel", "profile", "partition"), -# tmpdir=wf.get_resource("popdel", "profile", "tmpdir"), -# wildcard_constraints: -# index_ngs_library=r"[^\.]+", -# log: -# **wf.get_log_file("popdel", "profile"), -# wrapper: -# wf.wrapper_path("popdel/profile") -# -# -# rule sv_calling_wgs_popdel_call: -# input: -# unpack(wf.get_input_files("popdel", "call")), -# output: -# **wf.get_output_files("popdel", "call"), -# threads: wf.get_resource("popdel", "call", "threads") -# resources: -# time=wf.get_resource("popdel", "call", "time"), -# memory=wf.get_resource("popdel", "call", "memory"), -# partition=wf.get_resource("popdel", "call", "partition"), -# tmpdir=wf.get_resource("popdel", "call", "tmpdir"), -# log: -# **wf.get_log_file("popdel", "call"), -# wrapper: -# wf.wrapper_path("popdel/call") -# -# -# rule sv_calling_wgs_popdel_concat_calls: -# input: -# unpack(wf.get_input_files("popdel", "concat_calls")), -# output: -# **wf.get_output_files("popdel", "concat_calls"), -# threads: wf.get_resource("popdel", "concat_calls", "threads") -# resources: -# time=wf.get_resource("popdel", "concat_calls", "time"), -# memory=wf.get_resource("popdel", "concat_calls", "memory"), -# partition=wf.get_resource("popdel", "concat_calls", "partition"), -# tmpdir=wf.get_resource("popdel", "concat_calls", "tmpdir"), -# log: -# **wf.get_log_file("popdel", "concat_calls"), -# wrapper: -# wf.wrapper_path("popdel/concat_calls") -# -# -# rule sv_calling_wgs_popdel_reorder_vcf: -# input: -# unpack(wf.get_input_files("popdel", "reorder_vcf")), -# output: -# **wf.get_output_files("popdel", "reorder_vcf"), -# threads: wf.get_resource("popdel", "reorder_vcf", "threads") -# resources: -# time=wf.get_resource("popdel", "reorder_vcf", "time"), -# memory=wf.get_resource("popdel", "reorder_vcf", "memory"), -# partition=wf.get_resource("popdel", "reorder_vcf", "partition"), -# tmpdir=wf.get_resource("popdel", "reorder_vcf", "tmpdir"), -# wildcard_constraints: -# index_ngs_library=r"[^\.]+", -# params: -# ped_members=wf.substep_getattr("popdel", "get_ped_members"), -# log: -# **wf.get_log_file("popdel", "reorder_vcf"), -# wrapper: -# wf.wrapper_path("popdel/reorder_vcf") +# PopDel Steps ---------------------------------------------------------------- -# Run Melt -------------------------------------------------------------------- +rule sv_calling_wgs_popdel_profile: + input: + unpack(wf.get_input_files("popdel", "profile")), + output: + **wf.get_output_files("popdel", "profile"), + wildcard_constraints: + mapper=RE_BETWEEN_DOTS, + index_ngs_library=RE_BETWEEN_DOTS, + threads: wf.get_resource("popdel", "profile", "threads") + resources: + time=wf.get_resource("popdel", "profile", "time"), + memory=wf.get_resource("popdel", "profile", "memory"), + partition=wf.get_resource("popdel", "profile", "partition"), + tmpdir=wf.get_resource("popdel", "profile", "tmpdir"), + log: + **wf.get_log_file("popdel", "profile"), + wrapper: + wf.wrapper_path("popdel/profile") -# Regular expression for wildcard constraints -RE_NO_DOT = r"[^.]+" +rule sv_calling_wgs_popdel_call: + input: + unpack(wf.get_input_files("popdel", "call")), + output: + **wf.get_output_files("popdel", "call"), + wildcard_constraints: + mapper=RE_BETWEEN_DOTS, + chrom=RE_BETWEEN_DOTS, + begin=RE_BETWEEN_DOTS, + end=RE_BETWEEN_DOTS, + threads: wf.get_resource("popdel", "call", "threads") + resources: + time=wf.get_resource("popdel", "call", "time"), + memory=wf.get_resource("popdel", "call", "memory"), + partition=wf.get_resource("popdel", "call", "partition"), + tmpdir=wf.get_resource("popdel", "call", "tmpdir"), + log: + **wf.get_log_file("popdel", "call"), + wrapper: + wf.wrapper_path("popdel/call") + + +rule sv_calling_wgs_popdel_concat_calls: + input: + unpack(wf.get_input_files("popdel", "concat_calls")), + output: + **wf.get_output_files("popdel", "concat_calls"), + wildcard_constraints: + mapper=RE_BETWEEN_DOTS, + threads: wf.get_resource("popdel", "concat_calls", "threads") + resources: + time=wf.get_resource("popdel", "concat_calls", "time"), + memory=wf.get_resource("popdel", "concat_calls", "memory"), + partition=wf.get_resource("popdel", "concat_calls", "partition"), + tmpdir=wf.get_resource("popdel", "concat_calls", "tmpdir"), + log: + **wf.get_log_file("popdel", "concat_calls"), + wrapper: + wf.wrapper_path("popdel/concat_calls") + + +rule sv_calling_wgs_popdel_reorder_vcf: + input: + unpack(wf.get_input_files("popdel", "reorder_vcf")), + output: + **wf.get_output_files("popdel", "reorder_vcf"), + wildcard_constraints: + mapper=RE_BETWEEN_DOTS, + index_ngs_library=RE_BETWEEN_DOTS, + threads: wf.get_resource("popdel", "reorder_vcf", "threads") + resources: + time=wf.get_resource("popdel", "reorder_vcf", "time"), + memory=wf.get_resource("popdel", "reorder_vcf", "memory"), + partition=wf.get_resource("popdel", "reorder_vcf", "partition"), + tmpdir=wf.get_resource("popdel", "reorder_vcf", "tmpdir"), + params: + ped_members=wf.substep_getattr("popdel", "get_ped_members"), + log: + **wf.get_log_file("popdel", "reorder_vcf"), + wrapper: + wf.wrapper_path("popdel/reorder_vcf") + + +# Run Melt -------------------------------------------------------------------- rule sv_calling_wgs_melt_preprocess: @@ -293,9 +296,9 @@ rule sv_calling_wgs_melt_preprocess: output: **wf.get_output_files("melt", "preprocess"), wildcard_constraints: - mapper=RE_NO_DOT, - library_name=RE_NO_DOT, - me_type=RE_NO_DOT, + mapper=RE_BETWEEN_DOTS, + library_name=RE_BETWEEN_DOTS, + me_type=RE_BETWEEN_DOTS, threads: wf.get_resource("melt", "preprocess", "threads") resources: time=wf.get_resource("melt", "preprocess", "time"), @@ -315,9 +318,9 @@ rule sv_calling_wgs_melt_indiv_analysis: output: **wf.get_output_files("melt", "indiv_analysis"), wildcard_constraints: - mapper=RE_NO_DOT, - library_name=RE_NO_DOT, - me_type=RE_NO_DOT, + mapper=RE_BETWEEN_DOTS, + library_name=RE_BETWEEN_DOTS, + me_type=RE_BETWEEN_DOTS, threads: wf.get_resource("melt", "indiv_analysis", "threads") resources: time=wf.get_resource("melt", "indiv_analysis", "time"), @@ -337,9 +340,9 @@ rule sv_calling_wgs_melt_group_analysis: output: **wf.get_output_files("melt", "group_analysis"), wildcard_constraints: - mapper=RE_NO_DOT, - index_library_name=RE_NO_DOT, - me_type=RE_NO_DOT, + mapper=RE_BETWEEN_DOTS, + index_library_name=RE_BETWEEN_DOTS, + me_type=RE_BETWEEN_DOTS, threads: wf.get_resource("melt", "group_analysis", "threads") resources: time=wf.get_resource("melt", "group_analysis", "time"), @@ -359,9 +362,9 @@ rule sv_calling_wgs_melt_genotype: output: **wf.get_output_files("melt", "genotype"), wildcard_constraints: - mapper=RE_NO_DOT, - index_library_name=RE_NO_DOT, - me_type=RE_NO_DOT, + mapper=RE_BETWEEN_DOTS, + index_library_name=RE_BETWEEN_DOTS, + me_type=RE_BETWEEN_DOTS, threads: wf.get_resource("melt", "genotype", "threads") resources: time=wf.get_resource("melt", "genotype", "time"), @@ -381,8 +384,8 @@ rule sv_calling_wgs_melt_make_vcf: output: **wf.get_output_files("melt", "make_vcf"), wildcard_constraints: - mapper=RE_NO_DOT, - index_library_name=RE_NO_DOT, + mapper=RE_BETWEEN_DOTS, + index_library_name=RE_BETWEEN_DOTS, threads: wf.get_resource("melt", "make_vcf", "threads") resources: time=wf.get_resource("melt", "make_vcf", "time"), @@ -402,8 +405,8 @@ rule sv_calling_wgs_melt_merge_vcf: output: **wf.get_output_files("melt", "merge_vcf"), wildcard_constraints: - mapper=RE_NO_DOT, - library_name=RE_NO_DOT, + mapper=RE_BETWEEN_DOTS, + library_name=RE_BETWEEN_DOTS, threads: wf.get_resource("melt", "merge_vcf", "threads") resources: time=wf.get_resource("melt", "merge_vcf", "time"), @@ -434,7 +437,7 @@ rule sv_calling_wgs_gcnv_preprocess_intervals: log: wf.get_log_file("gcnv", "preprocess_intervals"), wrapper: - wf.wrapper_path("gcnv/preprocess_intervals") + wf.wrapper_path("gcnv/preprocess_intervals_wgs") rule sv_calling_wgs_gcnv_coverage: diff --git a/snappy_pipeline/workflows/sv_calling_wgs/__init__.py b/snappy_pipeline/workflows/sv_calling_wgs/__init__.py index c08bc724f..128cdb55e 100644 --- a/snappy_pipeline/workflows/sv_calling_wgs/__init__.py +++ b/snappy_pipeline/workflows/sv_calling_wgs/__init__.py @@ -1,7 +1,8 @@ """Implementation of the ``sv_calling_wgs`` step """ -from collections import OrderedDict +from itertools import chain +import re from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background @@ -110,11 +111,16 @@ def _build_ngs_library_to_kit(self): yield donor.dna_ngs_library.name, "wgs" +def escape_dots_dashes(s: str) -> str: + """Escape dots and dashes with double-underscore constructs.""" + return s.replace("_", "__under__").replace(".", "__dot__").replace("-", "__hyphen__") + + class PopDelStepPart( - ForwardSnakemakeFilesMixin, - ForwardResourceUsageMixin, SvCallingGetResultFilesMixin, SvCallingGetLogFileMixin, + ForwardSnakemakeFilesMixin, + ForwardResourceUsageMixin, BaseStepPart, ): """WGS SV identification using PopDel. @@ -141,11 +147,11 @@ class PopDelStepPart( def __init__(self, parent): super().__init__(parent) # Build shortcut from index library name to pedigree - self.index_ngs_library_to_pedigree = OrderedDict() + self.index_ngs_library_to_pedigree = {} for sheet in self.parent.shortcut_sheets: self.index_ngs_library_to_pedigree.update(sheet.index_ngs_library_to_pedigree) # Build shortcut from library name to library info - self.library_name_to_library = OrderedDict() + self.library_name_to_library = {} for sheet in self.parent.shortcut_sheets: self.library_name_to_library.update(sheet.library_name_to_library) @@ -157,12 +163,12 @@ def get_library_extra_infos(self, wildcards): def _get_input_files_profile(self, wildcards): """Return input files for "call" action""" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - infix = f"{wildcards.mapper}.{wildcards.index_ngs_library}" + infix = f"{wildcards.mapper}.{wildcards.library_name}" yield "bam", ngs_mapping(f"output/{infix}/out/{infix}.bam") @dictify def _get_output_files_profile(self): - infix = "{mapper}.popdel_profile.{index_library_name}" + infix = "{mapper}.popdel_profile.{library_name}" yield "profile", f"work/{infix}/out/{infix}.profile" yield "profile_md5", f"work/{infix}/out/{infix}.profile.md5" @@ -183,17 +189,20 @@ def _donors_with_dna_ngs_library(self): @dictify def _get_output_files_call(self): - infix = "{mapper}.popdel_call.{chrom}-{begin}-{end}" + infix = self._get_log_file_infix_call() yield "vcf", f"work/{infix}/out/{infix}.vcf.gz" yield "vcf_md5", f"work/{infix}/out/{infix}.vcf.gz.md5" yield "vcf_tbi", f"work/{infix}/out/{infix}.vcf.gz.tbi" yield "vcf_tbi_md5", f"work/{infix}/out/{infix}.vcf.gz.tbi.md5" + def _get_log_file_infix_call(self): + return "{mapper}.popdel_call.{chrom}-{begin}-{end}" + @dictify def _get_input_files_concat_calls(self, wildcards): window_size = self.config["popdel"]["window_size"] padding = self.config["popdel"]["max_sv_size"] - vcfs = {} + vcfs = [] with open(self._get_fai_path(), "rt") as fai_file: for r in yield_regions( fai_file, @@ -203,7 +212,8 @@ def _get_input_files_concat_calls(self, wildcards): ): if r.begin == 0: r.begin = 1 - infix = f"{wildcards.mapper}.popdel.call.{r.chrom}-{r.begin}-{r.end}" + chrom = escape_dots_dashes(r.chrom) + infix = f"{wildcards.mapper}.popdel_call.{chrom}-{r.begin}-{r.end}" vcfs.append(f"work/{infix}/out/{infix}.vcf.gz") yield "vcf", vcfs @@ -215,28 +225,39 @@ def _get_ignore_chroms(self): @dictify def _get_output_files_concat_calls(self): - infix = "{mapper}.popdel_concat_calls" + infix = self._get_log_file_infix_concat_calls() yield "vcf", f"work/{infix}/out/{infix}.vcf.gz" yield "vcf_md5", f"work/{infix}/out/{infix}.vcf.gz.md5" yield "vcf_tbi", f"work/{infix}/out/{infix}.vcf.gz.tbi" yield "vcf_tbi_md5", f"work/{infix}/out/{infix}.vcf.gz.tbi.md5" + def _get_log_file_infix_concat_calls(self): + return "{mapper}.popdel_concat_calls" + @dictify def _get_input_files_reorder_vcf(self, wildcards): - infix = f"{wildcards.mapper}.popdel.internal.concat_calls" + infix = f"{wildcards.mapper}.popdel_concat_calls" yield "vcf", f"work/{infix}/out/{infix}.vcf.gz" @dictify def _get_output_files_reorder_vcf(self): - infix = "{mapper}.popdel.{index_ngs_library}" - yield "vcf", f"work/{infix}/out/{infix}.vcf.gz" - yield "vcf_md5", f"work/{infix}/out/{infix}.vcf.gz.md5" - yield "vcf_tbi", f"work/{infix}/out/{infix}.vcf.gz.tbi" - yield "vcf_tbi_md5", f"work/{infix}/out/{infix}.vcf.gz.tbi.md5" + infix = "{mapper}.popdel.{library_name}" + work_files = {} + work_files["vcf"] = f"work/{infix}/out/{infix}.vcf.gz" + work_files["vcf_md5"] = f"work/{infix}/out/{infix}.vcf.gz.md5" + work_files["vcf_tbi"] = f"work/{infix}/out/{infix}.vcf.gz.tbi" + work_files["vcf_tbi_md5"] = f"work/{infix}/out/{infix}.vcf.gz.tbi.md5" + yield from work_files.items() + yield "output_links", [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain( + work_files.values(), self.get_log_file("reorder_vcf").values() + ) + ] def get_ped_members(self, wildcards): """Used in Snakefile to rule ``sv_calling_wgs_popdel_reorder_vcf``""" - pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library] + pedigree = self.index_ngs_library_to_pedigree[wildcards.library_name] return " ".join( donor.dna_ngs_library.name for donor in pedigree.donors if donor.dna_ngs_library ) @@ -261,7 +282,7 @@ def __init__(self, parent): "{mapper}.sniffles2.{index_ngs_library}{ext}" ) # Build shortcut from index library name to pedigree - self.index_ngs_library_to_pedigree = OrderedDict() + self.index_ngs_library_to_pedigree = {} for sheet in self.parent.shortcut_sheets: self.index_ngs_library_to_pedigree.update(sheet.index_ngs_library_to_pedigree) @@ -317,10 +338,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ( Delly2StepPart, MantaStepPart, - # PopDelStepPart, + PopDelStepPart, GcnvWgsStepPart, MeltStepPart, - # Sniffles2StepPart, + # Sniffles2StepPart, WritePedigreeStepPart, ) ) diff --git a/snappy_wrappers/wrappers/popdel/call/wrapper.py b/snappy_wrappers/wrappers/popdel/call/wrapper.py index 47f3e9cf6..210da10ba 100644 --- a/snappy_wrappers/wrappers/popdel/call/wrapper.py +++ b/snappy_wrappers/wrappers/popdel/call/wrapper.py @@ -8,6 +8,15 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" + +def unescape_dots_dashes(s: str) -> str: + """Unescape dots and dashes from double-underscore constructs.""" + return s.replace("__hyphen__", "-").replace("__dot__", ".").replace("__under__", "_") + + +chrom = unescape_dots_dashes(snakemake.wildcards.chrom) + + with tempfile.NamedTemporaryFile("wt") as tmpf: # Write paths to input files into temporary file. # @@ -31,7 +40,7 @@ done popdel call \ - -r {snakemake.wildcards.chrom}:{snakemake.wildcards.begin}-{snakemake.wildcards.end} \ + -r {chrom}:{snakemake.wildcards.begin}-{snakemake.wildcards.end} \ -o $TMPDIR/tmp.vcf \ $TMPDIR/profiles.txt