diff --git a/snappy_pipeline/workflows/common/gcnv/gcnv_common.py b/snappy_pipeline/workflows/common/gcnv/gcnv_common.py
index 2b713fceb..7c957bcff 100644
--- a/snappy_pipeline/workflows/common/gcnv/gcnv_common.py
+++ b/snappy_pipeline/workflows/common/gcnv/gcnv_common.py
@@ -194,6 +194,7 @@ def get_resource_usage(self, action):
         high_resource_action_list = (
             "call_cnvs",
             "post_germline_calls",
+            "joint_germline_cnv_segmentation",
         )
         if action in high_resource_action_list:
             return self.resource_usage_dict.get("high_resource")
diff --git a/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile b/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile
index a7962792d..e3abeffcf 100644
--- a/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile
+++ b/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile
@@ -27,11 +27,23 @@ wf = HelperBuildTargetSeqGcnvModelWorkflow(
 # Rules =======================================================================
 
 
+localrules:
+    # Writing out pedigrees can be done locally
+    build_gcnv_model_write_pedigree_run,
+
+
 rule all:
     input:
         wf.get_result_files(),
 
 
+rule build_gcnv_model_write_pedigree_run:
+    output:
+        wf.get_output_files("write_pedigree", "run"),
+    run:
+        wf.substep_dispatch("write_pedigree", "run", wildcards, output)
+
+
 rule build_gcnv_model_preprocess_intervals:
     input:
         unpack(wf.get_input_files("gcnv", "preprocess_intervals")),
diff --git a/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py b/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py
index e5c4930b5..ee9e0fa0c 100644
--- a/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py
+++ b/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py
@@ -89,7 +89,7 @@ from snakemake.io import glob_wildcards
 
 from snappy_pipeline.utils import DictQuery, dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep
+from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart
 from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
 
@@ -195,7 +195,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             (NgsMappingWorkflow,),
         )
         # Register sub step classes so the sub steps are available
-        self.register_sub_step_classes((BuildGcnvTargetSeqModelStepPart,))
+        self.register_sub_step_classes(
+            (
+                WritePedigreeStepPart,
+                BuildGcnvTargetSeqModelStepPart,
+            )
+        )
         # Register sub workflows
         self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
         # Build mapping from NGS DNA library to library kit
diff --git a/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile b/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile
index e54ef4657..f73edcf13 100644
--- a/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile
+++ b/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile
@@ -23,11 +23,23 @@ wf = HelperBuildWgsGcnvModelWorkflow(workflow, config, lookup_paths, config_path
 # Rules =======================================================================
 
 
+localrules:
+    # Writing out pedigrees can be done locally
+    build_gcnv_model_write_pedigree_run,
+
+
 rule all:
     input:
         wf.get_result_files(),
 
 
+rule build_gcnv_model_write_pedigree_run:
+    output:
+        wf.get_output_files("write_pedigree", "run"),
+    run:
+        wf.substep_dispatch("write_pedigree", "run", wildcards, output)
+
+
 rule build_gcnv_model_preprocess_intervals:
     input:
         unpack(wf.get_input_files("gcnv", "preprocess_intervals")),
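Note on the write_pedigree rules added above: the contig ploidy wrappers further down read the
resulting PED files line by line, splitting on tabs and taking the sample name from column 2 and
the sex code from column 5 (1 = male, 2 = female). Purely for illustration (the family and library
names are made up, and the exact output location is whatever WritePedigreeStepPart emits), a
tab-separated pedigree record consumed by that code could look like:

    FAM_index    index-N1-DNA1-WGS1    father-N1-DNA1-WGS1    mother-N1-DNA1-WGS1    1    2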
"preprocess_intervals")), diff --git a/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py b/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py index 357e1d4a4..3b6ce0d82 100644 --- a/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py +++ b/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py @@ -90,7 +90,7 @@ from snakemake.io import glob_wildcards from snappy_pipeline.utils import dictify, listify -from snappy_pipeline.workflows.abstract import BaseStep +from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow @@ -231,7 +231,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) (NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available - self.register_sub_step_classes((BuildGcnvWgsModelStepPart,)) + self.register_sub_step_classes( + ( + WritePedigreeStepPart, + BuildGcnvWgsModelStepPart, + ) + ) # Register sub workflows self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) diff --git a/snappy_pipeline/workflows/sv_calling_targeted/Snakefile b/snappy_pipeline/workflows/sv_calling_targeted/Snakefile index 75eb93c04..d4715a469 100644 --- a/snappy_pipeline/workflows/sv_calling_targeted/Snakefile +++ b/snappy_pipeline/workflows/sv_calling_targeted/Snakefile @@ -98,6 +98,7 @@ rule sv_calling_targeted_gcnv_contig_ploidy: wf.get_log_file("gcnv", "contig_ploidy"), params: args=wf.get_params("gcnv", "contig_ploidy"), + step_key="sv_calling_targeted", wrapper: wf.wrapper_path("gcnv/contig_ploidy_case_mode") diff --git a/snappy_pipeline/workflows/sv_calling_targeted/__init__.py b/snappy_pipeline/workflows/sv_calling_targeted/__init__.py index b5a24e0b1..b91b90368 100644 --- a/snappy_pipeline/workflows/sv_calling_targeted/__init__.py +++ b/snappy_pipeline/workflows/sv_calling_targeted/__init__.py @@ -51,6 +51,8 @@ path_target_interval_list_mapping: [] gcnv: + # Path to interval block list with PAR region for contig calling. + path_par_intervals: null # REQUIRED # Path to gCNV model - will execute analysis in CASE MODE. # # Example: @@ -59,17 +61,25 @@ # contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy` # model_pattern: /path/to/model_* # Output from `GermlineCNVCaller` precomputed_model_paths: [] + # Skip processing of the following libraries. If the library is in + # family/pedigree then all of the family/pedigree will be skipped. + skip_libraries: [] delly2: path_exclude_tsv: null # optional - max_threads: 16 map_qual: 1 geno_qual: 5 qual_tra: 20 mad_cutoff: 9 + # Skip processing of the following libraries. If the library is in + # family/pedigree then all of the family/pedigree will be skipped. + skip_libraries: [] manta: - max_threads: 16 + num_threads: 16 + # Skip processing of the following libraries. If the library is in + # family/pedigree then all of the family/pedigree will be skipped. + skip_libraries: [] melt: me_refs_infix: 1KGP_Hg19 @@ -79,6 +89,9 @@ - SVA jar_file: REQUIRED genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed + # Skip processing of the following libraries. If the library is in + # family/pedigree then all of the family/pedigree will be skipped. 
diff --git a/snappy_pipeline/workflows/sv_calling_wgs/Snakefile b/snappy_pipeline/workflows/sv_calling_wgs/Snakefile
index 5c3749658..44dc53a06 100644
--- a/snappy_pipeline/workflows/sv_calling_wgs/Snakefile
+++ b/snappy_pipeline/workflows/sv_calling_wgs/Snakefile
@@ -474,6 +474,7 @@ rule sv_calling_wgs_gcnv_contig_ploidy:
         wf.get_log_file("gcnv", "contig_ploidy"),
     params:
         args=wf.get_params("gcnv", "contig_ploidy"),
+        step_key="sv_calling_wgs",
     wrapper:
         wf.wrapper_path("gcnv/contig_ploidy_case_mode")
diff --git a/snappy_pipeline/workflows/sv_calling_wgs/__init__.py b/snappy_pipeline/workflows/sv_calling_wgs/__init__.py
index 50a0ca0d2..5e8569d1f 100644
--- a/snappy_pipeline/workflows/sv_calling_wgs/__init__.py
+++ b/snappy_pipeline/workflows/sv_calling_wgs/__init__.py
@@ -50,27 +50,38 @@
     # Short-read SV calling tool configuration
     delly2:
       path_exclude_tsv: null  # optional
-      max_threads: 16
       map_qual: 1
       geno_qual: 5
       qual_tra: 20
       mad_cutoff: 9
+      # Skip processing of the following libraries. If the library is in
+      # family/pedigree then all of the family/pedigree will be skipped.
+      skip_libraries: []
 
     manta:
-      max_threads: 16
+      num_threads: 16
+      # Skip processing of the following libraries. If the library is in
+      # family/pedigree then all of the family/pedigree will be skipped.
+      skip_libraries: []
 
     popdel:
       window_size: 10000000
       max_sv_size: 20000  # == padding
+      # Skip processing of the following libraries. If the library is in
+      # family/pedigree then all of the family/pedigree will be skipped.
+      skip_libraries: []
 
     gcnv:
+      # Path to interval block list with PAR region for contig calling.
+      path_par_intervals: null  # REQUIRED
       # Path to gCNV model - will execute analysis in CASE MODE.
       #
       # Example of precomputed model:
       # - library: "Agilent SureSelect Human All Exon V6"  # Library name
       #   contig_ploidy: /path/to/ploidy-model  # Output from `DetermineGermlineContigPloidy`
       #   model_pattern: /path/to/model_*       # Output from `GermlineCNVCaller`
-      precomputed_model_paths: []  # REQUIRED
-      # Path to BED file with uniquely mappable regions.
       path_uniquely_mapable_bed: null  # REQUIRED
+      # Skip processing of the following libraries. If the library is in
+      # family/pedigree then all of the family/pedigree will be skipped.
+      skip_libraries: []
 
     melt:
       me_refs_infix: 1KGP_Hg19
@@ -79,10 +90,16 @@
       - SVA
       jar_file: REQUIRED
       genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed  # adjust, e.g., Hg38/Hg38.genes.bed
+      # Skip processing of the following libraries. If the library is in
+      # family/pedigree then all of the family/pedigree will be skipped.
+      skip_libraries: []
 
     # Long-read SV calling tool configuration
     sniffles2:
       tandem_repeats: /fast/groups/cubi/work/projects/biotools/sniffles2/trf/GRCh37/human_hs37d5.trf.bed  # REQUIRED
+      # Skip processing of the following libraries. If the library is in
+      # family/pedigree then all of the family/pedigree will be skipped.
+      skip_libraries: []
 
     # Common configuration
     ignore_chroms:
diff --git a/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py b/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py
index 137f6f4ea..75b877a02 100644
--- a/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py
+++ b/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py
@@ -1,9 +1,39 @@
-# -*- coding: utf-8 -*-
+import pathlib
 
 from snakemake.shell import shell
 
+# We will first call the GATK tool. However, there are issues with the reliability of the contig
+# ploidy calls. This causes an exception in the JointGermlineCNVSegmentation further downstream.
+# We thus rewrite the ploidy calls based on the sex given in the PED files as a workaround.
+#
+# cf. https://github.com/broadinstitute/gatk/issues/8164
+
+out_path = pathlib.Path(snakemake.output.done).parent
+
+MALE = "male"
+FEMALE = "female"
+
+sex_map = {}
+for ped_path in snakemake.input.ped:
+    with open(ped_path, "rt") as inputf:
+        for line in inputf:
+            arr = line.strip().split("\t")
+            sex_map[arr[1]] = {"1": MALE, "2": FEMALE}[arr[4]]
+
+ploidy_x = {MALE: 1, FEMALE: 2}
+ploidy_y = {MALE: 1, FEMALE: 0}
+
 paths_tsv = " ".join(snakemake.input.tsv)
 
+# Add interval block list for PAR regions if configured.
+par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"]["gcnv"].get(
+    "path_par_intervals"
+)
+if par_intervals:
+    par_args = f"-XL {par_intervals}"
+else:
+    par_args = ""
+
 shell(
     r"""
 export TMPDIR=$(mktemp -d)
@@ -26,6 +56,7 @@
 gatk DetermineGermlineContigPloidy \
     -L {snakemake.input.interval_list} \
+    {par_args} \
     --interval-merging-rule OVERLAPPING_ONLY \
     $(for tsv in {paths_tsv}; do echo -I $tsv; done) \
     --contig-ploidy-priors $PRIORS \
@@ -33,3 +64,31 @@
     --output-prefix ploidy
 """
 )
+
+ploidy_calls = out_path / "ploidy-calls"
+for sample_dir in ploidy_calls.glob("SAMPLE_*"):
+    path_name = sample_dir / "sample_name.txt"
+    with path_name.open("rt") as inputf:
+        sample_name = inputf.read().strip()
+    sample_sex = sex_map[sample_name]
+    path_call = sample_dir / "contig_ploidy.tsv"
+    with path_call.open("rt") as inputf:
+        call_lines = inputf.readlines()
+    for i in range(len(call_lines)):
+        line = call_lines[i]
+        arr = line.split("\t")
+        chrom = arr[0].lower()
+        if "x" in chrom or "y" in chrom:
+            ploidy = int(arr[1])
+            if "x" in arr[0].lower():
+                if ploidy != ploidy_x[sample_sex]:
+                    arr[1] = str(ploidy_x[sample_sex])
+                    arr[2] = f"42.000{ploidy}"  # magic marker
+            elif "y" in arr[0].lower():
+                if ploidy != ploidy_y[sample_sex]:
+                    arr[1] = str(ploidy_y[sample_sex])
+                    arr[2] = f"42.000{ploidy}"  # magic marker
+            call_lines[i] = "\t".join(arr)
+    with path_call.open("wt") as outputf:
+        for line in call_lines:
+            print(line.strip(), file=outputf)
diff --git a/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py b/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py
index 4bb37c00c..a9c77d51e 100644
--- a/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py
+++ b/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py
@@ -1,9 +1,37 @@
-# -*- coding: utf-8 -*-
+import pathlib
 
 from snakemake.shell import shell
 
+# We will first call the GATK tool. However, there are issues with the reliability of the contig
+# ploidy calls. This causes an exception in the JointGermlineCNVSegmentation further downstream.
+# We thus rewrite the ploidy calls based on the sex given in the PED files as a workaround.
+#
+# cf. https://github.com/broadinstitute/gatk/issues/8164
+
+out_path = pathlib.Path(snakemake.output.done).parent
+
+MALE = "male"
+FEMALE = "female"
+
+sex_map = {}
+for ped_path in snakemake.input.ped:
+    with open(ped_path, "rt") as inputf:
+        for line in inputf:
+            arr = line.strip().split("\t")
+            sex_map[arr[1]] = {"1": MALE, "2": FEMALE}[arr[4]]
+
+ploidy_x = {MALE: 1, FEMALE: 2}
+ploidy_y = {MALE: 1, FEMALE: 0}
+
 paths_tsv = " ".join(snakemake.input.tsv)
 
+## Add interval block list for PAR regions if configured.
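+# NOTE: unlike the cohort-mode wrapper above, the PAR-exclusion lookup below is left commented
+# out here; in case mode the DetermineGermlineContigPloidy call is made without an -XL list.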
+# par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"].get("path_par_intervals")
+# if par_intervals:
+#     par_args = f"-XL {par_intervals}"
+# else:
+#     par_args = ""
+
 shell(
     r"""
 export THEANO_FLAGS="base_compiledir=$TMPDIR/theano_compile_dir"
@@ -17,3 +45,31 @@
     --output-prefix ploidy
 """
 )
+
+ploidy_calls = out_path / "ploidy-calls"
+for sample_dir in ploidy_calls.glob("SAMPLE_*"):
+    path_name = sample_dir / "sample_name.txt"
+    with path_name.open("rt") as inputf:
+        sample_name = inputf.read().strip()
+    sample_sex = sex_map[sample_name]
+    path_call = sample_dir / "contig_ploidy.tsv"
+    with path_call.open("rt") as inputf:
+        call_lines = inputf.readlines()
+    for i in range(len(call_lines)):
+        line = call_lines[i]
+        arr = line.split("\t")
+        chrom = arr[0].lower()
+        if "x" in chrom or "y" in chrom:
+            ploidy = int(arr[1])
+            if "x" in arr[0].lower():
+                if ploidy != ploidy_x[sample_sex]:
+                    arr[1] = str(ploidy_x[sample_sex])
+                    arr[2] = f"42.000{ploidy}"  # magic marker
+            elif "y" in arr[0].lower():
+                if ploidy != ploidy_y[sample_sex]:
+                    arr[1] = str(ploidy_y[sample_sex])
+                    arr[2] = f"42.000{ploidy}"  # magic marker
+            call_lines[i] = "\t".join(arr)
+    with path_call.open("wt") as outputf:
+        for line in call_lines:
+            print(line.strip(), file=outputf)
diff --git a/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py b/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py
index f97af28c2..aa03ab96f 100644
--- a/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py
+++ b/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py
@@ -13,7 +13,7 @@
         target_interval_bed = item["path"]
         break
 else:  # of for, did not break out
-    raise Exception("Found no target intervals for %s" % item["name"])
+    raise Exception("Found no target intervals for %s" % snakemake.wildcards.library_kit)
 
 shell(
     r"""
diff --git a/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py b/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py
index c644b73f1..c90149d0d 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py
@@ -366,6 +366,7 @@ def test_gcnv_step_part_get_resource_usage(sv_calling_targeted_workflow):
     high_resource_actions = (
         "call_cnvs",
         "post_germline_calls",
+        "joint_germline_cnv_segmentation",
     )
     all_actions = sv_calling_targeted_workflow.substep_getattr("gcnv", "actions")
     default_actions = [action for action in all_actions if action not in high_resource_actions]
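For clarity on the post-processing block that both contig ploidy wrappers now share: assuming a
male sample (sex code 1 in the PED file) for which GATK called two copies of the X chromosome,
the corresponding record in contig_ploidy.tsv would be rewritten roughly as follows (the numbers
are invented for illustration; the first column is the contig, the second the ploidy call, the
third the quality-like value that gets overwritten):

    before:   X    2    20.123456
    after:    X    1    42.0002

The ploidy column is forced to the sex-derived expectation (X=1/Y=1 for males, X=2/Y=0 for
females) and the third column is replaced by the `42.000{ploidy}` marker, which encodes the
original GATK ploidy (here 2) so that overridden records stay recognizable downstream.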