fix: various fixes to make gCNV work properly for WGS and targeted (#370)
holtgrewe authored Feb 9, 2023
1 parent 33785b2 commit 14d86e3
Showing 13 changed files with 196 additions and 13 deletions.
1 change: 1 addition & 0 deletions snappy_pipeline/workflows/common/gcnv/gcnv_common.py
@@ -194,6 +194,7 @@ def get_resource_usage(self, action):
        high_resource_action_list = (
            "call_cnvs",
            "post_germline_calls",
            "joint_germline_cnv_segmentation",
        )
        if action in high_resource_action_list:
            return self.resource_usage_dict.get("high_resource")
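For orientation, the hunk above only extends the tuple of high-resource actions; the surrounding dispatch in get_resource_usage then picks between a default and a high-resource profile. A minimal, self-contained sketch of that pattern follows; the ResourceUsage fields and the concrete numbers are illustrative assumptions, not the values shipped in snappy_pipeline.

from typing import NamedTuple


class ResourceUsage(NamedTuple):
    threads: int
    time: str
    memory: str


# Illustrative profiles only; the real values live in the workflow's resource_usage_dict.
resource_usage_dict = {
    "default": ResourceUsage(threads=1, time="04:00:00", memory="8G"),
    "high_resource": ResourceUsage(threads=16, time="2-00:00:00", memory="46G"),
}


def get_resource_usage(action: str) -> ResourceUsage:
    high_resource_action_list = (
        "call_cnvs",
        "post_germline_calls",
        "joint_germline_cnv_segmentation",
    )
    if action in high_resource_action_list:
        return resource_usage_dict["high_resource"]
    return resource_usage_dict["default"]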
12 changes: 12 additions & 0 deletions snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile
@@ -27,11 +27,23 @@ wf = HelperBuildTargetSeqGcnvModelWorkflow(
# Rules =======================================================================


localrules:
    # Writing out pedigrees can be done locally
    sv_calling_targeted_write_pedigree_run,


rule all:
    input:
        wf.get_result_files(),


rule build_gcnv_model_write_pedigree_run:
    output:
        wf.get_output_files("write_pedigree", "run"),
    run:
        wf.substep_dispatch("write_pedigree", "run", wildcards, output)


rule build_gcnv_model_preprocess_intervals:
    input:
        unpack(wf.get_input_files("gcnv", "preprocess_intervals")),
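The pedigree files written by the new rule above are what the gCNV contig ploidy wrappers further down in this commit read to determine each sample's sex. As a reference for that parsing, a toy sketch of the standard six-column PED layout (family, individual, father, mother, sex with 1 = male / 2 = female, phenotype); the sample names below are made up.

# Write a toy PED file with the standard six tab-separated columns (made-up samples).
ped_rows = [
    # family,  individual,             father, mother, sex, phenotype
    ("FAM_001", "NA12878-N1-DNA1-WGS1", "0", "0", "2", "2"),
    ("FAM_001", "NA12891-N1-DNA1-WGS1", "0", "0", "1", "1"),
]
with open("example.ped", "wt") as outputf:
    for row in ped_rows:
        print("\t".join(row), file=outputf)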
9 changes: 7 additions & 2 deletions snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py
@@ -89,7 +89,7 @@
from snakemake.io import glob_wildcards

from snappy_pipeline.utils import DictQuery, dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep
+from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart
from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart
from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow

@@ -195,7 +195,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
            (NgsMappingWorkflow,),
        )
        # Register sub step classes so the sub steps are available
-        self.register_sub_step_classes((BuildGcnvTargetSeqModelStepPart,))
+        self.register_sub_step_classes(
+            (
+                WritePedigreeStepPart,
+                BuildGcnvTargetSeqModelStepPart,
+            )
+        )
        # Register sub workflows
        self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
        # Build mapping from NGS DNA library to library kit
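Registering WritePedigreeStepPart is what lets the Snakefile above resolve calls such as wf.get_output_files("write_pedigree", "run"). As a toy illustration of that name-keyed registration pattern (this is not the actual snappy_pipeline API, only the idea):

# Toy registry: sub steps are stored under their names so rules can address them
# as ("write_pedigree", "run"), ("gcnv", "preprocess_intervals"), and so on.
class WritePedigree:
    name = "write_pedigree"


class BuildGcnvTargetSeqModel:
    name = "gcnv"


registry = {cls.name: cls() for cls in (WritePedigree, BuildGcnvTargetSeqModel)}
assert "write_pedigree" in registry  # resolvable only because it was registered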
12 changes: 12 additions & 0 deletions snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile
@@ -23,11 +23,23 @@ wf = HelperBuildWgsGcnvModelWorkflow(workflow, config, lookup_paths, config_path
# Rules =======================================================================


localrules:
    # Writing out pedigrees can be done locally
    sv_calling_targeted_write_pedigree_run,


rule all:
    input:
        wf.get_result_files(),


rule build_gcnv_model_write_pedigree_run:
    output:
        wf.get_output_files("write_pedigree", "run"),
    run:
        wf.substep_dispatch("write_pedigree", "run", wildcards, output)


rule build_gcnv_model_preprocess_intervals:
    input:
        unpack(wf.get_input_files("gcnv", "preprocess_intervals")),
9 changes: 7 additions & 2 deletions snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py
@@ -90,7 +90,7 @@
from snakemake.io import glob_wildcards

from snappy_pipeline.utils import dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep
+from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart
from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart
from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow

@@ -231,7 +231,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
            (NgsMappingWorkflow,),
        )
        # Register sub step classes so the sub steps are available
-        self.register_sub_step_classes((BuildGcnvWgsModelStepPart,))
+        self.register_sub_step_classes(
+            (
+                WritePedigreeStepPart,
+                BuildGcnvWgsModelStepPart,
+            )
+        )
        # Register sub workflows
        self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])

1 change: 1 addition & 0 deletions snappy_pipeline/workflows/sv_calling_targeted/Snakefile
@@ -98,6 +98,7 @@ rule sv_calling_targeted_gcnv_contig_ploidy:
        wf.get_log_file("gcnv", "contig_ploidy"),
    params:
        args=wf.get_params("gcnv", "contig_ploidy"),
        step_key="sv_calling_targeted",
    wrapper:
        wf.wrapper_path("gcnv/contig_ploidy_case_mode")

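The new step_key parameter tells the contig_ploidy_case_mode wrapper which step's configuration block it is being invoked from. A minimal sketch of how a wrapper could use it to look up step-specific settings such as path_par_intervals; this lookup is an assumption for illustration (the wrappers shown later in this commit still hardcode or comment out the step name):

# Hypothetical use of the step_key param inside a Snakemake wrapper (illustration only).
step_key = snakemake.params.step_key  # e.g. "sv_calling_targeted"
gcnv_config = snakemake.config["step_config"][step_key]["gcnv"]
par_intervals = gcnv_config.get("path_par_intervals")  # interval block list with PAR regions
par_args = f"-XL {par_intervals}" if par_intervals else ""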
17 changes: 15 additions & 2 deletions snappy_pipeline/workflows/sv_calling_targeted/__init__.py
@@ -51,6 +51,8 @@
    path_target_interval_list_mapping: []
    gcnv:
      # Path to interval block list with PAR region for contig calling.
      path_par_intervals: null # REQUIRED
      # Path to gCNV model - will execute analysis in CASE MODE.
      #
      # Example:
@@ -59,17 +61,25 @@
      #   contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy`
      #   model_pattern: /path/to/model_* # Output from `GermlineCNVCaller`
      precomputed_model_paths: []
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    delly2:
      path_exclude_tsv: null # optional
      max_threads: 16
      map_qual: 1
      geno_qual: 5
      qual_tra: 20
      mad_cutoff: 9
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    manta:
      max_threads: 16
      num_threads: 16
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    melt:
      me_refs_infix: 1KGP_Hg19
@@ -79,6 +89,9 @@
        - SVA
      jar_file: REQUIRED
      genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
"""


1 change: 1 addition & 0 deletions snappy_pipeline/workflows/sv_calling_wgs/Snakefile
@@ -474,6 +474,7 @@ rule sv_calling_wgs_gcnv_contig_ploidy:
        wf.get_log_file("gcnv", "contig_ploidy"),
    params:
        args=wf.get_params("gcnv", "contig_ploidy"),
        step_key="sv_calling_targeted",
    wrapper:
        wf.wrapper_path("gcnv/contig_ploidy_case_mode")

25 changes: 21 additions & 4 deletions snappy_pipeline/workflows/sv_calling_wgs/__init__.py
@@ -50,27 +50,38 @@
    # Short-read SV calling tool configuration
    delly2:
      path_exclude_tsv: null # optional
      max_threads: 16
      map_qual: 1
      geno_qual: 5
      qual_tra: 20
      mad_cutoff: 9
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    manta:
      max_threads: 16
      num_threads: 16
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    popdel:
      window_size: 10000000
      max_sv_size: 20000 # == padding
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    gcnv:
      # Path to interval block list with PAR region for contig calling.
      path_par_intervals: null # REQUIRED
      # Path to gCNV model - will execute analysis in CASE MODE.
      #
      # Example of precomputed model:
      # - library: "Agilent SureSelect Human All Exon V6" # Library name
      #   contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy`
      #   model_pattern: /path/to/model_* # Output from `GermlineCNVCaller`
      precomputed_model_paths: [] # REQUIRED
      # Path to BED file with uniquely mappable regions.
      path_uniquely_mapable_bed: null # REQUIRED
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    melt:
      me_refs_infix: 1KGP_Hg19
      me_types:
@@ -79,10 +90,16 @@
        - SVA
      jar_file: REQUIRED
      genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    # Long-read SV calling tool configuration
    sniffles2:
      tandem_repeats: /fast/groups/cubi/work/projects/biotools/sniffles2/trf/GRCh37/human_hs37d5.trf.bed # REQUIRED
      # Skip processing of the following libraries. If the library is in
      # family/pedigree then all of the family/pedigree will be skipped.
      skip_libraries: []
    # Common configuration
    ignore_chroms:
61 changes: 60 additions & 1 deletion snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py
@@ -1,9 +1,39 @@
# -*- coding: utf-8 -*-
import pathlib

from snakemake.shell import shell

# We will first call the GATK tool. However, there are issues with the reliability of the contig
# ploidy calls. This causes an exception in the JointGermlineCNVSegmentation further downstream.
# We thus rewrite the genotype calls based on the ones from the PED files as a workaround.
#
# cf. https://github.com/broadinstitute/gatk/issues/8164

out_path = pathlib.Path(snakemake.output.done).parent

MALE = "male"
FEMALE = "female"

sex_map = {}
for ped_path in snakemake.input.ped:
    with open(ped_path, "rt") as inputf:
        for line in inputf:
            arr = line.strip().split("\t")
            sex_map[arr[1]] = {"1": MALE, "2": FEMALE}[arr[4]]

ploidy_x = {MALE: 1, FEMALE: 2}
ploidy_y = {MALE: 1, FEMALE: 0}

paths_tsv = " ".join(snakemake.input.tsv)

# Add interval block list for PAR regions if configured.
par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"]["gcnv"].get(
    "path_par_intervals"
)
if par_intervals:
    par_args = f"-XL {par_intervals}"
else:
    par_args = ""

shell(
r"""
export TMPDIR=$(mktemp -d)
@@ -26,10 +56,39 @@
    gatk DetermineGermlineContigPloidy \
        -L {snakemake.input.interval_list} \
        {par_args} \
        --interval-merging-rule OVERLAPPING_ONLY \
        $(for tsv in {paths_tsv}; do echo -I $tsv; done) \
        --contig-ploidy-priors $PRIORS \
        --output $(dirname {snakemake.output}) \
        --output-prefix ploidy
"""
)

ploidy_calls = out_path / "ploidy-calls"
for sample_dir in ploidy_calls.glob("SAMPLE_*"):
    path_name = sample_dir / "sample_name.txt"
    with path_name.open("rt") as inputf:
        sample_name = inputf.read().strip()
    sample_sex = sex_map[sample_name]
    path_call = sample_dir / "contig_ploidy.tsv"
    with path_call.open("rt") as inputf:
        call_lines = inputf.readlines()
    for i in range(len(call_lines)):
        line = call_lines[i]
        arr = line.split("\t")
        chrom = arr[0].lower()
        if "x" in chrom or "y" in chrom:
            ploidy = int(arr[1])
            if "x" in arr[0].lower():
                if ploidy != ploidy_x[sample_sex]:
                    arr[1] = str(ploidy_x[sample_sex])
                    arr[2] = f"42.000{ploidy}"  # magic marker
            elif "y" in arr[0].lower():
                if ploidy != ploidy_y[sample_sex]:
                    arr[1] = str(ploidy_y[sample_sex])
                    arr[2] = f"42.000{ploidy}"  # magic marker
            call_lines[i] = "\t".join(arr)
    with path_call.open("wt") as outputf:
        for line in call_lines:
            print(line.strip(), file=outputf)
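To make the workaround concrete, here is what the rewrite above does to a single X-chromosome line of contig_ploidy.tsv for a sample recorded as male in the PED file. The column layout (contig, ploidy, confidence-like value) follows the wrapper's handling; the concrete input values are invented.

# Illustrative only: force the expected X ploidy for a male sample and leave the
# "42.000<old ploidy>" marker that flags a rewritten call (input values invented).
sample_sex = "male"
ploidy_x = {"male": 1, "female": 2}

call_line = "X\t2\t55.000"  # contig, called ploidy, confidence-like value
arr = call_line.split("\t")
ploidy = int(arr[1])
if ploidy != ploidy_x[sample_sex]:
    arr[1] = str(ploidy_x[sample_sex])  # rewrite to the sex-expected ploidy (1)
    arr[2] = f"42.000{ploidy}"  # magic marker, here "42.0002"
print("\t".join(arr))  # -> "X\t1\t42.0002"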
58 changes: 57 additions & 1 deletion snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py
@@ -1,9 +1,37 @@
# -*- coding: utf-8 -*-
import pathlib

from snakemake.shell import shell

# We will first call the GATK tool. However, there are issues with the reliability of the contig
# ploidy calls. This causes an exception in the JointGermlineCNVSegmentation further downstream.
# We thus rewrite the genotype calls based on the ones from the PED files as a workaround.
#
# cf. https://github.com/broadinstitute/gatk/issues/8164

out_path = pathlib.Path(snakemake.output.done).parent

MALE = "male"
FEMALE = "female"

sex_map = {}
for ped_path in snakemake.input.ped:
    with open(ped_path, "rt") as inputf:
        for line in inputf:
            arr = line.strip().split("\t")
            sex_map[arr[1]] = {"1": MALE, "2": FEMALE}[arr[4]]

ploidy_x = {MALE: 1, FEMALE: 2}
ploidy_y = {MALE: 1, FEMALE: 0}

paths_tsv = " ".join(snakemake.input.tsv)

## Add interval block list for PAR regions if configured.
# par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"].get("path_par_intervals")
# if par_intervals:
# par_args = f"-XL {par_intervals}"
# else:
# par_args = ""

shell(
r"""
export THEANO_FLAGS="base_compiledir=$TMPDIR/theano_compile_dir"
@@ -17,3 +45,31 @@
--output-prefix ploidy
"""
)

ploidy_calls = out_path / "ploidy-calls"
for sample_dir in ploidy_calls.glob("SAMPLE_*"):
    path_name = sample_dir / "sample_name.txt"
    with path_name.open("rt") as inputf:
        sample_name = inputf.read().strip()
    sample_sex = sex_map[sample_name]
    path_call = sample_dir / "contig_ploidy.tsv"
    with path_call.open("rt") as inputf:
        call_lines = inputf.readlines()
    for i in range(len(call_lines)):
        line = call_lines[i]
        arr = line.split("\t")
        chrom = arr[0].lower()
        if "x" in chrom or "y" in chrom:
            ploidy = int(arr[1])
            if "x" in arr[0].lower():
                if ploidy != ploidy_x[sample_sex]:
                    arr[1] = str(ploidy_x[sample_sex])
                    arr[2] = f"42.000{ploidy}"  # magic marker
            elif "y" in arr[0].lower():
                if ploidy != ploidy_y[sample_sex]:
                    arr[1] = str(ploidy_y[sample_sex])
                    arr[2] = f"42.000{ploidy}"  # magic marker
            call_lines[i] = "\t".join(arr)
    with path_call.open("wt") as outputf:
        for line in call_lines:
            print(line.strip(), file=outputf)
@@ -13,7 +13,7 @@
        target_interval_bed = item["path"]
        break
else:  # of for, did not break out
-    raise Exception("Found no target intervals for %s" % item["name"])
+    raise Exception("Found no target intervals for %s" % snakemake.wildcards.library_kit)

shell(
r"""
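The message change above matters because of how Python's for/else behaves: the else branch runs only when the loop finished without a break, at which point item is the last (non-matching) mapping entry, so reporting item["name"] names the wrong thing, whereas the library_kit wildcard is what was actually not found. A self-contained sketch of the lookup pattern; the mapping entries and the matching rule are made up for illustration:

# Illustrative for/else lookup over a target-interval mapping (entries made up).
mapping = [
    {"name": "Agilent SureSelect Human All Exon V6", "path": "/path/to/v6.bed"},
]
library_kit = "Some Other Exome Kit"  # the kit we are looking for

for item in mapping:
    if item["name"] == library_kit:
        target_interval_bed = item["path"]
        break
else:  # loop completed without break: no entry matched
    # Naming library_kit (not item["name"]) reports what is actually missing.
    raise Exception("Found no target intervals for %s" % library_kit)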
@@ -366,6 +366,7 @@ def test_gcnv_step_part_get_resource_usage(sv_calling_targeted_workflow):
    high_resource_actions = (
        "call_cnvs",
        "post_germline_calls",
        "joint_germline_cnv_segmentation",
    )
    all_actions = sv_calling_targeted_workflow.substep_getattr("gcnv", "actions")
    default_actions = [action for action in all_actions if action not in high_resource_actions]
