feat: merge gCNV output in case of multi-kit families (#292) #465

Merged
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# Pipfiles created by VS Code
/Pipfile*

# Mac
.*DS_Store

1 change: 1 addition & 0 deletions setup.cfg
@@ -30,6 +30,7 @@ line_length = 100
exclude =
docs
tests
venv
.*.py
.snakemake.*.wrapper.py
splitMNPsAndComplex.py
93 changes: 68 additions & 25 deletions snappy_pipeline/workflows/common/gcnv/gcnv_run.py
@@ -432,22 +432,16 @@ class JointGermlineCnvSegmentationMixin:

@dictify
def _get_output_files_joint_germline_cnv_segmentation(self):
name_pattern = "{mapper}.gcnv.{library_name}"
name_pattern = "{mapper}.gcnv_joint_segmentation.{kit}.{library_name}"
work_files = {}
for key, suffix in RESULT_EXTENSIONS.items():
work_files[key] = f"work/{name_pattern}/out/{name_pattern}{suffix}"
yield from work_files.items()
yield "output_links", [
re.sub(r"^work/", "output/", work_path)
for work_path in chain(
work_files.values(), self.get_log_file("joint_germline_cnv_segmentation").values()
)
]

@dictify
def _get_log_file_joint_germline_cnv_segmentation(self):
"""Return log file **pattern** for the step ``joint_germline_cnv_segmentation``."""
name_pattern = "{mapper}.gcnv.{library_name}"
name_pattern = "{mapper}.gcnv_joint_segmentation.{kit}.{library_name}"
for key, ext in LOG_EXTENSIONS.items():
yield key, f"work/{name_pattern}/log/{name_pattern}.joint_germline_segmentation{ext}"

@@ -456,28 +450,79 @@ def _get_input_files_joint_germline_cnv_segmentation(self, wildcards):
# Yield list of paths to input VCF files
pedigree = self.index_ngs_library_to_pedigree[wildcards.library_name]
ped_ngs_library_names = [
donor.dna_ngs_library.name for donor in pedigree.donors if donor.dna_ngs_library
donor.dna_ngs_library.name
for donor in pedigree.donors
if donor.dna_ngs_library
and self.ngs_library_to_kit[donor.dna_ngs_library.name] == wildcards.kit
]
vcfs = []
for library_name in sorted(ped_ngs_library_names):
name_pattern = f"{wildcards.mapper}.gcnv_post_germline_calls.{library_name}"
vcfs.append(f"work/{name_pattern}/out/{name_pattern}.vcf.gz")
yield "vcf", vcfs
# Yield path to interval list file
library_kit = self.ngs_library_to_kit[wildcards.library_name]
name_pattern = f"gcnv_preprocess_intervals.{library_kit}"
name_pattern = f"gcnv_preprocess_intervals.{wildcards.kit}"
yield "interval_list", f"work/{name_pattern}/out/{name_pattern}.interval_list"
# Yield path to pedigree file
name_pattern = f"write_pedigree.{wildcards.library_name}"
yield "ped", f"work/{name_pattern}/out/{wildcards.library_name}.ped"


class MergeMultikitFamiliesMixin:
"""Methods for merging families with multiple kits.

In the case of a single kit per family, we can simply copy the input
to the output.
"""

@dictify
def _get_output_files_merge_multikit_families(self):
name_pattern = "{mapper}.gcnv.{library_name}"
work_files = {}
for key, suffix in RESULT_EXTENSIONS.items():
work_files[key] = f"work/{name_pattern}/out/{name_pattern}{suffix}"
yield from work_files.items()
yield "output_links", [
re.sub(r"^work/", "output/", work_path)
for work_path in chain(
work_files.values(), self.get_log_file("merge_multikit_families").values()
)
]

@dictify
def _get_log_file_merge_multikit_families(self):
"""Return log file **pattern** for the step ``merge_multikit_families``."""
name_pattern = "{mapper}.gcnv.{library_name}"
for key, ext in LOG_EXTENSIONS.items():
yield key, f"work/{name_pattern}/log/{name_pattern}.merge_multikit_families{ext}"

@dictify
def _get_input_files_merge_multikit_families(self, wildcards):
# Obtain all kits that we need to merge for this family.
pedigree = self.index_ngs_library_to_pedigree[wildcards.library_name]
kits = []
for donor in pedigree.donors:
if donor.dna_ngs_library:
kit = self.ngs_library_to_kit[donor.dna_ngs_library.name]
if kit not in kits:
kits.append(kit)
# Yield list of paths to input VCF files
vcfs = []
for kit in kits:
name_pattern = (
f"{wildcards.mapper}.gcnv_joint_segmentation.{kit}.{wildcards.library_name}"
)
vcfs.append(f"work/{name_pattern}/out/{name_pattern}.vcf.gz")
yield "vcf", vcfs


class RunGcnvStepPart(
ValidationMixin,
ContigPloidyMixin,
CallCnvsMixin,
PostGermlineCallsMixin,
JointGermlineCnvSegmentationMixin,
MergeMultikitFamiliesMixin,
GcnvCommonStepPart,
):
"""Class with methods to run GATK4 gCNV calling in CASE mode"""
@@ -490,6 +535,7 @@ class RunGcnvStepPart(
"call_cnvs",
"post_germline_calls",
"joint_germline_cnv_segmentation",
"merge_multikit_families",
)

def __init__(self, parent):
@@ -530,12 +576,12 @@ def get_result_files(self):
# Get list with all result path template strings.
result_path_tpls = [
path
for path in flatten(self._get_output_files_joint_germline_cnv_segmentation().values())
for path in flatten(self._get_output_files_merge_multikit_families().values())
if path.startswith("output/")
]

# Iterate over all pedigrees. Check library kit consistency for each pedigree. In case of inconsistent
# library kits within one pedigree, raise a warning and skip this pedigree.
# library kits within one pedigree, raise a warning that the merged result may contain artifacts.
for index_library_name, pedigree in self.index_ngs_library_to_pedigree.items():
# Obtain all library names
library_names = [
@@ -546,17 +592,14 @@ names_kits = list(zip(library_names, library_kits))
names_kits = list(zip(library_names, library_kits))
msg = (
"Found inconsistent library kits (more than one kit!) for pedigree with index "
f"{index_library_name}. The library name/kit pairs are {names_kits}. This pedigree "
"will be SKIPPED in CNV call generation."
f"{index_library_name}. The library name/kit pairs are {names_kits}. SNAPPY "
"will attempt to merge the results but the result may contain artifacts."
)
warnings.warn(InconsistentLibraryKitsWarning(msg))
continue
else:
# The library kits are consistent in the pedigree. Yield all concrete output paths, replacing
# prefix "work/" by "output/".
for path_tpl in result_path_tpls:
yield from expand(
path_tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
library_name=[index_library_name],
)
# Yield all concrete output paths, replacing prefix "work/" by "output/".
for path_tpl in result_path_tpls:
yield from expand(
path_tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
library_name=[index_library_name],
)
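
Note on the kit-aware input functions above: the snippet below is a minimal, self-contained sketch of the grouping step in _get_input_files_merge_multikit_families, runnable outside the pipeline. The Donor/Library stand-ins and the library-to-kit mapping are hypothetical; the real step resolves them via self.index_ngs_library_to_pedigree and self.ngs_library_to_kit.

from dataclasses import dataclass
from typing import Optional


@dataclass
class Library:
    name: str


@dataclass
class Donor:
    dna_ngs_library: Optional[Library]


# Hypothetical library-to-kit assignment for a trio sequenced with two kits.
ngs_library_to_kit = {
    "index-N1-DNA1-WES1": "agilent_v6",
    "father-N1-DNA1-WES1": "agilent_v6",
    "mother-N1-DNA1-WES1": "agilent_v7",
}


def per_kit_vcfs(donors, mapper, index_library_name):
    """Collect the distinct kits of a pedigree in first-seen order and build
    one joint-segmentation VCF path per kit, mirroring the mixin above."""
    kits = []
    for donor in donors:
        if donor.dna_ngs_library:
            kit = ngs_library_to_kit[donor.dna_ngs_library.name]
            if kit not in kits:
                kits.append(kit)
    vcfs = []
    for kit in kits:
        name_pattern = f"{mapper}.gcnv_joint_segmentation.{kit}.{index_library_name}"
        vcfs.append(f"work/{name_pattern}/out/{name_pattern}.vcf.gz")
    return vcfs


donors = [Donor(Library(name)) for name in ngs_library_to_kit]
print(per_kit_vcfs(donors, "bwa", "index-N1-DNA1-WES1"))
# Two paths: one VCF for agilent_v6, one for agilent_v7.

For a single-kit family the list contains exactly one VCF, which is why the downstream merge step can simply copy its input through.
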
17 changes: 17 additions & 0 deletions snappy_pipeline/workflows/sv_calling_targeted/Snakefile
@@ -158,6 +158,23 @@ rule sv_calling_targeted_gcnv_joint_germline_cnv_segmentation:
wf.wrapper_path("gcnv/joint_germline_cnv_segmentation")


rule sv_calling_targeted_gcnv_merge_multikit_families:
input:
unpack(wf.get_input_files("gcnv", "merge_multikit_families")),
output:
**wf.get_output_files("gcnv", "merge_multikit_families"),
threads: wf.get_resource("gcnv", "merge_multikit_families", "threads")
resources:
time=wf.get_resource("gcnv", "merge_multikit_families", "time"),
memory=wf.get_resource("gcnv", "merge_multikit_families", "memory"),
partition=wf.get_resource("gcnv", "merge_multikit_families", "partition"),
tmpdir=wf.get_resource("gcnv", "merge_multikit_families", "tmpdir"),
log:
**wf.get_log_file("gcnv", "merge_multikit_families"),
wrapper:
wf.wrapper_path("gcnv/merge_multikit_families")


# Run Manta -------------------------------------------------------------------


17 changes: 17 additions & 0 deletions snappy_pipeline/workflows/sv_calling_wgs/Snakefile
@@ -542,3 +542,20 @@ rule sv_calling_wgs_gcnv_joint_germline_cnv_segmentation:
**wf.get_log_file("gcnv", "joint_germline_cnv_segmentation"),
wrapper:
wf.wrapper_path("gcnv/joint_germline_cnv_segmentation")


rule sv_calling_wgs_gcnv_merge_multikit_families:
input:
unpack(wf.get_input_files("gcnv", "merge_multikit_families")),
output:
**wf.get_output_files("gcnv", "merge_multikit_families"),
threads: wf.get_resource("gcnv", "merge_multikit_families", "threads")
resources:
time=wf.get_resource("gcnv", "merge_multikit_families", "time"),
memory=wf.get_resource("gcnv", "merge_multikit_families", "memory"),
partition=wf.get_resource("gcnv", "merge_multikit_families", "partition"),
tmpdir=wf.get_resource("gcnv", "merge_multikit_families", "tmpdir"),
log:
**wf.get_log_file("gcnv", "merge_multikit_families"),
wrapper:
wf.wrapper_path("gcnv/merge_multikit_families")