feat: merge gCNV output in case of multi-kit families (#292) #465

Merged
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# Pipfiles created by VS Code
/Pipfile*

# Mac
.*DS_Store

1 change: 1 addition & 0 deletions setup.cfg
@@ -30,6 +30,7 @@ line_length = 100
exclude =
docs
tests
venv
.*.py
.snakemake.*.wrapper.py
splitMNPsAndComplex.py
93 changes: 68 additions & 25 deletions snappy_pipeline/workflows/common/gcnv/gcnv_run.py
@@ -432,22 +432,16 @@ class JointGermlineCnvSegmentationMixin:

@dictify
def _get_output_files_joint_germline_cnv_segmentation(self):
name_pattern = "{mapper}.gcnv.{library_name}"
name_pattern = "{mapper}.gcnv_joint_segmentation.{kit}.{library_name}"
work_files = {}
for key, suffix in RESULT_EXTENSIONS.items():
work_files[key] = f"work/{name_pattern}/out/{name_pattern}{suffix}"
yield from work_files.items()
yield "output_links", [
re.sub(r"^work/", "output/", work_path)
for work_path in chain(
work_files.values(), self.get_log_file("joint_germline_cnv_segmentation").values()
)
]

@dictify
def _get_log_file_joint_germline_cnv_segmentation(self):
"""Return log file **pattern** for the step ``joint_germline_cnv_segmentation``."""
name_pattern = "{mapper}.gcnv.{library_name}"
name_pattern = "{mapper}.gcnv_joint_segmentation.{kit}.{library_name}"
for key, ext in LOG_EXTENSIONS.items():
yield key, f"work/{name_pattern}/log/{name_pattern}.joint_germline_segmentation{ext}"

@@ -456,28 +450,79 @@ def _get_input_files_joint_germline_cnv_segmentation(self, wildcards):
# Yield list of paths to input VCF files
pedigree = self.index_ngs_library_to_pedigree[wildcards.library_name]
ped_ngs_library_names = [
donor.dna_ngs_library.name for donor in pedigree.donors if donor.dna_ngs_library
donor.dna_ngs_library.name
for donor in pedigree.donors
if donor.dna_ngs_library
and self.ngs_library_to_kit[donor.dna_ngs_library.name] == wildcards.kit
]
vcfs = []
for library_name in sorted(ped_ngs_library_names):
name_pattern = f"{wildcards.mapper}.gcnv_post_germline_calls.{library_name}"
vcfs.append(f"work/{name_pattern}/out/{name_pattern}.vcf.gz")
yield "vcf", vcfs
# Yield path to interval list file
library_kit = self.ngs_library_to_kit[wildcards.library_name]
name_pattern = f"gcnv_preprocess_intervals.{library_kit}"
name_pattern = f"gcnv_preprocess_intervals.{wildcards.kit}"
yield "interval_list", f"work/{name_pattern}/out/{name_pattern}.interval_list"
# Yield path to pedigree file
name_pattern = f"write_pedigree.{wildcards.library_name}"
yield "ped", f"work/{name_pattern}/out/{wildcards.library_name}.ped"


class MergeMultikitFamiliesMixin:
"""Methods for merging families with multiple kits.

In the case of a single kit per family, we can simply copy the input
to the output.
"""

@dictify
def _get_output_files_merge_multikit_families(self):
name_pattern = "{mapper}.gcnv.{library_name}"
work_files = {}
for key, suffix in RESULT_EXTENSIONS.items():
work_files[key] = f"work/{name_pattern}/out/{name_pattern}{suffix}"
yield from work_files.items()
yield "output_links", [
re.sub(r"^work/", "output/", work_path)
for work_path in chain(
work_files.values(), self.get_log_file("merge_multikit_families").values()
)
]

@dictify
def _get_log_file_merge_multikit_families(self):
"""Return log file **pattern** for the step ``merge_multikit_families``."""
name_pattern = "{mapper}.gcnv.{library_name}"
for key, ext in LOG_EXTENSIONS.items():
yield key, f"work/{name_pattern}/log/{name_pattern}.merge_multikit_families{ext}"

@dictify
def _get_input_files_merge_multikit_families(self, wildcards):
# Obtain all kits that we need to merge for this family.
pedigree = self.index_ngs_library_to_pedigree[wildcards.library_name]
kits = []
for donor in pedigree.donors:
if donor.dna_ngs_library:
kit = self.ngs_library_to_kit[donor.dna_ngs_library.name]
if kit not in kits:
kits.append(kit)
# Yield list of paths to input VCF files
vcfs = []
for kit in kits:
name_pattern = (
f"{wildcards.mapper}.gcnv_joint_segmentation.{kit}.{wildcards.library_name}"
)
vcfs.append(f"work/{name_pattern}/out/{name_pattern}.vcf.gz")
yield "vcf", vcfs


class RunGcnvStepPart(
ValidationMixin,
ContigPloidyMixin,
CallCnvsMixin,
PostGermlineCallsMixin,
JointGermlineCnvSegmentationMixin,
MergeMultikitFamiliesMixin,
GcnvCommonStepPart,
):
"""Class with methods to run GATK4 gCNV calling in CASE mode"""
@@ -490,6 +535,7 @@ class RunGcnvStepPart(
"call_cnvs",
"post_germline_calls",
"joint_germline_cnv_segmentation",
"merge_multikit_families",
)

def __init__(self, parent):
@@ -530,12 +576,12 @@ def get_result_files(self):
# Get list with all result path template strings.
result_path_tpls = [
path
for path in flatten(self._get_output_files_joint_germline_cnv_segmentation().values())
for path in flatten(self._get_output_files_merge_multikit_families().values())
if path.startswith("output/")
]

# Iterate over all pedigrees. Check library kit consistency for each pedigree. In case of inconsistent
# library kits within one pedigree, raise a warning and skip this pedigree.
# library kits within one pedigree, raise a warning that the merged result may contain artifacts.
for index_library_name, pedigree in self.index_ngs_library_to_pedigree.items():
# Obtain all library names
library_names = [
@@ -546,17 +592,14 @@ names_kits = list(zip(library_names, library_kits))
names_kits = list(zip(library_names, library_kits))
msg = (
"Found inconsistent library kits (more than one kit!) for pedigree with index "
f"{index_library_name}. The library name/kit pairs are {names_kits}. This pedigree "
"will be SKIPPED in CNV call generation."
f"{index_library_name}. The library name/kit pairs are {names_kits}. SNAPPY "
"will attempt to merge the results but the result may contain artifacts."
)
warnings.warn(InconsistentLibraryKitsWarning(msg))
continue
else:
# The library kits are consistent in the pedigree. Yield all concrete output paths, replacing
# prefix "work/" by "output/".
for path_tpl in result_path_tpls:
yield from expand(
path_tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
library_name=[index_library_name],
)
# Yield all concrete output paths, replacing prefix "work/" by "output/".
for path_tpl in result_path_tpls:
yield from expand(
path_tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
library_name=[index_library_name],
)
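
Note on the kit-aware input functions above: the snippet below is a minimal, self-contained sketch of the grouping step in _get_input_files_merge_multikit_families, runnable outside the pipeline. The Donor/Library stand-ins and the library-to-kit mapping are hypothetical; the real step resolves them via self.index_ngs_library_to_pedigree and self.ngs_library_to_kit.

from dataclasses import dataclass
from typing import Optional


@dataclass
class Library:
    name: str


@dataclass
class Donor:
    dna_ngs_library: Optional[Library]


# Hypothetical library-to-kit assignment for a trio sequenced with two kits.
ngs_library_to_kit = {
    "index-N1-DNA1-WES1": "agilent_v6",
    "father-N1-DNA1-WES1": "agilent_v6",
    "mother-N1-DNA1-WES1": "agilent_v7",
}


def per_kit_vcfs(donors, mapper, index_library_name):
    """Collect the distinct kits of a pedigree in first-seen order and build
    one joint-segmentation VCF path per kit, mirroring the mixin above."""
    kits = []
    for donor in donors:
        if donor.dna_ngs_library:
            kit = ngs_library_to_kit[donor.dna_ngs_library.name]
            if kit not in kits:
                kits.append(kit)
    vcfs = []
    for kit in kits:
        name_pattern = f"{mapper}.gcnv_joint_segmentation.{kit}.{index_library_name}"
        vcfs.append(f"work/{name_pattern}/out/{name_pattern}.vcf.gz")
    return vcfs


donors = [Donor(Library(name)) for name in ngs_library_to_kit]
print(per_kit_vcfs(donors, "bwa", "index-N1-DNA1-WES1"))
# Two paths: one VCF for agilent_v6, one for agilent_v7.

For a single-kit family the list contains exactly one VCF, which is why the downstream merge step can simply copy its input through.
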
17 changes: 17 additions & 0 deletions snappy_pipeline/workflows/sv_calling_targeted/Snakefile
@@ -158,6 +158,23 @@ rule sv_calling_targeted_gcnv_joint_germline_cnv_segmentation:
wf.wrapper_path("gcnv/joint_germline_cnv_segmentation")


rule sv_calling_targeted_gcnv_merge_multikit_families:
input:
unpack(wf.get_input_files("gcnv", "merge_multikit_families")),
output:
**wf.get_output_files("gcnv", "merge_multikit_families"),
threads: wf.get_resource("gcnv", "merge_multikit_families", "threads")
resources:
time=wf.get_resource("gcnv", "merge_multikit_families", "time"),
memory=wf.get_resource("gcnv", "merge_multikit_families", "memory"),
partition=wf.get_resource("gcnv", "merge_multikit_families", "partition"),
tmpdir=wf.get_resource("gcnv", "merge_multikit_families", "tmpdir"),
log:
**wf.get_log_file("gcnv", "merge_multikit_families"),
wrapper:
wf.wrapper_path("gcnv/merge_multikit_families")


# Run Manta -------------------------------------------------------------------


17 changes: 17 additions & 0 deletions snappy_pipeline/workflows/sv_calling_wgs/Snakefile
@@ -542,3 +542,20 @@ rule sv_calling_wgs_gcnv_joint_germline_cnv_segmentation:
**wf.get_log_file("gcnv", "joint_germline_cnv_segmentation"),
wrapper:
wf.wrapper_path("gcnv/joint_germline_cnv_segmentation")


rule sv_calling_wgs_gcnv_merge_multikit_families:
input:
unpack(wf.get_input_files("gcnv", "merge_multikit_families")),
output:
**wf.get_output_files("gcnv", "merge_multikit_families"),
threads: wf.get_resource("gcnv", "merge_multikit_families", "threads")
resources:
time=wf.get_resource("gcnv", "merge_multikit_families", "time"),
memory=wf.get_resource("gcnv", "merge_multikit_families", "memory"),
partition=wf.get_resource("gcnv", "merge_multikit_families", "partition"),
tmpdir=wf.get_resource("gcnv", "merge_multikit_families", "tmpdir"),
log:
**wf.get_log_file("gcnv", "merge_multikit_families"),
wrapper:
wf.wrapper_path("gcnv/merge_multikit_families")