bihealth · ericblanc20 · Oct 11, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 31, 2023
diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py
@@ -32,6 +32,7 @@
     ngs_sanity_checking,
     panel_of_normals,
     repeat_expansion,
+    somatic_cnv_checking,
     somatic_gene_fusion_calling,
     somatic_hla_loh_calling,
     somatic_msi_calling,
@@ -88,6 +89,7 @@
     "panel_of_normals": panel_of_normals,
     "repeat_analysis": repeat_expansion,
     "ngs_sanity_checking": ngs_sanity_checking,
+    "somatic_cnv_checking": somatic_cnv_checking,
     "somatic_gene_fusion_calling": somatic_gene_fusion_calling,
     "somatic_hla_loh_calling": somatic_hla_loh_calling,
     "somatic_msi_calling": somatic_msi_calling,

diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile b/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile
@@ -61,7 +61,12 @@ rule homologous_recombination_deficiency_link_out_run:
 rule homologous_recombination_deficiency_scarHRD_install:
     output:
         **wf.get_output_files("scarHRD", "install"),
-    threads: wf.get_resource("scarHRD", "install", "threads")
+    params:
+        packages=[
+            {"name": "aroneklund/copynumber", "repo": "github"},
+            {"name": "sequenzatools/sequenza", "repo": "bitbucket"},
+            {"name": "sztup/scarHRD", "repo": "github"},
+        ],
     resources:
         time=wf.get_resource("scarHRD", "install", "time"),
         memory=wf.get_resource("scarHRD", "install", "memory"),
@@ -70,22 +75,7 @@ rule homologous_recombination_deficiency_scarHRD_install:
     log:
         **wf.get_log_file("scarHRD", "install"),
     wrapper:
-        wf.wrapper_path("scarHRD/install")
-
-
-rule homologous_recombination_deficiency_scarHRD_gcreference:
-    output:
-        **wf.get_output_files("scarHRD", "gcreference"),
-    threads: wf.get_resource("scarHRD", "gcreference", "threads")
-    resources:
-        time=wf.get_resource("scarHRD", "gcreference", "time"),
-        memory=wf.get_resource("scarHRD", "gcreference", "memory"),
-        partition=wf.get_resource("scarHRD", "gcreference", "partition"),
-        tmpdir=wf.get_resource("scarHRD", "gcreference", "tmpdir"),
-    log:
-        **wf.get_log_file("scarHRD", "gcreference"),
-    wrapper:
-        wf.wrapper_path("scarHRD/gcreference")
+        wf.wrapper_path("r")
 
 
 rule homologous_recombination_deficiency_scarHRD_run:

diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py
@@ -13,12 +13,17 @@
 bioconda directory, while this package is not found when searching `bioconda<https://bioconda.github.io/index.html>`.
 copynumber is obtained from a fork of the official Bioconductor deprecated package.
 
+TODO: The implementation needs to uncouple ``scarHRD`` from ``sequenza``.
+It should be possible as ``scarHRD`` could use the output from other CNV callers.
+However, it is poorly documented, and will require significant work.
+
 ==========
 Step Input
 ==========
 
-``homologous_recombination_deficiency`` starts off the aligned reads, i.e. ``ngs_mapping``.
-Both the normal & tumor samples are required to generate the score.
+``homologous_recombination_deficiency`` starts off the ``sequenza`` copy number output.
+Some steps performed by ``sequenza`` are repeated by ``scanHRD``, as ``scanHRD`` uses a specific
+set of arguments. This is the reason why ``sequenza`` is required by the step.
 
 ===========
 Step Output
@@ -32,9 +37,9 @@
     of this tool.  In the future, this section might contain "common" output and tool-specific
     output sub sections.
 
-- ``{mapper}.scarHRD.{lib_name}-{lib_pk}/out/``
-    - ``{mapper}.scarHRD.{lib_name}-{lib_pk}.seqz.gz``
-    - ``{mapper}.scarHRD.{lib_name}-{lib_pk}.json``
+- ``{mapper}.{caller}.scarHRD.{lib_name}-{lib_pk}/out/``
+    - ``{mapper}.{caller}.scarHRD.{lib_name}-{lib_pk}.seqz.gz``
+    - ``{mapper}.{caller}.scarHRD.{lib_name}-{lib_pk}.json``
 
 =====================
 Default Configuration
@@ -52,7 +57,6 @@
 
 """
 
-from collections import OrderedDict
 import sys
 
 from biomedsheets.shortcuts import CancerCaseSheet, is_not_background
@@ -66,7 +70,9 @@
     LinkOutStepPart,
     ResourceUsage,
 )
-from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
+from snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling import (
+    SomaticTargetedSeqCnvCallingWorkflow,
+)
 
 __author__ = "Eric Blanc <[email protected]>"
 
@@ -75,8 +81,8 @@
 # Default configuration homologous_recombination_deficiency
 step_config:
   homologous_recombination_deficiency:
-    tools: ['scarHRD']  # REQUIRED - available: 'mantis'
-    path_ngs_mapping: ../ngs_mapping  # REQUIRED
+    tools: ['scarHRD']  # REQUIRED - available: 'scarHRD'
+    path_cnv_calling: ../somatic_targeted_seq_cnv_calling  # REQUIRED
     scarHRD:
       genome_name: "grch37"  # Must be either "grch37", "grch38" or "mouse"
       chr_prefix: False
@@ -93,81 +99,31 @@ class ScarHRDStepPart(BaseStepPart):
     #: Class available actions
     actions = (
         "install",
-        "gcreference",
         "run",
     )
 
     def __init__(self, parent):
         super().__init__(parent)
-        self.base_path_out = (
-            "work/{{mapper}}.scarHRD.{{library_name}}/out/{{mapper}}.scarHRD.{{library_name}}{ext}"
-        )
-        # Build shortcut from cancer bio sample name to matched cancer sample
-        self.tumor_ngs_library_to_sample_pair = OrderedDict()
-        for sheet in self.parent.shortcut_sheets:
-            self.tumor_ngs_library_to_sample_pair.update(
-                sheet.all_sample_pairs_by_tumor_dna_ngs_library
-            )
-
-    def get_normal_lib_name(self, wildcards):
-        """Return name of normal (non-cancer) library"""
-        pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name]
-        return pair.normal_sample.dna_ngs_library.name
 
     def get_input_files(self, action):
-        def input_function_run(wildcards):
-            """Helper wrapper function"""
-            # Get shorcut to Snakemake sub workflow
-            ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
-            # Get names of primary libraries of the selected cancer bio sample and the
-            # corresponding primary normal sample
-            normal_base_path = (
-                "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
-                    normal_library=self.get_normal_lib_name(wildcards), **wildcards
-                )
-            )
-            tumor_base_path = (
-                "output/{mapper}.{library_name}/out/" "{mapper}.{library_name}"
-            ).format(**wildcards)
-            return {
-                "lib_path": "work/R_packages/out/.done",
-                "gc": "work/static_data/out/{genome_name}_{length}.wig.gz".format(
-                    genome_name=self.config["scarHRD"]["genome_name"],
-                    length=self.config["scarHRD"]["length"],
-                ),
-                "normal_bam": ngs_mapping(normal_base_path + ".bam"),
-                "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
-                "tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
-                "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
-            }
+        self._validate_action(action)
 
-        if action == "install":
-            return None
-        elif action == "gcreference":
-            return None
-        elif action == "run":
-            return input_function_run
-        else:
-            raise UnsupportedActionException(
-                "Action '{action}' is not supported. Valid options: {valid}".format(
-                    action=action, valid=", ".join(self.actions)
-                )
-            )
+        return self._get_input_files_run
+
+    @dictify
+    def _get_input_files_run(self, wildcards):
+        self.cnv_calling = self.parent.sub_workflows["cnv_calling"]
+        base_name = f"{wildcards.mapper}.{wildcards.caller}.{wildcards.library_name}"
+        yield "done", "work/R_packages/out/scarHRD.done"
+        yield "seqz", self.cnv_calling(f"output/{base_name}/out/{base_name}.seqz.gz")
 
     def get_output_files(self, action):
         if action == "install":
-            return {"lib_path": "work/R_packages/out/.done"}
-        elif action == "gcreference":
-            return {
-                "gc": "work/static_data/out/{genome_name}_{length}.wig.gz".format(
-                    genome_name=self.config["scarHRD"]["genome_name"],
-                    length=self.config["scarHRD"]["length"],
-                )
-            }
+            return {"done": "work/R_packages/out/scarHRD.done"}
         elif action == "run":
             return {
-                "sequenza": "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}.seqz.gz",
-                "scarHRD": "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}.json",
+                "scarHRD": "work/{mapper}.{caller}.scarHRD.{library_name}/out/{mapper}.{caller}.scarHRD.{library_name}.json",
+                "scarHRD_md5": "work/{mapper}.{caller}.scarHRD.{library_name}/out/{mapper}.{caller}.scarHRD.{library_name}.json.md5",
             }
         else:
             raise UnsupportedActionException(
@@ -180,14 +136,9 @@ def get_output_files(self, action):
     def _get_log_file(self, action):
         """Return dict of log files."""
         if action == "install":
-            prefix = "work/R_packages/log/R_packages"
-        elif action == "gcreference":
-            prefix = "work/static_data/log/{genome_name}_{length}".format(
-                genome_name=self.config["scarHRD"]["genome_name"],
-                length=self.config["scarHRD"]["length"],
-            )
+            prefix = "work/R_packages/log/scarHRD"
         elif action == "run":
-            prefix = "work/{mapper}.scarHRD.{library_name}/log/{mapper}.scarHRD.{library_name}"
+            prefix = "work/{mapper}.{caller}.scarHRD.{library_name}/log/{mapper}.{caller}.scarHRD.{library_name}"
         else:
             raise UnsupportedActionException(
                 "Action '{action}' is not supported. Valid options: {valid}".format(
@@ -204,32 +155,15 @@ def _get_log_file(self, action):
             yield key + "_md5", prefix + ext + ".md5"
 
     def get_resource_usage(self, action):
-        """Get Resource Usage
-
-        :param action: Action (i.e., step) in the workflow, example: 'run'.
-        :type action: str
-
-        :return: Returns ResourceUsage for step.
-        """
-        if action == "install" or action == "gcreference":
+        self._validate_action(action)
+        if action == "run":
             return ResourceUsage(
                 threads=1,
-                time="02:00:00",  # 2 hours
-                memory="4096M",
-                partition="short",
-            )
-        elif action == "run":
-            return ResourceUsage(
-                threads=2,
-                time="48:00:00",  # 2 hours
                 memory="32G",
+                time="24:00:00",
             )
         else:
-            raise UnsupportedActionException(
-                "Action '{action}' is not supported. Valid options: {valid}".format(
-                    action=action, valid=", ".join(self.actions)
-                )
-            )
+            return super().get_resource_usage(action)
 
 
 class HomologousRecombinationDeficiencyWorkflow(BaseStep):
@@ -253,16 +187,18 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_lookup_paths,
             config_paths,
             workdir,
-            (NgsMappingWorkflow,),
+            (SomaticTargetedSeqCnvCallingWorkflow,),
         )
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes((ScarHRDStepPart, LinkOutStepPart))
         # Initialize sub-workflows
-        self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
+        self.register_sub_workflow(
+            "somatic_targeted_seq_cnv_calling", self.config["path_cnv_calling"], "cnv_calling"
+        )
 
     @listify
     def get_result_files(self):
-        """Return list of result files for the somatic targeted sequencing CNV calling step"""
+        """Return list of result files for the homologous recombination deficiency step"""
         tool_actions = {"scarHRD": ("run",)}
         for sheet in filter(is_not_background, self.shortcut_sheets):
             for sample_pair in sheet.all_sample_pairs:
@@ -282,14 +218,17 @@ def get_result_files(self):
                             tpls = self.sub_steps[tool].get_output_files(action).values()
                         except AttributeError:
                             tpls = self.sub_steps[tool].get_output_files(action)
+                        tpls = list(tpls)
+                        tpls += list(self.sub_steps[tool].get_log_file(action).values())
                         for tpl in tpls:
                             filenames = expand(
                                 tpl,
                                 mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+                                caller=["sequenza"],
                                 library_name=[sample_pair.tumor_sample.dna_ngs_library.name],
                             )
                             for f in filenames:
-                                if ".tmp." not in f:
+                                if ".tmp." not in f and not f.endswith(".done"):
                                     yield f.replace("work/", "output/")
 
     def check_config(self):
@@ -298,3 +237,6 @@ def check_config(self):
             ("static_data_config", "reference", "path"),
             "Path to reference FASTA file not configured but required",
         )
+        assert (
+            "sequenza" in self.w_config["step_config"]["somatic_targeted_seq_cnv_calling"]["tools"]
+        )
diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile
@@ -211,3 +211,69 @@ rule panel_of_normals_cnvkit_report:
         args=wf.substep_dispatch("cnvkit", "get_args", "report"),
     wrapper:
         wf.wrapper_path("cnvkit/report")
+
+
+# Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+# Install the annotation R package --------------------------
+
+
+rule panel_of_normals_purecn_install:
+    output:
+        **wf.get_output_files("purecn", "install"),
+    params:
+        container="docker://markusriester/purecn",
+    resources:
+        time=wf.get_resource("purecn", "install", "time"),
+        memory=wf.get_resource("purecn", "install", "memory"),
+        partition=wf.get_resource("purecn", "install", "partition"),
+    log:
+        **wf.get_log_file("purecn", "install"),
+    wrapper:
+        wf.wrapper_path("singularity")
+
+
+rule panel_of_normals_purecn_prepare:
+    input:
+        **wf.get_input_files("purecn", "prepare"),
+    output:
+        **wf.get_output_files("purecn", "prepare"),
+    resources:
+        time=wf.get_resource("purecn", "prepare", "time"),
+        memory=wf.get_resource("purecn", "prepare", "memory"),
+        partition=wf.get_resource("purecn", "prepare", "partition"),
+    log:
+        **wf.get_log_file("purecn", "prepare"),
+    wrapper:
+        wf.wrapper_path("purecn/prepare")
+
+
+rule panel_of_normals_purecn_coverage:
+    input:
+        unpack(wf.get_input_files("purecn", "coverage")),
+    output:
+        **wf.get_output_files("purecn", "coverage"),
+    resources:
+        time=wf.get_resource("purecn", "coverage", "time"),
+        memory=wf.get_resource("purecn", "coverage", "memory"),
+        partition=wf.get_resource("purecn", "coverage", "partition"),
+    log:
+        **wf.get_log_file("purecn", "coverage"),
+    wrapper:
+        wf.wrapper_path("purecn/coverage")
+
+
+rule panel_of_normals_purecn_create_panel:
+    input:
+        unpack(wf.get_input_files("purecn", "create_panel")),
+    output:
+        **wf.get_output_files("purecn", "create_panel"),
+    resources:
+        time=wf.get_resource("purecn", "create_panel", "time"),
+        memory=wf.get_resource("purecn", "create_panel", "memory"),
+        partition=wf.get_resource("purecn", "create_panel", "partition"),
+    log:
+        **wf.get_log_file("purecn", "create_panel"),
+    wrapper:
+        wf.wrapper_path("purecn/create_panel")