From c0a8c4293d3ff30991000ef06c737c5d97b17399 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 22 Aug 2023 10:02:01 +0200 Subject: [PATCH 1/4] feat: Added support for scarHRD --- snappy_pipeline/apps/snappy_snake.py | 4 +- .../Snakefile | 101 ++++++ .../__init__.py | 297 ++++++++++++++++++ .../wrappers/scarHRD/environment.yaml | 10 + .../scarHRD/gcreference/environment.yaml | 1 + .../wrappers/scarHRD/gcreference/wrapper.py | 49 +++ .../wrappers/scarHRD/install/environment.yaml | 1 + .../wrappers/scarHRD/install/wrapper.py | 48 +++ .../wrappers/scarHRD/run/environment.yaml | 1 + .../wrappers/scarHRD/run/wrapper.py | 81 +++++ ...ows_homologous_recombination_deficiency.py | 200 ++++++++++++ 11 files changed, 791 insertions(+), 2 deletions(-) create mode 100644 snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile create mode 100644 snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py create mode 100644 snappy_wrappers/wrappers/scarHRD/environment.yaml create mode 120000 snappy_wrappers/wrappers/scarHRD/gcreference/environment.yaml create mode 100644 snappy_wrappers/wrappers/scarHRD/gcreference/wrapper.py create mode 120000 snappy_wrappers/wrappers/scarHRD/install/environment.yaml create mode 100644 snappy_wrappers/wrappers/scarHRD/install/wrapper.py create mode 120000 snappy_wrappers/wrappers/scarHRD/run/environment.yaml create mode 100644 snappy_wrappers/wrappers/scarHRD/run/wrapper.py create mode 100644 tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index 6948b0682..d6ea062f3 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -25,13 +25,13 @@ helper_gcnv_model_targeted, helper_gcnv_model_wgs, hla_typing, + homologous_recombination_deficiency, igv_session_generation, ngs_data_qc, ngs_mapping, ngs_sanity_checking, panel_of_normals, repeat_expansion, - somatic_cnv_checking, somatic_gene_fusion_calling, somatic_hla_loh_calling, somatic_msi_calling, @@ -81,13 +81,13 @@ "helper_gcnv_model_targeted": helper_gcnv_model_targeted, "helper_gcnv_model_wgs": helper_gcnv_model_wgs, "hla_typing": hla_typing, + "homologous_recombination_deficiency": homologous_recombination_deficiency, "igv_session_generation": igv_session_generation, "ngs_mapping": ngs_mapping, "ngs_data_qc": ngs_data_qc, "panel_of_normals": panel_of_normals, "repeat_analysis": repeat_expansion, "ngs_sanity_checking": ngs_sanity_checking, - "somatic_cnv_checking": somatic_cnv_checking, "somatic_gene_fusion_calling": somatic_gene_fusion_calling, "somatic_hla_loh_calling": somatic_hla_loh_calling, "somatic_msi_calling": somatic_msi_calling, diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile b/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile new file mode 100644 index 000000000..e9f6d0b3b --- /dev/null +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline homologous_recombination_deficiency step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.homologous_recombination_deficiency import HomologousRecombinationDeficiencyWorkflow + +__author__ = "Eric Blanc" + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== + +wf = HomologousRecombinationDeficiencyWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + +# Rules ======================================================================= + + +localrules: + # Linking files from work/ to output/ should be done locally + homologous_recombination_deficiency_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Generic linking out --------------------------------------------------------- + + +rule homologous_recombination_deficiency_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +# Homologous Recombination Deficiency score ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Run scarHRD ----------------------------------------------------------------- + + +rule homologous_recombination_deficiency_scarHRD_install: + output: + **wf.get_output_files("scarHRD", "install"), + threads: wf.get_resource("scarHRD", "install", "threads") + resources: + time=wf.get_resource("scarHRD", "install", "time"), + memory=wf.get_resource("scarHRD", "install", "memory"), + partition=wf.get_resource("scarHRD", "install", "partition"), + tmpdir=wf.get_resource("scarHRD", "install", "tmpdir"), + log: + **wf.get_log_file("scarHRD", "install"), + wrapper: + wf.wrapper_path("scarHRD/install") + + +rule homologous_recombination_deficiency_scarHRD_gcreference: + output: + **wf.get_output_files("scarHRD", "gcreference"), + threads: wf.get_resource("scarHRD", "gcreference", "threads") + resources: + time=wf.get_resource("scarHRD", "gcreference", "time"), + memory=wf.get_resource("scarHRD", "gcreference", "memory"), + partition=wf.get_resource("scarHRD", "gcreference", "partition"), + tmpdir=wf.get_resource("scarHRD", "gcreference", "tmpdir"), + log: + **wf.get_log_file("scarHRD", "gcreference"), + wrapper: + wf.wrapper_path("scarHRD/gcreference") + + +rule homologous_recombination_deficiency_scarHRD_run: + input: + unpack(wf.get_input_files("scarHRD", "run")) + output: + **wf.get_output_files("scarHRD", "run"), + threads: wf.get_resource("scarHRD", "run", "threads") + resources: + time=wf.get_resource("scarHRD", "run", "time"), + memory=wf.get_resource("scarHRD", "run", "memory"), + partition=wf.get_resource("scarHRD", "run", "partition"), + tmpdir=wf.get_resource("scarHRD", "run", "tmpdir"), + log: + **wf.get_log_file("scarHRD", "run"), + wrapper: + wf.wrapper_path("scarHRD/run") diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py new file mode 100644 index 000000000..55c396049 --- /dev/null +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py @@ -0,0 +1,297 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``homologous_recombination_deficiency`` step + +This step allows for the computation of the scarHRD score, which is a composite of +loss-of-heterozygocity, large-scale transitions and telomeric imbalances. +The score is a proxy for holomogous recombination deficiencies scores produced from SNP arrays. +The software is described in `Sztupinszki et al.`, +but it is not part of CRAN, Bioconductor or bioconda (currently). +The implementation also relies on obsolete version of `sequenza` & +`copynumber`, which are not part of CRAN or Bioconductor anymore. +These software are downloaded directly from their github repo. + +========== +Step Input +========== + +``homologous_recombination_deficiency`` starts off the aligned reads, i.e. ``ngs_mapping``. +Both the normal & tumor samples are required to generate the score. + +=========== +Step Output +=========== + +Generally, the following links are generated to ``output/``. + +.. note:: Tool-Specific Output + + As the only integrated tool is scarHRD at the moment, the output is very tailored to the result + of this tool. In the future, this section might contain "common" output and tool-specific + output sub sections. + +- ``{mapper}.scarHRD.{lib_name}-{lib_pk}/out/`` + - ``{mapper}.scarHRD.{lib_name}-{lib_pk}.seqz.gz`` + - ``{mapper}.scarHRD.{lib_name}-{lib_pk}.json`` + +===================== +Default Configuration +===================== + +The default configuration is as follows. + +.. include:: DEFAULT_CONFIG_homologous_recombination_deficiency.rst + +===================================== +Available HRD tools +===================================== + +- ``scarHRD`` + +""" + +from collections import OrderedDict +import sys + +from biomedsheets.shortcuts import CancerCaseSheet, is_not_background +from snakemake.io import expand + +from snappy_pipeline.base import UnsupportedActionException +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import ( + BaseStep, + BaseStepPart, + LinkOutStepPart, + ResourceUsage, +) +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow + +__author__ = "Eric Blanc " + +#: Default configuration for the homologous recombination deficiency step +DEFAULT_CONFIG = r""" +# Default configuration homologous_recombination_deficiency +step_config: + homologous_recombination_deficiency: + tools: ['scarHRD'] # REQUIRED - available: 'mantis' + path_ngs_mapping: ../ngs_mapping # REQUIRED + scarHRD: + genome_name: "grch37" # Must be either "grch37", "grch38" or "mouse" + chr_prefix: False + length: 50 # Wiggle track for GC reference file +""" + + +class ScarHRDStepPart(BaseStepPart): + """Computes homologous recombination deficiency score with scarHRD""" + + #: Step name + name = "scarHRD" + + #: Class available actions + actions = ( + "install", + "gcreference", + "run", + ) + + def __init__(self, parent): + super().__init__(parent) + self.base_path_out = ( + "work/{{mapper}}.scarHRD.{{library_name}}/out/{{mapper}}.scarHRD.{{library_name}}{ext}" + ) + # Build shortcut from cancer bio sample name to matched cancer sample + self.tumor_ngs_library_to_sample_pair = OrderedDict() + for sheet in self.parent.shortcut_sheets: + self.tumor_ngs_library_to_sample_pair.update( + sheet.all_sample_pairs_by_tumor_dna_ngs_library + ) + + def get_normal_lib_name(self, wildcards): + """Return name of normal (non-cancer) library""" + pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name] + return pair.normal_sample.dna_ngs_library.name + + def get_input_files(self, action): + def input_function_run(wildcards): + """Helper wrapper function""" + # Get shorcut to Snakemake sub workflow + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + # Get names of primary libraries of the selected cancer bio sample and the + # corresponding primary normal sample + normal_base_path = ( + "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( + normal_library=self.get_normal_lib_name(wildcards), **wildcards + ) + ) + tumor_base_path = ( + "output/{mapper}.{library_name}/out/" "{mapper}.{library_name}" + ).format(**wildcards) + return { + "lib_path": "work/R_packages/out/.done", + "gc": "work/static_data/out/{genome_name}_{length}.wig.gz".format( + genome_name=self.config["scarHRD"]["genome_name"], + length=self.config["scarHRD"]["length"], + ), + "normal_bam": ngs_mapping(normal_base_path + ".bam"), + "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"), + "tumor_bam": ngs_mapping(tumor_base_path + ".bam"), + "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"), + } + + if action == "install": + return None + elif action == "gcreference": + return None + elif action == "run": + return input_function_run + else: + raise UnsupportedActionException( + "Action '{action}' is not supported. Valid options: {valid}".format( + action=action, valid=", ".join(self.actions) + ) + ) + + def get_output_files(self, action): + if action == "install": + return {"lib_path": "work/R_packages/out/.done"} + elif action == "gcreference": + return { + "gc": "work/static_data/out/{genome_name}_{length}.wig.gz".format( + genome_name=self.config["scarHRD"]["genome_name"], + length=self.config["scarHRD"]["length"], + ) + } + elif action == "run": + return { + "sequenza": "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}.seqz.gz", + "scarHRD": "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}.json", + } + else: + raise UnsupportedActionException( + "Action '{action}' is not supported. Valid options: {valid}".format( + action=action, valid=", ".join(self.actions) + ) + ) + + @dictify + def _get_log_file(self, action): + """Return dict of log files.""" + if action == "install": + prefix = "work/R_packages/log/R_packages" + elif action == "gcreference": + prefix = "work/static_data/log/{genome_name}_{length}".format( + genome_name=self.config["scarHRD"]["genome_name"], + length=self.config["scarHRD"]["length"], + ) + elif action == "run": + prefix = "work/{mapper}.scarHRD.{library_name}/log/{mapper}.scarHRD.{library_name}" + else: + raise UnsupportedActionException( + "Action '{action}' is not supported. Valid options: {valid}".format( + action=action, valid=", ".join(self.actions) + ) + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + def get_resource_usage(self, action): + """Get Resource Usage + + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + + :return: Returns ResourceUsage for step. + """ + if action == "install" or action == "gcreference": + return ResourceUsage( + threads=1, + time="02:00:00", # 2 hours + memory="4096M", + partition="short", + ) + elif action == "run": + return ResourceUsage( + threads=2, + time="48:00:00", # 2 hours + memory="32G", + ) + else: + raise UnsupportedActionException( + "Action '{action}' is not supported. Valid options: {valid}".format( + action=action, valid=", ".join(self.actions) + ) + ) + + +class HomologousRecombinationDeficiencyWorkflow(BaseStep): + """Compute Homologous Recombination Deficiency score""" + + #: Step name + name = "homologous_recombination_deficiency" + + #: Default biomed sheet class + sheet_shortcut_class = CancerCaseSheet + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + (NgsMappingWorkflow,), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes((ScarHRDStepPart, LinkOutStepPart)) + # Initialize sub-workflows + self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + + @listify + def get_result_files(self): + """Return list of result files for the somatic targeted sequencing CNV calling step""" + tool_actions = {"scarHRD": ("run",)} + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + for tool in self.config["tools"]: + for action in tool_actions[tool]: + try: + tpls = self.sub_steps[tool].get_output_files(action).values() + except AttributeError: + tpls = self.sub_steps[tool].get_output_files(action) + for tpl in tpls: + filenames = expand( + tpl, + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + library_name=[sample_pair.tumor_sample.dna_ngs_library.name], + ) + for f in filenames: + if ".tmp." not in f: + yield f.replace("work/", "output/") + + def check_config(self): + """Check that the necessary globalc onfiguration is present""" + self.ensure_w_config( + ("static_data_config", "reference", "path"), + "Path to reference FASTA file not configured but required", + ) diff --git a/snappy_wrappers/wrappers/scarHRD/environment.yaml b/snappy_wrappers/wrappers/scarHRD/environment.yaml new file mode 100644 index 000000000..b6a8c8b49 --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/environment.yaml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python =3.9 + - sequenza-utils + - r-sequenza + - r-devtools + - r-data.table + - samtools diff --git a/snappy_wrappers/wrappers/scarHRD/gcreference/environment.yaml b/snappy_wrappers/wrappers/scarHRD/gcreference/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/gcreference/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/scarHRD/gcreference/wrapper.py b/snappy_wrappers/wrappers/scarHRD/gcreference/wrapper.py new file mode 100644 index 000000000..1b9a2c4b0 --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/gcreference/wrapper.py @@ -0,0 +1,49 @@ +"""CUBI+Snakemake wrapper code for scarHRD (sequenza GC reference file) +""" + +import os + +from snakemake import shell + +__author__ = "Eric Blanc " + +step = snakemake.config["pipeline_step"]["name"] +genome = snakemake.config["static_data_config"]["reference"]["path"] +length = snakemake.config["step_config"][step]["scarHRD"]["length"] + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +sequenza-utils gc_wiggle --fasta {genome} -w {length} -o {snakemake.output} + +pushd $(dirname {snakemake.output}) +md5sum $(basename {snakemake.output}) > $(basename {snakemake.output}).md5 +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/scarHRD/install/environment.yaml b/snappy_wrappers/wrappers/scarHRD/install/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/install/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/scarHRD/install/wrapper.py b/snappy_wrappers/wrappers/scarHRD/install/wrapper.py new file mode 100644 index 000000000..a9a9c15e0 --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/install/wrapper.py @@ -0,0 +1,48 @@ +"""CUBI+Snakemake wrapper code for scarHRD (non-conda package installation) +""" + +import os + +from snakemake import shell + +__author__ = "Eric Blanc " + +lib_path = os.path.dirname(snakemake.output.lib_path) + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +R --vanilla --slave << __EOF +devtools::install_github("aroneklund/copynumber", lib="{lib_path}", upgrade="never") +devtools::install_github("sztup/scarHRD", lib="{lib_path}", upgrade="never") +__EOF +touch {snakemake.output} +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/scarHRD/run/environment.yaml b/snappy_wrappers/wrappers/scarHRD/run/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/run/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/scarHRD/run/wrapper.py b/snappy_wrappers/wrappers/scarHRD/run/wrapper.py new file mode 100644 index 000000000..4f21ecfee --- /dev/null +++ b/snappy_wrappers/wrappers/scarHRD/run/wrapper.py @@ -0,0 +1,81 @@ +"""CUBI+Snakemake wrapper code for scarHRD (non-conda package installation) +""" + +import os + +from snakemake import shell + +__author__ = "Eric Blanc " + +lib_path = os.path.dirname(snakemake.input.lib_path) + +step = snakemake.config["pipeline_step"]["name"] +genome = snakemake.config["static_data_config"]["reference"]["path"] +length = snakemake.config["step_config"][step]["scarHRD"]["length"] +genome_name = snakemake.config["step_config"][step]["scarHRD"]["genome_name"] + +chr_in_name = "TRUE" if snakemake.config["step_config"][step]["scarHRD"]["chr_prefix"] else "FALSE" +prefix = "chr" if snakemake.config["step_config"][step]["scarHRD"]["chr_prefix"] else "" +if genome_name == "grch37" or genome_name == "grch38": + chromosomes = " ".join([prefix + str(x) for x in list(range(1, 25)) + ["X"]]) +elif genome_name == "mouse": + chromosomes = " ".join([prefix + str(x) for x in list(range(1, 23)) + ["X"]]) +else: + raise + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +sequenza-utils bam2seqz \ + -gc {snakemake.input.gc} --fasta {genome} \ + -n {snakemake.input.normal_bam} --tumor {snakemake.input.tumor_bam} \ + -C {chromosomes} \ + | sequenza-utils seqz_binning -w {length} -s - \ + | gzip > {snakemake.output.sequenza} + +cat << __EOF | R --vanilla --slave +.libPaths(c("{lib_path}", .libPaths())) +Sys.setenv(VROOM_CONNECTION_SIZE=2000000000) +library("scarHRD") + +tbl <- scar_score("{snakemake.output.sequenza}", reference="{genome_name}", seqz=TRUE, chr.in.name={chr_in_name}) +cat('{{\n', file="{snakemake.output.scarHRD}") +cat(' "HRD": ', tbl[1,1], ',\n', sep="", file="{snakemake.output.scarHRD}", append=TRUE) +cat(' "Telomeric AI": ', tbl[1,2], ',\n', sep="", file="{snakemake.output.scarHRD}", append=TRUE) +cat(' "LST": ', tbl[1,3], ',\n', sep="", file="{snakemake.output.scarHRD}", append=TRUE) +cat(' "HRD-sum": ', tbl[1,4], '\n', sep="", file="{snakemake.output.scarHRD}", append=TRUE) +cat('}}\n', file="{snakemake.output.scarHRD}", append=TRUE) + +__EOF + +pushd $(dirname {snakemake.output.sequenza}) ; f=$(basename {snakemake.output.sequenza}) ; md5sum $f > $f.md5 ; popd +pushd $(dirname {snakemake.output.scarHRD}) ; f=$(basename {snakemake.output.scarHRD}) ; md5sum $f > $f.md5 ; popd +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py b/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py new file mode 100644 index 000000000..3183a63e7 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py @@ -0,0 +1,200 @@ +# -*- coding: utf-8 -*- +"""Tests for the homologous_recombination_deficiency module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.homologous_recombination_deficiency import ( + HomologousRecombinationDeficiencyWorkflow, +) + +from .common import get_expected_log_files_dict +from .conftest import patch_module_fs + +__author__ = "Eric Blanc" + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + bwa: + path_index: /path/to/bwa/index.fasta + homologous_recombination_deficiency: + tools: ['scarHRD'] + path_ngs_mapping: ../ngs_mapping # REQUIRED + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def homologous_recombination_deficiency_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return HomologousRecombinationDeficiencyWorkflow object pre-configured with cancer sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} + # Construct the workflow object + return HomologousRecombinationDeficiencyWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for ScarHRDStepPart ------------------------------------------------------------------ + + +def test_scarHRD_step_part_get_input_files_run(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_input_files() - run""" + wildcards = Wildcards(fromdict={"library_name": "P001-T1-DNA1-WGS1", "mapper": "bwa"}) + expected = { + "lib_path": "work/R_packages/out/.done", + "gc": "work/static_data/out/grch37_50.wig.gz", + "normal_bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "normal_bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "tumor_bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", + "tumor_bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + } + actual = homologous_recombination_deficiency_workflow.get_input_files("scarHRD", "run")( + wildcards + ) + assert actual == expected + + +def test_scarHRD_step_part_get_output_files_run(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_output_files() - run""" + # Define expected + base_name_out = "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}" + expected = { + "sequenza": base_name_out + ".seqz.gz", + "scarHRD": base_name_out + ".json", + } + # Get actual + actual = homologous_recombination_deficiency_workflow.get_output_files("scarHRD", "run") + assert actual == expected + + +def test_scarHRD_step_part_get_log_file_run(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_log_file() - run""" + base_name = "work/{mapper}.scarHRD.{library_name}/log/{mapper}.scarHRD.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name) + actual = homologous_recombination_deficiency_workflow.get_log_file("scarHRD", "run") + assert actual == expected + + +def test_scarHRD_step_part_get_resource_usage_run(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_resource() - run""" + # Define expected + expected_dict = {"threads": 2, "time": "48:00:00", "memory": "32G", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = homologous_recombination_deficiency_workflow.get_resource( + "scarHRD", "run", resource + ) + assert actual == expected, msg_error + + +def test_scarHRD_step_part_get_output_files_install(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_output_files() - install""" + # Define expected + expected = {"lib_path": "work/R_packages/out/.done"} + # Get actual + actual = homologous_recombination_deficiency_workflow.get_output_files("scarHRD", "install") + assert actual == expected + + +def test_scarHRD_step_part_get_log_file_install(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_log_file() - install""" + base_name = "work/R_packages/log/R_packages" + expected = get_expected_log_files_dict(base_out=base_name) + actual = homologous_recombination_deficiency_workflow.get_log_file("scarHRD", "install") + assert actual == expected + + +def test_scarHRD_step_part_get_resource_usage_install(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_resource() - install""" + # Define expected + expected_dict = {"threads": 1, "time": "02:00:00", "memory": "4096M", "partition": "short"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = homologous_recombination_deficiency_workflow.get_resource( + "scarHRD", "install", resource + ) + assert actual == expected, msg_error + + +def test_scarHRD_step_part_get_output_files_gcreference( + homologous_recombination_deficiency_workflow, +): + """Tests ScarHRDStepPart.get_output_files() - gcreference""" + # Define expected + expected = {"gc": "work/static_data/out/grch37_50.wig.gz"} + # Get actual + actual = homologous_recombination_deficiency_workflow.get_output_files("scarHRD", "gcreference") + assert actual == expected + + +def test_scarHRD_step_part_get_log_file_gcreference(homologous_recombination_deficiency_workflow): + """Tests ScarHRDStepPart.get_log_file() - gcreference""" + base_name = "work/static_data/log/grch37_50" + expected = get_expected_log_files_dict(base_out=base_name) + actual = homologous_recombination_deficiency_workflow.get_log_file("scarHRD", "gcreference") + assert actual == expected + + +# Tests for SomaticMsiCallingWorkflow -------------------------------------------------------------- + + +def test_homologous_recombination_deficiency_workflow(homologous_recombination_deficiency_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["link_out", "scarHRD"] + assert list(sorted(homologous_recombination_deficiency_workflow.sub_steps.keys())) == expected + # Check result file construction + expected = [ + "output/bwa.scarHRD.P001-T1-DNA1-WGS1/out/bwa.scarHRD.P001-T1-DNA1-WGS1.json", + "output/bwa.scarHRD.P001-T1-DNA1-WGS1/out/bwa.scarHRD.P001-T1-DNA1-WGS1.seqz.gz", + "output/bwa.scarHRD.P002-T1-DNA1-WGS1/out/bwa.scarHRD.P002-T1-DNA1-WGS1.json", + "output/bwa.scarHRD.P002-T1-DNA1-WGS1/out/bwa.scarHRD.P002-T1-DNA1-WGS1.seqz.gz", + "output/bwa.scarHRD.P002-T2-DNA1-WGS1/out/bwa.scarHRD.P002-T2-DNA1-WGS1.json", + "output/bwa.scarHRD.P002-T2-DNA1-WGS1/out/bwa.scarHRD.P002-T2-DNA1-WGS1.seqz.gz", + ] + actual = set(homologous_recombination_deficiency_workflow.get_result_files()) + expected = set(expected) + assert actual == expected From 38f80331f72f3b8cf3b8e4c95bf2e03eb9bd1ca4 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 22 Aug 2023 10:33:42 +0200 Subject: [PATCH 2/4] style: fixed HRD Snakefile to follow linting rules --- .../homologous_recombination_deficiency/Snakefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile b/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile index e9f6d0b3b..92f92482d 100644 --- a/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.homologous_recombination_deficiency import HomologousRecombinationDeficiencyWorkflow +from snappy_pipeline.workflows.homologous_recombination_deficiency import ( + HomologousRecombinationDeficiencyWorkflow, +) __author__ = "Eric Blanc" @@ -20,7 +22,9 @@ config, lookup_paths, config_paths = expand_ref("config.yaml", config) # WorkflowImpl Object Setup =================================================== -wf = HomologousRecombinationDeficiencyWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) +wf = HomologousRecombinationDeficiencyWorkflow( + workflow, config, lookup_paths, config_paths, os.getcwd() +) # Rules ======================================================================= @@ -86,7 +90,7 @@ rule homologous_recombination_deficiency_scarHRD_gcreference: rule homologous_recombination_deficiency_scarHRD_run: input: - unpack(wf.get_input_files("scarHRD", "run")) + unpack(wf.get_input_files("scarHRD", "run")), output: **wf.get_output_files("scarHRD", "run"), threads: wf.get_resource("scarHRD", "run", "threads") From 07757149ee3a664d989d82b10235b6ada6b5738f Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 22 Aug 2023 13:27:33 +0200 Subject: [PATCH 3/4] fix: fix chromosome choice (https://groups.google.com/g/sequenza-user-group/c/Z_Yug64b6c4), typos & doc --- .../homologous_recombination_deficiency/__init__.py | 11 +++++++---- snappy_wrappers/wrappers/scarHRD/run/wrapper.py | 6 +++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py index 55c396049..6cdf57f61 100644 --- a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py @@ -6,9 +6,12 @@ The score is a proxy for holomogous recombination deficiencies scores produced from SNP arrays. The software is described in `Sztupinszki et al.`, but it is not part of CRAN, Bioconductor or bioconda (currently). -The implementation also relies on obsolete version of `sequenza` & -`copynumber`, which are not part of CRAN or Bioconductor anymore. -These software are downloaded directly from their github repo. +The implementation also relies on versions of `sequenza` & +`copynumber` which are not part of CRAN or Bioconductor anymore. +The most recent version of sequanza is downloaded from anaconda (for the R scripts), +and from bioconda for the python utilities. Note that anaconda lists the r-sequenza package in the +bioconda directory, while this package is not found when searching `bioconda`. +copynumber is obtained from a fork of the official Bioconductor deprecated package. ========== Step Input @@ -268,7 +271,7 @@ def get_result_files(self): or not sample_pair.normal_sample.dna_ngs_library ): msg = ( - "INFO: sample pair for cancer bio sample {} has is missing primary" + "INFO: sample pair for cancer bio sample {} is missing primary" "normal or primary cancer NGS library" ) print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) diff --git a/snappy_wrappers/wrappers/scarHRD/run/wrapper.py b/snappy_wrappers/wrappers/scarHRD/run/wrapper.py index 4f21ecfee..6e414eade 100644 --- a/snappy_wrappers/wrappers/scarHRD/run/wrapper.py +++ b/snappy_wrappers/wrappers/scarHRD/run/wrapper.py @@ -17,11 +17,11 @@ chr_in_name = "TRUE" if snakemake.config["step_config"][step]["scarHRD"]["chr_prefix"] else "FALSE" prefix = "chr" if snakemake.config["step_config"][step]["scarHRD"]["chr_prefix"] else "" if genome_name == "grch37" or genome_name == "grch38": - chromosomes = " ".join([prefix + str(x) for x in list(range(1, 25)) + ["X"]]) + chromosomes = " ".join([prefix + str(x) for x in list(range(1, 23)) + ["X", "Y"]]) elif genome_name == "mouse": - chromosomes = " ".join([prefix + str(x) for x in list(range(1, 23)) + ["X"]]) + chromosomes = " ".join([prefix + str(x) for x in list(range(1, 21)) + ["X", "Y"]]) else: - raise + raise Exception("Invalid configuration") shell.executable("/bin/bash") From 0d08a09a1bde52add5438ec6438de94038b8a786 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 22 Aug 2023 13:37:37 +0200 Subject: [PATCH 4/4] style: fix trailing whitespace making flake8 unhappy --- .../workflows/homologous_recombination_deficiency/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py index 6cdf57f61..bae0d34c6 100644 --- a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py @@ -9,7 +9,7 @@ The implementation also relies on versions of `sequenza` & `copynumber` which are not part of CRAN or Bioconductor anymore. The most recent version of sequanza is downloaded from anaconda (for the R scripts), -and from bioconda for the python utilities. Note that anaconda lists the r-sequenza package in the +and from bioconda for the python utilities. Note that anaconda lists the r-sequenza package in the bioconda directory, while this package is not found when searching `bioconda`. copynumber is obtained from a fork of the official Bioconductor deprecated package.