From 4fd0c73c4bd75bf4aa5ef9a464713a2c5d4fada1 Mon Sep 17 00:00:00 2001 From: CuongPham <50145360+giacuong171@users.noreply.github.com> Date: Wed, 30 Aug 2023 22:49:21 +0700 Subject: [PATCH] feat: add missense TMB calculation (#432) Co-authored-by: giacuong171 --- .../somatic_variant_annotation/__init__.py | 4 +- .../tumor_mutational_burden/Snakefile | 2 + .../tumor_mutational_burden/__init__.py | 184 ++++++++++---- .../wrappers/bcftools/TMB/wrapper.py | 62 ++++- .../wrappers/vep/run/environment.yaml | 3 - .../workflows/test_tumor_mutational_burden.py | 239 ++++++++++++++++-- 6 files changed, 405 insertions(+), 89 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py index 73181a994..cce189e2f 100644 --- a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py @@ -75,7 +75,7 @@ EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") #: Names of the annotator tools -TOOLS = ("jannovar", "vep") +ANNOTATION_TOOLS = ("jannovar", "vep") #: Default configuration for the somatic_variant_calling step DEFAULT_CONFIG = r""" @@ -336,7 +336,7 @@ def get_result_files(self): annotators = list( map( lambda x: x.replace("jannovar", "jannovar_annotate_somatic_vcf"), - set(self.config["tools"]) & set(TOOLS), + set(self.config["tools"]) & set(ANNOTATION_TOOLS), ) ) callers = set(self.config["tools_somatic_variant_calling"]) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile index 27da7d775..4ce18d68d 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile +++ b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile @@ -59,6 +59,8 @@ rule tumor_mutational_burden_calculation: memory=wf.get_resource("tmb_gathering", "run", "memory"), partition=wf.get_resource("tmb_gathering", "run", "partition"), tmpdir=wf.get_resource("tmb_gathering", "run", "tmpdir"), + params: + **{"args": wf.get_params("tmb_gathering", "run")}, log: **wf.get_log_file("tmb_gathering", "run"), wrapper: diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py index c7176d8fa..0880ba37f 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py +++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py @@ -5,10 +5,13 @@ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand -from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.somatic_variant_annotation import ( + ANNOTATION_TOOLS, + SomaticVariantAnnotationWorkflow, +) from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow, @@ -24,10 +27,13 @@ DEFAULT_CONFIG = r""" step_config: tumor_mutational_burden: - path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED + has_annotation: 'TRUE' # REQUIRED + path_somatic_variant: ../somatic_variant_annotation # REQUIRED tools_ngs_mapping: [] # default to those configured for ngs_mapping tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling + tools_somatic_variant_annotation: [] # default to those configured for somatic_variant_annotation target_regions: # REQUIRED + missense_regex: '.*[\|&]missense_variant[\|&].*' #change if the annotation tool doesn't use 'missense_variant' to indicate missense variant """ @@ -56,23 +62,41 @@ def __init__(self, parent): @dictify def get_input_files(self, action): self._validate_action(action) - tpl = ( - "output/{mapper}.{var_caller}.{tumor_library}/out/" - "{mapper}.{var_caller}.{tumor_library}" - ) + # Adding part for runnng with annotation file instead of with variant calling file + if self.w_config["step_config"]["tumor_mutational_burden"]["has_annotation"] == "TRUE": + tpl = ( + "output/{mapper}.{var_caller}.{anno_tool}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_tool}.{tumor_library}" + ) + else: + tpl = ( + "output/{mapper}.{var_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{tumor_library}" + ) + key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} - variant_calling = self.parent.sub_workflows["somatic_variant_calling"] # read + # Adding part for runnng with annotation file instead of with variant calling file + if self.w_config["step_config"]["tumor_mutational_burden"]["has_annotation"] == "TRUE": + variant_path = self.parent.sub_workflows["somatic_variant_annotation"] + else: + variant_path = self.parent.sub_workflows["somatic_variant_calling"] for key, ext in key_ext.items(): - yield key, variant_calling(tpl + ext) + yield key, variant_path(tpl + ext) @dictify def get_output_files(self, action): # Validate action self._validate_action(action) - prefix = ( - "work/{mapper}.{var_caller}.tmb.{tumor_library}/out/" - "{mapper}.{var_caller}.tmb.{tumor_library}" - ) + if self.w_config["step_config"]["tumor_mutational_burden"]["has_annotation"] == "TRUE": + prefix = ( + "work/{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}" + ) + else: + prefix = ( + "work/{mapper}.{var_caller}.tmb.{tumor_library}/out/" + "{mapper}.{var_caller}.tmb.{tumor_library}" + ) key_ext = {"json": ".json"} for key, ext in key_ext.items(): yield key, prefix + ext @@ -80,11 +104,17 @@ def get_output_files(self, action): @dictify def _get_log_file(self, action): - assert action == "run" - prefix = ( - "work/{mapper}.{var_caller}.tmb.{tumor_library}/log/" - "{mapper}.{var_caller}.tmb.{tumor_library}" - ) + self._validate_action(action) + if self.w_config["step_config"]["tumor_mutational_burden"]["has_annotation"] == "TRUE": + prefix = ( + "work/{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}" + ) + else: + prefix = ( + "work/{mapper}.{var_caller}.tmb.{tumor_library}/log/" + "{mapper}.{var_caller}.tmb.{tumor_library}" + ) key_ext = ( ("log", ".log"), @@ -95,16 +125,7 @@ def _get_log_file(self, action): yield key, prefix + ext def get_resource_usage(self, action): - """Get Resource Usage - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - :return: Returns ResourceUsage for step. - :raises UnsupportedActionException: if action not in class defined list of valid actions. - """ - if action not in self.actions: - actions_str = ", ".join(self.actions) - error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" - raise UnsupportedActionException(error_message) + self._validate_action(action) mem_mb = 4 * 1024 # 4GB return ResourceUsage( threads=2, @@ -112,6 +133,15 @@ def get_resource_usage(self, action): memory=f"{mem_mb}M", ) + def get_params(self, action): + self._validate_action(action) + return getattr(self, "_get_params_run") + + def _get_params_run(self, wildcards): + return { + "missense_re": self.w_config["step_config"]["tumor_mutational_burden"]["missense_regex"] + } + class TumorMutationalBurdenCalculationWorkflow(BaseStep): """Perform TMB calculation""" @@ -134,15 +164,27 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + (SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((TumorMutationalBurdenCalculationStepPart, LinkOutStepPart)) # Register sub workflows - self.register_sub_workflow( - "somatic_variant_calling", - self.w_config["step_config"]["tumor_mutational_burden"]["path_somatic_variant_calling"], - ) + if self.w_config["step_config"]["tumor_mutational_burden"]["has_annotation"] == "TRUE": + self.register_sub_workflow( + "somatic_variant_annotation", + self.w_config["step_config"]["tumor_mutational_burden"]["path_somatic_variant"], + ) + if not self.w_config["step_config"]["tumor_mutational_burden"][ + "tools_somatic_variant_annotation" + ]: + self.w_config["step_config"]["tumor_mutational_burden"][ + "tools_somatic_variant_annotation" + ] = self.w_config["step_config"]["somatic_variant_annotation"]["tools"] + else: + self.register_sub_workflow( + "somatic_variant_calling", + self.w_config["step_config"]["tumor_mutational_burden"]["path_somatic_variant"], + ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here if not self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"]: self.w_config["step_config"]["tumor_mutational_burden"][ @@ -160,26 +202,55 @@ def get_result_files(self): callers = set( self.w_config["step_config"]["tumor_mutational_burden"]["tools_somatic_variant_calling"] ) - name_pattern = "{mapper}.{caller}.tmb.{tumor_library.name}" - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"], - caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), - ext=EXT_VALUES, - ) - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"], - caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), - ext=( - ".log", - ".log.md5", - ".conda_info.txt", - ".conda_info.txt.md5", - ".conda_list.txt", - ".conda_list.txt.md5", - ), - ) + if self.w_config["step_config"]["tumor_mutational_burden"]["has_annotation"] == "TRUE": + anno_callers = set( + self.w_config["step_config"]["tumor_mutational_burden"][ + "tools_somatic_variant_annotation" + ] + ) + name_pattern = "{mapper}.{caller}.{anno_caller}.tmb.{tumor_library.name}" + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + anno_caller=anno_callers & set(ANNOTATION_TOOLS), + ext=EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + anno_caller=anno_callers & set(ANNOTATION_TOOLS), + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + else: + name_pattern = "{mapper}.{caller}.tmb.{tumor_library.name}" + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["tumor_mutational_burden"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) def _yield_result_files_matched(self, tpl, **kwargs): """Build output paths from path template and extension list. @@ -208,8 +279,8 @@ def _yield_result_files_matched(self, tpl, **kwargs): def check_config(self): """Check that the path to the NGS mapping is present""" self.ensure_w_config( - ("step_config", "tumor_mutational_burden", "path_somatic_variant_calling"), - "Path to variant calling not configured but required for tmb calculation", + ("step_config", "tumor_mutational_burden", "path_somatic_variant"), + "Path to variant (directory of vcf files) not configured but required for tmb calculation", ) self.ensure_w_config( @@ -217,3 +288,8 @@ def check_config(self): "Path to target_regions file (bed format)" "not configured but required for tmb calculation", ) + + self.ensure_w_config( + ("step_config", "tumor_mutational_burden", "has_annotation"), + "TMB needs to know wether the vcf is annotated or not", + ) diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index 6bd7a88ba..f4ab3db0b 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -7,6 +7,12 @@ __author__ = "Pham Gia Cuong" __email__ = "pham.gia-cuong@bih-charite.de" +missense_re = ( + snakemake.params.args["missense_re"] + if "args" in snakemake.params.keys() and "missense_re" in snakemake.params.args.keys() + else "" +) + shell( r""" # ----------------------------------------------------------------------------- @@ -33,23 +39,53 @@ number_indels=$(bcftools view -R $bed_file -v indels --threads 2 -H {snakemake.input.vcf}| wc -l) number_variants=$(bcftools view -R $bed_file --threads 2 -H {snakemake.input.vcf}| wc -l) +if [[ -n "{missense_re}" ]] +then + number_missense_variants=$(bcftools view -R $bed_file --threads 2 -H {snakemake.input.vcf}| grep -E '{missense_re}' | wc -l) +fi + TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l ` -cat << EOF > {snakemake.output.json} -{ -"Library_name": {snakemake.wildcards.tumor_library}, -"VCF_file": $name_vcf, -"VCF_md5": $vcf_md5, -"BED_file": $bed_file_name, -"BED_md5": $bed_md5, -"TMB": $TMB, -"Number_variants": $number_variants, -"Number_snvs": $number_snvs, -"Number_indels": $number_indels, -"Total_regions_length": $total_exom_length +missense_TMB=`echo "1000000*($number_missense_variants/$total_exom_length)" | bc -l ` +if [[ {snakemake.config[step_config][tumor_mutational_burden][has_annotation]} == "TRUE" ]] +then + out_file=$(cat << EOF + {{ + "Library_name": {snakemake.wildcards.tumor_library}, + "VCF_file": $name_vcf, + "VCF_md5": $vcf_md5, + "BED_file": $bed_file_name, + "BED_md5": $bed_md5, + "TMB": $TMB, + "missense_TMB": $missense_TMB, + "Number_variants": $number_variants, + "Number_snvs": $number_snvs, + "Number_indels": $number_indels, + "Total_regions_length": $total_exom_length + }} EOF + ) + echo $out_file > {snakemake.output.json} +else + out_file=$(cat << EOF + {{ + "Library_name": {snakemake.wildcards.tumor_library}, + "VCF_file": $name_vcf, + "VCF_md5": $vcf_md5, + "BED_file": $bed_file_name, + "BED_md5": $bed_md5, + "TMB": $TMB, + "Number_variants": $number_variants, + "Number_snvs": $number_snvs, + "Number_indels": $number_indels, + "Total_regions_length": $total_exom_length + }} +EOF + ) + echo $out_file > {snakemake.output.json} +fi + pushd $(dirname {snakemake.output.json}) md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5}) -} """ ) diff --git a/snappy_wrappers/wrappers/vep/run/environment.yaml b/snappy_wrappers/wrappers/vep/run/environment.yaml index 48f171edf..711e58eec 100644 --- a/snappy_wrappers/wrappers/vep/run/environment.yaml +++ b/snappy_wrappers/wrappers/vep/run/environment.yaml @@ -1,8 +1,5 @@ channels: - - conda-forge - bioconda dependencies: - - python - ensembl-vep=102 - - htslib diff --git a/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py b/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py index e338ac0a6..e9aa46b22 100644 --- a/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py +++ b/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py @@ -14,8 +14,9 @@ from .conftest import patch_module_fs +# Test tumor mutational burden calculation with vcf file from somatic variant calling step @pytest.fixture(scope="module") # otherwise: performance issues -def minimal_config(): +def minimal_config_calling(): """Return YAML parsing result for configuration""" yaml = ruamel_yaml.YAML() return yaml.load( @@ -46,8 +47,9 @@ def minimal_config(): path_target_regions: /path/to/target/regions.bed tumor_mutational_burden: - path_somatic_variant_calling: ../somatic_variant_calling + path_somatic_variant: ../somatic_variant_calling tools_ngs_mapping: [] + has_annotation: 'FALSE' # REQUIRED tools_somatic_variant_calling: [] target_regions: /path/to/regions.bed @@ -65,9 +67,9 @@ def minimal_config(): @pytest.fixture -def tumor_mutational_burden_workflow( +def tumor_mutational_burden_workflow_calling( dummy_workflow, - minimal_config, + minimal_config_calling, config_lookup_paths, work_dir, config_paths, @@ -86,7 +88,7 @@ def tumor_mutational_burden_workflow( # Construct the workflow object return TumorMutationalBurdenCalculationWorkflow( dummy_workflow, - minimal_config, + minimal_config_calling, config_lookup_paths, config_paths, work_dir, @@ -96,7 +98,9 @@ def tumor_mutational_burden_workflow( # Tests for TumorMutationalBurdenCalculationStepPart ----------------------------------------------------- -def test_tumor_mutational_step_part_get_input_files(tumor_mutational_burden_workflow): +def test_tumor_mutational_step_part_get_input_files_calling( + tumor_mutational_burden_workflow_calling, +): """Test TumorMutationalBurdenCalculationStepPart.get_input_files()""" base_out = ( "SOMATIC_VARIANT_CALLING/output/{mapper}.{var_caller}.{tumor_library}/out/" @@ -106,34 +110,36 @@ def test_tumor_mutational_step_part_get_input_files(tumor_mutational_burden_work "vcf": base_out + ".vcf.gz", "vcf_tbi": base_out + ".vcf.gz.tbi", } - actual = tumor_mutational_burden_workflow.get_input_files("tmb_gathering", "run") + actual = tumor_mutational_burden_workflow_calling.get_input_files("tmb_gathering", "run") assert actual == expected -def test_tumor_mutational_step_part_get_output_files(tumor_mutational_burden_workflow): +def test_tumor_mutational_step_part_get_output_files_calling( + tumor_mutational_burden_workflow_calling, +): """Tests TumorMutationalBurdenCalculationStepPart.get_output_files()""" base_out = ( "work/{mapper}.{var_caller}.tmb.{tumor_library}/out/" "{mapper}.{var_caller}.tmb.{tumor_library}" ) expected = get_expected_output_json_files_dict(base_out=base_out) - actual = tumor_mutational_burden_workflow.get_output_files("tmb_gathering", "run") + actual = tumor_mutational_burden_workflow_calling.get_output_files("tmb_gathering", "run") assert actual == expected -def test_tumor_mutational_step_part_get_log_files(tumor_mutational_burden_workflow): +def test_tumor_mutational_step_part_get_log_files_calling(tumor_mutational_burden_workflow_calling): """Tests TumorMutationalBurdenCalculationStepPart.get_log_files()""" base_out = ( "work/{mapper}.{var_caller}.tmb.{tumor_library}/log/" "{mapper}.{var_caller}.tmb.{tumor_library}" ) expected = get_expected_log_files_dict(base_out=base_out) - actual = tumor_mutational_burden_workflow.get_log_file("tmb_gathering", "run") + actual = tumor_mutational_burden_workflow_calling.get_log_file("tmb_gathering", "run") assert actual == expected -def test_tumor_mutational_step_part_get_resource_usage( - tumor_mutational_burden_workflow, +def test_tumor_mutational_step_part_get_resource_usage_calling( + tumor_mutational_burden_workflow_calling, ): """Tests TumorMutationalBurdenCalculationStepPart.get_resource_usage()""" # Define expected @@ -141,18 +147,20 @@ def test_tumor_mutational_step_part_get_resource_usage( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = tumor_mutational_burden_workflow.get_resource("tmb_gathering", "run", resource) + actual = tumor_mutational_burden_workflow_calling.get_resource( + "tmb_gathering", "run", resource + ) assert actual == expected, msg_error # Tests for TumorMutationalBurdenCalculationWorkflow ------------------------------------------------------- -def test_tumor_mutational_burden_workflow(tumor_mutational_burden_workflow): +def test_tumor_mutational_burden_workflow_calling(tumor_mutational_burden_workflow_calling): """Test simple functionality of the workflow""" # Check created sub steps expected = ["link_out", "tmb_gathering"] - actual = list(sorted(tumor_mutational_burden_workflow.sub_steps.keys())) + actual = list(sorted(tumor_mutational_burden_workflow_calling.sub_steps.keys())) assert actual == expected # Check result file construction @@ -182,5 +190,202 @@ def test_tumor_mutational_burden_workflow(tumor_mutational_burden_workflow): for var_caller in ("mutect2", "scalpel") ] expected = list(sorted(expected)) - actual = list(sorted(tumor_mutational_burden_workflow.get_result_files())) + actual = list(sorted(tumor_mutational_burden_workflow_calling.get_result_files())) + assert expected == actual + + +# Test Tumor mutatinal burden calculation with vcf file from somatic variant annotation step +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config_annotation(): + """Return YAML parsing result for configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + compute_coverage_bed: true + path_target_regions: /path/to/regions.bed + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: + - mutect2 + - scalpel + scalpel: + path_target_regions: /path/to/target/regions.bed + + somatic_variant_annotation: + tools: ["jannovar", "vep"] + jannovar: + path_jannovar_ser: /path/to/jannover.ser + vep: + path_dir_cache: /path/to/dir/cache + + tumor_mutational_burden: + path_somatic_variant: ../somatic_variant_annotation # REQUIRED + has_annotation: 'TRUE' # REQUIRED + tools_ngs_mapping: [] # default to those configured for ngs_mapping + tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling + tools_somatic_variant_annotation: [] # default to those configured for somatic_variant_annotation + target_regions: /path/to/regions.bed # REQUIRED + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def tumor_mutational_burden_workflow_annotation( + dummy_workflow, + minimal_config_annotation, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return TumorMutationalBurdenCalculationWorkflow object pre-configured with cancer sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_calling": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, + } + # Construct the workflow object + return TumorMutationalBurdenCalculationWorkflow( + dummy_workflow, + minimal_config_annotation, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for TumorMutationalBurdenCalculationStepPart ----------------------------------------------------- + + +def test_tumor_mutational_step_part_get_input_files_annotation( + tumor_mutational_burden_workflow_annotation, +): + """Test TumorMutationalBurdenCalculationStepPart.get_input_files()""" + base_out = ( + "SOMATIC_VARIANT_ANNOTATION/output/{mapper}.{var_caller}.{anno_tool}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_tool}.{tumor_library}" + ) + expected = { + "vcf": base_out + ".vcf.gz", + "vcf_tbi": base_out + ".vcf.gz.tbi", + } + actual = tumor_mutational_burden_workflow_annotation.get_input_files("tmb_gathering", "run") + assert actual == expected + + +def test_tumor_mutational_step_part_get_output_files_annotation( + tumor_mutational_burden_workflow_annotation, +): + """Tests TumorMutationalBurdenCalculationStepPart.get_output_files()""" + base_out = ( + "work/{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}" + ) + expected = get_expected_output_json_files_dict(base_out=base_out) + actual = tumor_mutational_burden_workflow_annotation.get_output_files("tmb_gathering", "run") + assert actual == expected + + +def test_tumor_mutational_step_part_get_log_files_annotation( + tumor_mutational_burden_workflow_annotation, +): + """Tests TumorMutationalBurdenCalculationStepPart.get_log_files()""" + base_out = ( + "work/{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_tool}.tmb.{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = tumor_mutational_burden_workflow_annotation.get_log_file("tmb_gathering", "run") + assert actual == expected + + +def test_tumor_mutational_step_part_get_resource_usage_annotation( + tumor_mutational_burden_workflow_annotation, +): + """Tests TumorMutationalBurdenCalculationStepPart.get_resource_usage()""" + # Define expected + expected_dict = {"threads": 2, "time": "1:00:00", "memory": "4096M"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = tumor_mutational_burden_workflow_annotation.get_resource( + "tmb_gathering", "run", resource + ) + assert actual == expected, msg_error + + +# Tests for TumorMutationalBurdenCalculationWorkflow ------------------------------------------------------- + + +def test_tumor_mutational_burden_workflow_annotation(tumor_mutational_burden_workflow_annotation): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["link_out", "tmb_gathering"] + actual = list(sorted(tumor_mutational_burden_workflow_annotation.sub_steps.keys())) + assert actual == expected + + # Check result file construction + tpl = ( + "output/{mapper}.{var_caller}.{anno_tool}.tmb.P00{i}-T{t}-DNA1-WGS1/{dir_}/" + "{mapper}.{var_caller}.{anno_tool}.tmb.P00{i}-T{t}-DNA1-WGS1.{ext}" + ) + expected = [ + tpl.format( + mapper=mapper, var_caller=var_caller, anno_tool=anno_tool, i=i, t=t, ext=ext, dir_="out" + ) + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ("json", "json.md5") + for mapper in ("bwa",) + for var_caller in ("mutect2", "scalpel") + for anno_tool in ("jannovar", "vep") + ] + expected += [ + tpl.format( + mapper=mapper, var_caller=var_caller, anno_tool=anno_tool, i=i, t=t, ext=ext, dir_="log" + ) + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + "conda_info.txt", + "conda_list.txt", + "log", + "conda_info.txt.md5", + "conda_list.txt.md5", + "log.md5", + ) + for mapper in ("bwa",) + for var_caller in ("mutect2", "scalpel") + for anno_tool in ("jannovar", "vep") + ] + expected = list(sorted(expected)) + actual = list(sorted(tumor_mutational_burden_workflow_annotation.get_result_files())) assert expected == actual