diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a92844b2..5117415e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,7 +92,7 @@ jobs: lfs: true fetch-depth: 2 - name: Install mamba - run: conda install -y mamba==1.0.0 + run: conda install -y mamba>=1.0.0 - name: Prepare environment.yml file run: > cp environment.yml /tmp/environment.yml && sed -i -e diff --git a/snappy_pipeline/workflows/ngs_mapping/Snakefile b/snappy_pipeline/workflows/ngs_mapping/Snakefile index 25ec76bcc..e5ba24f8d 100644 --- a/snappy_pipeline/workflows/ngs_mapping/Snakefile +++ b/snappy_pipeline/workflows/ngs_mapping/Snakefile @@ -258,3 +258,22 @@ rule ngs_mapping_bam_collect_doc_run: **wf.get_log_file("bam_collect_doc", "run"), wrapper: wf.wrapper_path("maelstrom/bam_collect_doc") + + +# Compute fingerprint --------------------------------------------------------- + + +rule ngs_mapping_ngs_chew_fingerprint: + input: + **wf.get_input_files("ngs_chew", "fingerprint")(), + output: + **wf.get_output_files("ngs_chew", "fingerprint"), + threads: wf.get_resource("ngs_chew", "fingerprint", "threads") + resources: + time=wf.get_resource("ngs_chew", "run", "time"), + memory=wf.get_resource("ngs_chew", "run", "memory"), + partition=wf.get_resource("ngs_chew", "run", "partition"), + log: + **wf.get_log_files("ngs_chew", "run"), + wrapper: + wf.wrapper_path("ngs_chew/fingerprint") diff --git a/snappy_pipeline/workflows/ngs_mapping/__init__.py b/snappy_pipeline/workflows/ngs_mapping/__init__.py index ac423dd89..913fbd921 100644 --- a/snappy_pipeline/workflows/ngs_mapping/__init__.py +++ b/snappy_pipeline/workflows/ngs_mapping/__init__.py @@ -344,6 +344,9 @@ bam_collect_doc: enabled: false window_length: 1000 + # Compute fingerprints with ngs-chew + ngs_chew_fingerprint: + enabled: true # Configuration for BWA bwa: path_index: REQUIRED # Required if listed in ngs_mapping.tools.dna; otherwise, can be removed. @@ -1184,6 +1187,79 @@ def get_resource_usage(self, action): ) +class NgsChewStepPart(BaseStepPart): + """Analyze BAM File with ``ngs-chew``, e.g., ``fingerprint``""" + + #: Step name + name = "ngs_chew" + + #: Class available actions + actions = ("fingerprint",) + + def __init__(self, parent): + super().__init__(parent) + + def get_input_files(self, action): + """Return required input files""" + self._check_action(action) + return getattr(self, f"_get_input_files_{action}") + + def _check_action(self, action): + if action not in self.actions: + actions_str = ", ".join(self.actions) + error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" + raise UnsupportedActionException(error_message) + + @dictify + def _get_input_files_run(self): + yield "bam", "work/{mapper_lib}/out/{mapper_lib}.bam" + + def get_output_files(self, action): + """Return output files""" + self._check_action(action) + return getattr(self, "_get_output_files_{action}".format(action=action))() + + @dictify + def _get_output_files_run(self): + yield "npz", "work/{mapper_lib}/report/fingerprint/{mapper_lib}.npz" + yield "npz_md5", "work/{mapper_lib}/report/fingerprint/{mapper_lib}.npz.md5" + + def get_log_files(self, action): + self._check_action(action) + return getattr(self, "_get_log_files_{action}".format(action=action))() + + @dictify + def _get_log_files_fingerprint(self): + prefix = "work/{mapper_lib}/log/{mapper_lib}.ngs_chew_fingerprint" + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ("wrapper", ".wrapper.py"), + ("env_yaml", ".environment.yaml"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + def get_resource_usage(self, action): + """Get Resource Usage + + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + + :return: Returns ResourceUsage for step. + + :raises UnsupportedActionException: if action not in class defined list of valid actions. + """ + self._check_action(action) + return ResourceUsage( + threads=1, + time="04:00:00", + memory="2G", + ) + + class NgsMappingWorkflow(BaseStep): """Perform NGS Mapping""" @@ -1213,6 +1289,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) StarStepPart, TargetCoverageReportStepPart, BamCollectDocStepPart, + NgsChewStepPart, ) ) self.sub_steps["link_out"].disable_patterns = expand("**/*{ext}", ext=EXT_VALUES) @@ -1276,9 +1353,14 @@ def get_result_files(self): yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), ext=EXT_VALUES ) - infixes = ["mapping", "target_cov_report"] + infixes = [ + "mapping", + "target_cov_report", + ] if self.config["bam_collect_doc"]["enabled"]: infixes.append("bam_collect_doc") + if self.config["ngs_chew_fingerprint"]["enabled"]: + infixes.append("ngs_chew_fingerprint") for infix in infixes: yield from self._yield_result_files( os.path.join("output", name_pattern, "log", "{mapper}.{ngs_library.name}.{ext}"), @@ -1300,6 +1382,13 @@ def get_result_files(self): os.path.join("output", name_pattern, "report", "cov", name_pattern + ".cov.{ext}"), ext=("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5", "bw", "bw.md5"), ) + if self.config["ngs_chew_fingerprint"]["enabled"]: + yield from self._yield_result_files( + os.path.join( + "output", name_pattern, "report", "fingerprint", name_pattern + ".{ext}" + ), + ext=("npz", "npz.md5"), + ) yield from self._yield_result_files( os.path.join( "output", name_pattern, "report", "bam_qc", name_pattern + ".bam.{report}.txt" diff --git a/snappy_wrappers/wrappers/ngs_chew/fingerprint/environment.yaml b/snappy_wrappers/wrappers/ngs_chew/fingerprint/environment.yaml new file mode 100644 index 000000000..7016f8b00 --- /dev/null +++ b/snappy_wrappers/wrappers/ngs_chew/fingerprint/environment.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - file:///data/gpfs-1/users/holtgrem_c/scratch/art/LinuxArtifacts/packages +dependencies: + - ngs-chew >=0.5.0,<0.6.0 diff --git a/snappy_wrappers/wrappers/ngs_chew/fingerprint/wrapper.py b/snappy_wrappers/wrappers/ngs_chew/fingerprint/wrapper.py new file mode 100644 index 000000000..b3c2eb5c7 --- /dev/null +++ b/snappy_wrappers/wrappers/ngs_chew/fingerprint/wrapper.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- +"""Wrapper for running ``ngs-chew fingerprint``.""" + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe" +__email__ = "manuel.holtgrewe@bih-charite.de" + +path_ref = snakemake.config["static_data_config"]["reference"]["path"] +if "hg19" in path_ref or "37" in path_ref: + genome_release = "GRCh37" +else: + genome_release = "GRCh38" + +shell( + r""" +set -x + +# Write out information about conda and save a copy of the wrapper with picked variables +# as well as the environment.yaml file. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +cp {__real_file__} {snakemake.log.wrapper} +md5sum {snakemake.log.wrapper} >{snakemake.log.wrapper_md5} +cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml} +md5sum {snakemake.log.env_yaml} >{snakemake.log.env_yaml_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT +mkdir -p $TMPDIR/{{out,sorted,sort.tmp}} + +if [[ "{library_kit}" == "PacBio HiFi" ]]; then + preset=map-hifi +elif [[ "{library_kit}" == "PacBio CLR" ]]; then + preset=map-pb +elif [[ "{library_kit}" == ONT* ]]; then + preset=map-ont +else + >&2 echo "Unknown library kit {library_kit}" + exit 1 +fi + +ngs-chew fingerprint \ + --min-coverage 5 \ + --reference {snakemake.config[static_data_config][reference][path]} \ + --output-fingerprint {snakemake.ouput.npz} \ + --input-bam {snakemake.input.bam} + --genome-release {genome_release} + + +# Compute MD5 sums +pushd $(dirname $out_bam) +md5sum $(basename $out_bam) >$(basename $out_bam).md5 +md5sum $(basename $out_bam).bai >$(basename $out_bam).bai.md5 +popd + +# QC Report --------------------------------------------------------------------------------------- + +# gather statistics from BAM file +# TODO: use pipes for only reading once from disk? +samtools stats {snakemake.output.bam} > {snakemake.output.report_bamstats_txt} +samtools flagstat {snakemake.output.bam} > {snakemake.output.report_flagstats_txt} +samtools idxstats {snakemake.output.bam} > {snakemake.output.report_idxstats_txt} + +# call plot-bamstats +mkdir $TMPDIR/bamstats.d +plot-bamstats \ + -p $TMPDIR/bamstats.d/ \ + {snakemake.output.report_bamstats_txt} \ +|| true # ignore failure + +# Patch inline-html if necessary. +cat >$TMPDIR/inline-html.diff < {snakemake.output.report_bamstats_html_md5} +md5sum {snakemake.output.report_bamstats_txt} > {snakemake.output.report_bamstats_txt_md5} +md5sum {snakemake.output.report_flagstats_txt} >{snakemake.output.report_flagstats_txt_md5} +md5sum {snakemake.output.report_idxstats_txt} > {snakemake.output.report_idxstats_txt_md5} +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +sleep 1s # try to wait for log file flush +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py index a454fd36c..e6a03377d 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py +++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py @@ -791,6 +791,7 @@ def test_ngs_mapping_workflow_steps(ngs_mapping_workflow): "link_out", "link_out_bam", "minimap2", + "ngs_chew", "star", "target_coverage_report", ] @@ -809,7 +810,7 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow): for i in range(1, 7) for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") ] - for infix in ("bam_collect_doc", "mapping", "target_cov_report"): + for infix in ("bam_collect_doc", "mapping", "target_cov_report", "ngs_chew_fingerprint"): expected += [ "output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(i=i, ext=ext) for i in range(1, 7) @@ -842,6 +843,13 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow): for ext in ("bw", "bw.md5", "vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") for i in range(1, 7) ] + expected += [ + "output/bwa.P00{i}-N1-DNA1-WGS1/report/fingerprint/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format( + i=i, ext=ext + ) + for ext in ("npz", "npz.md5") + for i in range(1, 7) + ] expected += [ "output/bwa.P00{i}-N1-DNA1-WGS1/report/cov_qc/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format( i=i, ext=ext diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py index 3759d00fd..6ef559890 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py +++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py @@ -722,6 +722,7 @@ def test_ngs_mapping_workflow_steps(ngs_mapping_workflow): "link_out", "link_out_bam", "minimap2", + "ngs_chew", "star", "target_coverage_report", ] @@ -740,7 +741,7 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow): for i in range(1, 7) for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") ] - for infix in ("bam_collect_doc", "mapping", "target_cov_report"): + for infix in ("bam_collect_doc", "mapping", "target_cov_report", "ngs_chew_fingerprint"): expected += [ "output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(i=i, ext=ext) for i in range(1, 7) @@ -773,6 +774,13 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow): for ext in ("bw", "bw.md5", "vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") for i in range(1, 7) ] + expected += [ + "output/bwa.P00{i}-N1-DNA1-WGS1/report/fingerprint/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format( + i=i, ext=ext + ) + for ext in ("npz", "npz.md5") + for i in range(1, 7) + ] expected += [ "output/bwa.P00{i}-N1-DNA1-WGS1/report/cov_qc/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format( i=i, ext=ext