Skip to content

Commit

Permalink
feat: implement ngs_mapping fingerprinting (#283) (#289)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Dec 22, 2022
1 parent 125b601 commit a3303ac
Show file tree
Hide file tree
Showing 7 changed files with 262 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ jobs:
lfs: true
fetch-depth: 2
- name: Install mamba
run: conda install -y mamba==1.0.0
run: conda install -y mamba>=1.0.0
- name: Prepare environment.yml file
run: >
cp environment.yml /tmp/environment.yml && sed -i -e
Expand Down
19 changes: 19 additions & 0 deletions snappy_pipeline/workflows/ngs_mapping/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,22 @@ rule ngs_mapping_bam_collect_doc_run:
**wf.get_log_file("bam_collect_doc", "run"),
wrapper:
wf.wrapper_path("maelstrom/bam_collect_doc")


# Compute fingerprint ---------------------------------------------------------


rule ngs_mapping_ngs_chew_fingerprint:
input:
**wf.get_input_files("ngs_chew", "fingerprint")(),
output:
**wf.get_output_files("ngs_chew", "fingerprint"),
threads: wf.get_resource("ngs_chew", "fingerprint", "threads")
resources:
time=wf.get_resource("ngs_chew", "run", "time"),
memory=wf.get_resource("ngs_chew", "run", "memory"),
partition=wf.get_resource("ngs_chew", "run", "partition"),
log:
**wf.get_log_files("ngs_chew", "run"),
wrapper:
wf.wrapper_path("ngs_chew/fingerprint")
91 changes: 90 additions & 1 deletion snappy_pipeline/workflows/ngs_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,9 @@
bam_collect_doc:
enabled: false
window_length: 1000
# Compute fingerprints with ngs-chew
ngs_chew_fingerprint:
enabled: true
# Configuration for BWA
bwa:
path_index: REQUIRED # Required if listed in ngs_mapping.tools.dna; otherwise, can be removed.
Expand Down Expand Up @@ -1184,6 +1187,79 @@ def get_resource_usage(self, action):
)


class NgsChewStepPart(BaseStepPart):
"""Analyze BAM File with ``ngs-chew``, e.g., ``fingerprint``"""

#: Step name
name = "ngs_chew"

#: Class available actions
actions = ("fingerprint",)

def __init__(self, parent):
super().__init__(parent)

def get_input_files(self, action):
"""Return required input files"""
self._check_action(action)
return getattr(self, f"_get_input_files_{action}")

def _check_action(self, action):
if action not in self.actions:
actions_str = ", ".join(self.actions)
error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
raise UnsupportedActionException(error_message)

@dictify
def _get_input_files_run(self):
yield "bam", "work/{mapper_lib}/out/{mapper_lib}.bam"

def get_output_files(self, action):
"""Return output files"""
self._check_action(action)
return getattr(self, "_get_output_files_{action}".format(action=action))()

@dictify
def _get_output_files_run(self):
yield "npz", "work/{mapper_lib}/report/fingerprint/{mapper_lib}.npz"
yield "npz_md5", "work/{mapper_lib}/report/fingerprint/{mapper_lib}.npz.md5"

def get_log_files(self, action):
self._check_action(action)
return getattr(self, "_get_log_files_{action}".format(action=action))()

@dictify
def _get_log_files_fingerprint(self):
prefix = "work/{mapper_lib}/log/{mapper_lib}.ngs_chew_fingerprint"
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
("wrapper", ".wrapper.py"),
("env_yaml", ".environment.yaml"),
)
for key, ext in key_ext:
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"

def get_resource_usage(self, action):
"""Get Resource Usage
:param action: Action (i.e., step) in the workflow, example: 'run'.
:type action: str
:return: Returns ResourceUsage for step.
:raises UnsupportedActionException: if action not in class defined list of valid actions.
"""
self._check_action(action)
return ResourceUsage(
threads=1,
time="04:00:00",
memory="2G",
)


class NgsMappingWorkflow(BaseStep):
"""Perform NGS Mapping"""

Expand Down Expand Up @@ -1213,6 +1289,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
StarStepPart,
TargetCoverageReportStepPart,
BamCollectDocStepPart,
NgsChewStepPart,
)
)
self.sub_steps["link_out"].disable_patterns = expand("**/*{ext}", ext=EXT_VALUES)
Expand Down Expand Up @@ -1276,9 +1353,14 @@ def get_result_files(self):
yield from self._yield_result_files(
os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), ext=EXT_VALUES
)
infixes = ["mapping", "target_cov_report"]
infixes = [
"mapping",
"target_cov_report",
]
if self.config["bam_collect_doc"]["enabled"]:
infixes.append("bam_collect_doc")
if self.config["ngs_chew_fingerprint"]["enabled"]:
infixes.append("ngs_chew_fingerprint")
for infix in infixes:
yield from self._yield_result_files(
os.path.join("output", name_pattern, "log", "{mapper}.{ngs_library.name}.{ext}"),
Expand All @@ -1300,6 +1382,13 @@ def get_result_files(self):
os.path.join("output", name_pattern, "report", "cov", name_pattern + ".cov.{ext}"),
ext=("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5", "bw", "bw.md5"),
)
if self.config["ngs_chew_fingerprint"]["enabled"]:
yield from self._yield_result_files(
os.path.join(
"output", name_pattern, "report", "fingerprint", name_pattern + ".{ext}"
),
ext=("npz", "npz.md5"),
)
yield from self._yield_result_files(
os.path.join(
"output", name_pattern, "report", "bam_qc", name_pattern + ".bam.{report}.txt"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- file:///data/gpfs-1/users/holtgrem_c/scratch/art/LinuxArtifacts/packages
dependencies:
- ngs-chew >=0.5.0,<0.6.0
128 changes: 128 additions & 0 deletions snappy_wrappers/wrappers/ngs_chew/fingerprint/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
"""Wrapper for running ``ngs-chew fingerprint``."""

from snakemake.shell import shell

__author__ = "Manuel Holtgrewe"
__email__ = "[email protected]"

path_ref = snakemake.config["static_data_config"]["reference"]["path"]
if "hg19" in path_ref or "37" in path_ref:
genome_release = "GRCh37"
else:
genome_release = "GRCh38"

shell(
r"""
set -x
# Write out information about conda and save a copy of the wrapper with picked variables
# as well as the environment.yaml file.
conda list >{snakemake.log.conda_list}
conda info >{snakemake.log.conda_info}
md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
cp {__real_file__} {snakemake.log.wrapper}
md5sum {snakemake.log.wrapper} >{snakemake.log.wrapper_md5}
cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml}
md5sum {snakemake.log.env_yaml} >{snakemake.log.env_yaml_md5}
# Also pipe stderr to log file
if [[ -n "{snakemake.log.log}" ]]; then
if [[ "$(set +e; tty; set -e)" != "" ]]; then
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
exec 2> >(tee -a "{snakemake.log.log}" >&2)
else
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
echo "No tty, logging disabled" >"{snakemake.log.log}"
fi
fi
export TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT
mkdir -p $TMPDIR/{{out,sorted,sort.tmp}}
if [[ "{library_kit}" == "PacBio HiFi" ]]; then
preset=map-hifi
elif [[ "{library_kit}" == "PacBio CLR" ]]; then
preset=map-pb
elif [[ "{library_kit}" == ONT* ]]; then
preset=map-ont
else
>&2 echo "Unknown library kit {library_kit}"
exit 1
fi
ngs-chew fingerprint \
--min-coverage 5 \
--reference {snakemake.config[static_data_config][reference][path]} \
--output-fingerprint {snakemake.ouput.npz} \
--input-bam {snakemake.input.bam}
--genome-release {genome_release}
# Compute MD5 sums
pushd $(dirname $out_bam)
md5sum $(basename $out_bam) >$(basename $out_bam).md5
md5sum $(basename $out_bam).bai >$(basename $out_bam).bai.md5
popd
# QC Report ---------------------------------------------------------------------------------------
# gather statistics from BAM file
# TODO: use pipes for only reading once from disk?
samtools stats {snakemake.output.bam} > {snakemake.output.report_bamstats_txt}
samtools flagstat {snakemake.output.bam} > {snakemake.output.report_flagstats_txt}
samtools idxstats {snakemake.output.bam} > {snakemake.output.report_idxstats_txt}
# call plot-bamstats
mkdir $TMPDIR/bamstats.d
plot-bamstats \
-p $TMPDIR/bamstats.d/ \
{snakemake.output.report_bamstats_txt} \
|| true # ignore failure
# Patch inline-html if necessary.
cat >$TMPDIR/inline-html.diff <<EOF
diff --git a/inline_html/inline_html.py b/inline_html/inline_html.py
index 893086c..cbef6dd 100644
--- a/inline_html/inline_html.py
+++ b/inline_html/inline_html.py
@@ -20,7 +20,10 @@ def resource_to_data(path, in_file):
mime, _ = mimetypes.guess_type(path)
with open(path, 'rb') as fp:
data = fp.read()
- data64 = b''.join(base64.encodestring(data).splitlines())
+ try:
+ data64 = b''.join(base64.encodestring(data).splitlines())
+ except AttributeError:
+ data64 = b''.join(base64.encodebytes(data).splitlines())
return 'data:%s;base64,%s' % (mime, data64.decode('ascii'))
EOF
pushd $(python3 -c 'import inline_html; print(inline_html.__path__[0])')
if ! grep encodebytes inline_html.py; then
patch -p2 <$TMPDIR/inline-html.diff
fi
popd
# Convert HTML report into one file.
inline-html \
--in-file $TMPDIR/bamstats.d/index.html \
--out-file {snakemake.output.report_bamstats_html} \
|| touch {snakemake.output.report_bamstats_html}
# Build MD5 files for the reports
md5sum {snakemake.output.report_bamstats_html} > {snakemake.output.report_bamstats_html_md5}
md5sum {snakemake.output.report_bamstats_txt} > {snakemake.output.report_bamstats_txt_md5}
md5sum {snakemake.output.report_flagstats_txt} >{snakemake.output.report_flagstats_txt_md5}
md5sum {snakemake.output.report_idxstats_txt} > {snakemake.output.report_idxstats_txt_md5}
"""
)

# Compute MD5 sums of logs.
shell(
r"""
sleep 1s # try to wait for log file flush
md5sum {snakemake.log.log} >{snakemake.log.log_md5}
"""
)
10 changes: 9 additions & 1 deletion tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,7 @@ def test_ngs_mapping_workflow_steps(ngs_mapping_workflow):
"link_out",
"link_out_bam",
"minimap2",
"ngs_chew",
"star",
"target_coverage_report",
]
Expand All @@ -809,7 +810,7 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for i in range(1, 7)
for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5")
]
for infix in ("bam_collect_doc", "mapping", "target_cov_report"):
for infix in ("bam_collect_doc", "mapping", "target_cov_report", "ngs_chew_fingerprint"):
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(i=i, ext=ext)
for i in range(1, 7)
Expand Down Expand Up @@ -842,6 +843,13 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for ext in ("bw", "bw.md5", "vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/fingerprint/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
)
for ext in ("npz", "npz.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/cov_qc/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,7 @@ def test_ngs_mapping_workflow_steps(ngs_mapping_workflow):
"link_out",
"link_out_bam",
"minimap2",
"ngs_chew",
"star",
"target_coverage_report",
]
Expand All @@ -740,7 +741,7 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for i in range(1, 7)
for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5")
]
for infix in ("bam_collect_doc", "mapping", "target_cov_report"):
for infix in ("bam_collect_doc", "mapping", "target_cov_report", "ngs_chew_fingerprint"):
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(i=i, ext=ext)
for i in range(1, 7)
Expand Down Expand Up @@ -773,6 +774,13 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for ext in ("bw", "bw.md5", "vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/fingerprint/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
)
for ext in ("npz", "npz.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/cov_qc/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
Expand Down

0 comments on commit a3303ac

Please sign in to comment.