Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement ngs_mapping fingerprinting (#283) #289

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ jobs:
lfs: true
fetch-depth: 2
- name: Install mamba
run: conda install -y mamba==1.0.0
run: conda install -y mamba>=1.0.0
- name: Prepare environment.yml file
run: >
cp environment.yml /tmp/environment.yml && sed -i -e
Expand Down
19 changes: 19 additions & 0 deletions snappy_pipeline/workflows/ngs_mapping/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,22 @@ rule ngs_mapping_bam_collect_doc_run:
**wf.get_log_file("bam_collect_doc", "run"),
wrapper:
wf.wrapper_path("maelstrom/bam_collect_doc")


# Compute fingerprint ---------------------------------------------------------


rule ngs_mapping_ngs_chew_fingerprint:
input:
**wf.get_input_files("ngs_chew", "fingerprint")(),
output:
**wf.get_output_files("ngs_chew", "fingerprint"),
threads: wf.get_resource("ngs_chew", "fingerprint", "threads")
resources:
time=wf.get_resource("ngs_chew", "run", "time"),
memory=wf.get_resource("ngs_chew", "run", "memory"),
partition=wf.get_resource("ngs_chew", "run", "partition"),
log:
**wf.get_log_files("ngs_chew", "run"),
wrapper:
wf.wrapper_path("ngs_chew/fingerprint")
91 changes: 90 additions & 1 deletion snappy_pipeline/workflows/ngs_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@
bam_collect_doc:
enabled: false
window_length: 1000
# Compute fingerprints with ngs-chew
ngs_chew_fingerprint:
enabled: true
# Configuration for BWA
bwa:
path_index: REQUIRED # Required if listed in ngs_mapping.tools.dna; otherwise, can be removed.
Expand Down Expand Up @@ -1189,6 +1192,79 @@ def get_resource_usage(self, action):
)


class NgsChewStepPart(BaseStepPart):
"""Analyze BAM File with ``ngs-chew``, e.g., ``fingerprint``"""

#: Step name
name = "ngs_chew"

#: Class available actions
actions = ("fingerprint",)

def __init__(self, parent):
super().__init__(parent)

def get_input_files(self, action):
"""Return required input files"""
self._check_action(action)
return getattr(self, f"_get_input_files_{action}")

def _check_action(self, action):
if action not in self.actions:
actions_str = ", ".join(self.actions)
error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
raise UnsupportedActionException(error_message)

@dictify
def _get_input_files_run(self):
yield "bam", "work/{mapper_lib}/out/{mapper_lib}.bam"

def get_output_files(self, action):
"""Return output files"""
self._check_action(action)
return getattr(self, "_get_output_files_{action}".format(action=action))()

@dictify
def _get_output_files_run(self):
yield "npz", "work/{mapper_lib}/report/fingerprint/{mapper_lib}.npz"
yield "npz_md5", "work/{mapper_lib}/report/fingerprint/{mapper_lib}.npz.md5"

def get_log_files(self, action):
self._check_action(action)
return getattr(self, "_get_log_files_{action}".format(action=action))()

@dictify
def _get_log_files_fingerprint(self):
prefix = "work/{mapper_lib}/log/{mapper_lib}.ngs_chew_fingerprint"
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
("wrapper", ".wrapper.py"),
("env_yaml", ".environment.yaml"),
)
for key, ext in key_ext:
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"

def get_resource_usage(self, action):
"""Get Resource Usage

:param action: Action (i.e., step) in the workflow, example: 'run'.
:type action: str

:return: Returns ResourceUsage for step.

:raises UnsupportedActionException: if action not in class defined list of valid actions.
"""
self._check_action(action)
return ResourceUsage(
threads=1,
time="04:00:00",
memory="2G",
)


class NgsMappingWorkflow(BaseStep):
"""Perform NGS Mapping"""

Expand Down Expand Up @@ -1218,6 +1294,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
StarStepPart,
TargetCoverageReportStepPart,
BamCollectDocStepPart,
NgsChewStepPart,
)
)
self.sub_steps["link_out"].disable_patterns = expand("**/*{ext}", ext=EXT_VALUES)
Expand Down Expand Up @@ -1281,9 +1358,14 @@ def get_result_files(self):
yield from self._yield_result_files(
os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), ext=EXT_VALUES
)
infixes = ["mapping", "target_cov_report"]
infixes = [
"mapping",
"target_cov_report",
]
if self.config["bam_collect_doc"]["enabled"]:
infixes.append("bam_collect_doc")
if self.config["ngs_chew_fingerprint"]["enabled"]:
infixes.append("ngs_chew_fingerprint")
for infix in infixes:
yield from self._yield_result_files(
os.path.join("output", name_pattern, "log", "{mapper}.{ngs_library.name}.{ext}"),
Expand All @@ -1305,6 +1387,13 @@ def get_result_files(self):
os.path.join("output", name_pattern, "report", "cov", name_pattern + ".cov.{ext}"),
ext=("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5", "bw", "bw.md5"),
)
if self.config["ngs_chew_fingerprint"]["enabled"]:
yield from self._yield_result_files(
os.path.join(
"output", name_pattern, "report", "fingerprint", name_pattern + ".{ext}"
),
ext=("npz", "npz.md5"),
)
yield from self._yield_result_files(
os.path.join(
"output", name_pattern, "report", "bam_qc", name_pattern + ".bam.{report}.txt"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- file:///data/gpfs-1/users/holtgrem_c/scratch/art/LinuxArtifacts/packages
dependencies:
- ngs-chew >=0.5.0,<0.6.0
128 changes: 128 additions & 0 deletions snappy_wrappers/wrappers/ngs_chew/fingerprint/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
"""Wrapper for running ``ngs-chew fingerprint``."""

from snakemake.shell import shell

__author__ = "Manuel Holtgrewe"
__email__ = "[email protected]"

path_ref = snakemake.config["static_data_config"]["reference"]["path"]
if "hg19" in path_ref or "37" in path_ref:
genome_release = "GRCh37"
else:
genome_release = "GRCh38"

shell(
r"""
set -x

# Write out information about conda and save a copy of the wrapper with picked variables
# as well as the environment.yaml file.
conda list >{snakemake.log.conda_list}
conda info >{snakemake.log.conda_info}
md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
cp {__real_file__} {snakemake.log.wrapper}
md5sum {snakemake.log.wrapper} >{snakemake.log.wrapper_md5}
cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml}
md5sum {snakemake.log.env_yaml} >{snakemake.log.env_yaml_md5}

# Also pipe stderr to log file
if [[ -n "{snakemake.log.log}" ]]; then
if [[ "$(set +e; tty; set -e)" != "" ]]; then
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
exec 2> >(tee -a "{snakemake.log.log}" >&2)
else
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
echo "No tty, logging disabled" >"{snakemake.log.log}"
fi
fi

export TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT
mkdir -p $TMPDIR/{{out,sorted,sort.tmp}}

if [[ "{library_kit}" == "PacBio HiFi" ]]; then
preset=map-hifi
elif [[ "{library_kit}" == "PacBio CLR" ]]; then
preset=map-pb
elif [[ "{library_kit}" == ONT* ]]; then
preset=map-ont
else
>&2 echo "Unknown library kit {library_kit}"
exit 1
fi

ngs-chew fingerprint \
--min-coverage 5 \
--reference {snakemake.config[static_data_config][reference][path]} \
--output-fingerprint {snakemake.ouput.npz} \
--input-bam {snakemake.input.bam}
--genome-release {genome_release}


# Compute MD5 sums
pushd $(dirname $out_bam)
md5sum $(basename $out_bam) >$(basename $out_bam).md5
md5sum $(basename $out_bam).bai >$(basename $out_bam).bai.md5
popd

# QC Report ---------------------------------------------------------------------------------------

# gather statistics from BAM file
# TODO: use pipes for only reading once from disk?
samtools stats {snakemake.output.bam} > {snakemake.output.report_bamstats_txt}
samtools flagstat {snakemake.output.bam} > {snakemake.output.report_flagstats_txt}
samtools idxstats {snakemake.output.bam} > {snakemake.output.report_idxstats_txt}

# call plot-bamstats
mkdir $TMPDIR/bamstats.d
plot-bamstats \
-p $TMPDIR/bamstats.d/ \
{snakemake.output.report_bamstats_txt} \
|| true # ignore failure

# Patch inline-html if necessary.
cat >$TMPDIR/inline-html.diff <<EOF
diff --git a/inline_html/inline_html.py b/inline_html/inline_html.py
index 893086c..cbef6dd 100644
--- a/inline_html/inline_html.py
+++ b/inline_html/inline_html.py
@@ -20,7 +20,10 @@ def resource_to_data(path, in_file):
mime, _ = mimetypes.guess_type(path)
with open(path, 'rb') as fp:
data = fp.read()
- data64 = b''.join(base64.encodestring(data).splitlines())
+ try:
+ data64 = b''.join(base64.encodestring(data).splitlines())
+ except AttributeError:
+ data64 = b''.join(base64.encodebytes(data).splitlines())
return 'data:%s;base64,%s' % (mime, data64.decode('ascii'))
EOF
pushd $(python3 -c 'import inline_html; print(inline_html.__path__[0])')
if ! grep encodebytes inline_html.py; then
patch -p2 <$TMPDIR/inline-html.diff
fi
popd

# Convert HTML report into one file.
inline-html \
--in-file $TMPDIR/bamstats.d/index.html \
--out-file {snakemake.output.report_bamstats_html} \
|| touch {snakemake.output.report_bamstats_html}

# Build MD5 files for the reports
md5sum {snakemake.output.report_bamstats_html} > {snakemake.output.report_bamstats_html_md5}
md5sum {snakemake.output.report_bamstats_txt} > {snakemake.output.report_bamstats_txt_md5}
md5sum {snakemake.output.report_flagstats_txt} >{snakemake.output.report_flagstats_txt_md5}
md5sum {snakemake.output.report_idxstats_txt} > {snakemake.output.report_idxstats_txt_md5}
"""
)

# Compute MD5 sums of logs.
shell(
r"""
sleep 1s # try to wait for log file flush
md5sum {snakemake.log.log} >{snakemake.log.log_md5}
"""
)
10 changes: 9 additions & 1 deletion tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,7 @@ def test_ngs_mapping_workflow_steps(ngs_mapping_workflow):
"link_out",
"link_out_bam",
"minimap2",
"ngs_chew",
"star",
"target_coverage_report",
]
Expand All @@ -811,7 +812,7 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for i in range(1, 7)
for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5")
]
for infix in ("bam_collect_doc", "mapping", "target_cov_report"):
for infix in ("bam_collect_doc", "mapping", "target_cov_report", "ngs_chew_fingerprint"):
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(i=i, ext=ext)
for i in range(1, 7)
Expand Down Expand Up @@ -850,6 +851,13 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for ext in ("bw", "bw.md5", "vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/fingerprint/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
)
for ext in ("npz", "npz.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/cov_qc/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,7 @@ def test_ngs_mapping_workflow_steps(ngs_mapping_workflow):
"link_out",
"link_out_bam",
"minimap2",
"ngs_chew",
"star",
"target_coverage_report",
]
Expand All @@ -742,7 +743,7 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for i in range(1, 7)
for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5")
]
for infix in ("bam_collect_doc", "mapping", "target_cov_report"):
for infix in ("bam_collect_doc", "mapping", "target_cov_report", "ngs_chew_fingerprint"):
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(i=i, ext=ext)
for i in range(1, 7)
Expand Down Expand Up @@ -781,6 +782,13 @@ def test_ngs_mapping_workflow_files(ngs_mapping_workflow):
for ext in ("bw", "bw.md5", "vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/fingerprint/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
)
for ext in ("npz", "npz.md5")
for i in range(1, 7)
]
expected += [
"output/bwa.P00{i}-N1-DNA1-WGS1/report/cov_qc/bwa.P00{i}-N1-DNA1-WGS1.{ext}".format(
i=i, ext=ext
Expand Down