Skip to content

Commit

Permalink
Merge branch 'mmi-cache'
Browse files Browse the repository at this point in the history
  • Loading branch information
bede committed Dec 16, 2024
2 parents 4b220b1 + d1a542c commit b5ccc77
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 34 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ A [Biocontainer image](https://biocontainers.pro/tools/hostile) is also availabl

- If (like [EIT Pathogena](https://www.eit-pathogena.com)) you wish to use your own remote repository of indexes, set the environment variable `HOSTILE_REPOSITORY_URL`. Hostile will then look for indexes inside `{HOSTILE_REPOSITORY_URL}/manifest.json`.

- From version 1.2.0 onwards, Hostile automatically builds and reuses Minimap2 .mmi files to reduce warmup time for long read decontamination. The first time you use a new index you may notice an indexing delay, but thereafter it will be much faster.



## Command line usage
Expand Down
119 changes: 92 additions & 27 deletions src/hostile/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,27 @@
from hostile import util


def get_mmi_path(index_path: Path) -> Path:
return Path(
str(index_path)
.removesuffix(".fa")
.removesuffix(".fasta")
.removesuffix(".fa.gz")
.removesuffix(".fasta.gz")
.removesuffix(".mmi")
+ ".mmi"
)


@dataclass
class Aligner:
name: str
bin_path: Path
data_dir: Path
cmd: str
single_cmd: str
single_unindexed_cmd: str
paired_cmd: str
paired_unindexed_cmd: str

def __post_init__(self):
Path(self.data_dir).mkdir(exist_ok=True, parents=True)
Expand All @@ -27,6 +41,7 @@ def check_index(self, index: str, offline: bool = False) -> Path:
util.run(f"{self.bin_path} --version", cwd=self.data_dir)
except subprocess.CalledProcessError:
raise RuntimeError(f"Failed to execute {self.bin_path}")

if self.name == "Bowtie2":
if Path(f"{index}.1.bt2").is_file():
index_path = Path(index)
Expand Down Expand Up @@ -54,19 +69,25 @@ def check_index(self, index: str, offline: bool = False) -> Path:
index_path = self.data_dir / index
logging.info(f"Downloaded standard index {index_path}")
else:
message = f"{index} is neither a valid custom {self.name} index path nor a valid standard index name. Mode: short read (Bowtie2)"
message = f"{index} is neither a valid custom Bowtie2 index path nor a valid standard index name. Mode: short read (Bowtie2)"
if offline:
message += (
". Disable offline mode to enable discovery of standard indexes"
)
raise FileNotFoundError(message)

elif self.name == "Minimap2":
if Path(f"{index}").is_file():
index_path = Path(index)
logging.info(f"Found custom index {index}")
elif (self.data_dir / f"{index}.fa.gz").is_file():
index_path = self.data_dir / f"{index}.fa.gz"
logging.info(f"Found cached standard index {index}")
if get_mmi_path(index_path).is_file():
logging.info(f"Found cached standard index {index} (MMI available)")
else:
logging.info(
f"Found cached standard index {index} (MMI file will be generated)"
)
elif not offline and util.fetch_manifest(util.INDEX_REPOSITORY_URL).get(
index
):
Expand All @@ -92,12 +113,14 @@ def check_index(self, index: str, offline: bool = False) -> Path:
". Disable offline mode to enable discovery of standard indexes"
)
raise FileNotFoundError(message)

return index_path

def gen_clean_cmd(
self,
fastq: Path,
out_dir: Path,
stdout: bool,
index_path: Path,
invert: bool,
rename: bool,
Expand All @@ -112,10 +135,12 @@ def gen_clean_cmd(
fastq_out_path = out_dir / f"{fastq_stem}.clean.fastq.gz"
count_before_path = out_dir / f"{fastq_stem}.reads_in.txt"
count_after_path = out_dir / f"{fastq_stem}.reads_out.txt"
if not force and fastq_out_path.exists():

if not stdout and not force and fastq_out_path.exists():
raise FileExistsError(
"Output file already exists. Use --force to overwrite"
)

filter_cmd = (
" | samtools view -hF 4 -" if invert else " | samtools view -hf 4 -"
)
Expand All @@ -127,16 +152,32 @@ def gen_clean_cmd(
if rename
else ""
)
cmd_template = { # Templating for Aligner.cmd

mmi_path = get_mmi_path(index_path)

cmd_template = {
"{BIN_PATH}": str(self.bin_path),
"{INDEX_PATH}": str(index_path),
"{MMI_PATH}": str(mmi_path),
"{FASTQ}": str(fastq),
"{ALIGNER_ARGS}": str(aligner_args),
"{THREADS}": str(threads),
}
alignment_cmd = self.cmd

if self.name == "Minimap2" and not mmi_path.is_file():
alignment_cmd = self.single_unindexed_cmd
else:
alignment_cmd = self.single_cmd

for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])

# If we are streaming output, write to stdout instead of a file
if stdout:
fastq_cmd = "samtools fastq --threads 4 -c 6 -0 -" # write to stdout
else:
fastq_cmd = f"samtools fastq --threads 4 -c 6 -0 '{fastq_out_path}'"

cmd = (
# Align, stream reads to stdout in SAM format
f"{alignment_cmd}"
Expand All @@ -150,15 +191,17 @@ def gen_clean_cmd(
f"{reorder_cmd}"
# Optionally replace read headers with integers
f"{rename_cmd}"
# Stream remaining records into fastq files
f" | samtools fastq --threads 4 -c 6 -0 '{fastq_out_path}'"
# Stream remaining records into fastq
f" | {fastq_cmd}"
)

return cmd

def gen_paired_clean_cmd(
self,
fastq1: Path,
fastq2: Path,
stdout: bool,
out_dir: Path,
index_path: Path,
invert: bool,
Expand All @@ -176,22 +219,26 @@ def gen_paired_clean_cmd(
fastq2_out_path = out_dir / f"{fastq2_stem}.clean_2.fastq.gz"
count_before_path = out_dir / f"{fastq1_stem}.reads_in.txt"
count_after_path = out_dir / f"{fastq1_stem}.reads_out.txt"
if not force and (fastq1_out_path.exists() or fastq2_out_path.exists()):

if (
not stdout
and not force
and (fastq1_out_path.exists() or fastq2_out_path.exists())
):
raise FileExistsError(
"Output files already exist. Use --force to overwrite"
)

filter_cmd = (
" | samtools view -h -e 'flag.unmap == 0 || flag.munmap == 0' -"
if invert
else " | samtools view -hf 12 -"
)
reorder_cmd = ""
if self.name == "Bowtie2" and reorder:
if (
util.get_platform() == "darwin"
): # Under MacOS, Bowtie2's native --reorder is very slow
reorder_cmd = " | samtools sort -n -O sam -@ 6 -m 1G" if reorder else ""
else: # Under Linux, Bowtie2's --reorder option works very well
if util.get_platform() == "darwin":
reorder_cmd = " | samtools sort -n -O sam -@ 6 -m 1G"
else: # Under Linux, Bowtie2's --reorder option is efficient
reorder_cmd = ""
aligner_args += " --reorder"
rename_cmd = (
Expand All @@ -202,33 +249,47 @@ def gen_paired_clean_cmd(
if rename
else ""
)
cmd_template = { # Templating for Aligner.cmd

mmi_path = get_mmi_path(index_path)

cmd_template = {
"{BIN_PATH}": str(self.bin_path),
"{INDEX_PATH}": str(index_path),
"{MMI_PATH}": str(mmi_path),
"{FASTQ1}": str(fastq1),
"{FASTQ2}": str(fastq2),
"{ALIGNER_ARGS}": str(aligner_args),
"{THREADS}": str(threads),
}
alignment_cmd = self.paired_cmd

if self.name == "Minimap2":
logging.warning(
"Minimap2 mode is not recommended for decontaminating short (paired) reads"
)

if self.name == "Minimap2" and not mmi_path.is_file():
alignment_cmd = self.paired_unindexed_cmd
else:
alignment_cmd = self.paired_cmd

for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])

if stdout:
fastq_cmd = "samtools fastq --threads 4 -c 6 -N -0 -"
else:
fastq_cmd = f"samtools fastq --threads 4 -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null"

cmd = (
# Align, stream reads to stdout in SAM format
f"{alignment_cmd}"
# Count reads in stream before filtering (2048 + 256 = 2304)
f" | tee >(samtools view -F 2304 -c - > '{count_before_path}')"
# Discard mapped reads and reads with mapped mates (or inverse)
f"{filter_cmd}"
# Count reads in stream after filtering (2048 + 256 = 2304)
f" | tee >(samtools view -F 2304 -c - > '{count_after_path}')"
# Optionally sort reads by name
f"{reorder_cmd}"
# Optionally replace paired read headers with integers
f"{rename_cmd}"
# Stream remaining records into fastq files
f" | samtools fastq --threads 4 -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null"
f" | {fastq_cmd}"
)

return cmd


Expand All @@ -239,21 +300,25 @@ def gen_paired_clean_cmd(
name="Bowtie2",
bin_path=Path("bowtie2"),
data_dir=util.CACHE_DIR,
cmd=(
single_cmd=(
"'{BIN_PATH}' -x '{INDEX_PATH}' -U '{FASTQ}'"
" -k 1 --mm -p {THREADS} {ALIGNER_ARGS}"
),
single_unindexed_cmd="",
paired_cmd=(
"{BIN_PATH} -x '{INDEX_PATH}' -1 '{FASTQ1}' -2 '{FASTQ2}'"
" -k 1 --mm -p {THREADS} {ALIGNER_ARGS}"
),
paired_unindexed_cmd="",
),
"minimap2": Aligner(
name="Minimap2",
bin_path=Path("minimap2"),
data_dir=util.CACHE_DIR,
cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {THREADS} {ALIGNER_ARGS} '{INDEX_PATH}' '{FASTQ}'",
paired_cmd="'{BIN_PATH}' -ax sr --secondary no -t {THREADS} {ALIGNER_ARGS} '{INDEX_PATH}' '{FASTQ1}' '{FASTQ2}'",
single_cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {THREADS} {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ}'",
single_unindexed_cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {THREADS} {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ}'",
paired_cmd="'{BIN_PATH}' -ax sr --secondary no -t {THREADS} {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ1}' '{FASTQ2}'",
paired_unindexed_cmd="'{BIN_PATH}' -ax sr --secondary no -t {THREADS} {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ1}' '{FASTQ2}'",
),
},
)
14 changes: 9 additions & 5 deletions src/hostile/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import sys

from enum import Enum
from pathlib import Path
Expand Down Expand Up @@ -28,9 +29,10 @@ def clean(
rename: bool = False,
reorder: bool = False,
out_dir: Path = util.CWD,
stdout: bool = False,
threads: int = util.THREADS,
aligner_args: str = "",
force: bool = False,
aligner_args: str = "",
offline: bool = False,
debug: bool = False,
) -> None:
Expand All @@ -40,15 +42,16 @@ def clean(
:arg fastq1: path to forward fastq[.gz] file
:arg fastq2: optional path to reverse fastq[.gz] file (short reads only)
:arg aligner: alignment algorithm. Defaults to minimap2 (long read) given fastq1 only or bowtie2 (short read)
given fastq1 and fastq2. Override with bowtie2 if cleaning single/unpaired short reads
given fastq1 and fastq2. Override with bowtie2 for single/unpaired short reads
:arg index: name of standard index or path to custom genome (Minimap2) or Bowtie2 index
:arg invert: keep only reads aligning to the target genome (and their mates if applicable)
:arg rename: replace read names with incrementing integers
:arg reorder: ensure deterministic output order
:arg out_dir: path to output directory
:arg stdout: send FASTQ to stdout instead of writing fastq.gz file(s). Sends log to stderr instead. Paired output is interleaved
:arg threads: number of alignment threads. A sensible default is chosen automatically
:arg aligner_args: additional arguments for alignment
:arg force: overwrite existing output files
:arg aligner_args: additional arguments for alignment
:arg offline: disable automatic index download
:arg debug: show debug messages
"""
Expand All @@ -73,6 +76,7 @@ def clean(
rename=rename,
reorder=reorder,
out_dir=out_dir,
stdout=stdout,
aligner=aligner_paired,
aligner_args=aligner_args,
threads=threads,
Expand All @@ -87,13 +91,14 @@ def clean(
rename=rename,
reorder=reorder,
out_dir=out_dir,
stdout=stdout,
aligner=aligner_unpaired,
aligner_args=aligner_args,
threads=threads,
force=force,
offline=offline,
)
print(json.dumps(stats, indent=4))
print(json.dumps(stats, indent=4), file=sys.stderr if stdout else sys.stdout)


def mask(
Expand Down Expand Up @@ -156,7 +161,6 @@ def main():
{"clean": clean, "mask": mask, "fetch": fetch},
no_negated_flags=True,
strict_kwonly=False,
short={},
)


Expand Down
4 changes: 4 additions & 0 deletions src/hostile/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def clean_fastqs(
rename: bool = False,
reorder: bool = False,
out_dir: Path = util.CWD,
stdout: bool = False,
aligner: ALIGNER = ALIGNER.minimap2,
aligner_args: str = "",
threads: int = util.THREADS,
Expand All @@ -168,6 +169,7 @@ def clean_fastqs(
aligner.value.gen_clean_cmd(
fastq=fastq,
out_dir=out_dir,
stdout=stdout,
index_path=index_path,
invert=invert,
rename=rename,
Expand Down Expand Up @@ -202,6 +204,7 @@ def clean_paired_fastqs(
rename: bool = False,
reorder: bool = False,
out_dir: Path = util.CWD,
stdout: bool = False,
aligner: ALIGNER = ALIGNER.bowtie2,
aligner_args: str = "",
threads: int = util.THREADS,
Expand Down Expand Up @@ -230,6 +233,7 @@ def clean_paired_fastqs(
aligner.value.gen_paired_clean_cmd(
fastq1=fastq_pair[0],
fastq2=fastq_pair[1],
stdout=stdout,
out_dir=out_dir,
index_path=index_path,
invert=invert,
Expand Down
5 changes: 4 additions & 1 deletion src/hostile/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,15 @@ def run(cmd: str, cwd: Path | None = None) -> subprocess.CompletedProcess:
def run_bash(cmd: str, cwd: Path | None = None) -> subprocess.CompletedProcess:
"""Needed because /bin/sh does not support process substitution used for tee"""
cmd_fmt = f"set -o pipefail; {cmd}"
import sys

return subprocess.run(
["/bin/bash", "-c", cmd_fmt],
cwd=cwd,
check=True,
text=True,
capture_output=True,
stdout=sys.stdout,
stderr=subprocess.PIPE,
)


Expand Down
Loading

0 comments on commit b5ccc77

Please sign in to comment.