Skip to content

Commit

Permalink
Support CASAVA 1.8+ style headers (--casava) #41
Browse files Browse the repository at this point in the history
  • Loading branch information
bede committed Dec 17, 2024
1 parent 0d47ee2 commit c71e024
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 39 deletions.
29 changes: 20 additions & 9 deletions src/hostile/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,13 @@ def check_index(self, index: str, offline: bool = False) -> Path:
def gen_clean_cmd(
self,
fastq: Path,
out_dir: Path,
stdout: bool,
index_path: Path,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdout: bool,
out_dir: Path,
aligner_args: str,
aligner_threads: int,
compression_threads: int,
Expand Down Expand Up @@ -173,11 +174,15 @@ def gen_clean_cmd(
for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])

# If we are streaming output, write to stdout instead of a file
if casava:
header_fmt = "-i --index-format 'i*'"
else:
header_fmt = ""

if stdout:
fastq_cmd = "samtools fastq --threads 0 -c 6 -0 -" # write to stdout
fastq_cmd = f"samtools fastq --threads 0 {header_fmt} -c 6 -0 -"
else:
fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 -0 '{fastq_out_path}'"
fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 {header_fmt} -0 '{fastq_out_path}'"

cmd = (
# Align, stream reads to stdout in SAM format
Expand All @@ -202,12 +207,13 @@ def gen_paired_clean_cmd(
self,
fastq1: Path,
fastq2: Path,
stdout: bool,
out_dir: Path,
index_path: Path,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdout: bool,
out_dir: Path,
aligner_args: str,
aligner_threads: int,
compression_threads: int,
Expand Down Expand Up @@ -277,10 +283,15 @@ def gen_paired_clean_cmd(
for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])

if casava:
header_fmt = "-n -i --index-format 'i*'"
else:
header_fmt = "-N"

if stdout:
fastq_cmd = "samtools fastq --threads 0 -c 6 -N -0 -"
fastq_cmd = f"samtools fastq --threads 0 -c 6 {header_fmt} -0 -"
else:
fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null"
fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 {header_fmt} -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null"

cmd = (
f"{alignment_cmd}"
Expand Down
22 changes: 13 additions & 9 deletions src/hostile/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ def clean(
invert: bool = False,
rename: bool = False,
reorder: bool = False,
out_dir: Path = util.CWD,
stdout: bool = False,
casava: bool = False,
out_dir: Path = util.CWD,
aligner_args: str = "",
threads: int = util.CPU_COUNT,
force: bool = False,
aligner_args: str = "",
offline: bool = False,
debug: bool = False,
) -> None:
Expand All @@ -47,11 +48,12 @@ def clean(
:arg invert: keep only reads aligning to the target genome (and their mates if applicable)
:arg rename: replace read names with incrementing integers
:arg reorder: ensure deterministic output order
:arg out_dir: path to output directory
:arg casava: use Casava 1.8+ read header format
:arg stdout: send FASTQ to stdout instead of writing fastq.gz file(s). Sends log to stderr instead. Paired output is interleaved
:arg out_dir: path to output directory
:arg aligner_args: additional arguments for alignment
:arg threads: number of alignment threads. A sensible default is chosen automatically
:arg force: overwrite existing output files
:arg aligner_args: additional arguments for alignment
:arg offline: disable automatic index download
:arg debug: show debug messages
"""
Expand All @@ -63,21 +65,22 @@ def clean(
if aligner == ALIGNER.auto or aligner == ALIGNER.bowtie2
else lib.ALIGNER.minimap2
)
aligner_unpaired = (
aligner_single = (
lib.ALIGNER.minimap2
if aligner == ALIGNER.auto or aligner == ALIGNER.minimap2
else lib.ALIGNER.bowtie2
)
if fastq2:
stats = lib.clean_paired_fastqs(
[(fastq1, fastq2)],
aligner=aligner_paired,
index=index,
invert=invert,
rename=rename,
reorder=reorder,
out_dir=out_dir,
casava=casava,
stdout=stdout,
aligner=aligner_paired,
out_dir=out_dir,
aligner_args=aligner_args,
threads=threads,
force=force,
Expand All @@ -86,13 +89,14 @@ def clean(
else:
stats = lib.clean_fastqs(
[fastq1],
aligner=aligner_single,
index=index,
invert=invert,
rename=rename,
reorder=reorder,
out_dir=out_dir,
casava=casava,
stdout=stdout,
aligner=aligner_unpaired,
out_dir=out_dir,
aligner_args=aligner_args,
threads=threads,
force=force,
Expand Down
52 changes: 31 additions & 21 deletions src/hostile/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ class SampleReport:


def gather_stats(
rename: bool,
reorder: bool,
fastqs: list[Path],
out_dir: Path,
aligner: str,
invert: bool,
index: str,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdout: bool,
out_dir: Path,
) -> list[dict[str, str | int | float | list[str]]]:
stats = []
for fastq1 in fastqs:
Expand All @@ -69,6 +70,7 @@ def gather_stats(
"invert": invert,
"rename": rename,
"reorder": reorder,
"casava": casava,
"stdout": stdout,
}.items()
if v
Expand All @@ -92,14 +94,15 @@ def gather_stats(


def gather_stats_paired(
rename: bool,
reorder: bool,
fastqs: list[tuple[Path, Path]],
out_dir: Path,
aligner: str,
index: str,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdout: bool,
out_dir: Path,
) -> list[dict[str, str | int | float]]:
stats = []
for fastq1, fastq2 in fastqs:
Expand All @@ -124,6 +127,7 @@ def gather_stats_paired(
"invert": invert,
"rename": rename,
"reorder": reorder,
"casava": casava,
"stdout": stdout,
}.items()
if v
Expand Down Expand Up @@ -153,13 +157,14 @@ def gather_stats_paired(

def clean_fastqs(
fastqs: list[Path],
aligner: ALIGNER = ALIGNER.minimap2,
index: str = util.DEFAULT_INDEX_NAME,
invert: bool = False,
rename: bool = False,
reorder: bool = False,
out_dir: Path = util.CWD,
casava: bool = False,
stdout: bool = False,
aligner: ALIGNER = ALIGNER.minimap2,
out_dir: Path = util.CWD,
aligner_args: str = "",
threads: int = util.CPU_COUNT,
force: bool = False,
Expand All @@ -183,12 +188,13 @@ def clean_fastqs(
backend_cmds = [
aligner.value.gen_clean_cmd(
fastq=fastq,
out_dir=out_dir,
stdout=stdout,
index_path=index_path,
invert=invert,
rename=rename,
reorder=reorder,
casava=casava,
stdout=stdout,
out_dir=out_dir,
aligner_args=aligner_args,
aligner_threads=aligner_threads,
compression_threads=compression_threads,
Expand All @@ -200,14 +206,15 @@ def clean_fastqs(
logging.info("Cleaning…")
util.run_bash_parallel(backend_cmds, description="Cleaning")
stats = gather_stats(
rename=rename,
reorder=reorder,
fastqs=fastqs,
out_dir=out_dir,
aligner=aligner.name,
index=index,
invert=invert,
rename=rename,
reorder=reorder,
casava=casava,
stdout=stdout,
out_dir=out_dir,
)
util.fix_empty_fastqs(stats)
logging.info("Cleaning complete")
Expand All @@ -216,13 +223,14 @@ def clean_fastqs(

def clean_paired_fastqs(
fastqs: list[tuple[Path, Path]],
aligner: ALIGNER = ALIGNER.bowtie2,
index: str = util.DEFAULT_INDEX_NAME,
invert: bool = False,
rename: bool = False,
reorder: bool = False,
out_dir: Path = util.CWD,
casava: bool = False,
stdout: bool = False,
aligner: ALIGNER = ALIGNER.bowtie2,
out_dir: Path = util.CWD,
aligner_args: str = "",
threads: int = util.CPU_COUNT,
force: bool = False,
Expand All @@ -249,12 +257,13 @@ def clean_paired_fastqs(
aligner.value.gen_paired_clean_cmd(
fastq1=fastq_pair[0],
fastq2=fastq_pair[1],
stdout=stdout,
out_dir=out_dir,
index_path=index_path,
invert=invert,
rename=rename,
reorder=reorder,
casava=casava,
stdout=stdout,
out_dir=out_dir,
aligner_args=aligner_args,
aligner_threads=aligner_threads,
compression_threads=compression_threads,
Expand All @@ -266,14 +275,15 @@ def clean_paired_fastqs(
logging.info("Cleaning…")
util.run_bash_parallel(backend_cmds, description="Cleaning")
stats = gather_stats_paired(
rename=rename,
reorder=reorder,
fastqs=fastqs,
out_dir=out_dir,
aligner=aligner.name,
index=index,
invert=invert,
rename=rename,
reorder=reorder,
casava=casava,
stdout=stdout,
out_dir=out_dir,
)
util.fix_empty_fastqs(stats)
logging.info("Cleaning complete")
Expand Down
34 changes: 34 additions & 0 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,3 +897,37 @@ def test_log_keys_stdout():
"fastq1_in_path",
"fastq2_in_path",
}


def test_casava_single():
run_cmd = run(
f"hostile clean --aligner bowtie2 --index {data_dir}/sars-cov-2/sars-cov-2 --fastq1 {data_dir}/tuberculosis_1_1.fastq -s --casava"
)
stdout_lines = run_cmd.stdout.split("\n")
assert stdout_lines[0] == "@Mycobacterium_tuberculosis 0:N:0:0"


def test_casava_single_rename():
run_cmd = run(
f"hostile clean --aligner bowtie2 --index {data_dir}/sars-cov-2/sars-cov-2 --fastq1 {data_dir}/tuberculosis_1_1.fastq -s -c --rename"
)
stdout_lines = run_cmd.stdout.split("\n")
assert stdout_lines[0] == "@1 0:N:0:0"


def test_casava_paired():
run_cmd = run(
f"hostile clean --index {data_dir}/sars-cov-2/sars-cov-2 --fastq1 {data_dir}/tuberculosis_1_1.fastq --fastq2 {data_dir}/tuberculosis_1_2.fastq -s --casava"
)
stdout_lines = run_cmd.stdout.split("\n")
assert stdout_lines[0] == "@Mycobacterium_tuberculosis 1:N:0:0"
assert stdout_lines[4] == "@Mycobacterium_tuberculosis 2:N:0:0"


def test_casava_paired_rename():
run_cmd = run(
f"hostile clean --index {data_dir}/sars-cov-2/sars-cov-2 --fastq1 {data_dir}/tuberculosis_1_1.fastq --fastq2 {data_dir}/tuberculosis_1_2.fastq -s -c --rename"
)
stdout_lines = run_cmd.stdout.split("\n")
assert stdout_lines[0] == "@1 1:N:0:0"
assert stdout_lines[4] == "@1 2:N:0:0"

0 comments on commit c71e024

Please sign in to comment.