Skip to content

Commit

Permalink
Optional name-sorted deterministic read ordering (for Bowtie2)
Browse files Browse the repository at this point in the history
Bowtie2 read order is not deterministic with multiple threads.
While the --reorder flag provides determinism efficiently on Linux,
the overheads are catastrophic for performance on MacOS, hence this
rather more complicated solution using Samtools sort. The new flag
is --deterministic.
  • Loading branch information
bede committed Nov 22, 2023
1 parent 6b784c7 commit b5fada5
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Installation with conda/mamba or Docker is recommended due to non-Python depende
**Conda/mamba**

```bash
conda create -n hostile -c conda-forge -c bioconda hostile # Mamba is faster
conda create -y -n hostile -c conda-forge -c bioconda hostile
conda activate hostile
```

Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ dependencies:
- pip
- samtools>=1.17
- minimap2>=2.26
- bowtie2=2.4.5
- bowtie2=2.5.2
- gawk>=5.1.0
- bedtools>=2.31.0
8 changes: 8 additions & 0 deletions src/hostile/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def gen_clean_cmd(
out_dir: Path,
index: Path | None,
rename: bool,
deterministic: bool,
threads: int,
force: bool,
) -> str:
Expand Down Expand Up @@ -94,6 +95,7 @@ def gen_clean_cmd(
alignment_cmd = self.cmd
for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])
sort_cmd = " | samtools sort -n -O sam -@ 6 -m 1G -" if deterministic else ""
rename_cmd = (
' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int(NR)" "; print $0}}\''
if rename
Expand All @@ -108,6 +110,8 @@ def gen_clean_cmd(
f" | samtools view -f 4 -"
# Count reads in stream after filtering
f" | tee >(samtools view -F 256 -c - > '{count_after_path}')"
# Optionally sort reads by name for deterministic output
f"{sort_cmd}"
# Optionally replace read headers with integers
f"{rename_cmd}"
# Stream remaining records into fastq files
Expand All @@ -122,6 +126,7 @@ def gen_paired_clean_cmd(
out_dir: Path,
index: Path | None,
rename: bool,
deterministic: bool,
threads: int,
force: bool,
) -> str:
Expand Down Expand Up @@ -152,6 +157,7 @@ def gen_paired_clean_cmd(
alignment_cmd = self.paired_cmd
for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])
sort_cmd = " | samtools sort -n -O sam -@ 6 -m 1G -" if deterministic else ""
rename_cmd = (
f' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int((NR+1)/2)" "; print $0}}\''
if rename
Expand All @@ -166,6 +172,8 @@ def gen_paired_clean_cmd(
f" | samtools view -f 12 -"
# Count reads in stream after filtering
f" | tee >(samtools view -F 256 -c - > '{count_after_path}')"
# Optionally sort reads by name for deterministic output
f"{sort_cmd}"
# Optionally replace paired read headers with integers
f"{rename_cmd}"
# Stream remaining records into fastq files
Expand Down
12 changes: 8 additions & 4 deletions src/hostile/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,23 @@ def clean(
aligner: ALIGNER = ALIGNER.auto,
index: Path | None = None,
rename: bool = False,
deterministic: bool = False,
out_dir: Path = lib.CWD,
threads: int = lib.THREADS,
force: bool = False,
debug: bool = False,
) -> None:
"""
Remove host reads from fastq(.gz) files
Remove reads aligning to a target genome from fastq[.gz] input files
:arg fastq1: path to forward fastq(.gz) file
:arg fastq2: optional path to reverse fastq(.gz) file
:arg aligner: alignment algorithm
:arg index: path to custom genome or index. For Bowtie2, provide an index path without the .bt2 extension
:arg aligner: alignment algorithm. Use Bowtie2 for short reads and Minimap2 for long reads
:arg index: path to custom genome or index. For Bowtie2, provide an index without .bt2 extension
:arg rename: replace read names with incrementing integers
:arg deterministic: deterministic read ordering for Bowtie2 (redundant for Minimap2)
:arg out_dir: path to output directory
:arg threads: number of CPU threads to use
:arg threads: number of threads to use
:arg force: overwrite existing output files
:arg debug: show debug messages
"""
Expand All @@ -59,6 +61,7 @@ def clean(
[(fastq1, fastq2)],
index=index,
rename=rename,
deterministic=deterministic,
out_dir=out_dir,
aligner=aligner_paired,
threads=threads,
Expand All @@ -69,6 +72,7 @@ def clean(
[fastq1],
index=index,
rename=rename,
deterministic=deterministic,
out_dir=out_dir,
aligner=aligner_unpaired,
threads=threads,
Expand Down
6 changes: 5 additions & 1 deletion src/hostile/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ def clean_fastqs(
fastqs: list[Path],
index: Path | None = None,
rename: bool = False,
deterministic: bool = False,
out_dir: Path = CWD,
aligner: ALIGNER = ALIGNER.minimap2,
threads: int = THREADS,
Expand All @@ -218,6 +219,7 @@ def clean_fastqs(
out_dir=out_dir,
index=index,
rename=rename,
deterministic=deterministic,
threads=threads,
force=force,
)
Expand All @@ -236,6 +238,7 @@ def clean_paired_fastqs(
fastqs: list[tuple[Path, Path]],
index: Path | None = None,
rename: bool = False,
deterministic: bool = False,
out_dir: Path = CWD,
aligner: ALIGNER = ALIGNER.bowtie2,
threads: int = THREADS,
Expand All @@ -257,6 +260,7 @@ def clean_paired_fastqs(
out_dir=out_dir,
index=index,
rename=rename,
deterministic=deterministic,
threads=threads,
force=force,
)
Expand Down Expand Up @@ -292,7 +296,7 @@ def mask(
make_cmd = (
f"minimap2 -x asm10 -t {threads} '{reference_path}' '{target}'"
f" | awk -v OFS='\t' '{{print $6, $8, $9}}'"
f" | sort -k1,1 -k2,2n"
f" | deterministic -k1,1 -k2,2n"
f" | bedtools merge -i stdin > '{bed_path}'"
)
logging.info(f"Making mask ({make_cmd=})")
Expand Down

0 comments on commit b5fada5

Please sign in to comment.