Skip to content

Commit

Permalink
68 tests passing
Browse files: browse the repository at this point in the history
  • Loading branch information
bede committed Dec 18, 2024
1 parent f6ef634 commit fec40c6
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 64 deletions.
49 changes: 30 additions & 19 deletions src/hostile/aligner.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -31,7 +31,8 @@ class Aligner:
single_unindexed_cmd: str
paired_cmd: str
paired_unindexed_cmd: str
paired_interleaved_cmd: str
interleaved_cmd: str
interleaved_unindexed_cmd: str

def __post_init__(self):
Path(self.data_dir).mkdir(exist_ok=True, parents=True)
Expand Down Expand Up @@ -115,14 +116,15 @@ def check_index(self, index: str, airplane: bool = False) -> Path:

def gen_clean_cmd(
self,
fastq: Path,
fastq: Path | str,
index_path: Path,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdin: bool,
stdout: bool,
output: Path,
output: Path | str,
aligner_args: str,
aligner_threads: int,
compression_threads: int,
Expand Down Expand Up @@ -202,15 +204,16 @@ def gen_clean_cmd(

def gen_paired_clean_cmd(
self,
fastq1: Path,
fastq2: Path,
fastq1: Path | str,
fastq2: Path | str,
index_path: Path,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdin: bool,
stdout: bool,
output: Path,
output: Path | str,
aligner_args: str,
aligner_threads: int,
compression_threads: int,
Expand Down Expand Up @@ -273,20 +276,20 @@ def gen_paired_clean_cmd(
)

if self.name == "Minimap2":
if not mmi_path.is_file():
if str(fastq1) == "-" and str(fastq2) == "-": # Interleaved input
alignment_cmd = self.single_unindexed_cmd # Same as single
else:
if not mmi_path.is_file(): # No MMI, make one
if stdin: # Interleaved stdin
alignment_cmd = self.interleaved_unindexed_cmd
else: # Separate fastq1 and fastq2 file input
alignment_cmd = self.paired_unindexed_cmd
else:
if str(fastq1) == "-" and str(fastq2) == "-":
alignment_cmd = self.single_cmd
else:
else: # MMI exists
if stdin: # Interleaved stdin
alignment_cmd = self.interleaved_cmd
else: # Separate fastq1 and fastq2 file input
alignment_cmd = self.paired_cmd
else: # Bowtie2
if str(fastq1) == "-" and str(fastq2) == "-":
alignment_cmd = self.paired_interleaved_cmd
else:
if stdin: # Interleaved stdin
alignment_cmd = self.interleaved_cmd
else: # Separate fastq1 and fastq2 file input
alignment_cmd = self.paired_cmd

for k in cmd_template.keys():
Expand Down Expand Up @@ -333,12 +336,13 @@ def gen_paired_clean_cmd(
"{BIN_PATH} -x '{INDEX_PATH}' -1 '{FASTQ1}' -2 '{FASTQ2}'"
" -k 1 --mm -p {ALIGNER_THREADS} {ALIGNER_ARGS}"
),
paired_interleaved_cmd=(
interleaved_cmd=(
"{BIN_PATH} -x '{INDEX_PATH}' --interleaved -"
" -k 1 --mm -p {ALIGNER_THREADS} {ALIGNER_ARGS}"
),
single_unindexed_cmd="",
paired_unindexed_cmd="",
interleaved_unindexed_cmd="",
),
"minimap2": Aligner(
name="Minimap2",
Expand All @@ -360,7 +364,14 @@ def gen_paired_clean_cmd(
"'{BIN_PATH}' -ax sr --secondary no -t {ALIGNER_THREADS} {ALIGNER_ARGS}"
" -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ1}' '{FASTQ2}'"
),
paired_interleaved_cmd="",
interleaved_cmd=(
"'{BIN_PATH}' -ax sr --secondary no -t {ALIGNER_THREADS}"
" {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ1}'"
),
interleaved_unindexed_cmd=(
"'{BIN_PATH}' -ax sr --secondary no -t {ALIGNER_THREADS}"
" {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ1}'"
),
),
},
)
69 changes: 40 additions & 29 deletions src/hostile/lib.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -39,17 +39,19 @@ class SampleReport:


def gather_stats(
fastqs: list[Path],
fastqs: list[Path | None],
aligner: str,
index: str,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdin: bool,
stdout: bool,
output: Path,
) -> list[dict[str, str | int | float | list[str]]]:
stats = []
logging.debug(f"gather_stats() {fastqs=}")
for fastq1 in fastqs:
fastq1_stem = util.fastq_path_to_stem(fastq1)
fastq1_out_path = output / f"{fastq1_stem}.clean.fastq.gz"
Expand All @@ -71,6 +73,8 @@ def gather_stats(
"rename": rename,
"reorder": reorder,
"casava": casava,
"stdin": stdin,
"stdout": stdout,
}.items()
if v
]
Expand All @@ -79,8 +83,8 @@ def gather_stats(
aligner=aligner,
index=index,
options=options,
fastq1_in_name=Path(fastq1).name,
fastq1_in_path=str(fastq1),
fastq1_in_name=Path(fastq1).name if not stdin else None,
fastq1_in_path=str(fastq1) if not stdin else None,
fastq1_out_name=fastq1_out_path.name if not stdout else None,
fastq1_out_path=str(fastq1_out_path) if not stdout else None,
reads_in=n_reads_in,
Expand All @@ -93,13 +97,14 @@ def gather_stats(


def gather_stats_paired(
fastqs: list[tuple[Path, Path]],
fastqs: list[tuple[Path | None, Path | None]],
aligner: str,
index: str,
invert: bool,
rename: bool,
reorder: bool,
casava: bool,
stdin: bool,
stdout: bool,
output: Path,
) -> list[dict[str, str | int | float | list[str]]]:
Expand Down Expand Up @@ -128,6 +133,8 @@ def gather_stats_paired(
"rename": rename,
"reorder": reorder,
"casava": casava,
"stdin": stdin,
"stdout": stdout,
}.items()
if v
]
Expand All @@ -136,10 +143,10 @@ def gather_stats_paired(
aligner=aligner,
index=index,
options=options,
fastq1_in_name=Path(fastq1).name,
fastq2_in_name=Path(fastq2).name,
fastq1_in_path=str(fastq1),
fastq2_in_path=str(fastq2),
fastq1_in_name=Path(fastq1).name if not stdin else None,
fastq2_in_name=Path(fastq2).name if not stdin else None,
fastq1_in_path=str(fastq1) if not stdin else None,
fastq2_in_path=str(fastq2) if not stdin else None,
fastq1_out_name=fastq1_out_path.name if not stdout else None,
fastq2_out_name=fastq2_out_path.name if not stdout else None,
fastq1_out_path=str(fastq1_out_path) if not stdout else None,
Expand Down Expand Up @@ -168,27 +175,27 @@ def clean_fastqs(
force: bool = False,
airplane: bool = False,
):
output = Path(output)
stdin = str(fastqs[0]) == "-"
stdout = str(output) == "-"
output = Path(util.CWD) if stdout else Path(output)
aligner_threads, compression_threads = util.allocate_threads(threads, stdout=stdout)
logging.debug(
f"clean_fastqs() {threads=} {aligner_threads=} {compression_threads=}"
f" {util.CACHE_DIR=} {util.INDEX_REPOSITORY_URL=}"
)
logging.debug(f"{util.CACHE_DIR=}")
logging.debug(f"{util.INDEX_REPOSITORY_URL=}")
if aligner == ALIGNER.bowtie2:
logging.info(f"Hostile v{__version__}. Mode: short read (Bowtie2)")
logging.info(
f"Hostile v{__version__}. Mode: short read {'from stdin ' if stdin else ''}(Bowtie2)"
)
elif aligner == ALIGNER.minimap2:
logging.info(f"Hostile v{__version__}. Mode: long read (Minimap2)")
if len(fastqs) == 1 and fastqs[0] == "-":
stdin = True
else:
stdin = False
logging.info(
f"Hostile v{__version__}. Mode: long read {'from stdin ' if stdin else ''}(Minimap2)"
)
if not stdin:
fastqs = [Path(path).absolute() for path in fastqs]
if not all(fastq.is_file() for fastq in fastqs):
logging.debug(f"{fastqs=}")
raise FileNotFoundError("One or more fastq files do not exist")
output.mkdir(exist_ok=True, parents=True)
index_path = aligner.value.check_index(index, airplane=airplane)
backend_cmds = [
aligner.value.gen_clean_cmd(
Expand All @@ -198,6 +205,7 @@ def clean_fastqs(
rename=rename,
reorder=reorder,
casava=casava,
stdin=stdin,
stdout=stdout,
output=output,
aligner_args=aligner_args,
Expand All @@ -211,7 +219,7 @@ def clean_fastqs(
logging.info("Cleaning…")
if stdin:
util.run_bash(backend_cmds[0], stdin=True)
fastqs[0] = Path("stdin")
fastqs[0] = "stdin"
else:
util.run_bash_parallel(backend_cmds, description="Cleaning")
stats = gather_stats(
Expand All @@ -222,6 +230,7 @@ def clean_fastqs(
rename=rename,
reorder=reorder,
casava=casava,
stdin=stdin,
stdout=stdout,
output=output,
)
Expand All @@ -244,28 +253,28 @@ def clean_paired_fastqs(
force: bool = False,
airplane: bool = False,
):
output = Path(output)
stdin = str(fastqs[0][0]) == "-"
stdout = str(output) == "-"
output = Path(util.CWD) if stdout else Path(output)
aligner_threads, compression_threads = util.allocate_threads(threads, stdout=stdout)
logging.debug(
f"clean_paired_fastqs() {threads=} {aligner_threads=} {compression_threads=}"
f" {util.CACHE_DIR=} {util.INDEX_REPOSITORY_URL=}"
)
logging.debug(f"{util.CACHE_DIR=}")
logging.debug(f"{util.INDEX_REPOSITORY_URL=}")
if aligner == ALIGNER.bowtie2:
logging.info(f"Hostile v{__version__}. Mode: paired short read (Bowtie2)")
logging.info(
f"Hostile v{__version__}. Mode: paired short read {'from stdin ' if stdin else ''}(Bowtie2)"
)
elif aligner == ALIGNER.minimap2:
logging.info(f"Hostile v{__version__}. Mode: paired short read (Minimap2)")
if len(fastqs) == 1 and fastqs[0][0] == "-":
stdin = True
else:
stdin = False
logging.info(
f"Hostile v{__version__}. Mode: paired short read {'from stdin ' if stdin else ''}(Minimap2)"
)
if not stdin:
fastqs = [
(Path(path1).absolute(), Path(path2).absolute()) for path1, path2 in fastqs
]
if not all(path.is_file() for fastq_pair in fastqs for path in fastq_pair):
raise FileNotFoundError("One or more fastq files do not exist")
output.mkdir(exist_ok=True, parents=True)
index_path = aligner.value.check_index(index, airplane=airplane)
backend_cmds = [
aligner.value.gen_paired_clean_cmd(
Expand All @@ -276,6 +285,7 @@ def clean_paired_fastqs(
rename=rename,
reorder=reorder,
casava=casava,
stdin=stdin,
stdout=stdout,
output=output,
aligner_args=aligner_args,
Expand All @@ -300,6 +310,7 @@ def clean_paired_fastqs(
rename=rename,
reorder=reorder,
casava=casava,
stdin=stdin,
stdout=stdout,
output=output,
)
Expand Down
8 changes: 0 additions & 8 deletions tests/data/tuberculosis_2.fastq

This file was deleted.

Loading

0 comments on commit fec40c6

Please sign in to comment.