diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py index 22120ff..2910b02 100644 --- a/src/hostile/aligner.py +++ b/src/hostile/aligner.py @@ -119,12 +119,13 @@ def check_index(self, index: str, offline: bool = False) -> Path: def gen_clean_cmd( self, fastq: Path, - out_dir: Path, - stdout: bool, index_path: Path, invert: bool, rename: bool, reorder: bool, + casava: bool, + stdout: bool, + out_dir: Path, aligner_args: str, aligner_threads: int, compression_threads: int, @@ -173,11 +174,15 @@ def gen_clean_cmd( for k in cmd_template.keys(): alignment_cmd = alignment_cmd.replace(k, cmd_template[k]) - # If we are streaming output, write to stdout instead of a file + if casava: + header_fmt = "-i --index-format 'i*'" + else: + header_fmt = "" + if stdout: - fastq_cmd = "samtools fastq --threads 0 -c 6 -0 -" # write to stdout + fastq_cmd = f"samtools fastq --threads 0 {header_fmt} -c 6 -0 -" else: - fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 -0 '{fastq_out_path}'" + fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 {header_fmt} -0 '{fastq_out_path}'" cmd = ( # Align, stream reads to stdout in SAM format @@ -202,12 +207,13 @@ def gen_paired_clean_cmd( self, fastq1: Path, fastq2: Path, - stdout: bool, - out_dir: Path, index_path: Path, invert: bool, rename: bool, reorder: bool, + casava: bool, + stdout: bool, + out_dir: Path, aligner_args: str, aligner_threads: int, compression_threads: int, @@ -277,10 +283,15 @@ def gen_paired_clean_cmd( for k in cmd_template.keys(): alignment_cmd = alignment_cmd.replace(k, cmd_template[k]) + if casava: + header_fmt = "-n -i --index-format 'i*'" + else: + header_fmt = "-N" + if stdout: - fastq_cmd = "samtools fastq --threads 0 -c 6 -N -0 -" + fastq_cmd = f"samtools fastq --threads 0 -c 6 {header_fmt} -0 -" else: - fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null" + fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 {header_fmt} -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null" cmd = ( f"{alignment_cmd}" diff --git a/src/hostile/cli.py b/src/hostile/cli.py index 20ac04c..390504b 100644 --- a/src/hostile/cli.py +++ b/src/hostile/cli.py @@ -28,11 +28,12 @@ def clean( invert: bool = False, rename: bool = False, reorder: bool = False, - out_dir: Path = util.CWD, stdout: bool = False, + casava: bool = False, + out_dir: Path = util.CWD, + aligner_args: str = "", threads: int = util.CPU_COUNT, force: bool = False, - aligner_args: str = "", offline: bool = False, debug: bool = False, ) -> None: @@ -47,11 +48,12 @@ def clean( :arg invert: keep only reads aligning to the target genome (and their mates if applicable) :arg rename: replace read names with incrementing integers :arg reorder: ensure deterministic output order - :arg out_dir: path to output directory + :arg casava: use Casava 1.8+ read header format :arg stdout: send FASTQ to stdout instead of writing fastq.gz file(s). Sends log to stderr instead. Paired output is interleaved + :arg out_dir: path to output directory + :arg aligner_args: additional arguments for alignment :arg threads: number of alignment threads. A sensible default is chosen automatically :arg force: overwrite existing output files - :arg aligner_args: additional arguments for alignment :arg offline: disable automatic index download :arg debug: show debug messages """ @@ -63,7 +65,7 @@ def clean( if aligner == ALIGNER.auto or aligner == ALIGNER.bowtie2 else lib.ALIGNER.minimap2 ) - aligner_unpaired = ( + aligner_single = ( lib.ALIGNER.minimap2 if aligner == ALIGNER.auto or aligner == ALIGNER.minimap2 else lib.ALIGNER.bowtie2 @@ -71,13 +73,14 @@ def clean( if fastq2: stats = lib.clean_paired_fastqs( [(fastq1, fastq2)], + aligner=aligner_paired, index=index, invert=invert, rename=rename, reorder=reorder, - out_dir=out_dir, + casava=casava, stdout=stdout, - aligner=aligner_paired, + out_dir=out_dir, aligner_args=aligner_args, threads=threads, force=force, @@ -86,13 +89,14 @@ def clean( else: stats = lib.clean_fastqs( [fastq1], + aligner=aligner_single, index=index, invert=invert, rename=rename, reorder=reorder, - out_dir=out_dir, + casava=casava, stdout=stdout, - aligner=aligner_unpaired, + out_dir=out_dir, aligner_args=aligner_args, threads=threads, force=force, diff --git a/src/hostile/lib.py b/src/hostile/lib.py index 5b76a83..06805cb 100644 --- a/src/hostile/lib.py +++ b/src/hostile/lib.py @@ -39,14 +39,15 @@ class SampleReport: def gather_stats( - rename: bool, - reorder: bool, fastqs: list[Path], - out_dir: Path, aligner: str, - invert: bool, index: str, + invert: bool, + rename: bool, + reorder: bool, + casava: bool, stdout: bool, + out_dir: Path, ) -> list[dict[str, str | int | float | list[str]]]: stats = [] for fastq1 in fastqs: @@ -69,6 +70,7 @@ def gather_stats( "invert": invert, "rename": rename, "reorder": reorder, + "casava": casava, "stdout": stdout, }.items() if v @@ -92,14 +94,15 @@ def gather_stats( def gather_stats_paired( - rename: bool, - reorder: bool, fastqs: list[tuple[Path, Path]], - out_dir: Path, aligner: str, index: str, invert: bool, + rename: bool, + reorder: bool, + casava: bool, stdout: bool, + out_dir: Path, ) -> list[dict[str, str | int | float]]: stats = [] for fastq1, fastq2 in fastqs: @@ -124,6 +127,7 @@ def gather_stats_paired( "invert": invert, "rename": rename, "reorder": reorder, + "casava": casava, "stdout": stdout, }.items() if v @@ -153,13 +157,14 @@ def gather_stats_paired( def clean_fastqs( fastqs: list[Path], + aligner: ALIGNER = ALIGNER.minimap2, index: str = util.DEFAULT_INDEX_NAME, invert: bool = False, rename: bool = False, reorder: bool = False, - out_dir: Path = util.CWD, + casava: bool = False, stdout: bool = False, - aligner: ALIGNER = ALIGNER.minimap2, + out_dir: Path = util.CWD, aligner_args: str = "", threads: int = util.CPU_COUNT, force: bool = False, @@ -183,12 +188,13 @@ def clean_fastqs( backend_cmds = [ aligner.value.gen_clean_cmd( fastq=fastq, - out_dir=out_dir, - stdout=stdout, index_path=index_path, invert=invert, rename=rename, reorder=reorder, + casava=casava, + stdout=stdout, + out_dir=out_dir, aligner_args=aligner_args, aligner_threads=aligner_threads, compression_threads=compression_threads, @@ -200,14 +206,15 @@ def clean_fastqs( logging.info("Cleaning…") util.run_bash_parallel(backend_cmds, description="Cleaning") stats = gather_stats( - rename=rename, - reorder=reorder, fastqs=fastqs, - out_dir=out_dir, aligner=aligner.name, index=index, invert=invert, + rename=rename, + reorder=reorder, + casava=casava, stdout=stdout, + out_dir=out_dir, ) util.fix_empty_fastqs(stats) logging.info("Cleaning complete") @@ -216,13 +223,14 @@ def clean_fastqs( def clean_paired_fastqs( fastqs: list[tuple[Path, Path]], + aligner: ALIGNER = ALIGNER.bowtie2, index: str = util.DEFAULT_INDEX_NAME, invert: bool = False, rename: bool = False, reorder: bool = False, - out_dir: Path = util.CWD, + casava: bool = False, stdout: bool = False, - aligner: ALIGNER = ALIGNER.bowtie2, + out_dir: Path = util.CWD, aligner_args: str = "", threads: int = util.CPU_COUNT, force: bool = False, @@ -249,12 +257,13 @@ def clean_paired_fastqs( aligner.value.gen_paired_clean_cmd( fastq1=fastq_pair[0], fastq2=fastq_pair[1], - stdout=stdout, - out_dir=out_dir, index_path=index_path, invert=invert, rename=rename, reorder=reorder, + casava=casava, + stdout=stdout, + out_dir=out_dir, aligner_args=aligner_args, aligner_threads=aligner_threads, compression_threads=compression_threads, @@ -266,14 +275,15 @@ def clean_paired_fastqs( logging.info("Cleaning…") util.run_bash_parallel(backend_cmds, description="Cleaning") stats = gather_stats_paired( - rename=rename, - reorder=reorder, fastqs=fastqs, - out_dir=out_dir, aligner=aligner.name, index=index, invert=invert, + rename=rename, + reorder=reorder, + casava=casava, stdout=stdout, + out_dir=out_dir, ) util.fix_empty_fastqs(stats) logging.info("Cleaning complete")