From 9634e7a3f3faf5d13f00f498fc8c53fa9225e31a Mon Sep 17 00:00:00 2001
From: Bede Constantinides <bedeabc@gmail.com>
Date: Fri, 15 Nov 2024 15:10:55 +0000
Subject: [PATCH] Clarify docs and CLI help wrt short/long read inference via
 fastq1 and fastq2 args (#47, #48)

---
 README.md              | 33 +++++++++++++++++----------------
 src/hostile/aligner.py |  4 ++--
 src/hostile/cli.py     |  9 +++++----
 tests/test_all.py      | 15 ++++++++++++++-
 4 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 1202807..950f359 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 # Hostile
 
-Hostile accurately removes host sequences from short and long read (meta)genomes, consuming paired or unpaired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. Hostile is precise by default, removing an [order of magnitude fewer microbial reads](https://log.bede.im/2023/08/29/precise-host-read-removal.html#evaluating-accuracy) than existing approaches while removing >99.5% of real human reads from 1000 Genomes Project samples. For the best possible retention of microbial reads, use an existing index masked against bacterial and/or viral genomes, or make your own using the built-in masking utility. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). Bowtie2 is the default aligner for short (paired) reads while Minimap2 is default aligner for long reads. In benchmarks, bacterial Illumina reads were decontaminated at 32Mbp/s (210k reads/sec) and bacterial ONT reads at 22Mbp/s, using 8 alignment threads. By default, Hostile requires 4GB of RAM for decontaminating short reads and 13GB for long reads (Minimap2). Further information and benchmarks can be found in the [paper](https://doi.org/10.1093/bioinformatics/btad728) and [blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html). Please open an issue to report problems [or](mailto:b@bede.im) [otherwise](https://twitter.com/beconsta) [reach](https://bsky.app/profile/bedec.bsky.social) [out](https://mstdn.science/@bede) for help, advice etc.
+Hostile accurately removes host sequences from short and long read (meta)genomes, consuming single-read or paired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. Hostile is precise by default, removing an [order of magnitude fewer microbial reads](https://log.bede.im/2023/08/29/precise-host-read-removal.html#evaluating-accuracy) than existing approaches while removing >99.5% of real human reads from 1000 Genomes Project samples. For the best possible retention of microbial reads, use an existing index masked against bacterial and/or viral genomes, or make your own using the built-in masking utility. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). Bowtie2 is the default aligner for short (paired) reads while Minimap2 is default aligner for long reads. In benchmarks, bacterial Illumina reads were decontaminated at 32Mbp/s (210k reads/sec) and bacterial ONT reads at 22Mbp/s, using 8 alignment threads. By default, Hostile requires 4GB of RAM for decontaminating short reads and 13GB for long reads (Minimap2). Further information and benchmarks can be found in the [paper](https://doi.org/10.1093/bioinformatics/btad728) and [blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html). Please open an issue to report problems or otherwise [reach](https://bsky.app/profile/bedec.bsky.social) [out](mailto:b@bede.im) for help and advice.
 
 
 
@@ -66,21 +66,21 @@ Hostile automatically downloads and caches the default index `human-t2t-hla` whe
 
 ```bash
 $ hostile clean -h
-usage: hostile clean [-h] --fastq1 FASTQ1 [--fastq2 FASTQ2] [--aligner {bowtie2,minimap2,auto}] [--index INDEX]
-                     [--invert] [--rename] [--reorder] [--out-dir OUT_DIR] [--threads THREADS]
-                     [--aligner-args ALIGNER_ARGS] [--force] [--offline] [--debug]
+usage: hostile clean [-h] --fastq1 FASTQ1 [--fastq2 FASTQ2] [--aligner {bowtie2,minimap2,auto}] [--index INDEX] [--invert] [--rename] [--reorder] [--out-dir OUT_DIR]
+                     [--threads THREADS] [--aligner-args ALIGNER_ARGS] [--force] [--offline] [--debug]
 
-Remove reads aligning to an index from fastq[.gz] input files
+Remove reads aligning to an index from fastq[.gz] input files.
 
 options:
   -h, --help            show this help message and exit
   --fastq1 FASTQ1       path to forward fastq[.gz] file
-  --fastq2 FASTQ2       optional path to reverse fastq[.gz] file
+  --fastq2 FASTQ2       optional path to reverse fastq[.gz] file (short reads only)
                         (default: None)
   --aligner {bowtie2,minimap2,auto}
-                        alignment algorithm. Default is Bowtie2 (paired reads) & Minimap2 (unpaired reads)
+                        alignment algorithm. Defaults to minimap2 (long read) given fastq1 only or bowtie2 (short read)
+                        given fastq1 and fastq2. Override with bowtie2 if cleaning single/unpaired short reads
                         (default: auto)
-  --index INDEX         name of standard index or path to custom genome/index
+  --index INDEX         name of standard index or path to custom genome (Minimap2) or Bowtie2 index
                         (default: human-t2t-hla)
   --invert              keep only reads aligning to the target genome (and their mates if applicable)
                         (default: False)
@@ -89,7 +89,7 @@ options:
   --reorder             ensure deterministic output order
                         (default: False)
   --out-dir OUT_DIR     path to output directory
-                        (default: /Users/bede/Research/Git/hostile)
+                        (default: ./)
   --threads THREADS     number of alignment threads. A sensible default is chosen automatically
                         (default: 5)
   --aligner-args ALIGNER_ARGS
@@ -105,7 +105,7 @@ options:
 
 
 
-**Short reads, default index**
+**Short paired reads, default index**
 
 ```bash
 $ hostile clean --fastq1 human_1_1.fastq.gz --fastq2 human_1_2.fastq.gz
@@ -135,8 +135,7 @@ INFO: Cleaning complete
 ]
 ```
 
-
-**Short reads, masked index, save log**
+**Short paired reads, masked index, save log**
 
 ```bash
 $ hostile clean --fastq1 human_1_1.fastq.gz --fastq2 human_1_2.fastq.gz --index human-t2t-hla-argos985 > log.json
@@ -148,9 +147,9 @@ INFO: Cleaning complete
 
 
 
-**Short unpaired reads, save log**
+**Short single/unpaired reads, save log**
 
-By default, single fastqs are assumed to be long reads. Override this by specifying `--aligner bowtie2` when decontaminating unpaired short reads.
+By default, single/unpaired fastqs are assumed to be long reads. Override this by specifying `--aligner bowtie2` when decontaminating unpaired short reads.
 
 ```bash
 $ hostile clean --aligner bowtie2 --fastq1 tests/data/human_1_1.fastq.gz > log.json
@@ -229,14 +228,16 @@ You may wish to use one of the existing [reference genomes](#reference-genomes--
 
 ## Limitations
 
-- Hostile prioritises retaining microbial sequences above discarding host sequences. If you strive to remove every last human sequence, other approaches may serve you better.
+- Hostile prioritises retaining microbial sequences above discarding host sequences. If you strive to remove every last human sequence, other approaches may serve you better ([blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html)).
 - Performance is not always improved by using all available CPU cores. A sensible default is therefore chosen automatically at runtime based on the number of available CPU cores.
-- Minimap2 has an overhead of 30-90s for human genome indexing prior to starting decontamination. Surprisingly, loading a prebuilt index is not significantly faster. I hope to mitigate this in a future release.
+- Minimap2 has an overhead of 30-90s for human genome indexing prior to starting decontamination, which frustrate users processing large numbers of small samples. Support for automatic Minimap2 index caching [is planned for a future release](https://github.com/bede/hostile/issues/39).
 
 
 
 ## Citation
 
+Please cite Hostile if you find it useful.
+
 Bede Constantinides, Martin Hunt, Derrick W Crook,  Hostile: accurate decontamination of microbial host sequences, *Bioinformatics*, 2023; btad728, https://doi.org/10.1093/bioinformatics/btad728
 
 ```latex
diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py
index 6e3e1ec..0406738 100644
--- a/src/hostile/aligner.py
+++ b/src/hostile/aligner.py
@@ -53,7 +53,7 @@ def check_index(self, index: str, offline: bool = False) -> Path:
                 index_path = self.data_dir / index
                 logging.info(f"Downloaded standard index {index_path}")
             else:
-                message = f"{index} is neither a valid custom index path nor a valid standard index name"
+                message = f"{index} is neither a valid custom {self.name} index path nor a valid standard index name. Mode: short read (Bowtie2)"
                 if offline:
                     message += (
                         ". Disable offline mode to enable discovery of standard indexes"
@@ -83,7 +83,7 @@ def check_index(self, index: str, offline: bool = False) -> Path:
                 index_path = self.data_dir / f"{index}.fa.gz"
                 logging.info(f"Downloaded standard index {index_path}")
             else:
-                message = f"{index} is neither a valid custom index path nor a valid standard index name"
+                message = f"{index} is neither a valid custom FASTA path, nor a valid standard index name. Mode: long read (Minimap2)"
                 if offline:
                     message += (
                         ". Disable offline mode to enable discovery of standard indexes"
diff --git a/src/hostile/cli.py b/src/hostile/cli.py
index ce0c1ef..e283e6b 100644
--- a/src/hostile/cli.py
+++ b/src/hostile/cli.py
@@ -35,12 +35,13 @@ def clean(
     debug: bool = False,
 ) -> None:
     """
-    Remove reads aligning to an index from fastq[.gz] input files
+    Remove reads aligning to an index from fastq[.gz] input files.
 
     :arg fastq1: path to forward fastq[.gz] file
-    :arg fastq2: optional path to reverse fastq[.gz] file
-    :arg aligner: alignment algorithm. Default is Bowtie2 (paired reads) & Minimap2 (unpaired reads)
-    :arg index: name of standard index or path to custom genome/index
+    :arg fastq2: optional path to reverse fastq[.gz] file (short reads only)
+    :arg aligner: alignment algorithm. Defaults to minimap2 (long read) given fastq1 only or bowtie2 (short read)
+        given fastq1 and fastq2. Override with bowtie2 if cleaning single/unpaired short reads
+    :arg index: name of standard index or path to custom genome (Minimap2) or Bowtie2 index
     :arg invert: keep only reads aligning to the target genome (and their mates if applicable)
     :arg rename: replace read names with incrementing integers
     :arg reorder: ensure deterministic output order
diff --git a/tests/test_all.py b/tests/test_all.py
index f8bdd82..3bc1f32 100644
--- a/tests/test_all.py
+++ b/tests/test_all.py
@@ -715,11 +715,24 @@ def test_mismatched_number_of_reads_bowtie2():
     shutil.rmtree(out_dir, ignore_errors=True)
 
 
-def test_offline_invalid_standard_index_name():
+def test_offline_invalid_mm2_standard_index_name():
     with pytest.raises(FileNotFoundError):
         lib.clean_fastqs(
             fastqs=[data_dir / "sars-cov-2_1_1.fastq"],
             index="invalid_index_name",
+            aligner=lib.ALIGNER.minimap2,
+            out_dir=out_dir,
+            offline=True,
+        )
+    shutil.rmtree(out_dir, ignore_errors=True)
+
+
+def test_offline_invalid_bt2_standard_index_name():
+    with pytest.raises(FileNotFoundError):
+        lib.clean_fastqs(
+            fastqs=[data_dir / "sars-cov-2_1_1.fastq"],
+            index="invalid_index_name",
+            aligner=lib.ALIGNER.bowtie2,
             out_dir=out_dir,
             offline=True,
         )