From 637e387e8693a3c5962ee4d218e859dfc7a33998 Mon Sep 17 00:00:00 2001
From: Bede Constantinides <bedeabc@gmail.com>
Date: Mon, 16 Dec 2024 17:50:41 +0000
Subject: [PATCH] Smart thread count allocation between alignment and
 compression

---
 README.md               | 95 +++++++++++++++++++++++------------------
 src/hostile/__init__.py |  2 +-
 src/hostile/aligner.py  | 30 +++++++------
 src/hostile/cli.py      |  2 +-
 src/hostile/lib.py      | 32 ++++++++------
 src/hostile/util.py     | 32 +++++++++-----
 6 files changed, 111 insertions(+), 82 deletions(-)

diff --git a/README.md b/README.md
index b1b5ddc..b5c322c 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,13 @@
 
 # Hostile
 
-Hostile accurately removes host sequences from short and long read (meta)genomes, consuming single-read or paired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. Hostile is precise by default, removing an [order of magnitude fewer microbial reads](https://log.bede.im/2023/08/29/precise-host-read-removal.html#evaluating-accuracy) than existing approaches while removing >99.5% of real human reads from 1000 Genomes Project samples. For the best possible retention of microbial reads, use an existing index masked against bacterial and/or viral genomes, or make your own using the built-in masking utility. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). Bowtie2 is the default aligner for short (paired) reads while Minimap2 is default aligner for long reads. In benchmarks, bacterial Illumina reads were decontaminated at 32Mbp/s (210k reads/sec) and bacterial ONT reads at 22Mbp/s, using 8 alignment threads. By default, Hostile requires 4GB of RAM for decontaminating short reads and 13GB for long reads (Minimap2). Further information and benchmarks can be found in the [paper](https://doi.org/10.1093/bioinformatics/btad728) and [blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html). Please open an issue to report problems or otherwise [reach](https://bsky.app/profile/bedec.bsky.social) [out](mailto:b@bede.im) for help and advice.
+Hostile accurately removes host sequences from short and long read (meta)genomes, consuming single-read or paired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. Hostile is precise by default, removing an [order of magnitude fewer microbial reads](https://log.bede.im/2023/08/29/precise-host-read-removal.html#evaluating-accuracy) than existing approaches while removing >99.5% of real human reads from 1000 Genomes Project samples. For the best possible retention of microbial reads, use an existing index masked against bacterial and/or viral genomes, or make your own using the built-in masking utility. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). In benchmarks, bacterial Illumina reads were decontaminated at 32Mbp/s (210k reads/sec) and bacterial ONT reads at 22Mbp/s, using 8 alignment threads. In typical use, Hostile requires 4GB of RAM for decontaminating short reads (Bowtie2) and 13GB for long reads (Minimap2). Further information and benchmarks can be found in the [paper](https://doi.org/10.1093/bioinformatics/btad728) and [blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html). Please open an issue to report problems or otherwise [reach](https://bsky.app/profile/bedec.bsky.social) [out](mailto:b@bede.im) for help and advice.
 
 
 
 ## Indexes
 
-The default index `human-t2t-hla` comprises [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) and [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51, and is downloaded automatically when running Hostile unless another index is specified. Marginally higher microbial sequence retention may be possible using masked indexes. The index `human-t2t-hla-argos985` is masked against [985 reference grade bacterial genomes](https://www.ncbi.nlm.nih.gov/bioproject/231221) including common human pathogens, while `human-t2t-hla.argos-bacteria-985_rs-viral-202401_ml-phage-202401` is further masked against all known virus and phage genomes. The latter should be used when retention of viral sequences is a priority. To use a standard index, simply pass its name as the value of the `--index` argument, which takes care of downloading and caching the relevant index. [Object storage](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o) is provided by the [ModMedMicro research unit](https://www.expmedndm.ox.ac.uk/modernising-medical-microbiology) at the University of Oxford. Custom indexes are also supported (see below).
+The default index `human-t2t-hla` comprises [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/11828891) and [IPD-IMGT/HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) v3.51, and is downloaded automatically when running Hostile unless another index is specified. Higher microbial sequence retention may be possible using masked indexes, which are very easy to use. The index `human-t2t-hla-argos985` is masked against [985 reference grade bacterial genomes](https://www.ncbi.nlm.nih.gov/bioproject/231221) including common human pathogens, while `human-t2t-hla.argos-bacteria-985_rs-viral-202401_ml-phage-202401` is further masked against all known virus and phage genomes. The latter should be used when retention of viral sequences is a priority. To use a standard index, simply pass its name as the value of the `--index` argument, which takes care of downloading and caching the relevant index. [Object storage](https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o) is provided by the [ModMedMicro research unit](https://www.expmedndm.ox.ac.uk/modernising-medical-microbiology) at the University of Oxford. Custom indexes are also supported (see below).
 
 |                             Name                             |                         Composition                          | Date    | Masked positions       |
 | :----------------------------------------------------------: | :----------------------------------------------------------: | ------- | ---------------------- |
@@ -50,7 +50,24 @@ A [Biocontainer image](https://biocontainers.pro/tools/hostile) is also availabl
 
 
 
-## Using non-default (including custom) indexes
+## Getting started
+
+```bash
+# Long reads
+hostile clean --fastq1 long.fastq.gz  # Creates long.clean.fastq.gz
+hostile clean --fastq1 long.fastq.gz --index mouse-mm39  # Use mouse index
+hostile clean --fastq1 long.fastq.gz --stdout > long.clean.fastq  # Send to stdout
+hostile clean --invert --fastq1 long.fastq.gz  # Keep only host reads
+
+# Short reads
+hostile clean --fastq1 short.r1.fq.gz --aligner bowtie2  # Single/unpaired
+hostile clean --fastq1 short.r1.fq.gz --fastq2 short.r2.fq.gz  # Paired
+hostile clean --fastq1 short.r1.fq.gz --fastq2 short.r2.fq.gz --stdout > clean.fq  # Send interleaved read pairs to stdout
+```
+
+
+
+## Custom indexes
 
 - To download ahead of time and cache the default index (`human-t2t-hla`), run `hostile fetch`
 - To list available standard indexes, run `hostile fetch --list`
@@ -111,93 +128,89 @@ options:
 
 
 
-**Long reads, default index**
+**Long reads**
 
 Writes compressed fastq.gz files to current working directory, sends log to stdout
 ```bash
 $ hostile clean --fastq1 tests/data/tuberculosis_1_1.fastq.gz
-INFO: Hostile version 1.0.0. Mode: long read (Minimap2)
-INFO: Found cached standard index human-t2t-hla
+INFO: Hostile v2.0.0. Mode: long read (Minimap2)
+INFO: Found cached standard index human-t2t-hla (MMI available)
 INFO: Cleaning…
 INFO: Cleaning complete
 [
     {
-        "version": "1.0.0",
+        "version": "2.0.0",
         "aligner": "minimap2",
         "index": "human-t2t-hla",
         "options": [],
         "fastq1_in_name": "tuberculosis_1_1.fastq.gz",
         "fastq1_in_path": "/Users/bede/Research/git/hostile/tests/data/tuberculosis_1_1.fastq.gz",
-        "fastq1_out_name": "tuberculosis_1_1.clean.fastq.gz",
-        "fastq1_out_path": "/Users/bede/Research/git/hostile/tuberculosis_1_1.clean.fastq.gz",
         "reads_in": 1,
         "reads_out": 1,
         "reads_removed": 0,
-        "reads_removed_proportion": 0.0
+        "reads_removed_proportion": 0.0,
+        "fastq1_out_name": "tuberculosis_1_1.clean.fastq.gz",
+        "fastq1_out_path": "/Users/bede/Research/git/hostile/tuberculosis_1_1.clean.fastq.gz"
     }
 ]
-
 ```
 
 
 
-**Long reads, default index, stream reads to stdout**
-
-Sends uncompressed FASTQ to stdout, log to stderr
+**Long reads, send reads to stdout**
 
 ```bash
-$ hostile clean --fastq1 tests/data/tuberculosis_1_1.fastq.gz
-INFO: Hostile version 1.0.0. Mode: long read (Minimap2)
-INFO: Found cached standard index human-t2t-hla
+$ hostile clean --fastq1 tests/data/tuberculosis_1_1.fastq.gz --stdout > out.fastq
+INFO: Hostile v2.0.0. Mode: long read (Minimap2)
+INFO: Found cached standard index human-t2t-hla (MMI available)
 INFO: Cleaning…
 INFO: Cleaning complete
 [
     {
-        "version": "1.0.0",
+        "version": "2.0.0",
         "aligner": "minimap2",
         "index": "human-t2t-hla",
-        "options": [],
+        "options": [
+            "stdout"
+        ],
         "fastq1_in_name": "tuberculosis_1_1.fastq.gz",
         "fastq1_in_path": "/Users/bede/Research/git/hostile/tests/data/tuberculosis_1_1.fastq.gz",
-        "fastq1_out_name": "tuberculosis_1_1.clean.fastq.gz",
-        "fastq1_out_path": "/Users/bede/Research/git/hostile/tuberculosis_1_1.clean.fastq.gz",
         "reads_in": 1,
         "reads_out": 1,
         "reads_removed": 0,
         "reads_removed_proportion": 0.0
     }
 ]
-
 ```
 
 
 
-**Short paired reads, default index**
+**Short paired reads**
 
 ```bash
 $ hostile clean --fastq1 human_1_1.fastq.gz --fastq2 human_1_2.fastq.gz --aligner bowtie2
-INFO: Hostile version 1.0.0. Mode: paired short read (Bowtie2)
-INFO: Found cached standard index human-t2t-hla
-INFO: Cleaning…
-INFO: Cleaning complete
+14:40:51 INFO: Hostile v2.0.0. Mode: paired short read (Bowtie2)
+14:40:51 INFO: Found cached standard index human-t2t-hla
+14:40:51 INFO: Cleaning…
+14:40:52 INFO: Cleaning complete
 [
     {
-        "version": "1.0.0",
+        "version": "2.0.0",
         "aligner": "bowtie2",
         "index": "human-t2t-hla",
         "options": [],
         "fastq1_in_name": "human_1_1.fastq.gz",
-        "fastq1_in_path": "/Users/bede/human_1_1.fastq.gz",
-        "fastq1_out_name": "human_1_1.clean_1.fastq.gz",
-        "fastq1_out_path": "/Users/bede/human_1_1.clean_1.fastq.gz",
+        "fastq1_in_path": "/Users/bede/Research/git/hostile/tests/data/human_1_1.fastq.gz",
         "reads_in": 2,
         "reads_out": 0,
         "reads_removed": 2,
         "reads_removed_proportion": 1.0,
         "fastq2_in_name": "human_1_2.fastq.gz",
-        "fastq2_in_path": "/Users/bede/human_1_2.fastq.gz",
+        "fastq2_in_path": "/Users/bede/Research/git/hostile/tests/data/human_1_2.fastq.gz",
+        "fastq1_out_name": "human_1_1.clean_1.fastq.gz",
+        "fastq1_out_path": "/Users/bede/Research/git/hostile/human_1_1.clean_1.fastq.gz",
         "fastq2_out_name": "human_1_2.clean_2.fastq.gz",
-        "fastq2_out_path": "/Users/bede/human_1_2.clean_2.fastq.gz"
+        "fastq2_out_path": "/Users/bede/Research/git/hostile/human_1_2.clean_2.fastq.gz"
     }
 ]
 ```
@@ -206,30 +219,28 @@ INFO: Cleaning complete
 
 ```bash
 $ hostile clean --fastq1 human_1_1.fastq.gz --fastq2 human_1_2.fastq.gz --aligner bowtie2 --index human-t2t-hla-argos985 > log.json
-INFO: Hostile version 1.0.0. Mode: paired short read (Bowtie2)
-INFO: Found cached standard index human-t2t-hla
+INFO: Hostile v2.0.0. Mode: paired short read (Bowtie2)
+INFO: Found cached standard index human-t2t-hla-argos985
 INFO: Cleaning…
 INFO: Cleaning complete
 ```
 
 
 
-**Short single/unpaired reads, save log**
+**Short single/unpaired reads, compress with Zstandard**
 
-By default, single/unpaired fastqs are assumed to be long reads. Override this by specifying `--aligner bowtie2` when decontaminating unpaired short reads.
+By default, single/unpaired fastqs are assumed to be long reads. Be sure to override this with `--aligner bowtie2` when decontaminating single/unpaired short reads.
 
 ```bash
-$ hostile clean --aligner bowtie2 --fastq1 tests/data/human_1_1.fastq.gz > log.json
-INFO: Hostile version 1.0.0. Mode: short read (Bowtie2)
-INFO: Found cached standard index human-t2t-hla
+$ hostile clean --fastq1 human_1_1.fastq.gz --aligner bowtie2 --stdout | zstd > human_1_1.clean.fastq.zst
+INFO: Hostile v2.0.0. Mode: short read (Bowtie2)
+INFO: Found cached standard index human-t2t-hla
 INFO: Cleaning…
 INFO: Cleaning complete
 ```
 
 
 
-
-
 ## Python usage
 
 ```python
diff --git a/src/hostile/__init__.py b/src/hostile/__init__.py
index 1a609c5..3409fe5 100644
--- a/src/hostile/__init__.py
+++ b/src/hostile/__init__.py
@@ -1,3 +1,3 @@
 """Accurate host read removal"""
 
-__version__ = "1.1.0"
+__version__ = "2.0.0"
diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py
index a4709cf..22120ff 100644
--- a/src/hostile/aligner.py
+++ b/src/hostile/aligner.py
@@ -126,7 +126,8 @@ def gen_clean_cmd(
         rename: bool,
         reorder: bool,
         aligner_args: str,
-        threads: int,
+        aligner_threads: int,
+        compression_threads: int,
         force: bool,
     ) -> str:
         fastq, out_dir = Path(fastq), Path(out_dir)
@@ -161,7 +162,7 @@ def gen_clean_cmd(
             "{MMI_PATH}": str(mmi_path),
             "{FASTQ}": str(fastq),
             "{ALIGNER_ARGS}": str(aligner_args),
-            "{THREADS}": str(threads),
+            "{ALIGNER_THREADS}": str(aligner_threads),
         }
 
         if self.name == "Minimap2" and not mmi_path.is_file():
@@ -174,9 +175,9 @@ def gen_clean_cmd(
 
         # If we are streaming output, write to stdout instead of a file
         if stdout:
-            fastq_cmd = "samtools fastq --threads 4 -c 6 -0 -"  # write to stdout
+            fastq_cmd = "samtools fastq --threads 0 -c 6 -0 -"  # write to stdout
         else:
-            fastq_cmd = f"samtools fastq --threads 4 -c 6 -0 '{fastq_out_path}'"
+            fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 -0 '{fastq_out_path}'"
 
         cmd = (
             # Align, stream reads to stdout in SAM format
@@ -208,7 +209,8 @@ def gen_paired_clean_cmd(
         rename: bool,
         reorder: bool,
         aligner_args: str,
-        threads: int,
+        aligner_threads: int,
+        compression_threads: int,
         force: bool,
     ) -> str:
         fastq1, fastq2, out_dir = Path(fastq1), Path(fastq2), Path(out_dir)
@@ -259,7 +261,7 @@ def gen_paired_clean_cmd(
             "{FASTQ1}": str(fastq1),
             "{FASTQ2}": str(fastq2),
             "{ALIGNER_ARGS}": str(aligner_args),
-            "{THREADS}": str(threads),
+            "{ALIGNER_THREADS}": str(aligner_threads),
         }
 
         if self.name == "Minimap2":
@@ -276,9 +278,9 @@ def gen_paired_clean_cmd(
             alignment_cmd = alignment_cmd.replace(k, cmd_template[k])
 
         if stdout:
-            fastq_cmd = "samtools fastq --threads 4 -c 6 -N -0 -"
+            fastq_cmd = "samtools fastq --threads 0 -c 6 -N -0 -"
         else:
-            fastq_cmd = f"samtools fastq --threads 4 -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null"
+            fastq_cmd = f"samtools fastq --threads {compression_threads} -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}' -0 /dev/null -s /dev/null"
 
         cmd = (
             f"{alignment_cmd}"
@@ -302,12 +304,12 @@ def gen_paired_clean_cmd(
             data_dir=util.CACHE_DIR,
             single_cmd=(
                 "'{BIN_PATH}' -x '{INDEX_PATH}' -U '{FASTQ}'"
-                " -k 1 --mm -p {THREADS} {ALIGNER_ARGS}"
+                " -k 1 --mm -p {ALIGNER_THREADS} {ALIGNER_ARGS}"
             ),
             single_unindexed_cmd="",
             paired_cmd=(
                 "{BIN_PATH} -x '{INDEX_PATH}' -1 '{FASTQ1}' -2 '{FASTQ2}'"
-                " -k 1 --mm -p {THREADS} {ALIGNER_ARGS}"
+                " -k 1 --mm -p {ALIGNER_THREADS} {ALIGNER_ARGS}"
             ),
             paired_unindexed_cmd="",
         ),
@@ -315,10 +317,10 @@ def gen_paired_clean_cmd(
             name="Minimap2",
             bin_path=Path("minimap2"),
             data_dir=util.CACHE_DIR,
-            single_cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {THREADS} {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ}'",
-            single_unindexed_cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {THREADS} {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ}'",
-            paired_cmd="'{BIN_PATH}' -ax sr --secondary no -t {THREADS} {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ1}' '{FASTQ2}'",
-            paired_unindexed_cmd="'{BIN_PATH}' -ax sr --secondary no -t {THREADS} {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ1}' '{FASTQ2}'",
+            single_cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {ALIGNER_THREADS} {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ}'",
+            single_unindexed_cmd="'{BIN_PATH}' -ax map-ont --secondary no -t {ALIGNER_THREADS} {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ}'",
+            paired_cmd="'{BIN_PATH}' -ax sr --secondary no -t {ALIGNER_THREADS} {ALIGNER_ARGS} '{MMI_PATH}' '{FASTQ1}' '{FASTQ2}'",
+            paired_unindexed_cmd="'{BIN_PATH}' -ax sr --secondary no -t {ALIGNER_THREADS} {ALIGNER_ARGS} -d '{MMI_PATH}' '{INDEX_PATH}' '{FASTQ1}' '{FASTQ2}'",
         ),
     },
 )
diff --git a/src/hostile/cli.py b/src/hostile/cli.py
index 035a725..20ac04c 100644
--- a/src/hostile/cli.py
+++ b/src/hostile/cli.py
@@ -30,7 +30,7 @@ def clean(
     reorder: bool = False,
     out_dir: Path = util.CWD,
     stdout: bool = False,
-    threads: int = util.THREADS,
+    threads: int = util.CPU_COUNT,
     force: bool = False,
     aligner_args: str = "",
     offline: bool = False,
diff --git a/src/hostile/lib.py b/src/hostile/lib.py
index b02d309..5b76a83 100644
--- a/src/hostile/lib.py
+++ b/src/hostile/lib.py
@@ -161,17 +161,20 @@ def clean_fastqs(
     stdout: bool = False,
     aligner: ALIGNER = ALIGNER.minimap2,
     aligner_args: str = "",
-    threads: int = util.THREADS,
+    threads: int = util.CPU_COUNT,
     force: bool = False,
     offline: bool = False,
 ):
-    logging.debug(f"clean_fastqs() {threads=}")
+    aligner_threads, compression_threads = util.allocate_threads(threads, stdout=stdout)
+    logging.debug(
+        f"clean_fastqs() {threads=} {aligner_threads=} {compression_threads=}"
+    )
     logging.debug(f"{util.CACHE_DIR=}")
     logging.debug(f"{util.INDEX_REPOSITORY_URL=}")
     if aligner == ALIGNER.bowtie2:
-        logging.info(f"Hostile version {__version__}. Mode: short read (Bowtie2)")
+        logging.info(f"Hostile v{__version__}. Mode: short read (Bowtie2)")
     elif aligner == ALIGNER.minimap2:
-        logging.info(f"Hostile version {__version__}. Mode: long read (Minimap2)")
+        logging.info(f"Hostile v{__version__}. Mode: long read (Minimap2)")
     fastqs = [Path(path).absolute() for path in fastqs]
     if not all(fastq.is_file() for fastq in fastqs):
         raise FileNotFoundError("One or more fastq files do not exist")
@@ -187,7 +190,8 @@ def clean_fastqs(
             rename=rename,
             reorder=reorder,
             aligner_args=aligner_args,
-            threads=threads,
+            aligner_threads=aligner_threads,
+            compression_threads=compression_threads,
             force=force,
         )
         for fastq in fastqs
@@ -220,21 +224,20 @@ def clean_paired_fastqs(
     stdout: bool = False,
     aligner: ALIGNER = ALIGNER.bowtie2,
     aligner_args: str = "",
-    threads: int = util.THREADS,
+    threads: int = util.CPU_COUNT,
     force: bool = False,
     offline: bool = False,
 ):
-    logging.debug(f"clean_paired_fastqs() {threads=}")
+    aligner_threads, compression_threads = util.allocate_threads(threads, stdout=stdout)
+    logging.debug(
+        f"clean_paired_fastqs() {threads=} {aligner_threads=} {compression_threads=}"
+    )
     logging.debug(f"{util.CACHE_DIR=}")
     logging.debug(f"{util.INDEX_REPOSITORY_URL=}")
     if aligner == ALIGNER.bowtie2:
-        logging.info(
-            f"Hostile version {__version__}. Mode: paired short read (Bowtie2)"
-        )
+        logging.info(f"Hostile v{__version__}. Mode: paired short read (Bowtie2)")
     elif aligner == ALIGNER.minimap2:
-        logging.info(
-            f"Hostile version {__version__}. Mode: paired short read (Minimap2)"
-        )
+        logging.info(f"Hostile v{__version__}. Mode: paired short read (Minimap2)")
     fastqs = [
         (Path(path1).absolute(), Path(path2).absolute()) for path1, path2 in fastqs
     ]
@@ -253,7 +256,8 @@ def clean_paired_fastqs(
             rename=rename,
             reorder=reorder,
             aligner_args=aligner_args,
-            threads=threads,
+            aligner_threads=aligner_threads,
+            compression_threads=compression_threads,
             force=force,
         )
         for fastq_pair in fastqs
diff --git a/src/hostile/util.py b/src/hostile/util.py
index 9c364ae..d589335 100644
--- a/src/hostile/util.py
+++ b/src/hostile/util.py
@@ -18,15 +18,25 @@
 from tqdm import tqdm
 
 
-def choose_default_thread_count(cpu_count: int) -> int:
-    """Choose a sensible number of threads for alignment"""
-    cpu_count = int(cpu_count)
-    if cpu_count <= 1:
-        return 1
-    elif 1 < cpu_count < 17:
-        return int(cpu_count / 2)
-    else:
-        return 10
+def allocate_threads(cpu_count: int, stdout: bool = False) -> tuple[int, int]:
+    """Split a thread budget into (aligner, additional compression) threads.
+
+    The second value is one less than the compression share in every branch,
+    presumably because samtools counts additional threads; 0 when stdout is set.
+    """
+    budget = max(1, int(cpu_count))  # Treat zero or negative input as one thread
+    if budget == 1:
+        return 1, 0
+    if stdout:  # Everything but one thread goes to the aligner, capped at 30
+        return min(30, budget - 1), 0
+    if budget > 32:  # Fixed split above 32 threads
+        return 22, 9
+
+    # Approximate a 2:1 ratio of alignment to compression threads
+    aligner_share = (2 * budget) // 3
+    compression_share = min(10, budget - aligner_share)
+
+    return aligner_share, compression_share - 1
 
 
 CWD = Path.cwd()
@@ -37,7 +47,7 @@ def choose_default_thread_count(cpu_count: int) -> int:
 )
 
 CPU_COUNT = multiprocessing.cpu_count()
-THREADS = choose_default_thread_count(CPU_COUNT)
+# THREADS = allocate_threads(CPU_COUNT)
 DEFAULT_INDEX_REPOSITORY_URL = "https://objectstorage.uk-london-1.oraclecloud.com/n/lrbvkel2wjot/b/human-genome-bucket/o"
 INDEX_REPOSITORY_URL = os.environ.get(
     "HOSTILE_REPOSITORY_URL", DEFAULT_INDEX_REPOSITORY_URL
@@ -72,6 +82,8 @@ def handle_alignment_exceptions(exception: subprocess.CalledProcessError) -> Non
     logging.debug(f"stderr: {exception.stderr}")
     alignment_successful = False
     stream_empty = False
+    if "function mm_idx_load" in exception.stderr:  # Minimap2 index corruption
+        raise RuntimeError("Minimap2 index appears corrupted, run hostile index purge")
     if "Error, fewer reads in file specified" in exception.stderr:  # Bowtie2
         raise RuntimeError("fastq1 and fastq2 contain different numbers of reads")
     if 'Failed to read header for "-"' in exception.stderr: