From 55d7f28bdfad96fae00b14aedb609fc856a598e7 Mon Sep 17 00:00:00 2001
From: matthewfallan <31744230+matthewfallan@users.noreply.github.com>
Date: Wed, 27 Mar 2024 14:02:27 -0400
Subject: [PATCH 1/6] Patch some incompatibilities in report fields

---
 src/seismicrna/core/batch/muts.py | 4 ++--
 src/seismicrna/core/report.py     | 9 ++++++---
 src/seismicrna/core/version.py    | 2 +-
 src/seismicrna/mask/write.py      | 3 +--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/seismicrna/core/batch/muts.py b/src/seismicrna/core/batch/muts.py
index 1ea654ac..f1c6418c 100644
--- a/src/seismicrna/core/batch/muts.py
+++ b/src/seismicrna/core/batch/muts.py
@@ -86,11 +86,11 @@ def pos_nums(self) -> np.ndarray:
 
     @property
     def mid5s(self):
-        return self._mid5s
+        return self._mid5s  # compatibility
 
     @property
     def mid3s(self):
-        return self._mid3s
+        return self._mid3s  # compatibility
 
     @property
     def read_weights(self) -> pd.DataFrame | None:
diff --git a/src/seismicrna/core/report.py b/src/seismicrna/core/report.py
index 73f1bd17..3ac066dc 100644
--- a/src/seismicrna/core/report.py
+++ b/src/seismicrna/core/report.py
@@ -436,10 +436,12 @@ def oconv_datetime(dtime: datetime):
                       int)
 NumReadsLoNCovF = Field("n_reads_min_ncov",
                         "Number of Reads Cut -- Too Few Covered Positions",
-                        int)
+                        int,
+                        default=0)  # compatibility
 NumReadsDiscontigF = Field("n_reads_discontig",
                            "Number of Reads Cut -- Discontiguous Mates",
-                           int)
+                           int,
+                           default=0)  # compatibility
 NumReadsLoInfoF = Field("n_reads_min_finfo",
                         "Number of Reads Cut -- Too Few Informative Positions",
                         int)
@@ -454,7 +456,8 @@ def oconv_datetime(dtime: datetime):
                       int)
 NumUniqReadKeptF = Field("n_uniq_reads",
                          "Number of Unique Reads",
-                         int)
+                         int,
+                         default=0)  # compatibility
 
 # Cluster fields
 
diff --git a/src/seismicrna/core/version.py b/src/seismicrna/core/version.py
index 18175d26..b5800333 100644
--- a/src/seismicrna/core/version.py
+++ b/src/seismicrna/core/version.py
@@ -6,7 +6,7 @@
 
 logger = getLogger(__name__)
 
-__version__ = "0.15.0"
+__version__ = "0.15.1"
 
 VERSION_DELIM = "."
 
diff --git a/src/seismicrna/mask/write.py b/src/seismicrna/mask/write.py
index fd644f96..c0810899 100644
--- a/src/seismicrna/mask/write.py
+++ b/src/seismicrna/mask/write.py
@@ -189,8 +189,7 @@ def _filter_discontig_read(self, batch: RefseqMutsBatch):
         """ Filter out reads with discontiguous mates. """
         # Find the reads with contiguous mates.
         reads = batch.read_nums[batch.contiguous_reads]
-        logger.debug(f"{self} kept {reads.size} reads with coverage "
-                     f"≥ {self.min_ncov_read} in {batch}")
+        logger.debug(f"{self} kept {reads.size} contiguous reads in {batch}")
         # Return a new batch of only those reads.
         return apply_mask(batch, reads)
 

From 99a91fc250aaa4fc57fd9b523ffd0b70842fb379 Mon Sep 17 00:00:00 2001
From: matthewfallan <31744230+matthewfallan@users.noreply.github.com>
Date: Wed, 27 Mar 2024 14:03:50 -0400
Subject: [PATCH 2/6] Fix problem where positions with zero probability of
 being covered by a read would give NaN mutation rates (and cause crashes) in
 _calc_p_mut_given_span_noclose

---
 src/seismicrna/core/mu/unbias.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/seismicrna/core/mu/unbias.py b/src/seismicrna/core/mu/unbias.py
index bdc2502b..bfe1703c 100644
--- a/src/seismicrna/core/mu/unbias.py
+++ b/src/seismicrna/core/mu/unbias.py
@@ -545,7 +545,9 @@ def _calc_p_mut_given_span_noclose(p_mut_given_span: np.ndarray,
                                                  * np.expand_dims(p_ends, 2))
     # Compute the mutation rates given no two mutations are too close
     # one position (j) at a time.
-    p_mut_given_span_noclose = p_mut_given_span / p_noclose_given_span
+    p_mut_given_span_noclose = np.nan_to_num(
+        p_mut_given_span / p_noclose_given_span
+    )
     for j in range(npos):
         nrows = j + 1
         ncols = npos - j

From ea84d23844a2a6dab55943ad6aa7e3bed0a314ff Mon Sep 17 00:00:00 2001
From: matthewfallan <31744230+matthewfallan@users.noreply.github.com>
Date: Wed, 27 Mar 2024 21:36:11 -0400
Subject: [PATCH 3/6] Add _triu_mul function to avoid warning about invalid
 values in multiplication

---
 src/seismicrna/core/mu/unbias.py | 33 +++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/seismicrna/core/mu/unbias.py b/src/seismicrna/core/mu/unbias.py
index bfe1703c..3774cbb6 100644
--- a/src/seismicrna/core/mu/unbias.py
+++ b/src/seismicrna/core/mu/unbias.py
@@ -235,6 +235,37 @@ def _triu_norm(a: np.ndarray):
     return a / _triu_sum(a)
 
 
+@jit()
+def _triu_mul(factor1: np.ndarray, factor2: np.ndarray):
+    """ Multiply the upper triangles of `numer` and `denom`.
+
+    This function is meant to be called by another function that has
+    validated the arguments; hence, this function makes assumptions:
+
+    -   `factor1` has at least 2 dimensions.
+    -   The first two dimensions of `factor1` have equal length.
+    -   `factor2` has the same first 2 dimensions as `factor1`.
+    -   `factor1` and `factor2` can be broadcast to each other.
+
+    Parameters
+    ----------
+    factor1: np.ndarray
+        Factor 1.
+    factor2: np.ndarray
+        Factor 2.
+
+    Returns
+    -------
+    np.ndarray
+        Product of the upper triangles; values below the main diagonal
+        are undefined.
+    """
+    product = np.empty(np.broadcast_shapes(factor1.shape, factor2.shape))
+    for j in range(factor1.shape[0]):
+        product[j, j:] = factor1[j, j:] * factor2[j, j:]
+    return product
+
+
 @jit()
 def _triu_div(numer: np.ndarray, denom: np.ndarray):
     """ Divide the upper triangles of `numer` and `denom`.
@@ -1066,7 +1097,7 @@ def calc_p_ends_given_noclose(p_ends: np.ndarray,
               nonzero=True)
     # Calculate the proportion of total reads that would have each
     # pair of end coordinates.
-    return _triu_norm(p_ends[:, :, np.newaxis] * p_noclose_given_ends)
+    return _triu_norm(_triu_mul(p_ends[:, :, np.newaxis], p_noclose_given_ends))
 
 
 @jit()

From f6e816be11a24426f7d1aa5c788f304036abf53a Mon Sep 17 00:00:00 2001
From: matthewfallan <31744230+matthewfallan@users.noreply.github.com>
Date: Sun, 31 Mar 2024 22:16:56 -0400
Subject: [PATCH 4/6] Stop EM clustering if the likelihood decreases

---
 src/seismicrna/cluster/em.py |  4 ++--
 src/seismicrna/mask/write.py | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/seismicrna/cluster/em.py b/src/seismicrna/cluster/em.py
index b2d43dd6..32dcd855 100644
--- a/src/seismicrna/cluster/em.py
+++ b/src/seismicrna/cluster/em.py
@@ -444,8 +444,8 @@ def run(self, props_seed: int | None = None, resps_seed: int | None = None):
                 logger.warning(f"{self}, iteration {self.iter} returned a "
                                f"smaller log likelihood ({self.log_like}) than "
                                f"the previous iteration ({self.log_like_prev})")
-            elif (self.delta_log_like < self.conv_thresh
-                  and self.iter >= self.min_iter):
+            if (self.delta_log_like < self.conv_thresh
+                    and self.iter >= self.min_iter):
                 # Converge if the increase in log likelihood is
                 # smaller than the convergence cutoff and at least
                 # the minimum number of iterations have been run.
diff --git a/src/seismicrna/mask/write.py b/src/seismicrna/mask/write.py
index c0810899..4524050d 100644
--- a/src/seismicrna/mask/write.py
+++ b/src/seismicrna/mask/write.py
@@ -84,15 +84,7 @@ def __init__(self,
         self.max_fmut_read = max_fmut_read
         self._n_reads = defaultdict(int)
         # Set the parameters for filtering positions.
-        if not min_ninfo_pos >= 0:
-            raise ValueError(
-                f"min_ninfo_pos must be ≥ 0, but got {min_ninfo_pos}"
-            )
         self.min_ninfo_pos = min_ninfo_pos
-        if not 0. <= max_fmut_pos < 1.:
-            raise ValueError(
-                f"max_fmut_pos Must be ≥ 0, < 1, but got {max_fmut_pos}"
-            )
         self.max_fmut_pos = max_fmut_pos
         # Set the parameters for saving files.
         self.top = dataset.top
@@ -306,13 +298,21 @@ def _filter_batch_reads(self, batch: RefseqMutsBatch):
     def _filter_positions(self, info: pd.Series, muts: pd.Series):
         """ Remove the positions that do not pass the filters. """
         # Mask the positions with insufficient informative reads.
+        if not 1 <= self.min_ninfo_pos:
+            raise ValueError("min_ninfo_pos must be ≥ 1, "
+                             f"but got {self.min_ninfo_pos}")
         self.section.add_mask(
             self.MASK_POS_NINFO,
-            index_to_pos(info.index[info < self.min_ninfo_pos]))
+            index_to_pos(info.index[info < self.min_ninfo_pos])
+        )
         # Mask the positions with excessive mutation fractions.
+        if not 0. <= self.max_fmut_pos <= 1.:
+            raise ValueError("max_fmut_pos must be ≥ 0 and ≤ 1, "
+                             f"but got {self.max_fmut_pos}")
         self.section.add_mask(
             self.MASK_POS_FMUT,
-            index_to_pos(info.index[(muts / info) > self.max_fmut_pos]))
+            index_to_pos(info.index[(muts / info) > self.max_fmut_pos])
+        )
 
     def mask(self):
         # Exclude positions based on the parameters.

From ac479f918c934685dc40f4aa4afa83e1ece5b0eb Mon Sep 17 00:00:00 2001
From: matthewfallan <31744230+matthewfallan@users.noreply.github.com>
Date: Sun, 31 Mar 2024 22:37:24 -0400
Subject: [PATCH 5/6] Add option to fold with multiple threads using Fold-smp

---
 src/seismicrna/core/extern/shell.py |  1 +
 src/seismicrna/fold/main.py         | 13 ++++++-----
 src/seismicrna/fold/rnastructure.py | 34 ++++++++++++++++++++---------
 3 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/src/seismicrna/core/extern/shell.py b/src/seismicrna/core/extern/shell.py
index d1c10a61..1a6d2295 100644
--- a/src/seismicrna/core/extern/shell.py
+++ b/src/seismicrna/core/extern/shell.py
@@ -16,6 +16,7 @@
 ECHO_CMD = "echo"
 FASTQC_CMD = "fastqc"
 RNASTRUCTURE_FOLD_CMD = "Fold"
+RNASTRUCTURE_FOLD_SMP_CMD = "Fold-smp"
 GUNZIP_CMD = "gunzip"
 SAMTOOLS_CMD = "samtools"
 WORD_COUNT_CMD = "wc"
diff --git a/src/seismicrna/fold/main.py b/src/seismicrna/fold/main.py
index 26486c3c..1e45dc4c 100644
--- a/src/seismicrna/fold/main.py
+++ b/src/seismicrna/fold/main.py
@@ -124,6 +124,7 @@ def run(input_path: tuple[str, ...],
     return list(chain(dispatch(fold_profile,
                                max_procs,
                                parallel,
+                               pass_n_procs=True,
                                args=[(loader, ref_sections.list(loader.ref))
                                      for loader in tables],
                                kwargs=dict(temp_dir=Path(temp_dir),
@@ -139,26 +140,26 @@ def run(input_path: tuple[str, ...],
                                            fold_mfe=fold_mfe,
                                            fold_max=fold_max,
                                            fold_percent=fold_percent,
-                                           force=force),
-                               pass_n_procs=True)))
+                                           force=force))))
 
 
 def fold_profile(table: MaskPosTable | ClustPosTable,
                  sections: list[Section],
-                 n_procs: int,
                  quantile: float,
+                 n_procs: int,
                  **kwargs):
     """ Fold an RNA molecule from one table of reactivities. """
     return dispatch(fold_section,
                     n_procs,
                     parallel=True,
+                    hybrid=True,
+                    pass_n_procs=True,
                     args=as_list_of_tuples(table.iter_profiles(
                         sections=sections, quantile=quantile)
                     ),
                     kwargs=dict(out_dir=table.top,
                                 quantile=quantile,
-                                **kwargs),
-                    pass_n_procs=False)
+                                **kwargs))
 
 
 def fold_section(rna: RNAProfile,
@@ -171,6 +172,7 @@ def fold_section(rna: RNAProfile,
                  fold_max: int,
                  fold_percent: float,
                  force: bool,
+                 n_procs: int,
                  **kwargs):
     """ Fold a section of an RNA from one mutational profile. """
     began = datetime.now()
@@ -184,6 +186,7 @@ def fold_section(rna: RNAProfile,
                    fold_max=fold_max,
                    fold_percent=fold_percent,
                    force=force,
+                   n_procs=n_procs,
                    **kwargs)
     ct2dot(ct_file)
     ended = datetime.now()
diff --git a/src/seismicrna/fold/rnastructure.py b/src/seismicrna/fold/rnastructure.py
index 1e59658b..9d920be8 100644
--- a/src/seismicrna/fold/rnastructure.py
+++ b/src/seismicrna/fold/rnastructure.py
@@ -5,6 +5,7 @@
 https://rna.urmc.rochester.edu/RNAstructure.html
 """
 
+import os
 import re
 from logging import getLogger
 from pathlib import Path
@@ -13,6 +14,7 @@
 from ..core.extern import (RNASTRUCTURE_CT2DOT_CMD,
                            RNASTRUCTURE_DOT2CT_CMD,
                            RNASTRUCTURE_FOLD_CMD,
+                           RNASTRUCTURE_FOLD_SMP_CMD,
                            args_to_cmd,
                            run_cmd)
 from ..core.rna import RNAProfile, renumber_ct
@@ -20,6 +22,8 @@
 
 logger = getLogger(__name__)
 
+FOLD_SMP_NUM_THREADS = "OMP_NUM_THREADS"
+
 
 def fold(rna: RNAProfile, *,
          fold_temp: float,
@@ -31,24 +35,34 @@ def fold(rna: RNAProfile, *,
          out_dir: Path,
          temp_dir: Path,
          keep_temp: bool,
-         force: bool):
-    """ Run the 'Fold' program of RNAstructure. """
+         force: bool,
+         n_procs: int):
+    """ Run the 'Fold' or 'Fold-smp' program of RNAstructure. """
     ct_file = rna.get_ct_file(out_dir)
     if need_write(ct_file, force):
-        cmd = [RNASTRUCTURE_FOLD_CMD, "--temperature", fold_temp]
-        # Constraints.
+        if n_procs > 1:
+            # Fold with multiple threads using the Fold-smp program.
+            cmd = [RNASTRUCTURE_FOLD_SMP_CMD]
+            os.environ[FOLD_SMP_NUM_THREADS] = str(n_procs)
+        else:
+            # Fold with one thread using the Fold program.
+            cmd = [RNASTRUCTURE_FOLD_CMD]
+        # Temperature of folding (Kelvin).
+        cmd.extend(["--temperature", fold_temp])
         if fold_constraint is not None:
+            # File of constraints.
             cmd.extend(["--constraint", fold_constraint])
-        # Maximum distance between paired bases.
         if fold_md > 0:
+            # Maximum distance between paired bases.
             cmd.extend(["--maxdistance", fold_md])
-        # Predict only the minimum free energy structure.
         if fold_mfe:
+            # Predict only the minimum free energy structure.
             cmd.append("--MFE")
-        # Maximum number of structures.
-        cmd.extend(["--maximum", fold_max])
-        # Maximum % difference between free energies of structures.
-        cmd.extend(["--percent", fold_percent])
+        else:
+            # Maximum number of structures.
+            cmd.extend(["--maximum", fold_max])
+            # Maximum % difference between free energies of structures.
+            cmd.extend(["--percent", fold_percent])
         # DMS reactivities file for the RNA.
         cmd.extend(["--DMS", dms_file := rna.to_dms(temp_dir)])
         # Temporary FASTA file for the RNA.

From ecec903f53adee0e507588a68c13266778cb80d1 Mon Sep 17 00:00:00 2001
From: matthewfallan <31744230+matthewfallan@users.noreply.github.com>
Date: Mon, 1 Apr 2024 00:48:56 -0400
Subject: [PATCH 6/6] Enable SAMtools to sort in the user-specified temporary
 directory

---
 src/seismicrna/align/write.py  | 24 +++++++++++++++++++++++-
 src/seismicrna/align/xamops.py | 17 +++++++++++++----
 src/seismicrna/core/ngs/xam.py |  4 ++++
 src/seismicrna/core/path.py    |  2 ++
 4 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/src/seismicrna/align/write.py b/src/seismicrna/align/write.py
index dfaef94d..5aaf653e 100644
--- a/src/seismicrna/align/write.py
+++ b/src/seismicrna/align/write.py
@@ -185,6 +185,15 @@ def fq_pipeline(fq_inp: FastqUnit,
     reads_align = run_xamgen(fq_inp if fq_cut is None else fq_cut,
                              xam_whole,
                              index_pfx=bowtie2_index,
+                             temp_dir=path.builddir(
+                                 path.SampSeg,
+                                 path.CmdSeg,
+                                 path.StepSeg,
+                                 top=temp_dir,
+                                 sample=sample,
+                                 cmd=path.CMD_ALIGN_DIR,
+                                 step=path.STEP_ALIGN_MAP
+                             ),
                              n_procs=n_procs,
                              bt2_local=bt2_local,
                              bt2_discordant=bt2_discordant,
@@ -280,7 +289,20 @@ def fq_pipeline(fq_inp: FastqUnit,
                                  ext=(path.CRAM_EXT if cram else path.BAM_EXT))
             if xam_ref.parent != xams_out_dir:
                 raise path.PathValueError(f"{xam_ref} is not in {xams_out_dir}")
-            exp_kwargs = dict(ref=ref, header=ref_headers[ref], n_procs=n_procs)
+            exp_kwargs = dict(
+                ref=ref,
+                header=ref_headers[ref],
+                temp_dir=path.builddir(path.SampSeg,
+                                       path.CmdSeg,
+                                       path.StepSeg,
+                                       path.RefSeg,
+                                       top=temp_dir,
+                                       sample=sample,
+                                       cmd=path.CMD_ALIGN_DIR,
+                                       step=path.STEP_ALIGN_SORT,
+                                       ref=ref),
+                n_procs=n_procs
+            )
             if cram:
                 # Write the one reference sequence to a temporary FASTA.
                 # Do NOT use overwrite=True because if refs_file is a
diff --git a/src/seismicrna/align/xamops.py b/src/seismicrna/align/xamops.py
index f1a71124..b4d53da1 100644
--- a/src/seismicrna/align/xamops.py
+++ b/src/seismicrna/align/xamops.py
@@ -240,8 +240,9 @@ def parse_match(m: re.Match[str]):
 
 def xamgen_cmd(fq_inp: FastqUnit,
                bam_out: Path | None, *,
-               min_mapq: int,
-               n_procs: int,
+               temp_dir: Path | None = None,
+               min_mapq: int | None = None,
+               n_procs: int = 1,
                **kwargs):
     """ Wrap alignment and post-processing into one pipeline. """
     bowtie2_step = bowtie2_cmd(fq_inp, None, n_procs=n_procs, **kwargs)
@@ -261,7 +262,10 @@ def xamgen_cmd(fq_inp: FastqUnit,
                                  flags_exc=flags_exc,
                                  bam=True,
                                  n_procs=1)
-    sort_xam_step = sort_xam_cmd(None, bam_out, n_procs=1)
+    sort_xam_step = sort_xam_cmd(None,
+                                 bam_out,
+                                 temp_dir=temp_dir,
+                                 n_procs=1)
     return cmds_to_pipe([bowtie2_step, view_xam_step, sort_xam_step])
 
 
@@ -275,6 +279,7 @@ def export_cmd(xam_in: Path | None,
                ref: str,
                header: str,
                ref_file: Path | None = None,
+               temp_dir: Path | None = None,
                n_procs: int = 1):
     """ Wrap selecting, sorting, and exporting into one pipeline. """
     # Pipe the header line.
@@ -290,7 +295,11 @@ def export_cmd(xam_in: Path | None,
     # Merge the one header line and the reads for the reference.
     merge_step = cmds_to_subshell([echo_step, ref_step])
     # Sort reads by name so that mates are adjacent.
-    sort_step = sort_xam_cmd(None, None, name=True, n_procs=n_procs)
+    sort_step = sort_xam_cmd(None,
+                             None,
+                             temp_dir=temp_dir,
+                             name=True,
+                             n_procs=n_procs)
     # Export the reads into a XAM file.
     export_step = view_xam_cmd(None,
                                xam_out,
diff --git a/src/seismicrna/core/ngs/xam.py b/src/seismicrna/core/ngs/xam.py
index e732abe8..e6c18ad2 100755
--- a/src/seismicrna/core/ngs/xam.py
+++ b/src/seismicrna/core/ngs/xam.py
@@ -63,6 +63,7 @@ def index_fasta_cmd(fasta: Path):
 
 def sort_xam_cmd(xam_inp: Path | None,
                  xam_out: Path | None, *,
+                 temp_dir: Path | None = None,
                  name: bool = False,
                  n_procs: int = 1):
     """ Sort a SAM or BAM file using `samtools sort`. """
@@ -70,6 +71,9 @@ def sort_xam_cmd(xam_inp: Path | None,
     if name:
         # Sort by name instead of coordinate.
         args.append("-n")
+    if temp_dir:
+        # Write temporary files to this directory.
+        args.extend(["-T", temp_dir])
     if xam_out:
         args.extend(["-o", xam_out])
     else:
diff --git a/src/seismicrna/core/path.py b/src/seismicrna/core/path.py
index 5d11695a..95e15e88 100644
--- a/src/seismicrna/core/path.py
+++ b/src/seismicrna/core/path.py
@@ -80,6 +80,7 @@
 STEP_ALIGN_INDEX_DEMULT = "index-demult"
 STEP_ALIGN_TRIM = "trim"
 STEP_ALIGN_MAP = "map"
+STEP_ALIGN_SORT = "sort"
 
 STEPS_REL_SAMS = "sams"
 
@@ -89,6 +90,7 @@
          STEP_ALIGN_INDEX_DEMULT,
          STEP_ALIGN_TRIM,
          STEP_ALIGN_MAP,
+         STEP_ALIGN_SORT,
          STEPS_REL_SAMS)
 
 # Tables