Merge pull request #4 from rouskinlab/demult_fixed

Demult fixed
rouskinlab · Nov 8, 2023 · 674a100 · 674a100
2 parents b54f498 + d6ab074
commit 674a100
Show file tree

Hide file tree

Showing 30 changed files with 4,190 additions and 5 deletions.
diff --git a/src/seismicrna/align/write.py b/src/seismicrna/align/write.py
@@ -12,6 +12,14 @@
                      run_export,
                      run_xamgen)
 from ..core import path
+<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
+from ..core.cmd import CMD_ALIGN, CMD_QC
+from ..core.fasta import parse_fasta, write_fasta
+from ..core.parallel import dispatch
+from ..core.seq import DNA
+from ..core.xam import (count_single_paired, run_flagstat, run_ref_header,
+                        run_index_xam, run_index_fasta, run_idxstats)
+========
 from ..core.arg import CMD_ALIGN, CMD_QC
 from ..core.ngs import (count_single_paired,
                         count_total_reads,
@@ -22,6 +30,7 @@
                         run_idxstats)
 from ..core.parallel import dispatch
 from ..core.seq import DNA, parse_fasta, write_fasta
+>>>>>>>> main:src/seismicrna/align/write.py
 
 logger = getLogger(__name__)
 
@@ -175,12 +184,18 @@ def fq_pipeline(fq_inp: FastqUnit,
         fq_cut = None
     # Align the FASTQ to the reference sequence using Bowtie2.
     xam_whole = path.build(*path.XAM_STEP_SEGS,
+<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
+                           top=temp_dir, sample=sample,
+                           cmd=CMD_ALIGN, step=path.STEP_ALIGN_MAP,
+                           ref=refset, ext=path.BAM_EXT)
+========
                            top=temp_dir,
                            sample=sample,
                            cmd=CMD_ALIGN,
                            step=path.STEP_ALIGN_MAP,
                            ref=refset,
                            ext=path.BAM_EXT)
+>>>>>>>> main:src/seismicrna/align/write.py
     reads_align = run_xamgen(fq_inp if fq_cut is None else fq_cut,
                              xam_whole,
                              index_pfx=bowtie2_index,
@@ -323,8 +338,13 @@ def fq_pipeline(fq_inp: FastqUnit,
         logger.warning(f"Skipped references with fewer than {min_reads} reads: "
                        f"{sorted(insufficient_refs)}")
     # Write a report to summarize the alignment.
+<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
+    report = AlignReport(sample=sample,
+                         demultiplexed=fq_inp.ref is not None,
+========
     report = report_type(sample=sample,
                          ref=fq_inp.ref,
+>>>>>>>> main:src/seismicrna/align/write.py
                          paired_end=fq_inp.paired,
                          phred_enc=fq_inp.phred_enc,
                          fastqc=fastqc,
@@ -364,9 +384,13 @@ def fq_pipeline(fq_inp: FastqUnit,
                          reads_trim=reads_trim,
                          reads_align=reads_align,
                          reads_filter=reads_filter,
+<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
+                         reads_refs=reads_refs)
+========
                          reads_refs=reads_refs,
                          began=began,
                          ended=ended)
+>>>>>>>> main:src/seismicrna/align/write.py
     report.save(out_dir, overwrite=True)
     # Return a list of name-sorted XAM files, each of which contains a
     # set of reads that all align to the same reference.
@@ -551,7 +575,11 @@ def check_fqs_xams(alignments: dict[tuple[str, str], FastqUnit],
             if xam_expect.is_file():
                 # If the XAM file already exists, then add it to the
                 # dict of XAM files that have already been aligned.
+<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
+                xams_existing.append(xam_expect)
+========
                 xams_exist.append(xam_expect)
+>>>>>>>> main:src/seismicrna/align/write.py
                 break
         else:
             # If at least one XAM file for a FASTQ unit does not exist,

diff --git a/src/seismicrna/cluster/__init__.py b/src/seismicrna/cluster/__init__.py
@@ -0,0 +1,22 @@
+from .main import cli, run, params
+
+########################################################################
+#                                                                      #
+# Copyright ©2023, the Rouskin Lab.                                    #
+#                                                                      #
+# This file is part of SEISMIC-RNA.                                    #
+#                                                                      #
+# SEISMIC-RNA is free software; you can redistribute it and/or modify  #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation; either version 3 of the License, or    #
+# (at your option) any later version.                                  #
+#                                                                      #
+# SEISMIC-RNA is distributed in the hope that it will be useful, but   #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- #
+# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General     #
+# Public License for more details.                                     #
+#                                                                      #
+# You should have received a copy of the GNU General Public License    #
+# along with SEISMIC-RNA; if not, see <https://www.gnu.org/licenses>.  #
+#                                                                      #
+########################################################################
diff --git a/src/seismicrna/core/batch.py b/src/seismicrna/core/batch.py
@@ -0,0 +1,236 @@
+import re
+from abc import ABC, abstractmethod
+from functools import cached_property
+from logging import getLogger
+
+import numpy as np
+import pandas as pd
+
+from .output import PickleOutput
+from .rel import REL_TYPE
+from .sect import seq_pos_to_index
+from .seq import DNA
+from .types import fit_uint_type
+
+logger = getLogger(__name__)
+
+BATCH_INDEX = 0
+POS_INDEX = 1
+READ_INDEX = 0
+
+READ_NUM = "Read Number"
+
+
+def get_length(array: np.ndarray, what: str) -> int:
+    if array.ndim != 1:
+        raise ValueError(f"{what} must have 1 dimension, but got {array.ndim}")
+    length, = array.shape
+    return length
+
+
+def ensure_same_length(arr1: np.ndarray,
+                       arr2: np.ndarray,
+                       what1: str,
+                       what2: str):
+    if (len1 := get_length(arr1, what1)) != (len2 := get_length(arr2, what2)):
+        raise ValueError(
+            f"Lengths differ between {what1} ({len1}) and {what2} ({len2})")
+    return len1
+
+
+def ensure_order(read_nums: np.ndarray,
+                 array1: np.ndarray,
+                 array2: np.ndarray,
+                 what1: str,
+                 what2: str,
+                 gt_eq: bool = False):
+    ensure_same_length(read_nums, array1, "read numbers", what1)
+    ensure_same_length(array1, array2, what1, what2)
+    ineq_func, ineq_sign = (np.less, '<') if gt_eq else (np.greater, '>')
+    if np.any(is_err := ineq_func(array1, array2)):
+        index = pd.Index(read_nums[is_err], name=READ_NUM)
+        errors = pd.DataFrame.from_dict({what1: pd.Series(array1[is_err],
+                                                          index=index),
+                                         what2: pd.Series(array2[is_err],
+                                                          index=index)})
+        raise ValueError(f"Got {what1} {ineq_sign} {what2}:\n{errors}")
+
+
+def sanitize_ends(read_nums: np.ndarray,
+                  end5s: list[int] | np.ndarray,
+                  mid5s: list[int] | np.ndarray,
+                  mid3s: list[int] | np.ndarray,
+                  end3s: list[int] | np.ndarray,
+                  max_pos: int):
+    pos_dtype = fit_uint_type(max_pos)
+    end5s = np.asarray(end5s, pos_dtype)
+    mid5s = np.asarray(mid5s, pos_dtype)
+    mid3s = np.asarray(mid3s, pos_dtype)
+    end3s = np.asarray(end3s, pos_dtype)
+    # Verify 5' end ≥ min position
+    ensure_order(read_nums,
+                 end5s,
+                 np.broadcast_to(POS_INDEX, end5s.shape),
+                 "5' end positions",
+                 f"minimum position ({POS_INDEX})",
+                 gt_eq=True)
+    # Verify 5' end ≤ 5' mid
+    ensure_order(read_nums,
+                 end5s,
+                 mid5s,
+                 "5' end positions",
+                 "5' middle positions")
+    # Verify 5' end ≤ 3' mid
+    ensure_order(read_nums,
+                 end5s,
+                 mid3s,
+                 "5' end positions",
+                 "3' middle positions")
+    # Verify 5' mid ≤ 3' end
+    ensure_order(read_nums,
+                 mid5s,
+                 end3s,
+                 "5' middle positions",
+                 "3' end positions")
+    # Verify 3' mid ≤ 3' end
+    ensure_order(read_nums,
+                 mid3s,
+                 end3s,
+                 "3' middle positions",
+                 "3' end positions")
+    # Verify 3' end ≤ max position
+    ensure_order(read_nums,
+                 end3s,
+                 np.broadcast_to(max_pos, end3s.shape),
+                 "3' end positions",
+                 f"maximum position ({max_pos})")
+    return end5s, mid5s, mid3s, end3s
+
+
+class ReadBatch(PickleOutput, ABC):
+    """ """
+
+    @classmethod
+    def btype(cls):
+        btype, = re.match("^([a-z]*)batch$", cls.__name__.lower()).groups()
+        return btype
+
+    def __init__(self, batch: int, sample: str, ref: str):
+        self.batch = batch
+        self.sample = sample
+        self.ref = ref
+
+    @property
+    @abstractmethod
+    def num_reads(self) -> int:
+        """ Number of reads that are actually in use. """
+
+    @property
+    @abstractmethod
+    def max_read(self) -> int:
+        """ Maximum possible value for a read index. """
+
+    @property
+    def read_dtype(self):
+        """ Data type for read numbers. """
+        return fit_uint_type(self.max_read)
+
+    @cached_property
+    @abstractmethod
+    def read_nums(self) -> np.ndarray:
+        """ Read numbers in use. """
+
+
+class ArrayBatch(ReadBatch, ABC):
+
+    def __init__(self,
+                 *args,
+                 max_pos: int,
+                 end5s: list[int] | np.ndarray,
+                 mid5s: list[int] | np.ndarray,
+                 mid3s: list[int] | np.ndarray,
+                 end3s: list[int] | np.ndarray,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_pos = max_pos
+        (self.end5s,
+         self.mid5s,
+         self.mid3s,
+         self.end3s) = sanitize_ends(self.read_nums,
+                                     end5s,
+                                     mid5s,
+                                     mid3s,
+                                     end3s,
+                                     self.max_pos)
+
+    @cached_property
+    @abstractmethod
+    def pos(self) -> np.ndarray:
+        """ Array of 1-indexed positions. """
+
+    @property
+    @abstractmethod
+    def num_pos(self) -> int:
+        """ Number of positions that are actually in use. """
+
+    @property
+    def pos_dtype(self):
+        """ Data type for position numbers. """
+        dtypes = list(set(array.dtype for array in (self.end5s,
+                                                    self.mid5s,
+                                                    self.mid3s,
+                                                    self.end3s)))
+        if len(dtypes) != 1:
+            raise
+
+    @cached_property
+    @abstractmethod
+    def array(self) -> np.ndarray:
+        """ Array of data in use. """
+
+    def get_index(self, refseq: DNA):
+        """ Positions and bases in use. """
+        if len(refseq) != self.max_pos:
+            raise ValueError(f"Expected reference sequence of {self.max_pos} "
+                             f"nt, but got {len(refseq)} nt")
+        return seq_pos_to_index(refseq, self.pos, POS_INDEX)
+
+    def get_table(self, refseq: DNA):
+        """ Table of data in use. """
+        return pd.DataFrame(self.array,
+                            self.read_nums,
+                            self.get_index(refseq),
+                            copy=False)
+
+
+class MutsBatch(ArrayBatch, ABC):
+
+    def __init__(self,
+                 *args,
+                 muts: dict[int, dict[int, list[int] | np.ndarray]],
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.muts = {pos: {REL_TYPE(rel): np.asarray(reads, self.read_dtype)
+                           for rel, reads in muts.get(pos, dict()).items()}
+                     for pos in self.pos}
+
+########################################################################
+#                                                                      #
+# Copyright ©2023, the Rouskin Lab.                                    #
+#                                                                      #
+# This file is part of SEISMIC-RNA.                                    #
+#                                                                      #
+# SEISMIC-RNA is free software; you can redistribute it and/or modify  #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation; either version 3 of the License, or    #
+# (at your option) any later version.                                  #
+#                                                                      #
+# SEISMIC-RNA is distributed in the hope that it will be useful, but   #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- #
+# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General     #
+# Public License for more details.                                     #
+#                                                                      #
+# You should have received a copy of the GNU General Public License    #
+# along with SEISMIC-RNA; if not, see <https://www.gnu.org/licenses>.  #
+#                                                                      #
+########################################################################