Skip to content

Commit

Permalink
Merge pull request #4 from rouskinlab/demult_fixed
Browse files Browse the repository at this point in the history
Demult fixed
  • Loading branch information
matthewfallan authored Nov 8, 2023
2 parents b54f498 + d6ab074 commit 674a100
Show file tree
Hide file tree
Showing 30 changed files with 4,190 additions and 5 deletions.
28 changes: 28 additions & 0 deletions src/seismicrna/align/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
run_export,
run_xamgen)
from ..core import path
<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
from ..core.cmd import CMD_ALIGN, CMD_QC
from ..core.fasta import parse_fasta, write_fasta
from ..core.parallel import dispatch
from ..core.seq import DNA
from ..core.xam import (count_single_paired, run_flagstat, run_ref_header,
run_index_xam, run_index_fasta, run_idxstats)
========
from ..core.arg import CMD_ALIGN, CMD_QC
from ..core.ngs import (count_single_paired,
count_total_reads,
Expand All @@ -22,6 +30,7 @@
run_idxstats)
from ..core.parallel import dispatch
from ..core.seq import DNA, parse_fasta, write_fasta
>>>>>>>> main:src/seismicrna/align/write.py

logger = getLogger(__name__)

Expand Down Expand Up @@ -175,12 +184,18 @@ def fq_pipeline(fq_inp: FastqUnit,
fq_cut = None
# Align the FASTQ to the reference sequence using Bowtie2.
xam_whole = path.build(*path.XAM_STEP_SEGS,
<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
top=temp_dir, sample=sample,
cmd=CMD_ALIGN, step=path.STEP_ALIGN_MAP,
ref=refset, ext=path.BAM_EXT)
========
top=temp_dir,
sample=sample,
cmd=CMD_ALIGN,
step=path.STEP_ALIGN_MAP,
ref=refset,
ext=path.BAM_EXT)
>>>>>>>> main:src/seismicrna/align/write.py
reads_align = run_xamgen(fq_inp if fq_cut is None else fq_cut,
xam_whole,
index_pfx=bowtie2_index,
Expand Down Expand Up @@ -323,8 +338,13 @@ def fq_pipeline(fq_inp: FastqUnit,
logger.warning(f"Skipped references with fewer than {min_reads} reads: "
f"{sorted(insufficient_refs)}")
# Write a report to summarize the alignment.
<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
report = AlignReport(sample=sample,
demultiplexed=fq_inp.ref is not None,
========
report = report_type(sample=sample,
ref=fq_inp.ref,
>>>>>>>> main:src/seismicrna/align/write.py
paired_end=fq_inp.paired,
phred_enc=fq_inp.phred_enc,
fastqc=fastqc,
Expand Down Expand Up @@ -364,9 +384,13 @@ def fq_pipeline(fq_inp: FastqUnit,
reads_trim=reads_trim,
reads_align=reads_align,
reads_filter=reads_filter,
<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
reads_refs=reads_refs)
========
reads_refs=reads_refs,
began=began,
ended=ended)
>>>>>>>> main:src/seismicrna/align/write.py
report.save(out_dir, overwrite=True)
# Return a list of name-sorted XAM files, each of which contains a
# set of reads that all align to the same reference.
Expand Down Expand Up @@ -551,7 +575,11 @@ def check_fqs_xams(alignments: dict[tuple[str, str], FastqUnit],
if xam_expect.is_file():
# If the XAM file already exists, then add it to the
# dict of XAM files that have already been aligned.
<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py
xams_existing.append(xam_expect)
========
xams_exist.append(xam_expect)
>>>>>>>> main:src/seismicrna/align/write.py
break
else:
# If at least one XAM file for a FASTQ unit does not exist,
Expand Down
22 changes: 22 additions & 0 deletions src/seismicrna/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from .main import cli, run, params

########################################################################
# #
# Copyright ©2023, the Rouskin Lab. #
# #
# This file is part of SEISMIC-RNA. #
# #
# SEISMIC-RNA is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; either version 3 of the License, or #
# (at your option) any later version. #
# #
# SEISMIC-RNA is distributed in the hope that it will be useful, but #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- #
# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General #
# Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with SEISMIC-RNA; if not, see <https://www.gnu.org/licenses>. #
# #
########################################################################
236 changes: 236 additions & 0 deletions src/seismicrna/core/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
import re
from abc import ABC, abstractmethod
from functools import cached_property
from logging import getLogger

import numpy as np
import pandas as pd

from .output import PickleOutput
from .rel import REL_TYPE
from .sect import seq_pos_to_index
from .seq import DNA
from .types import fit_uint_type

logger = getLogger(__name__)

BATCH_INDEX = 0
POS_INDEX = 1
READ_INDEX = 0

READ_NUM = "Read Number"


def get_length(array: np.ndarray, what: str) -> int:
if array.ndim != 1:
raise ValueError(f"{what} must have 1 dimension, but got {array.ndim}")
length, = array.shape
return length


def ensure_same_length(arr1: np.ndarray,
arr2: np.ndarray,
what1: str,
what2: str):
if (len1 := get_length(arr1, what1)) != (len2 := get_length(arr2, what2)):
raise ValueError(
f"Lengths differ between {what1} ({len1}) and {what2} ({len2})")
return len1


def ensure_order(read_nums: np.ndarray,
array1: np.ndarray,
array2: np.ndarray,
what1: str,
what2: str,
gt_eq: bool = False):
ensure_same_length(read_nums, array1, "read numbers", what1)
ensure_same_length(array1, array2, what1, what2)
ineq_func, ineq_sign = (np.less, '<') if gt_eq else (np.greater, '>')
if np.any(is_err := ineq_func(array1, array2)):
index = pd.Index(read_nums[is_err], name=READ_NUM)
errors = pd.DataFrame.from_dict({what1: pd.Series(array1[is_err],
index=index),
what2: pd.Series(array2[is_err],
index=index)})
raise ValueError(f"Got {what1} {ineq_sign} {what2}:\n{errors}")


def sanitize_ends(read_nums: np.ndarray,
end5s: list[int] | np.ndarray,
mid5s: list[int] | np.ndarray,
mid3s: list[int] | np.ndarray,
end3s: list[int] | np.ndarray,
max_pos: int):
pos_dtype = fit_uint_type(max_pos)
end5s = np.asarray(end5s, pos_dtype)
mid5s = np.asarray(mid5s, pos_dtype)
mid3s = np.asarray(mid3s, pos_dtype)
end3s = np.asarray(end3s, pos_dtype)
# Verify 5' end ≥ min position
ensure_order(read_nums,
end5s,
np.broadcast_to(POS_INDEX, end5s.shape),
"5' end positions",
f"minimum position ({POS_INDEX})",
gt_eq=True)
# Verify 5' end ≤ 5' mid
ensure_order(read_nums,
end5s,
mid5s,
"5' end positions",
"5' middle positions")
# Verify 5' end ≤ 3' mid
ensure_order(read_nums,
end5s,
mid3s,
"5' end positions",
"3' middle positions")
# Verify 5' mid ≤ 3' end
ensure_order(read_nums,
mid5s,
end3s,
"5' middle positions",
"3' end positions")
# Verify 3' mid ≤ 3' end
ensure_order(read_nums,
mid3s,
end3s,
"3' middle positions",
"3' end positions")
# Verify 3' end ≤ max position
ensure_order(read_nums,
end3s,
np.broadcast_to(max_pos, end3s.shape),
"3' end positions",
f"maximum position ({max_pos})")
return end5s, mid5s, mid3s, end3s


class ReadBatch(PickleOutput, ABC):
""" """

@classmethod
def btype(cls):
btype, = re.match("^([a-z]*)batch$", cls.__name__.lower()).groups()
return btype

def __init__(self, batch: int, sample: str, ref: str):
self.batch = batch
self.sample = sample
self.ref = ref

@property
@abstractmethod
def num_reads(self) -> int:
""" Number of reads that are actually in use. """

@property
@abstractmethod
def max_read(self) -> int:
""" Maximum possible value for a read index. """

@property
def read_dtype(self):
""" Data type for read numbers. """
return fit_uint_type(self.max_read)

@cached_property
@abstractmethod
def read_nums(self) -> np.ndarray:
""" Read numbers in use. """


class ArrayBatch(ReadBatch, ABC):

def __init__(self,
*args,
max_pos: int,
end5s: list[int] | np.ndarray,
mid5s: list[int] | np.ndarray,
mid3s: list[int] | np.ndarray,
end3s: list[int] | np.ndarray,
**kwargs):
super().__init__(*args, **kwargs)
self.max_pos = max_pos
(self.end5s,
self.mid5s,
self.mid3s,
self.end3s) = sanitize_ends(self.read_nums,
end5s,
mid5s,
mid3s,
end3s,
self.max_pos)

@cached_property
@abstractmethod
def pos(self) -> np.ndarray:
""" Array of 1-indexed positions. """

@property
@abstractmethod
def num_pos(self) -> int:
""" Number of positions that are actually in use. """

@property
def pos_dtype(self):
""" Data type for position numbers. """
dtypes = list(set(array.dtype for array in (self.end5s,
self.mid5s,
self.mid3s,
self.end3s)))
if len(dtypes) != 1:
raise

@cached_property
@abstractmethod
def array(self) -> np.ndarray:
""" Array of data in use. """

def get_index(self, refseq: DNA):
""" Positions and bases in use. """
if len(refseq) != self.max_pos:
raise ValueError(f"Expected reference sequence of {self.max_pos} "
f"nt, but got {len(refseq)} nt")
return seq_pos_to_index(refseq, self.pos, POS_INDEX)

def get_table(self, refseq: DNA):
""" Table of data in use. """
return pd.DataFrame(self.array,
self.read_nums,
self.get_index(refseq),
copy=False)


class MutsBatch(ArrayBatch, ABC):

def __init__(self,
*args,
muts: dict[int, dict[int, list[int] | np.ndarray]],
**kwargs):
super().__init__(*args, **kwargs)
self.muts = {pos: {REL_TYPE(rel): np.asarray(reads, self.read_dtype)
for rel, reads in muts.get(pos, dict()).items()}
for pos in self.pos}

########################################################################
# #
# Copyright ©2023, the Rouskin Lab. #
# #
# This file is part of SEISMIC-RNA. #
# #
# SEISMIC-RNA is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; either version 3 of the License, or #
# (at your option) any later version. #
# #
# SEISMIC-RNA is distributed in the hope that it will be useful, but #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- #
# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General #
# Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with SEISMIC-RNA; if not, see <https://www.gnu.org/licenses>. #
# #
########################################################################
Loading

0 comments on commit 674a100

Please sign in to comment.