diff --git a/src/seismicrna/align/write.py b/src/seismicrna/align/write.py index f270d5b5..54fb1b1e 100644 --- a/src/seismicrna/align/write.py +++ b/src/seismicrna/align/write.py @@ -12,6 +12,14 @@ run_export, run_xamgen) from ..core import path +<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py +from ..core.cmd import CMD_ALIGN, CMD_QC +from ..core.fasta import parse_fasta, write_fasta +from ..core.parallel import dispatch +from ..core.seq import DNA +from ..core.xam import (count_single_paired, run_flagstat, run_ref_header, + run_index_xam, run_index_fasta, run_idxstats) +======== from ..core.arg import CMD_ALIGN, CMD_QC from ..core.ngs import (count_single_paired, count_total_reads, @@ -22,6 +30,7 @@ run_idxstats) from ..core.parallel import dispatch from ..core.seq import DNA, parse_fasta, write_fasta +>>>>>>>> main:src/seismicrna/align/write.py logger = getLogger(__name__) @@ -175,12 +184,18 @@ def fq_pipeline(fq_inp: FastqUnit, fq_cut = None # Align the FASTQ to the reference sequence using Bowtie2. xam_whole = path.build(*path.XAM_STEP_SEGS, +<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py + top=temp_dir, sample=sample, + cmd=CMD_ALIGN, step=path.STEP_ALIGN_MAP, + ref=refset, ext=path.BAM_EXT) +======== top=temp_dir, sample=sample, cmd=CMD_ALIGN, step=path.STEP_ALIGN_MAP, ref=refset, ext=path.BAM_EXT) +>>>>>>>> main:src/seismicrna/align/write.py reads_align = run_xamgen(fq_inp if fq_cut is None else fq_cut, xam_whole, index_pfx=bowtie2_index, @@ -323,8 +338,13 @@ def fq_pipeline(fq_inp: FastqUnit, logger.warning(f"Skipped references with fewer than {min_reads} reads: " f"{sorted(insufficient_refs)}") # Write a report to summarize the alignment. +<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py + report = AlignReport(sample=sample, + demultiplexed=fq_inp.ref is not None, +======== report = report_type(sample=sample, ref=fq_inp.ref, +>>>>>>>> main:src/seismicrna/align/write.py paired_end=fq_inp.paired, phred_enc=fq_inp.phred_enc, fastqc=fastqc, @@ -364,9 +384,13 @@ def fq_pipeline(fq_inp: FastqUnit, reads_trim=reads_trim, reads_align=reads_align, reads_filter=reads_filter, +<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py + reads_refs=reads_refs) +======== reads_refs=reads_refs, began=began, ended=ended) +>>>>>>>> main:src/seismicrna/align/write.py report.save(out_dir, overwrite=True) # Return a list of name-sorted XAM files, each of which contains a # set of reads that all align to the same reference. @@ -551,7 +575,11 @@ def check_fqs_xams(alignments: dict[tuple[str, str], FastqUnit], if xam_expect.is_file(): # If the XAM file already exists, then add it to the # dict of XAM files that have already been aligned. +<<<<<<<< HEAD:src/seismicrna/align/fq2xam.py + xams_existing.append(xam_expect) +======== xams_exist.append(xam_expect) +>>>>>>>> main:src/seismicrna/align/write.py break else: # If at least one XAM file for a FASTQ unit does not exist, diff --git a/src/seismicrna/cluster/__init__.py b/src/seismicrna/cluster/__init__.py new file mode 100644 index 00000000..970b8f9e --- /dev/null +++ b/src/seismicrna/cluster/__init__.py @@ -0,0 +1,22 @@ +from .main import cli, run, params + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. 
# +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/core/batch.py b/src/seismicrna/core/batch.py new file mode 100644 index 00000000..7e33ae53 --- /dev/null +++ b/src/seismicrna/core/batch.py @@ -0,0 +1,236 @@ +import re +from abc import ABC, abstractmethod +from functools import cached_property +from logging import getLogger + +import numpy as np +import pandas as pd + +from .output import PickleOutput +from .rel import REL_TYPE +from .sect import seq_pos_to_index +from .seq import DNA +from .types import fit_uint_type + +logger = getLogger(__name__) + +BATCH_INDEX = 0 +POS_INDEX = 1 +READ_INDEX = 0 + +READ_NUM = "Read Number" + + +def get_length(array: np.ndarray, what: str) -> int: + if array.ndim != 1: + raise ValueError(f"{what} must have 1 dimension, but got {array.ndim}") + length, = array.shape + return length + + +def ensure_same_length(arr1: np.ndarray, + arr2: np.ndarray, + what1: str, + what2: str): + if (len1 := get_length(arr1, what1)) != (len2 := get_length(arr2, what2)): + raise ValueError( + f"Lengths differ between {what1} ({len1}) and {what2} ({len2})") + return len1 + + +def ensure_order(read_nums: np.ndarray, + array1: np.ndarray, + array2: np.ndarray, + what1: str, + what2: str, + gt_eq: bool = False): + ensure_same_length(read_nums, array1, "read numbers", what1) + ensure_same_length(array1, array2, what1, what2) + ineq_func, ineq_sign = (np.less, '<') if gt_eq else (np.greater, '>') + if np.any(is_err := ineq_func(array1, array2)): + index = pd.Index(read_nums[is_err], name=READ_NUM) + errors = pd.DataFrame.from_dict({what1: pd.Series(array1[is_err], + index=index), + what2: pd.Series(array2[is_err], + index=index)}) + raise ValueError(f"Got {what1} {ineq_sign} {what2}:\n{errors}") + + +def sanitize_ends(read_nums: np.ndarray, + end5s: list[int] | np.ndarray, + mid5s: list[int] | np.ndarray, + mid3s: list[int] | np.ndarray, + end3s: list[int] | np.ndarray, + max_pos: int): + pos_dtype = fit_uint_type(max_pos) + end5s = np.asarray(end5s, pos_dtype) + mid5s = np.asarray(mid5s, pos_dtype) + mid3s = np.asarray(mid3s, pos_dtype) + end3s = np.asarray(end3s, pos_dtype) + # Verify 5' end ≥ min position + ensure_order(read_nums, + end5s, + np.broadcast_to(POS_INDEX, end5s.shape), + "5' end positions", + f"minimum position ({POS_INDEX})", + gt_eq=True) + # Verify 5' end ≤ 5' mid + ensure_order(read_nums, + end5s, + mid5s, + "5' end positions", + "5' middle positions") + # Verify 5' end ≤ 3' mid + ensure_order(read_nums, + end5s, + mid3s, + "5' end positions", + "3' middle positions") + # Verify 5' mid ≤ 3' end + ensure_order(read_nums, + mid5s, + end3s, + "5' middle positions", + "3' end positions") + # Verify 3' mid ≤ 3' end + ensure_order(read_nums, + mid3s, + end3s, + "3' middle positions", + "3' end positions") + # Verify 3' end ≤ max position + ensure_order(read_nums, + end3s, + np.broadcast_to(max_pos, end3s.shape), + "3' end positions", + f"maximum position ({max_pos})") + return end5s, mid5s, mid3s, end3s + + +class ReadBatch(PickleOutput, ABC): + """ """ + + @classmethod + def btype(cls): + btype, = re.match("^([a-z]*)batch$", 
cls.__name__.lower()).groups() + return btype + + def __init__(self, batch: int, sample: str, ref: str): + self.batch = batch + self.sample = sample + self.ref = ref + + @property + @abstractmethod + def num_reads(self) -> int: + """ Number of reads that are actually in use. """ + + @property + @abstractmethod + def max_read(self) -> int: + """ Maximum possible value for a read index. """ + + @property + def read_dtype(self): + """ Data type for read numbers. """ + return fit_uint_type(self.max_read) + + @cached_property + @abstractmethod + def read_nums(self) -> np.ndarray: + """ Read numbers in use. """ + + +class ArrayBatch(ReadBatch, ABC): + + def __init__(self, + *args, + max_pos: int, + end5s: list[int] | np.ndarray, + mid5s: list[int] | np.ndarray, + mid3s: list[int] | np.ndarray, + end3s: list[int] | np.ndarray, + **kwargs): + super().__init__(*args, **kwargs) + self.max_pos = max_pos + (self.end5s, + self.mid5s, + self.mid3s, + self.end3s) = sanitize_ends(self.read_nums, + end5s, + mid5s, + mid3s, + end3s, + self.max_pos) + + @cached_property + @abstractmethod + def pos(self) -> np.ndarray: + """ Array of 1-indexed positions. """ + + @property + @abstractmethod + def num_pos(self) -> int: + """ Number of positions that are actually in use. """ + + @property + def pos_dtype(self): + """ Data type for position numbers. """ + dtypes = list(set(array.dtype for array in (self.end5s, + self.mid5s, + self.mid3s, + self.end3s))) + if len(dtypes) != 1: + raise + + @cached_property + @abstractmethod + def array(self) -> np.ndarray: + """ Array of data in use. """ + + def get_index(self, refseq: DNA): + """ Positions and bases in use. """ + if len(refseq) != self.max_pos: + raise ValueError(f"Expected reference sequence of {self.max_pos} " + f"nt, but got {len(refseq)} nt") + return seq_pos_to_index(refseq, self.pos, POS_INDEX) + + def get_table(self, refseq: DNA): + """ Table of data in use. """ + return pd.DataFrame(self.array, + self.read_nums, + self.get_index(refseq), + copy=False) + + +class MutsBatch(ArrayBatch, ABC): + + def __init__(self, + *args, + muts: dict[int, dict[int, list[int] | np.ndarray]], + **kwargs): + super().__init__(*args, **kwargs) + self.muts = {pos: {REL_TYPE(rel): np.asarray(reads, self.read_dtype) + for rel, reads in muts.get(pos, dict()).items()} + for pos in self.pos} + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
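# Illustrative sketch (not part of the patch above). ensure_order() and
# sanitize_ends() in core/batch.py enforce the coordinate invariant
# POS_INDEX <= end5 <= mid5 <= end3, end5 <= mid3 <= end3, and
# end3 <= max_pos for every read. The standalone helper below is a
# hypothetical, simplified restatement of that check using only NumPy;
# the name check_read_coords is illustrative and not part of the module.
import numpy as np

def check_read_coords(end5s, mid5s, mid3s, end3s, max_pos, pos_index=1):
    """Return a boolean mask: True where a read's coordinates are ordered."""
    end5s, mid5s, mid3s, end3s = map(np.asarray, (end5s, mid5s, mid3s, end3s))
    return ((pos_index <= end5s) & (end5s <= mid5s) & (end5s <= mid3s)
            & (mid5s <= end3s) & (mid3s <= end3s) & (end3s <= max_pos))

# Example: the second read violates end5 <= mid5, so it fails the check.
print(check_read_coords([1, 5], [3, 2], [4, 6], [9, 9], max_pos=10))
# [ True False]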
# +# # +######################################################################## diff --git a/src/seismicrna/core/fasta.py b/src/seismicrna/core/fasta.py new file mode 100644 index 00000000..1d9c2d22 --- /dev/null +++ b/src/seismicrna/core/fasta.py @@ -0,0 +1,175 @@ +import re +from logging import getLogger +from os import linesep +from pathlib import Path +from typing import Iterable + +from .path import STR_CHARS +from .seq import Seq + +logger = getLogger(__name__) + +# FASTA name line format. +FASTA_NAME_MARK = '>' +FASTA_NAME_CHARS = STR_CHARS +FASTA_NAME_REGEX = re.compile(f"^{FASTA_NAME_MARK}([{FASTA_NAME_CHARS}]*)") + + +def extract_fasta_seqname(line: str): + """ Extract the name of a sequence from a line in FASTA format. """ + return mat.groups()[0] if (mat := FASTA_NAME_REGEX.match(line)) else None + + +def valid_fasta_seqname(line: str) -> str: + """ Get a valid sequence name from a line in FASTA format. """ + # Confirm that the line matches the pattern for name lines. + if (name := extract_fasta_seqname(line)) is None: + # If the pattern failed to match, then the line is misformatted. + raise ValueError(f"FASTA name line {repr(line)} is misformatted") + if not name: + # If pattern matched, then the name can still be blank. + raise ValueError(f"FASTA name line {repr(line)} has a blank name") + if name != line[len(FASTA_NAME_MARK):].rstrip(): + # If the name is not blank, then it can have illegal characters. + raise ValueError(f"FASTA name line {repr(line)} has illegal characters") + # If none of the above, then the line and the name are valid. + return name + + +def format_fasta_name_line(name: str): + return f"{FASTA_NAME_MARK}{name.rstrip()}{linesep}" + + +def format_fasta_seq_lines(seq: Seq, wrap: int = 0): + """ Format a sequence in a FASTA file so that each line has at most + `wrap` characters, or no limit if `wrap` is 0. """ + if wrap < 0: + raise ValueError(f"Wrap cannot be negative, but got {wrap}") + if wrap == 0 or wrap >= len(seq): + return f"{seq}{linesep}" + return "".join(f"{seq[start: start + wrap]}{linesep}" + for start in range(0, len(seq), wrap)) + + +def format_fasta_record(name: str, seq: Seq, wrap: int = 0): + return f"{format_fasta_name_line(name)}{format_fasta_seq_lines(seq, wrap)}" + + +def parse_fasta(fasta: Path, + seq_type: type[Seq] | None, + only: Iterable[str] | None = None): + names = set() + with open(fasta) as f: + line = f.readline() + # Read to the end of the file. + while line: + try: + # Determine the name of the current reference. + name = valid_fasta_seqname(line) + if name in names: + raise ValueError(f"Duplicate name '{name}' in {fasta}") + names.add(name) + except Exception as error: + # Determining the name failed. + logger.error(f"Failed to parse name of reference in {fasta}: " + f"{error}") + # Advance to the next line to prevent the current line + # from being read multiple times. + line = f.readline() + else: + # Advance to the next line to prevent the current line + # from being read multiple times. + line = f.readline() + if only is None or name in only: + # Yield this record if it is not the case that only + # some records have been selected or if the record + # is among those that have been selected. + if seq_type is None: + # In name-only mode, yield just the name of the + # reference. + yield name + else: + # Otherwise, read the lines until the current + # sequence ends, then assemble the lines. 
+ try: + segments = list() + while line and not line.startswith(FASTA_NAME_MARK): + segments.append(line.rstrip(linesep)) + line = f.readline() + seq = seq_type("".join(segments)) + logger.debug(f"Read {seq_type.__name__} '{name}' " + f"({len(seq)} nt) from {fasta}") + yield name, seq + except Exception as error: + logger.error(f"Failed to read sequence of '{name}' " + f"in {fasta}: {error}") + # Skip to the next name line if there is one, otherwise to + # the end of the file. Ignore blank lines. + while line and not line.startswith(FASTA_NAME_MARK): + line = f.readline() + + +def get_fasta_seq(fasta: Path, seq_type: type[Seq], name: str): + """ Get one sequence of a given name from a FASTA file. """ + if not isinstance(seq_type, type) and issubclass(seq_type, Seq): + raise TypeError("Expected seq_type to be subclass of Seq, but got " + f"{repr(seq_type)}") + try: + _, seq = next(iter(parse_fasta(fasta, seq_type, (name,)))) + except StopIteration: + pass + else: + return seq + raise ValueError(f"Sequence {repr(name)} was not found in {fasta}") + + +def write_fasta(fasta: Path, refs: Iterable[tuple[str, Seq]], + wrap: int = 0, overwrite: bool = False): + """ Write an iterable of reference names and DNA sequences to a + FASTA file. """ + logger.info(f"Began writing FASTA file: {fasta}") + # Record the names of all the references. + names = set() + with open(fasta, 'w' if overwrite else 'x') as f: + for name, seq in refs: + try: + # Confirm that the name is not blank. + if not name: + raise ValueError(f"Got blank reference name") + # Confirm there are no illegal characters in the name. + if illegal := set(name) - set(FASTA_NAME_CHARS): + raise ValueError(f"Reference name '{name}' has illegal " + f"characters: {illegal}") + # If there are two or more references with the same name, + # then the sequence of only the first is used. + if name in names: + raise ValueError(f"Duplicate reference name: '{name}'") + f.write(format_fasta_record(name, seq, wrap)) + logger.debug(f"Wrote reference '{name}' ({len(seq)} nt) " + f"to {fasta}") + names.add(name) + except Exception as error: + logger.error( + f"Failed to write reference '{name}' to {fasta}: {error}") + logger.info(f"Wrote {len(names)} sequences(s) to {fasta}") + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/core/output.py b/src/seismicrna/core/output.py new file mode 100644 index 00000000..59f197ce --- /dev/null +++ b/src/seismicrna/core/output.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from functools import cached_property +from logging import getLogger +from pathlib import Path +from typing import Any, Iterable, TypeVar + +from . 
import path +from .files import DEFAULT_BROTLI_LEVEL, load_pkl_br, save_pkl_br + +logger = getLogger(__name__) + + +AnyOutput = TypeVar("AnyOutput") + + +class Output(ABC): + """ Abstract base class for an output item. """ + + @classmethod + @abstractmethod + def load(cls: type[AnyOutput], file: Path) -> AnyOutput: + """ Load an object from a file. """ + + @classmethod + @abstractmethod + def dir_seg_types(cls) -> tuple[path.Segment, ...]: + """ Type(s) of the directory segment(s) in the path. """ + return tuple() + + @classmethod + @abstractmethod + def file_seg_type(cls) -> path.Segment: + """ Type of the last segment in the path. """ + + @classmethod + def seg_types(cls): + return cls.dir_seg_types() + (cls.file_seg_type(),) + + @classmethod + def auto_fields(cls) -> dict[str, Any]: + """ Fields that are filled automatically. """ + if len(exts := cls.file_seg_type().exts) != 1: + raise ValueError(f"Expected exactly one file extension, " + f"but got {cls.file_seg_type().exts}") + return {path.EXT: exts[0]} + + @classmethod + def build_path(cls, **path_fields): + """ Build the file path from the given fields. """ + return path.buildpar(*cls.seg_types(), + **(cls.auto_fields() | path_fields)) + + def path_fields(self, top: Path | None = None, exclude: Iterable[str] = ()): + """ Return the path fields as a dict. """ + fields = {path.TOP: top} if top else dict() + fields.update({field: (getattr(self, field) if hasattr(self, field) + else self.auto_fields()[field]) + for segment in self.seg_types() + for field in segment.field_types}) + for field in exclude: + fields.pop(field, None) + return fields + + def get_path(self, top: Path): + """ Return the file path. """ + return self.build_path(**self.path_fields(top)) + + @abstractmethod + def save(self, top: Path, **kwargs): + """ Save the object to a file. """ + + +class RefOutput(Output, ABC): + + @classmethod + def dir_seg_types(cls): + return super().dir_seg_types() + (path.SampSeg, + path.CmdSeg, + path.RefSeg) + + def __init__(self, sample: str, ref: str, *args, **kwargs): + super().__init__(*args, **kwargs) + self.sample = sample + self.ref = ref + + +class SectOutput(RefOutput, ABC): + + @classmethod + def dir_seg_types(cls): + return super().dir_seg_types() + (path.SectSeg,) + + def __init__(self, sect: str, *args, **kwargs): + super().__init__(*args, **kwargs) + self.sect = sect + + +class PickleOutput(Output, ABC): + + @classmethod + def load(cls, file: Path, checksum: str | None = None): + """ Load from a compressed pickle file. """ + return load_pkl_br(file, check_type=cls, checksum=checksum) + + def save(self, + top: Path, + brotli_level: int = DEFAULT_BROTLI_LEVEL, + overwrite: bool = False): + """ Save to a pickle file compressed with Brotli. """ + checksum = save_pkl_br(self, + save_path := self.get_path(top), + brotli_level=brotli_level, + overwrite=overwrite) + return save_path, checksum + + def __getstate__(self): + # Copy the __dict__ to avoid modifying this object's state. + state = self.__dict__.copy() + # Do not pickle cached properties. 
+ for name, value in list(state.items()): + if isinstance(value, cached_property): + state.pop(name) + return state + + def __setstate__(self, state: dict[str, Any]): + self.__dict__.update(state) diff --git a/src/seismicrna/core/qual.py b/src/seismicrna/core/qual.py new file mode 100644 index 00000000..c5d5d9dd --- /dev/null +++ b/src/seismicrna/core/qual.py @@ -0,0 +1,75 @@ +from .cli import opt_min_phred, opt_phred_enc + + +def encode_phred(phred_score: int, phred_encoding: int): + """ + Encode a numeric Phred quality score as an ASCII character. + + Parameters + ---------- + phred_score : int + The Phred score as an integer. + phred_encoding : int + The encoding offset for Phred scores. A Phred score is encoded + as the character whose ASCII value is the sum of the phred score + and the encoding offset. + + Returns + ------- + str + The character whose ASCII code, in the encoding scheme of the + FASTQ file, represents valid quality. + """ + return chr(phred_score + phred_encoding) + + +def decode_phred(quality_code: str, phred_encoding: int): + """ + Decode the ASCII character for a Phred quality score to an integer. + + Parameters + ---------- + quality_code : str + The Phred score encoded as an ASCII character. + phred_encoding : int + The encoding offset for Phred scores. A Phred score is encoded + as the character whose ASCII value is the sum of the phred score + and the encoding offset. + + Returns + ------- + str + The character whose ASCII code, in the encoding scheme of the + FASTQ file, represents valid quality. + """ + return ord(quality_code) - phred_encoding + + +# Default high, medium, and low quality codes. +LO_PHRED = 0 +OK_PHRED = opt_min_phred.default +HI_PHRED = 40 +LO_QUAL = encode_phred(LO_PHRED, opt_phred_enc.default) +OK_QUAL = encode_phred(OK_PHRED, opt_phred_enc.default) +HI_QUAL = encode_phred(HI_PHRED, opt_phred_enc.default) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/core/refseq.py b/src/seismicrna/core/refseq.py new file mode 100644 index 00000000..f769cc30 --- /dev/null +++ b/src/seismicrna/core/refseq.py @@ -0,0 +1,20 @@ +from functools import cached_property + +from . 
import path +from .output import PickleOutput, RefOutput +from .seq import DNA + + +class RefseqFile(PickleOutput, RefOutput): + + @classmethod + def file_seg_type(cls): + return path.RefseqFileSeg + + def __init__(self, *args, refseq: DNA, **kwargs): + super().__init__(*args, **kwargs) + self._s = refseq.compress() + + @cached_property + def refseq(self): + return self._s.decompress() diff --git a/src/seismicrna/core/temp.py b/src/seismicrna/core/temp.py new file mode 100644 index 00000000..ec9b88ab --- /dev/null +++ b/src/seismicrna/core/temp.py @@ -0,0 +1,108 @@ +import os +from functools import wraps +from logging import getLogger +from pathlib import Path +from shutil import rmtree +from typing import Callable + +logger = getLogger(__name__) + +# Lock directory (for function lock_output) +LOCK_DIR = ".seismic-rna_lock" + + +def lock_temp_dir(run: Callable): + @wraps(run) + def wrapper(*args, temp_dir: str | Path, save_temp: bool, **kwargs): + lock_error = (f"The directory {temp_dir} is currently being used by " + f"another instance of SEISMIC-RNA. If possible, please " + f"name a temporary directory that does not yet exist " + f"with '--temp-dir /path/to/new/temp-dir/'. If a former " + f"run of SEISMIC-RNA failed to unlock this directory, " + f"then please delete it with 'rm -r {temp_dir}'.") + temp_error = (f"The temporary directory {temp_dir} exists. If any " + f"needed files reside in {temp_dir}, then please " + f"specify a nonexistent temporary directory with " + f"'--temp-dir /new/temp/dir'. Otherwise, please delete " + f"the directory with 'rm -r {temp_dir}' and then rerun.") + # Determine whether the temporary directory and the lock exist. + lock = os.path.join(temp_dir, LOCK_DIR) + try: + os.mkdir(lock) + except FileExistsError: + # The lock already exists, which means another instance of + # SEISMIC-RNA is using this temporary directory. + logger.critical(lock_error) + raise SystemExit() + except FileNotFoundError: + # The temporary directory does not exist yet, so create it + # along with a lock. + try: + os.makedirs(lock, exist_ok=False) + except FileExistsError: + # If this error happens, it is due to a very unlikely + # race condition wherein another instance of SEISMIC-RNA + # raises a FileNotFoundError from os.mkdir(lock), then + # this instance of SEISMIC-RNA does the same, then the + # first run makes the directory with os.makedirs(lock), + # and then this run tries to do the same thing but fails + # because the directory was created moments before. + logger.critical(lock_error) + raise SystemExit() + temp_dir_existed_before = False + logger.debug(f"Created and locked temporary directory: {temp_dir}") + else: + # The temporary directory had existed, but the lock had not. + temp_dir_existed_before = True + logger.debug(f"Locked temporary directory: {temp_dir}") + # The lock now exists, so any other instance of SEISMIC-RNA that + # tries to use the same lock will exit before it can use the + # temporary directory or delete the lock. Thus, this run must + # delete the lock upon exiting, regardless of the circumstances. + try: + if temp_dir_existed_before and not save_temp: + logger.critical(temp_error) + raise SystemExit() + try: + # Run the wrapped function and return its result. + return run(*args, **kwargs, + temp_dir=temp_dir, + save_temp=save_temp) + finally: + # Delete the temporary directory unless the option to + # save it was enabled. 
+ if not save_temp: + rmtree(temp_dir, ignore_errors=True) + logger.debug(f"Deleted temporary directory: {temp_dir}") + finally: + # Always ensure that the temporary directory is unlocked + # upon exiting. + try: + os.rmdir(lock) + logger.debug(f"Unlocked temporary directory: {temp_dir}") + except FileNotFoundError: + pass + + # Return the decorator. + return wrapper + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/core/tests/fasta_test.py b/src/seismicrna/core/tests/fasta_test.py new file mode 100644 index 00000000..7a459e7a --- /dev/null +++ b/src/seismicrna/core/tests/fasta_test.py @@ -0,0 +1,299 @@ +import unittest as ut +from itertools import product +from logging import Filter, LogRecord +from os import linesep, remove +from pathlib import Path +from string import printable, whitespace +from tempfile import NamedTemporaryFile as NTFile + +from .. import path +from ..fasta import (FASTA_NAME_MARK, FASTA_NAME_CHARS, + valid_fasta_seqname, format_fasta_name_line, + format_fasta_record, format_fasta_seq_lines, + parse_fasta, write_fasta, logger as fasta_logger) +from ..seq import DNA, RNA + + +class TestValidFastaSeqname(ut.TestCase): + """ Test valid_fasta_seqname. """ + + def test_name_mark(self): + self.assertEqual(FASTA_NAME_MARK, '>') + + def test_valid(self): + """ Test that it works on valid name lines. """ + for name in FASTA_NAME_CHARS: + prefix = f"{FASTA_NAME_MARK}{name}" + for line in [prefix, f"{prefix}\n"]: + self.assertEqual(valid_fasta_seqname(line), name) + + def test_misformatted(self): + """ Test that it fails on misformatted lines. """ + for a, b in product(printable, repeat=2): + if a != FASTA_NAME_MARK: + prefix = f"{a}{b}" + for line in [prefix, f"{prefix}\n"]: + self.assertRaisesRegex(ValueError, "is misformatted", + valid_fasta_seqname, line) + + def test_illegal_prefix(self): + """ Test it fails on names starting with illegal characters. """ + prefixes = set(printable) - set(FASTA_NAME_CHARS) + for a, b in product(prefixes, FASTA_NAME_CHARS): + prefix = f"{FASTA_NAME_MARK}{a}{b}" + for line in [prefix, f"{prefix}\n"]: + self.assertRaisesRegex(ValueError, "has a blank name", + valid_fasta_seqname, line) + + def test_illegal_suffix(self): + """ Test it fails on names ending with illegal characters except + for trailing whitespace, which is simply ignored. """ + suffixes = set(printable) - (set(FASTA_NAME_CHARS) | set(whitespace)) + for a, b in product(FASTA_NAME_CHARS, suffixes): + prefix = f"{FASTA_NAME_MARK}{a}{b}" + for line in [prefix, f"{prefix}\n"]: + self.assertRaisesRegex(ValueError, "has illegal characters", + valid_fasta_seqname, line) + + def test_blank(self): + """ Test that it fails with blank names. 
""" + prefixes = [FASTA_NAME_MARK] + [f"{FASTA_NAME_MARK}{w}" + for w in whitespace] + for prefix in prefixes: + for line in [prefix, f"{prefix}\n"]: + self.assertRaisesRegex(ValueError, "has a blank name", + valid_fasta_seqname, line) + + +class TestFormat(ut.TestCase): + """ Test format_fasta_name_line and format_fasta_record. """ + + def test_format_fasta_name_line(self): + lines = { + "name": ">name\n", + "name ": ">name\n", + "name\n": ">name\n", + " name": "> name\n", + } + for name, line in lines.items(): + self.assertEqual(format_fasta_name_line(name), line) + + def test_format_fasta_seq_lines(self): + lines = { + (DNA("AGTC"), 0): "AGTC\n", + (DNA("AGTC"), 1): "A\nG\nT\nC\n", + (DNA("AGTC"), 2): "AG\nTC\n", + (DNA("AGTC"), 3): "AGT\nC\n", + (DNA("AGTC"), 4): "AGTC\n", + (DNA("AGTC"), 5): "AGTC\n", + } + for (seq, wrap), line in lines.items(): + self.assertEqual(format_fasta_seq_lines(seq, wrap), line) + + def test_format_fasta_record(self): + lines = { + ("name", DNA("ACGTN"), 0): ">name\nACGTN\n", + ("name", RNA("ACGUN"), 0): ">name\nACGUN\n", + ("name", DNA("ANCGTN"), 3): ">name\nANC\nGTN\n", + ("name", RNA("ANCGUN"), 2): ">name\nAN\nCG\nUN\n", + } + for (name, seq, wrap), line in lines.items(): + self.assertEqual(format_fasta_record(name, seq, wrap), line) + + +class TestParseFasta(ut.TestCase): + """ Test parse_fasta. """ + + def test_valid_names(self): + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GTACGTGNTCATC", + ">Seq2 ", "AGCTGTGNNT", "ATCG"])) + records = list(parse_fasta(filepath, None)) + self.assertEqual(records, ["Seq1", "Seq2"]) + remove(filepath) + + def test_valid_dna(self): + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GTACGTGNTCATC", + ">Seq2 ", "AGCTGTGNNT", "ATCG"])) + records = dict(parse_fasta(filepath, DNA)) + self.assertEqual(records, {"Seq1": DNA("GTACGTGNTCATC"), + "Seq2": DNA("AGCTGTGNNTATCG")}) + remove(filepath) + + def test_valid_rna(self): + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GUACGUGNUCAUC", + ">Seq2 ", "AGCUGUGNNU", "AUCG"])) + records = dict(parse_fasta(filepath, RNA)) + self.assertEqual(records, {"Seq1": RNA("GUACGUGNUCAUC"), + "Seq2": RNA("AGCUGUGNNUAUCG")}) + remove(filepath) + + def test_valid_empty(self): + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([])) + records = dict(parse_fasta(filepath, DNA)) + self.assertEqual(records, {}) + remove(filepath) + + def test_valid_blank_line(self): + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GTACGTGNTCATC", linesep, linesep, + ">Seq2 ", linesep, "AGCTGTGNNT", linesep, + "ATCG"])) + records = dict(parse_fasta(filepath, DNA)) + self.assertEqual(records, {"Seq1": DNA("GTACGTGNTCATC"), + "Seq2": DNA("AGCTGTGNNTATCG")}) + remove(filepath) + + def test_duplicate_name(self): + + class DupErrFilter(Filter): + """ Suppress errors about duplicate names. """ + + def filter(self, rec: LogRecord): + return "Duplicate name" not in rec.msg + + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GTACGTGNTCATC", + ">Seq1 ", "AGCTGTGNNT", "ATCG"])) + # Temporarily ignore errors about duplicate names in the FASTA. 
+ fasta_logger.addFilter(dup_filter := DupErrFilter()) + try: + records = dict(parse_fasta(filepath, DNA)) + finally: + fasta_logger.removeFilter(dup_filter) + remove(filepath) + self.assertEqual(records, {"Seq1": DNA("GTACGTGNTCATC")}) + + def test_invalid_name(self): + + class NameErrFilter(Filter): + """ Suppress errors about invalid names. """ + + def filter(self, rec: LogRecord): + return "Failed to parse name of reference" not in rec.msg + + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GTACGTGNTCATC", + ">Seq2|", "AGCTGTGNNT", "ATCG"])) + # Temporarily ignore errors about duplicate names in the FASTA. + fasta_logger.addFilter(name_filter := NameErrFilter()) + try: + records = dict(parse_fasta(filepath, DNA)) + finally: + fasta_logger.removeFilter(name_filter) + remove(filepath) + self.assertEqual(records, {"Seq1": DNA("GTACGTGNTCATC")}) + + def test_invalid_seq(self): + + class SeqErrFilter(Filter): + """ Suppress errors about invalid sequences. """ + + def filter(self, rec: LogRecord): + return "Failed to read sequence" not in rec.msg + + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + f.write(linesep.join([">Seq1", "GTACGTGNTCATC ", + ">Seq2", "AGCTGTGNNT", "ATCG"])) + # Temporarily ignore errors about duplicate names in the FASTA. + fasta_logger.addFilter(seq_filter := SeqErrFilter()) + try: + records = dict(parse_fasta(filepath, DNA)) + finally: + fasta_logger.removeFilter(seq_filter) + remove(filepath) + self.assertEqual(records, {"Seq2": DNA("AGCTGTGNNTATCG")}) + + +class TestWriteFasta(ut.TestCase): + """ Test write_fasta. """ + + def test_valid_names(self): + seqs = [("Seq1", DNA("GTACGTGNTCATC")), + ("Seq2", DNA("AGCTGTGNNTATCG"))] + text = ">Seq1\nGTACGTGNTCATC\n>Seq2\nAGCTGTGNNTATCG\n" + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=True) as f: + filepath = Path(f.file.name) + write_fasta(filepath, seqs) + with open(filepath) as f: + self.assertEqual(f.read(), text) + remove(filepath) + + def test_overwrite(self): + seqs = [("Seq1", DNA("GTACGTGNTCATC")), + ("Seq2", DNA("AGCTGTGNNTATCG"))] + text = ">Seq1\nGTACGTGNTCATC\n>Seq2\nAGCTGTGNNTATCG\n" + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + write_fasta(filepath, seqs, overwrite=True) + with open(filepath) as f: + self.assertEqual(f.read(), text) + remove(filepath) + + def test_no_overwrite(self): + seqs = [("Seq1", DNA("GTACGTGNTCATC")), + ("Seq2", DNA("AGCTGTGNNTATCG"))] + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=False) as f: + filepath = Path(f.file.name) + self.assertRaisesRegex(FileExistsError, str(filepath), + write_fasta, filepath, seqs) + remove(filepath) + + def test_invalid_name(self): + seqs = [("", DNA("GTACGTGNTCATC")), + ("Seq1 ", DNA("GACGTACTGTACGT")), + (" Seq1", DNA("GACGTACTGTACGT")), + ("Seq1", DNA("GTACGTGNTCATC")), + ("Seq2", DNA("AGCTGTGNNTATCG")), + ("Seq1", DNA("ACGATGTATGTA"))] + text = ">Seq1\nGTACGTGNTCATC\n>Seq2\nAGCTGTGNNTATCG\n" + + class NameErrFilter(Filter): + """ Suppress errors about invalid names. """ + + def filter(self, rec: LogRecord): + return "Failed to write reference" not in rec.msg + + # Temporarily ignore errors about invalid names. 
+ fasta_logger.addFilter(name_filter := NameErrFilter()) + try: + with NTFile('w', suffix=path.FASTA_EXTS[0], delete=True) as f: + filepath = Path(f.file.name) + write_fasta(filepath, seqs) + with open(filepath) as f: + self.assertEqual(f.read(), text) + finally: + fasta_logger.removeFilter(name_filter) + remove(filepath) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/core/tests/qual_test.py b/src/seismicrna/core/tests/qual_test.py new file mode 100644 index 00000000..235c6e7d --- /dev/null +++ b/src/seismicrna/core/tests/qual_test.py @@ -0,0 +1,58 @@ +""" + +Tests for the Quality Core Module + +======================================================================== + +""" + +import unittest as ut + +from ..qual import (LO_QUAL, OK_QUAL, HI_QUAL, LO_PHRED, OK_PHRED, HI_PHRED, + decode_phred, encode_phred) + + +class TestConstants(ut.TestCase): + + def test_quals(self): + self.assertLess(LO_QUAL, OK_QUAL) + self.assertLess(OK_QUAL, HI_QUAL) + + def test_phreds(self): + self.assertLess(LO_PHRED, OK_PHRED) + self.assertLess(OK_PHRED, HI_PHRED) + + +class TestDecode(ut.TestCase): + + def test_decode(self): + self.assertEqual(decode_phred('I', 33), 40) + self.assertEqual(decode_phred('!', 33), 0) + + +class TestEncode(ut.TestCase): + + def test_encode(self): + self.assertEqual(encode_phred(40, 33), 'I') + self.assertEqual(encode_phred(0, 33), '!') + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
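# Illustrative sketch (not part of the patch above). encode_phred() and
# decode_phred() in core/qual.py shift between Phred scores and ASCII
# characters by the encoding offset (33 in the tests above). The helper
# below is hypothetical; it shows the same Phred+33 convention together
# with the standard error-probability relation p = 10 ** (-Q / 10).
def quality_string_to_probs(quals: str, phred_enc: int = 33):
    """Decode an ASCII quality string into base-call error probabilities."""
    return [10 ** (-(ord(char) - phred_enc) / 10) for char in quals]

# 'I' -> Q40 (p = 1e-4), '#' -> Q2 (p ~ 0.63), '!' -> Q0 (p = 1).
print(quality_string_to_probs("I#!"))
# [0.0001, 0.6309573444801932, 1.0]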
# +# # +######################################################################## diff --git a/src/seismicrna/core/tests/temp_test.py b/src/seismicrna/core/tests/temp_test.py new file mode 100644 index 00000000..f32d9260 --- /dev/null +++ b/src/seismicrna/core/tests/temp_test.py @@ -0,0 +1,183 @@ +import unittest as ut +from logging import Filter, LogRecord +from pathlib import Path +from tempfile import mkdtemp + +from ..temp import LOCK_DIR, lock_temp_dir, logger as temp_logger + + +def get_lock(temp_dir: Path): + return temp_dir.joinpath(LOCK_DIR) + + +def lock(temp_dir: Path): + lock_dir = get_lock(temp_dir) + lock_dir.mkdir(parents=False, exist_ok=False) + return lock_dir + + +def unlock(temp_dir: Path): + try: + get_lock(temp_dir).rmdir() + except FileNotFoundError: + pass + + +def make_temp(*args, **kwargs): + """ Make a new temporary directory with a random path. """ + return Path(mkdtemp(*args, **kwargs)) + + +def rm_temp(temp_dir: Path): + """ Remove a temporary directory, if it exists. """ + unlock(temp_dir) + try: + temp_dir.rmdir() + except FileNotFoundError: + return False + return True + + +def name_temp(*args, **kwargs): + """ Get the path of a temporary directory that does not exist. """ + temp_dir = make_temp(*args, **kwargs) + rm_temp(temp_dir) + return temp_dir + + +def make_lock_temp(*args, **kwargs): + """ Make and lock a new temporary directory. """ + temp_dir = make_temp(*args, **kwargs) + return temp_dir, lock(temp_dir) + + +def name_lock_temp(*args, **kwargs): + temp_dir = name_temp(*args, **kwargs) + return temp_dir, get_lock(temp_dir) + + +@lock_temp_dir +def run_func(*_, temp_dir: Path, save_temp: bool, **__): + """ Placeholder for run() function. """ + return save_temp, temp_dir.is_dir(), get_lock(temp_dir).is_dir() + + +class TestLockTempDir(ut.TestCase): + """ Test decorator `lock_temp_dir`. """ + + class LockErrFilter(Filter): + + def filter(self, record: LogRecord): + return "currently being used" not in record.msg + + class TempErrFilter(Filter): + + def filter(self, record: LogRecord): + return "If any needed files" not in record.msg + + def test_wraps(self): + self.assertEqual(run_func.__name__, "run_func") + self.assertEqual(run_func.__doc__, " Placeholder for run() function. ") + + def test_new_save_temp(self): + temp_dir, lock_dir = name_lock_temp() + try: + # The directory should not exist initially. + self.assertFalse(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + # The directory should be created by lock_temp_dir(). + self.assertEqual(run_func(temp_dir=temp_dir, save_temp=True), + (True, True, True)) + # The directory should not be deleted by lock_temp_dir(). + self.assertTrue(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + finally: + rm_temp(temp_dir) + + def test_new_erase_temp(self): + temp_dir, lock_dir = name_lock_temp() + try: + # The directory should not exist initially. + self.assertFalse(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + # The directory should be created by lock_temp_dir(). + self.assertEqual(run_func(temp_dir=temp_dir, save_temp=False), + (False, True, True)) + # The directory should be deleted by lock_temp_dir(). + self.assertFalse(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + finally: + rm_temp(temp_dir) + + def test_exists_save_temp(self): + temp_dir = make_temp() + lock_dir = get_lock(temp_dir) + try: + # The directory should exist initially. + self.assertTrue(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + # The function should run normally. 
+ self.assertEqual(run_func(temp_dir=temp_dir, save_temp=True), + (True, True, True)) + # The directory should not be deleted by lock_temp_dir(). + self.assertTrue(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + finally: + rm_temp(temp_dir) + + def test_exists_erase_temp(self): + temp_dir = make_temp() + lock_dir = get_lock(temp_dir) + temp_logger.addFilter(temp_err := self.TempErrFilter()) + try: + # The directory should exist initially. + self.assertTrue(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + # The function should fail. + self.assertRaises(SystemExit, run_func, + temp_dir=temp_dir, save_temp=False) + # The directory should not be deleted by lock_temp_dir. + self.assertTrue(temp_dir.is_dir()) + self.assertFalse(lock_dir.is_dir()) + finally: + temp_logger.removeFilter(temp_err) + rm_temp(temp_dir) + + def test_locked(self): + temp_dir, lock_dir = make_lock_temp() + temp_logger.addFilter(lock_err := self.LockErrFilter()) + try: + for save_temp in (True, False): + # The directory should exist initially. + self.assertTrue(temp_dir.is_dir()) + self.assertTrue(lock_dir.is_dir()) + # The function should fail. + self.assertRaises(SystemExit, run_func, + temp_dir=temp_dir, save_temp=save_temp) + # The directory should not be deleted by lock_temp_dir. + self.assertTrue(temp_dir.is_dir()) + self.assertTrue(lock_dir.is_dir()) + finally: + temp_logger.removeFilter(lock_err) + rm_temp(temp_dir) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
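# Illustrative sketch (not part of the patch above). lock_temp_dir() and
# the tests above rely on the atomicity of directory creation: os.mkdir
# either succeeds (this process now owns the lock) or raises
# FileExistsError (another process does). The snippet below is a
# hypothetical, minimal version of that acquire/work/release cycle;
# the paths are examples only.
import os
import shutil
import tempfile

temp_dir = os.path.join(tempfile.gettempdir(), "seismic-demo-temp")
lock_dir = os.path.join(temp_dir, ".seismic-rna_lock")
os.makedirs(lock_dir, exist_ok=False)   # acquire: fails if already locked
try:
    pass                                # ... work inside temp_dir ...
finally:
    os.rmdir(lock_dir)                  # always release the lock
    shutil.rmtree(temp_dir, ignore_errors=True)  # like save_temp=False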
# +# # +######################################################################## diff --git a/src/seismicrna/demult/demultiplex.py b/src/seismicrna/demult/demultiplex.py index 9070751e..b312ec4a 100644 --- a/src/seismicrna/demult/demultiplex.py +++ b/src/seismicrna/demult/demultiplex.py @@ -1040,18 +1040,16 @@ def create_report(sequence_objects: dict, fq1: str, fq2: str, working_directory: def demultiplex_run(sections_file_csv, demulti_workspace, report_folder, fq_unit: FastqUnit, fasta, barcode_start=0, barcode_length=0, split: int = 10, clipped: int = 0, rev_clipped: int = 0, index_tolerance: int = 0, parallel: bool = False, mismatch_tolerence: int = 0, overwrite: bool = False): - - sample_name = fq_unit.sample mixed_fastq1, mixed_fastq2 = (fq_unit.paths.values()) # only works if the FASTQ has paired-end reads in two separate files mixed_fastq1=str(mixed_fastq1) mixed_fastq2=str(mixed_fastq2) - #report_folder+="" + report_folder+="/" """ makes dictionary of sequence objects """ - temp_ws = demulti_workspace + "/" + sample_name + "_demultiplex_folders_and_files/" + temp_ws = report_folder + "/" + sample_name + "_demultiplex_folders_and_files/" # final_sample_folder=temp_ws+"sample_fqs/" # print(temp_ws) @@ -1141,7 +1139,7 @@ def demultiplex_run(sections_file_csv, demulti_workspace, report_folder, fq_unit print("creating report!!!") create_report(sequence_objects, mixed_fastq1, mixed_fastq2, report_folder, unioned_sets_dictionary) - return (), (), (report_folder + sample_name,) + return (), (), (report_folder + sample_name + "/",) ######################################################################## # # diff --git a/src/seismicrna/relate/ambrel.py b/src/seismicrna/relate/ambrel.py new file mode 100644 index 00000000..e06ab643 --- /dev/null +++ b/src/seismicrna/relate/ambrel.py @@ -0,0 +1,469 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from .encode import encode_relate +from .error import RelateValueError +from ..core.rel import DELET, INS_5, INS_3, SUB_N +from ..core.seq import DNA + +# Minimum distance between an insertion and a deletion +MIN_INDEL_DIST = 2 + + +class Indel(ABC): + """ + Base class for an Insertion or Deletion (collectively, "indel") + It is used to find alternative positions for indels by keeping track + of an indel's current coordinates (as it is moved) and determining + whether a specific move is valid. + + Parameters + ---------- + + rel_ins_pos: int + The 0-indexed position of the indel with respect to the sequence + (ref or read) with the relative insertion. This position points + to one specific base. If the mutation is labeled an insertion, + then the read is the sequence with the relative insertion (since + it has a base that is not in the reference), and rel_ins_pos is + the 0-based index of the inserted base in the coordinates of the + read sequence. If the mutation is labeled a deletion, then the + reference is the sequence with the relative insertion (since it + has a base that is not in the read), and rel_ins_pos is the + 0-based index of the deleted base in the coordinates of the + reference sequence. + rel_del_pos (int): + The opposite of rel_ins_pos: the 0-indexed position of the indel + with respect to the sequence with a relative deletion (that is, + the read if the mutation is denoted a deletion, and the ref if + an insertion). 
Because the deleted base does not actually exist + in the sequence whose coordinates it is based on, rel_del_pos + does not refer to a specific position in the sequence, rather to + the two extant positions in the sequence that flank the deleted + position. It is most convenient for the algorithm to have this + argument refer to the position 3' of the deleted base and define + the 5' position as a property. + + """ + + __slots__ = "_ins_pos", "_ins_init", "_del_pos", "_del_init", "_tunneled" + + def __init__(self, rel_ins_pos: int, rel_del_pos: int): + self._ins_pos = rel_ins_pos + self._ins_init = rel_ins_pos + self._del_pos = rel_del_pos + self._del_init = rel_del_pos + self._tunneled = False + + @property + def ins_pos(self): + return self._ins_pos + + @property + def del_pos5(self): + return self._del_pos - 1 + + @property + def del_pos3(self): + return self._del_pos + + @property + def tunneled(self): + return self._tunneled + + @property + @abstractmethod + def rank(self) -> int: + """ Rank of the indel. """ + + def reset(self): + """ Reset the indel to its initial position, and erase its + history of tunneling. """ + self._ins_pos = self._ins_init + self._del_pos = self._del_init + self._tunneled = False + + @staticmethod + def _get_indel_by_pos(indels: list[Indel], ins_pos: int): + for indel in indels: + if indel.ins_pos == ins_pos: + return indel + + def _peek_out_of_indel(self, indels: list[Indel], from3to5: bool): + increment = -1 if from3to5 else 1 + ins_pos = self.ins_pos + increment + tunneled_indels: list[Indel] = list() + while indel := (self._get_indel_by_pos(indels, ins_pos)): + ins_pos += increment + tunneled_indels.append(indel) + self._tunneled = bool(tunneled_indels) + return ins_pos, tunneled_indels + + @staticmethod + def _collision(other: Indel, swap_pos: int): + return MIN_INDEL_DIST > (min(abs(swap_pos - other.del_pos5), + abs(swap_pos - other.del_pos3))) + + @classmethod + def _collisions(cls, indels: list[Indel], swap_pos: int): + return any(cls._collision(indel, swap_pos) for indel in indels) + + def step_del_pos(self, swap_pos: int): + # Move the indel's position (self._ins_pos) to swap_pos. + # Move self._del_pos one step in the same direction. + if swap_pos == self.ins_pos: + raise RelateValueError(f"swap ({swap_pos}) = ins ({self.ins_pos})") + self._del_pos += 1 if swap_pos > self.ins_pos else -1 + + def _step(self, swap_pos: int): + self.step_del_pos(swap_pos) + self._ins_pos = swap_pos + + @staticmethod + def _consistent_rels(curr_rel: int, swap_rel: int): + if curr_rel & swap_rel or (curr_rel & SUB_N and swap_rel & SUB_N): + # Relationship of the reference and read base (curr_rel) and + # relationship of the reference and swap base (swap_rel) are + # consistent, meaning one of the following is true: + # - both match the reference + # - one matches and the other maybe matches (i.e. low qual) + # - one is a substitution and the other could also be + # - both are substitutions + return curr_rel + # Otherwise, e.g. if one base matches and the other is a + # substitution, then the relationships are not consistent. + return 0 + + @classmethod + @abstractmethod + def _encode_swap(cls, *args, **kwargs) -> bool: + """ Encode a swap. """ + + @abstractmethod + def _try_swap(self, *args, **kwargs) -> bool: + """ Perform a swap if possible. 
""" + + def sweep(self, + muts: dict[int, int], + ref: DNA, + read: DNA, + qual: str, + min_qual: str, + dels: list[Deletion], + inns: list[Insertion], + from3to5: bool, + tunnel: bool): + # Move the indel as far as possible in the 5' or 3' direction. + while self._try_swap(muts, + ref, + read, + qual, + min_qual, + dels, + inns, + from3to5, + tunnel): + # All actions happen in _try_swap, so loop body is empty. + pass + + +class Deletion(Indel): + @property + def rank(self): + return self._ins_pos + + @classmethod + def _encode_swap(cls, + ref_base: str, + swap_base: str, + read_base: str, + read_qual: str, + min_qual: str): + curr_rel = encode_relate(ref_base, read_base, read_qual, min_qual) + swap_rel = encode_relate(swap_base, read_base, read_qual, min_qual) + return cls._consistent_rels(curr_rel, swap_rel) + + def _swap(self, muts: dict[int, int], swap_pos: int, relation: int): + """ + Parameters + ---------- + muts: dict + Mutation vector + swap_pos: int + Position in the reference to which the deletion moves during + this swap + relation: int + Relationship (match, sub, etc.) between the base located at + swap_pos and the base in the read + + """ + # The base at swap_pos moves to self.ins_pos, so after the swap, + # the relationship between self.ins_pos and the read base will + # be relation. + muts[self.ins_pos] |= relation + # The base at self.ref_pos is a deletion (by definition), so + # mark the position it moves to (swap_pos) as a deletion, too. + muts[swap_pos] |= DELET + self._step(swap_pos) + + def _try_swap(self, + muts: dict[int, int], + refseq: DNA, + read: DNA, + qual: str, + min_qual: str, + dels: list[Deletion], + inns: list[Insertion], + from3to5: bool, + tunnel: bool): + swap_pos, tunneled_indels = self._peek_out_of_indel(dels, from3to5) + read_pos = self.del_pos5 if from3to5 else self.del_pos3 + if (1 < swap_pos < len(refseq) and 1 < read_pos < len(read) + and (tunnel or not self.tunneled) + and not self._collisions(inns, swap_pos)): + relation = self._encode_swap(refseq[self.ins_pos - 1], + refseq[swap_pos - 1], + read[read_pos - 1], + qual[read_pos - 1], + min_qual) + if relation: + self._swap(muts, swap_pos, relation) + for indel in tunneled_indels: + indel.step_del_pos(swap_pos) + return True + return False + + +class Insertion(Indel): + @property + def rank(self): + return self._del_pos + + def stamp(self, muts: dict[int, int], reflen: int): + """ Stamp the relation vector with a 5' and a 3' insertion. """ + if 1 <= self.del_pos5 <= reflen: + muts[self.del_pos5] |= INS_5 + if 1 <= self.del_pos3 <= reflen: + muts[self.del_pos3] |= INS_3 + + @classmethod + def _encode_swap(cls, + ref_base: str, + read_base: str, + read_qual: str, + swap_base: str, + swap_qual: str, + min_qual: str): + curr_rel = encode_relate(ref_base, read_base, read_qual, min_qual) + swap_rel = encode_relate(ref_base, swap_base, swap_qual, min_qual) + return cls._consistent_rels(curr_rel, swap_rel) + + def _swap(self, + muts: dict[int, int], + ref_pos: int, + swap_pos: int, + relation: int, + reflen: int): + """ + Parameters + ---------- + muts: dict + Mutations + swap_pos: int + Position in the read to which the deletion is swapped + relation: int + Relationship (match, sub, etc.) between the base located at + swap_pos and the base in the ref + reflen: int + Length of the reference sequence. + + """ + # The base at ref_pos moves to swap_pos, so after the swap, the + # relationship between ref_pos and the read base is relation. 
+ muts[ref_pos] |= relation + self._step(swap_pos) + # Mark the new positions of the insertion. + self.stamp(muts, reflen) + + def _try_swap(self, + muts: dict[int, int], + refseq: DNA, + read: DNA, + qual: str, + min_qual: str, + dels: list[Deletion], + inns: list[Insertion], + from3to5: bool, + tunnel: bool): + swap_pos, tunneled_indels = self._peek_out_of_indel(inns, from3to5) + ref_pos = self.del_pos5 if from3to5 else self.del_pos3 + if (1 < swap_pos < len(read) and 1 < ref_pos < len(refseq) + and (tunnel or not self.tunneled) + and not self._collisions(dels, swap_pos)): + relation = self._encode_swap(refseq[ref_pos - 1], + read[self.ins_pos - 1], + qual[self.ins_pos - 1], + read[swap_pos - 1], + qual[swap_pos - 1], + min_qual) + if relation: + self._swap(muts, ref_pos, swap_pos, relation, len(refseq)) + for indel in tunneled_indels: + indel.step_del_pos(swap_pos) + return True + return False + + +def sweep_indels(muts: dict[int, int], + refseq: DNA, + read: DNA, + qual: str, + min_qual: str, + dels: list[Deletion], + inns: list[Insertion], + from3to5: bool, + tunnel: bool): + """ + For every insertion and deletion, + + Parameters + ---------- + muts: dict + Mutations + refseq: bytes + Reference sequence + read: bytes + Sequence of the read + qual: bytes + Phred quality scores of the read, encoded as ASCII characters + min_qual: int + The minimum Phred quality score needed to consider a base call + informative: integer value of the ASCII character + dels: list[Deletion] + List of deletions identified by `vectorize_read` + inns: list[Insertion] + List of insertions identified by `vectorize_read` + from3to5: bool + Whether to move indels in the 3' -> 5' direction (True) or the + 5' -> 3' direction (False) + tunnel: bool + Whether to allow tunneling + + """ + # Collect all indels into one list. + indels: list[Indel] = list() + indels.extend(dels) + indels.extend(inns) + # Reset each indel to its initial state. This operation does nothing + # the first time sweep_indels is called because all indels start in + # their initial state (by definition). But the indels may move when + # this function runs, so resetting is necessary at the beginning of + # the second and subsequent calls to sweep_indels to ensure that the + # algorithm starts from the initial state every time. + for indel in indels: + indel.reset() + # Sort the indels by their rank. + sort_rev = from3to5 != tunnel + indels.sort(key=lambda idl: idl.rank, reverse=sort_rev) + while indels: + indel = indels.pop() + indel.sweep(muts, + refseq, + read, + qual, + min_qual, + dels, + inns, + from3to5, + tunnel) + idx = len(indels) + if sort_rev: + while idx > 0 and indel.rank > indels[idx - 1].rank: + idx -= 1 + else: + while idx > 0 and indel.rank < indels[idx - 1].rank: + idx -= 1 + if idx < len(indels): + indels.insert(idx, indel) + + +def find_ambrels(muts: dict[int, int], + refseq: DNA, + read: DNA, + qual: str, + min_qual: str, + dels: list[Deletion], + inns: list[Insertion]): + """ + Find and label all positions in the vector that are ambiguous due to + insertions and deletions. 
+ + Parameters + ---------- + muts: dict + Mutations + refseq: DNA + Reference sequence + read: DNA + Sequence of the read + qual: str + Phred quality scores of the read, encoded as ASCII characters + min_qual: str + The minimum Phred quality score needed to consider a base call + informative: integer value of the ASCII character + dels: list[Deletion] + List of deletions identified by `vectorize_read` + inns: list[Insertion] + List of insertions identified by `vectorize_read` + + """ + # Each indel might be able to be moved in the 5' -> 3' direction + # (from3to5 is False) or 3' -> 5' direction (from3to5 is True). + # Test both directions. + for from3to5 in (False, True): + # For each indel, try to move it as far as it can go in the + # direction indicated by from3to5. Allow tunneling so that any + # runs of consecutive insertions or consecutive deletions can + # effectively move together. + sweep_indels(muts, + refseq, + read, + qual, + min_qual, + dels, + inns, + from3to5, + tunnel=True) + if any(d.tunneled for d in dels) or any(i.tunneled for i in inns): + # If any indel tunneled, + sweep_indels(muts, + refseq, + read, + qual, + min_qual, + dels, + inns, + from3to5, + tunnel=False) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/blank.py b/src/seismicrna/relate/blank.py new file mode 100644 index 00000000..dba23012 --- /dev/null +++ b/src/seismicrna/relate/blank.py @@ -0,0 +1,73 @@ +import numpy as np +import pandas as pd + +from ..core.rel import NOCOV, REL_TYPE +from ..core.sect import seq_pos_to_index +from ..core.seq import DNA + + +def blank_relvec(bases: int | DNA, + reads: int | list | np.ndarray | pd.Index | None = None): + """ + Return blank relation vector(s) of a given length. + + Parameters + ---------- + bases: int | DNA + The reference sequence (if DNA) or just its length (if int). + If the sequence itself is given, then return a Series/DataFrame + whose main index (index for Series, columns for DataFrame) is a + MultiIndex of 1-indexed positions and bases in the sequence. + If only the sequence length is given, then return a NumPy array. + reads: int | list | np.ndarray | pd.Index | None = None + Optional names of the relation vectors. If given, then return a + DataFrame whose indexes are the read names if bases is DNA, + otherwise a 2D NumPy array with one row for each name in reads. + If None, then return a Series if bases is DNA, otherwise a 1D + NumPy array. + + Returns + ------- + np.ndarray | pd.Series | pd.DataFrame + The blank relation vector(s). + """ + # Determine whether to return a Pandas or NumPy object. + if isinstance(bases, DNA): + # Make a Series or DataFrame with the sequence as its main axis. 
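        # Added commentary (illustrative, mirroring the unit tests):
        #     blank_relvec(DNA("ACGT"))        -> Series of NOCOV over a
        #                                         (position, base) index
        #     blank_relvec(DNA("ACGT"), 2)     -> DataFrame with rows
        #                                         "Read_1" and "Read_2"
        #     blank_relvec(4)                  -> 1D array of 4 x NOCOV
        #     blank_relvec(4, ["a", "b"])      -> 2D array, shape (2, 4)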
+ sequence = seq_pos_to_index(bases, np.arange(1, len(bases) + 1), 1) + if reads is None: + # Return a Series representing just one relation vector. + return pd.Series(NOCOV, index=sequence, dtype=REL_TYPE) + # Ensure that names is a sequence of read names as str objects. + if isinstance(reads, int): + names = [f"Read_{i}" for i in range(1, reads + 1)] + else: + names = list(map(str, reads)) + # Return a DataFrame with one row per relation vector. + return pd.DataFrame(NOCOV, index=names, columns=sequence, dtype=REL_TYPE) + # Determine the size of the NumPy array. + size = (bases if reads is None + else ((reads, bases) if isinstance(reads, int) + else (len(reads), bases))) + return np.full(size, fill_value=NOCOV, dtype=REL_TYPE) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/cigar.py b/src/seismicrna/relate/cigar.py new file mode 100644 index 00000000..090e2c1a --- /dev/null +++ b/src/seismicrna/relate/cigar.py @@ -0,0 +1,105 @@ +import re + + +# CIGAR string operation codes +CIG_ALIGN = 'M' # alignment match +CIG_MATCH = '=' # sequence match +CIG_SUBST = 'X' # substitution +CIG_DELET = 'D' # deletion +CIG_INSRT = 'I' # insertion +CIG_SCLIP = 'S' # soft clipping + +# Regular expression pattern that matches a single CIGAR operation +# (length ≥ 1 and operation code, defined above) +CIG_PATTERN = re.compile("".join([r"(\d+)([", + CIG_ALIGN, + CIG_MATCH, + CIG_SUBST, + CIG_DELET, + CIG_INSRT, + CIG_SCLIP, + "])"])) + + +def parse_cigar(cigar_string: str): + """ + Yield the fields of a CIGAR string as pairs of (operation, length), + where operation is 1 byte indicating the CIGAR operation and length + is a positive integer indicating the number of bases from the read + that the operation consumes. Note that in the CIGAR string itself, + each length precedes its corresponding operation. + + Parameters + ---------- + cigar_string: bytes + CIGAR string from a SAM file. For full documentation, refer to + https://samtools.github.io/hts-specs/ + Yield + ----- + bytes (length = 1) + Current CIGAR operation + int (≥ 1) + Length of current CIGAR operation + """ + # Length-0 CIGAR strings are forbidden. + if not cigar_string: + raise ValueError("CIGAR string is empty") + # If the CIGAR string has any invalid bytes (e.g. an unrecognized + # operation byte, an operation longer than 1 byte, a length that is + # not a positive integer, or any extraneous characters), then the + # regular expression parser will simply skip these invalid bytes. + # In order to catch such problems, keep track of the number of + # bytes matched from the CIGAR string. 
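    # (Added illustrative note: in an invalid string such as "3=x1D",
    # the regular expression would match only "3=" and "1D", i.e. 4 of
    # the 5 characters, so the length check below would fail.)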
After reading the CIGAR, if + # the number of bytes matched is smaller than the length of the + # CIGAR string, then some bytes must have been skipped, indicating + # that the CIGAR string contained at least one invalid byte. + num_chars_matched = 0 + # Find every operation in the CIGAR string that matches the regular + # expression. + for match in CIG_PATTERN.finditer(cigar_string): + length_str, operation = match.groups() + # Convert the length field from str to int and verify that it + # is a positive integer. + if (length_int := int(length_str)) < 1: + raise ValueError("length of CIGAR operation must be ≥ 1") + # Add the total number of characters in the current operation to + # the total number of characters matched from the CIGAR string. + num_chars_matched += len(length_str) + len(operation) + # Note that the fields are yielded as (operation, length), but + # in the CIGAR string itself, the order is (length, operation). + yield operation, length_int + # Confirm that all bytes in the CIGAR string were matched by the + # regular expression. + if num_chars_matched != len(cigar_string): + raise ValueError(f"Invalid CIGAR: '{cigar_string}'") + + +def op_consumes_ref(op: bytes) -> bool: + """ Return whether the CIGAR operation consumes the reference. """ + return op != CIG_INSRT and op != CIG_SCLIP + + +def op_consumes_read(op: bytes) -> bool: + """ Return whether the CIGAR operation consumes the read. """ + return op != CIG_DELET + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/cigarcount.py b/src/seismicrna/relate/cigarcount.py new file mode 100644 index 00000000..0c2f71d6 --- /dev/null +++ b/src/seismicrna/relate/cigarcount.py @@ -0,0 +1,73 @@ +from .cigar import (CIG_ALIGN, CIG_MATCH, CIG_SUBST, + CIG_DELET, CIG_INSRT, CIG_SCLIP, + parse_cigar) + + +class CigarOp(object): + """ Represent one operation in a CIGAR string. """ + + def __init__(self, op: str): + if op not in (CIG_ALIGN, CIG_MATCH, CIG_SUBST, + CIG_DELET, CIG_INSRT, CIG_SCLIP): + raise ValueError(f"Invalid CIGAR operation: '{op}'") + self._op = op + self._len = 1 + + @property + def op(self): + """ CIGAR operation as a character. """ + return self._op + + def lengthen(self): + """ Lengthen the operation by 1 base call. """ + self._len += 1 + + def __str__(self): + """ Text that goes into the CIGAR string. """ + return f"{self._len}{self._op}" + + +def count_cigar_muts(cigar_string: str): + """ Return the total number of mutations in a CIGAR string. 
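    Mutations comprise substitution (X), deletion (D), and insertion (I)
    operations, and the count is the sum of their lengths. For example,
    "9S23=1X13=1D9=2I56=3S" contains 1 + 1 + 2 = 4 mutations.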
""" + mutation_types = CIG_SUBST, CIG_DELET, CIG_INSRT + return sum(olen for op, olen in parse_cigar(cigar_string) + if op in mutation_types) + + +def find_cigar_op_pos(cigar_string: str, find_op: str): + """ Yield the position in the read of every base with a particular + type of operation specified by a CIGAR string. """ + consume_read = CIG_ALIGN, CIG_MATCH, CIG_SUBST, CIG_INSRT + if find_op in consume_read: + # Only these operations correspond to positions in the read. + pos = 1 + # Check each CIGAR operation, starting at position 1. + for op, olen in parse_cigar(cigar_string): + if op == find_op: + # If the operation matches the code, then yield the position + # of every base consumed by that operation. + yield from range(pos, pos := (pos + olen)) + elif op in consume_read: + # Advance the position by the length of the operation. + pos += olen + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/encode.py b/src/seismicrna/relate/encode.py new file mode 100644 index 00000000..6a53c4ec --- /dev/null +++ b/src/seismicrna/relate/encode.py @@ -0,0 +1,87 @@ +""" + +Relate Code Module + +======================================================================== + +Convert the relationships between reads and a reference from SAM format +(which encodes relationships implicitly as CIGAR strings) to vectorized +format (which encodes relationships explicitly as elements of arrays). + +------------------------------------------------------------------------ + +""" + +from ..core.rel import MATCH, SUB_A, SUB_C, SUB_G, SUB_T, ANY_N, IRREC +from ..core.seq import DNA, BASEA, BASEC, BASEG, BASET, BASEN + + +# Map bases to integer encodings and vice versa. +BASE_ENCODINGS = {BASEA: SUB_A, BASEC: SUB_C, BASEG: SUB_G, BASET: SUB_T, + BASEN: IRREC} +BASE_DECODINGS = {code: base for base, code in BASE_ENCODINGS.items() + if base != BASEN} + + +def encode_relate(ref_base: str, read_base: str, read_qual: str, min_qual: str): + """ + Encode the relation between a base in the read and a base in the + reference sequence. If the read quality is sufficient, then return + the match encoding if the read and reference bases match, otherwise + the encoding of the substitution for the base in the read. + If the read quality is insufficient, then return the fully ambiguous + base encoding, that is a match or substitution to any base except + the reference base, since a "substitution to the reference base" + would be a match, not a substitution. + + Parameters + ---------- + ref_base: DNA + Base in the reference sequence. + read_base: DNA + Base in the read sequence. + read_qual: str + ASCII encoding for the Phred quality score of the read base. 
+ min_qual: str + Minimum value of `read_qual` to not call the relation ambiguous. + """ + return ((MATCH if ref_base == read_base + else BASE_ENCODINGS[read_base]) + if (read_qual >= min_qual + and read_base != BASEN + and ref_base != BASEN) + else ANY_N ^ BASE_ENCODINGS[ref_base]) + + +def encode_match(read_base: str, read_qual: str, min_qual: str): + """ + A more efficient version of `encode_compare` given prior knowledge + from the CIGAR string that the read and reference match at this + position. Note that there is no analagous version when there is a + known substitution because substitutions are relatively infrequent, + so optimizing their processing would speed the program only slightly + while making the source code more complex and harder to maintain. + """ + return (MATCH if (read_qual >= min_qual and read_base != BASEN) + else ANY_N ^ BASE_ENCODINGS[read_base]) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/error.py b/src/seismicrna/relate/error.py new file mode 100644 index 00000000..2e6f9793 --- /dev/null +++ b/src/seismicrna/relate/error.py @@ -0,0 +1,31 @@ +class RelateError(Exception): + """ Any error that occurs during relating. """ + + +class RelateValueError(RelateError, ValueError): + """ Any ValueError that occurs during relating. """ + + +class RelateNotImplementedError(RelateError, NotImplementedError): + """ Any NotImplementedError that occurs during relating. """ + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
# +# # +######################################################################## diff --git a/src/seismicrna/relate/human.py b/src/seismicrna/relate/human.py new file mode 100644 index 00000000..44244f2a --- /dev/null +++ b/src/seismicrna/relate/human.py @@ -0,0 +1,61 @@ +from sys import byteorder + +import numpy as np + +from ..core.rel import (MATCH, DELET, INS_5, INS_3, + SUB_A, SUB_C, SUB_G, SUB_T, + NOCOV, IRREC) + +# Number of unique bytes +N_BYTES = 2 ** 8 +# Map each common byte in the vector encoding (e.g. MATCH) to a byte of +# human-readable text. +BYTE_SYMBOLS = {NOCOV: b'_', + MATCH: b'=', + DELET: b'.', + INS_5 | MATCH: b'{', + INS_5 | MATCH | INS_3: b'+', + INS_3 | MATCH: b'}', + SUB_A: b'A', + SUB_C: b'C', + SUB_G: b'G', + SUB_T: b'T', + SUB_A | SUB_C | SUB_G | MATCH: b'N', + SUB_T | SUB_A | SUB_C | MATCH: b'N', + SUB_G | SUB_T | SUB_A | MATCH: b'N', + SUB_C | SUB_G | SUB_T | MATCH: b'N', + IRREC: b'!'} +# Default for uncommon bytes in the vector encoding. +OTHER_SYMBOL = b'?' +# Create an array that maps each vector byte to its readable character. +map_array = bytearray(OTHER_SYMBOL) * N_BYTES +for byte, symbol in BYTE_SYMBOLS.items(): + map_array[byte] = int.from_bytes(symbol, byteorder) +# Create a translation table from vector to human-readable encodings. +map_table = bytes.maketrans(bytes(range(N_BYTES)), map_array) + + +def humanize_relvec(relvec: np.ndarray): + """ Translate a binary relation vector to human-readable text. """ + return relvec.tobytes().translate(map_table) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/invert.py b/src/seismicrna/relate/invert.py new file mode 100644 index 00000000..12082d4a --- /dev/null +++ b/src/seismicrna/relate/invert.py @@ -0,0 +1,219 @@ +from typing import Sequence + +import numpy as np + +from .cigar import CIG_ALIGN, CIG_MATCH, CIG_SUBST, CIG_DELET, CIG_INSRT +from .cigarcount import CigarOp +from .encode import BASE_DECODINGS +from ..core.qual import LO_QUAL, HI_QUAL +from ..core.rel import (MATCH, DELET, INS_5, INS_3, INS_8, + SUB_A, SUB_C, SUB_G, SUB_T, ANY_N, + IRREC, NOCOV, REL_TYPE) +from ..core.seq import BASEN, DNA + + +def find_relvec_ends(relvec: np.ndarray): + """ Find the 5' and 3' ends of a relation vector. 
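    The 5' and 3' ends are the first and last positions (1-indexed)
    whose relationship is not NOCOV. Every position between them must
    also be covered; otherwise, a ValueError is raised.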
""" + if not isinstance(relvec, np.ndarray): + raise TypeError(f"Expected {np.ndarray}, but got {type(relvec)}") + if relvec.dtype.type is not REL_TYPE: + raise TypeError(f"Expected an array of type {REL_TYPE}, but got " + f"type {relvec.dtype.type}") + if relvec.ndim != 1: + raise ValueError("Expected an array with 1 dimension, but got " + f"{relvec.ndim} dimensions") + called = np.flatnonzero(relvec != NOCOV) + if called.size == 0: + raise ValueError("Relation vector is blank") + end5 = int(called[0]) + 1 + end3 = int(called[-1]) + 1 + if (nexp := end3 - end5 + 1) != called.size: + raise ValueError(f"Expected {nexp} base calls " + f"({np.arange(end5, end3 + 1)}), but got " + f"{called.size} ({called})") + return end5, end3 + + +def inverse_relate(refseq: DNA, relvec: np.ndarray, + hi_qual: str = HI_QUAL, + lo_qual: str = LO_QUAL, + ins_len: int | Sequence[int] = 1): + """ + Infer a read sequence and quality string from a reference sequence + and relation vector. + + Parameters + ---------- + refseq: DNA + Sequence of the reference. + relvec: ndarray + Relation vector. + hi_qual: str = MAX_QUAL + Character to put in the quality string at every position that is + high-quality according to the relation vector. + lo_qual: str = MIN_QUAL + Character to put in the quality string at every position that is + low-quality according to the relation vector. + ins_len: int | Sequence[int] = 1 + Number of bases to insert into the read and quality strings upon + finding an insertion in the relation vector. If an integer, then + insert that number of bases for every insertion. If a sequence + of integers, then the ith insertion gets a number of bases equal + to the ith element of ins_len. + + Returns + ------- + + """ + # Validate the relation vector and reference sequence. + end5, end3 = find_relvec_ends(relvec) + if len(refseq) != len(relvec): + raise ValueError(f"Reference sequence has {len(refseq)} nt, " + f"but relation vector has {len(relvec)} nt") + if hi_qual < lo_qual: + raise ValueError(f"The high-quality score ({hi_qual}) is less than " + f"than the low-quality score ({lo_qual})") + # Validate the quality codes. + if len(hi_qual) != 1: + raise ValueError(f"hi_qual must be 1 character, but got {len(hi_qual)}") + if len(lo_qual) != 1: + raise ValueError(f"lo_qual must be 1 character, but got {len(hi_qual)}") + # Build the read sequence, quality scores, and CIGAR string one base + # at a time. + read: list[str] = list() + qual: list[str] = list() + cigars: list[CigarOp] = list() + ins3_next = False + ins_count = 0 + + def add_to_cigar(op: str): + """ Add one base of the relation vector to the CIGAR string. """ + if cigars and cigars[-1].op == op: + # The current operation matches that of the last CigarOp: + # just increment its length. + cigars[-1].lengthen() + else: + # Either no CigarOp objects exist or the current operation + # does not match that of the last one: start a new CigarOp. + cigars.append(CigarOp(op)) + + for pos in range(end5, end3 + 1): + ref_base = refseq[pos - 1] + rel = relvec[pos - 1] + if rel == NOCOV: + raise ValueError(f"Position {pos} in {end5}-{end3} is not covered") + if rel & INS_8: + # Specially handle insertions because they may overlap any + # other relation except for a deletion. + if rel & DELET: + # Being both a deletion and an insertion is forbidden. + raise ValueError(f"Relation {rel} in {relvec} is del and ins") + if rel & INS_3: + if not pos > end5: + # Insertions cannot occur before the beginning of + # the read. 
+ raise ValueError(f"Position {pos} in {end5}-{end3} cannot " + f"be 3' of an insertion") + if not ins3_next: + raise ValueError(f"Unexpected 3' ins at {pos} in {relvec}") + # The current position is 5' of an insertion, so the + # inserted base must be added before the base at the + # current position is added. Insert a number of bases + # given by ins_len. + if isinstance(ins_len, int): + # Add the same number of bases for every insertion. + n_ins = ins_len + else: + # Add a number of bases for this specific insertion. + n_ins = ins_len[ins_count] + read.append(BASEN * n_ins) + qual.append(hi_qual * n_ins) + for _ in range(n_ins): + add_to_cigar(CIG_INSRT) + ins_count += 1 + # Being 3' of an insertion is not allowed until the next + # position 5' of an insertion is reached. + ins3_next = False + elif ins3_next: + # Check if this position should be 3' of an insertion. + raise ValueError(f"Missing 3' ins at {pos} in {relvec}") + if rel & INS_5: + if not pos < end3: + raise ValueError(f"Position {pos} in {end5}-{end3} cannot " + f"be 5' of an insertion") + # The current position is 5' of an insertion, so the + # inserted base must be added after the base at the + # current position is added. Defer the responsibility of + # adding the inserted base to the base that lies 3' of + # the insertion and indicate that the next base must be + # 3' of an insertion by setting ins3_next to True. + ins3_next = True + # Switch off the insertion flag(s) in the relation so that + # the base at this position can be added below. + rel = rel & ~INS_8 + elif ins3_next: + # Check if this position should be 3' of an insertion. + raise ValueError(f"Missing 3' ins at {pos} in {relvec}") + if rel == MATCH: + # Match: Add the reference base to the read. + read.append(ref_base) + qual.append(hi_qual) + add_to_cigar(CIG_MATCH) + elif rel == DELET: + # Deletion from the read. + if not end5 < pos < end3: + raise ValueError( + f"Deletion cannot be at position {pos} in {end5}-{end3}") + add_to_cigar(CIG_DELET) + elif rel ^ ANY_N in (SUB_A, SUB_C, SUB_G, SUB_T, IRREC): + # Ambiguous substitution: Add any nucleotide as low quality. + read.append(BASEN) + qual.append(lo_qual) + add_to_cigar(CIG_ALIGN) + else: + # Unambiguous substitution: Add the substituted nucleotide + # as high quality. + try: + read_base = BASE_DECODINGS[rel] + except KeyError: + raise ValueError(f"Invalid relation {rel} in {relvec}") + if ref_base == read_base: + raise ValueError( + f"Cannot substitute {ref_base} to itself in {relvec}") + read.append(read_base) + qual.append(hi_qual) + add_to_cigar(CIG_SUBST) + # Check that the 5' end was found. + if end5 == 0: + raise ValueError(f"Relation vector had no 5' end: {relvec}") + if len(read) != len(qual): + raise ValueError( + f"Lengths of read ({len(read)}) and qual ({len(qual)}) differed") + if not read: + raise ValueError("Read contained no bases") + # Assemble and return the read, quality, and CIGAR strings. + read_seq = DNA("".join(read)) + qual_string = "".join(qual) + cigar_string = "".join(map(str, cigars)) + return read_seq, qual_string, cigar_string, end5, end3 + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. 
# +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/iterread.py b/src/seismicrna/relate/iterread.py new file mode 100644 index 00000000..61db6b30 --- /dev/null +++ b/src/seismicrna/relate/iterread.py @@ -0,0 +1,143 @@ +""" + +Read Iteration Module +======================================================================== + +------------------------------------------------------------------------ + +""" + +from collections import defaultdict +from itertools import product + +import numpy as np + +from .cigar import CIG_INSRT +from .cigarcount import count_cigar_muts, find_cigar_op_pos +from .invert import inverse_relate +from .iterrelv import iter_relvecs_all +from ..core.rel import INS_5, INS_3, REL_TYPE, NOCOV +from ..core.seq import DNA, expand_degenerate_seq + + +def ref_to_alignments(refseq: DNA, + max_ins: int = 2, + max_ins_len: int = 1, + max_ins_bases: int | None = None): + """ + For a given reference sequence, return maps from every possible read + to the CIGAR string(s) and (possibly ambiguous) relation vector for + the read. + + Parameters + ---------- + refseq: DNA + Sequence of the reference. + max_ins: int = 2 + Maximum number of insertions in the read. Must be ≥ 0. + max_ins_len: int = 1 + Maximum length of (i.e. number of bases in) one insertion. + Must be ≥ 1. + max_ins_bases: int | None = None + Maximum total number of bases inserted. Must be ≥ `max_ins`. + If `None`, there is no limit. + """ + # Initialize maps of reads to CIGAR strings and relation vectors. + quals = dict() + cigars = defaultdict(lambda: defaultdict(list)) + relvecs = defaultdict(lambda: defaultdict(lambda: np.zeros(len(refseq), + dtype=REL_TYPE))) + if max_ins < 0: + raise ValueError(f"max_ins must be ≥ 0, but got {max_ins}") + if max_ins > 0: + if max_ins_len < 1: + raise ValueError(f"max_ins_len must be ≥ 1, but got {max_ins_len}") + if max_ins_bases is not None and max_ins_bases < max_ins: + raise ValueError(f"max_ins_bases ({max_ins_bases}) " + f"must be ≥ max_ins ({max_ins})") + # Iterate through all possible relation vectors. + for relvec in iter_relvecs_all(refseq, max_ins): + # Check if there are insertions in the relation vector. + n_ins = int(np.count_nonzero(np.logical_and(relvec & INS_5, + relvec != NOCOV))) + n_ins3 = int(np.count_nonzero(np.logical_and(relvec & INS_3, + relvec != NOCOV))) + if n_ins != n_ins3: + raise ValueError(f"Got {n_ins} 5' and {n_ins3} 3' insertions") + if n_ins > max_ins: + # This should not happen. Checking just in case. + raise ValueError(f"Got {n_ins} insertions, but max_ins = {max_ins}") + # Iterate over all possible combinations of insertion lengths. + for ins_len in product(range(1, max_ins_len + 1), repeat=n_ins): + if max_ins_bases is not None and sum(ins_len) > max_ins_bases: + # Skip insertion lengths whose sum exceeds the limit. + continue + # Determine the read(s) corresponding to this relation vector. + degen, qual, cigar, end5, end3 = inverse_relate(refseq, relvec, + ins_len=ins_len) + if n_ins > 0: + # If there are insertions, find their positions. 
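                # Added illustrative note: find_cigar_op_pos yields the
                # 1-indexed read positions of the inserted bases; for
                # example, the CIGAR "2=1I2=" would yield position 3.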
+ ins_pos = list(find_cigar_op_pos(cigar, CIG_INSRT)) + # Remove quality codes of inserted bases because -- for + # the purpose of aggregating reads based on sequence, + # quality score, and position -- the "fake" quality + # scores of inserted bases should not be considered. + qual_no_ins = "".join(q for i, q in enumerate(qual, start=1) + if i not in ins_pos) + else: + qual_no_ins = qual + # Count the mutations in the CIGAR string. + nmuts = count_cigar_muts(cigar) + for read in expand_degenerate_seq(degen): + key = read, qual_no_ins, end5, end3 + # Record the original quality score. + quals[key] = qual + # Gather every CIGAR string for the read. + cigars[key][nmuts].append(cigar) + # Accumulate the bitwise OR of all relation vectors. + relvecs[key][nmuts] |= relvec + # For every read-quality-end5-end3 key, keep only the CIGAR strings + # and relation vector with the fewest mutations. + cigars_best: dict[tuple[DNA, str, int, int], list[str]] = dict() + relvec_best: dict[tuple[DNA, str, int, int], np.ndarray] = dict() + for key, cig_key in cigars.items(): + # Find the minimum number of mutations for this key. + min_nmuts = min(cig_key) + # Export the results with the fewest mutations. + cigars_best[key] = cigars[key][min_nmuts] + relvec_best[key] = relvecs[key][min_nmuts] + return quals, cigars_best, relvec_best + + +def iter_alignments(*args, **kwargs): + """ For a given reference sequence, find every read that could come + from the reference (with up to 2 bases inserted). For each read, + yield the (possibly ambiguous) relation vector and every possible + CIGAR string. """ + quals, cigars, relvecs = ref_to_alignments(*args, **kwargs) + for key, relvec in relvecs.items(): + read, _, end5, end3 = key + qual = quals[key] + for cigar in cigars[key]: + yield read, qual, cigar, end5, end3, relvec + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
# +# # +######################################################################## diff --git a/src/seismicrna/relate/iterrelv.py b/src/seismicrna/relate/iterrelv.py new file mode 100644 index 00000000..b5883b47 --- /dev/null +++ b/src/seismicrna/relate/iterrelv.py @@ -0,0 +1,156 @@ +""" + +Relate Auxiliary Module +======================================================================== + +------------------------------------------------------------------------ + +""" + +from itertools import (product, + combinations, + combinations_with_replacement as cwr) +from typing import Sequence + +import numpy as np + +from .encode import encode_relate +from ..core.rel import DELET, INS_5, INS_3, REL_TYPE, NOCOV +from ..core.sect import Section +from ..core.seq import BASEA, BASEC, BASEG, BASET, DNA + +MIN_INDEL_GAP = 1 +MIN_MAX_INS = 0 +MAX_MAX_INS = 2 + + +def iter_relvecs_q53(refseq: DNA, + low_qual: Sequence[int] = (), + end5: int | None = None, + end3: int | None = None, + max_ins: int = MAX_MAX_INS): + """ + For a given reference sequence, yield every possible unambiguous + relation vector between positions end5 and end3 that follows the + given low-quality positions and has at most two insertions. + + Parameters + ---------- + refseq: DNA + Sequence of the reference. + low_qual: Sequence[int] + List of positions in the read that are low-quality. + end5: int | None = None + 5' end of the read; 1-indexed with respect to `refseq`. + end3: int | None = None + 3' end of the read; 1-indexed with respect to `refseq`. + max_ins: int = 2 + Maximum number of insertions in the read. Must be in [0, 2]. + """ + low_qual = set(low_qual) + # Determine the section of the reference sequence that is occupied + # by the read. + sect = Section("", refseq, end5=end5, end3=end3) + if low_qual - set(sect.range_int): + raise ValueError(f"Invalid positions in low_qual: " + f"{sorted(low_qual - set(sect.range))}") + if max_ins not in range(MIN_MAX_INS, MAX_MAX_INS + 1): + raise ValueError(f"Invalid value for max_ins: {max_ins}") + # Find the possible relationships at each position in the section, + # not including insertions. + rel_opts: list[tuple[int, ...]] = [(NOCOV,)] * len(refseq) + for pos in sect.range_int: + # Find the base in the reference sequence (pos is 1-indexed). + ref_base = refseq[pos - 1] + if low_qual: + # The only option is low-quality. + opts = encode_relate(ref_base, ref_base, '0', '1'), + else: + # The options are a match and three substitutions. + opts = tuple(encode_relate(ref_base, read_base, '1', '1') + for read_base in (BASEA, BASEC, BASEG, BASET)) + # A deletion is an option at every position except the ends of + # the covered region. + if sect.end5 < pos < sect.end3: + opts = opts + (DELET,) + rel_opts[pos - 1] = opts + # Iterate through all possible relationships at each position. + margin = MIN_INDEL_GAP + 1 + for rel in product(*rel_opts): + # Generate a relation vector from the relationships. + relvec = np.array(rel, dtype=REL_TYPE) + yield relvec + if max_ins == 0: + # Do not introduce any insertions into the read. + continue + # Allow up to two insertions in one read, at every position + # except the ends of the covered region and near deletions. + for ins1_pos5, ins2_pos5 in cwr(range(sect.end5, sect.end3), 2): + if max_ins == 1 and ins1_pos5 != ins2_pos5: + # If at most one insertion can be in the read, then skip + # cases where ins1_pos5 and ins2_pos5 are not equal. + continue + # Skip insertions nearby any deletion. 
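            # Added note: margin equals MIN_INDEL_GAP + 1, so an
            # insertion site is skipped whenever a deletion lies within
            # that window of its 5' position, keeping the generated
            # insertions away from deletions.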
+ mrg15 = max(ins1_pos5 - margin, 0) + mrg13 = min(ins1_pos5 + margin, relvec.size) + if np.any(relvec[mrg15: mrg13] == DELET): + continue + mrg25 = max(ins2_pos5 - margin, 0) + mrg23 = min(ins2_pos5 + margin, relvec.size) + if np.any(relvec[mrg25: mrg23] == DELET): + continue + # Yield the relation vector with these insertions. + relvec_ins = relvec.copy() + if ins1_pos5 >= 1: + relvec_ins[ins1_pos5 - 1] |= INS_5 + if ins1_pos5 < relvec.size: + relvec_ins[ins1_pos5] |= INS_3 + if ins2_pos5 >= 1: + relvec_ins[ins2_pos5 - 1] |= INS_5 + if ins2_pos5 < relvec.size: + relvec_ins[ins2_pos5] |= INS_3 + yield relvec_ins + + +def iter_relvecs_all(refseq: DNA, max_ins: int = 2): + """ + For a given reference sequence, yield every possible unambiguous + relation vector that has at most two insertions. + + Parameters + ---------- + refseq: DNA + Sequence of the reference. + max_ins: int = 2 + Maximum number of insertions in a read. + """ + # Use every possible pair of 5' and 3' end positions. + for end5, end3 in cwr(range(1, len(refseq) + 1), 2): + # Use every possible number of low-quality base calls. + for n_low_qual in range((end3 - end5 + 1) + 1): + # Use every possible set of low-quality positions. + for low_qual in combinations(range(end5, end3 + 1), n_low_qual): + # Yield every possible relation vector. + yield from iter_relvecs_q53(refseq, low_qual, end5, end3, + max_ins) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/ambrel_test.py b/src/seismicrna/relate/tests/ambrel_test.py new file mode 100644 index 00000000..e69de29b diff --git a/src/seismicrna/relate/tests/blank_test.py b/src/seismicrna/relate/tests/blank_test.py new file mode 100644 index 00000000..7fe34b5d --- /dev/null +++ b/src/seismicrna/relate/tests/blank_test.py @@ -0,0 +1,108 @@ +import unittest as ut + +import numpy as np +import pandas as pd + +from ..blank import blank_relvec +from ...core.rel import NOCOV +from ...core.sect import seq_pos_to_index +from ...core.seq import DNA + + +class TestBlankRelvec(ut.TestCase): + """ Test function `blank_relvec`. """ + + def compare_numpy(self, result, expect: np.ndarray): + self.assertIsInstance(result, np.ndarray) + self.assertIs(type(result), type(expect)) + self.assertIs(result.dtype, expect.dtype) + self.assertTrue(np.array_equal(result, expect)) + + def compare_pandas(self, result, expect: pd.Series | pd.DataFrame): + self.assertIsInstance(result, (pd.Series, pd.DataFrame)) + self.assertIs(type(result), type(expect)) + self.assertTrue(expect.equals(result)) + + def test_numpy_1d(self): + """ Test returning a 1D NumPy array. 
""" + for length in range(10): + self.compare_numpy(blank_relvec(length), + np.full(shape=(length,), + fill_value=NOCOV, + dtype=np.uint8)) + + def test_numpy_2d_int(self): + """ Test returning a 2D NumPy array with integer reads. """ + for length in range(10): + for n_reads in range(10): + self.compare_numpy(blank_relvec(length, n_reads), + np.full(shape=(n_reads, length), + fill_value=NOCOV, + dtype=np.uint8)) + + def test_numpy_2d_list(self): + """ Test returning a 2D NumPy array with a list of reads. """ + for length in range(10): + for n_reads in range(10): + for list_func in [list, np.array, pd.Index]: + reads = list_func(list(map(str, range(n_reads)))) + self.compare_numpy(blank_relvec(length, reads), + np.full(shape=(n_reads, length), + fill_value=NOCOV, + dtype=np.uint8)) + + def test_pandas_series(self): + """ Test returning a Pandas Series. """ + for length in range(1, 10): + seq = DNA.random(length) + index = seq_pos_to_index(seq, list(range(1, length + 1)), 1) + self.compare_pandas(blank_relvec(seq), + pd.Series(NOCOV, index=index, dtype=np.uint8)) + + def test_pandas_dataframe_int(self): + """ Test returning a Pandas DataFrame with integer reads. """ + for length in range(1, 10): + seq = DNA.random(length) + index = seq_pos_to_index(seq, list(range(1, length + 1)), 1) + for n_reads in range(10): + reads = [f"Read_{i + 1}" for i in range(n_reads)] + self.compare_pandas(blank_relvec(seq, n_reads), + pd.DataFrame(NOCOV, + index=reads, + columns=index, + dtype=np.uint8)) + + def test_pandas_dataframe_list(self): + """ Test returning a Pandas DataFrame with integer reads. """ + for length in range(1, 10): + seq = DNA.random(length) + index = seq_pos_to_index(seq, list(range(1, length + 1)), 1) + for n_reads in range(10): + for list_func in [list, np.array, pd.Index]: + reads = list_func(list(map(str, range(n_reads)))) + self.compare_pandas(blank_relvec(seq, reads), + pd.DataFrame(NOCOV, + index=reads, + columns=index, + dtype=np.uint8)) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/cigar2_test.py b/src/seismicrna/relate/tests/cigar2_test.py new file mode 100644 index 00000000..bf125dc5 --- /dev/null +++ b/src/seismicrna/relate/tests/cigar2_test.py @@ -0,0 +1,165 @@ +import string +import unittest as ut + +from ..cigar import (CIG_ALIGN, CIG_MATCH, CIG_SUBST, + CIG_DELET, CIG_INSRT, CIG_SCLIP) +from ..cigarcount import CigarOp, count_cigar_muts, find_cigar_op_pos + + +class TestCigarOp(ut.TestCase): + """ Test class `CigarOp`. """ + + ops = CIG_ALIGN, CIG_MATCH, CIG_SUBST, CIG_DELET, CIG_INSRT, CIG_SCLIP + + def test_cigar_init_valid(self): + """ Test that CigarOp instances can be created. 
""" + for op in self.ops: + cigar_op = CigarOp(op) + self.assertIsInstance(cigar_op, CigarOp) + self.assertEqual(cigar_op.op, op) + self.assertEqual(cigar_op._len, 1) + + def test_cigar_init_invalid(self): + """ Test that CigarOp instances cannot be created with invalid + operations. """ + for op in string.printable: + if op not in self.ops: + self.assertRaisesRegex(ValueError, f"Invalid CIGAR operation: ", + CigarOp, op) + + def test_cigar_lengthen(self): + """ Test lengthening CIGAR operations. """ + for op in self.ops: + cigar_op = CigarOp(op) + for length in range(1, 10): + self.assertEqual(cigar_op._len, length) + cigar_op.lengthen() + + def test_cigar_str(self): + """ Test string representations of CIGAR operations. """ + for op in self.ops: + cigar_op = CigarOp(op) + for length in range(1, 10): + self.assertEqual(str(cigar_op), f"{length}{op}") + cigar_op.lengthen() + + +class TestCountCigarMuts(ut.TestCase): + """ Test function `count_cigar_muts`. """ + + def test_cigar_match_subst_valid(self): + """ Count mutations in a valid CIGAR string. """ + self.assertEqual(count_cigar_muts("9S23=1X13=1D9=2I56=3S"), 4) + + +class TestFindCigarOpPos(ut.TestCase): + """ Test function `find_cigar_op_pos`. """ + + def test_cigar_xeq_aln_valid(self): + """ Find aligned positions in a CIGAR string with =/X codes. """ + self.assertEqual(list(find_cigar_op_pos("9S23=1X3I13=1D9=2I31=1I25=", + CIG_ALIGN)), + []) + + def test_cigar_m_aln_valid(self): + """ Find aligned positions in a CIGAR string with M codes. """ + self.assertEqual(list(find_cigar_op_pos("9S24M3I13M1D9M2I31M1I25M", + CIG_ALIGN)), + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, + 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108]) + + def test_cigar_xeq_mat_valid(self): + """ Find matches in a CIGAR string with =/X codes. """ + self.assertEqual(list(find_cigar_op_pos("9S23=1X3I13=1D9=2I31=1I25=", + CIG_MATCH)), + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, + 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108]) + + def test_cigar_m_mat_valid(self): + """ Find matches in a CIGAR string with M codes. """ + self.assertEqual(list(find_cigar_op_pos("9S24M3I13M1D9M2I31M1I25M", + CIG_MATCH)), + []) + + def test_cigar_xeq_sub_valid(self): + """ Find substitutions in a CIGAR string with =/X codes. """ + self.assertEqual(list(find_cigar_op_pos("9S23=1X3I13=1D9=2I31=1I25=", + CIG_SUBST)), + [24]) + + def test_cigar_m_sub_valid(self): + """ Find substitutions in a CIGAR string with M codes. """ + self.assertEqual(list(find_cigar_op_pos("9S24M3I13M1D9M2I31M1I25M", + CIG_SUBST)), + []) + + def test_cigar_xeq_del_valid(self): + """ Find deletions in a CIGAR string with =/X codes. """ + self.assertEqual(list(find_cigar_op_pos("9S23=1X3I13=1D9=2I31=1I25=", + CIG_DELET)), + []) + + def test_cigar_m_del_valid(self): + """ Find deletions in a CIGAR string with M codes. 
""" + self.assertEqual(list(find_cigar_op_pos("9S24M3I13M1D9M2I31M1I25M", + CIG_DELET)), + []) + + def test_cigar_xeq_ins_valid(self): + """ Find insertions in a CIGAR string with =/X codes. """ + self.assertEqual(list(find_cigar_op_pos("9S23=1X3I13=1D9=2I31=1I25=", + CIG_INSRT)), + [25, 26, 27, 50, 51, 83]) + + def test_cigar_m_ins_valid(self): + """ Find insertions in a CIGAR string with M codes. """ + self.assertEqual(list(find_cigar_op_pos("9S24M3I13M1D9M2I31M1I25M", + CIG_INSRT)), + [25, 26, 27, 50, 51, 83]) + + def test_cigar_xeq_scl_valid(self): + """ Find soft clippings in a CIGAR string with =/X codes. """ + self.assertEqual(list(find_cigar_op_pos("9S23=1X3I13=1D9=2I31=1I25=", + CIG_SCLIP)), + []) + + def test_cigar_m_scl_valid(self): + """ Find soft clippings in a CIGAR string with M codes. """ + self.assertEqual(list(find_cigar_op_pos("9S24M3I13M1D9M2I31M1I25M", + CIG_SCLIP)), + []) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/cigar_test.py b/src/seismicrna/relate/tests/cigar_test.py new file mode 100644 index 00000000..a1f3a43b --- /dev/null +++ b/src/seismicrna/relate/tests/cigar_test.py @@ -0,0 +1,46 @@ +import unittest as ut + +from ..cigar import (CIG_ALIGN, CIG_MATCH, CIG_SUBST, + CIG_DELET, CIG_INSRT, CIG_SCLIP, + parse_cigar) + + +class TestParseCigar(ut.TestCase): + """ Test function `parse_cigar`. """ + + def test_cigar_match_subst_valid(self): + """ Parse a valid CIGAR string with match and subst codes. """ + cigar = "9S23=1X13=1D9=2I56=3S" + expect = [(CIG_SCLIP, 9), (CIG_MATCH, 23), (CIG_SUBST, 1), + (CIG_MATCH, 13), (CIG_DELET, 1), (CIG_MATCH, 9), + (CIG_INSRT, 2), (CIG_MATCH, 56), (CIG_SCLIP, 3)] + self.assertEqual(list(parse_cigar(cigar)), expect) + + def test_cigar_align_valid(self): + """ Parse a valid CIGAR string with align codes. """ + cigar = "9S37M1D9M2I56M3S" + expect = [(CIG_SCLIP, 9), (CIG_ALIGN, 37), (CIG_DELET, 1), + (CIG_ALIGN, 9), (CIG_INSRT, 2), (CIG_ALIGN, 56), + (CIG_SCLIP, 3)] + self.assertEqual(list(parse_cigar(cigar)), expect) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/encode_test.py b/src/seismicrna/relate/tests/encode_test.py new file mode 100644 index 00000000..03c424a9 --- /dev/null +++ b/src/seismicrna/relate/tests/encode_test.py @@ -0,0 +1,71 @@ +import unittest as ut + +from ..encode import encode_match, encode_relate +from ...core.qual import LO_QUAL, HI_QUAL +from ...core.rel import (MATCH, + SUB_A, SUB_C, SUB_G, SUB_T, SUB_N, + ANY_B, ANY_D, ANY_H, ANY_V, ANY_N) + + +class TestEncodeRelate(ut.TestCase): + """ Test function `encode_relate`. """ + + def test_encode_relate_hi_qual(self): + """ Test when the quality is at least the minimum. """ + for ref, ref_sub in zip("ACGTN", + [ANY_B, ANY_D, ANY_H, ANY_V, ANY_N], + strict=True): + for read, read_sub in zip("ACGTN", + [SUB_A, SUB_C, SUB_G, SUB_T, SUB_N], + strict=True): + code = encode_relate(ref, read, HI_QUAL, HI_QUAL) + if ref == 'N': + self.assertEqual(code, ANY_N) + elif read == 'N': + self.assertEqual(code, ref_sub) + else: + self.assertEqual(code, MATCH if read == ref else read_sub) + + def test_encode_relate_lo_qual(self): + """ Test when the quality is less than the minimum. """ + for ref, sub in zip("ACGTN", [ANY_B, ANY_D, ANY_H, ANY_V, ANY_N], + strict=True): + for read in "ACGTN": + self.assertEqual(encode_relate(ref, read, LO_QUAL, HI_QUAL), + sub) + + +class TestEncodeMatch(ut.TestCase): + """ Test function `encode_match`. """ + + def test_encode_match_hi_qual(self): + """ Test when the quality is at least the minimum. """ + for read in "ACGT": + self.assertEqual(encode_match(read, HI_QUAL, HI_QUAL), MATCH) + self.assertEqual(encode_match('N', HI_QUAL, HI_QUAL), ANY_N) + + def test_encode_match_lo_qual(self): + """ Test when the quality is less than the minimum. """ + for read, sub in zip("ACGTN", [ANY_B, ANY_D, ANY_H, ANY_V, ANY_N]): + self.assertEqual(encode_match(read, LO_QUAL, HI_QUAL), sub) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
# +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/invert_test.py b/src/seismicrna/relate/tests/invert_test.py new file mode 100644 index 00000000..9d66f2d6 --- /dev/null +++ b/src/seismicrna/relate/tests/invert_test.py @@ -0,0 +1,410 @@ +import unittest as ut + +import numpy as np + +from ..invert import find_relvec_ends, inverse_relate +from ...core.qual import HI_QUAL, LO_QUAL +from ...core.rel import (MATCH, DELET, INS_5, INS_3, INS_8, + NOCOV, ANY_N, MINS5, MINS3, ANY_8, + SUB_A, SUB_C, SUB_G, SUB_T, REL_TYPE) +from ...core.seq import DNA + + +class TestFindRelVecEnds(ut.TestCase): + """ Test function `find_relvec_ends`. """ + + def test_matches(self): + """ Test that a relation vector of all matches is valid. """ + self.assertIsNotNone(find_relvec_ends(np.ones(8, REL_TYPE))) + + def test_nocov_margin(self): + """ Test that a relation vector with non-covered positions on + its margins is valid. """ + relvec = np.full(8, NOCOV, REL_TYPE) + relvec[4] = 1 + self.assertIsNotNone(find_relvec_ends(relvec)) + + def test_nocov_middle(self): + """ Test that a relation vector with non-covered positions in + the middle is invalid. """ + relvec = np.full(8, MATCH, REL_TYPE) + relvec[4] = NOCOV + self.assertRaises(ValueError, find_relvec_ends, relvec) + + def test_blank(self): + """ Test that an entirely blank relation vector is invalid. """ + self.assertRaises(ValueError, find_relvec_ends, + np.full(8, NOCOV, REL_TYPE)) + + def test_not_array(self): + """ Test that a non-array relation vector is invalid. """ + self.assertRaises(TypeError, find_relvec_ends, + np.ones(8, REL_TYPE).tolist()) + + def test_not_uint8(self): + """ Test that a non-uint8 relation vector is invalid. """ + self.assertRaises(TypeError, find_relvec_ends, np.ones(8, np.int8)) + + +class TestInverseRelate(ut.TestCase): + """ Test function `inverse_relate`. """ + + def assert_equal(self, ref: DNA, + relvecs: list[list[int]], + expects: list[tuple[str, str, str, int, int]]): + """ Assert that the actual and expected outputs match. """ + for relvec, expect in zip(relvecs, expects, strict=True): + with self.subTest(relvec=relvec, expect=expect): + self.assertEqual(inverse_relate(ref, np.array(relvec, + dtype=np.uint8), + HI_QUAL, LO_QUAL), + (DNA(expect[0]),) + expect[1:]) + + def assert_raise(self, ref: DNA, + relvecs: list[list[int]], + error: type[Exception], + regex: str): + """ Assert that the relation vectors raise an exception. """ + for relvec in relvecs: + with self.subTest(relvec=relvec): + self.assertRaisesRegex(error, regex, inverse_relate, + ref, np.array(relvec, + dtype=np.uint8), + HI_QUAL, LO_QUAL) + + def test_all_match(self): + """ Test when the read has four matching bases. """ + ref = DNA("ACGT") + relvecs = [[MATCH, MATCH, MATCH, MATCH]] + expects = [("ACGT", "IIII", "4=", 1, 4)] + self.assert_equal(ref, relvecs, expects) + + def test_all_match_n(self): + """ Test when the read has four matching bases and an ambiguous + base. """ + ref = DNA("ACNGT") + relvecs = [[MATCH, MATCH, ANY_N, MATCH, MATCH]] + expects = [("ACNGT", "II!II", "2=1M2=", 1, 5)] + self.assert_equal(ref, relvecs, expects) + + def test_nocov_valid(self): + """ Test when the read does not cover one or both ends of the + reference. 
""" + ref = DNA("ACGT") + relvecs = [ + [NOCOV, MATCH, MATCH, MATCH], + [MATCH, MATCH, MATCH, NOCOV], + [NOCOV, MATCH, MATCH, NOCOV], + [NOCOV, NOCOV, MATCH, MATCH], + [MATCH, MATCH, NOCOV, NOCOV], + ] + expects = [ + ("CGT", "III", "3=", 2, 4), + ("ACG", "III", "3=", 1, 3), + ("CG", "II", "2=", 2, 3), + ("GT", "II", "2=", 3, 4), + ("AC", "II", "2=", 1, 2), + ] + self.assert_equal(ref, relvecs, expects) + + def test_nocov_middle_invalid(self): + """ Test when the read does not cover a middle position. """ + ref = DNA("ACGT") + relvecs = [ + [MATCH, NOCOV, MATCH, MATCH], + [MATCH, MATCH, NOCOV, MATCH], + [MATCH, NOCOV, NOCOV, MATCH], + ] + self.assert_raise(ref, relvecs, ValueError, + "Expected [0-9]+ base calls") + + def test_nocov_all_invalid(self): + """ Test when the read does not cover any positions. """ + ref = DNA("ACGT") + relvecs = [[NOCOV, NOCOV, NOCOV, NOCOV]] + self.assert_raise(ref, relvecs, ValueError, + "Relation vector is blank") + + def test_low_qual_valid(self): + """ Test when the read has a low-quality base. """ + ref = DNA("ACGT") + relvecs = [ + [ANY_N - SUB_A, MATCH, MATCH, MATCH], + [MATCH, ANY_N - SUB_C, MATCH, MATCH], + [MATCH, MATCH, ANY_N - SUB_G, MATCH], + [MATCH, MATCH, MATCH, ANY_N - SUB_T], + ] + expects = [ + ("NCGT", "!III", "1M3=", 1, 4), + ("ANGT", "I!II", "1=1M2=", 1, 4), + ("ACNT", "II!I", "2=1M1=", 1, 4), + ("ACGN", "III!", "3=1M", 1, 4), + ] + self.assert_equal(ref, relvecs, expects) + + def test_subst_valid(self): + """ Test when the read has a substitution. """ + ref = DNA("ACGT") + relvecs = [ + [SUB_C, MATCH, MATCH, MATCH], + [SUB_G, MATCH, MATCH, MATCH], + [SUB_T, MATCH, MATCH, MATCH], + [MATCH, SUB_A, MATCH, MATCH], + [MATCH, SUB_G, MATCH, MATCH], + [MATCH, SUB_T, MATCH, MATCH], + [MATCH, MATCH, SUB_A, MATCH], + [MATCH, MATCH, SUB_C, MATCH], + [MATCH, MATCH, SUB_T, MATCH], + [MATCH, MATCH, MATCH, SUB_A], + [MATCH, MATCH, MATCH, SUB_C], + [MATCH, MATCH, MATCH, SUB_G], + ] + expects = [ + ("CCGT", "IIII", "1X3=", 1, 4), + ("GCGT", "IIII", "1X3=", 1, 4), + ("TCGT", "IIII", "1X3=", 1, 4), + ("AAGT", "IIII", "1=1X2=", 1, 4), + ("AGGT", "IIII", "1=1X2=", 1, 4), + ("ATGT", "IIII", "1=1X2=", 1, 4), + ("ACAT", "IIII", "2=1X1=", 1, 4), + ("ACCT", "IIII", "2=1X1=", 1, 4), + ("ACTT", "IIII", "2=1X1=", 1, 4), + ("ACGA", "IIII", "3=1X", 1, 4), + ("ACGC", "IIII", "3=1X", 1, 4), + ("ACGG", "IIII", "3=1X", 1, 4), + ] + self.assert_equal(ref, relvecs, expects) + + def test_subst_invalid(self): + """ Test when the read has an invalid substitution. """ + ref = DNA("ACGT") + relvecs = [ + [SUB_A, MATCH, MATCH, MATCH], + [MATCH, SUB_C, MATCH, MATCH], + [MATCH, MATCH, SUB_G, MATCH], + [MATCH, MATCH, MATCH, SUB_T], + ] + self.assert_raise(ref, relvecs, ValueError, + "Cannot substitute [ACGT] to itself") + + def test_delete_valid(self): + """ Test when the read has deletions. """ + ref = DNA("ACGT") + relvecs = [ + # 1 deletion + [MATCH, DELET, MATCH, MATCH], + [MATCH, MATCH, DELET, MATCH], + # 2 deletions + [MATCH, DELET, DELET, MATCH], + ] + expects = [ + # 1 deletion + ("AGT", "III", "1=1D2=", 1, 4), + ("ACT", "III", "2=1D1=", 1, 4), + # 2 deletions + ("AT", "II", "1=2D1=", 1, 4), + ] + self.assert_equal(ref, relvecs, expects) + + def test_delete_invalid(self): + """ Test when the read has a deletion at either end. 
""" + ref = DNA("ACGT") + relvecs = [ + [DELET, MATCH, MATCH, MATCH], + [NOCOV, DELET, MATCH, MATCH], + [NOCOV, NOCOV, DELET, MATCH], + [NOCOV, NOCOV, NOCOV, DELET], + [DELET, DELET, DELET, DELET], + [DELET, MATCH, MATCH, DELET], + [MATCH, MATCH, MATCH, DELET], + [MATCH, MATCH, DELET, NOCOV], + [MATCH, DELET, NOCOV, NOCOV], + [DELET, NOCOV, NOCOV, NOCOV], + ] + self.assert_raise(ref, relvecs, ValueError, + "Deletion cannot be at position [0-9]+ in .+") + + def test_insert_valid(self): + """ Test when the read has insertions. """ + ref = DNA("ACGT") + relvecs = [ + # 1 insertion + [MINS5, MINS3, MATCH, MATCH], + [MATCH, MINS5, MINS3, MATCH], + [MATCH, MATCH, MINS5, MINS3], + [MINS5, MINS3, MATCH, NOCOV], + [MATCH, MINS5, MINS3, NOCOV], + [NOCOV, MINS5, MINS3, MATCH], + [NOCOV, MATCH, MINS5, MINS3], + [NOCOV, MINS5, MINS3, NOCOV], + # 2 insertions, 1 base apart + [MINS5, ANY_8, MINS3, MATCH], + [MATCH, MINS5, ANY_8, MINS3], + # 2 insertions, 2 bases apart + [MINS5, MINS3, MINS5, MINS3], + # 3 insertions, 1 base apart + [MINS5, ANY_8, ANY_8, MINS3], + ] + expects = [ + # 1 insertion + ("ANCGT", "IIIII", "1=1I3=", 1, 4), + ("ACNGT", "IIIII", "2=1I2=", 1, 4), + ("ACGNT", "IIIII", "3=1I1=", 1, 4), + ("ANCG", "IIII", "1=1I2=", 1, 3), + ("ACNG", "IIII", "2=1I1=", 1, 3), + ("CNGT", "IIII", "1=1I2=", 2, 4), + ("CGNT", "IIII", "2=1I1=", 2, 4), + ("CNG", "III", "1=1I1=", 2, 3), + # 2 insertions, 1 base apart + ("ANCNGT", "IIIIII", "1=1I1=1I2=", 1, 4), + ("ACNGNT", "IIIIII", "2=1I1=1I1=", 1, 4), + # 2 insertions, 2 bases apart + ("ANCGNT", "IIIIII", "1=1I2=1I1=", 1, 4), + # 3 insertions, 1 base apart + ("ANCNGNT", "IIIIIII", "1=1I1=1I1=1I1=", 1, 4), + ] + self.assert_equal(ref, relvecs, expects) + + def test_insert_end5_invalid(self): + """ Test when the read has an insertion at the 5' end. """ + ref = DNA("ACGT") + relvecs = [ + [MATCH, MATCH, MATCH, MINS5], + [MATCH, MATCH, MINS5, ANY_8], + [MATCH, MINS5, ANY_8, ANY_8], + [MINS5, ANY_8, ANY_8, ANY_8], + [NOCOV, MATCH, MINS5, NOCOV], + [NOCOV, MINS5, ANY_8, NOCOV], + ] + self.assert_raise(ref, relvecs, ValueError, + "Position [0-9]+ in .+ cannot be 5' of an insertion") + + def test_insert_end3_invalid(self): + """ Test when the read has an insertion at the 3' end. """ + ref = DNA("ACGT") + relvecs = [ + [MINS3, MATCH, MATCH, MATCH], + [ANY_8, MINS3, MATCH, MATCH], + [ANY_8, ANY_8, MINS3, MATCH], + [ANY_8, ANY_8, ANY_8, MINS3], + [NOCOV, MINS3, MATCH, NOCOV], + [NOCOV, ANY_8, MINS3, NOCOV], + ] + self.assert_raise(ref, relvecs, ValueError, + "Position [0-9]+ in .+ cannot be 3' of an insertion") + + def test_insert_dangling_5_invalid(self): + """ Test when the read has an unmatched 5' insertion. """ + ref = DNA("ACGT") + relvecs = [ + [MINS5, MATCH, MATCH, MATCH], + [MATCH, MINS5, MATCH, MATCH], + [MATCH, MATCH, MINS5, MATCH], + [MINS5, MATCH, MINS3, MATCH], + [MATCH, MINS5, MATCH, MINS3], + [NOCOV, MINS5, MATCH, NOCOV], + ] + self.assert_raise(ref, relvecs, ValueError, + "Missing 3' ins at [0-9]+ in .+") + + def test_insert_dangling_3_invalid(self): + """ Test when the read has an unmatched 3' insertion. """ + ref = DNA("ACGT") + relvecs = [ + [MATCH, MINS3, MATCH, MATCH], + [MATCH, MATCH, MINS3, MATCH], + [MATCH, MATCH, MATCH, MINS3], + [NOCOV, MATCH, MINS3, NOCOV], + ] + self.assert_raise(ref, relvecs, ValueError, + "Unexpected 3' ins at [0-9+] in .+") + + def test_insert_bare_invalid(self): + """ Test when the read has bare insertions (with no underlying + relationship). 
""" + ref = DNA("ACGT") + relvecs = [ + # 1 bare insertion + [MINS5, INS_3, MATCH, MATCH], + [MATCH, MINS5, INS_3, MATCH], + [MATCH, MATCH, MINS5, INS_3], + [INS_5, MINS3, MATCH, MATCH], + [MATCH, INS_5, MINS3, MATCH], + [MATCH, MATCH, INS_5, MINS3], + [NOCOV, MINS5, INS_3, NOCOV], + [NOCOV, INS_5, MINS3, NOCOV], + # 2 bare insertions + [MINS5, INS_8, MINS3, MATCH], + [MATCH, MINS5, INS_8, MINS3], + ] + self.assert_raise(ref, relvecs, ValueError, + "Invalid relation 0") + + def test_insert_deletion_invalid(self): + """ Test when the read has an insertion next to a deletion. """ + ref = DNA("ACGT") + relvecs = [ + [MINS5, INS_3 + DELET, MATCH, MATCH], + [MATCH, MINS5, INS_3 + DELET, MATCH], + [MATCH, MATCH, MINS5, INS_3 + DELET], + [DELET + INS_5, MINS3, MATCH, MATCH], + [MATCH, DELET + INS_5, MINS3, MATCH], + [MATCH, MATCH, DELET + INS_5, MINS3], + ] + self.assert_raise(ref, relvecs, ValueError, + "Relation .+ is del and ins") + + def test_insert_non_match_valid(self): + """ Test when the read has insertions next to substitutions or + low-quality base calls. """ + ref = DNA("ACGT") + relvecs = [ + # 1 insertion next to 1 substitution + [SUB_C + INS_5, MINS3, MATCH, MATCH], + [SUB_C, MINS5, MINS3, MATCH], + [MINS5, INS_3 + SUB_T, MATCH, MATCH], + [MATCH, SUB_T + INS_5, MINS3, MATCH], + [MATCH, SUB_T, MINS5, MINS3], + # 1 insertion next to 1 low-quality base call + [ANY_N - SUB_A + INS_5, MINS3, MATCH, MATCH], + [ANY_N - SUB_A, MINS5, MINS3, MATCH], + [MINS5, INS_3 + ANY_N - SUB_C, MATCH, MATCH], + [MATCH, ANY_N - SUB_C + INS_5, MINS3, MATCH], + [MATCH, ANY_N - SUB_C, MINS5, MINS3], + ] + expects = [ + # 1 insertion next to 1 substitution + ("CNCGT", "IIIII", "1X1I3=", 1, 4), + ("CCNGT", "IIIII", "1X1=1I2=", 1, 4), + ("ANTGT", "IIIII", "1=1I1X2=", 1, 4), + ("ATNGT", "IIIII", "1=1X1I2=", 1, 4), + ("ATGNT", "IIIII", "1=1X1=1I1=", 1, 4), + # 1 insertion next to 1 low-quality base call + ("NNCGT", "!IIII", "1M1I3=", 1, 4), + ("NCNGT", "!IIII", "1M1=1I2=", 1, 4), + ("ANNGT", "II!II", "1=1I1M2=", 1, 4), + ("ANNGT", "I!III", "1=1M1I2=", 1, 4), + ("ANGNT", "I!III", "1=1M1=1I1=", 1, 4), + ] + self.assert_equal(ref, relvecs, expects) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
# +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/iterrelv_test.py b/src/seismicrna/relate/tests/iterrelv_test.py new file mode 100644 index 00000000..2c7896f7 --- /dev/null +++ b/src/seismicrna/relate/tests/iterrelv_test.py @@ -0,0 +1,507 @@ +import unittest as ut +from itertools import chain +from typing import Generator, Sequence + +import numpy as np + +from ..iterrelv import iter_relvecs_q53, iter_relvecs_all +from ...core.rel import (MATCH, DELET, INS_5, INS_3, + SUB_A, SUB_C, SUB_G, SUB_T, + ANY_B, ANY_D, ANY_H, ANY_V, + ANY_N, NOCOV) +from ...core.seq import DNA, expand_degenerate_seq + + +class TestIterRelvecsQ53(ut.TestCase): + """ Test function `iter_relvecs_q53`. """ + + @staticmethod + def list_rels(seq: str, low_qual: Sequence[int] = (), + end5: int | None = None, end3: int | None = None): + """ Convenience function to run `rel.iter_relvecs_q53` from a + sequence of str and return a list of lists of ints. """ + return list(map(np.ndarray.tolist, + iter_relvecs_q53(DNA(seq), low_qual, end5, end3))) + + def test_type(self): + """ Test that the result is a Generator of NumPy arrays. """ + self.assertTrue(isinstance(iter_relvecs_q53(DNA('A')), Generator)) + self.assertTrue(all(isinstance(relvec, np.ndarray) + for relvec in iter_relvecs_q53(DNA('A')))) + self.assertIs(list(iter_relvecs_q53(DNA('A')))[0].dtype.type, np.uint8) + + def test_a(self): + """ Test with sequence 'A'. """ + self.assertEqual(self.list_rels('A'), + [[MATCH], [SUB_C], [SUB_G], [SUB_T]]) + + def test_c(self): + """ Test with sequence 'C'. """ + self.assertEqual(self.list_rels('C'), + [[SUB_A], [MATCH], [SUB_G], [SUB_T]]) + + def test_g(self): + """ Test with sequence 'G'. """ + self.assertEqual(self.list_rels('G'), + [[SUB_A], [SUB_C], [MATCH], [SUB_T]]) + + def test_t(self): + """ Test with sequence 'T'. """ + self.assertEqual(self.list_rels('T'), + [[SUB_A], [SUB_C], [SUB_G], [MATCH]]) + + def test_n(self): + """ Test with sequence 'N'. """ + self.assertEqual(self.list_rels('N'), + [[ANY_N], [ANY_N], [ANY_N], [ANY_N]]) + + def test_aa(self): + """ Test with sequence 'AA'. """ + self.assertEqual(self.list_rels("AA"), + [[MATCH, MATCH], + [MATCH + INS_5, INS_3 + MATCH], + [MATCH, SUB_C], + [MATCH + INS_5, INS_3 + SUB_C], + [MATCH, SUB_G], + [MATCH + INS_5, INS_3 + SUB_G], + [MATCH, SUB_T], + [MATCH + INS_5, INS_3 + SUB_T], + [SUB_C, MATCH], + [SUB_C + INS_5, INS_3 + MATCH], + [SUB_C, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_G], + [SUB_C + INS_5, INS_3 + SUB_G], + [SUB_C, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_T], + [SUB_G, MATCH], + [SUB_G + INS_5, INS_3 + MATCH], + [SUB_G, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_G], + [SUB_G + INS_5, INS_3 + SUB_G], + [SUB_G, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_T], + [SUB_T, MATCH], + [SUB_T + INS_5, INS_3 + MATCH], + [SUB_T, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_G], + [SUB_T + INS_5, INS_3 + SUB_G], + [SUB_T, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_T]]) + + def test_low_qual(self): + """ Test with each low-quality base. """ + for base, low_qual in zip("ACGTN", [ANY_B, ANY_D, ANY_H, ANY_V, ANY_N], + strict=True): + self.assertEqual(self.list_rels(base, [1]), + [[low_qual]]) + + def test_low_qual_invalid(self): + """ Test that invalid low-qual positions raise ValueError. 
""" + seq = "ACGTN" + for n in range(1, len(seq) + 1): + self.assertTrue(isinstance(self.list_rels(seq[: n], [1]), list)) + self.assertTrue(isinstance(self.list_rels(seq[: n], [n]), list)) + self.assertRaises(ValueError, self.list_rels, seq[: n], [0]) + self.assertRaises(ValueError, self.list_rels, seq[: n], [n + 1]) + + def test_xaax(self): + """ Test that bases with no coverage are marked. """ + self.assertEqual(self.list_rels("TAAG", end5=2, end3=3), + [[NOCOV, MATCH, MATCH, NOCOV], + [NOCOV, MATCH + INS_5, INS_3 + MATCH, NOCOV], + [NOCOV, MATCH, SUB_C, NOCOV], + [NOCOV, MATCH + INS_5, INS_3 + SUB_C, NOCOV], + [NOCOV, MATCH, SUB_G, NOCOV], + [NOCOV, MATCH + INS_5, INS_3 + SUB_G, NOCOV], + [NOCOV, MATCH, SUB_T, NOCOV], + [NOCOV, MATCH + INS_5, INS_3 + SUB_T, NOCOV], + [NOCOV, SUB_C, MATCH, NOCOV], + [NOCOV, SUB_C + INS_5, INS_3 + MATCH, NOCOV], + [NOCOV, SUB_C, SUB_C, NOCOV], + [NOCOV, SUB_C + INS_5, INS_3 + SUB_C, NOCOV], + [NOCOV, SUB_C, SUB_G, NOCOV], + [NOCOV, SUB_C + INS_5, INS_3 + SUB_G, NOCOV], + [NOCOV, SUB_C, SUB_T, NOCOV], + [NOCOV, SUB_C + INS_5, INS_3 + SUB_T, NOCOV], + [NOCOV, SUB_G, MATCH, NOCOV], + [NOCOV, SUB_G + INS_5, INS_3 + MATCH, NOCOV], + [NOCOV, SUB_G, SUB_C, NOCOV], + [NOCOV, SUB_G + INS_5, INS_3 + SUB_C, NOCOV], + [NOCOV, SUB_G, SUB_G, NOCOV], + [NOCOV, SUB_G + INS_5, INS_3 + SUB_G, NOCOV], + [NOCOV, SUB_G, SUB_T, NOCOV], + [NOCOV, SUB_G + INS_5, INS_3 + SUB_T, NOCOV], + [NOCOV, SUB_T, MATCH, NOCOV], + [NOCOV, SUB_T + INS_5, INS_3 + MATCH, NOCOV], + [NOCOV, SUB_T, SUB_C, NOCOV], + [NOCOV, SUB_T + INS_5, INS_3 + SUB_C, NOCOV], + [NOCOV, SUB_T, SUB_G, NOCOV], + [NOCOV, SUB_T + INS_5, INS_3 + SUB_G, NOCOV], + [NOCOV, SUB_T, SUB_T, NOCOV], + [NOCOV, SUB_T + INS_5, INS_3 + SUB_T, NOCOV]]) + + def test_agg(self): + """ Test with sequence 'AGG'. 
""" + rels = self.list_rels("AGG") + self.assertEqual(rels, + [[MATCH, SUB_A, SUB_A], + [MATCH + INS_5, INS_3 + SUB_A, SUB_A], + [MATCH + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_A], + [MATCH, SUB_A + INS_5, INS_3 + SUB_A], + [MATCH, SUB_A, SUB_C], + [MATCH + INS_5, INS_3 + SUB_A, SUB_C], + [MATCH + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_C], + [MATCH, SUB_A + INS_5, INS_3 + SUB_C], + [MATCH, SUB_A, MATCH], + [MATCH + INS_5, INS_3 + SUB_A, MATCH], + [MATCH + INS_5, INS_3 + SUB_A + INS_5, INS_3 + MATCH], + [MATCH, SUB_A + INS_5, INS_3 + MATCH], + [MATCH, SUB_A, SUB_T], + [MATCH + INS_5, INS_3 + SUB_A, SUB_T], + [MATCH + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_T], + [MATCH, SUB_A + INS_5, INS_3 + SUB_T], + [MATCH, SUB_C, SUB_A], + [MATCH + INS_5, INS_3 + SUB_C, SUB_A], + [MATCH + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_A], + [MATCH, SUB_C + INS_5, INS_3 + SUB_A], + [MATCH, SUB_C, SUB_C], + [MATCH + INS_5, INS_3 + SUB_C, SUB_C], + [MATCH + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_C], + [MATCH, SUB_C + INS_5, INS_3 + SUB_C], + [MATCH, SUB_C, MATCH], + [MATCH + INS_5, INS_3 + SUB_C, MATCH], + [MATCH + INS_5, INS_3 + SUB_C + INS_5, INS_3 + MATCH], + [MATCH, SUB_C + INS_5, INS_3 + MATCH], + [MATCH, SUB_C, SUB_T], + [MATCH + INS_5, INS_3 + SUB_C, SUB_T], + [MATCH + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_T], + [MATCH, SUB_C + INS_5, INS_3 + SUB_T], + [MATCH, MATCH, SUB_A], + [MATCH + INS_5, INS_3 + MATCH, SUB_A], + [MATCH + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_A], + [MATCH, MATCH + INS_5, INS_3 + SUB_A], + [MATCH, MATCH, SUB_C], + [MATCH + INS_5, INS_3 + MATCH, SUB_C], + [MATCH + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_C], + [MATCH, MATCH + INS_5, INS_3 + SUB_C], + [MATCH, MATCH, MATCH], + [MATCH + INS_5, INS_3 + MATCH, MATCH], + [MATCH + INS_5, INS_3 + MATCH + INS_5, INS_3 + MATCH], + [MATCH, MATCH + INS_5, INS_3 + MATCH], + [MATCH, MATCH, SUB_T], + [MATCH + INS_5, INS_3 + MATCH, SUB_T], + [MATCH + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_T], + [MATCH, MATCH + INS_5, INS_3 + SUB_T], + [MATCH, SUB_T, SUB_A], + [MATCH + INS_5, INS_3 + SUB_T, SUB_A], + [MATCH + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_A], + [MATCH, SUB_T + INS_5, INS_3 + SUB_A], + [MATCH, SUB_T, SUB_C], + [MATCH + INS_5, INS_3 + SUB_T, SUB_C], + [MATCH + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_C], + [MATCH, SUB_T + INS_5, INS_3 + SUB_C], + [MATCH, SUB_T, MATCH], + [MATCH + INS_5, INS_3 + SUB_T, MATCH], + [MATCH + INS_5, INS_3 + SUB_T + INS_5, INS_3 + MATCH], + [MATCH, SUB_T + INS_5, INS_3 + MATCH], + [MATCH, SUB_T, SUB_T], + [MATCH + INS_5, INS_3 + SUB_T, SUB_T], + [MATCH + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_T], + [MATCH, SUB_T + INS_5, INS_3 + SUB_T], + [MATCH, DELET, SUB_A], + [MATCH, DELET, SUB_C], + [MATCH, DELET, MATCH], + [MATCH, DELET, SUB_T], + [SUB_C, SUB_A, SUB_A], + [SUB_C + INS_5, INS_3 + SUB_A, SUB_A], + [SUB_C + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_A], + [SUB_C, SUB_A + INS_5, INS_3 + SUB_A], + [SUB_C, SUB_A, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_A, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_A + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_A, MATCH], + [SUB_C + INS_5, INS_3 + SUB_A, MATCH], + [SUB_C + INS_5, INS_3 + SUB_A + INS_5, INS_3 + MATCH], + [SUB_C, SUB_A + INS_5, INS_3 + MATCH], + [SUB_C, SUB_A, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_A, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_T], + [SUB_C, SUB_A + INS_5, INS_3 + SUB_T], + [SUB_C, SUB_C, SUB_A], + [SUB_C + INS_5, INS_3 + SUB_C, SUB_A], + [SUB_C + INS_5, INS_3 + SUB_C + INS_5, INS_3 
+ SUB_A], + [SUB_C, SUB_C + INS_5, INS_3 + SUB_A], + [SUB_C, SUB_C, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_C, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_C + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_C, MATCH], + [SUB_C + INS_5, INS_3 + SUB_C, MATCH], + [SUB_C + INS_5, INS_3 + SUB_C + INS_5, INS_3 + MATCH], + [SUB_C, SUB_C + INS_5, INS_3 + MATCH], + [SUB_C, SUB_C, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_C, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_T], + [SUB_C, SUB_C + INS_5, INS_3 + SUB_T], + [SUB_C, MATCH, SUB_A], + [SUB_C + INS_5, INS_3 + MATCH, SUB_A], + [SUB_C + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_A], + [SUB_C, MATCH + INS_5, INS_3 + SUB_A], + [SUB_C, MATCH, SUB_C], + [SUB_C + INS_5, INS_3 + MATCH, SUB_C], + [SUB_C + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_C], + [SUB_C, MATCH + INS_5, INS_3 + SUB_C], + [SUB_C, MATCH, MATCH], + [SUB_C + INS_5, INS_3 + MATCH, MATCH], + [SUB_C + INS_5, INS_3 + MATCH + INS_5, INS_3 + MATCH], + [SUB_C, MATCH + INS_5, INS_3 + MATCH], + [SUB_C, MATCH, SUB_T], + [SUB_C + INS_5, INS_3 + MATCH, SUB_T], + [SUB_C + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_T], + [SUB_C, MATCH + INS_5, INS_3 + SUB_T], + [SUB_C, SUB_T, SUB_A], + [SUB_C + INS_5, INS_3 + SUB_T, SUB_A], + [SUB_C + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_A], + [SUB_C, SUB_T + INS_5, INS_3 + SUB_A], + [SUB_C, SUB_T, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_T, SUB_C], + [SUB_C + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_T + INS_5, INS_3 + SUB_C], + [SUB_C, SUB_T, MATCH], + [SUB_C + INS_5, INS_3 + SUB_T, MATCH], + [SUB_C + INS_5, INS_3 + SUB_T + INS_5, INS_3 + MATCH], + [SUB_C, SUB_T + INS_5, INS_3 + MATCH], + [SUB_C, SUB_T, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_T, SUB_T], + [SUB_C + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_T], + [SUB_C, SUB_T + INS_5, INS_3 + SUB_T], + [SUB_C, DELET, SUB_A], + [SUB_C, DELET, SUB_C], + [SUB_C, DELET, MATCH], + [SUB_C, DELET, SUB_T], + [SUB_G, SUB_A, SUB_A], + [SUB_G + INS_5, INS_3 + SUB_A, SUB_A], + [SUB_G + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_A], + [SUB_G, SUB_A + INS_5, INS_3 + SUB_A], + [SUB_G, SUB_A, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_A, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_A + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_A, MATCH], + [SUB_G + INS_5, INS_3 + SUB_A, MATCH], + [SUB_G + INS_5, INS_3 + SUB_A + INS_5, INS_3 + MATCH], + [SUB_G, SUB_A + INS_5, INS_3 + MATCH], + [SUB_G, SUB_A, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_A, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_T], + [SUB_G, SUB_A + INS_5, INS_3 + SUB_T], + [SUB_G, SUB_C, SUB_A], + [SUB_G + INS_5, INS_3 + SUB_C, SUB_A], + [SUB_G + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_A], + [SUB_G, SUB_C + INS_5, INS_3 + SUB_A], + [SUB_G, SUB_C, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_C, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_C + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_C, MATCH], + [SUB_G + INS_5, INS_3 + SUB_C, MATCH], + [SUB_G + INS_5, INS_3 + SUB_C + INS_5, INS_3 + MATCH], + [SUB_G, SUB_C + INS_5, INS_3 + MATCH], + [SUB_G, SUB_C, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_C, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_T], + [SUB_G, SUB_C + INS_5, INS_3 + SUB_T], + [SUB_G, MATCH, SUB_A], + [SUB_G + INS_5, INS_3 + MATCH, SUB_A], + [SUB_G + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_A], + [SUB_G, MATCH + INS_5, INS_3 + SUB_A], + [SUB_G, MATCH, SUB_C], + [SUB_G + INS_5, INS_3 + MATCH, SUB_C], + [SUB_G + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_C], 
+ [SUB_G, MATCH + INS_5, INS_3 + SUB_C], + [SUB_G, MATCH, MATCH], + [SUB_G + INS_5, INS_3 + MATCH, MATCH], + [SUB_G + INS_5, INS_3 + MATCH + INS_5, INS_3 + MATCH], + [SUB_G, MATCH + INS_5, INS_3 + MATCH], + [SUB_G, MATCH, SUB_T], + [SUB_G + INS_5, INS_3 + MATCH, SUB_T], + [SUB_G + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_T], + [SUB_G, MATCH + INS_5, INS_3 + SUB_T], + [SUB_G, SUB_T, SUB_A], + [SUB_G + INS_5, INS_3 + SUB_T, SUB_A], + [SUB_G + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_A], + [SUB_G, SUB_T + INS_5, INS_3 + SUB_A], + [SUB_G, SUB_T, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_T, SUB_C], + [SUB_G + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_T + INS_5, INS_3 + SUB_C], + [SUB_G, SUB_T, MATCH], + [SUB_G + INS_5, INS_3 + SUB_T, MATCH], + [SUB_G + INS_5, INS_3 + SUB_T + INS_5, INS_3 + MATCH], + [SUB_G, SUB_T + INS_5, INS_3 + MATCH], + [SUB_G, SUB_T, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_T, SUB_T], + [SUB_G + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_T], + [SUB_G, SUB_T + INS_5, INS_3 + SUB_T], + [SUB_G, DELET, SUB_A], + [SUB_G, DELET, SUB_C], + [SUB_G, DELET, MATCH], + [SUB_G, DELET, SUB_T], + [SUB_T, SUB_A, SUB_A], + [SUB_T + INS_5, INS_3 + SUB_A, SUB_A], + [SUB_T + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_A], + [SUB_T, SUB_A + INS_5, INS_3 + SUB_A], + [SUB_T, SUB_A, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_A, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_A + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_A, MATCH], + [SUB_T + INS_5, INS_3 + SUB_A, MATCH], + [SUB_T + INS_5, INS_3 + SUB_A + INS_5, INS_3 + MATCH], + [SUB_T, SUB_A + INS_5, INS_3 + MATCH], + [SUB_T, SUB_A, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_A, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_A + INS_5, INS_3 + SUB_T], + [SUB_T, SUB_A + INS_5, INS_3 + SUB_T], + [SUB_T, SUB_C, SUB_A], + [SUB_T + INS_5, INS_3 + SUB_C, SUB_A], + [SUB_T + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_A], + [SUB_T, SUB_C + INS_5, INS_3 + SUB_A], + [SUB_T, SUB_C, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_C, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_C + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_C, MATCH], + [SUB_T + INS_5, INS_3 + SUB_C, MATCH], + [SUB_T + INS_5, INS_3 + SUB_C + INS_5, INS_3 + MATCH], + [SUB_T, SUB_C + INS_5, INS_3 + MATCH], + [SUB_T, SUB_C, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_C, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_C + INS_5, INS_3 + SUB_T], + [SUB_T, SUB_C + INS_5, INS_3 + SUB_T], + [SUB_T, MATCH, SUB_A], + [SUB_T + INS_5, INS_3 + MATCH, SUB_A], + [SUB_T + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_A], + [SUB_T, MATCH + INS_5, INS_3 + SUB_A], + [SUB_T, MATCH, SUB_C], + [SUB_T + INS_5, INS_3 + MATCH, SUB_C], + [SUB_T + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_C], + [SUB_T, MATCH + INS_5, INS_3 + SUB_C], + [SUB_T, MATCH, MATCH], + [SUB_T + INS_5, INS_3 + MATCH, MATCH], + [SUB_T + INS_5, INS_3 + MATCH + INS_5, INS_3 + MATCH], + [SUB_T, MATCH + INS_5, INS_3 + MATCH], + [SUB_T, MATCH, SUB_T], + [SUB_T + INS_5, INS_3 + MATCH, SUB_T], + [SUB_T + INS_5, INS_3 + MATCH + INS_5, INS_3 + SUB_T], + [SUB_T, MATCH + INS_5, INS_3 + SUB_T], + [SUB_T, SUB_T, SUB_A], + [SUB_T + INS_5, INS_3 + SUB_T, SUB_A], + [SUB_T + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_A], + [SUB_T, SUB_T + INS_5, INS_3 + SUB_A], + [SUB_T, SUB_T, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_T, SUB_C], + [SUB_T + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_T + INS_5, INS_3 + SUB_C], + [SUB_T, SUB_T, MATCH], + [SUB_T + INS_5, INS_3 + SUB_T, MATCH], + [SUB_T + INS_5, INS_3 + SUB_T + INS_5, INS_3 + MATCH], + [SUB_T, 
SUB_T + INS_5, INS_3 + MATCH], + [SUB_T, SUB_T, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_T, SUB_T], + [SUB_T + INS_5, INS_3 + SUB_T + INS_5, INS_3 + SUB_T], + [SUB_T, SUB_T + INS_5, INS_3 + SUB_T], + [SUB_T, DELET, SUB_A], + [SUB_T, DELET, SUB_C], + [SUB_T, DELET, MATCH], + [SUB_T, DELET, SUB_T]]) + + +class TestIterRelvecsAll(ut.TestCase): + """ Test function `iter_relvecs_all`. """ + + def assert_equal(self, ref: DNA, expects: list): + """ Check that the expected and actual results match. """ + for exp, res in zip(chain(*expects), + iter_relvecs_all(ref), + strict=True): + with self.subTest(exp=exp, res=res): + self.assertTrue(np.all(exp == res)) + + def test_length_1(self): + """ Test with all length-1 DNA sequences. """ + for ref in expand_degenerate_seq(DNA("N")): + expects = [ + iter_relvecs_q53(ref, [], 1, 1), + iter_relvecs_q53(ref, [1], 1, 1), + ] + self.assert_equal(ref, expects) + + def test_length_2(self): + """ Test with all length-2 DNA sequences. """ + for ref in expand_degenerate_seq(DNA("NN")): + expects = [ + iter_relvecs_q53(ref, [], 1, 1), + iter_relvecs_q53(ref, [1], 1, 1), + iter_relvecs_q53(ref, [], 1, 2), + iter_relvecs_q53(ref, [1], 1, 2), + iter_relvecs_q53(ref, [2], 1, 2), + iter_relvecs_q53(ref, [1, 2], 1, 2), + iter_relvecs_q53(ref, [], 2, 2), + iter_relvecs_q53(ref, [2], 2, 2), + ] + self.assert_equal(ref, expects) + + def test_length_3(self): + """ Test with all length-3 DNA sequences. """ + for ref in expand_degenerate_seq(DNA("NNN")): + expects = [ + iter_relvecs_q53(ref, [], 1, 1), + iter_relvecs_q53(ref, [1], 1, 1), + iter_relvecs_q53(ref, [], 1, 2), + iter_relvecs_q53(ref, [1], 1, 2), + iter_relvecs_q53(ref, [2], 1, 2), + iter_relvecs_q53(ref, [1, 2], 1, 2), + iter_relvecs_q53(ref, [], 1, 3), + iter_relvecs_q53(ref, [1], 1, 3), + iter_relvecs_q53(ref, [2], 1, 3), + iter_relvecs_q53(ref, [3], 1, 3), + iter_relvecs_q53(ref, [1, 2], 1, 3), + iter_relvecs_q53(ref, [1, 3], 1, 3), + iter_relvecs_q53(ref, [2, 3], 1, 3), + iter_relvecs_q53(ref, [1, 2, 3], 1, 3), + iter_relvecs_q53(ref, [], 2, 2), + iter_relvecs_q53(ref, [2], 2, 2), + iter_relvecs_q53(ref, [], 2, 3), + iter_relvecs_q53(ref, [2], 2, 3), + iter_relvecs_q53(ref, [3], 2, 3), + iter_relvecs_q53(ref, [2, 3], 2, 3), + iter_relvecs_q53(ref, [], 3, 3), + iter_relvecs_q53(ref, [3], 3, 3), + ] + self.assert_equal(ref, expects) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. # +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . 
# +# # +######################################################################## diff --git a/src/seismicrna/relate/tests/relate_test.py b/src/seismicrna/relate/tests/relate_test.py new file mode 100644 index 00000000..4429936f --- /dev/null +++ b/src/seismicrna/relate/tests/relate_test.py @@ -0,0 +1,131 @@ +import unittest as ut +from sys import byteorder + +import pandas as pd + +from ..iterread import iter_alignments +from ..relate import relate_line +from ..seqpos import format_seq_pos +from ...align.sim import as_sam +from ...core.cli import opt_min_mapq +from ...core.qual import OK_QUAL +from ...core.rel import NOCOV +from ...core.seq import DNA + + +class TestSeqposFormatSeqPos(ut.TestCase): + """ Test function `seqpos.format_seq_pos`. """ + + def test_acgt_index_1_acgt(self): + """ Test with ACGT, 1-indexed. """ + index = format_seq_pos(DNA("ACGT"), [1, 2, 3, 4], 1) + expect = pd.Index(["A1", "C2", "G3", "T4"]) + self.assertTrue(index.equals(expect)) + + def test_acgt_index_1_cg(self): + """ Test with ACGT, 1-indexed. """ + index = format_seq_pos(DNA("ACGT"), [2, 3], 1) + expect = pd.Index(["C2", "G3"]) + self.assertTrue(index.equals(expect)) + + def test_acgt_index_58_acgt(self): + """ Test with ACGT, 58-indexed. """ + index = format_seq_pos(DNA("ACGT"), [58, 59, 60, 61], 58) + expect = pd.Index(["A58", "C59", "G60", "T61"]) + self.assertTrue(index.equals(expect)) + + def test_acgt_index_58_cg(self): + """ Test with ACGT, 58-indexed. """ + index = format_seq_pos(DNA("ACGT"), [59, 60], 58) + expect = pd.Index(["C59", "G60"]) + self.assertTrue(index.equals(expect)) + + +class TestRelateRelateLineAmbrel(ut.TestCase): + """ Test function `relate.relate_line`. """ + + @staticmethod + def relate(ref: str, + refseq: DNA, + read: DNA, + qual: str, + mapq: int, + cigar: str, + end5: int): + """ Generate a SAM line from the given information, and use it + to compute a relation vector. """ + sam_line = as_sam("read", + 99, + ref, + end5, + mapq, + cigar, + "=", + 1, + len(read), + read, + qual) + muts = bytearray(NOCOV.to_bytes(1, byteorder) * len(refseq)) + relate_line(muts, + sam_line, + ref, + refseq, + len(refseq), + opt_min_mapq.default, + OK_QUAL, + True) + return muts + + def iter_cases(self, refseq: DNA, max_ins: int = 2): + """ Iterate through every test case. """ + for read, qual, cigar, end5, end3, relvec in iter_alignments(refseq, + max_ins, + max_ins, + max_ins): + result = self.relate("ref", + refseq, + read, + qual, + opt_min_mapq.default, + cigar, + end5) + with self.subTest(relvec=relvec, result=result): + self.assertEqual(relvec.tobytes(), result) + + def test_aaaa_0ins(self): + """ Test all possible reads with 0 insertions from AAAA. """ + self.iter_cases(DNA("AAAA"), 0) + + @ut.skip("Takes a long time to run") + def test_aaaaaa_0ins(self): + """ Test all possible reads with 0 insertions from AAAAAA. """ + self.iter_cases(DNA("AAAAAA"), 0) + + def test_aacc_1ins(self): + """ Test all possible reads with ≤ 1 insertion from AACC. """ + self.iter_cases(DNA("AACC"), 1) + + def test_acgt_1ins(self): + """ Test all possible reads with ≤ 1 insertion from ACGT. """ + self.iter_cases(DNA("ACGT"), 1) + +######################################################################## +# # +# Copyright ©2023, the Rouskin Lab. # +# # +# This file is part of SEISMIC-RNA. 
# +# # +# SEISMIC-RNA is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 3 of the License, or # +# (at your option) any later version. # +# # +# SEISMIC-RNA is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT- # +# ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # +# Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with SEISMIC-RNA; if not, see . # +# # +########################################################################
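
The new test modules above specify the behavior of encode_relate and encode_match entirely through their expected return codes. As a reading aid, the following is a minimal, self-contained Python sketch of that behavior — not the SEISMIC-RNA implementation. The bit values assigned to the relation codes below are assumptions chosen for illustration (the package defines its own constants in core.rel and core.qual, and the real functions also validate their inputs); only the function signatures and the input/output pairs mirror what encode_test.py asserts. The quality stand-ins 'I' (high) and '!' (low) follow the expected quality strings in invert_test.py.

# Assumed bit flags: each definite relationship is one bit, so ambiguous
# relationships (low quality, N bases) can be expressed as unions of bits,
# which is why the tests build codes like ANY_N - SUB_A and SUB_C + INS_5.
MATCH = 1     # read base matches the reference base
SUB_A = 16    # substitution to A
SUB_C = 32    # substitution to C
SUB_G = 64    # substitution to G
SUB_T = 128   # substitution to T
ANY_N = MATCH | SUB_A | SUB_C | SUB_G | SUB_T   # could be anything
ANY_B = ANY_N - SUB_A   # anything except A (IUPAC B): reference base is A
ANY_D = ANY_N - SUB_C   # anything except C (IUPAC D): reference base is C
ANY_H = ANY_N - SUB_G   # anything except G (IUPAC H): reference base is G
ANY_V = ANY_N - SUB_T   # anything except T (IUPAC V): reference base is T

_SUBS = {'A': SUB_A, 'C': SUB_C, 'G': SUB_G, 'T': SUB_T}
_ANYS = {'A': ANY_B, 'C': ANY_D, 'G': ANY_H, 'T': ANY_V, 'N': ANY_N}


def encode_relate(ref: str, read: str, qual: str, min_qual: str) -> int:
    """ Encode the relationship between a reference base and a read base
    (sketch of the cases checked by TestEncodeRelate). """
    if qual < min_qual or ref == 'N' or read == 'N':
        # A low-quality or ambiguous call could be anything except a
        # substitution to the reference base itself.
        return _ANYS[ref]
    # A confident call is either a match or a definite substitution.
    return MATCH if read == ref else _SUBS[read]


def encode_match(read: str, qual: str, min_qual: str) -> int:
    """ Encode a read base that the aligner already reported as a match
    (sketch of the cases checked by TestEncodeMatch). """
    if qual < min_qual:
        return _ANYS[read]
    return ANY_N if read == 'N' else MATCH


if __name__ == "__main__":
    HI, LO = 'I', '!'   # stand-ins for core.qual.HI_QUAL / LO_QUAL
    assert encode_relate('A', 'A', HI, HI) == MATCH
    assert encode_relate('A', 'C', HI, HI) == SUB_C
    assert encode_relate('A', 'C', LO, HI) == ANY_B   # low quality: anything but A
    assert encode_match('N', HI, HI) == ANY_N
    print("sketch agrees with the spot-checked cases")

The bitwise design is also what invert_test.py and iterrelv_test.py rely on when they compose codes such as MATCH + INS_5 or DELET + INS_5: every composite relation vector entry is a union of the single-bit relationships, and the inverse functions must either decompose it into a read, quality string, and CIGAR, or reject it as contradictory.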