Skip to content

Commit

Permalink
fixup! Add positional substitution matrices
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Sep 8, 2024
1 parent fd9af16 commit 9c48739
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 40 deletions.
19 changes: 16 additions & 3 deletions src/biotite/sequence/align/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import textwrap
from collections.abc import Sequence
import numpy as np
from biotite.sequence.alphabet import LetterAlphabet

__all__ = [
"Alignment",
Expand Down Expand Up @@ -111,7 +110,7 @@ def _gapped_str(self, seq_index):
for i in range(len(self.trace)):
j = self.trace[i][seq_index]
if j != -1:
seq_str += self.sequences[seq_index][j]
seq_str += str(self.sequences[seq_index][j])
else:
seq_str += "-"
return seq_str
Expand All @@ -133,7 +132,7 @@ def __str__(self):
# has an non-single letter alphabet
all_single_letter = True
for seq in self.sequences:
if not isinstance(seq.get_alphabet(), LetterAlphabet):
if not _is_single_letter(seq.alphabet):
all_single_letter = False
if all_single_letter:
# First dimension: sequence number,
Expand Down Expand Up @@ -665,3 +664,17 @@ def remove_terminal_gaps(alignment):
"no overlap and the resulting alignment would be empty"
)
return alignment[start:stop]


def _is_single_letter(alphabet):
"""
More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`:
It is sufficient that only only the string representation of each symbol is only
a single character.
"""
if alphabet.is_letter_alphabet():
return True
for symbol in alphabet:
if len(str(symbol)) != 1:
return False
return True
10 changes: 7 additions & 3 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@

import os
import numpy as np
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence, PositionalSequence
from biotite.sequence.seqtypes import (
NucleotideSequence,
PositionalSequence,
ProteinSequence,
)

__all__ = ["SubstitutionMatrix"]

Expand Down Expand Up @@ -269,7 +273,7 @@ def get_score(self, symbol1, symbol2):
code2 = self._alph2.encode(symbol2)
return self._matrix[code1, code2]

def into_positional(self, sequence1, sequence2):
def as_positional(self, sequence1, sequence2):
"""
Transform this substitution matrix and two sequences into positional
equivalents.
Expand Down Expand Up @@ -459,4 +463,4 @@ def _cartesian_product(array1, array2):
np.repeat(array1, len(array2)),
np.tile(array2, len(array1)),
]
)
)
126 changes: 92 additions & 34 deletions src/biotite/sequence/seqtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,17 @@
"NucleotideSequence",
"ProteinSequence",
"PositionalSequence",
"PositionalSymbol",
"PurePositionalSequence",
]

from dataclasses import dataclass
from dataclasses import dataclass, field
import numpy as np
from biotite.sequence.alphabet import AlphabetError, Alphabet, AlphabetMapper, LetterAlphabet
from biotite.sequence.alphabet import (
Alphabet,
AlphabetError,
AlphabetMapper,
LetterAlphabet,
)
from biotite.sequence.sequence import Sequence


Expand Down Expand Up @@ -599,55 +604,108 @@ def get_molecular_weight(self, monoisotopic=False):
return weight


@dataclass(frozen=True)
class PositionalSymbol:
"""
Combination of a symbol and its position in a sequence.
Attributes
----------
symbol : object
The symbol from the original alphabet.
position : int
The 0-based position of the symbol in the sequence.
See Also
--------
PositionalSequence
The sequence type containing :class:`PositionalSymbol` objects.
"""

symbol: ...
position: ...

def __str__(self):
return str(self.symbol)


class PositionalSequence(Sequence):
"""
A sequence where each symbol is associated with a position.
The alphabet contains `PositionalSymbol` objects as symbols.
Its symbols are
This is useful for aligning sequences based on a position-specific
substitution matrix.
Parameters
----------
base_sequence : seq.Sequence
The base sequence to create the positional sequence from.
original_sequence : seq.Sequence
The original sequence to create the positional sequence from.
"""

def __init__(self, base_sequence):
@dataclass(frozen=True)
class Symbol:
"""
Combination of a symbol and its position in a sequence.
Attributes
----------
original_alphabet : Alphabet
The original alphabet, where the symbol stems from.
original_code : int
The code of the original symbol in the original alphabet.
position : int
The 0-based position of the symbol in the sequence.
symbol : object
The symbol from the original alphabet.
See Also
--------
PositionalSequence
The sequence type containing :class:`PositionalSymbol` objects.
"""

original_alphabet: ...
original_code: ...
position: ...
symbol: ... = field(init=False)

def __post_init__(self):
sym = self.original_alphabet.decode(self.original_code)
super().__setattr__("symbol", sym)

def __str__(self):
return str(self.symbol)

def __init__(self, original_sequence):
self._orig_alphabet = original_sequence.get_alphabet()
self._alphabet = Alphabet(
[PositionalSymbol(sym, i) for i, sym in enumerate(base_sequence.symbols)]
[
PositionalSequence.Symbol(self._orig_alphabet, code, pos)
for pos, code in enumerate(original_sequence.code)
]
)
self.code = np.arange(
len(base_sequence), dtype=Sequence.dtype(len(self._alphabet))
len(original_sequence), dtype=Sequence.dtype(len(self._alphabet))
)

def reconstruct(self):
"""
Reconstruct the original sequence from the positional sequence.
Returns
-------
original_sequence : GeneralSequence
The original sequence.
Although the actual type of the returned sequence is always a
:class:`GeneralSequence`, the alphabet and the symbols of the returned
sequence are equal to the original sequence.
"""
original_sequence = GeneralSequence(self._orig_alphabet)
original_sequence.code = np.array([sym.original_code for sym in self._alphabet])
return original_sequence

def get_alphabet(self):
return self._alphabet

def __str__(self) -> str:
return "".join([str(sym) for sym in self.symbols])

def __repr__(self):
return f"PositionalSequence({self.reconstruct()!r})"


class PurePositionalSequence(Sequence):
"""
An object of this class is a 'placeholder' sequence, where each symbol is the
position in the sequence itself.
This class is similar to :class:`PositionalSequence`, but the symbols are not
derived from an original sequence, but are the pure position.
Hence, there is no meaningful string representation of the sequence and its symbols.
"""

def __init__(self, length):
self._alphabet = Alphabet(range(length))
self.code = np.arange(length, dtype=Sequence.dtype(length))

def get_alphabet(self):
return self._alphabet

def __repr__(self):
return f"PurePositionalSequence({len(self)})"
29 changes: 29 additions & 0 deletions tests/sequence/align/test_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,32 @@ def test_matrix_str():
"b 3 4 5",
"c 6 7 8"]
) # fmt: skip


@pytest.mark.parametrize("seed", range(10))
def test_as_positional(seed):
"""
Check if the score from a positional substitution matrix for symbols from
positional sequences is equal to the score from the original matrix for the
corresponding symbols from the original sequences.
"""
MAX_SEQ_LENGTH = 10

np.random.seed(seed)
matrix = align.SubstitutionMatrix.std_protein_matrix()
sequences = []
for _ in range(2):
seq_length = np.random.randint(1, MAX_SEQ_LENGTH)
sequence = seq.ProteinSequence()
sequence.code = np.random.randint(0, len(sequence.alphabet), size=seq_length)
sequences.append(sequence)
pos_matrix, *pos_sequences = matrix.as_positional(*sequences)

# Sanity check if the positional sequences do correspond to the original sequences
for i in range(2):
assert str(pos_sequences[i]) == str(sequences[i])
for i in range(len(sequences[0])):
for j in range(len(sequences[1])):
ref_score = matrix.get_score(sequences[0][i], sequences[1][j])
test_score = pos_matrix.get_score(pos_sequences[0][i], pos_sequences[1][j])
assert test_score == ref_score
21 changes: 21 additions & 0 deletions tests/sequence/test_seqtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

import numpy as np
import pytest
import biotite.sequence as seq

Expand Down Expand Up @@ -90,3 +91,23 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW")
mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic)
assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2)


def test_positional_sequence():
"""
Check whether the original sequence symbols can be reconstructed from a
:class:`PositionalSequence`.
"""
SEQ_LENGTH = 100

np.random.seed(0)
orig_sequence = seq.ProteinSequence()
orig_sequence.code = np.random.randint(
0, len(orig_sequence.alphabet), size=SEQ_LENGTH
)
positional_sequence = seq.PositionalSequence(orig_sequence)

assert str(positional_sequence) == str(orig_sequence)
reconstructed_sequence = positional_sequence.reconstruct()
assert np.all(reconstructed_sequence.code == orig_sequence.code)
assert reconstructed_sequence.alphabet == orig_sequence.alphabet
4 changes: 4 additions & 0 deletions tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
LetterAlphabet,
Location,
NucleotideSequence,
PositionalSequence,
ProteinSequence,
PurePositionalSequence,
SequenceProfile,
)
from biotite.sequence.align import Alignment, SubstitutionMatrix
Expand All @@ -30,6 +32,8 @@
NucleotideSequence("AACTGCTA"),
NucleotideSequence("AACTGCTA", ambiguous=True),
ProteinSequence("BIQTITE"),
PositionalSequence(NucleotideSequence("AACTGCTA")),
PurePositionalSequence(123),
Alphabet(["X", "Y", "Z"]),
GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]),
LetterAlphabet(["X", "Y", "Z"]),
Expand Down

0 comments on commit 9c48739

Please sign in to comment.