fixup! Add positional substitution matrices

biotite-dev · Sep 8, 2024 · 9c48739 · 9c48739
1 parent fd9af16
commit 9c48739
Show file tree

Hide file tree

Showing 6 changed files with 169 additions and 40 deletions.
diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py
@@ -9,7 +9,6 @@
 import textwrap
 from collections.abc import Sequence
 import numpy as np
-from biotite.sequence.alphabet import LetterAlphabet
 
 __all__ = [
     "Alignment",
@@ -111,7 +110,7 @@ def _gapped_str(self, seq_index):
         for i in range(len(self.trace)):
             j = self.trace[i][seq_index]
             if j != -1:
-                seq_str += self.sequences[seq_index][j]
+                seq_str += str(self.sequences[seq_index][j])
             else:
                 seq_str += "-"
         return seq_str
@@ -133,7 +132,7 @@ def __str__(self):
         # has an non-single letter alphabet
         all_single_letter = True
         for seq in self.sequences:
-            if not isinstance(seq.get_alphabet(), LetterAlphabet):
+            if not _is_single_letter(seq.alphabet):
                 all_single_letter = False
         if all_single_letter:
             # First dimension: sequence number,
@@ -665,3 +664,17 @@ def remove_terminal_gaps(alignment):
             "no overlap and the resulting alignment would be empty"
         )
     return alignment[start:stop]
+
+
+def _is_single_letter(alphabet):
+    """
+    More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`:
+    It is sufficient that only only the string representation of each symbol is only
+    a single character.
+    """
+    if alphabet.is_letter_alphabet():
+        return True
+    for symbol in alphabet:
+        if len(str(symbol)) != 1:
+            return False
+    return True
diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py
@@ -7,7 +7,11 @@
 
 import os
 import numpy as np
-from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence, PositionalSequence
+from biotite.sequence.seqtypes import (
+    NucleotideSequence,
+    PositionalSequence,
+    ProteinSequence,
+)
 
 __all__ = ["SubstitutionMatrix"]
 
@@ -269,7 +273,7 @@ def get_score(self, symbol1, symbol2):
         code2 = self._alph2.encode(symbol2)
         return self._matrix[code1, code2]
 
-    def into_positional(self, sequence1, sequence2):
+    def as_positional(self, sequence1, sequence2):
         """
         Transform this substitution matrix and two sequences into positional
         equivalents.
@@ -459,4 +463,4 @@ def _cartesian_product(array1, array2):
             np.repeat(array1, len(array2)),
             np.tile(array2, len(array1)),
         ]
-    )
+    )
diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py
@@ -9,12 +9,17 @@
     "NucleotideSequence",
     "ProteinSequence",
     "PositionalSequence",
-    "PositionalSymbol",
+    "PurePositionalSequence",
 ]
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import numpy as np
-from biotite.sequence.alphabet import AlphabetError, Alphabet, AlphabetMapper, LetterAlphabet
+from biotite.sequence.alphabet import (
+    Alphabet,
+    AlphabetError,
+    AlphabetMapper,
+    LetterAlphabet,
+)
 from biotite.sequence.sequence import Sequence
 
 
@@ -599,55 +604,108 @@ def get_molecular_weight(self, monoisotopic=False):
         return weight
 
 
-@dataclass(frozen=True)
-class PositionalSymbol:
-    """
-    Combination of a symbol and its position in a sequence.
-
-    Attributes
-    ----------
-    symbol : object
-        The symbol from the original alphabet.
-    position : int
-        The 0-based position of the symbol in the sequence.
-
-    See Also
-    --------
-    PositionalSequence
-        The sequence type containing :class:`PositionalSymbol` objects.
-    """
-
-    symbol: ...
-    position: ...
-
-    def __str__(self):
-        return str(self.symbol)
-
-
 class PositionalSequence(Sequence):
     """
     A sequence where each symbol is associated with a position.
 
-    The alphabet contains `PositionalSymbol` objects as symbols.
+    Its symbols are
     This is useful for aligning sequences based on a position-specific
     substitution matrix.
 
     Parameters
     ----------
-    base_sequence : seq.Sequence
-        The base sequence to create the positional sequence from.
+    original_sequence : seq.Sequence
+        The original sequence to create the positional sequence from.
     """
 
-    def __init__(self, base_sequence):
+    @dataclass(frozen=True)
+    class Symbol:
+        """
+        Combination of a symbol and its position in a sequence.
+
+        Attributes
+        ----------
+        original_alphabet : Alphabet
+            The original alphabet, where the symbol stems from.
+        original_code : int
+            The code of the original symbol in the original alphabet.
+        position : int
+            The 0-based position of the symbol in the sequence.
+        symbol : object
+            The symbol from the original alphabet.
+
+        See Also
+        --------
+        PositionalSequence
+            The sequence type containing :class:`PositionalSymbol` objects.
+        """
+
+        original_alphabet: ...
+        original_code: ...
+        position: ...
+        symbol: ... = field(init=False)
+
+        def __post_init__(self):
+            sym = self.original_alphabet.decode(self.original_code)
+            super().__setattr__("symbol", sym)
+
+        def __str__(self):
+            return str(self.symbol)
+
+    def __init__(self, original_sequence):
+        self._orig_alphabet = original_sequence.get_alphabet()
         self._alphabet = Alphabet(
-            [PositionalSymbol(sym, i) for i, sym in enumerate(base_sequence.symbols)]
+            [
+                PositionalSequence.Symbol(self._orig_alphabet, code, pos)
+                for pos, code in enumerate(original_sequence.code)
+            ]
         )
         self.code = np.arange(
-            len(base_sequence), dtype=Sequence.dtype(len(self._alphabet))
+            len(original_sequence), dtype=Sequence.dtype(len(self._alphabet))
         )
 
+    def reconstruct(self):
+        """
+        Reconstruct the original sequence from the positional sequence.
+
+        Returns
+        -------
+        original_sequence : GeneralSequence
+            The original sequence.
+            Although the actual type of the returned sequence is always a
+            :class:`GeneralSequence`, the alphabet and the symbols of the returned
+            sequence are equal to the original sequence.
+        """
+        original_sequence = GeneralSequence(self._orig_alphabet)
+        original_sequence.code = np.array([sym.original_code for sym in self._alphabet])
+        return original_sequence
+
     def get_alphabet(self):
         return self._alphabet
 
     def __str__(self) -> str:
         return "".join([str(sym) for sym in self.symbols])
+
+    def __repr__(self):
+        return f"PositionalSequence({self.reconstruct()!r})"
+
+
+class PurePositionalSequence(Sequence):
+    """
+    An object of this class is a 'placeholder' sequence, where each symbol is the
+    position in the sequence itself.
+
+    This class is similar to :class:`PositionalSequence`, but the symbols are not
+    derived from an original sequence, but are the pure position.
+    Hence, there is no meaningful string representation of the sequence and its symbols.
+    """
+
+    def __init__(self, length):
+        self._alphabet = Alphabet(range(length))
+        self.code = np.arange(length, dtype=Sequence.dtype(length))
+
+    def get_alphabet(self):
+        return self._alphabet
+
+    def __repr__(self):
+        return f"PurePositionalSequence({len(self)})"
diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py
@@ -40,3 +40,32 @@ def test_matrix_str():
          "b   3   4   5",
          "c   6   7   8"]
     )  # fmt: skip
+
+
+@pytest.mark.parametrize("seed", range(10))
+def test_as_positional(seed):
+    """
+    Check if the score from a positional substitution matrix for symbols from
+    positional sequences is equal to the score from the original matrix for the
+    corresponding symbols from the original sequences.
+    """
+    MAX_SEQ_LENGTH = 10
+
+    np.random.seed(seed)
+    matrix = align.SubstitutionMatrix.std_protein_matrix()
+    sequences = []
+    for _ in range(2):
+        seq_length = np.random.randint(1, MAX_SEQ_LENGTH)
+        sequence = seq.ProteinSequence()
+        sequence.code = np.random.randint(0, len(sequence.alphabet), size=seq_length)
+        sequences.append(sequence)
+    pos_matrix, *pos_sequences = matrix.as_positional(*sequences)
+
+    # Sanity check if the positional sequences do correspond to the original sequences
+    for i in range(2):
+        assert str(pos_sequences[i]) == str(sequences[i])
+    for i in range(len(sequences[0])):
+        for j in range(len(sequences[1])):
+            ref_score = matrix.get_score(sequences[0][i], sequences[1][j])
+            test_score = pos_matrix.get_score(pos_sequences[0][i], pos_sequences[1][j])
+            assert test_score == ref_score
diff --git a/tests/sequence/test_seqtypes.py b/tests/sequence/test_seqtypes.py
@@ -2,6 +2,7 @@
 # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
 # information.
 
+import numpy as np
 import pytest
 import biotite.sequence as seq
 
@@ -90,3 +91,23 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
     protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW")
     mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic)
     assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2)
+
+
+def test_positional_sequence():
+    """
+    Check whether the original sequence symbols can be reconstructed from a
+    :class:`PositionalSequence`.
+    """
+    SEQ_LENGTH = 100
+
+    np.random.seed(0)
+    orig_sequence = seq.ProteinSequence()
+    orig_sequence.code = np.random.randint(
+        0, len(orig_sequence.alphabet), size=SEQ_LENGTH
+    )
+    positional_sequence = seq.PositionalSequence(orig_sequence)
+
+    assert str(positional_sequence) == str(orig_sequence)
+    reconstructed_sequence = positional_sequence.reconstruct()
+    assert np.all(reconstructed_sequence.code == orig_sequence.code)
+    assert reconstructed_sequence.alphabet == orig_sequence.alphabet
diff --git a/tests/test_repr.py b/tests/test_repr.py
@@ -15,7 +15,9 @@
     LetterAlphabet,
     Location,
     NucleotideSequence,
+    PositionalSequence,
     ProteinSequence,
+    PurePositionalSequence,
     SequenceProfile,
 )
 from biotite.sequence.align import Alignment, SubstitutionMatrix
@@ -30,6 +32,8 @@
         NucleotideSequence("AACTGCTA"),
         NucleotideSequence("AACTGCTA", ambiguous=True),
         ProteinSequence("BIQTITE"),
+        PositionalSequence(NucleotideSequence("AACTGCTA")),
+        PurePositionalSequence(123),
         Alphabet(["X", "Y", "Z"]),
         GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]),
         LetterAlphabet(["X", "Y", "Z"]),