diff --git a/doc/tutorial/sequence/index.rst b/doc/tutorial/sequence/index.rst index ab1a96b83..c879c3e04 100644 --- a/doc/tutorial/sequence/index.rst +++ b/doc/tutorial/sequence/index.rst @@ -43,5 +43,5 @@ For an unambiguous DNA sequence, the alphabet comprises the four nucleobases. align_optimal align_heuristic align_multiple - annotations - profiles \ No newline at end of file + profiles + annotations \ No newline at end of file diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst new file mode 100644 index 000000000..8554b4588 --- /dev/null +++ b/doc/tutorial/sequence/profiles.rst @@ -0,0 +1,134 @@ +.. include:: /tutorial/preamble.rst + +Sequence profiles and position-specific scoring matrices +======================================================== + +Often sequences are not viewed in isolation: +For example, if you investigate a protein family, you do not handle a single sequence, +but an arbitrarily large collection of highly similar sequences. +At some point handling those sequences as a mere collection of individual sequences +becomes impractical for answering common questions, like which are the prevalent +amino acids at a certain positions of the sequences. + +Sequence profiles +----------------- + +.. currentmodule:: biotite.sequence + +This is where *sequence profiles* come into play: +The condense the a collection of aligned sequences into a matrix that tracks the +frequency of each symbol at each position of the alignment. +Hence, asking the questions such as '*How frequent is an alanine at the n-th position?*' +becomes a trivial indexing operation. + +As example for profiles, we will reuse the cyclotide sequence family from the +:doc:`previous chapter `. + +.. jupyter-execute:: + + from tempfile import NamedTemporaryFile + import biotite.sequence.io.fasta as fasta + import biotite.database.entrez as entrez + + query = ( + entrez.SimpleQuery("Cyclotide") & + entrez.SimpleQuery("cter") & + entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ + entrez.SimpleQuery("Precursor") + ) + uids = entrez.search(query, "protein") + temp_file = NamedTemporaryFile(suffix=".fasta", delete=False) + fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, temp_file.name, "protein", "fasta") + ) + sequences = { + # The cyclotide variant is the last character in the header + header[-1]: seq for header, seq in fasta.get_sequences(fasta_file).items() + } + # Extract cyclotide N as query sequence for later + query = sequences.pop("N") + +To create a profile, we first need to align the sequences, so corresponding symbols +appear the same position. +Then we can create a :class:`SequenceProfile` object from the :class:`Alignment`, which +simply counts for each alignment column (i.e. the sequence position) the number of +occurrences for each symbol. + +.. jupyter-execute:: + + import biotite.sequence as seq + import biotite.sequence.align as align + + alignment, _, _, _ = seq.align.align_multiple( + list(sequences.values()), + align.SubstitutionMatrix.std_protein_matrix(), + gap_penalty=-5, + ) + profile = seq.SequenceProfile.from_alignment(alignment) + count_matrix = profile.symbols + print(profile) + +Each row in the displayed count matrix +(accessible via :attr:`SequenceProfile.symbols`) refers to a single position, i.e. a +column in the input MSA, and each column refers to a symbol in the underlying alphabet +(accessible via :attr:`SequenceProfile.alphabet`). +For completeness it should be noted that :attr:`SequenceProfile.gaps` also tracks the +gaps for each position in the alignment, but we will not further use this in this +tutorial. + +.. jupyter-execute:: + + print(profile.to_consensus()) + +Note that the information about the individual sequences is lost in the condensation +process: There is no way to reconstruct the original sequences from the profile. + +Position-specific scoring matrices +---------------------------------- + +.. currentmodule:: biotite.sequence.align + +Sequence profiles can achieve even more: +The substitution matrices we have seen so far assign a score to a pair of two symbols, +regardless of their position in a sequence. +However, as discussed above, the position is crucial information to determine how +conserved a certain symbol is in a family of sequences. + +Hence, we extend the concept of substitution matrices to *position-specific* matrices, +which assign a score to a symbol and a position (instead of another symbols). + +.. jupyter-execute:: + + # For a real analysis each amino acid background frequencies should be provided + log_odds = profile.log_odds_matrix(pseudocount=1) + +Now, we encounter a problem: +To create a :class:`SubstitutionMatrix` object from the log-odds matrix, we require two +:class:`Alphabet` objects: +One is taken from the query sequence to be aligned to the profile, but which alphabet +do we use for the positional axis of the matrix? +Likewise, the alignment functions (e.g. :func:`align_optimal()`) require a two sequences +to be aligned, but we only have one query sequence. + +.. currentmodule:: biotite.sequence + +To solve this problem :mod:`biotite.sequence` provides the :class:`PositionalSequence` +which acts as a placeholder for the second sequence in the alignment. +Its alphabet contains a unique symbol for each position, i.e. the alphabet has the +sought length. + +.. jupyter-execute:: + + positional_seq = seq.PositionalSequence(profile.to_consensus()) + matrix = align.SubstitutionMatrix( + positional_seq.alphabet, + profile.alphabet, + log_odds + ) + alignment = align.align_optimal( + positional_seq, query, matrix, gap_penalty=-5, max_number=1 + )[0] + print(alignment) + +More on positional sequences +---------------------------- \ No newline at end of file