Skip to content

Commit

Permalink
Merge pull request #563 from padix-key/cigar
Browse files Browse the repository at this point in the history
Add `include_terminal_gaps` parameter
  • Loading branch information
padix-key authored May 20, 2024
2 parents a12f3ce + d0c20e6 commit 2e053cb
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 25 deletions.
75 changes: 60 additions & 15 deletions src/biotite/sequence/align/cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ def read_alignment_from_cigar(cigar, position,
AAAAGGTTTCCGACCGTAGGTAG
CCCCGGTTT--GACCGTATGTAG
Explicit terminal deletions are also possible.
Note that in this case the deleted positions count as aligned bases
with respect to the `position` parameter.
>>> print(read_alignment_from_cigar("3D9M2D12M4D", 0, ref, seg))
TATAAAAGGTTTCCGACCGTAGGTAGCTGA
---CCCCGGTTT--GACCGTATGTAG----
If bases in the segment sequence are soft-clipped, they do not
appear in the alignment.
Furthermore, the start of the reference sequence must be adapted.
Expand All @@ -122,7 +130,7 @@ def read_alignment_from_cigar(cigar, position,
GGTTTCCGACCGTAGGTAG
GGTTT--GACCGTATGTAG
Reading from BAM codes is also possible:
Reading from BAM codes is also possible.
>>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
>>> op_tuples = [
Expand Down Expand Up @@ -190,7 +198,8 @@ def read_alignment_from_cigar(cigar, position,

def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
introns=(), distinguish_matches=False,
hard_clip=False, as_string=True):
hard_clip=False, include_terminal_gaps=False,
as_string=True):
"""
Convert an :class:`Alignment` into a CIGAR string.
Expand Down Expand Up @@ -220,6 +229,13 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
hard_clip : bool, optional
If true, clipped bases are hard-clipped.
Otherwise, clipped bases are soft-clipped.
include_terminal_gaps : bool, optional
If true, terminal gaps in the segment sequence are included in
the CIGAR string.
These are represented by ``D`` operations at the start and/or
end of the string.
By default, those terminal gaps are omitted in the CIGAR, which
is the way SAM/BAM expects a CIGAR to be.
as_string : bool, optional
If true, the CIGAR string is returned.
Otherwise, a list of tuples is returned, where the first element
Expand All @@ -238,6 +254,12 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
--------
read_alignment_from_cigar
Notes
-----
If `include_terminal_gaps` is set to true, you usually want to set
``position=0`` in :func:`read_alignment_from_cigar` to get the
correct alignment.
Examples
--------
Expand All @@ -256,6 +278,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
9M2N12M
>>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
4X5=2D7=1X4=
>>> print(write_alignment_to_cigar(semiglobal_alignment, include_terminal_gaps=True))
3D9M2D12M4D
>>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
>>> print(local_alignment)
GGTTTCCGACCGTAGGTAG
Expand All @@ -274,9 +298,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
CigarOp.DELETION 2
CigarOp.MATCH 12
"""
# Ignore terminal gaps in segment sequence
no_gap_pos = np.where(alignment.trace[:, segment_index] != -1)[0]
alignment = alignment[no_gap_pos[0] : no_gap_pos[-1] + 1]
if not include_terminal_gaps:
alignment = _remove_terminal_segment_gaps(alignment, segment_index)

ref_trace = alignment.trace[:, reference_index]
seg_trace = alignment.trace[:, segment_index]
Expand Down Expand Up @@ -321,19 +344,13 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
op_tuples = _aggregate_consecutive(operations)

clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
# Missing bases at the beginning and end of the segment are
# interpreted as clipped
# As first element in the segment trace is the first aligned base,
# all previous bases are clipped...
start_clip_length = seg_trace[0]
start_clip_length, end_clip_length = _find_clipped_bases(
alignment, segment_index
)
if start_clip_length != 0:
start_clip = [(clip_op, seg_trace[0])]
start_clip = [(clip_op, start_clip_length)]
else:
start_clip = np.zeros((0, 2), dtype=int)
# ...and the same applies for the last base
end_clip_length = (
len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
)
if end_clip_length != 0:
end_clip = [(clip_op, end_clip_length)]
else:
Expand All @@ -347,6 +364,34 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
return op_tuples


def _remove_terminal_segment_gaps(alignment, segment_index):
"""
Remove terminal gaps in the segment sequence.
"""
no_gap_pos = np.where(alignment.trace[:, segment_index] != -1)[0]
return alignment[no_gap_pos[0] : no_gap_pos[-1] + 1]


def _find_clipped_bases(alignment, segment_index):
"""
Find the number of clipped bases at the start and end of the segment.
"""
# Finding the clipped part is easier, when the terminal segment gaps
# are removed (if not already done)
alignment = _remove_terminal_segment_gaps(alignment, segment_index)
seg_trace = alignment.trace[:, segment_index]
# Missing bases at the beginning and end of the segment are
# interpreted as clipped
# As first element in the segment trace is the first aligned base,
# all previous bases are clipped...
start_clip_length = seg_trace[0]
# ...and the same applies for the last base
end_clip_length = (
len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
)
return start_clip_length, end_clip_length


def _aggregate_consecutive(operations):
"""
Aggregate consecutive operations of the same type.
Expand Down
20 changes: 10 additions & 10 deletions tests/sequence/align/test_cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,16 @@ def test_cigar_conversion(cigar):


@pytest.mark.parametrize(
"seed, local, distinguish_matches",
"seed, local, distinguish_matches, include_terminal_gaps",
itertools.product(
range(20),
[False, True],
[False, True],
[False, True],
)
)
def test_alignment_conversion(seed, local, distinguish_matches):
def test_alignment_conversion(seed, local, distinguish_matches,
include_terminal_gaps):
"""
Check whether an :class:`Alignment` converted into a CIGAR string
and back again into an :class:`Alignment` gives the same result.
Expand All @@ -130,15 +132,18 @@ def test_alignment_conversion(seed, local, distinguish_matches):
ref_ali = align.align_optimal(
ref, seg, matrix, terminal_penalty=False, max_number=1
)[0]
# Turn into a semi-global alignment
ref_ali = align.remove_terminal_gaps(ref_ali)
if not include_terminal_gaps:
# Turn into a semi-global alignment
ref_ali = align.remove_terminal_gaps(ref_ali)
# Remove score as the compared reconstructed alignment does not
# contain it either
ref_ali.score = None
start_position = ref_ali.trace[0,0]

cigar = align.write_alignment_to_cigar(
ref_ali, distinguish_matches=distinguish_matches
ref_ali,
distinguish_matches=distinguish_matches,
include_terminal_gaps=include_terminal_gaps
)

test_ali = align.read_alignment_from_cigar(
Expand All @@ -147,13 +152,8 @@ def test_alignment_conversion(seed, local, distinguish_matches):

print(cigar)
print("\n\n")
print(seg)
print("\n\n")
print(ref_ali)
print("\n\n")
print(test_ali)
print("\n\n")
print(start_position)
print(ref_ali.trace[:5])
print(test_ali.trace[:5])
assert test_ali == ref_ali

0 comments on commit 2e053cb

Please sign in to comment.