Skip to content

Commit

Permalink
Bug fix of reallocate_insertion_within_deletion:
Browse files Browse the repository at this point in the history
- In the script that considers the region between two deletions as an insertion sequence, the size of the other deletion was not taken into account. Even if there was a single base deletion, the entire sequence between the deletions was considered as an insertion sequence. Therefore, the region between two deletions is now defined only if the size of both deletions is equal to or greater than the specified threshold (default = 3).
  • Loading branch information
akikuno committed May 17, 2024
1 parent 2e141bf commit 7cafabf
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 87 deletions.
116 changes: 71 additions & 45 deletions src/DAJIN2/utils/cssplits_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,35 +160,56 @@ def call_sequence(cons_percentage: list[dict[str, float]], sep: str = "") -> str
###########################################################


def is_start_of_deletion(i: int, cssplits: list[str], start_del_count: int = 3) -> bool:
"""
Determine if the current index is the start of a deletion.
If there are consecutive deletions from the beginning, it is determined that there is a cluster of deletions.
"""
if i + start_del_count > len(cssplits):
def is_start_of_consecutive_deletions(cssplits: list[str], i: int, del_range: int = 3) -> bool:
"""Determine if the current index is the start of a consective deletion."""
if i + del_range > len(cssplits):
return False
return all([cs.startswith("-") for cs in cssplits[i : i + start_del_count]])

for j in range(del_range):
if not cssplits[i + j].startswith("-"):
return False

return True


def get_range_of_inclusive_deletions(cssplits: list[str], del_range: int = 3, distance: int = 10) -> set[int]:
inclusive_deletion_range: set[int] = set()
n: int = len(cssplits)
i: int = 0
while i < n:
# Check for start of deletion
if cssplits[i].startswith("-"):
start: int = i
while i < n and cssplits[i].startswith("-"):
i += 1
# We have found the end of a deletion range
end: int = i
# Check if deletion length is enough to be considered

if end - start >= del_range:
# Now check for matching sequence within 'distance'
match_count: int = 0
while i < n and match_count <= distance:
if cssplits[i].startswith("="):
match_count += 1
elif cssplits[i].startswith("-"):
# If another deletion is found, reset match count and update range
next_start: int = i
while i < n and cssplits[i].startswith("-"):
i += 1
next_end: int = i
if next_end - next_start >= del_range:
inclusive_deletion_range.update(range(start, next_end))

start = next_start
end = next_end
match_count = 0
break
i += 1

def is_within_deletion(i: int, cssplits: list[str], distance: int = 10) -> bool:
"""Count matches that are continuous for `distance` bases, and if there is a deletion in between, it is determined to be inside a cluster of deletions."""
exist_deletion = False
total_num_match = 0
j = 1
while True:
if i + j >= len(cssplits) or total_num_match > distance:
exist_deletion = False
break
current = cssplits[i + j]
if current.startswith("="):
total_num_match += 1
else:
total_num_match = 0
if current.startswith("-"):
exist_deletion = True
break
j += 1
return exist_deletion
i += 1

return inclusive_deletion_range


def adjust_cs_insertion(cs: str) -> str:
Expand All @@ -203,30 +224,35 @@ def adjust_cs_insertion(cs: str) -> str:
return cs_insertion


def detect_insertion_within_deletion(cssplits: str, start_del_count: int = 3, distance: int = 10) -> str:
cssplits = cssplits.split(",")
def reallocate_insertion_within_deletion(cssplit: str, del_range: int = 3, distance: int = 10) -> str:
"""
Since the mapping in minimap2 is local alignment, insertion bases within large deletions may be partially mapped to the reference genome and not detected as insertion bases. Therefore, update cssplits to detect insertions within large deletions as insertions.
"""
cssplits = cssplit.split(",")
cssplits_updated = cssplits.copy()

insertion = []
within_deletion = False
range_of_inclusive_deletion: set[int] = get_range_of_inclusive_deletions(cssplits, del_range, distance)

insertion_within_deletion = []
for i, cs in enumerate(cssplits):
if cs.startswith("-"):
if within_deletion is False and is_start_of_deletion(i, cssplits, start_del_count) is False:
if i in range_of_inclusive_deletion:
if cs.startswith("-"):
continue
within_deletion = is_within_deletion(i, cssplits, distance)
if within_deletion is False:
cssplits_updated[i + 1] = "".join(insertion) + cssplits_updated[i + 1]
insertion = []
if within_deletion is False:
continue
if not cs.startswith("-"):
if cs.startswith("*"):
cssplits_updated[i] = f"-{cs[1]}"
if cs.startswith("N") or cs.startswith("n"):
cs_new = cs
elif cs.startswith("*"):
cs_new = f"-{cs[1]}"
elif cs.startswith("+") and cs.split("|")[-1].startswith("*"):
cssplits_updated[i] = f'-{cs.split("|")[-1][1]}'
cs_new = f'-{cs.split("|")[-1][1]}'
else:
cssplits_updated[i] = f"-{cs[-1]}"
adjusted_cs = adjust_cs_insertion(cs)
insertion.append(adjusted_cs)
cs_new = f"-{cs[-1]}"
cssplits_updated[i] = cs_new

insertion_within_deletion.append(adjust_cs_insertion(cs))
else:
if not insertion_within_deletion:
continue
cssplits_updated[i] = "".join(insertion_within_deletion) + cs
insertion_within_deletion = []

return ",".join(cssplits_updated)
105 changes: 63 additions & 42 deletions tests/src/utils/test_cssplits_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
concatenate_cssplits,
standardize_case,
call_sequence,
is_start_of_deletion,
is_within_deletion,
is_start_of_consecutive_deletions,
# is_within_deletion,
get_range_of_inclusive_deletions,
adjust_cs_insertion,
detect_insertion_within_deletion,
reallocate_insertion_within_deletion,
)


Expand Down Expand Up @@ -128,45 +129,68 @@ def test_call_sequence(cons_percentage_by_key, expected_sequence):


###########################################################
# detect_insertion_within_deletion
# get_range_of_inclusive_deletions
###########################################################


@pytest.mark.parametrize(
"i, cssplits, start_del_count, expected",
"cssplits, expected",
[
(0, ["-A", "-A", "-A"], 3, True),
(0, ["-A", "-A"], 3, False),
(1, ["-A", "-A", "-A"], 3, False),
(["-A", "-A", "-A", "=T", "-A", "-A", "-A"], {0, 1, 2, 3, 4, 5, 6}),
(
["-A", "-A", "-A", "=T", "-A", "-A", "-A", "=G", "-A", "-A", "-A", "=T", "-A", "-A", "-A", "=G"],
{0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14},
),
(
["-A", "-A", "-A", "=T", "-A", "=T", "-A", "-A", "-A", "=G", "-G", "-G", "-G", "=G"],
{0, 1, 2, 3, 4, 5, 6, 7, 8},
),
],
)
def test_is_start_of_deletion(i: int, cssplits: list[str], start_del_count: int, expected: bool):
assert is_start_of_deletion(i, cssplits, start_del_count) == expected
def test_get_range_of_inclusive_deletions(cssplits, expected):
assert get_range_of_inclusive_deletions(cssplits) == expected


###########################################################
# reallocate_insertion_within_deletion
###########################################################


@pytest.mark.parametrize(
"index, cssplits, expected",
"cssplits, i, start_del_count, expected",
[
(0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], True),
(11, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], False),
(0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], False),
(0, ["-A", "=C", "=C", "=C", "=C", "=C", "*GC", "=C", "=C", "=C", "=C", "-T", "=G"], True),
(0, ["-A", "=C", "=C", "=C", "=C", "=C", "-T", "=C", "=C", "=C", "=C", "-T", "=G"], True),
(0, ["-A", "=C", "=C", "=C", "=C", "=C", "+A|=C", "=C", "=C", "=C", "=C", "-T", "=G"], True),
(0, ["-A", "=C", "=C", "-T", "=G"], True),
],
ids=[
"-A has a 10-character match and is within the deletion cluster.",
"-T is outside the deletion cluster.",
"-A has a 11-character match and is outside the deletion cluster.",
"-A is determined to be reset and within the deletion cluster due to a point mutation at the 6th character.",
"-A is determined to be reset and within the deletion cluster due to a deletion at the 6th character.",
"-A is determined to be reset and within the deletion cluster due to an insertion at the 6th character.",
"-A is within the deletion cluster.",
(0, ["-A", "-A", "-A"], 3, True),
(0, ["-A", "-A"], 3, False),
(1, ["-A", "-A", "-A"], 3, False),
],
)
def test_is_within_deletion(index: int, cssplits: list[str], expected: bool):
assert is_within_deletion(index, cssplits) == expected
def test_is_start_of_consecutive_deletions(cssplits: list[str], i: int, start_del_count: int, expected: bool):
assert is_start_of_consecutive_deletions(i, cssplits, start_del_count) == expected


# @pytest.mark.parametrize(
# "index, cssplits, del_range, distance, expected",
# [
# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True),
# (11, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, False),
# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, False),
# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "*GC", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True),
# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "-T", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True),
# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "+A|=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True),
# (0, ["-A", "-A", "=C", "-T", "-A"], 2, 10, True),
# ],
# ids=[
# "-A has a 10-character match and is within the deletion cluster.",
# "-T is outside the deletion cluster.",
# "-A has a 11-character match and is outside the deletion cluster.",
# "-A is determined to be reset and within the deletion cluster due to a point mutation at the 6th character.",
# "-A is determined to be reset and within the deletion cluster due to a deletion at the 6th character.",
# "-A is determined to be reset and within the deletion cluster due to an insertion at the 6th character.",
# "-A is within the deletion cluster.",
# ],
# )
# def test_is_within_deletion(index: int, cssplits: list[str], del_range: int, distance: int, expected: bool):
# assert is_within_deletion(index, cssplits, del_range, distance) == expected


@pytest.mark.parametrize(
Expand All @@ -186,24 +210,21 @@ def test_adjust_cs_insertion(cs: str, expected: str):
@pytest.mark.parametrize(
"input_str, expected_output",
[
("=A,-A,-A,-A,=C,=C,=C,-T,=G", "=A,-A,-A,-A,-C,-C,-C,-T,+C|+C|+C|=G"),
(
"-A,-A,-A,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,-T,=G",
"-A,-A,-A,-C,-C,-C,-C,-C,-C,-C,-C,-C,-C,-T,+C|+C|+C|+C|+C|+C|+C|+C|+C|+C|=G",
),
("-A,-A,-A,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,-T,=G", "-A,-A,-A,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,-T,=G"),
("=A,-A,-A,-A,N,=C,n,-T,=G", "=A,-A,-A,-A,-N,-C,-n,-T,+N|+C|+n|=G"),
("=A,-A,-A,-A,=C,+T|+T|=C,=C,-T,=G", "=A,-A,-A,-A,-C,-C,-C,-T,+C|+T|+T|+C|+C|=G"),
("=A,-A,-A,-A,=C,+T|+T|*CG,=C,-T,=G", "=A,-A,-A,-A,-C,-C,-C,-T,+C|+T|+T|+G|+C|=G"),
("-A,-A,-A,=C,=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+C|+C|=G"),
("-A,-A,-A,=C,=C,=C,=C,-T,-T,-T", "-A,-A,-A,=C,=C,=C,=C,-T,-T,-T"),
("-A,-A,-A,N,=C,n,-T,-T,-T,=G", "-A,-A,-A,N,-C,n,-T,-T,-T,+N|+C|+n|=G"),
("-A,-A,-A,=C,+T|+T|=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+C|+C|=G"),
("-A,-A,-A,=C,+T|+T|*CG,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+G|+C|=G"),
("-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC", "-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC"),
],
ids=[
"insertion within deletion",
"10-character match",
"11-character match",
"4-character match",
"N and n",
"Insertion",
"Insertion followed by substitution",
"Should not be adjusted",
],
)
def test_detect_insertion_within_deletion(input_str: str, expected_output: str):
assert detect_insertion_within_deletion(input_str) == expected_output
def test_reallocate_insertion_within_deletion(input_str: str, expected_output: str):
assert reallocate_insertion_within_deletion(input_str, del_range=3, distance=3) == expected_output

0 comments on commit 7cafabf

Please sign in to comment.