Skip to content

Commit

Permalink
Modified to split large deletions when a match of 10 or more bases is…
Browse files Browse the repository at this point in the history
… found within the identified large deletion. Issue: #42
  • Loading branch information
akikuno committed Jun 24, 2024
1 parent c97b829 commit 0c97a9b
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 25 deletions.
3 changes: 3 additions & 0 deletions docs/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@

+ Update GitHub Actions to test with Python 3.11 and 3.12. Issue: #43 [[Commit Detail](https://github.com/akikuno/DAJIN2/commit/54df79e60b484da429c1cbf6f12b0c19196452cc)]

## 🐛 Bug Fixes

+ Update `cssplits_handler._get_index_of_large_deletions`: A large deletion is now split when a run of 10 or more consecutive matched bases is found within the identified large deletion. Issue: #42 [[Commit Detail](https://github.com/akikuno/DAJIN2/commit/0c97a9b)]


<!-- ############################################################# # -->
Expand Down
34 changes: 30 additions & 4 deletions src/DAJIN2/utils/cssplits_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,21 +242,47 @@ def _extract_break_points_of_large_deletions(
return break_points


def _convert_break_points_to_index(break_points: list[dict[str, int]]) -> set[int]:
index_of_large_deletions = set()
def _convert_break_points_to_index(break_points: list[dict[str, int]]) -> list[int]:
index_of_large_deletions = []
for break_point in break_points:
start = break_point["start"]
end = break_point["end"]
index_of_large_deletions |= set(range(start, end + 1))
index_of_large_deletions += list(range(start, end + 1))

return index_of_large_deletions


def _find_matched_indexes(cssplits: list[str], index_of_large_deletions: list[int]) -> list[int]:
matched_index = []
count_matches = 0
start_match = -1

index_of_large_deletions.sort()
for i in index_of_large_deletions:
if cssplits[i].startswith("="):
if start_match == -1:
start_match = i
count_matches += 1
else:
if count_matches >= 10:
matched_index += list(range(start_match, i))
count_matches = 0
start_match = -1

return matched_index


def _remove_matched_indexes(index_of_large_deletions: list[int], matched_index: list[int]) -> set[int]:
return set(index_of_large_deletions) - set(matched_index)


def _get_index_of_large_deletions(cssplits: list[str], bin_size: int = 500, percentage: int = 50) -> set[int]:
    """Return the positions that belong to large deletions in ``cssplits``.

    Candidate ranges are extracted, refined into break points, and expanded
    into positions. Positions lying in a run of 10 or more matches inside an
    identified large deletion are then removed, which splits that deletion.

    Args:
        cssplits: Tokens indexed by position.
        bin_size: Forwarded to the candidate and break-point extraction helpers.
        percentage: Forwarded to the candidate extraction helper.

    Returns:
        The set of positions classified as large deletions.
    """
    range_of_large_deletions = _extract_candidate_index_of_large_deletions(cssplits, bin_size, percentage)
    break_points = _extract_break_points_of_large_deletions(cssplits, range_of_large_deletions, bin_size)

    index_of_large_deletions = _convert_break_points_to_index(break_points)
    # Drop long match runs so they are not reported as part of a deletion.
    matched_index = _find_matched_indexes(cssplits, index_of_large_deletions)
    return _remove_matched_indexes(index_of_large_deletions, matched_index)


def _adjust_cs_insertion(cs: str) -> str:
Expand Down
73 changes: 52 additions & 21 deletions tests/src/utils/test_cssplits_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def test_call_sequence(cons_percentage, expected_sequence):
"cssplits, expected",
[
(["=T"] * 100 + ["-A"] * 300 + ["=T"] * 100, set(range(100, 400))),
(
["=T"] * 100 + ["-A"] * 300 + ["=T"] * 10 + ["-A"] * 300 + ["=T"] * 100,
set(range(100, 400)) | set(range(410, 710)),
),
],
)
def test_get_index_of_large_deletions(cssplits, expected):
Expand All @@ -138,24 +142,51 @@ def test_adjust_cs_insertion(cs: str, expected: str):
assert cssplits_handler._adjust_cs_insertion(cs) == expected


# @pytest.mark.parametrize(
# "input_str, expected_output",
# [
# ("-A,-A,-A,=C,=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+C|+C|=G"),
# ("-A,-A,-A,=C,=C,=C,=C,-T,-T,-T", "-A,-A,-A,=C,=C,=C,=C,-T,-T,-T"),
# ("-A,-A,-A,N,=C,n,-T,-T,-T,=G", "-A,-A,-A,N,-C,n,-T,-T,-T,+N|+C|+n|=G"),
# ("-A,-A,-A,=C,+T|+T|=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+C|+C|=G"),
# ("-A,-A,-A,=C,+T|+T|*CG,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+G|+C|=G"),
# ("-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC", "-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC"),
# ],
# ids=[
# "insertion within deletion",
# "4-character match",
# "N and n",
# "Insertion",
# "Insertion followed by substitution",
# "Should not be adjusted",
# ],
# )
# def test_reallocate_insertion_within_deletion(input_str: str, expected_output: str):
# assert reallocate_insertion_within_deletion(input_str, del_range=3, distance=3) == expected_output
@pytest.mark.parametrize(
    "cssplits, expected",
    [
        (
            ["=T"] * 100 + ["-A"] * 300 + ["*TA"] * 10 + ["-A"] * 300 + ["=T"] * 100,
            ["=T"] * 100
            + ["-A"] * 300
            + ["-T"] * 10
            + ["-A"] * 300
            + ["+A|+A|+A|+A|+A|+A|+A|+A|+A|+A|=T"]
            + ["=T"] * 99,
        ),
        (
            ["=T"] * 100 + ["-A"] * 150 + ["=T"] * 10 + ["-A"] * 150 + ["=T"] * 100,
            ["=T"] * 100 + ["-A"] * 150 + ["=T"] * 10 + ["-A"] * 150 + ["=T"] * 100,
        ),
        (
            ["=T"] * 100
            + ["-A"] * 100
            + ["*TA"] * 10
            + ["-A"] * 100
            + ["=T"] * 10
            + ["-A"] * 100
            + ["*TA"] * 10
            + ["-A"] * 100
            + ["=T"] * 100,
            ["=T"] * 100
            + ["-A"] * 100
            + ["-T"] * 10
            + ["-A"] * 100
            + ["+A|+A|+A|+A|+A|+A|+A|+A|+A|+A|=T"]
            + ["=T"] * 9
            + ["-A"] * 100
            + ["-T"] * 10
            + ["-A"] * 100
            + ["+A|+A|+A|+A|+A|+A|+A|+A|+A|+A|=T"]
            + ["=T"] * 99,
        ),
    ],
    ids=[
        "insertion within deletion",
        "matched region within deletion",
        "insertions within deletion and matched region",
    ],
)
def test_reallocate_insertion_within_deletion(cssplits: list[str], expected: list[str]):
    """Check ``reallocate_insertion_within_deletion`` on the three parametrized cases.

    Both ``cssplits`` and ``expected`` are lists of tokens, one per position.
    """
    assert cssplits_handler.reallocate_insertion_within_deletion(cssplits) == expected

0 comments on commit 0c97a9b

Please sign in to comment.