Modified to split large deletions when a match of 10 or more bases is found within the identified large deletion. Issue: #42
akikuno · Jun 24, 2024 · 0c97a9b
1 parent c97b829
commit 0c97a9b
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 25 deletions.
diff --git a/docs/RELEASE.md b/docs/RELEASE.md
@@ -23,6 +23,9 @@
 
 + Update GitHub Actions to test with Python 3.11 and 3.12. Issue: #43 [[Commit Detail](https://github.com/akikuno/DAJIN2/commit/54df79e60b484da429c1cbf6f12b0c19196452cc)]
 
+## 🐛 Bug Fixes
+
++ Update `cssplits_handler._get_index_of_large_deletions`: Modified to split large deletions when a match of 10 or more bases is found within the identified large deletion. Issue: #42 [[Commit Detail](https://github.com/akikuno/DAJIN2/commit/xxxxx)]
 
 
 <!-- ############################################################# # -->

diff --git a/src/DAJIN2/utils/cssplits_handler.py b/src/DAJIN2/utils/cssplits_handler.py
@@ -242,21 +242,47 @@ def _extract_break_points_of_large_deletions(
     return break_points
 
 
-def _convert_break_points_to_index(break_points: list[dict[str, int]]) -> set[int]:
-    index_of_large_deletions = set()
+def _convert_break_points_to_index(break_points: list[dict[str, int]]) -> list[int]:
+    index_of_large_deletions = []
     for break_point in break_points:
         start = break_point["start"]
         end = break_point["end"]
-        index_of_large_deletions |= set(range(start, end + 1))
+        index_of_large_deletions += list(range(start, end + 1))
 
     return index_of_large_deletions
 
 
+def _find_matched_indexes(cssplits: list[str], index_of_large_deletions: list[int]) -> list[int]:
+    matched_index = []
+    count_matches = 0
+    start_match = -1
+
+    index_of_large_deletions.sort()
+    for i in index_of_large_deletions:
+        if cssplits[i].startswith("="):
+            if start_match == -1:
+                start_match = i
+            count_matches += 1
+        else:
+            if count_matches >= 10:
+                matched_index += list(range(start_match, i))
+            count_matches = 0
+            start_match = -1
+
+    return matched_index
+
+
+def _remove_matched_indexes(index_of_large_deletions: list[int], matched_index: list[int]) -> set[int]:
+    return set(index_of_large_deletions) - set(matched_index)
+
+
 def _get_index_of_large_deletions(cssplits: list[str], bin_size: int = 500, percentage: int = 50) -> set[int]:
     range_of_large_deletions = _extract_candidate_index_of_large_deletions(cssplits, bin_size, percentage)
     break_points = _extract_break_points_of_large_deletions(cssplits, range_of_large_deletions, bin_size)
 
-    return _convert_break_points_to_index(break_points)
+    index_of_large_deletions = _convert_break_points_to_index(break_points)
+    matched_index = _find_matched_indexes(cssplits, index_of_large_deletions)
+    return _remove_matched_indexes(index_of_large_deletions, matched_index)
 
 
 def _adjust_cs_insertion(cs: str) -> str:

diff --git a/tests/src/utils/test_cssplits_handler.py b/tests/src/utils/test_cssplits_handler.py
@@ -113,6 +113,10 @@ def test_call_sequence(cons_percentage, expected_sequence):
     "cssplits, expected",
     [
         (["=T"] * 100 + ["-A"] * 300 + ["=T"] * 100, set(range(100, 400))),
+        (
+            ["=T"] * 100 + ["-A"] * 300 + ["=T"] * 10 + ["-A"] * 300 + ["=T"] * 100,
+            set(range(100, 400)) | set(range(410, 710)),
+        ),
     ],
 )
 def test_get_index_of_large_deletions(cssplits, expected):
@@ -138,24 +142,51 @@ def test_adjust_cs_insertion(cs: str, expected: str):
     assert cssplits_handler._adjust_cs_insertion(cs) == expected
 
 
-# @pytest.mark.parametrize(
-#     "input_str, expected_output",
-#     [
-#         ("-A,-A,-A,=C,=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+C|+C|=G"),
-#         ("-A,-A,-A,=C,=C,=C,=C,-T,-T,-T", "-A,-A,-A,=C,=C,=C,=C,-T,-T,-T"),
-#         ("-A,-A,-A,N,=C,n,-T,-T,-T,=G", "-A,-A,-A,N,-C,n,-T,-T,-T,+N|+C|+n|=G"),
-#         ("-A,-A,-A,=C,+T|+T|=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+C|+C|=G"),
-#         ("-A,-A,-A,=C,+T|+T|*CG,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+G|+C|=G"),
-#         ("-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC", "-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC"),
-#     ],
-#     ids=[
-#         "insertion within deletion",
-#         "4-character match",
-#         "N and n",
-#         "Insertion",
-#         "Insertion followed by substitution",
-#         "Should not be adjusted",
-#     ],
-# )
-# def test_reallocate_insertion_within_deletion(input_str: str, expected_output: str):
-#     assert reallocate_insertion_within_deletion(input_str, del_range=3, distance=3) == expected_output
+@pytest.mark.parametrize(
+    "cssplits, expected",
+    [
+        (
+            ["=T"] * 100 + ["-A"] * 300 + ["*TA"] * 10 + ["-A"] * 300 + ["=T"] * 100,
+            ["=T"] * 100
+            + ["-A"] * 300
+            + ["-T"] * 10
+            + ["-A"] * 300
+            + ["+A|+A|+A|+A|+A|+A|+A|+A|+A|+A|=T"]
+            + ["=T"] * 99,
+        ),
+        (
+            ["=T"] * 100 + ["-A"] * 150 + ["=T"] * 10 + ["-A"] * 150 + ["=T"] * 100,
+            ["=T"] * 100 + ["-A"] * 150 + ["=T"] * 10 + ["-A"] * 150 + ["=T"] * 100,
+        ),
+        (
+            ["=T"] * 100
+            + ["-A"] * 100
+            + ["*TA"] * 10
+            + ["-A"] * 100
+            + ["=T"] * 10
+            + ["-A"] * 100
+            + ["*TA"] * 10
+            + ["-A"] * 100
+            + ["=T"] * 100,
+            ["=T"] * 100
+            + ["-A"] * 100
+            + ["-T"] * 10
+            + ["-A"] * 100
+            + ["+A|+A|+A|+A|+A|+A|+A|+A|+A|+A|=T"]
+            + ["=T"] * 9
+            + ["-A"] * 100
+            + ["-T"] * 10
+            + ["-A"] * 100
+            + ["+A|+A|+A|+A|+A|+A|+A|+A|+A|+A|=T"]
+            + ["=T"] * 99,
+        ),
+        
+    ],
+    ids=[
+        "insertion within deletion",
+        "matched region within deletion",
+        "insertions within deletion and matched region",
+    ],
+)
+def test_reallocate_insertion_within_deletion(cssplits: str, expected: str):
+    assert cssplits_handler.reallocate_insertion_within_deletion(cssplits) == expected