From 7cafabf3a89b75af73f1a40da16e6390d47b48d5 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 17 May 2024 16:01:26 +0900 Subject: [PATCH] Bug fix of `reallocate_insertion_within_deletion`: - In the script that considers the region between two deletions as an insertion sequence, the size of the other deletion was not taken into account. Even if there was a single base deletion, the entire sequence between the deletions was considered as an insertion sequence. Therefore, the region between two deletions is now defined only if the size of both deletions is equal to or greater than the specified threshold (default = 3). --- src/DAJIN2/utils/cssplits_handler.py | 116 ++++++++++++++--------- tests/src/utils/test_cssplits_handler.py | 105 ++++++++++++-------- 2 files changed, 134 insertions(+), 87 deletions(-) diff --git a/src/DAJIN2/utils/cssplits_handler.py b/src/DAJIN2/utils/cssplits_handler.py index 3e53b511..c66795de 100644 --- a/src/DAJIN2/utils/cssplits_handler.py +++ b/src/DAJIN2/utils/cssplits_handler.py @@ -160,35 +160,56 @@ def call_sequence(cons_percentage: list[dict[str, float]], sep: str = "") -> str ########################################################### -def is_start_of_deletion(i: int, cssplits: list[str], start_del_count: int = 3) -> bool: - """ - Determine if the current index is the start of a deletion. - If there are consecutive deletions from the beginning, it is determined that there is a cluster of deletions. - """ - if i + start_del_count > len(cssplits): +def is_start_of_consecutive_deletions(cssplits: list[str], i: int, del_range: int = 3) -> bool: + """Determine if the current index is the start of a consective deletion.""" + if i + del_range > len(cssplits): return False - return all([cs.startswith("-") for cs in cssplits[i : i + start_del_count]]) + for j in range(del_range): + if not cssplits[i + j].startswith("-"): + return False + + return True + + +def get_range_of_inclusive_deletions(cssplits: list[str], del_range: int = 3, distance: int = 10) -> set[int]: + inclusive_deletion_range: set[int] = set() + n: int = len(cssplits) + i: int = 0 + while i < n: + # Check for start of deletion + if cssplits[i].startswith("-"): + start: int = i + while i < n and cssplits[i].startswith("-"): + i += 1 + # We have found the end of a deletion range + end: int = i + # Check if deletion length is enough to be considered + + if end - start >= del_range: + # Now check for matching sequence within 'distance' + match_count: int = 0 + while i < n and match_count <= distance: + if cssplits[i].startswith("="): + match_count += 1 + elif cssplits[i].startswith("-"): + # If another deletion is found, reset match count and update range + next_start: int = i + while i < n and cssplits[i].startswith("-"): + i += 1 + next_end: int = i + if next_end - next_start >= del_range: + inclusive_deletion_range.update(range(start, next_end)) + + start = next_start + end = next_end + match_count = 0 + break + i += 1 -def is_within_deletion(i: int, cssplits: list[str], distance: int = 10) -> bool: - """Count matches that are continuous for `distance` bases, and if there is a deletion in between, it is determined to be inside a cluster of deletions.""" - exist_deletion = False - total_num_match = 0 - j = 1 - while True: - if i + j >= len(cssplits) or total_num_match > distance: - exist_deletion = False - break - current = cssplits[i + j] - if current.startswith("="): - total_num_match += 1 - else: - total_num_match = 0 - if current.startswith("-"): - exist_deletion = True - break - j += 1 - return exist_deletion + i += 1 + + return inclusive_deletion_range def adjust_cs_insertion(cs: str) -> str: @@ -203,30 +224,35 @@ def adjust_cs_insertion(cs: str) -> str: return cs_insertion -def detect_insertion_within_deletion(cssplits: str, start_del_count: int = 3, distance: int = 10) -> str: - cssplits = cssplits.split(",") +def reallocate_insertion_within_deletion(cssplit: str, del_range: int = 3, distance: int = 10) -> str: + """ + Since the mapping in minimap2 is local alignment, insertion bases within large deletions may be partially mapped to the reference genome and not detected as insertion bases. Therefore, update cssplits to detect insertions within large deletions as insertions. + """ + cssplits = cssplit.split(",") cssplits_updated = cssplits.copy() - insertion = [] - within_deletion = False + range_of_inclusive_deletion: set[int] = get_range_of_inclusive_deletions(cssplits, del_range, distance) + + insertion_within_deletion = [] for i, cs in enumerate(cssplits): - if cs.startswith("-"): - if within_deletion is False and is_start_of_deletion(i, cssplits, start_del_count) is False: + if i in range_of_inclusive_deletion: + if cs.startswith("-"): continue - within_deletion = is_within_deletion(i, cssplits, distance) - if within_deletion is False: - cssplits_updated[i + 1] = "".join(insertion) + cssplits_updated[i + 1] - insertion = [] - if within_deletion is False: - continue - if not cs.startswith("-"): - if cs.startswith("*"): - cssplits_updated[i] = f"-{cs[1]}" + if cs.startswith("N") or cs.startswith("n"): + cs_new = cs + elif cs.startswith("*"): + cs_new = f"-{cs[1]}" elif cs.startswith("+") and cs.split("|")[-1].startswith("*"): - cssplits_updated[i] = f'-{cs.split("|")[-1][1]}' + cs_new = f'-{cs.split("|")[-1][1]}' else: - cssplits_updated[i] = f"-{cs[-1]}" - adjusted_cs = adjust_cs_insertion(cs) - insertion.append(adjusted_cs) + cs_new = f"-{cs[-1]}" + cssplits_updated[i] = cs_new + + insertion_within_deletion.append(adjust_cs_insertion(cs)) + else: + if not insertion_within_deletion: + continue + cssplits_updated[i] = "".join(insertion_within_deletion) + cs + insertion_within_deletion = [] return ",".join(cssplits_updated) diff --git a/tests/src/utils/test_cssplits_handler.py b/tests/src/utils/test_cssplits_handler.py index 5ff5ecbb..c458264c 100644 --- a/tests/src/utils/test_cssplits_handler.py +++ b/tests/src/utils/test_cssplits_handler.py @@ -7,10 +7,11 @@ concatenate_cssplits, standardize_case, call_sequence, - is_start_of_deletion, - is_within_deletion, + is_start_of_consecutive_deletions, + # is_within_deletion, + get_range_of_inclusive_deletions, adjust_cs_insertion, - detect_insertion_within_deletion, + reallocate_insertion_within_deletion, ) @@ -128,45 +129,68 @@ def test_call_sequence(cons_percentage_by_key, expected_sequence): ########################################################### -# detect_insertion_within_deletion +# get_range_of_inclusive_deletions ########################################################### @pytest.mark.parametrize( - "i, cssplits, start_del_count, expected", + "cssplits, expected", [ - (0, ["-A", "-A", "-A"], 3, True), - (0, ["-A", "-A"], 3, False), - (1, ["-A", "-A", "-A"], 3, False), + (["-A", "-A", "-A", "=T", "-A", "-A", "-A"], {0, 1, 2, 3, 4, 5, 6}), + ( + ["-A", "-A", "-A", "=T", "-A", "-A", "-A", "=G", "-A", "-A", "-A", "=T", "-A", "-A", "-A", "=G"], + {0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14}, + ), + ( + ["-A", "-A", "-A", "=T", "-A", "=T", "-A", "-A", "-A", "=G", "-G", "-G", "-G", "=G"], + {0, 1, 2, 3, 4, 5, 6, 7, 8}, + ), ], ) -def test_is_start_of_deletion(i: int, cssplits: list[str], start_del_count: int, expected: bool): - assert is_start_of_deletion(i, cssplits, start_del_count) == expected +def test_get_range_of_inclusive_deletions(cssplits, expected): + assert get_range_of_inclusive_deletions(cssplits) == expected + + +########################################################### +# reallocate_insertion_within_deletion +########################################################### @pytest.mark.parametrize( - "index, cssplits, expected", + "cssplits, i, start_del_count, expected", [ - (0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], True), - (11, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], False), - (0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], False), - (0, ["-A", "=C", "=C", "=C", "=C", "=C", "*GC", "=C", "=C", "=C", "=C", "-T", "=G"], True), - (0, ["-A", "=C", "=C", "=C", "=C", "=C", "-T", "=C", "=C", "=C", "=C", "-T", "=G"], True), - (0, ["-A", "=C", "=C", "=C", "=C", "=C", "+A|=C", "=C", "=C", "=C", "=C", "-T", "=G"], True), - (0, ["-A", "=C", "=C", "-T", "=G"], True), - ], - ids=[ - "-A has a 10-character match and is within the deletion cluster.", - "-T is outside the deletion cluster.", - "-A has a 11-character match and is outside the deletion cluster.", - "-A is determined to be reset and within the deletion cluster due to a point mutation at the 6th character.", - "-A is determined to be reset and within the deletion cluster due to a deletion at the 6th character.", - "-A is determined to be reset and within the deletion cluster due to an insertion at the 6th character.", - "-A is within the deletion cluster.", + (0, ["-A", "-A", "-A"], 3, True), + (0, ["-A", "-A"], 3, False), + (1, ["-A", "-A", "-A"], 3, False), ], ) -def test_is_within_deletion(index: int, cssplits: list[str], expected: bool): - assert is_within_deletion(index, cssplits) == expected +def test_is_start_of_consecutive_deletions(cssplits: list[str], i: int, start_del_count: int, expected: bool): + assert is_start_of_consecutive_deletions(i, cssplits, start_del_count) == expected + + +# @pytest.mark.parametrize( +# "index, cssplits, del_range, distance, expected", +# [ +# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True), +# (11, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, False), +# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, False), +# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "*GC", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True), +# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "-T", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True), +# (0, ["-A", "=C", "=C", "=C", "=C", "=C", "+A|=C", "=C", "=C", "=C", "=C", "-T", "=G"], 1, 10, True), +# (0, ["-A", "-A", "=C", "-T", "-A"], 2, 10, True), +# ], +# ids=[ +# "-A has a 10-character match and is within the deletion cluster.", +# "-T is outside the deletion cluster.", +# "-A has a 11-character match and is outside the deletion cluster.", +# "-A is determined to be reset and within the deletion cluster due to a point mutation at the 6th character.", +# "-A is determined to be reset and within the deletion cluster due to a deletion at the 6th character.", +# "-A is determined to be reset and within the deletion cluster due to an insertion at the 6th character.", +# "-A is within the deletion cluster.", +# ], +# ) +# def test_is_within_deletion(index: int, cssplits: list[str], del_range: int, distance: int, expected: bool): +# assert is_within_deletion(index, cssplits, del_range, distance) == expected @pytest.mark.parametrize( @@ -186,24 +210,21 @@ def test_adjust_cs_insertion(cs: str, expected: str): @pytest.mark.parametrize( "input_str, expected_output", [ - ("=A,-A,-A,-A,=C,=C,=C,-T,=G", "=A,-A,-A,-A,-C,-C,-C,-T,+C|+C|+C|=G"), - ( - "-A,-A,-A,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,-T,=G", - "-A,-A,-A,-C,-C,-C,-C,-C,-C,-C,-C,-C,-C,-T,+C|+C|+C|+C|+C|+C|+C|+C|+C|+C|=G", - ), - ("-A,-A,-A,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,-T,=G", "-A,-A,-A,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,=C,-T,=G"), - ("=A,-A,-A,-A,N,=C,n,-T,=G", "=A,-A,-A,-A,-N,-C,-n,-T,+N|+C|+n|=G"), - ("=A,-A,-A,-A,=C,+T|+T|=C,=C,-T,=G", "=A,-A,-A,-A,-C,-C,-C,-T,+C|+T|+T|+C|+C|=G"), - ("=A,-A,-A,-A,=C,+T|+T|*CG,=C,-T,=G", "=A,-A,-A,-A,-C,-C,-C,-T,+C|+T|+T|+G|+C|=G"), + ("-A,-A,-A,=C,=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+C|+C|=G"), + ("-A,-A,-A,=C,=C,=C,=C,-T,-T,-T", "-A,-A,-A,=C,=C,=C,=C,-T,-T,-T"), + ("-A,-A,-A,N,=C,n,-T,-T,-T,=G", "-A,-A,-A,N,-C,n,-T,-T,-T,+N|+C|+n|=G"), + ("-A,-A,-A,=C,+T|+T|=C,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+C|+C|=G"), + ("-A,-A,-A,=C,+T|+T|*CG,=C,-T,-T,-T,=G", "-A,-A,-A,-C,-C,-C,-T,-T,-T,+C|+T|+T|+G|+C|=G"), + ("-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC", "-G,-G,-C,=A,=C,=C,*CA,=A,-T,-T,*AC"), ], ids=[ "insertion within deletion", - "10-character match", - "11-character match", + "4-character match", "N and n", "Insertion", "Insertion followed by substitution", + "Should not be adjusted", ], ) -def test_detect_insertion_within_deletion(input_str: str, expected_output: str): - assert detect_insertion_within_deletion(input_str) == expected_output +def test_reallocate_insertion_within_deletion(input_str: str, expected_output: str): + assert reallocate_insertion_within_deletion(input_str, del_range=3, distance=3) == expected_output