diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index 5ac57018..2531a0a7 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -136,6 +136,79 @@ def get_n_reads_bam(bam_filename,bam_chr_loc=""): ######################################### + +def split_quant_window_coordinates(quant_window_coordinates): + """Split the quantification window coordinates to be iterated over. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + + Returns + ------- + list of tuples + Where each element is a tuple and the first element of the tuple is the + start of the range and the second element is the end of the range. + """ + coord_re = re.compile(r'^(\d+)-(\d+)$') + try: + return [tuple(map(int, coord_re.match(c).groups())) for c in quant_window_coordinates.split('_')] + except: + raise CRISPRessoShared.BadParameterException("Cannot parse analysis window coordinate '" + str(quant_window_coordinates)) + + +def get_include_idxs_from_quant_window_coordinates(quant_window_coordinates): + """Get the include_idxs from the quantification window coordinates. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + + Returns + ------- + list of int + The include_idxs to be used for quantification, which is the quantification + window coordinates expanded to include all individual indexes contained therein. + """ + return [ + i + for coord in split_quant_window_coordinates(quant_window_coordinates) + for i in range(coord[0], coord[1] + 1) + ] + + +def get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, idxs): + """Get the include_idxs from the quantification window coordinates, but adjusted according to s1ind. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + idxs: list of int + The index values mapped to the amplicon from which this is being cloned. + + Returns + ------- + list of int + The include_idxs to be used for quantification, which is the quantification + window coordinates expanded to include all individual indexes contained therein, + but adjusted according to s1inds. + """ + include_idxs = [] + for i in range(1, len(idxs)): + if abs(idxs[i-1]) == idxs[i]: + idxs[i] = -1 * abs(idxs[i]) + for coord in split_quant_window_coordinates(quant_window_coordinates): + include_idxs.extend(idxs[coord[0]:coord[1] + 1]) + + return list(filter(lambda x: x >= 0, include_idxs)) + + def get_pe_scaffold_search(prime_edited_ref_sequence, prime_editing_pegRNA_extension_seq, prime_editing_pegRNA_scaffold_seq, prime_editing_pegRNA_scaffold_min_match_length): """ For prime editing, determines the scaffold string to search for (the shortest substring of args.prime_editing_pegRNA_scaffold_seq not in the prime-edited reference sequence) @@ -1966,21 +2039,15 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited refs[ref_name]['contains_guide'] = refs[clone_ref_name]['contains_guide'] #quantification window coordinates override other options - if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "": + if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "" and amplicon_quant_window_coordinates_arr[this_ref_idx] != '0': if amplicon_quant_window_coordinates_arr[this_ref_idx] != "": - this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[this_ref_idx] + this_include_idxs = get_include_idxs_from_quant_window_coordinates(amplicon_quant_window_coordinates_arr[this_ref_idx]) else: - this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[clone_ref_idx] - this_include_idxs = [] - these_coords = this_quant_window_coordinates.split("_") - for coord in these_coords: - coordRE = re.match(r'^(\d+)-(\d+)$', coord) - if coordRE: - start = s1inds[int(coordRE.group(1))] - end = s1inds[int(coordRE.group(2)) + 1] - this_include_idxs.extend(range(start, end)) - else: - raise NTException("Cannot parse analysis window coordinate '" + str(coord)) + this_include_idxs = get_cloned_include_idxs_from_quant_window_coordinates( + amplicon_quant_window_coordinates_arr[clone_ref_idx], + s1inds.copy(), + ) + #subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded) this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs']))) this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs)))) diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index 26a212a6..8e82c4e3 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -1677,7 +1677,47 @@ def set_guide_array(vals, guides, property_name): for idx, val in enumerate(vals_array): if val != '': ret_array[idx] = int(val) - return ret_array + return ret_array + + +def get_relative_coordinates(to_sequence, from_sequence): + """Given an alignment, get the relative coordinates of the second sequence to the first. + + For example, from_sequence[i] matches to to_sequence[inds[i]]. A `-1` + indicates a gap at the beginning of `to_sequence`. + + Parameters + ---------- + to_sequence : str + The alignment of the first sequence (where the coordinates are relative to) + from_sequence : str + The alignment of the second sequence + + Returns + ------- + s1inds_gap_left : list of int + The relative coordinates of the second sequence to the first, where gaps + in the first sequence are filled with the left value. + s1inds_gap_right : list of int + The relative coordinates of the second sequence to the first, where gaps + in the first sequence are filled with the right value. + """ + s1inds_gap_left = [] + s1inds_gap_right = [] + s1idx_left = -1 + s1idx_right = 0 + s2idx = -1 + for ix in range(len(to_sequence)): + if to_sequence[ix] != "-": + s1idx_left += 1 + if from_sequence[ix] != "-": + s2idx += 1 + s1inds_gap_left.append(s1idx_left) + s1inds_gap_right.append(s1idx_right) + if to_sequence[ix] != "-": + s1idx_right += 1 + + return s1inds_gap_left, s1inds_gap_right def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_wunsch_gap_open, @@ -1701,23 +1741,7 @@ def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_ gap_open=needleman_wunsch_gap_open, gap_extend=needleman_wunsch_gap_extend, gap_incentive=this_gap_incentive) - # print(fws1) - # print(fws2) - s1inds_l = [] - s1inds_r = [] - s1ix_l = -1 - s1ix_r = 0 - s2ix = -1 - for ix in range(len(fws1)): - if fws1[ix] != "-": - s1ix_l += 1 - if fws2[ix] != "-": - s2ix += 1 - s1inds_l.append(s1ix_l) - s1inds_r.append(s1ix_r) - if fws1[ix] != "-": - s1ix_r += 1 - return s1inds_l, s1inds_r + return get_relative_coordinates(fws1, fws2) def get_mismatches(seq_1, seq_2, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend): diff --git a/tests/unit_tests/test_CRISPRessoCORE.py b/tests/unit_tests/test_CRISPRessoCORE.py index 268b6727..9a14549f 100644 --- a/tests/unit_tests/test_CRISPRessoCORE.py +++ b/tests/unit_tests/test_CRISPRessoCORE.py @@ -1,7 +1,7 @@ """Unit tests for CRISPResso2CORE.""" import pytest -from CRISPResso2 import CRISPRessoCORE +from CRISPResso2 import CRISPRessoCORE, CRISPRessoShared def test_get_consensus_alignment_from_pairs(): """Tests for generating consensus alignments from paired reads.""" @@ -93,6 +93,106 @@ def test_get_consensus_alignment_from_pairs(): assert ref_seq == "ATCGATCGAT" assert score == 50 #double check this score... should be 5/10 + +def test_split_quant_window_coordinates_single(): + assert [(5, 10)] == CRISPRessoCORE.split_quant_window_coordinates('5-10') + + +def test_split_quant_window_coordinates_multiple(): + assert CRISPRessoCORE.split_quant_window_coordinates('2-5_10-12') == [(2, 5), (10, 12)] + + +def test_split_quant_window_coordinates_error(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('a-5') + + +def test_split_quant_window_coordinates_empty(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('_') + + +def test_split_quant_window_coordinates_partially_empty(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('1-3_') + + +def test_split_quant_window_coordinates_blank(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('') + + +def test_get_include_idxs_from_quant_window_coordinates(): + quant_window_coordinates = '1-10_12-20' + assert CRISPRessoCORE.get_include_idxs_from_quant_window_coordinates(quant_window_coordinates) == [*list(range(1, 11)), *list(range(12, 21))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates(): + quant_window_coordinates = '1-10_12-20' + s1inds = list(range(22)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 11)), *list(range(12, 21))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_beginning(): + quant_window_coordinates = '1-10_12-20' + # represents a 5bp insertion at the beginning (left) + s1inds = list(range(5, 27)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(6, 16)), *list(range(17, 26))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_beginning(): + quant_window_coordinates = '1-10_12-20' + # represents a 5bp deletion at the beginning (left) + s1inds = [-1, -1, -1, -1, -1 ] + list(range(26)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(0, 6)), *list(range(7, 16))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion(): + quant_window_coordinates = '10-20_35-40' + # represents a 7bp deletion in the middle + s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 21)), *list(range(35-7, 41-7))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_modified(): + quant_window_coordinates = '10-25_35-40' + # represents a 7bp deletion in the middle, where part of the QW is deleted + # [0, 1, 3, 4, ... , 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, ... , 33] + s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 23)), *list(range(35-7, 41-7))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_end_modified(): + # 5 bp deletion at end of 20 bp sequence + quant_window_coordinates = '1-5_10-20' + s1inds = [*list(range(16)), *[15, 15, 15, 15, 15]] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *list(range(10, 16))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion(): + # 5 bp deletion and 5 bp insertion + quant_window_coordinates = '1-5_10-20' + s1inds = [0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *[6, 7, 8, 9, 15, 16, 17, 18, 19, 20]] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion_modified(): + quant_window_coordinates = '1-5_10-20' + s1inds = [0, 1, 2, 2, 4, 5, 6, 7, 7, 7, 7, 7, 7, 8, 9, 10, 15, 16, 17, 18, 19] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*[1,2,4,5], *[8, 9, 10, 15, 16, 17, 18, 19]] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_across_qw(): + # 6 bp insertion in middle of 4 bp sequence + quant_window_coordinates = '1-4' + s1inds = [0,1,2,9,10] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1,2,9,10] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_entire_qw(): + # 5 bp deletion of entire qw + quant_window_coordinates = '1-4_7-10' + s1inds = [0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1, 2, 3, 4] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_include_zero(): + quant_window_coordinates = '0-5' + s1inds = [0, 1, 2, 3, 4, 5] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [0, 1, 2, 3, 4, 5] + if __name__ == "__main__": # execute only if run as a script test_get_consensus_alignment_from_pairs() diff --git a/tests/unit_tests/test_CRISPRessoShared.py b/tests/unit_tests/test_CRISPRessoShared.py index da1bc479..f6ce67a0 100644 --- a/tests/unit_tests/test_CRISPRessoShared.py +++ b/tests/unit_tests/test_CRISPRessoShared.py @@ -20,4 +20,57 @@ def test_get_mismatches(): -5, -3, ) - assert len(mismatch_cords) == 6 + assert len(mismatch_cords) == 6 + +def test_get_relative_coordinates(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'TTCGT') + assert s1inds_gap_left == [0, 1, 2, 3, 4] + assert s1inds_gap_right == [0, 1, 2, 3, 4] + + +def test_get_relative_coordinates_to_gap(): + # unaligned sequences + seq_1 = 'TTCGT' + seq_2 = 'TTCT' + + # aligned_sequences + to_sequence = 'TTC-T' + from_sequence = 'TTCGT' + + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates(to_sequence, from_sequence) + assert s1inds_gap_left == [0, 1, 2, 2, 3] + assert s1inds_gap_right == [0, 1, 2, 3, 3] + + + assert seq_1[0] == seq_2[s1inds_gap_left[0]] + assert seq_1[1] == seq_2[s1inds_gap_left[1]] + assert seq_1[2] == seq_2[s1inds_gap_left[2]] + assert seq_1[4] == seq_2[s1inds_gap_left[4]] + + +def test_get_relative_coordinates_start_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('--CGT', 'TTCGT') + assert s1inds_gap_left == [-1, -1, 0, 1, 2] + assert s1inds_gap_right == [0, 0, 0, 1, 2] + + +def test_get_relative_coordinates_from_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'ATC-T') + assert s1inds_gap_left == [0, 1, 2, 4] + assert s1inds_gap_right == [0, 1, 2, 4] + +def test_get_relative_coordinates_end_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATC--', 'ATCGT') + assert s1inds_gap_left == [0, 1, 2, 2, 2] + assert s1inds_gap_right == [0, 1, 2, 3, 3] + +def test_get_relative_coordinates_multiple_gaps(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('AT--TC--G--CC', 'ATCGTCGCGTTCC') + assert s1inds_gap_left == [0, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 5, 6] + assert s1inds_gap_right == [0, 1, 2, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6] + +def test_get_relative_coordinates_ind_and_dels(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATG--C', 'A-GCTC') + assert s1inds_gap_left == [0, 2, 2, 2, 3] + assert s1inds_gap_right == [0, 2, 3, 3, 3] +