diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 952b67a8..0a0560b9 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -42,7 +42,7 @@ jobs: cp -r * ../CRISPResso2_copy - name: Copy C2_tests repo - uses: actions/checkout@master + uses: actions/checkout@v3 with: repository: edilytics/CRISPResso2_tests token: ${{ secrets.ACCESS_CRISPRESSO2_TESTS }} diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index fe669513..131dcc30 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -172,6 +172,79 @@ def get_n_reads_bam(bam_filename,bam_chr_loc=""): ######################################### + +def split_quant_window_coordinates(quant_window_coordinates): + """Split the quantification window coordinates to be iterated over. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + + Returns + ------- + list of tuples + Where each element is a tuple and the first element of the tuple is the + start of the range and the second element is the end of the range. + """ + coord_re = re.compile(r'^(\d+)-(\d+)$') + try: + return [tuple(map(int, coord_re.match(c).groups())) for c in quant_window_coordinates.split('_')] + except: + raise CRISPRessoShared.BadParameterException("Cannot parse analysis window coordinate '" + str(quant_window_coordinates)) + + +def get_include_idxs_from_quant_window_coordinates(quant_window_coordinates): + """Get the include_idxs from the quantification window coordinates. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + + Returns + ------- + list of int + The include_idxs to be used for quantification, which is the quantification + window coordinates expanded to include all individual indexes contained therein. + """ + return [ + i + for coord in split_quant_window_coordinates(quant_window_coordinates) + for i in range(coord[0], coord[1] + 1) + ] + + +def get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, idxs): + """Get the include_idxs from the quantification window coordinates, but adjusted according to s1ind. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + idxs: list of int + The index values mapped to the amplicon from which this is being cloned. + + Returns + ------- + list of int + The include_idxs to be used for quantification, which is the quantification + window coordinates expanded to include all individual indexes contained therein, + but adjusted according to s1inds. + """ + include_idxs = [] + for i in range(1, len(idxs)): + if abs(idxs[i-1]) == idxs[i]: + idxs[i] = -1 * abs(idxs[i]) + for coord in split_quant_window_coordinates(quant_window_coordinates): + include_idxs.extend(idxs[coord[0]:coord[1] + 1]) + + return list(filter(lambda x: x >= 0, include_idxs)) + + def get_pe_scaffold_search(prime_edited_ref_sequence, prime_editing_pegRNA_extension_seq, prime_editing_pegRNA_scaffold_seq, prime_editing_pegRNA_scaffold_min_match_length): """ For prime editing, determines the scaffold string to search for (the shortest substring of args.prime_editing_pegRNA_scaffold_seq not in the prime-edited reference sequence) @@ -443,7 +516,7 @@ def process_fastq(fastq_filename, variantCache, ref_names, refs, args): aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold seq to search - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} #cache for reads that don't align @@ -518,7 +591,7 @@ def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names, aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} #cache for reads that don't align @@ -657,7 +730,7 @@ def process_fastq_write_out(fastq_input, fastq_output, variantCache, ref_names, aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} #cache for reads that don't align not_aln[''] = "" #add empty sequence to the not_aln in case the fastq has an extra newline at the end @@ -786,7 +859,7 @@ def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, vari aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) # scaffold start loc, scaffold sequence - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} # cache for reads that don't align not_aln[''] = "" # add empty sequence to the not_aln in case the fastq has an extra newline at the end @@ -1390,6 +1463,8 @@ def rreplace(s, old, new): #Prime editing + if 'Prime-edited' in amplicon_name_arr: + raise CRISPRessoShared.BadParameterException("An amplicon named 'Prime-edited' must not be provided.") prime_editing_extension_seq_dna = "" #global var for the editing extension sequence for the scaffold quantification below prime_editing_edited_amp_seq = "" if args.prime_editing_pegRNA_extension_seq != "": @@ -1451,8 +1526,6 @@ def rreplace(s, old, new): if new_ref in amplicon_seq_arr: raise CRISPRessoShared.BadParameterException('The calculated prime-edited amplicon is the same as the reference sequence.') amplicon_seq_arr.append(new_ref) - if 'Prime-edited' in amplicon_name_arr: - raise CRISPRessoShared.BadParameterException("An amplicon named 'Prime-edited' must not be provided.") amplicon_name_arr.append('Prime-edited') amplicon_quant_window_coordinates_arr.append('') prime_editing_edited_amp_seq = new_ref @@ -1994,21 +2067,15 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited refs[ref_name]['contains_guide'] = refs[clone_ref_name]['contains_guide'] #quantification window coordinates override other options - if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "": + if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "" and amplicon_quant_window_coordinates_arr[this_ref_idx] != '0': if amplicon_quant_window_coordinates_arr[this_ref_idx] != "": - this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[this_ref_idx] + this_include_idxs = get_include_idxs_from_quant_window_coordinates(amplicon_quant_window_coordinates_arr[this_ref_idx]) else: - this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[clone_ref_idx] - this_include_idxs = [] - these_coords = this_quant_window_coordinates.split("_") - for coord in these_coords: - coordRE = re.match(r'^(\d+)-(\d+)$', coord) - if coordRE: - start = s1inds[int(coordRE.group(1))] - end = s1inds[int(coordRE.group(2)) + 1] - this_include_idxs.extend(range(start, end)) - else: - raise NTException("Cannot parse analysis window coordinate '" + str(coord)) + this_include_idxs = get_cloned_include_idxs_from_quant_window_coordinates( + amplicon_quant_window_coordinates_arr[clone_ref_idx], + s1inds.copy(), + ) + #subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded) this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs']))) this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs)))) @@ -2296,7 +2363,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited info('Done!', {'percent_complete': 20}) - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": #introduce a new ref (that we didn't align to) called 'Scaffold Incorporated' -- copy it from the ref called 'prime-edited' new_ref = deepcopy(refs['Prime-edited']) new_ref['name'] = "Scaffold-incorporated" diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index d7b214cc..47d0bdf2 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -1683,6 +1683,46 @@ def set_guide_array(vals, guides, property_name): return ret_array +def get_relative_coordinates(to_sequence, from_sequence): + """Given an alignment, get the relative coordinates of the second sequence to the first. + + For example, from_sequence[i] matches to to_sequence[inds[i]]. A `-1` + indicates a gap at the beginning of `to_sequence`. + + Parameters + ---------- + to_sequence : str + The alignment of the first sequence (where the coordinates are relative to) + from_sequence : str + The alignment of the second sequence + + Returns + ------- + s1inds_gap_left : list of int + The relative coordinates of the second sequence to the first, where gaps + in the first sequence are filled with the left value. + s1inds_gap_right : list of int + The relative coordinates of the second sequence to the first, where gaps + in the first sequence are filled with the right value. + """ + s1inds_gap_left = [] + s1inds_gap_right = [] + s1idx_left = -1 + s1idx_right = 0 + s2idx = -1 + for ix in range(len(to_sequence)): + if to_sequence[ix] != "-": + s1idx_left += 1 + if from_sequence[ix] != "-": + s2idx += 1 + s1inds_gap_left.append(s1idx_left) + s1inds_gap_right.append(s1idx_right) + if to_sequence[ix] != "-": + s1idx_right += 1 + + return s1inds_gap_left, s1inds_gap_right + + def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend): """ @@ -1704,23 +1744,7 @@ def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_ gap_open=needleman_wunsch_gap_open, gap_extend=needleman_wunsch_gap_extend, gap_incentive=this_gap_incentive) - # print(fws1) - # print(fws2) - s1inds_l = [] - s1inds_r = [] - s1ix_l = -1 - s1ix_r = 0 - s2ix = -1 - for ix in range(len(fws1)): - if fws1[ix] != "-": - s1ix_l += 1 - if fws2[ix] != "-": - s2ix += 1 - s1inds_l.append(s1ix_l) - s1inds_r.append(s1ix_r) - if fws1[ix] != "-": - s1ix_r += 1 - return s1inds_l, s1inds_r + return get_relative_coordinates(fws1, fws2) def get_mismatches(seq_1, seq_2, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend): diff --git a/Dockerfile b/Dockerfile index 1027ae48..5667580b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,10 +10,11 @@ MAINTAINER Kendell Clement RUN apt-get update && apt-get install gcc g++ bowtie2 samtools libsys-hostname-long-perl \ -y --no-install-recommends \ && apt-get clean \ + && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* \ && rm -rf /usr/share/man/* \ && rm -rf /usr/share/doc/* \ - && conda install -c defaults -c conda-forge -c bioconda -y -n base --debug -c bioconda fastp numpy cython jinja2 tbb=2020.2 pyparsing=2.3.1 scipy matplotlib pandas plotly\ + && conda install -c defaults -c conda-forge -c bioconda -y -n base --debug fastp numpy cython jinja2 tbb=2020.2 pyparsing=2.3.1 scipy matplotlib-base pandas plotly\ && conda clean --all --yes #install ms fonts @@ -40,4 +41,4 @@ RUN python setup.py install \ && CRISPRessoCompare -h -ENTRYPOINT ["python","/CRISPResso2/CRISPResso2_router.py"] +ENTRYPOINT ["python","/CRISPResso2/CRISPResso2_router.py"] \ No newline at end of file diff --git a/tests/unit_tests/test_CRISPRessoCORE.py b/tests/unit_tests/test_CRISPRessoCORE.py index 268b6727..9a14549f 100644 --- a/tests/unit_tests/test_CRISPRessoCORE.py +++ b/tests/unit_tests/test_CRISPRessoCORE.py @@ -1,7 +1,7 @@ """Unit tests for CRISPResso2CORE.""" import pytest -from CRISPResso2 import CRISPRessoCORE +from CRISPResso2 import CRISPRessoCORE, CRISPRessoShared def test_get_consensus_alignment_from_pairs(): """Tests for generating consensus alignments from paired reads.""" @@ -93,6 +93,106 @@ def test_get_consensus_alignment_from_pairs(): assert ref_seq == "ATCGATCGAT" assert score == 50 #double check this score... should be 5/10 + +def test_split_quant_window_coordinates_single(): + assert [(5, 10)] == CRISPRessoCORE.split_quant_window_coordinates('5-10') + + +def test_split_quant_window_coordinates_multiple(): + assert CRISPRessoCORE.split_quant_window_coordinates('2-5_10-12') == [(2, 5), (10, 12)] + + +def test_split_quant_window_coordinates_error(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('a-5') + + +def test_split_quant_window_coordinates_empty(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('_') + + +def test_split_quant_window_coordinates_partially_empty(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('1-3_') + + +def test_split_quant_window_coordinates_blank(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('') + + +def test_get_include_idxs_from_quant_window_coordinates(): + quant_window_coordinates = '1-10_12-20' + assert CRISPRessoCORE.get_include_idxs_from_quant_window_coordinates(quant_window_coordinates) == [*list(range(1, 11)), *list(range(12, 21))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates(): + quant_window_coordinates = '1-10_12-20' + s1inds = list(range(22)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 11)), *list(range(12, 21))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_beginning(): + quant_window_coordinates = '1-10_12-20' + # represents a 5bp insertion at the beginning (left) + s1inds = list(range(5, 27)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(6, 16)), *list(range(17, 26))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_beginning(): + quant_window_coordinates = '1-10_12-20' + # represents a 5bp deletion at the beginning (left) + s1inds = [-1, -1, -1, -1, -1 ] + list(range(26)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(0, 6)), *list(range(7, 16))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion(): + quant_window_coordinates = '10-20_35-40' + # represents a 7bp deletion in the middle + s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 21)), *list(range(35-7, 41-7))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_modified(): + quant_window_coordinates = '10-25_35-40' + # represents a 7bp deletion in the middle, where part of the QW is deleted + # [0, 1, 3, 4, ... , 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, ... , 33] + s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 23)), *list(range(35-7, 41-7))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_end_modified(): + # 5 bp deletion at end of 20 bp sequence + quant_window_coordinates = '1-5_10-20' + s1inds = [*list(range(16)), *[15, 15, 15, 15, 15]] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *list(range(10, 16))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion(): + # 5 bp deletion and 5 bp insertion + quant_window_coordinates = '1-5_10-20' + s1inds = [0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *[6, 7, 8, 9, 15, 16, 17, 18, 19, 20]] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion_modified(): + quant_window_coordinates = '1-5_10-20' + s1inds = [0, 1, 2, 2, 4, 5, 6, 7, 7, 7, 7, 7, 7, 8, 9, 10, 15, 16, 17, 18, 19] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*[1,2,4,5], *[8, 9, 10, 15, 16, 17, 18, 19]] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_across_qw(): + # 6 bp insertion in middle of 4 bp sequence + quant_window_coordinates = '1-4' + s1inds = [0,1,2,9,10] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1,2,9,10] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_entire_qw(): + # 5 bp deletion of entire qw + quant_window_coordinates = '1-4_7-10' + s1inds = [0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1, 2, 3, 4] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_include_zero(): + quant_window_coordinates = '0-5' + s1inds = [0, 1, 2, 3, 4, 5] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [0, 1, 2, 3, 4, 5] + if __name__ == "__main__": # execute only if run as a script test_get_consensus_alignment_from_pairs() diff --git a/tests/unit_tests/test_CRISPRessoShared.py b/tests/unit_tests/test_CRISPRessoShared.py index da1bc479..f6ce67a0 100644 --- a/tests/unit_tests/test_CRISPRessoShared.py +++ b/tests/unit_tests/test_CRISPRessoShared.py @@ -20,4 +20,57 @@ def test_get_mismatches(): -5, -3, ) - assert len(mismatch_cords) == 6 + assert len(mismatch_cords) == 6 + +def test_get_relative_coordinates(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'TTCGT') + assert s1inds_gap_left == [0, 1, 2, 3, 4] + assert s1inds_gap_right == [0, 1, 2, 3, 4] + + +def test_get_relative_coordinates_to_gap(): + # unaligned sequences + seq_1 = 'TTCGT' + seq_2 = 'TTCT' + + # aligned_sequences + to_sequence = 'TTC-T' + from_sequence = 'TTCGT' + + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates(to_sequence, from_sequence) + assert s1inds_gap_left == [0, 1, 2, 2, 3] + assert s1inds_gap_right == [0, 1, 2, 3, 3] + + + assert seq_1[0] == seq_2[s1inds_gap_left[0]] + assert seq_1[1] == seq_2[s1inds_gap_left[1]] + assert seq_1[2] == seq_2[s1inds_gap_left[2]] + assert seq_1[4] == seq_2[s1inds_gap_left[4]] + + +def test_get_relative_coordinates_start_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('--CGT', 'TTCGT') + assert s1inds_gap_left == [-1, -1, 0, 1, 2] + assert s1inds_gap_right == [0, 0, 0, 1, 2] + + +def test_get_relative_coordinates_from_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'ATC-T') + assert s1inds_gap_left == [0, 1, 2, 4] + assert s1inds_gap_right == [0, 1, 2, 4] + +def test_get_relative_coordinates_end_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATC--', 'ATCGT') + assert s1inds_gap_left == [0, 1, 2, 2, 2] + assert s1inds_gap_right == [0, 1, 2, 3, 3] + +def test_get_relative_coordinates_multiple_gaps(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('AT--TC--G--CC', 'ATCGTCGCGTTCC') + assert s1inds_gap_left == [0, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 5, 6] + assert s1inds_gap_right == [0, 1, 2, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6] + +def test_get_relative_coordinates_ind_and_dels(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATG--C', 'A-GCTC') + assert s1inds_gap_left == [0, 2, 2, 2, 3] + assert s1inds_gap_right == [0, 2, 3, 3, 3] +