Skip to content

Commit

Permalink
Merge branch 'master' into trevor/fastp-integration
Browse files Browse the repository at this point in the history
  • Loading branch information
Colelyman committed Mar 28, 2024
2 parents 7a2f7f7 + 47b3908 commit e674c37
Show file tree
Hide file tree
Showing 6 changed files with 287 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
cp -r * ../CRISPResso2_copy
- name: Copy C2_tests repo
uses: actions/checkout@master
uses: actions/checkout@v3
with:
repository: edilytics/CRISPResso2_tests
token: ${{ secrets.ACCESS_CRISPRESSO2_TESTS }}
Expand Down
107 changes: 87 additions & 20 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,79 @@ def get_n_reads_bam(bam_filename,bam_chr_loc=""):

#########################################


def split_quant_window_coordinates(quant_window_coordinates):
"""Split the quantification window coordinates to be iterated over.
Parameters
----------
quant_window_coordinates: str
The quantification window coordinates, in the form "5-10_100-101", where
the "_" delimits separate ranges and the "-" delimits the range itself.
Returns
-------
list of tuples
Where each element is a tuple and the first element of the tuple is the
start of the range and the second element is the end of the range.
"""
coord_re = re.compile(r'^(\d+)-(\d+)$')
try:
return [tuple(map(int, coord_re.match(c).groups())) for c in quant_window_coordinates.split('_')]
except:
raise CRISPRessoShared.BadParameterException("Cannot parse analysis window coordinate '" + str(quant_window_coordinates))


def get_include_idxs_from_quant_window_coordinates(quant_window_coordinates):
"""Get the include_idxs from the quantification window coordinates.
Parameters
----------
quant_window_coordinates: str
The quantification window coordinates, in the form "5-10_100-101", where
the "_" delimits separate ranges and the "-" delimits the range itself.
Returns
-------
list of int
The include_idxs to be used for quantification, which is the quantification
window coordinates expanded to include all individual indexes contained therein.
"""
return [
i
for coord in split_quant_window_coordinates(quant_window_coordinates)
for i in range(coord[0], coord[1] + 1)
]


def get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, idxs):
"""Get the include_idxs from the quantification window coordinates, but adjusted according to s1ind.
Parameters
----------
quant_window_coordinates: str
The quantification window coordinates, in the form "5-10_100-101", where
the "_" delimits separate ranges and the "-" delimits the range itself.
idxs: list of int
The index values mapped to the amplicon from which this is being cloned.
Returns
-------
list of int
The include_idxs to be used for quantification, which is the quantification
window coordinates expanded to include all individual indexes contained therein,
but adjusted according to s1inds.
"""
include_idxs = []
for i in range(1, len(idxs)):
if abs(idxs[i-1]) == idxs[i]:
idxs[i] = -1 * abs(idxs[i])
for coord in split_quant_window_coordinates(quant_window_coordinates):
include_idxs.extend(idxs[coord[0]:coord[1] + 1])

return list(filter(lambda x: x >= 0, include_idxs))


def get_pe_scaffold_search(prime_edited_ref_sequence, prime_editing_pegRNA_extension_seq, prime_editing_pegRNA_scaffold_seq, prime_editing_pegRNA_scaffold_min_match_length):
"""
For prime editing, determines the scaffold string to search for (the shortest substring of args.prime_editing_pegRNA_scaffold_seq not in the prime-edited reference sequence)
Expand Down Expand Up @@ -443,7 +516,7 @@ def process_fastq(fastq_filename, variantCache, ref_names, refs, args):
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)

pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold seq to search
if args.prime_editing_pegRNA_scaffold_seq != "":
if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)

not_aln = {} #cache for reads that don't align
Expand Down Expand Up @@ -518,7 +591,7 @@ def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names,
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)

pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence
if args.prime_editing_pegRNA_scaffold_seq != "":
if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)

not_aln = {} #cache for reads that don't align
Expand Down Expand Up @@ -657,7 +730,7 @@ def process_fastq_write_out(fastq_input, fastq_output, variantCache, ref_names,
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)

pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence
if args.prime_editing_pegRNA_scaffold_seq != "":
if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)
not_aln = {} #cache for reads that don't align
not_aln[''] = "" #add empty sequence to the not_aln in case the fastq has an extra newline at the end
Expand Down Expand Up @@ -786,7 +859,7 @@ def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, vari
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)

pe_scaffold_dna_info = (0, None) # scaffold start loc, scaffold sequence
if args.prime_editing_pegRNA_scaffold_seq != "":
if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)
not_aln = {} # cache for reads that don't align
not_aln[''] = "" # add empty sequence to the not_aln in case the fastq has an extra newline at the end
Expand Down Expand Up @@ -1390,6 +1463,8 @@ def rreplace(s, old, new):


#Prime editing
if 'Prime-edited' in amplicon_name_arr:
raise CRISPRessoShared.BadParameterException("An amplicon named 'Prime-edited' must not be provided.")
prime_editing_extension_seq_dna = "" #global var for the editing extension sequence for the scaffold quantification below
prime_editing_edited_amp_seq = ""
if args.prime_editing_pegRNA_extension_seq != "":
Expand Down Expand Up @@ -1451,8 +1526,6 @@ def rreplace(s, old, new):
if new_ref in amplicon_seq_arr:
raise CRISPRessoShared.BadParameterException('The calculated prime-edited amplicon is the same as the reference sequence.')
amplicon_seq_arr.append(new_ref)
if 'Prime-edited' in amplicon_name_arr:
raise CRISPRessoShared.BadParameterException("An amplicon named 'Prime-edited' must not be provided.")
amplicon_name_arr.append('Prime-edited')
amplicon_quant_window_coordinates_arr.append('')
prime_editing_edited_amp_seq = new_ref
Expand Down Expand Up @@ -1994,21 +2067,15 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
refs[ref_name]['contains_guide'] = refs[clone_ref_name]['contains_guide']

#quantification window coordinates override other options
if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "":
if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "" and amplicon_quant_window_coordinates_arr[this_ref_idx] != '0':
if amplicon_quant_window_coordinates_arr[this_ref_idx] != "":
this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[this_ref_idx]
this_include_idxs = get_include_idxs_from_quant_window_coordinates(amplicon_quant_window_coordinates_arr[this_ref_idx])
else:
this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[clone_ref_idx]
this_include_idxs = []
these_coords = this_quant_window_coordinates.split("_")
for coord in these_coords:
coordRE = re.match(r'^(\d+)-(\d+)$', coord)
if coordRE:
start = s1inds[int(coordRE.group(1))]
end = s1inds[int(coordRE.group(2)) + 1]
this_include_idxs.extend(range(start, end))
else:
raise NTException("Cannot parse analysis window coordinate '" + str(coord))
this_include_idxs = get_cloned_include_idxs_from_quant_window_coordinates(
amplicon_quant_window_coordinates_arr[clone_ref_idx],
s1inds.copy(),
)

#subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded)
this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs'])))
this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs))))
Expand Down Expand Up @@ -2296,7 +2363,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited

info('Done!', {'percent_complete': 20})

if args.prime_editing_pegRNA_scaffold_seq != "":
if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "":
#introduce a new ref (that we didn't align to) called 'Scaffold Incorporated' -- copy it from the ref called 'prime-edited'
new_ref = deepcopy(refs['Prime-edited'])
new_ref['name'] = "Scaffold-incorporated"
Expand Down
58 changes: 41 additions & 17 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -1683,6 +1683,46 @@ def set_guide_array(vals, guides, property_name):
return ret_array


def get_relative_coordinates(to_sequence, from_sequence):
"""Given an alignment, get the relative coordinates of the second sequence to the first.
For example, from_sequence[i] matches to to_sequence[inds[i]]. A `-1`
indicates a gap at the beginning of `to_sequence`.
Parameters
----------
to_sequence : str
The alignment of the first sequence (where the coordinates are relative to)
from_sequence : str
The alignment of the second sequence
Returns
-------
s1inds_gap_left : list of int
The relative coordinates of the second sequence to the first, where gaps
in the first sequence are filled with the left value.
s1inds_gap_right : list of int
The relative coordinates of the second sequence to the first, where gaps
in the first sequence are filled with the right value.
"""
s1inds_gap_left = []
s1inds_gap_right = []
s1idx_left = -1
s1idx_right = 0
s2idx = -1
for ix in range(len(to_sequence)):
if to_sequence[ix] != "-":
s1idx_left += 1
if from_sequence[ix] != "-":
s2idx += 1
s1inds_gap_left.append(s1idx_left)
s1inds_gap_right.append(s1idx_right)
if to_sequence[ix] != "-":
s1idx_right += 1

return s1inds_gap_left, s1inds_gap_right


def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_wunsch_gap_open,
needleman_wunsch_gap_extend):
"""
Expand All @@ -1704,23 +1744,7 @@ def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_
gap_open=needleman_wunsch_gap_open,
gap_extend=needleman_wunsch_gap_extend,
gap_incentive=this_gap_incentive)
# print(fws1)
# print(fws2)
s1inds_l = []
s1inds_r = []
s1ix_l = -1
s1ix_r = 0
s2ix = -1
for ix in range(len(fws1)):
if fws1[ix] != "-":
s1ix_l += 1
if fws2[ix] != "-":
s2ix += 1
s1inds_l.append(s1ix_l)
s1inds_r.append(s1ix_r)
if fws1[ix] != "-":
s1ix_r += 1
return s1inds_l, s1inds_r
return get_relative_coordinates(fws1, fws2)


def get_mismatches(seq_1, seq_2, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend):
Expand Down
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ MAINTAINER Kendell Clement
RUN apt-get update && apt-get install gcc g++ bowtie2 samtools libsys-hostname-long-perl \
-y --no-install-recommends \
&& apt-get clean \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /usr/share/man/* \
&& rm -rf /usr/share/doc/* \
&& conda install -c defaults -c conda-forge -c bioconda -y -n base --debug -c bioconda fastp numpy cython jinja2 tbb=2020.2 pyparsing=2.3.1 scipy matplotlib pandas plotly\
&& conda install -c defaults -c conda-forge -c bioconda -y -n base --debug fastp numpy cython jinja2 tbb=2020.2 pyparsing=2.3.1 scipy matplotlib-base pandas plotly\
&& conda clean --all --yes

#install ms fonts
Expand All @@ -40,4 +41,4 @@ RUN python setup.py install \
&& CRISPRessoCompare -h


ENTRYPOINT ["python","/CRISPResso2/CRISPResso2_router.py"]
ENTRYPOINT ["python","/CRISPResso2/CRISPResso2_router.py"]
102 changes: 101 additions & 1 deletion tests/unit_tests/test_CRISPRessoCORE.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Unit tests for CRISPResso2CORE."""
import pytest

from CRISPResso2 import CRISPRessoCORE
from CRISPResso2 import CRISPRessoCORE, CRISPRessoShared

def test_get_consensus_alignment_from_pairs():
"""Tests for generating consensus alignments from paired reads."""
Expand Down Expand Up @@ -93,6 +93,106 @@ def test_get_consensus_alignment_from_pairs():
assert ref_seq == "ATCGATCGAT"
assert score == 50 #double check this score... should be 5/10


def test_split_quant_window_coordinates_single():
assert [(5, 10)] == CRISPRessoCORE.split_quant_window_coordinates('5-10')


def test_split_quant_window_coordinates_multiple():
assert CRISPRessoCORE.split_quant_window_coordinates('2-5_10-12') == [(2, 5), (10, 12)]


def test_split_quant_window_coordinates_error():
with pytest.raises(CRISPRessoShared.BadParameterException):
CRISPRessoCORE.split_quant_window_coordinates('a-5')


def test_split_quant_window_coordinates_empty():
with pytest.raises(CRISPRessoShared.BadParameterException):
CRISPRessoCORE.split_quant_window_coordinates('_')


def test_split_quant_window_coordinates_partially_empty():
with pytest.raises(CRISPRessoShared.BadParameterException):
CRISPRessoCORE.split_quant_window_coordinates('1-3_')


def test_split_quant_window_coordinates_blank():
with pytest.raises(CRISPRessoShared.BadParameterException):
CRISPRessoCORE.split_quant_window_coordinates('')


def test_get_include_idxs_from_quant_window_coordinates():
quant_window_coordinates = '1-10_12-20'
assert CRISPRessoCORE.get_include_idxs_from_quant_window_coordinates(quant_window_coordinates) == [*list(range(1, 11)), *list(range(12, 21))]


def test_get_cloned_include_idxs_from_quant_window_coordinates():
quant_window_coordinates = '1-10_12-20'
s1inds = list(range(22))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 11)), *list(range(12, 21))]


def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_beginning():
quant_window_coordinates = '1-10_12-20'
# represents a 5bp insertion at the beginning (left)
s1inds = list(range(5, 27))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(6, 16)), *list(range(17, 26))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_beginning():
quant_window_coordinates = '1-10_12-20'
# represents a 5bp deletion at the beginning (left)
s1inds = [-1, -1, -1, -1, -1 ] + list(range(26))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(0, 6)), *list(range(7, 16))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion():
quant_window_coordinates = '10-20_35-40'
# represents a 7bp deletion in the middle
s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 21)), *list(range(35-7, 41-7))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_modified():
quant_window_coordinates = '10-25_35-40'
# represents a 7bp deletion in the middle, where part of the QW is deleted
# [0, 1, 3, 4, ... , 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, ... , 33]
s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 23)), *list(range(35-7, 41-7))]


def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_end_modified():
# 5 bp deletion at end of 20 bp sequence
quant_window_coordinates = '1-5_10-20'
s1inds = [*list(range(16)), *[15, 15, 15, 15, 15]]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *list(range(10, 16))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion():
# 5 bp deletion and 5 bp insertion
quant_window_coordinates = '1-5_10-20'
s1inds = [0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *[6, 7, 8, 9, 15, 16, 17, 18, 19, 20]]

def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion_modified():
quant_window_coordinates = '1-5_10-20'
s1inds = [0, 1, 2, 2, 4, 5, 6, 7, 7, 7, 7, 7, 7, 8, 9, 10, 15, 16, 17, 18, 19]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*[1,2,4,5], *[8, 9, 10, 15, 16, 17, 18, 19]]

def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_across_qw():
# 6 bp insertion in middle of 4 bp sequence
quant_window_coordinates = '1-4'
s1inds = [0,1,2,9,10]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1,2,9,10]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_entire_qw():
# 5 bp deletion of entire qw
quant_window_coordinates = '1-4_7-10'
s1inds = [0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1, 2, 3, 4]

def test_get_cloned_include_idxs_from_quant_window_coordinates_include_zero():
quant_window_coordinates = '0-5'
s1inds = [0, 1, 2, 3, 4, 5]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [0, 1, 2, 3, 4, 5]

if __name__ == "__main__":
# execute only if run as a script
test_get_consensus_alignment_from_pairs()
Loading

0 comments on commit e674c37

Please sign in to comment.