Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the assignment of multiple quantification window coordinates #38

Merged
merged 31 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8b6d273
Mckay/pd warnings (#45)
mbowcut2 Mar 11, 2024
fa6848f
Extract out quantification window coordinate function
Colelyman Feb 15, 2024
b3760bb
Refactor get_quant_window_coordinates function into two
Colelyman Feb 15, 2024
0ffd2cb
Handling qwc: add unit tests, refactor some more and add documentation
Colelyman Feb 16, 2024
eac72d7
Extract out get_relative_coordinates function
Colelyman Feb 16, 2024
8a5f50d
Add clarifying unit tests for `get_relative_coordinates`
Colelyman Feb 19, 2024
7db87e0
Refactor cloned indexes to use ref_positions instead of s1inds
Colelyman Mar 12, 2024
3f00008
fixed function for getting cloned qwc idxs
mbowcut2 Mar 18, 2024
baf7561
added tests for cloned qwc function
mbowcut2 Mar 18, 2024
e6b74ce
Introduce pandas sorting in CRISPRessoCompare (#47)
Snicker7 Mar 14, 2024
35c27da
Fix interleaved fastq input in CRISPRessoPooled and suppress CRISPRes…
Colelyman Mar 14, 2024
59085a0
Bug Fix - 367 (#35)
mbowcut2 Mar 14, 2024
737db80
Merge remote-tracking branch 'origin/master' into cole/fix-qwc
mbowcut2 Mar 20, 2024
2938e81
removed if check
mbowcut2 Mar 22, 2024
70dc8d1
implemented last test
mbowcut2 Mar 22, 2024
4b7745e
changed NT to BadParameterException
mbowcut2 Mar 22, 2024
2246c1c
changed tests, NT to BadParameter exceptions
mbowcut2 Mar 22, 2024
cc66985
Uncomment and correct tests for `get_relative_coordinates`
Colelyman Mar 22, 2024
f243d85
finished qwc tests
mbowcut2 Mar 22, 2024
ffc8c32
0 is an acceptable qwc
mbowcut2 Mar 22, 2024
9756e4a
new get_relative_coords function
mbowcut2 Mar 22, 2024
298d3b1
added relative coordinate tests
mbowcut2 Mar 26, 2024
5fc20ff
removed unused functions
mbowcut2 Mar 26, 2024
9ecd7f7
formatting
mbowcut2 Mar 26, 2024
b847559
check for 0 qwc
mbowcut2 Mar 26, 2024
d21feb5
remove test code
mbowcut2 Mar 26, 2024
174b6c7
remove comment
mbowcut2 Mar 26, 2024
ffb31ce
Move read filtering to after merging in CRISPResso (#39)
Colelyman Mar 22, 2024
85c28b0
GitHub actions on pr (#51)
Snicker7 Mar 22, 2024
6ce79b5
Run tests on PR only when opening PR (#53)
Snicker7 Mar 22, 2024
481333e
Update reports (#52)
Colelyman Mar 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 80 additions & 12 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,80 @@ def get_n_reads_bam(bam_filename,bam_chr_loc=""):

#########################################


def split_quant_window_coordinates(quant_window_coordinates):
"""Split the quantification window coordinates to be iterated over.

Parameters
----------
quant_window_coordinates: str
The quantification window coordinates, in the form "5-10_100-101", where
the "_" delimits separate ranges and the "-" delimits the range itself.

Returns
-------
list of tuples
Where each element is a tuple and the first element of the tuple is the
start of the range and the second element is the end of the range.
"""
coord_re = re.compile(r'^(\d+)-(\d+)$')
try:
return [tuple(map(int, coord_re.match(c).groups())) for c in quant_window_coordinates.split('_')]
except:
raise CRISPRessoShared.NTException("Cannot parse analysis window coordinate '" + str(quant_window_coordinates))
mbowcut2 marked this conversation as resolved.
Show resolved Hide resolved


def get_include_idxs_from_quant_window_coordinates(quant_window_coordinates):
"""Get the include_idxs from the quantification window coordinates.

Parameters
----------
quant_window_coordinates: str
The quantification window coordinates, in the form "5-10_100-101", where
the "_" delimits separate ranges and the "-" delimits the range itself.

Returns
-------
list of int
The include_idxs to be used for quantification, which is the quantification
window coordinates expanded to include all individual indexes contained therein.
"""
return [
i
for coord in split_quant_window_coordinates(quant_window_coordinates)
for i in range(coord[0], coord[1] + 1)
]


def get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, idxs):
"""Get the include_idxs from the quantification window coordinates, but adjusted according to s1ind.

Parameters
----------
quant_window_coordinates: str
The quantification window coordinates, in the form "5-10_100-101", where
the "_" delimits separate ranges and the "-" delimits the range itself.
s1inds: list of int
The index values mapped to the amplicon from which this is being cloned.
mbowcut2 marked this conversation as resolved.
Show resolved Hide resolved

Returns
-------
list of int
The include_idxs to be used for quantification, which is the quantification
window coordinates expanded to include all individual indexes contained therein,
but adjusted according to s1inds.
"""
include_idxs = []
for i, x in enumerate(idxs):
if i > 0:
if abs(idxs[i-1]) == x:
idxs[i] = -1 * abs(x)
mbowcut2 marked this conversation as resolved.
Show resolved Hide resolved
for coord in split_quant_window_coordinates(quant_window_coordinates):
include_idxs.extend(idxs[coord[0]:coord[1] + 1])

return list(filter(lambda x: x > 0, include_idxs))


def get_pe_scaffold_search(prime_edited_ref_sequence, prime_editing_pegRNA_extension_seq, prime_editing_pegRNA_scaffold_seq, prime_editing_pegRNA_scaffold_min_match_length):
"""
For prime editing, determines the scaffold string to search for (the shortest substring of args.prime_editing_pegRNA_scaffold_seq not in the prime-edited reference sequence)
Expand Down Expand Up @@ -1968,19 +2042,13 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
#quantification window coordinates override other options
if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "":
if amplicon_quant_window_coordinates_arr[this_ref_idx] != "":
this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[this_ref_idx]
this_include_idxs = get_include_idxs_from_quant_window_coordinates(amplicon_quant_window_coordinates_arr[this_ref_idx])
else:
this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[clone_ref_idx]
this_include_idxs = []
these_coords = this_quant_window_coordinates.split("_")
for coord in these_coords:
coordRE = re.match(r'^(\d+)-(\d+)$', coord)
if coordRE:
start = s1inds[int(coordRE.group(1))]
end = s1inds[int(coordRE.group(2)) + 1]
this_include_idxs.extend(range(start, end))
else:
raise NTException("Cannot parse analysis window coordinate '" + str(coord))
this_include_idxs = get_cloned_include_idxs_from_quant_window_coordinates(
amplicon_quant_window_coordinates_arr[clone_ref_idx],
s1inds.copy(),
)

#subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded)
this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs'])))
this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs))))
Expand Down
58 changes: 41 additions & 17 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -1680,6 +1680,46 @@ def set_guide_array(vals, guides, property_name):
return ret_array


def get_relative_coordinates(to_sequence, from_sequence):
"""Given an alignment, get the relative coordinates of the second sequence to the first.

For example, from_sequence[i] matches to to_sequence[inds[i]]. A `-1`
indicates a gap at the beginning of `to_sequence`.

Parameters
----------
to_sequence : str
The alignment of the first sequence (where the coordinates are relative to)
from_sequence : str
The alignment of the second sequence

Returns
-------
s1inds_gap_left : list of int
The relative coordinates of the second sequence to the first, where gaps
in the first sequence are filled with the left value.
s1inds_gap_right : list of int
The relative coordinates of the second sequence to the first, where gaps
in the first sequence are filled with the right value.
"""
s1inds_gap_left = []
s1inds_gap_right = []
s1idx_left = -1
s1idx_right = 0
s2idx = -1
for ix in range(len(to_sequence)):
if to_sequence[ix] != "-":
s1idx_left += 1
if from_sequence[ix] != "-":
s2idx += 1
s1inds_gap_left.append(s1idx_left)
s1inds_gap_right.append(s1idx_right)
if to_sequence[ix] != "-":
s1idx_right += 1

return s1inds_gap_left, s1inds_gap_right


def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_wunsch_gap_open,
needleman_wunsch_gap_extend):
"""
Expand All @@ -1701,23 +1741,7 @@ def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_
gap_open=needleman_wunsch_gap_open,
gap_extend=needleman_wunsch_gap_extend,
gap_incentive=this_gap_incentive)
# print(fws1)
# print(fws2)
s1inds_l = []
s1inds_r = []
s1ix_l = -1
s1ix_r = 0
s2ix = -1
for ix in range(len(fws1)):
if fws1[ix] != "-":
s1ix_l += 1
if fws2[ix] != "-":
s2ix += 1
s1inds_l.append(s1ix_l)
s1inds_r.append(s1ix_r)
if fws1[ix] != "-":
s1ix_r += 1
return s1inds_l, s1inds_r
return get_relative_coordinates(fws1, fws2)


def get_mismatches(seq_1, seq_2, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend):
Expand Down
101 changes: 100 additions & 1 deletion tests/unit_tests/test_CRISPRessoCORE.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Unit tests for CRISPResso2CORE."""
import pytest

from CRISPResso2 import CRISPRessoCORE
from CRISPResso2 import CRISPRessoCORE, CRISPRessoShared

def test_get_consensus_alignment_from_pairs():
"""Tests for generating consensus alignments from paired reads."""
Expand Down Expand Up @@ -93,6 +93,105 @@ def test_get_consensus_alignment_from_pairs():
assert ref_seq == "ATCGATCGAT"
assert score == 50 #double check this score... should be 5/10


Colelyman marked this conversation as resolved.
Show resolved Hide resolved
def test_split_quant_window_coordinates_single():
assert [(5, 10)] == CRISPRessoCORE.split_quant_window_coordinates('5-10')


def test_split_quant_window_coordinates_multiple():
assert CRISPRessoCORE.split_quant_window_coordinates('2-5_10-12') == [(2, 5), (10, 12)]


def test_split_quant_window_coordinates_error():
with pytest.raises(CRISPRessoShared.NTException):
mbowcut2 marked this conversation as resolved.
Show resolved Hide resolved
CRISPRessoCORE.split_quant_window_coordinates('a-5')


def test_split_quant_window_coordinates_empty():
with pytest.raises(CRISPRessoShared.NTException):
CRISPRessoCORE.split_quant_window_coordinates('_')


def test_split_quant_window_coordinates_partially_empty():
with pytest.raises(CRISPRessoShared.NTException):
CRISPRessoCORE.split_quant_window_coordinates('1-3_')


def test_split_quant_window_coordinates_blank():
with pytest.raises(CRISPRessoShared.NTException):
CRISPRessoCORE.split_quant_window_coordinates('')


def test_get_include_idxs_from_quant_window_coordinates():
quant_window_coordinates = '1-10_12-20'
assert CRISPRessoCORE.get_include_idxs_from_quant_window_coordinates(quant_window_coordinates) == [*list(range(1, 11)), *list(range(12, 21))]


def test_get_cloned_include_idxs_from_quant_window_coordinates():
quant_window_coordinates = '1-10_12-20'
s1inds = list(range(22))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 11)), *list(range(12, 21))]


def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_beginning():
quant_window_coordinates = '1-10_12-20'
# represents a 5bp insertion at the beginning (left)
s1inds = list(range(5, 27))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(6, 16)), *list(range(17, 26))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_beginning():
quant_window_coordinates = '1-10_12-20'
# represents a 5bp deletion at the beginning (left)
s1inds = [-1, -1, -1, -1, -1 ] + list(range(26))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *list(range(7, 16))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion():
quant_window_coordinates = '10-20_35-40'
# represents a 7bp deletion in the middle
s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 21)), *list(range(35-7, 41-7))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_modified():
quant_window_coordinates = '10-25_35-40'
# represents a 7bp deletion in the middle, where part of the QW is deleted
# [0, 1, 3, 4, ... , 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, ... , 33]
s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34))
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 23)), *list(range(35-7, 41-7))]


def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_end_modified():
# 5 bp deletion at end of 20 bp sequence
quant_window_coordinates = '1-5_10-20'
s1inds = [*list(range(16)), *[15, 15, 15, 15, 15]]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *list(range(10, 16))]

def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion():
# 5 bp deletion and 5 bp insertion
quant_window_coordinates = '1-5_10-20'
s1inds = [0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 15]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *[6, 7, 8, 9, 15]]

def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion_modified():
# 5 bp deletion and 5 bp insertion with modified QW
pytest.xfail('Not implemented yet')

def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_across_qw():
# 6 bp insertion in middle of 4 bp sequence
quant_window_coordinates = '1-4'
s1inds = [0,1,2,9,10]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1,2,9,10]

def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_entire_qw():
# 5 bp deletion of entire qw
quant_window_coordinates = '1-4_7-10'
s1inds = [0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6]
assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1, 2, 3, 4]

# NOTE: include_idxs should be exactly the numbers as specificed in the string, inclusively
# For example: 20-30_175-185 --> [20, 21, ..., 30, 175, 176, ..., 185]

# BUG: you can't include the last base in the QW or you get an out of range error -- this is extant beyond the function tested here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be addressed in the PR? Or would it be better addressed in a separate PR?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PS, what would Uncle Bob think of this comment? ;)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left this as a reminder. It worked! haha

So, this touches on a key issue: QWC's are (supposedly) 1-indexed. All of these changes assume that downstream calculations correctly utilize 1-index-based coordinates as input by the user, i.e. the user inputs '1-5' --> include_idxs == [1, 2, 3, 4, 5]. The test cases verify that these indexes are selected correctly.

So, I think the only necessary change to get the last char in the sequence would be to append one more index onto s1inds before we slice it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think technically they should be 0-indexed (but a value of 0 will disable the filter).

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per the documentation "Indexes are 0-based, meaning that the first nucleotide is position 0." Thanks @kclem. So, there is no last bp bug.

However, including 0 in the qwc throws a BadParameterException, I'm assuming because 0 disables the filter. Should we keep this behavior as is?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mbowcut2 I think it should behave such that if you have -qwc 0-10_15-22,0 the first amplicon will have quantification windows 0-10 and 15-22 and then the second amplicon will have the quantification windo coordinates disabled (and therefore be set by where the sgRNA aligns).

Also, -qwc 0 will outright disable it.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Colelyman okay, it works now.

if __name__ == "__main__":
# execute only if run as a script
test_get_consensus_alignment_from_pairs()
31 changes: 31 additions & 0 deletions tests/unit_tests/test_CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,34 @@ def test_get_mismatches():
-3,
)
assert len(mismatch_cords) == 6


def test_get_relative_coordinates():
s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'TTCGT')
assert s1inds_gap_left == [0, 1, 2, 3, 4]
assert s1inds_gap_right == [0, 1, 2, 3, 4]


def test_get_relative_coordinates_to_gap():
to_sequence = 'TTC-T'
from_sequence = 'TTCGT'
s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates(to_sequence, from_sequence)
assert s1inds_gap_left == [0, 1, 2, 2, 3] # should this be [0, 1, 2, 2, 4]?
assert s1inds_gap_right == [0, 1, 2, 3, 3] # should this be [0, 1, 2, 4, 4]?
assert from_sequence[0] == to_sequence[s1inds_gap_left[0]]
assert from_sequence[1] == to_sequence[s1inds_gap_left[1]]
assert from_sequence[2] == to_sequence[s1inds_gap_left[2]]
assert from_sequence[2] == to_sequence[s1inds_gap_left[2]]
# assert from_sequence[4] == to_sequence[s1inds_gap_left[4]] # this should accept, right?
mbowcut2 marked this conversation as resolved.
Show resolved Hide resolved


def test_get_relative_coordinates_start_gap():
s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('--CGT', 'TTCGT')
assert s1inds_gap_left == [-1, -1, 0, 1, 2] # should this be [-1, -1, 2, 3, 4]?
assert s1inds_gap_right == [0, 0, 0, 1, 2] # should this be [2, 2, 2, 3, 4]?


def test_get_relative_coordinates_from_gap():
s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'ATC-T')
assert s1inds_gap_left == [0, 1, 2, 4]
assert s1inds_gap_right == [0, 1, 2, 4]
Loading