Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable quantification by sgRNA #348

Merged
merged 3 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CRISPResso2/CRISPRessoBatchCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def main():
discard_guide_positions_overhanging_amplicon_edge = False
if 'discard_guide_positions_overhanging_amplicon_edge' in row:
discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs,
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers,
guide_qw_sizes, curr_amplicon_quant_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge)
for guide_seq in this_sgRNA_sequences:
Expand Down
3 changes: 2 additions & 1 deletion CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -1688,7 +1688,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited


# Calculate cut sites for this reference
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs,
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(this_seq, this_guides, this_guide_mismatches, this_guide_names, this_guide_qw_centers,
this_guide_qw_sizes, this_quant_window_coordinates, args.exclude_bp_from_left, args.exclude_bp_from_right, args.plot_window_size, this_guide_plot_cut_points, args.discard_guide_positions_overhanging_amplicon_edge)

Expand Down Expand Up @@ -1777,6 +1777,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
'sgRNA_names': this_sgRNA_names,
'sgRNA_mismatches': this_sgRNA_mismatches,
'sgRNA_orig_sequences': this_sgRNA_orig_seqs,
'sgRNA_include_idxs': this_sgRNA_include_idxs,
'contains_guide': this_contains_guide,
'contains_coding_seq': this_contains_coding_seq,
'exon_positions': this_exon_positions,
Expand Down
2 changes: 1 addition & 1 deletion CRISPResso2/CRISPRessoMetaCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def main():
if 'discard_guide_positions_overhanging_amplicon_edge' in row:
discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge

(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_names, this_include_idxs,
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers,
guide_qw_sizes, row.quantification_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge)
for guide_seq in this_sgRNA_sequences:
Expand Down
14 changes: 12 additions & 2 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
this_sgRNA_plot_idxs : list of indices to be plotted for each sgRNA
this_sgRNA_mismatches: list of mismatches between the guide and the amplicon
this_sgRNA_names : list of names for each sgRNA (to disambiguate in case a sequence aligns to multiple positions)
this_sgRNA_include_idxs : list of indices to be included in quantification per guide
this_include_idxs : list of indices to be included in quantification
this_exclude_idxs : list of indices to be excluded from quantification
"""
Expand All @@ -1364,6 +1365,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
this_sgRNA_plot_idxs = []
this_sgRNA_mismatches = []
this_sgRNA_names = []
this_sgRNA_include_idxs = []
this_include_idxs = []
this_exclude_idxs = []

Expand Down Expand Up @@ -1396,7 +1398,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
rv_matches = re.finditer(rv_regex, ref_seq, flags=re.IGNORECASE)

# for every match, append:
# this_sgRNA_cut_points, this_sgRNA_intervals,this_sgRNA_mismatches,this_sgRNA_names,this_sgRNA_sequences,include_idxs
# this_sgRNA_cut_points, this_sgRNA_intervals,this_sgRNA_mismatches,this_sgRNA_names,this_sgRNA_sequences,this_sgRNA_include_idxs,include_idxs
for m in fw_matches:
cut_p = m.start() + offset_fw
if discard_guide_positions_overhanging_amplicon_edge and (
Expand All @@ -1411,6 +1413,9 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
st = max(0, cut_p - quantification_window_sizes[guide_idx] + 1)
en = min(ref_seq_length - 1, cut_p + quantification_window_sizes[guide_idx] + 1)
this_include_idxs.extend(range(st, en))
this_sgRNA_include_idxs.append(list(range(st, en)))
else:
this_sgRNA_include_idxs.append([])

this_sgRNA_name = guide_names[guide_idx]
if match_count == 1:
Expand Down Expand Up @@ -1440,6 +1445,9 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
st = max(0, cut_p - quantification_window_sizes[guide_idx] + 1)
en = min(ref_seq_length - 1, cut_p + quantification_window_sizes[guide_idx] + 1)
this_include_idxs.extend(range(st, en))
this_sgRNA_include_idxs.append(list(range(st, en)))
else:
this_sgRNA_include_idxs.append([])

this_sgRNA_name = guide_names[guide_idx]
if match_count == 1:
Expand Down Expand Up @@ -1475,11 +1483,13 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
theseCoords) + "'. Coordinates must be given in the form start-end e.g. 5-10 . Please check the --analysis_window_coordinate parameter.")
given_include_idxs = coordinate_include_idxs
this_include_idxs = coordinate_include_idxs
this_sgRNA_include_idxs = [[]] * len(this_sgRNA_include_idxs)
# otherwise, take quantification centers + windows calculated for each guide above
elif this_sgRNA_cut_points and len(this_include_idxs) > 0:
given_include_idxs = this_include_idxs
else:
this_include_idxs = range(ref_seq_length)
this_sgRNA_include_idxs = [[]] * len(this_sgRNA_include_idxs)

# flatten the arrays to avoid errors with old numpy library
this_include_idxs = np.ravel(this_include_idxs)
Expand Down Expand Up @@ -1519,7 +1529,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
this_include_idxs = np.sort(list(this_include_idxs))
this_exclude_idxs = np.sort(list(this_exclude_idxs))

return this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs, this_exclude_idxs
return this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs, this_exclude_idxs


def set_guide_array(vals, guides, property_name):
Expand Down
91 changes: 91 additions & 0 deletions scripts/count_sgRNA_specific_edits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import argparse
import os
import pandas as pd
import zipfile

from collections import defaultdict
from CRISPResso2 import CRISPRessoShared

def count_sgRNA_specific_edits(crispresso_output_folder):
crispresso2_info = CRISPRessoShared.load_crispresso_info(crispresso_output_folder)

run_version = crispresso2_info['running_info']['version']
version_parts = run_version.split('.')
if int(version_parts[0]) != 2 or int(version_parts[1]) < 2 or int(version_parts[2]) < 15:
raise Exception('CRISPResso run must be run with CRISPResso2 v2.2.15 or later (this run was run with version v' + str(run_version) + ')')

if not crispresso2_info['running_info']['args'].write_detailed_allele_table:
raise Exception('CRISPResso run must be run with the parameter --write_detailed_allele_table')

include_idxs = {} # ref_name > guide_name > idxs
all_reference_guides = {} # order of guides for each reference
modified_counts_by_amplicon = {} # ref_name > category > count
for reference_name in crispresso2_info['results']['ref_names']:
include_idxs[reference_name] = {}
all_reference_guides[reference_name] = []
modified_counts_by_amplicon[reference_name] = defaultdict(int)
for idx, guide_name in enumerate(crispresso2_info['results']['refs'][reference_name]['sgRNA_names']):
this_guide_name = guide_name
if this_guide_name == '':
this_guide_name = crispresso2_info['results']['refs'][reference_name]['sgRNA_orig_sequences'][idx]
if this_guide_name in include_idxs[reference_name]:
this_guide_name = this_guide_name + '_' + str(crispresso2_info['results']['refs'][reference_name]['sgRNA_intervals'][idx][0])
include_idxs[reference_name][this_guide_name] = crispresso2_info['results']['refs'][reference_name]['sgRNA_include_idxs'][idx]
all_reference_guides[reference_name].append(this_guide_name)


z = zipfile.ZipFile(os.path.join(crispresso_output_folder, crispresso2_info['running_info']['allele_frequency_table_zip_filename']))
zf = z.open(crispresso2_info['running_info']['allele_frequency_table_filename'])
df_alleles = pd.read_csv(zf,sep="\t")

read_alleles_count = 0 # all alleles read
reference_count = defaultdict(int) # counts of modification at reference level
reference_modified_count = defaultdict(int)
total_counts = defaultdict(int)
modified_counts = defaultdict(int)
for idx, row in df_alleles.iterrows():
read_alleles_count += row['#Reads']
this_reference_name = row['Reference_Name']
if this_reference_name in include_idxs: # sometimes (e.g. AMBIGUOUS) reads aren't assigned to a reference, so exclude them here..
reference_count[this_reference_name] += row['#Reads']

this_allele_modified_guide_names = []
ref_is_modified = False
row_insertion_positions = [int(x) for x in row['insertion_positions'][1:-1].split(",")] if row['insertion_positions'] != '[]' else []
row_deletion_positions = [int(x) for x in row['deletion_positions'][1:-1].split(",")] if row['deletion_positions'] != '[]' else []
row_substitution_positions = [int(x) for x in row['substitution_positions'][1:-1].split(",")] if row['substitution_positions'] != '[]' else []
for guide_name in all_reference_guides[this_reference_name]:
is_modified = False
for ind in include_idxs[this_reference_name][guide_name]:
if ind in row_insertion_positions:
is_modified = True
if ind in row_deletion_positions:
is_modified = True
if ind in row_substitution_positions:
is_modified = True
if is_modified:
this_allele_modified_guide_names.append(guide_name)
ref_is_modified = True

this_category = 'UNMODIFIED'
if ref_is_modified:
reference_modified_count[this_reference_name] += row['#Reads']
this_category = 'MODIFIED ' + ' + '.join(this_allele_modified_guide_names)
modified_counts_by_amplicon[this_reference_name][this_category] += row['#Reads']


print('Processed ' + str(read_alleles_count) + ' alleles')
for reference_name in crispresso2_info['results']['ref_names']:
print ('Reference: ' + reference_name + ' (' + str(reference_modified_count[reference_name]) + '/' + str(reference_count[reference_name]) + ' modified reads)')
for category in modified_counts_by_amplicon[reference_name]:
print("\t" + category + ": " + str(modified_counts_by_amplicon[reference_name][category]))



if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Count sgRNA-specific edits')
parser.add_argument("-f", "--folder", help="CRISPResso output folder", required=True)
args = parser.parse_args()

count_sgRNA_specific_edits(args.folder)