Skip to content

Commit

Permalink
Enable quantification by sgRNA (pinellolab#348)
Browse files Browse the repository at this point in the history
This PR includes:
- storing the sgRNA-specific editing locations in the crispresso2_info object. Previously, each amplicon would record the indices of quantification windows across the guide, but not for individual guides. This stores the information for each guide in crispresso2_info['results']['refs'][reference_name]['sgRNA_include_idxs']
- a script (count_sgRNA_specific_edits.py) to parse through an allele table output from a completed CRISPResso run (`--write_detailed_allele_table` flag required) to count edits in each sgRNA separately.

I don't have a good double-edited sample handy, but it can be run on the demo HDR data [hdr.fastq.gz](http://crispresso.pinellolab.org/static/demo/hdr.fastq.gz) using the command:

```

CRISPResso -r1 hdr.fastq.gz -a acatttgcttctgacacaactgtgttcactagcaacctcaaacagacaccatggtgcatctgactcctgTggagaagtctgccgttactgccctgtggggcaaggtgaacgtggatgaagttggtggtgaggccctgggcaggttggtatcaaggtta -e acatttgcttctgacacaactgtgttcactagcaacctcaaacagacaccatggtgcaCctgactccGgaggagaagtctgccgttactgcGctgtggggcaaggtgaacgtggatgaagttggtggtgaggccctgggcaggttggtatcaaggtta -c atggtgcatctgactcctgTggagaagtctgccgttactgccctgtggggcaaggtgaacgtggatgaagttggtggtgaggccctgggcag -g TGCACCATGGTGTCTGTTTG,GATGAAGTTGGTGGTGAGGCCC --write_detailed_allele_table  -n hdr3 -p max -gn guide1,guide2
```

```
python CRISPResso2/scripts/count_sgRNA_specific_edits.py -f CRISPResso_on_hdr3
```

This produces: 
```
Processed 25000 alleles
Reference: Reference (2391/23415 modified reads)
        UNMODIFIED: 21024
        MODIFIED guide1: 2359
        MODIFIED guide2: 32
Reference: HDR (856/1577 modified reads)
        UNMODIFIED: 721
        MODIFIED guide1: 854
        MODIFIED guide1 + guide2: 1
        MODIFIED guide2: 1
 ```
  • Loading branch information
kclem authored and Colelyman committed Dec 22, 2023
1 parent 97da958 commit 0f7d0a5
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 5 deletions.
2 changes: 1 addition & 1 deletion CRISPResso2/CRISPRessoBatchCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def main():
discard_guide_positions_overhanging_amplicon_edge = False
if 'discard_guide_positions_overhanging_amplicon_edge' in row:
discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs,
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers,
guide_qw_sizes, curr_amplicon_quant_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge)
for guide_seq in this_sgRNA_sequences:
Expand Down
3 changes: 2 additions & 1 deletion CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -1688,7 +1688,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited


# Calculate cut sites for this reference
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs,
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(this_seq, this_guides, this_guide_mismatches, this_guide_names, this_guide_qw_centers,
this_guide_qw_sizes, this_quant_window_coordinates, args.exclude_bp_from_left, args.exclude_bp_from_right, args.plot_window_size, this_guide_plot_cut_points, args.discard_guide_positions_overhanging_amplicon_edge)

Expand Down Expand Up @@ -1777,6 +1777,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
'sgRNA_names': this_sgRNA_names,
'sgRNA_mismatches': this_sgRNA_mismatches,
'sgRNA_orig_sequences': this_sgRNA_orig_seqs,
'sgRNA_include_idxs': this_sgRNA_include_idxs,
'contains_guide': this_contains_guide,
'contains_coding_seq': this_contains_coding_seq,
'exon_positions': this_exon_positions,
Expand Down
2 changes: 1 addition & 1 deletion CRISPResso2/CRISPRessoMetaCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def main():
if 'discard_guide_positions_overhanging_amplicon_edge' in row:
discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge

(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_names, this_include_idxs,
(this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers,
guide_qw_sizes, row.quantification_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge)
for guide_seq in this_sgRNA_sequences:
Expand Down
14 changes: 12 additions & 2 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
this_sgRNA_plot_idxs : list of indices to be plotted for each sgRNA
this_sgRNA_mismatches: list of mismatches between the guide and the amplicon
this_sgRNA_names : list of names for each sgRNA (to disambiguate in case a sequence aligns to multiple positions)
this_sgRNA_include_idxs : list of indices to be included in quantification per guide
this_include_idxs : list of indices to be included in quantification
this_exclude_idxs : list of indices to be excluded from quantification
"""
Expand All @@ -1364,6 +1365,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
this_sgRNA_plot_idxs = []
this_sgRNA_mismatches = []
this_sgRNA_names = []
this_sgRNA_include_idxs = []
this_include_idxs = []
this_exclude_idxs = []

Expand Down Expand Up @@ -1396,7 +1398,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
rv_matches = re.finditer(rv_regex, ref_seq, flags=re.IGNORECASE)

# for every match, append:
# this_sgRNA_cut_points, this_sgRNA_intervals,this_sgRNA_mismatches,this_sgRNA_names,this_sgRNA_sequences,include_idxs
# this_sgRNA_cut_points, this_sgRNA_intervals,this_sgRNA_mismatches,this_sgRNA_names,this_sgRNA_sequences,this_sgRNA_include_idxs,include_idxs
for m in fw_matches:
cut_p = m.start() + offset_fw
if discard_guide_positions_overhanging_amplicon_edge and (
Expand All @@ -1411,6 +1413,9 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
st = max(0, cut_p - quantification_window_sizes[guide_idx] + 1)
en = min(ref_seq_length - 1, cut_p + quantification_window_sizes[guide_idx] + 1)
this_include_idxs.extend(range(st, en))
this_sgRNA_include_idxs.append(list(range(st, en)))
else:
this_sgRNA_include_idxs.append([])

this_sgRNA_name = guide_names[guide_idx]
if match_count == 1:
Expand Down Expand Up @@ -1440,6 +1445,9 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
st = max(0, cut_p - quantification_window_sizes[guide_idx] + 1)
en = min(ref_seq_length - 1, cut_p + quantification_window_sizes[guide_idx] + 1)
this_include_idxs.extend(range(st, en))
this_sgRNA_include_idxs.append(list(range(st, en)))
else:
this_sgRNA_include_idxs.append([])

this_sgRNA_name = guide_names[guide_idx]
if match_count == 1:
Expand Down Expand Up @@ -1475,11 +1483,13 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
theseCoords) + "'. Coordinates must be given in the form start-end e.g. 5-10 . Please check the --analysis_window_coordinate parameter.")
given_include_idxs = coordinate_include_idxs
this_include_idxs = coordinate_include_idxs
this_sgRNA_include_idxs = [[]] * len(this_sgRNA_include_idxs)
# otherwise, take quantification centers + windows calculated for each guide above
elif this_sgRNA_cut_points and len(this_include_idxs) > 0:
given_include_idxs = this_include_idxs
else:
this_include_idxs = range(ref_seq_length)
this_sgRNA_include_idxs = [[]] * len(this_sgRNA_include_idxs)

# flatten the arrays to avoid errors with old numpy library
this_include_idxs = np.ravel(this_include_idxs)
Expand Down Expand Up @@ -1519,7 +1529,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
this_include_idxs = np.sort(list(this_include_idxs))
this_exclude_idxs = np.sort(list(this_exclude_idxs))

return this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs, this_exclude_idxs
return this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs, this_exclude_idxs


def set_guide_array(vals, guides, property_name):
Expand Down
91 changes: 91 additions & 0 deletions scripts/count_sgRNA_specific_edits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import argparse
import os
import pandas as pd
import zipfile

from collections import defaultdict
from CRISPResso2 import CRISPRessoShared

def count_sgRNA_specific_edits(crispresso_output_folder):
crispresso2_info = CRISPRessoShared.load_crispresso_info(crispresso_output_folder)

run_version = crispresso2_info['running_info']['version']
version_parts = run_version.split('.')
if int(version_parts[0]) != 2 or int(version_parts[1]) < 2 or int(version_parts[2]) < 15:
raise Exception('CRISPResso run must be run with CRISPResso2 v2.2.15 or later (this run was run with version v' + str(run_version) + ')')

if not crispresso2_info['running_info']['args'].write_detailed_allele_table:
raise Exception('CRISPResso run must be run with the parameter --write_detailed_allele_table')

include_idxs = {} # ref_name > guide_name > idxs
all_reference_guides = {} # order of guides for each reference
modified_counts_by_amplicon = {} # ref_name > category > count
for reference_name in crispresso2_info['results']['ref_names']:
include_idxs[reference_name] = {}
all_reference_guides[reference_name] = []
modified_counts_by_amplicon[reference_name] = defaultdict(int)
for idx, guide_name in enumerate(crispresso2_info['results']['refs'][reference_name]['sgRNA_names']):
this_guide_name = guide_name
if this_guide_name == '':
this_guide_name = crispresso2_info['results']['refs'][reference_name]['sgRNA_orig_sequences'][idx]
if this_guide_name in include_idxs[reference_name]:
this_guide_name = this_guide_name + '_' + str(crispresso2_info['results']['refs'][reference_name]['sgRNA_intervals'][idx][0])
include_idxs[reference_name][this_guide_name] = crispresso2_info['results']['refs'][reference_name]['sgRNA_include_idxs'][idx]
all_reference_guides[reference_name].append(this_guide_name)


z = zipfile.ZipFile(os.path.join(crispresso_output_folder, crispresso2_info['running_info']['allele_frequency_table_zip_filename']))
zf = z.open(crispresso2_info['running_info']['allele_frequency_table_filename'])
df_alleles = pd.read_csv(zf,sep="\t")

read_alleles_count = 0 # all alleles read
reference_count = defaultdict(int) # counts of modification at reference level
reference_modified_count = defaultdict(int)
total_counts = defaultdict(int)
modified_counts = defaultdict(int)
for idx, row in df_alleles.iterrows():
read_alleles_count += row['#Reads']
this_reference_name = row['Reference_Name']
if this_reference_name in include_idxs: # sometimes (e.g. AMBIGUOUS) reads aren't assigned to a reference, so exclude them here..
reference_count[this_reference_name] += row['#Reads']

this_allele_modified_guide_names = []
ref_is_modified = False
row_insertion_positions = [int(x) for x in row['insertion_positions'][1:-1].split(",")] if row['insertion_positions'] != '[]' else []
row_deletion_positions = [int(x) for x in row['deletion_positions'][1:-1].split(",")] if row['deletion_positions'] != '[]' else []
row_substitution_positions = [int(x) for x in row['substitution_positions'][1:-1].split(",")] if row['substitution_positions'] != '[]' else []
for guide_name in all_reference_guides[this_reference_name]:
is_modified = False
for ind in include_idxs[this_reference_name][guide_name]:
if ind in row_insertion_positions:
is_modified = True
if ind in row_deletion_positions:
is_modified = True
if ind in row_substitution_positions:
is_modified = True
if is_modified:
this_allele_modified_guide_names.append(guide_name)
ref_is_modified = True

this_category = 'UNMODIFIED'
if ref_is_modified:
reference_modified_count[this_reference_name] += row['#Reads']
this_category = 'MODIFIED ' + ' + '.join(this_allele_modified_guide_names)
modified_counts_by_amplicon[this_reference_name][this_category] += row['#Reads']


print('Processed ' + str(read_alleles_count) + ' alleles')
for reference_name in crispresso2_info['results']['ref_names']:
print ('Reference: ' + reference_name + ' (' + str(reference_modified_count[reference_name]) + '/' + str(reference_count[reference_name]) + ' modified reads)')
for category in modified_counts_by_amplicon[reference_name]:
print("\t" + category + ": " + str(modified_counts_by_amplicon[reference_name][category]))



if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Count sgRNA-specific edits')
parser.add_argument("-f", "--folder", help="CRISPResso output folder", required=True)
args = parser.parse_args()

count_sgRNA_specific_edits(args.folder)

0 comments on commit 0f7d0a5

Please sign in to comment.