pinellolab · kclem · Oct 11, 2023 · Oct 11, 2023 · Oct 11, 2023 · Oct 11, 2023
diff --git a/CRISPResso2/CRISPRessoBatchCORE.py b/CRISPResso2/CRISPRessoBatchCORE.py
@@ -262,7 +262,7 @@ def main():
                     discard_guide_positions_overhanging_amplicon_edge = False
                     if 'discard_guide_positions_overhanging_amplicon_edge' in row:
                         discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge
-                    (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs,
+                    (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
                         this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers,
                         guide_qw_sizes, curr_amplicon_quant_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge)
                     for guide_seq in this_sgRNA_sequences:

diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
@@ -1688,7 +1688,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
 
 
             # Calculate cut sites for this reference
-            (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs,
+            (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
                 this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(this_seq, this_guides, this_guide_mismatches, this_guide_names, this_guide_qw_centers,
                 this_guide_qw_sizes, this_quant_window_coordinates, args.exclude_bp_from_left, args.exclude_bp_from_right, args.plot_window_size, this_guide_plot_cut_points, args.discard_guide_positions_overhanging_amplicon_edge)
 
@@ -1777,6 +1777,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
                    'sgRNA_names': this_sgRNA_names,
                    'sgRNA_mismatches': this_sgRNA_mismatches,
                    'sgRNA_orig_sequences': this_sgRNA_orig_seqs,
+                   'sgRNA_include_idxs': this_sgRNA_include_idxs,
                    'contains_guide': this_contains_guide,
                    'contains_coding_seq': this_contains_coding_seq,
                    'exon_positions': this_exon_positions,

diff --git a/CRISPResso2/CRISPRessoMetaCORE.py b/CRISPResso2/CRISPRessoMetaCORE.py
@@ -196,7 +196,7 @@ def main():
                     if 'discard_guide_positions_overhanging_amplicon_edge' in row:
                         discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge
 
-                    (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_names, this_include_idxs,
+                    (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs,
                         this_exclude_idxs) = CRISPRessoShared.get_amplicon_info_for_guides(curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers,
                         guide_qw_sizes, row.quantification_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge)
                     for guide_seq in this_sgRNA_sequences:

diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
@@ -1352,6 +1352,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
     this_sgRNA_plot_idxs : list of indices to be plotted for each sgRNA
     this_sgRNA_mismatches: list of mismatches between the guide and the amplicon
     this_sgRNA_names : list of names for each sgRNA (to disambiguate in case a sequence aligns to multiple positions)
+    this_sgRNA_include_idxs : list of indices to be included in quantification per guide
     this_include_idxs : list of indices to be included in quantification
     this_exclude_idxs : list of indices to be excluded from quantification
     """
@@ -1364,6 +1365,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
     this_sgRNA_plot_idxs = []
     this_sgRNA_mismatches = []
     this_sgRNA_names = []
+    this_sgRNA_include_idxs = []
     this_include_idxs = []
     this_exclude_idxs = []
 
@@ -1396,7 +1398,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
         rv_matches = re.finditer(rv_regex, ref_seq, flags=re.IGNORECASE)
 
         # for every match, append:
-        # this_sgRNA_cut_points, this_sgRNA_intervals,this_sgRNA_mismatches,this_sgRNA_names,this_sgRNA_sequences,include_idxs
+        # this_sgRNA_cut_points, this_sgRNA_intervals,this_sgRNA_mismatches,this_sgRNA_names,this_sgRNA_sequences,this_sgRNA_include_idxs,include_idxs
         for m in fw_matches:
             cut_p = m.start() + offset_fw
             if discard_guide_positions_overhanging_amplicon_edge and (
@@ -1411,6 +1413,9 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
                 st = max(0, cut_p - quantification_window_sizes[guide_idx] + 1)
                 en = min(ref_seq_length - 1, cut_p + quantification_window_sizes[guide_idx] + 1)
                 this_include_idxs.extend(range(st, en))
+                this_sgRNA_include_idxs.append(list(range(st, en)))
+            else:
+                this_sgRNA_include_idxs.append([])
 
             this_sgRNA_name = guide_names[guide_idx]
             if match_count == 1:
@@ -1440,6 +1445,9 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
                 st = max(0, cut_p - quantification_window_sizes[guide_idx] + 1)
                 en = min(ref_seq_length - 1, cut_p + quantification_window_sizes[guide_idx] + 1)
                 this_include_idxs.extend(range(st, en))
+                this_sgRNA_include_idxs.append(list(range(st, en)))
+            else:
+                this_sgRNA_include_idxs.append([])
 
             this_sgRNA_name = guide_names[guide_idx]
             if match_count == 1:
@@ -1475,11 +1483,13 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
                     theseCoords) + "'. Coordinates must be given in the form start-end e.g. 5-10 . Please check the --analysis_window_coordinate parameter.")
         given_include_idxs = coordinate_include_idxs
         this_include_idxs = coordinate_include_idxs
+        this_sgRNA_include_idxs = [[]] * len(this_sgRNA_include_idxs)
     # otherwise, take quantification centers + windows calculated for each guide above
     elif this_sgRNA_cut_points and len(this_include_idxs) > 0:
         given_include_idxs = this_include_idxs
     else:
         this_include_idxs = range(ref_seq_length)
+        this_sgRNA_include_idxs = [[]] * len(this_sgRNA_include_idxs)
 
     # flatten the arrays to avoid errors with old numpy library
     this_include_idxs = np.ravel(this_include_idxs)
@@ -1519,7 +1529,7 @@ def get_amplicon_info_for_guides(ref_seq, guides, guide_mismatches, guide_names,
     this_include_idxs = np.sort(list(this_include_idxs))
     this_exclude_idxs = np.sort(list(this_exclude_idxs))
 
-    return this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs, this_exclude_idxs
+    return this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_sgRNA_include_idxs, this_include_idxs, this_exclude_idxs
 
 
 def set_guide_array(vals, guides, property_name):

diff --git a/scripts/count_sgRNA_specific_edits.py b/scripts/count_sgRNA_specific_edits.py
@@ -0,0 +1,91 @@
+import argparse
+import os
+import pandas as pd
+import zipfile
+
+from collections import defaultdict
+from CRISPResso2 import CRISPRessoShared
+
+def count_sgRNA_specific_edits(crispresso_output_folder):
+    crispresso2_info = CRISPRessoShared.load_crispresso_info(crispresso_output_folder)
+
+    run_version = crispresso2_info['running_info']['version']
+    version_parts = run_version.split('.')
+    if int(version_parts[0]) != 2 or int(version_parts[1]) < 2 or int(version_parts[2]) < 15:
+        raise Exception('CRISPResso run must be run with CRISPResso2 v2.2.15 or later (this run was run with version v' + str(run_version) + ')')
+
+    if not crispresso2_info['running_info']['args'].write_detailed_allele_table:
+        raise Exception('CRISPResso run must be run with the parameter --write_detailed_allele_table')
+
+    include_idxs = {} # ref_name > guide_name > idxs
+    all_reference_guides = {} # order of guides for each reference
+    modified_counts_by_amplicon = {} # ref_name > category > count
+    for reference_name in crispresso2_info['results']['ref_names']:
+        include_idxs[reference_name] = {}
+        all_reference_guides[reference_name] = []
+        modified_counts_by_amplicon[reference_name] = defaultdict(int)
+        for idx, guide_name in enumerate(crispresso2_info['results']['refs'][reference_name]['sgRNA_names']):
+            this_guide_name = guide_name
+            if this_guide_name == '':
+                this_guide_name = crispresso2_info['results']['refs'][reference_name]['sgRNA_orig_sequences'][idx]
+            if this_guide_name in include_idxs[reference_name]:
+                this_guide_name = this_guide_name + '_' + str(crispresso2_info['results']['refs'][reference_name]['sgRNA_intervals'][idx][0])
+            include_idxs[reference_name][this_guide_name] = crispresso2_info['results']['refs'][reference_name]['sgRNA_include_idxs'][idx]
+            all_reference_guides[reference_name].append(this_guide_name)
+
+
+    z = zipfile.ZipFile(os.path.join(crispresso_output_folder, crispresso2_info['running_info']['allele_frequency_table_zip_filename']))
+    zf = z.open(crispresso2_info['running_info']['allele_frequency_table_filename'])
+    df_alleles = pd.read_csv(zf,sep="\t")
+
+    read_alleles_count = 0 # all alleles read
+    reference_count = defaultdict(int) # counts of modification at reference level
+    reference_modified_count = defaultdict(int)
+    total_counts = defaultdict(int)
+    modified_counts = defaultdict(int)
+    for idx, row in df_alleles.iterrows():
+        read_alleles_count += row['#Reads']
+        this_reference_name = row['Reference_Name']
+        if this_reference_name in include_idxs: # sometimes (e.g. AMBIGUOUS) reads aren't assigned to a reference, so exclude them here..
+            reference_count[this_reference_name] += row['#Reads']
+
+            this_allele_modified_guide_names = []
+            ref_is_modified = False
+            row_insertion_positions = [int(x) for x in row['insertion_positions'][1:-1].split(",")] if row['insertion_positions'] != '[]' else []
+            row_deletion_positions = [int(x) for x in row['deletion_positions'][1:-1].split(",")] if row['deletion_positions'] != '[]' else []
+            row_substitution_positions = [int(x) for x in row['substitution_positions'][1:-1].split(",")] if row['substitution_positions'] != '[]' else []
+            for guide_name in all_reference_guides[this_reference_name]:
+                is_modified = False
+                for ind in include_idxs[this_reference_name][guide_name]:
+                    if ind in row_insertion_positions:
+                        is_modified = True
+                    if ind in row_deletion_positions:
+                        is_modified = True
+                    if ind in row_substitution_positions:
+                        is_modified = True
+                if is_modified:
+                    this_allele_modified_guide_names.append(guide_name)
+                    ref_is_modified = True
+
+            this_category = 'UNMODIFIED'
+            if ref_is_modified:
+                reference_modified_count[this_reference_name] += row['#Reads']
+                this_category = 'MODIFIED ' + ' + '.join(this_allele_modified_guide_names)
+            modified_counts_by_amplicon[this_reference_name][this_category] += row['#Reads']
+
+
+    print('Processed ' + str(read_alleles_count) + ' alleles')
+    for reference_name in crispresso2_info['results']['ref_names']:
+        print ('Reference: ' + reference_name + ' (' + str(reference_modified_count[reference_name]) + '/' + str(reference_count[reference_name]) + ' modified reads)')
+        for category in modified_counts_by_amplicon[reference_name]:
+            print("\t" + category + ": " + str(modified_counts_by_amplicon[reference_name][category]))
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Count sgRNA-specific edits')
+    parser.add_argument("-f", "--folder", help="CRISPResso output folder", required=True)
+    args = parser.parse_args()
+
+    count_sgRNA_specific_edits(args.folder)
+