From 58f1f08b1c112da74f4737cf6d0f6a8fee339c48 Mon Sep 17 00:00:00 2001 From: Jayoung Ryu Date: Mon, 6 May 2024 14:58:51 -0400 Subject: [PATCH] filter out 0 count gRNAs during run prep & update no edit run docs --- bean/annotate/utils.py | 17 +++++++++++++++++ bean/cli/filter.py | 4 +++- bean/framework/ReporterScreen.py | 13 +++++++++++-- bean/plotting/allele_stats.py | 7 +++++-- bean/preprocessing/utils.py | 5 +++++ docs/_create-screen.md | 4 ++-- docs/_tutorial_no_edit.md | 8 ++++++-- 7 files changed, 49 insertions(+), 9 deletions(-) diff --git a/bean/annotate/utils.py b/bean/annotate/utils.py index 2ece30f..e6cca3e 100755 --- a/bean/annotate/utils.py +++ b/bean/annotate/utils.py @@ -294,6 +294,18 @@ def parse_args(parser=None): default=None, help="Plasmid ReporterScreen object path. If provided, alleles are filtered based on if a nucleotide edit is more significantly enriched in sample compared to the plasmid data. Negative control data where no edit is expected can be fed in instead of plasmid library.", ) + parser.add_argument( + "--reporter-length", + type=int, + default=32, + help="Length of reporter sequence in the construct.", + ) + parser.add_argument( + "--reporter-right-flank-length", + type=int, + default=6, + help="Length of the right-flanking nucleotides of protospacer in the reporter.", + ) parser.add_argument( "--edit-start-pos", "-s", @@ -315,6 +327,11 @@ def parse_args(parser=None): help="Jaccard Index threshold when the alleles are mapped to the most similar alleles. In each filtering step, allele counts of filtered out alleles will be mapped to the most similar allele only if they have Jaccard Index of shared edit higher than this threshold.", default=0.3, ) + parser.add_argument( + "--filter-spacer", + help="Only consider edit within protospacer positions of reporter.", + action="store_true", + ) parser.add_argument( "--filter-window", "-w", diff --git a/bean/cli/filter.py b/bean/cli/filter.py index c7ec484..d93b170 100755 --- a/bean/cli/filter.py +++ b/bean/cli/filter.py @@ -67,7 +67,7 @@ def main(args): info(f"Filtered down to {len(bdata.uns['sig_allele_counts'])} alleles.") print(len(bdata.uns[allele_df_keys[-1]])) - if len(bdata.uns[allele_df_keys[-1]]) >= 1: + if len(bdata.uns[allele_df_keys[-1]]) >= 1 and args.filter_spacer: info("Filtering out edits outside spacer position...") bdata.uns[f"{allele_df_keys[-1]}_spacer"] = ( bdata.filter_allele_counts_by_pos( @@ -77,6 +77,8 @@ def main(args): map_to_filtered=True, allele_uns_key=allele_df_keys[-1], jaccard_threshold=0.2, + reporter_length=args.reporter_length, + reporter_right_flank_length=args.reporter_right_flank_length, ).reset_index(drop=True) ) info( diff --git a/bean/framework/ReporterScreen.py b/bean/framework/ReporterScreen.py index c684a6e..3d4794c 100755 --- a/bean/framework/ReporterScreen.py +++ b/bean/framework/ReporterScreen.py @@ -621,7 +621,9 @@ def get_normalized_allele_counts(self, allele_count_df=None, norm_thres=10): norm_counts = self._get_allele_norm( allele_count_df=allele_count_df, thres=norm_thres ) - alleles_norm.loc[:, count_columns] = alleles_norm.loc[:, count_columns].astype(float) + alleles_norm.loc[:, count_columns] = alleles_norm.loc[:, count_columns].astype( + float + ) alleles_norm.loc[:, count_columns] = ( alleles_norm.loc[:, count_columns] / norm_counts ) @@ -637,6 +639,8 @@ def filter_allele_counts_by_pos( map_to_filtered: bool = True, distribute: bool = False, jaccard_threshold: float = 0.1, + reporter_length: int = 32, + reporter_right_flank_length: int = 6, ): """ Filter alleles based on barcode matched counts, allele counts, @@ -649,6 +653,8 @@ def filter_allele_counts_by_pos( rel_pos_end: rel_pos to end including (exclusive) rel_pos_is_reporter: rel_pos_start and rel_pos_end is 0-based relative to reporter sequence start. jaccard_threshold (float) -- + reporter_length: Length of reporter sequence + reporter_right_flank_length: Length of right-flanking nucleotides following protospacer sequence in reporter. """ allele_count_df = self.uns[allele_uns_key].copy() if rel_pos_is_reporter: @@ -663,7 +669,10 @@ def filter_allele_counts_by_pos( if "guide_len" not in self.guides.columns.tolist(): self.guides["guide_len"] = self.guides.sequence.map(len) guide_start_pos = ( - 32 - 6 - self.guides.loc[allele_count_df.guide, "guide_len"].values + reporter_length + - 1 + - reporter_right_flank_length + - self.guides.loc[allele_count_df.guide, "guide_len"].values ) filtered_allele, filtered_edits = zip( *[ diff --git a/bean/plotting/allele_stats.py b/bean/plotting/allele_stats.py index 89b683f..42313c3 100755 --- a/bean/plotting/allele_stats.py +++ b/bean/plotting/allele_stats.py @@ -1,4 +1,5 @@ """Plotting functions to describe allele/guide/edit stats for allele count information stored in ReporterScreen.uns[allele_df_key].""" + import numpy as np import matplotlib.pyplot as plt @@ -32,7 +33,9 @@ def plot_n_guides_per_edit( + list(map(lambda e: e.get_abs_edit(), a.nt_allele.edits)) ) elif allele_col == "allele": - a["edits"] = a[allele_col].map(lambda a: list(a.edits)) + a["edits"] = a[allele_col].map( + lambda a: list(map(lambda e: e.get_abs_edit(), a.edits)) + ) edits_df = a.explode("edits")[["guide", "edits"]] edits_df["edits"] = edits_df.edits.map(str) edits_df = edits_df.loc[~edits_df.edits.str.startswith("-"), :].drop_duplicates() @@ -61,5 +64,5 @@ def plot_allele_stats(bdata, allele_df_keys, plot_save_path): plot_n_alleles_per_guide(bdata, key, bdata.uns[key].columns[1], ax[0]) plot_n_guides_per_edit(bdata, key, bdata.uns[key].columns[1], ax[1]) - #plt.tight_layout() + # plt.tight_layout() fig.savefig(plot_save_path, bbox_inches="tight") diff --git a/bean/preprocessing/utils.py b/bean/preprocessing/utils.py index b9e7c2c..42fc101 100755 --- a/bean/preprocessing/utils.py +++ b/bean/preprocessing/utils.py @@ -32,6 +32,11 @@ def prepare_bdata(bdata: be.ReporterScreen, args, warn, prefix: str): f"Some target column (bdata.guides[{args.target_col}]) value is null. Check your input file." ) bdata = bdata[bdata.guides[args.target_col].argsort(), :] + if any(bdata.X.sum(axis=1) > 0): + warn( + f"Filtering out {sum(bdata.X.sum(axis=1) > 0)} gRNAs without any counts over all samples." + ) + bdata = bdata[bdata.X.sum(axis=1) > 0, :] n_no_support_targets, bdata = filter_no_info_target( bdata, condit_col=args.condition_col, diff --git a/docs/_create-screen.md b/docs/_create-screen.md index 8fe7901..ee897b3 100755 --- a/docs/_create-screen.md +++ b/docs/_create-screen.md @@ -3,7 +3,7 @@ bean create-screen gRNA_library.csv sample_list.csv gRNA_counts_table.csv ``` ## Input - * gRNA_library.csv ([example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_guides.csv)) - * sample_list.csv ([example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_samples.csv)) + * gRNA_library.csv ([`variant` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_guides.csv), [`tiling` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/test_guide_info_tiling_chrom.csv)) + * sample_list.csv ([`sorting` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/sample_list_survival.csv), [`variant` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_samples.csv)) * gRNA_counts_table.csv: Table with gRNA ID in the first column and sample IDs as the column names (first row) `gRNA_library.csv` and `sample_list.csv` should be formatted as :ref:`input`. ([example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_counts.csv)) \ No newline at end of file diff --git a/docs/_tutorial_no_edit.md b/docs/_tutorial_no_edit.md index 9f29dc7..cb499ae 100644 --- a/docs/_tutorial_no_edit.md +++ b/docs/_tutorial_no_edit.md @@ -34,8 +34,12 @@ bean qc \ bean run sorting variant \ ${working_dir}/bean_count_${screen_id}_masked.h5ad \ -o ${working_dir}/ \ - --uniform-edit `# As we have no edit information.` \ - [--no-negative-control] `# If you don't have the negative control gRNAs.` + --uniform-edit --ignore-bcmatch `# As we have no edit/reporter information.` \ + [--fit-negctrl [--negctrl-col target_group --negctrl-col-value NegCtrl]] `# If you have the negative control gRNAs.` ``` +## Input file spec +* gRNA_info.csv: Should have `name`, `target` columns. You can also specify `target_group` column whose value indicate `PosCtrl`/`NegCtrl` for control gRNAs. +* sample_list.csv: Same requirement for the full run. See examples for [`sorting` screens](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_samples.csv) and [`survival` screens](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/sample_list_survival.csv). + See the example input files [here](https://pinellolab.github.io/crispr-bean/create_screen.html). \ No newline at end of file