diff --git a/README.md b/README.md index 8c9289d..f91da6b 100644 --- a/README.md +++ b/README.md @@ -13,12 +13,14 @@ This is an analysis toolkit for the pooled CRISPR reporter or sensor data. The r ## Overview `crispr-bean` supports end-to-end analysis of pooled sorting screens, with or without reporter. -dag_bean.svg +dag_bean_v2.svg -1. [`bean-count-sample`](#bean-count-samples-count-reporter-screen-data): Base-editing-aware **mapping** of guide, optionally with reporter from `.fastq` files. -2. [`bean-qc`](#bean-qc-qc-of-reporter-screen-data): Quality control report and filtering out / masking of aberrant sample and guides -3. [`bean-filter`](#bean-filter-filtering-and-optionally-translating-alleles): Filter reporter alleles; essential for `tiling` mode that allows for all alleles generated from gRNA. -4. [`bean-run`](#bean-run-quantify-variant-effects): Quantify targeted variants' effect sizes from screen data. +1. [`bean-count-sample`](#bean-count-samples-count-reporter-screen-data): Base-editing-aware **mapping** of guide, optionally with reporter from `.fastq` files. + * [`bean-create-screen`](#bean-create-screen-create-reporterscreen-object-from-flat-files) creates minimal ReporterScreen object from flat gRNA count file. Note that this way, allele counts are not included and many functionalities involving allele and edit counts are not supported. +2. [`bean-profile`](#bean-profile-profile-editing-patterns): Profile editing preferences of your editor. +3. [`bean-qc`](#bean-qc-qc-of-reporter-screen-data): Quality control report and filtering out / masking of aberrant sample and guides +4. [`bean-filter`](#bean-filter-filtering-and-optionally-translating-alleles): Filter reporter alleles; essential for `tiling` mode that allows for all alleles generated from gRNA. +5. [`bean-run`](#bean-run-quantify-variant-effects): Quantify targeted variants' effect sizes from screen data. 
### Data structure BEAN stores mapped gRNA and allele counts in `ReporterScreen` object which is compatible with [AnnData](https://anndata.readthedocs.io/en/latest/index.html). See [Data Structure](#data-structure) section for more information. @@ -141,7 +143,35 @@ Optional columns are not required but can be provided for compatibility with `be * `--rerun` (default: `False`): Recount each sample. If `False`, existing count for each sample is taken. +## `bean-create-screen`: Create ReporterScreen object from flat files +```bash +bean-create-screen gRNA_library.csv sample_list.csv gRNA_counts_table.csv +``` +### Input + * [gRNA_library.csv](#1-gRNA_librarycsv) + * [sample_list.csv](#2-sample_listcsv) + * gRNA_counts_table.csv: Table with gRNA ID in the first column and sample IDs as the column names (first row) + +### Full Parameters + * `-e`, `--edits` (default: `None`): Path to edit counts .csv table, with index at first column and column names at the first row. + * `-o`, `--output-prefix` (default: `None`): Output file prefix (output will be saved as `output_prefix.h5ad`). If not provided, `gRNA_counts_table_csv` file prefix is used. + +

+## `bean-profile`: Profile editing patterns +```bash +bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profile report` +``` +### Parameters + * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used. + * `--replicate-col` (default: `"rep"`): Column name in `bdata.samples` that describes replicate ID. + * `--condition-col` (default: `"bin"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.) + * `--pam-col` (default: `None`): Column name describing PAM of each gRNA in `bdata.guides`. + * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`. + * `-w`, `--window-length` (default: `6`): Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window. + +### Output +Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)).

diff --git a/bean/plotting/editing_patterns.py b/bean/plotting/editing_patterns.py index e621fcd..4f1ccd9 100644 --- a/bean/plotting/editing_patterns.py +++ b/bean/plotting/editing_patterns.py @@ -22,7 +22,7 @@ def _add_absent_edits( ): """If edit is not observed in editable position, add into the edit rate table.""" editable_positions = np.where( - (np.array(list(bdata.guides.loc[guide, "Reporter"])) == edited_base) + (np.array(list(bdata.guides.loc[guide, "reporter"])) == edited_base) )[0] if guide not in edit_tbl.guide.tolist(): observed_rel_pos = [] @@ -62,14 +62,13 @@ def _add_absent_edits( def get_edit_rates( bdata, edit_count_key="edit_counts", - add_absent=True, adjust_spacer_pos: bool = True, reporter_column: str = "reporter", ): """ Obtain position- and context-wise editing rate (context: base preceding the target base position). Args - bdata (bean.ReporterScreen): input ReporterScreen object to be used for analyzing posiiton-wide editing rate. + bdata (bean.ReporterScreen): input ReporterScreen object to be used for analyzing position-wide editing rate. edit_count_key: key in bdata.uns with edit count DataFrame to be used for analyzing posiiton-wide editing rate. 
""" edit_rates = bdata.get_normalized_allele_counts(bdata.uns[edit_count_key]) @@ -77,53 +76,45 @@ def get_edit_rates( edit_rates_agg = edit_rates_agg.loc[ edit_rates_agg.guide.map(lambda s: "CONTROL" not in s) ].reset_index(drop=True) - try: - edit_rates_agg["rep_median"] = edit_rates.iloc[:, 2:].median(axis=1) - edit_rates_agg["rep_mean"] = edit_rates.iloc[:, 2:].mean(axis=1) - edit_rates_agg["rel_pos"] = edit_rates_agg.edit.map(lambda e: e.rel_pos).astype( - int - ) - if add_absent: - for guide in tqdm( - edit_rates_agg.guide.unique(), - desc="Calibrating edits in editable positions...", - ): - edit_rates_agg = _add_absent_edits( - bdata, - guide, - edit_rates_agg, - edited_base=bdata.base_edited_from, - target_alt=bdata.base_edited_to, - ) - if adjust_spacer_pos: - if "guide_len" not in bdata.guides.columns: - bdata.guides["guide_len"] = bdata.guides.sequence.map(len) - start_pad = ( - 32 - - 6 - - bdata.guides.guide_len[edit_rates_agg.guide].reset_index(drop=True) - ) - edit_rates_agg["spacer_pos"] = (edit_rates_agg.rel_pos - start_pad).astype( - int - ) - edit_rates_agg = edit_rates_agg[ - (edit_rates_agg.spacer_pos >= 0) & (edit_rates_agg.spacer_pos < 20) - ].reset_index(drop=True) - edit_rates_agg.loc[:, "spacer_pos"] = edit_rates_agg["spacer_pos"] + 1 - else: - edit_rates_agg["spacer_pos"] = edit_rates_agg.rel_pos + 1 - edit_rates_agg["base_change"] = edit_rates_agg.edit.map( - lambda e: e.get_base_change() + edit_rates_agg["rep_median"] = edit_rates.iloc[:, 2:].median(axis=1) + edit_rates_agg["rep_mean"] = edit_rates.iloc[:, 2:].mean(axis=1) + edit_rates_agg["rel_pos"] = edit_rates_agg.edit.map(lambda e: e.rel_pos).astype(int) + + for guide in tqdm( + edit_rates_agg.guide.unique(), + desc="Calibrating edits in editable positions...", + ): + edit_rates_agg = _add_absent_edits( + bdata, + guide, + edit_rates_agg, + edited_base=bdata.base_edited_from, + target_alt=bdata.base_edited_to, ) - edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int) - 
edit_rates_agg["context"] = edit_rates_agg.apply( - lambda row: bdata.guides.loc[row.guide, reporter_column][ - row.rel_pos - 1 : row.rel_pos + 1 - ], - axis=1, + + if adjust_spacer_pos: + if "guide_len" not in bdata.guides.columns: + bdata.guides["guide_len"] = bdata.guides.sequence.map(len) + start_pad = ( + 32 - 6 - bdata.guides.guide_len[edit_rates_agg.guide].reset_index(drop=True) ) - except Exception as exp: - print(exp) + edit_rates_agg["spacer_pos"] = (edit_rates_agg.rel_pos - start_pad).astype(int) + edit_rates_agg = edit_rates_agg[ + (edit_rates_agg.spacer_pos >= 0) & (edit_rates_agg.spacer_pos < 20) + ].reset_index(drop=True) + edit_rates_agg.loc[:, "spacer_pos"] = edit_rates_agg["spacer_pos"] + 1 + else: + edit_rates_agg["spacer_pos"] = edit_rates_agg.rel_pos + 1 + edit_rates_agg["base_change"] = edit_rates_agg.edit.map( + lambda e: e.get_base_change() + ) + edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int) + edit_rates_agg["context"] = edit_rates_agg.apply( + lambda row: bdata.guides.loc[row.guide, reporter_column][ + row.rel_pos - 1 : row.rel_pos + 1 + ], + axis=1, + ) return edit_rates_agg @@ -190,10 +181,10 @@ def _get_norm_rates_df( edit_rates_df=None, edit_count_key="edit_counts", base_changes: Sequence[str] = None, - show_list=None, ): change_by_pos = pd.pivot( - edit_rates_df.groupby(["base_change", "spacer_pos"]) + edit_rates_df[["base_change", "spacer_pos", "rep_mean"]] + .groupby(["base_change", "spacer_pos"]) .sum()["rep_mean"] .reset_index(), index="spacer_pos", @@ -203,7 +194,7 @@ def _get_norm_rates_df( norm_matrix = pd.DataFrame(index=change_by_pos.index, columns=BASES) for pos in norm_matrix.index: - pos_base = bdata.guides.Reporter.map( + pos_base = bdata.guides.reporter.map( lambda s: s[pos] if pos < len(s) else " " ).values for b in BASES: @@ -216,10 +207,8 @@ def _get_norm_rates_df( :, change_by_pos.columns.map(lambda s: s.split(">")[0]) ].values ) - if show_list is None: - show_list = ["A>C", "A>T", "A>G", "C>T", "C>G"] # 
norm_rate_reduced = _combine_complementary_base_changes(norm_rate).astype(float) - return norm_rate.astype(float)[show_list] # _reduced + return norm_rate.astype(float)[base_changes] # _reduced def _get_possible_changes_from_target_base(target_basechange: str): @@ -257,6 +246,8 @@ def plot_by_pos_behive( if nonref_base_changes is None: if target_basechange == "A>G": nonref_base_changes = ["C>T", "C>G"] + elif target_basechange == "C>T": + nonref_base_changes = ["G>A", "G>C"] else: print("No non-ref base changes specified. not drawing them") nonref_base_changes = [] @@ -266,10 +257,10 @@ def plot_by_pos_behive( bdata, norm_rates_df, edit_count_key, - base_changes=[ + base_changes=ref_other_changes + + [ target_basechange, ] - + ref_other_changes + nonref_base_changes, ) fig, ax = plt.subplots(figsize=(3, 7)) @@ -278,6 +269,7 @@ def plot_by_pos_behive( if normalize: df_to_draw = df_to_draw / vmax * 100 vmax = 100 + target_data = df_to_draw.copy() target_data.loc[:, target_data.columns != target_basechange] = np.nan sns.heatmap( @@ -291,6 +283,7 @@ def plot_by_pos_behive( fmt=".0f" if normalize else ".2g", annot_kws={"fontsize": 8}, ) + ref_data = df_to_draw.copy() ref_data.loc[ :, @@ -322,20 +315,19 @@ def plot_by_pos_behive( annot_kws={"fontsize": 8}, ) ax.set_ylabel("Protospacer position") - return ax + return ax, df_to_draw -def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame): - edit_rates_df["pam"] = bdata.guides.loc[ - edit_rates_df.guide, "5-nt PAM" - ].reset_index(drop=True) +def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame, pam_col="5-nt PAM"): + edit_rates_df["pam"] = bdata.guides.loc[edit_rates_df.guide, pam_col].reset_index( + drop=True + ) edit_rates_df["pam23"] = edit_rates_df.pam.map(lambda s: s[1:3]) - edit_rates_df["pam2"] = edit_rates_df.pam.map(lambda s: s[1]) - edit_rates_df["pam3"] = edit_rates_df.pam.map(lambda s: s[2]) - edit_rates_df["pam12"] = edit_rates_df.pam.map(lambda s: s[:2]) - edit_rates_df["pam34"] = 
edit_rates_df.pam.map(lambda s: s[2:4]) return pd.pivot( - edit_rates_df.loc[(edit_rates_df.base_change == bdata.target_base_change)] + edit_rates_df.loc[ + (edit_rates_df.base_change == bdata.target_base_change), + ["rep_mean", "pam23", "spacer_pos"], + ] .groupby(["pam23", "spacer_pos"]) .mean() .reset_index(), @@ -348,11 +340,12 @@ def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame): def plot_by_pos_pam( bdata, edit_rates_df, + pam_col="5-nt PAM", ax=None, figsize=(6, 4), ): """Plot position by PAM""" - pos_by_pam = get_position_by_pam_rates(bdata, edit_rates_df) + pos_by_pam = get_position_by_pam_rates(bdata, edit_rates_df, pam_col) if ax is None: fig, ax = plt.subplots(figsize=figsize) sns.heatmap(pos_by_pam, ax=ax, cmap="Blues") @@ -432,6 +425,7 @@ def get_pam_preference( def plot_pam_preference( edit_rates_df, bdata=None, + pam_col="5-nt PAM", edit_start_pos: int = 3, edit_end_pos: int = 8, ax=None, @@ -440,7 +434,7 @@ def plot_pam_preference( """ """ if "pam2" not in edit_rates_df.columns or "pam3" not in edit_rates_df.columns: edit_rates_df["pam"] = bdata.guides.loc[ - edit_rates_df.guide, "5-nt PAM" + edit_rates_df.guide, pam_col ].reset_index(drop=True) edit_rates_df["pam2"] = edit_rates_df.pam.map(lambda s: s[1]) edit_rates_df["pam3"] = edit_rates_df.pam.map(lambda s: s[2]) diff --git a/bean/plotting/utils.py b/bean/plotting/utils.py new file mode 100644 index 0000000..1233cfb --- /dev/null +++ b/bean/plotting/utils.py @@ -0,0 +1,69 @@ +import argparse + + +def parse_args(): + print(" \n~~~BEAN Profile~~~") + print("-Profile editing patterns of your editor-") + print( + r""" + _ _ __ _ _ + / \ '\ _ __ _ _ ___ / _(_) |___ + | \ \ | '_ \ '_/ _ \ _| | / -_) + \ \ | | .__/_| \___/_| |_|_\___| + `.__|/ |_| + """ + ) + parser = argparse.ArgumentParser() + parser.add_argument( + "bdata_path", help="Path to the ReporterScreen object to run QC on", type=str + ) + parser.add_argument( + "-o", + "--output-prefix", + help="Output prefix of editing 
pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.", + type=str, + ) + parser.add_argument( + "--replicate-col", + help="Column name in `bdata.samples` that describes replicate ID.", + type=str, + default="rep", + ) + parser.add_argument( + "--condition-col", + help="Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)", + type=str, + default="bin", + ) + parser.add_argument( + "--pam-col", + help="Column name describing PAM of each gRNA in `bdata.guides`.", + type=str, + default=None, + ) + parser.add_argument( + "--control-condition", + help="Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`.", + type=str, + default="bulk", + ) + parser.add_argument( + "-w", + "--window-length", + help="Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window.", + type=int, + default=6, + ) + + args = parser.parse_args() + if args.output_prefix is None: + args.output_prefix = f"{args.bdata_path.rsplit('.h5ad', 1)[0]}" + return args + + +def check_args(args): + if args.window_length < 1: + raise ValueError(f"window_length {args.window_length} is too small.") + if args.window_length > 20: + raise ValueError(f"window_length {args.window_length} is too large.") + return args diff --git a/bin/bean-profile b/bin/bean-profile new file mode 100644 index 0000000..8b48daa --- /dev/null +++ b/bin/bean-profile @@ -0,0 +1,34 @@ +#!/usr/bin/env python +import os +import papermill as pm +import bean as be +from bean.plotting.utils import parse_args, check_args + + +def main(): + args = parse_args() + args = check_args(args) + os.system( + "python -m ipykernel install --user --name bean_python3 --display-name bean_python3" + ) + pm.execute_notebook( + 
f"{os.path.dirname(be.__file__)}/../notebooks/profile_editing_preference.ipynb", + f"{args.output_prefix}.ipynb", + parameters=dict( + bdata_path=args.bdata_path, + output_prefix=args.output_prefix, + replicate_col=args.replicate_col, + condition_col=args.condition_col, + control_condition=args.control_condition, + max_editing_window_length=args.window_length, + pam_col=args.pam_col, + ), + kernel_name="bean_python3", + ) + os.system( + f"jupyter nbconvert --to html {args.output_prefix}_editing_preference.ipynb" + ) + + +if __name__ == "__main__": + main() diff --git a/imgs/dag_bean.png b/imgs/dag_bean.png deleted file mode 100644 index 1ef58c8..0000000 Binary files a/imgs/dag_bean.png and /dev/null differ diff --git a/imgs/dag_bean_v2.png b/imgs/dag_bean_v2.png new file mode 100644 index 0000000..a48822c Binary files /dev/null and b/imgs/dag_bean_v2.png differ diff --git a/notebooks/sample_quality_report.ipynb b/notebooks/sample_quality_report.ipynb index bf0acf3..a2e2707 100644 --- a/notebooks/sample_quality_report.ipynb +++ b/notebooks/sample_quality_report.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -9,7 +8,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -114,7 +112,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -140,7 +137,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -148,7 +144,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -156,7 +151,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -184,7 +178,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -201,7 +194,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -238,7 +230,6 @@ "source": [] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, 
"source": [ @@ -291,7 +282,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -310,7 +300,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -364,7 +353,6 @@ "source": [] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -469,7 +457,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.8.12" }, "vscode": { "interpreter": { diff --git a/setup.py b/setup.py index d921a3f..4b2157f 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ "bin/bean-count", "bin/bean-count-samples", "bin/bean-create-screen", + "bin/bean-profile", "bin/bean-qc", "bin/bean-filter", "bin/bean-run", # TODO: prevent error when extra requirements are not met. @@ -37,7 +38,7 @@ "scipy", "perturb-tools>=0.2.8", "matplotlib", - "seaborn", + "seaborn>=0.13.0", "tqdm", "bio", "liftover", @@ -50,6 +51,7 @@ "ipykernel", "pytest-order", "nbconvert", + "logomaker", ], extras_require={"model": ["pyBigWig", "pyro-ppl<=1.8.1", "statsmodels", "torch<2"]}, include_package_data=True,