diff --git a/README.md b/README.md
index 8c9289d..f91da6b 100644
--- a/README.md
+++ b/README.md
@@ -13,12 +13,14 @@ This is an analysis toolkit for the pooled CRISPR reporter or sensor data. The r
## Overview
`crispr-bean` supports end-to-end analysis of pooled sorting screens, with or without reporter.
-
+
-1. [`bean-count-sample`](#bean-count-samples-count-reporter-screen-data): Base-editing-aware **mapping** of guide, optionally with reporter from `.fastq` files.
-2. [`bean-qc`](#bean-qc-qc-of-reporter-screen-data): Quality control report and filtering out / masking of aberrant sample and guides
-3. [`bean-filter`](#bean-filter-filtering-and-optionally-translating-alleles): Filter reporter alleles; essential for `tiling` mode that allows for all alleles generated from gRNA.
-4. [`bean-run`](#bean-run-quantify-variant-effects): Quantify targeted variants' effect sizes from screen data.
+1. [`bean-count-sample`](#bean-count-samples-count-reporter-screen-data): Base-editing-aware **mapping** of guide, optionally with reporter from `.fastq` files.
+  * [`bean-create-screen`](#bean-create-screen-create-reporterscreen-object-from-flat-files) creates minimal ReporterScreen object from flat gRNA count file. Note that this way, allele counts are not included and many functionalities involving allele and edit counts are not supported.
+2. [`bean-profile`](#bean-profile-profile-editing-patterns): Profile editing preferences of your editor.
+3. [`bean-qc`](#bean-qc-qc-of-reporter-screen-data): Quality control report and filtering out / masking of aberrant sample and guides
+4. [`bean-filter`](#bean-filter-filtering-and-optionally-translating-alleles): Filter reporter alleles; essential for `tiling` mode that allows for all alleles generated from gRNA.
+5. [`bean-run`](#bean-run-quantify-variant-effects): Quantify targeted variants' effect sizes from screen data.
### Data structure
BEAN stores mapped gRNA and allele counts in `ReporterScreen` object which is compatible with [AnnData](https://anndata.readthedocs.io/en/latest/index.html). See [Data Structure](#data-structure) section for more information.
@@ -141,7 +143,35 @@ Optional columns are not required but can be provided for compatibility with `be
* `--rerun` (default: `False`): Recount each sample. If `False`, existing count for each sample is taken.
+## `bean-create-screen`: Create ReporterScreen object from flat files
+```bash
+bean-create-screen gRNA_library.csv sample_list.csv gRNA_counts_table.csv
+```
+### Input
+ * [gRNA_library.csv](#1-grna_librarycsv)
+ * [sample_list.csv](#2-sample_listcsv)
+ * gRNA_counts_table.csv: Table with gRNA ID in the first column and sample IDs as the column names (first row)
+
+### Full Parameters
+ * `-e`, `--edits` (default: `None`): Path to edit counts .csv table, with index at first column and column names at the first row.
+ * `-o`, `--output-prefix` (default: `None`): Output file prefix (output will be saved as `output_prefix.h5ad`). If not provided, `gRNA_counts_table_csv` file prefix is used.
+
+
+## `bean-profile`: Profile editing patterns
+```bash
+bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profile report`
+```
+### Parameters
+ * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
+ * `--replicate-col` (default: `"rep"`): Column name in `bdata.samples` that describes replicate ID.
+ * `--condition-col` (default: `"bin"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)
+ * `--pam-col` (default: `None`): Column name describing PAM of each gRNA in `bdata.guides`.
+ * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`.
+ * `-w`, `--window-length` (default: `6`): Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window.
+
+### Output
+Above command produces `prefix.[html,ipynb]` as editing preference report ([see example](notebooks/profile_editing_preference.ipynb)).
diff --git a/bean/plotting/editing_patterns.py b/bean/plotting/editing_patterns.py
index e621fcd..4f1ccd9 100644
--- a/bean/plotting/editing_patterns.py
+++ b/bean/plotting/editing_patterns.py
@@ -22,7 +22,7 @@ def _add_absent_edits(
):
"""If edit is not observed in editable position, add into the edit rate table."""
editable_positions = np.where(
- (np.array(list(bdata.guides.loc[guide, "Reporter"])) == edited_base)
+ (np.array(list(bdata.guides.loc[guide, "reporter"])) == edited_base)
)[0]
if guide not in edit_tbl.guide.tolist():
observed_rel_pos = []
@@ -62,14 +62,13 @@ def _add_absent_edits(
def get_edit_rates(
bdata,
edit_count_key="edit_counts",
- add_absent=True,
adjust_spacer_pos: bool = True,
reporter_column: str = "reporter",
):
"""
Obtain position- and context-wise editing rate (context: base preceding the target base position).
Args
- bdata (bean.ReporterScreen): input ReporterScreen object to be used for analyzing posiiton-wide editing rate.
+        bdata (bean.ReporterScreen): input ReporterScreen object to be used for analyzing position-wide editing rate.
edit_count_key: key in bdata.uns with edit count DataFrame to be used for analyzing posiiton-wide editing rate.
"""
edit_rates = bdata.get_normalized_allele_counts(bdata.uns[edit_count_key])
@@ -77,53 +76,45 @@ def get_edit_rates(
edit_rates_agg = edit_rates_agg.loc[
edit_rates_agg.guide.map(lambda s: "CONTROL" not in s)
].reset_index(drop=True)
- try:
- edit_rates_agg["rep_median"] = edit_rates.iloc[:, 2:].median(axis=1)
- edit_rates_agg["rep_mean"] = edit_rates.iloc[:, 2:].mean(axis=1)
- edit_rates_agg["rel_pos"] = edit_rates_agg.edit.map(lambda e: e.rel_pos).astype(
- int
- )
- if add_absent:
- for guide in tqdm(
- edit_rates_agg.guide.unique(),
- desc="Calibrating edits in editable positions...",
- ):
- edit_rates_agg = _add_absent_edits(
- bdata,
- guide,
- edit_rates_agg,
- edited_base=bdata.base_edited_from,
- target_alt=bdata.base_edited_to,
- )
- if adjust_spacer_pos:
- if "guide_len" not in bdata.guides.columns:
- bdata.guides["guide_len"] = bdata.guides.sequence.map(len)
- start_pad = (
- 32
- - 6
- - bdata.guides.guide_len[edit_rates_agg.guide].reset_index(drop=True)
- )
- edit_rates_agg["spacer_pos"] = (edit_rates_agg.rel_pos - start_pad).astype(
- int
- )
- edit_rates_agg = edit_rates_agg[
- (edit_rates_agg.spacer_pos >= 0) & (edit_rates_agg.spacer_pos < 20)
- ].reset_index(drop=True)
- edit_rates_agg.loc[:, "spacer_pos"] = edit_rates_agg["spacer_pos"] + 1
- else:
- edit_rates_agg["spacer_pos"] = edit_rates_agg.rel_pos + 1
- edit_rates_agg["base_change"] = edit_rates_agg.edit.map(
- lambda e: e.get_base_change()
+ edit_rates_agg["rep_median"] = edit_rates.iloc[:, 2:].median(axis=1)
+ edit_rates_agg["rep_mean"] = edit_rates.iloc[:, 2:].mean(axis=1)
+ edit_rates_agg["rel_pos"] = edit_rates_agg.edit.map(lambda e: e.rel_pos).astype(int)
+
+ for guide in tqdm(
+ edit_rates_agg.guide.unique(),
+ desc="Calibrating edits in editable positions...",
+ ):
+ edit_rates_agg = _add_absent_edits(
+ bdata,
+ guide,
+ edit_rates_agg,
+ edited_base=bdata.base_edited_from,
+ target_alt=bdata.base_edited_to,
)
- edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int)
- edit_rates_agg["context"] = edit_rates_agg.apply(
- lambda row: bdata.guides.loc[row.guide, reporter_column][
- row.rel_pos - 1 : row.rel_pos + 1
- ],
- axis=1,
+
+ if adjust_spacer_pos:
+ if "guide_len" not in bdata.guides.columns:
+ bdata.guides["guide_len"] = bdata.guides.sequence.map(len)
+ start_pad = (
+ 32 - 6 - bdata.guides.guide_len[edit_rates_agg.guide].reset_index(drop=True)
)
- except Exception as exp:
- print(exp)
+ edit_rates_agg["spacer_pos"] = (edit_rates_agg.rel_pos - start_pad).astype(int)
+ edit_rates_agg = edit_rates_agg[
+ (edit_rates_agg.spacer_pos >= 0) & (edit_rates_agg.spacer_pos < 20)
+ ].reset_index(drop=True)
+ edit_rates_agg.loc[:, "spacer_pos"] = edit_rates_agg["spacer_pos"] + 1
+ else:
+ edit_rates_agg["spacer_pos"] = edit_rates_agg.rel_pos + 1
+ edit_rates_agg["base_change"] = edit_rates_agg.edit.map(
+ lambda e: e.get_base_change()
+ )
+ edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int)
+ edit_rates_agg["context"] = edit_rates_agg.apply(
+ lambda row: bdata.guides.loc[row.guide, reporter_column][
+ row.rel_pos - 1 : row.rel_pos + 1
+ ],
+ axis=1,
+ )
return edit_rates_agg
@@ -190,10 +181,10 @@ def _get_norm_rates_df(
edit_rates_df=None,
edit_count_key="edit_counts",
base_changes: Sequence[str] = None,
- show_list=None,
):
change_by_pos = pd.pivot(
- edit_rates_df.groupby(["base_change", "spacer_pos"])
+ edit_rates_df[["base_change", "spacer_pos", "rep_mean"]]
+ .groupby(["base_change", "spacer_pos"])
.sum()["rep_mean"]
.reset_index(),
index="spacer_pos",
@@ -203,7 +194,7 @@ def _get_norm_rates_df(
norm_matrix = pd.DataFrame(index=change_by_pos.index, columns=BASES)
for pos in norm_matrix.index:
- pos_base = bdata.guides.Reporter.map(
+ pos_base = bdata.guides.reporter.map(
lambda s: s[pos] if pos < len(s) else " "
).values
for b in BASES:
@@ -216,10 +207,8 @@ def _get_norm_rates_df(
:, change_by_pos.columns.map(lambda s: s.split(">")[0])
].values
)
- if show_list is None:
- show_list = ["A>C", "A>T", "A>G", "C>T", "C>G"]
# norm_rate_reduced = _combine_complementary_base_changes(norm_rate).astype(float)
- return norm_rate.astype(float)[show_list] # _reduced
+ return norm_rate.astype(float)[base_changes] # _reduced
def _get_possible_changes_from_target_base(target_basechange: str):
@@ -257,6 +246,8 @@ def plot_by_pos_behive(
if nonref_base_changes is None:
if target_basechange == "A>G":
nonref_base_changes = ["C>T", "C>G"]
+ elif target_basechange == "C>T":
+ nonref_base_changes = ["G>A", "G>C"]
else:
print("No non-ref base changes specified. not drawing them")
nonref_base_changes = []
@@ -266,10 +257,10 @@ def plot_by_pos_behive(
bdata,
norm_rates_df,
edit_count_key,
- base_changes=[
+ base_changes=ref_other_changes
+ + [
target_basechange,
]
- + ref_other_changes
+ nonref_base_changes,
)
fig, ax = plt.subplots(figsize=(3, 7))
@@ -278,6 +269,7 @@ def plot_by_pos_behive(
if normalize:
df_to_draw = df_to_draw / vmax * 100
vmax = 100
+
target_data = df_to_draw.copy()
target_data.loc[:, target_data.columns != target_basechange] = np.nan
sns.heatmap(
@@ -291,6 +283,7 @@ def plot_by_pos_behive(
fmt=".0f" if normalize else ".2g",
annot_kws={"fontsize": 8},
)
+
ref_data = df_to_draw.copy()
ref_data.loc[
:,
@@ -322,20 +315,19 @@ def plot_by_pos_behive(
annot_kws={"fontsize": 8},
)
ax.set_ylabel("Protospacer position")
- return ax
+ return ax, df_to_draw
-def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame):
- edit_rates_df["pam"] = bdata.guides.loc[
- edit_rates_df.guide, "5-nt PAM"
- ].reset_index(drop=True)
+def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame, pam_col="5-nt PAM"):
+ edit_rates_df["pam"] = bdata.guides.loc[edit_rates_df.guide, pam_col].reset_index(
+ drop=True
+ )
edit_rates_df["pam23"] = edit_rates_df.pam.map(lambda s: s[1:3])
- edit_rates_df["pam2"] = edit_rates_df.pam.map(lambda s: s[1])
- edit_rates_df["pam3"] = edit_rates_df.pam.map(lambda s: s[2])
- edit_rates_df["pam12"] = edit_rates_df.pam.map(lambda s: s[:2])
- edit_rates_df["pam34"] = edit_rates_df.pam.map(lambda s: s[2:4])
return pd.pivot(
- edit_rates_df.loc[(edit_rates_df.base_change == bdata.target_base_change)]
+ edit_rates_df.loc[
+ (edit_rates_df.base_change == bdata.target_base_change),
+ ["rep_mean", "pam23", "spacer_pos"],
+ ]
.groupby(["pam23", "spacer_pos"])
.mean()
.reset_index(),
@@ -348,11 +340,12 @@ def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame):
def plot_by_pos_pam(
bdata,
edit_rates_df,
+ pam_col="5-nt PAM",
ax=None,
figsize=(6, 4),
):
"""Plot position by PAM"""
- pos_by_pam = get_position_by_pam_rates(bdata, edit_rates_df)
+ pos_by_pam = get_position_by_pam_rates(bdata, edit_rates_df, pam_col)
if ax is None:
fig, ax = plt.subplots(figsize=figsize)
sns.heatmap(pos_by_pam, ax=ax, cmap="Blues")
@@ -432,6 +425,7 @@ def get_pam_preference(
def plot_pam_preference(
edit_rates_df,
bdata=None,
+ pam_col="5-nt PAM",
edit_start_pos: int = 3,
edit_end_pos: int = 8,
ax=None,
@@ -440,7 +434,7 @@ def plot_pam_preference(
""" """
if "pam2" not in edit_rates_df.columns or "pam3" not in edit_rates_df.columns:
edit_rates_df["pam"] = bdata.guides.loc[
- edit_rates_df.guide, "5-nt PAM"
+ edit_rates_df.guide, pam_col
].reset_index(drop=True)
edit_rates_df["pam2"] = edit_rates_df.pam.map(lambda s: s[1])
edit_rates_df["pam3"] = edit_rates_df.pam.map(lambda s: s[2])
diff --git a/bean/plotting/utils.py b/bean/plotting/utils.py
new file mode 100644
index 0000000..1233cfb
--- /dev/null
+++ b/bean/plotting/utils.py
@@ -0,0 +1,69 @@
+import argparse
+
+
+def parse_args():
+ print(" \n~~~BEAN Profile~~~")
+ print("-Profile editing patterns of your editor-")
+ print(
+ r"""
+ _ _ __ _ _
+ / \ '\ _ __ _ _ ___ / _(_) |___
+ | \ \ | '_ \ '_/ _ \ _| | / -_)
+ \ \ | | .__/_| \___/_| |_|_\___|
+ `.__|/ |_|
+ """
+ )
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+        "bdata_path", help="Path to the ReporterScreen object to profile", type=str
+ )
+ parser.add_argument(
+ "-o",
+ "--output-prefix",
+ help="Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.",
+ type=str,
+ )
+ parser.add_argument(
+ "--replicate-col",
+ help="Column name in `bdata.samples` that describes replicate ID.",
+ type=str,
+ default="rep",
+ )
+ parser.add_argument(
+ "--condition-col",
+ help="Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)",
+ type=str,
+ default="bin",
+ )
+ parser.add_argument(
+ "--pam-col",
+ help="Column name describing PAM of each gRNA in `bdata.guides`.",
+ type=str,
+ default=None,
+ )
+ parser.add_argument(
+ "--control-condition",
+ help="Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`.",
+ type=str,
+ default="bulk",
+ )
+ parser.add_argument(
+ "-w",
+ "--window-length",
+ help="Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window.",
+ type=int,
+ default=6,
+ )
+
+ args = parser.parse_args()
+ if args.output_prefix is None:
+ args.output_prefix = f"{args.bdata_path.rsplit('.h5ad', 1)[0]}"
+ return args
+
+
+def check_args(args):
+ if args.window_length < 1:
+ raise ValueError(f"window_length {args.window_length} is too small.")
+ if args.window_length > 20:
+ raise ValueError(f"window_length {args.window_length} is too large.")
+ return args
diff --git a/bin/bean-profile b/bin/bean-profile
new file mode 100644
index 0000000..8b48daa
--- /dev/null
+++ b/bin/bean-profile
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+import os
+import papermill as pm
+import bean as be
+from bean.plotting.utils import parse_args, check_args
+
+
+def main():
+ args = parse_args()
+ args = check_args(args)
+ os.system(
+ "python -m ipykernel install --user --name bean_python3 --display-name bean_python3"
+ )
+ pm.execute_notebook(
+ f"{os.path.dirname(be.__file__)}/../notebooks/profile_editing_preference.ipynb",
+ f"{args.output_prefix}.ipynb",
+ parameters=dict(
+ bdata_path=args.bdata_path,
+ output_prefix=args.output_prefix,
+ replicate_col=args.replicate_col,
+ condition_col=args.condition_col,
+ control_condition=args.control_condition,
+ max_editing_window_length=args.window_length,
+ pam_col=args.pam_col,
+ ),
+ kernel_name="bean_python3",
+ )
+    os.system(
+        f"jupyter nbconvert --to html {args.output_prefix}.ipynb"
+    )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/imgs/dag_bean.png b/imgs/dag_bean.png
deleted file mode 100644
index 1ef58c8..0000000
Binary files a/imgs/dag_bean.png and /dev/null differ
diff --git a/imgs/dag_bean_v2.png b/imgs/dag_bean_v2.png
new file mode 100644
index 0000000..a48822c
Binary files /dev/null and b/imgs/dag_bean_v2.png differ
diff --git a/notebooks/sample_quality_report.ipynb b/notebooks/sample_quality_report.ipynb
index bf0acf3..a2e2707 100644
--- a/notebooks/sample_quality_report.ipynb
+++ b/notebooks/sample_quality_report.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -9,7 +8,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -114,7 +112,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -140,7 +137,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -148,7 +144,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -156,7 +151,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -184,7 +178,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -201,7 +194,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -238,7 +230,6 @@
"source": []
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -291,7 +282,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -310,7 +300,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -364,7 +353,6 @@
"source": []
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -469,7 +457,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.6"
+ "version": "3.8.12"
},
"vscode": {
"interpreter": {
diff --git a/setup.py b/setup.py
index d921a3f..4b2157f 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,7 @@
"bin/bean-count",
"bin/bean-count-samples",
"bin/bean-create-screen",
+ "bin/bean-profile",
"bin/bean-qc",
"bin/bean-filter",
"bin/bean-run", # TODO: prevent error when extra requirements are not met.
@@ -37,7 +38,7 @@
"scipy",
"perturb-tools>=0.2.8",
"matplotlib",
- "seaborn",
+ "seaborn>=0.13.0",
"tqdm",
"bio",
"liftover",
@@ -50,6 +51,7 @@
"ipykernel",
"pytest-order",
"nbconvert",
+ "logomaker",
],
extras_require={"model": ["pyBigWig", "pyro-ppl<=1.8.1", "statsmodels", "torch<2"]},
include_package_data=True,