adding per-replicate LFC and total editing rate
jykr committed Aug 14, 2024
1 parent 13cbd6d commit 8b3d7bc
Showing 9 changed files with 10,999 additions and 10,975 deletions.
7 changes: 6 additions & 1 deletion bean/cli/run.py
@@ -139,6 +139,12 @@ def main(args, return_data=False):
== args.negctrl_col_value.lower()
)[0]
else:
if "edit_rate_norm" not in ndata.screen.guides.columns:
ndata.screen.get_edit_from_allele()
ndata.screen.get_edit_mat_from_uns(rel_pos_is_reporter=True)
ndata.screen.get_guide_edit_rate(
unsorted_condition_label=args.control_condition
)
if args.splice_site_path is not None:
splice_site = pd.read_csv(args.splice_site_path).pos
target_info_df = be.an.translate_allele.annotate_edit(
@@ -198,7 +204,6 @@ def main(args, return_data=False):
],
axis=1,
)

# Add user-defined prior.
if args.prior_params is not None:
prior_params = _check_prior_params(args.prior_params, ndata)
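The block added here makes `bean run` backfill per-guide editing rates when the input screen lacks an `edit_rate_norm` column in `.guides`. A minimal sketch of the same guarded call sequence, assuming `screen` is a `ReporterScreen`-like object with reporter allele counts already collected and `"bulk"` standing in for the `--control-condition` value:

```python
# Sketch only: backfill editing rates before model fitting.
# `screen` and the "bulk" condition label are assumptions for illustration;
# the method names mirror the diff above.
if "edit_rate_norm" not in screen.guides.columns:
    screen.get_edit_from_allele()                             # allele counts -> edit counts
    screen.get_edit_mat_from_uns(rel_pos_is_reporter=True)    # build guide x sample edit matrix
    screen.get_guide_edit_rate(unsorted_condition_label="bulk")  # writes edit_rate / edit_rate_norm
```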
18 changes: 12 additions & 6 deletions bean/framework/ReporterScreen.py
@@ -434,12 +434,14 @@ def get_edit_mat_from_uns(
edits.target_pos_matches,
["guide", "edit"] + self.samples.index.tolist(),
]

good_edits = good_edits.copy()
good_guide_idx = guide_name_to_idx.loc[good_edits.guide, "index"].astype(int)
edit_mat = np.zeros(self.layers["edits"].shape)
for gidx, eidx in zip(good_guide_idx, good_edits.index):
self.layers["edits"][gidx, :] += good_edits.loc[
edit_mat[gidx, :] = edit_mat[gidx, :] + good_edits.loc[
eidx, self.samples.index.tolist()
].astype(int)
self.layers["edits"] = edit_mat
print("New edit matrix saved in .layers['edits']. Returning old edits.")
return old_edits

@@ -505,17 +507,21 @@ def get_guide_edit_rate(
prior_weight = 1
n_edits = self.layers[edit_layer].copy()[:, bulk_idx].sum(axis=1)
n_counts = self.layers[count_layer].copy()[:, bulk_idx].sum(axis=1)
edit_rate = (n_edits + prior_weight / 2) / (
(n_counts * num_targetable_sites) + prior_weight / 2
)
edit_rate = (n_edits + prior_weight / 2) / ((n_counts) + prior_weight / 2)
if normalize_by_editable_base:
edit_rate_norm = (n_edits + prior_weight / 2) / (
(n_counts * num_targetable_sites) + prior_weight / 2
)
edit_rate[n_counts < bcmatch_thres] = np.nan
if normalize_by_editable_base:
print("normalize by editable counts")
edit_rate[num_targetable_sites == 0] = np.nan
edit_rate_norm[num_targetable_sites == 0] = np.nan
if return_result:
return edit_rate
else:
self.guides["edit_rate"] = edit_rate
if normalize_by_editable_base:
self.guides["edit_rate_norm"] = edit_rate_norm
print(self.guides.edit_rate)

def get_edit_rate(
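The reworked `get_guide_edit_rate` now reports the total editing rate in the window as `edit_rate` and, when `normalize_by_editable_base` is set, a per-editable-base rate as `edit_rate_norm`. A toy numeric sketch of that arithmetic (values are made up; variable names mirror the method):

```python
import numpy as np

# Made-up per-guide tallies, summed over the bulk/unsorted samples.
n_edits = np.array([30.0, 8.0, 2.0])        # edited reporter reads
n_counts = np.array([100.0, 100.0, 100.0])  # barcode-matched reads
num_targetable_sites = np.array([3, 1, 0])  # editable bases in the window
prior_weight = 1

# Total editing rate in the window (edit_rate).
edit_rate = (n_edits + prior_weight / 2) / (n_counts + prior_weight / 2)
# Rate per editable base (edit_rate_norm).
edit_rate_norm = (n_edits + prior_weight / 2) / (
    n_counts * num_targetable_sites + prior_weight / 2
)
# Guides without an editable base are undefined, as in the method above.
edit_rate[num_targetable_sites == 0] = np.nan
edit_rate_norm[num_targetable_sites == 0] = np.nan
```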
5 changes: 3 additions & 2 deletions bean/model/readwrite.py
@@ -187,9 +187,10 @@ def write_result_table(
else:
a_fitted = param_hist_dict["alpha_pi"].detach().cpu().numpy()
pi = a_fitted[..., 1:].sum(axis=1) / a_fitted.sum(axis=1)
guide_info_df["edit_rate"] = pi
# guide_info_df["edit_rate_fitted"] = pi
guide_info_df = guide_info_df[
["edit_rate"] + [col for col in guide_info_df.columns if col != "Mid"]
["edit_rate"]
+ [col for col in guide_info_df.columns if col != "Mid" and col != "edit_rate"]
]
if guide_acc is not None:
guide_info_df.insert(1, "accessibility", guide_acc)
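The column selection now puts `edit_rate` first without duplicating it while still dropping the `Mid` column. The same idiom on a hypothetical frame:

```python
import pandas as pd

# Hypothetical stand-in for guide_info_df.
df = pd.DataFrame({"name": ["g1"], "Mid": [0.1], "edit_rate": [0.3]})
# edit_rate moves to the front; Mid is dropped; edit_rate is not repeated.
df = df[["edit_rate"] + [c for c in df.columns if c not in ("Mid", "edit_rate")]]
```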
61 changes: 35 additions & 26 deletions bean/model/run.py
@@ -237,14 +237,14 @@ def _get_guide_info(bdata, args, guide_lfc_pseudocount: int = 5):
== low_upper_conds[args.sorting_bin_lower_quantile_col].min(),
args.condition_col,
].item()
guide_lfc = bdata.log_fold_change_agg(
guide_lfc = bdata.log_fold_change_reps(
highest_cond,
lowest_cond,
agg_col=args.replicate_col,
rep_col=args.replicate_col,
compare_col=args.condition_col,
return_result=True,
# return_result=True,
pseudocount=guide_lfc_pseudocount,
).to_frame(f"{highest_cond}_{lowest_cond}.median_lfc")
) # .to_frame(f"{highest_cond}_{lowest_cond}.median_lfc")
else:
# earliest / latest bin & earliest non-plasmid / latest
conditions = bdata.samples[
@@ -261,14 +261,14 @@ def _get_guide_info(bdata, args, guide_lfc_pseudocount: int = 5):
conditions[args.time_col] == conditions[args.time_col].min(),
args.condition_col,
].item()
guide_lfc = bdata.log_fold_change_agg(
guide_lfc = bdata.log_fold_change_reps(
latest_cond,
earliest_cond,
agg_col=args.replicate_col,
rep_col=args.replicate_col,
compare_col=args.condition_col,
return_result=True,
# return_result=True,
pseudocount=guide_lfc_pseudocount,
).to_frame(f"{latest_cond}_{earliest_cond}.median_lfc")
) # .to_frame(f"{latest_cond}_{earliest_cond}.median_lfc")
if args.plasmid_condition is not None:
selected_conditions = conditions.loc[
conditions[args.condition_col] != args.plasmid_condition
@@ -278,16 +278,23 @@ def _get_guide_info(bdata, args, guide_lfc_pseudocount: int = 5):
== selected_conditions[args.time_col].min(),
args.condition_col,
].item()
guide_lfc2 = bdata.log_fold_change_agg(
latest_cond,
earliest_selected_cond,
agg_col=args.replicate_col,
compare_col=args.condition_col,
return_result=True,
pseudocount=guide_lfc_pseudocount,
).to_frame(f"{latest_cond}_{earliest_selected_cond}.median_lfc")
guide_lfc = pd.concat([guide_lfc, guide_lfc2], axis=1)
return guide_lfc
if earliest_selected_cond != earliest_cond:
guide_lfc2 = bdata.log_fold_change_reps(
latest_cond,
earliest_selected_cond,
rep_col=args.replicate_col,
compare_col=args.condition_col,
# return_result=True,
pseudocount=guide_lfc_pseudocount,
) # .to_frame(f"{latest_cond}_{earliest_selected_cond}.median_lfc")
guide_lfc = pd.concat([guide_lfc, guide_lfc2], axis=1)
if bdata.tiling:
guide_info = pd.concat(
[bdata.guides[["edit_rate", "edit_rate_norm"]], guide_lfc], axis=1
)
else:
guide_info = pd.concat([bdata.guides[["edit_rate"]], guide_lfc], axis=1)
return guide_info
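`_get_guide_info` now keeps one raw LFC column per replicate (`log_fold_change_reps`) instead of a single aggregated median, and prepends the editing-rate columns (both `edit_rate` and `edit_rate_norm` for tiling screens). A sketch of the resulting frame, with hypothetical replicate and condition names:

```python
import pandas as pd

# Illustrative only: column names follow the [replicate].[cond1]_[cond2]
# pattern documented in the README; the names and values are hypothetical.
guide_lfc = pd.DataFrame(
    {"rep1.top_bot": [0.42, -1.10], "rep2.top_bot": [0.35, -0.95]},
    index=["guide_1", "guide_2"],
)
edit_rates = pd.DataFrame(
    {"edit_rate": [0.31, 0.08], "edit_rate_norm": [0.10, 0.08]},
    index=["guide_1", "guide_2"],
)
# Tiling case: both rate columns are carried alongside the per-replicate LFCs.
guide_info = pd.concat([edit_rates, guide_lfc], axis=1)
```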


def _get_guide_to_variant_df(target_info_df: pd.DataFrame) -> pd.DataFrame:
@@ -297,29 +304,31 @@ def _get_guide_to_variant_df(target_info_df: pd.DataFrame) -> pd.DataFrame:

per_guide_edit_rates = target_info_df["per_guide_editing_rates"].map(
lambda s: (
[lambda x: (float(x) if x else np.nan) for x in s.strip(",").split(",")]
[(float(x) if x else np.nan) for x in s.strip(",").split(",")]
if not (s and pd.isnull(s))
else []
)
)
df = pd.DataFrame(
{
"variant": target_info_df.edit,
"variants": target_info_df.edit,
"guide": editing_guides,
"edit_rate": per_guide_edit_rates,
"edit_rates": per_guide_edit_rates,
}
)
df["zipped"] = df.apply(lambda row: list(zip(row.guide, row.edit_rate)), axis=1)
df_exp = df[["variant", "zipped"]].explode("zipped")
df["zipped"] = df.apply(lambda row: list(zip(row.guide, row.edit_rates)), axis=1)
df_exp = df[["variants", "zipped"]].explode("zipped")
df_exp["guide"] = df_exp["zipped"].map(
lambda x: x[0] if not pd.isnull(x) else np.nan
)
df_exp["edit_rate"] = df_exp["zipped"].map(
lambda x: x[1] if not pd.isnull(x) else np.nan
df_exp["per_variant_edit_rate"] = df_exp["zipped"].map(
lambda x: (x[1] if not pd.isnull(x) else np.nan)
)
df_exp = df_exp.loc[~df_exp.guide.isnull()]
guide_to_var_df = (
df_exp[["guide", "variant", "edit_rate"]].groupby("guide").agg(list)
df_exp[["guide", "variants", "per_variant_edit_rate"]]
.groupby("guide")
.agg(list)
)
return guide_to_var_df

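The other fix in this file removes a stray `lambda` from the list comprehension that parses `per_guide_editing_rates`, so each comma-separated field now becomes a float (or `NaN` when empty). A tiny sketch of that parse on a made-up string:

```python
import numpy as np

# Made-up example of a per_guide_editing_rates field.
s = "0.12,,0.30,"
rates = [(float(x) if x else np.nan) for x in s.strip(",").split(",")]
# -> [0.12, nan, 0.3]
```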
4 changes: 3 additions & 1 deletion bean/notebooks/sample_quality_report.ipynb
@@ -279,7 +279,9 @@
" elif \"compare_col\" in str(e):\n",
" raise ValueError(f\"Column `{condition_label}` fed in with `--condition-col {condition_label}` does not exist in the input .h5ad file. Please check your input.\") from e\n",
" elif \"cond1\" in str(e):\n",
" raise ValueError(f\"Samples with `{condition_label}` value of `{comp_cond1}` or `{comp_cond2}` does not exist. Check your input argument fed in with `--lfc-conds `{comp_cond1},{comp_cond2}`.\")\n",
" raise ValueError(f\"Samples with `{condition_label}` value of `{comp_cond1}` or `{comp_cond2}` does not exist. Check your input argument fed in with `--lfc-conds `{comp_cond1},{comp_cond2}`.\") from e\n",
" else:\n",
" raise e\n",
"ax.set_title(\"top/bot LFC correlation, Spearman\")\n",
"plt.yticks(rotation=0)\n",
"plt.xticks(rotation=90)\n",
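The notebook cell now chains the friendlier `ValueError` to the original exception (`raise ... from e`) and re-raises anything it does not recognize, so the root cause stays visible in the traceback. A minimal sketch of the pattern (the helper name is hypothetical):

```python
def check_lfc_conditions():
    try:
        run_lfc_correlation()  # hypothetical call standing in for the notebook's plotting code
    except Exception as e:
        if "cond1" in str(e):
            # Chain so the original traceback is preserved below the friendly message.
            raise ValueError("Samples for the requested --lfc-conds values do not exist.") from e
        else:
            raise e
```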
5 changes: 3 additions & 2 deletions docs/example_run_output/tiling/README.md
@@ -32,9 +32,10 @@ These are the example output of [`bean run`](https://pinellolab.github.io/crispr

## `bean_sgRNA_result.[model_type].csv`
- `name`: sgRNA ID provided in the `name` column of the input.
- `edit_rate`: Effective editing rates
- `edit_rate`: Total editing rate in the editing window. `NaN` if no editable target base exists within the window.
- `edit_rate_norm`: Total editing rate divided by the number of editable bases in the editing window. `NaN` if no editable target base exists within the window.
- `accessibility`: (Only if you have used `--scale-by-acc`) Accessibility signal that is used for scaling of the editing rate.
- `scaled_edit_rate`: (Only if you have used `--scale-by-acc`) Endogenous editing rate used for modeling, estimated by scaling the reporter editing rate by the accessibility signal.
- `[cond1]_[cond2].median_lfc`: Raw LFC with pseudocount fed in with `--guide-lfc-pseudocount` argument (default 5).
- `[replicate].[cond1]_[cond2]`: Raw per-replicate LFC with pseudocount fed in with `--guide-lfc-pseudocount` argument (default 5).
- `variants`: Variants generated by this gRNA.
- `variant_edit_rates`: Editing rate of this gRNA for each variant it creates.
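With these additions, a quick way to inspect the new columns in the per-guide output table (file name pattern per this README; the model type and condition names below are placeholders):

```python
import pandas as pd

# Placeholder file name; substitute your own model_type.
sgrna = pd.read_csv("bean_sgRNA_result.MyModel.csv")
# Total and per-editable-base editing rates added by this commit.
print(sgrna[["name", "edit_rate", "edit_rate_norm"]].head())
# Per-replicate LFC columns follow [replicate].[cond1]_[cond2],
# e.g. "rep1.top_bot" for a hypothetical replicate "rep1" and bins "top"/"bot".
```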
