From a7bf84b6bbdacefdb617ddeccff40d6601b1dc7d Mon Sep 17 00:00:00 2001 From: Jayoung Ryu Date: Mon, 4 Dec 2023 14:28:48 -0500 Subject: [PATCH 1/3] refining exon selections with assembly version --- README.md | 6 +++--- bean/annotate/translate_allele.py | 11 ++++++----- bean/annotate/utils.py | 21 +++++++++++++++------ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 8ae480b..ef03512 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profi ### Output Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)). -Allele translation +Allele translation ### Parameters * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used. @@ -188,7 +188,7 @@ bean-qc \ `bean-qc` supports following quality control and masks samples with low quality. Specifically: -Allele translation +Allele translation * Plots guide coverage and the uniformity of coverage * Guide count correlation between samples @@ -402,5 +402,5 @@ Python package `bean` supports multiple data wrangling functionalities for `Repo * Full pipeline takes 90.1s in GitHub Action for toy dataset of 2 replicates and 30 guides. ## Citation -If you have used BEAN, please cite: +If you have used BEAN for your analysis, please cite: Ryu, J. et al. Joint genotypic and phenotypic outcome modeling improves base editing variant effect quantification. medRxiv (2023) doi:10.1101/2023.09.08.23295253 diff --git a/bean/annotate/translate_allele.py b/bean/annotate/translate_allele.py index a62cedb..2e81a89 100644 --- a/bean/annotate/translate_allele.py +++ b/bean/annotate/translate_allele.py @@ -225,11 +225,11 @@ def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True): return cds @classmethod - def from_gene_name(cls, gene_name): + def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"): cds = cls() if gene_name not in cls.gene_info_dict: chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_gene_name( - gene_name + gene_name, ref_version ) cls.gene_info_dict[gene_name] = { "chrom": chrom, @@ -329,8 +329,8 @@ def get_mismatch_df(): ).drop_duplicates() -def get_cds_dict(gene_names): - return {gname: CDS.from_gene_name(gname) for gname in gene_names} +def get_cds_dict(gene_names, ref_version: str = "GRCh38"): + return {gname: CDS.from_gene_name(gname, ref_version) for gname in gene_names} def export_gene_info_to_json(gene_dict, write_path=".tmp_gene_info.csv"): @@ -418,6 +418,7 @@ def get_allele_aa_change_single_gene( fasta_file: str = None, fasta_file_id: str = None, include_synonymous: bool = True, + ref_version: str = "GRCh38", ): """ Obtain amino acid changes @@ -425,7 +426,7 @@ def get_allele_aa_change_single_gene( if gene_name is None and fasta_file is not None: cds = CDS.from_fasta(fasta_file, fasta_file_id) elif gene_name is not None and fasta_file is None: - cds = CDS.from_gene_name(gene_name) + cds = CDS.from_gene_name(gene_name, ref_version) else: raise ValueError("Only one of `gene_name` or `fasta_file` should be provided.") return cds.get_aa_change(allele, include_synonymous) diff --git a/bean/annotate/utils.py b/bean/annotate/utils.py index 207fbb9..436a012 100644 --- a/bean/annotate/utils.py +++ b/bean/annotate/utils.py @@ -97,7 +97,7 @@ def get_mane_transcript_id(gene_name: str): return mane_transcript_id, id_version -def get_exons_from_transcript_id(transcript_id: str, id_version: int): +def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"): """ Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version. @@ -112,9 +112,18 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int): response = requests.get(api_url, headers={"Content-Type": "application/json"}) transcript_json = response.json() if transcript_json["count"] != 1: - raise ValueError( - f"Non-unique entry for transcript ID and version:\n{transcript_json}" - ) + if transcript_json["count"] > 1: + api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons" + response = requests.get(api_url, headers={"Content-Type": "application/json"}) + transcript_json = response.json() + if transcript_json["count"] != 1: + raise ValueError( + f"Non-unique entry for transcript ID , version {id_version} and assembly {ref_version}:\n{transcript_json}" + ) + else: + raise ValueError( + f"No entry found for transcript ID {transcript_id}, version {id_version} and assembly {ref_version}" + ) transcript_record = transcript_json["results"][0] exons_list = transcript_record["exons"] strand = transcript_record["loc_strand"] # +1/-1 @@ -186,11 +195,11 @@ def get_exon_pos_seq(exon_id, id_version, cds_start, cds_end, ref_version="GRCh3 return chrom, list(sequence), list(range(start_pos, end_pos + 1)) -def get_cds_seq_pos_from_gene_name(gene_name: str): +def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"): transcript_id, id_version = get_mane_transcript_id(gene_name) print(f"MANE transcript ID {transcript_id} for {gene_name} will be used.") exons_list, cds_start, cds_end, strand = get_exons_from_transcript_id( - transcript_id, id_version + transcript_id, id_version, ref_version ) if strand == -1: exons_list = exons_list[::-1] From 41cad36466600e977245d67acd2901a2b5ca4e96 Mon Sep 17 00:00:00 2001 From: Jayoung Ryu Date: Tue, 5 Dec 2023 10:43:55 -0500 Subject: [PATCH 2/3] debug errors in sort_key assignment --- bean/preprocessing/utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bean/preprocessing/utils.py b/bean/preprocessing/utils.py index 8445f13..84e3d97 100644 --- a/bean/preprocessing/utils.py +++ b/bean/preprocessing/utils.py @@ -214,12 +214,16 @@ def _assign_rep_ids_and_sort( screen: be.ReporterScreen, rep_col: str, condition_column: str = None ) -> be.ReporterScreen: """Assign replicate IDs to samples and sort them accordingly.""" + if rep_col not in screen.samples.columns: + raise ValueError( + f"{rep_col} not in columns of ReporterScreen.samples with following columns: {screen.samples.columns}. Check your input or adjust `--replicate-col` of `bean-run` command as the existing column name specifying experimental replicates." + ) for i, rep in enumerate(sorted(screen.samples[rep_col].unique())): screen.samples.loc[screen.samples[rep_col] == rep, f"{rep_col}_id"] = i - if condition_column is None: - sort_key = f"{rep_col}_id" - else: - sort_key = [f"{rep_col}_id", f"{condition_column}_id"] + if condition_column is None: + sort_key = f"{rep_col}_id" + else: + sort_key = [f"{rep_col}_id", f"{condition_column}_id"] screen = screen[ :, screen.samples.sort_values(sort_key).index, From 3a1d2c1ad5448591b98466d79c25a8d06b3e1d64 Mon Sep 17 00:00:00 2001 From: Jayoung Ryu Date: Tue, 5 Dec 2023 16:37:47 -0500 Subject: [PATCH 3/3] prevent incorrect argument from running --- bean/model/run.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/bean/model/run.py b/bean/model/run.py index 68fc672..0cd7808 100644 --- a/bean/model/run.py +++ b/bean/model/run.py @@ -309,6 +309,25 @@ def check_args(args, bdata): raise ValueError( f"{args.bdata_path} does not have specified sample mask column {args.sample_mask_col} in .samples" ) + if args.condition_col not in bdata.samples.columns: + raise ValueError( + f"Condition column set by `--condition-col` {args.condition_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input." + ) + if args.control_condition_label not in bdata.samples[args.condition_col].tolist(): + raise ValueError( + f"No sample has control label (set by `--control-condition-label`) {args.control_condition_label} in ReporterScreen.samples[{args.condition_col}]: {bdata.samples[args.condition_col]}. Check your input." + ) + if args.replicate_col not in bdata.samples.columns: + raise ValueError( + f"Condition column set by `--replicate-col` {args.replicate_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input." + ) + if ( + args.control_guide_tag is not None + and not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any() + ): + raise ValueError( + f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input." + ) if args.alpha_if_overdispersion_fitting_fails is not None: try: b0, b1 = args.alpha_if_overdispersion_fitting_fails.split(",")