From 5217e84534216aab35dbcfd7bb62b400027b165d Mon Sep 17 00:00:00 2001 From: jykr Date: Tue, 26 Mar 2024 13:04:27 -0400 Subject: [PATCH] Debug barcode start site --- README.md | 3 ++- bean/mapping/GuideEditCounter.py | 13 ++++++------- tests/test_count.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 974e0ab..b78a3f2 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ See full detail in [Parameters](#parameters). File should contain following columns. * `name`: gRNA ID column * `sequence`: gRNA sequence -* `barcode`: R2 barcode to help match reporter to gRNA +* `barcode`: R2 barcode to help match reporter to gRNA, written in the sense direction (as in R1) * In order to use accessibility in the [variant effect quantification](#bean-run-quantify-variant-effects), provide accessibility information in one of two options. (For non-targeting guides, provide NA values (empty cell).) * Option 1: `chrom` & `genomic_pos`: Chromosome (ex. `chr19`) and genomic position of guide sequence. You will have to provide the path to the bigwig file with matching reference version in `bean-run`. * Option 2: `accessibility_signal`: ATAC-seq signal value of the target loci of each guide. @@ -153,6 +153,7 @@ Read structure * `--barcode-start-seq` (default: ''): Barcode + reporter starts after this sequence in R2, denoted as the sense direction (the same sequence direction as R1). * `--guide-start-seqs-file` (default: `None`): CSV file path with per-sample `guide_start_seq` to be used, if provided. Formatted as `sample_id, guide_start_seq` * `--guide-end-seqs-file` (default: `None`): CSV file path with per-sample `guide_end_seq` to be used, if provided. Formatted as `sample_id,guide_end_seq` +* `--guide-start-seqs-file` (default: `None`): CSV file path with per-sample `barcode_start_seq` to be used, if provided. Formatted as `sample_id, guide_start_seq` * `-l`, `--reporter-length` (default: `32`): Length of the reporter sequence. * `--gstart-reporter` (default: `6`): Start position of the guide sequence in the reporter * `--guide-bc` (default: `True`): Construct has guide barcode diff --git a/bean/mapping/GuideEditCounter.py b/bean/mapping/GuideEditCounter.py index 4ffd66a..f1d60e8 100644 --- a/bean/mapping/GuideEditCounter.py +++ b/bean/mapping/GuideEditCounter.py @@ -47,7 +47,7 @@ class NoReadsAfterQualityFiltering(Exception): strand_str_to_int = {"neg": -1, "pos": 1, "-": -1, "+": 1} - +base_revcomp = {"A":"T", "T":"A", "G":"C", "C":"G"} def _get_stranded_guide_offset(strand: int, start_pos: int, guide_len: int) -> int: if strand == -1: @@ -652,16 +652,15 @@ def get_reporter_seq_qual(self, R2_record: SeqIO.SeqRecord, R2_start = 0): def get_barcode(self, R1_seq, R2_seq): if self.barcode_start_seq != "": - barcode_start_idx = revcomp(R2_seq).replace( - self.base_edited_from, self.base_edited_to - ).find( - self.barcode_start_seq.replace(self.base_edited_from, self.base_edited_to) - ) + barcode_start_idx = R2_seq.replace( + base_revcomp[self.base_edited_from], base_revcomp[self.base_edited_to] + ).find(revcomp(self.barcode_start_seq).replace(base_revcomp[self.base_edited_from], base_revcomp[self.base_edited_to])) if barcode_start_idx == -1: return -1, "" + barcode_start_idx += len(self.barcode_start_seq) else: barcode_start_idx = 0 - return barcode_start_idx, revcomp(R2_seq[barcode_start_idx: self.guide_bc_len]) + return barcode_start_idx, revcomp(R2_seq[barcode_start_idx: (barcode_start_idx + self.guide_bc_len)]) def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str): # This should be adjusted for each experimental recipes. diff --git a/tests/test_count.py b/tests/test_count.py index bf7e685..fd9500a 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -1,5 +1,8 @@ import pytest import subprocess +from bean.mapping import GuideEditCounter +from bean.mapping.utils import _get_input_parser +from bean.mapping._supporting_fn import revcomp @pytest.mark.order(2) @@ -52,6 +55,15 @@ def test_count_samples_bcstart(): except subprocess.CalledProcessError as exc: raise exc +def test_barcode_start_idx(): + args = vars(_get_input_parser().parse_args(["-b", "A", "-f", "tests/data/test_guide_info.csv",])) + args["barcode_start_seq"] = "gtttgaattcgctagctaggtcttg" + args["R1"] = "tests/data/test_R1.fastq" + args["R2"] = "tests/data/test_R2.fastq" + gc = GuideEditCounter(**args) + sidx, bc = gc.get_barcode("", revcomp("agtggcaccgagtcggtgcttttttTAACAGTGTAATCTGGCGAGCCACTGTTCTTTGTACCAGAAgtttgaattcgctagctaggtcttgctgg".upper())) + assert sidx == 29 + assert bc == "AGAA" @pytest.mark.order(6) def test_count_samples_tiling():