diff --git a/correct_primer_positions.py b/correct_primer_positions.py new file mode 100644 index 0000000..fbdf8c8 --- /dev/null +++ b/correct_primer_positions.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +"""correct_primer_positions.py +Namuun Battur +naominamuun@gmail.com +required packages: pandas, biopython, fuzzysearch, sys, os.path +""" + +from fuzzysearch import find_near_matches +from Bio import SeqIO +import pandas as pd +from sys import argv +import os.path + +def verify(sequence): + '''This code verifies if a sequence is a DNA''' + + # set the input sequence + seq = set(sequence) + + # confirm if its elements is equal to the set of valid DNA bases + # Use a union method to ensure the sequence is verified if does not + # contain all the bases + if seq == {"A", "T", "C", "G"}.union(seq): + return "DNA" + else: + return "Invalid DNA sequence" + +def rev_comp_if(seq): +# adjusted code from https://www.geeksforgeeks.org/reverse-complement-of-dna-strand-using-python/ + comp = [] + if verify(seq) == "DNA": + for base in seq: + if base == "A": comp.append("T") + elif base == "G": comp.append("C") + elif base == "T": comp.append("A") + elif base == "C": comp.append("G") + ### mask ambigious bases with N + elif base == "Y": comp.append("N") + elif base == "R": comp.append("N") + elif base == "S": comp.append("N") + elif base == "W": comp.append("N") + elif base == "K": comp.append("N") + elif base == "M": comp.append("N") + elif base == "B": comp.append("N") + elif base == "D": comp.append("N") + elif base == "H": comp.append("N") + elif base == "V": comp.append("N") + elif base == "N": comp.append("N") + else: + return "Invalid DNA Sequence" + + # reverse the sequence + comp_rev = comp[::-1] + + # convert list to string + comp_rev = "".join(comp_rev) + return comp_rev + +"""considered ambigious bases (https://www.bioinformatics.org/sms/iupac.html) +""" + +def update_positions(primer_df, record_dict): + try: + start_pos = [] + end_pos = [] + + for index, row in primer_df.iterrows(): + primer = str(row['seq']).upper() + + # reverse compliment the primer sequence + revprimer = rev_comp_if(str(row['seq']).upper()) + + # get reference sequence of the 1st entry from fasta file + reference=str(record_dict[list(record_dict)[0]].seq).upper() + + # fuzzy search primer sequence in reference sequence + # allowing max. 2 mismatches (Levenshtein distance of 2) + primer_match = find_near_matches(primer, reference, max_l_dist=2) + rev_primer_match = find_near_matches(revprimer, reference, max_l_dist=2) + + if len(primer_match) == 1: + start_pos.append(primer_match[0].start) + end_pos.append(primer_match[0].end) + elif len(rev_primer_match) == 1: + start_pos.append(rev_primer_match[0].start) + end_pos.append(rev_primer_match[0].end) + elif (len(primer_match) > 1 or len(rev_primer_match) > 1): + ## Maybe instead of raising an exception here choose the match nearest to initial primer position in varvamp bed file + raise Exception("Primer sequence found multiple times in reference sequence.") + else: + raise Exception("Primer sequence not found in reference sequence.") + + # check if primer size is still the same + seq_size = list(map(lambda a, b: a-b, end_pos, start_pos)) + if seq_size == list(primer_df['size']): + pass + else: + raise Exception("The primer size before and after the update are not identical") + except: + raise Exception("update_position not possible") + return start_pos, end_pos + + +def main(argv): + + if len(argv) != 4: + print('Please add required input files \n e.g.: python3 correct_primer_positions.py [path/to/Reference/.fasta] [path/to/primer/.tsv] [path/to/primer/.bed]') + return + else: + try: + # read reference fasta file + if os.path.exists(argv[1]): + record_dict = SeqIO.to_dict(SeqIO.parse(argv[1],"fasta")) + else: + print("FASTA file not available.") + + # read the primer tsv and bed files + if os.path.exists(argv[2]): + primer_df = pd.read_csv(argv[2], sep="\t") + else: + print("TSV file not available.") + if os.path.exists(argv[3]): + primer_bed_df = pd.read_csv(argv[3], sep="\t", header=None) + else: + print("BED file not available.") + + # update positions in bed file + s_pos, e_pos = update_positions(primer_df, record_dict) + + primer_bed_df.iloc[:,1] = s_pos + primer_bed_df.iloc[:,2] = e_pos + + primer_bed_df.to_csv(os.path.dirname(argv[3])+'/'+os.path.basename(argv[3])+'.position.corrected.bed', index=False, sep='\t', header=False) + print("Primer.bed file has been updated. See: \n"+os.path.dirname(argv[3])+'/'+os.path.basename(argv[3])+'.position.corrected.bed') + except Exception as e: # pragma: nocover + return "An error has occured: " + str(e) +if __name__ == "__main__": # pragma: nocover + main(argv) diff --git a/py_env_bed_pos_correction.yml b/py_env_bed_pos_correction.yml new file mode 100644 index 0000000..25520d5 --- /dev/null +++ b/py_env_bed_pos_correction.yml @@ -0,0 +1,48 @@ +name: py_bed +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - attrs=23.2.0=pyh71513ae_0 + - biopython=1.82=py312h98912ed_0 + - bzip2=1.0.8=hd590300_5 + - ca-certificates=2023.11.17=hbcca054_0 + - fuzzysearch=0.7.3=py312h30efb56_5 + - ld_impl_linux-64=2.40=h41732ed_0 + - libblas=3.9.0=20_linux64_openblas + - libcblas=3.9.0=20_linux64_openblas + - libexpat=2.5.0=hcb278e6_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=13.2.0=h807b86a_3 + - libgfortran-ng=13.2.0=h69a702a_3 + - libgfortran5=13.2.0=ha4646dd_3 + - libgomp=13.2.0=h807b86a_3 + - liblapack=3.9.0=20_linux64_openblas + - libnsl=2.0.1=hd590300_0 + - libopenblas=0.3.25=pthreads_h413a1c8_0 + - libsqlite=3.44.2=h2797004_0 + - libstdcxx-ng=13.2.0=h7e041cc_3 + - libuuid=2.38.1=h0b41bf4_0 + - libxcrypt=4.4.36=hd590300_1 + - libzlib=1.2.13=hd590300_5 + - ncurses=6.4=h59595ed_2 + - numpy=1.26.2=py312heda63a1_0 + - openssl=3.2.0=hd590300_1 + - pandas=2.1.4=py312hfb8ada1_0 + - pip=23.3.2=pyhd8ed1ab_0 + - python=3.12.1=hab00c5b_1_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-tzdata=2023.4=pyhd8ed1ab_0 + - python_abi=3.12=4_cp312 + - pytz=2023.3.post1=pyhd8ed1ab_0 + - readline=8.2=h8228510_1 + - setuptools=68.2.2=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - tk=8.6.13=noxft_h4845f30_101 + - tzdata=2023d=h0c530f3_0 + - wheel=0.42.0=pyhd8ed1ab_0 + - xz=5.2.6=h166bdaf_0 +prefix: /home/namuun/miniconda3/envs/py_bed