add files

rki-mf1 · Jan 22, 2024 · 942152c · 942152c
commit 942152c
Show file tree

Hide file tree

Showing 2 changed files with 186 additions and 0 deletions.
diff --git a/correct_primer_positions.py b/correct_primer_positions.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+"""correct_primer_positions.py
+Namuun Battur
+[email protected]
+required packages: pandas, biopython, fuzzysearch, sys, os.path
+"""
+
+from fuzzysearch import find_near_matches
+from Bio import SeqIO
+import pandas as pd
+from sys import argv
+import os.path
+
+def verify(sequence):
+	'''This code verifies if a sequence is a DNA'''
+
+	# set the input sequence
+	seq = set(sequence)
+
+	# confirm if its elements is equal to the set of valid DNA bases
+	# Use a union method to ensure the sequence is verified if does not
+	# contain all the bases
+	if seq == {"A", "T", "C", "G"}.union(seq):
+		return "DNA"
+	else:
+		return "Invalid DNA sequence"
+
+def rev_comp_if(seq):
+# adjusted code from https://www.geeksforgeeks.org/reverse-complement-of-dna-strand-using-python/
+	comp = []
+	if verify(seq) == "DNA":
+		for base in seq:
+			if base == "A": comp.append("T")
+			elif base == "G": comp.append("C")
+			elif base == "T": comp.append("A")
+			elif base == "C": comp.append("G")
+			### mask ambigious bases with N
+			elif base == "Y": comp.append("N")
+			elif base == "R": comp.append("N")
+			elif base == "S": comp.append("N")
+			elif base == "W": comp.append("N")
+			elif base == "K": comp.append("N")
+			elif base == "M": comp.append("N")
+			elif base == "B": comp.append("N")
+			elif base == "D": comp.append("N")
+			elif base == "H": comp.append("N")
+			elif base == "V": comp.append("N")
+			elif base == "N": comp.append("N")
+	else:
+		return "Invalid DNA Sequence"
+
+	# reverse the sequence
+	comp_rev = comp[::-1]
+
+	# convert list to string
+	comp_rev = "".join(comp_rev)
+	return comp_rev
+
+"""considered ambigious bases (https://www.bioinformatics.org/sms/iupac.html)
+"""
+
+def update_positions(primer_df, record_dict):
+    try:
+        start_pos = []
+        end_pos = []
+
+        for index, row in primer_df.iterrows():
+            primer = str(row['seq']).upper()
+
+            # reverse compliment the primer sequence
+            revprimer = rev_comp_if(str(row['seq']).upper())
+
+            # get reference sequence of the 1st entry from fasta file
+            reference=str(record_dict[list(record_dict)[0]].seq).upper()
+
+            # fuzzy search primer sequence in reference sequence
+            # allowing max. 2 mismatches (Levenshtein distance of 2)
+            primer_match = find_near_matches(primer, reference, max_l_dist=2)
+            rev_primer_match = find_near_matches(revprimer, reference, max_l_dist=2)
+
+            if len(primer_match) == 1:
+                start_pos.append(primer_match[0].start)
+                end_pos.append(primer_match[0].end)
+            elif len(rev_primer_match) == 1:
+                start_pos.append(rev_primer_match[0].start)
+                end_pos.append(rev_primer_match[0].end)
+            elif (len(primer_match) > 1 or len(rev_primer_match) > 1):
+                ## Maybe instead of raising an exception here choose the match nearest to initial primer position in varvamp bed file
+                raise Exception("Primer sequence found multiple times in reference sequence.")
+            else:
+                raise Exception("Primer sequence not found in reference sequence.")
+
+        # check if primer size is still the same
+        seq_size = list(map(lambda a, b: a-b, end_pos, start_pos))
+        if seq_size == list(primer_df['size']):
+            pass
+        else:
+            raise Exception("The primer size before and after the update are not identical")
+    except:
+        raise Exception("update_position not possible")
+    return start_pos, end_pos
+
+
+def main(argv):
+
+    if len(argv) != 4:
+        print('Please add required input files \n e.g.: python3 correct_primer_positions.py [path/to/Reference/.fasta] [path/to/primer/.tsv] [path/to/primer/.bed]')
+        return
+    else:  
+        try:
+			#  read reference fasta file
+            if os.path.exists(argv[1]):
+                record_dict = SeqIO.to_dict(SeqIO.parse(argv[1],"fasta"))
+            else:
+                print("FASTA file not available.")
+
+			# read the primer tsv and bed files
+            if os.path.exists(argv[2]):
+                primer_df = pd.read_csv(argv[2], sep="\t")
+            else:
+                print("TSV file not available.")
+            if os.path.exists(argv[3]):
+                primer_bed_df = pd.read_csv(argv[3], sep="\t", header=None)
+            else:
+                print("BED file not available.")
+
+			# update positions in bed file
+            s_pos, e_pos = update_positions(primer_df, record_dict)
+
+            primer_bed_df.iloc[:,1] = s_pos
+            primer_bed_df.iloc[:,2] = e_pos
+
+            primer_bed_df.to_csv(os.path.dirname(argv[3])+'/'+os.path.basename(argv[3])+'.position.corrected.bed', index=False, sep='\t', header=False)
+            print("Primer.bed file has been updated. See: \n"+os.path.dirname(argv[3])+'/'+os.path.basename(argv[3])+'.position.corrected.bed')
+        except Exception as e:  # pragma: nocover
+            return "An error has occured: " + str(e)
+if __name__ == "__main__":  # pragma: nocover
+    main(argv)
diff --git a/py_env_bed_pos_correction.yml b/py_env_bed_pos_correction.yml
@@ -0,0 +1,48 @@
+name: py_bed
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_gnu
+  - attrs=23.2.0=pyh71513ae_0
+  - biopython=1.82=py312h98912ed_0
+  - bzip2=1.0.8=hd590300_5
+  - ca-certificates=2023.11.17=hbcca054_0
+  - fuzzysearch=0.7.3=py312h30efb56_5
+  - ld_impl_linux-64=2.40=h41732ed_0
+  - libblas=3.9.0=20_linux64_openblas
+  - libcblas=3.9.0=20_linux64_openblas
+  - libexpat=2.5.0=hcb278e6_1
+  - libffi=3.4.2=h7f98852_5
+  - libgcc-ng=13.2.0=h807b86a_3
+  - libgfortran-ng=13.2.0=h69a702a_3
+  - libgfortran5=13.2.0=ha4646dd_3
+  - libgomp=13.2.0=h807b86a_3
+  - liblapack=3.9.0=20_linux64_openblas
+  - libnsl=2.0.1=hd590300_0
+  - libopenblas=0.3.25=pthreads_h413a1c8_0
+  - libsqlite=3.44.2=h2797004_0
+  - libstdcxx-ng=13.2.0=h7e041cc_3
+  - libuuid=2.38.1=h0b41bf4_0
+  - libxcrypt=4.4.36=hd590300_1
+  - libzlib=1.2.13=hd590300_5
+  - ncurses=6.4=h59595ed_2
+  - numpy=1.26.2=py312heda63a1_0
+  - openssl=3.2.0=hd590300_1
+  - pandas=2.1.4=py312hfb8ada1_0
+  - pip=23.3.2=pyhd8ed1ab_0
+  - python=3.12.1=hab00c5b_1_cpython
+  - python-dateutil=2.8.2=pyhd8ed1ab_0
+  - python-tzdata=2023.4=pyhd8ed1ab_0
+  - python_abi=3.12=4_cp312
+  - pytz=2023.3.post1=pyhd8ed1ab_0
+  - readline=8.2=h8228510_1
+  - setuptools=68.2.2=pyhd8ed1ab_0
+  - six=1.16.0=pyh6c4a22f_0
+  - tk=8.6.13=noxft_h4845f30_101
+  - tzdata=2023d=h0c530f3_0
+  - wheel=0.42.0=pyhd8ed1ab_0
+  - xz=5.2.6=h166bdaf_0
+prefix: /home/namuun/miniconda3/envs/py_bed