Skip to content

Commit

Permalink
Merge pull request #52 from VDBWRAIR/maaps
Browse files Browse the repository at this point in the history
Maaps
  • Loading branch information
necrolyte2 committed Oct 30, 2015
2 parents 64c555f + 1d3f067 commit 2f52f46
Show file tree
Hide file tree
Showing 13 changed files with 925 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ script:
- nosetests tests --with-coverage --cover-erase --cover-package=bio_pieces -a '!download'
- pybot tests/*.robot
after_success:
- coveralls
- coveralls
32 changes: 32 additions & 0 deletions Den4_MAAPS_TestData16.fasta

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bio_pieces/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
except ImportError:
from io import BytesIO

from future.builtins import map, filter
from future.builtins import map, filter, zip

try:
import unittest2 as unittest
Expand Down
270 changes: 270 additions & 0 deletions bio_pieces/ctleptop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
#!/usr/bin/env python
# encoding: utf-8
"""
ctleptop.py -i [FASTA FILE] > Out_file.txt
Created by Dereje Jima on May 21, 2015
"""
from __future__ import division
from __future__ import print_function
from Bio.Seq import *
from Bio.Alphabet import IUPAC
from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
#from itertools import groupby
from Bio.Data import CodonTable
from Bio.Data.IUPACData import ambiguous_dna_values
#import yaml
import argparse
from bio_pieces import degen
from functools import partial
from tabulate import tabulate
from bio_pieces.compat import zip
import re

__docformat__ = "restructuredtext en"

AMBICODON = {"R": ["A", "G"], "Y": ["C", "T"],
"W": ["A", "T"], "S": ["G", "C"],
"K": ["T", "G"],
"M": ["C", "A"], "D": ["A", "T", "G"],
"V": ["A", "C", "G"], "H": ["A", "C", "T"],
"B": ["C", "G", "T"], "N": ["A", "C", "T", "G"]}

'''
def readFasta(fasta_name):
"""string -> tuple
Given a fasta file. Yield tuples of header, sequence
:fasta_name: Name of a file to read
:returns: tuple of fasta header and sequence line
"""
fh = open(fasta_name)
# ditch the boolean (x[0]) and just keep the header of sequence since we
# know they alternate
fasta_iter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
for header in fasta_iter:
# drop the ">"
header = header.next()[1:].strip()
# join all sequence line to one
seq = "".join(s.strip() for s in fasta_iter.next())
yield header, seq
'''


def getNearbyChars(nt):
"""(str)->(list)
>>>getNearbyChars("R")
['A', 'G']
>>>getNearbyChars("Y")
['C', 'T']
>>>getNearbyChars("A")
['A']
"""
return AMBICODON.get(nt) or nt


def nearbyPermutations(letters, index=0):
"""(str)->(set)
>>>nearbyPermutations("AAR")
set(['AAG', 'AAA'])
>>>nearbyPermutations("ARR")
set(['AGG', 'AAG', 'AAA', 'AGA'])
nearbyPermutations("AAA")
set(['AAA'])
"""
if (index >= len(letters)):
return set([''])
subWords = nearbyPermutations(letters, index + 1)
nearbyLetters = getNearbyChars(letters[index])
return permutations(subWords, nearbyLetters)


def permutations(subWords, nearbyLetters):
"""(set, list) -> (set)
>>>permutations(set(['CA']), ['A', 'T'])
set(['ACA', 'TCA'])
"""
permutations = set()
for subWord in subWords:
for letter in nearbyLetters:
permutations.add(letter + subWord)
return permutations


def getaalist(codonlist):
"""(list) -> (list)
Return aa list from a a given nt codon list.
>>>getaalist(['AAA','ACT'])
['K', 'T']
"""
aalist = []
for codon in codonlist:
aa = Seq(codon, IUPAC.unambiguous_dna)
aa = str(translate(aa))
aalist.append(aa)
return aalist


def list_overlap(list1, list2):
"""(str, list) -> bool
Return True if the two list hava element that overlaps.
>>>list_overlap('RAC',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
True
>>>list_overlap('ACT',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
False
"""
for i in list1:
if i in list2:
return True
return False


# define our method
#def replace_all(text, dic):
#"""(str, dict)-> (str)
#>>>replace_all()
#"""
#for i, j in dic.iteritems():
#text = text.replace(i, j)
#return text


def access_mixed_aa(file_name):
"""(str) ->(list,list,list,list).
Return a list of amino acide code for ambiguous dna codon, position of
ambiguous nt codon, aa name,seq id from fasta header by reading multifasta
nucleotide fasta file
"""
from Bio import SeqIO
aa = []
nucleotide_idx = []
nucl_codon = []
seqids = []
for seq_record in SeqIO.parse(file_name, 'fasta'):
seq_id = seq_record.id
seqline = str(seq_record.seq)
seqline = seqline.replace("-", "N")
n = 3
codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
ambi_nucl = AMBICODON.keys()
for key, codon in sorted(codon_list.items()):
if list_overlap(codon, ambi_nucl):
d, e, f = codon
m = [d, e, f]
items = [i for i in m if i in ambi_nucl]
indexm = m.index(items[0])
for idx, val in enumerate(items):
codonlist = list(nearbyPermutations(codon))
val = getaalist(codonlist)
# remove if aa codon is the same eg. ['D', 'D']
val = set(val)
val = "/".join(sorted(val)) # yeild 'I/L'

key = key - 2 + indexm
if '/' in val:
nucleotide_idx.append(key)
nucl_codon.append(codon)
seqids.append(seq_id)
# if "/" in val and indexm == 2:
# key = key
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# elif "/" in val and indexm == 1:
# key = key - 1
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# elif "/" in val and indexm == 0:
# key = key - 2
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# else:
# pass
aa.append(val)

else:
# print "codon3 ..." ,codon
aa1 = Seq(codon, IUPAC.unambiguous_dna)
aa1 = aa1.translate()
aa1 = str(aa1)
aa.append(aa1)
#print aa, nucleotide_idx, nucl_codon, seqids
return aa, nucleotide_idx, nucl_codon, seqids


def create_args():
"""
Return command line arguments
"""
parser = argparse.ArgumentParser(description='Convert inframe nucleotide \
fasta file to protein and report mixed \
(ambiguous codon) with its location in \
the sequence', epilog = 'ctleptop -i \
tests/Den4_MAAPS_TestData16.fasta -o \
out_file.txt')
parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
parser.add_argument("-o", type=str, help="output file name", required=True)
parser.add_argument("--gb-file", type=str, help="genbank file name")
parser.add_argument("--gb-id", type=str, help="genabnk accession id")
parser.add_argument("--tab-file", type=str, help="gene tab/csv file")
return parser.parse_args()


def isGap(aalist, nclist):
"""(list, list) -> (list)
Return an updated protien codon list if the gap found in the nc codon
>>>isGap(["K/R", "I/T"], ["ARG", "NNN"])
["K/R", "GAPFOUND"]
"""
uaalist = []
for indx, val in enumerate(nclist):
if "N" in val:
codon = "GAPFOUND"
uaalist.append(codon)
else:
codon = aalist[indx]
uaalist.append(codon)
return uaalist

def open_f(filename):
return open(filename, 'w+')

def main():
args = create_args()
file_name = args.i
outfile = args.o
#print("Start processing and writing the output file to", outfile, " please please wait ... ")
outf = open_f(outfile)
my_list = access_mixed_aa(file_name)
aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(file_name)
pattern = re.compile(r'.+\/.+')
amb_aa_codon = []
#amb_aa_indx = []
for indx, letter in enumerate(aa):
if pattern.match(letter):
amb_aa_codon.append(letter)
#amb_aa_indx.append(indx + 1)
amb_aa_codon=isGap(amb_aa_codon, nucl_codon)
amb_aa_indx = map(lambda x: x//3 + 1, nuc_idx)

if args.gb_id or args.gb_file or args.tab_file:
reference_genes = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
overlapped_genes = degen.get_degen_list_overlap( reference_genes, nuc_idx)
my_list = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
outf.write(tabulate(my_list, headers=['seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition', 'gene name']) + "\n")
else:
print("Warning, no gene information supplied.")
my_list = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon)
outf.write(tabulate(my_list, headers=['seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition']) + "\n")
outf.close()


if __name__ == '__main__':
main()
18 changes: 15 additions & 3 deletions bio_pieces/degen.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,28 @@ def open_generic_csv(csvfile):

csv_file_to_genes = compose(partial(map, row_to_gene), open_generic_csv, open)

def get_degen_list_overlap(genes, _degen_positions):
'''
:param iterable genes: iterable of genes with attributes `start`, `end`, `name`
:param itrable _degen_positions: degenerate positions
:return generator of tuples of form: (gene name, position, nt)... where degens and genes overlap.
'''
genes, _degen_positions = list(genes), list(_degen_positions)
def get_intersect(pos):
intersects = lambda gene, pos=pos: gene if gene.start <= pos <= gene.end else None
matches = list(filter(bool, map(intersects, genes)))
return '-' if not matches else matches[0].name
return map(get_intersect, _degen_positions)

def get_gene_degen_overlap_info(genes, seq):
'''
:param iterable genes: iterable of genes with attributes `start`, `end`, `name`
:param str seq: nucleotide sequence
:return list of tuples of form: (gene name, position, nt)... where degens and genes overlap.
:return generator of tuples of form: (gene name, position, nt)... where degens and genes overlap.
'''
_degen_positions = degen_positions(str(seq))
perms = product(genes, _degen_positions)
result = ((gene.name, pos, seq[pos]) for gene, pos in perms if gene.start <= pos <= gene.end)
return result
return ((gene.name, pos, seq[pos]) for gene, pos in perms if gene.start <= pos <= gene.end)

def get_genes(ref_id=None, genbank_file=None, user_file=None):
'''
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ schema
pyvcf
sh
funcy
tabulate
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
'beast_checkpoint = bio_pieces.beast_checkpoint:main',
'beast_wrapper = bio_pieces.beast_wrapper:beast_wrapper',
'beast_est_time = bio_pieces.beast_wrapper:beast_est_time',
'ctleptop = bio_pieces.ctleptop:main',
'parallel_blast = bio_pieces.parallel_blast:main',
'version = bio_pieces.version:main',
'degen = bio_pieces.degen:main'
Expand Down
Loading

0 comments on commit 2f52f46

Please sign in to comment.