Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maaps #52

Merged
merged 44 commits into from
Oct 30, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
587adf7
Added initial ctleptop script #22
demis001 May 27, 2015
4f611ff
Test data for ctleptop #22
demis001 May 27, 2015
eb77e21
Added a test code #22
demis001 Jun 10, 2015
2622d0d
commented unnecssary routines #22
demis001 Jun 10, 2015
02f8ff7
added ctleptop to setup.py #22
demis001 Jun 10, 2015
521cb15
Updated the final tabular print out #22
demis001 Jun 17, 2015
2a110c2
Merge pull request #38 from VDBWRAIR/dev
averagehat Jul 2, 2015
ae16442
Added multiple fasta test file #24
demis001 Jul 20, 2015
199464a
support multi-seq-file
demis001 Jul 28, 2015
847e715
Working version for multifasta
demis001 Aug 3, 2015
b85d55c
Commented testing lines
demis001 Aug 3, 2015
309ba0c
Cleaned the code, working for multiple fastafile
demis001 Aug 4, 2015
bf6a2f0
Updated the setup.py #24
Aug 6, 2015
1e7ab5f
Updated the ctleptopy, support multifasta #24
Aug 6, 2015
15352a0
Updated the test_ctleptopy, support multifasta #24
Aug 6, 2015
8665d78
updated the test script #24
demis001 Aug 10, 2015
aace281
Updated main script #24
demis001 Aug 10, 2015
a3ba5cc
Updated the test script #24
demis001 Aug 10, 2015
bd6e974
Updated the requirment file #24
demis001 Aug 10, 2015
0e4934c
Added the diverse test data #24
demis001 Aug 10, 2015
3c895a2
updated the test_ctleptop.py #24
demis001 Aug 11, 2015
46cd779
Merged two test file with different complexity #24
demis001 Aug 11, 2015
b7eff72
Updated test_ctleptop.py still 43% coverage #24
demis001 Aug 11, 2015
12098a8
Increased test coverge to 43 #24
demis001 Aug 12, 2015
cb42f4d
Mock main function #24
demis001 Aug 13, 2015
1aed61f
Added additional test #24
demis001 Aug 14, 2015
59fb4f5
Added help message #24
demis001 Aug 21, 2015
42ef8ca
Merge remote branch 'upstream/dev' into maaps
averagehat Aug 21, 2015
15471bd
Added function to accept list of degen positions for maaps
averagehat Aug 21, 2015
3afeb38
Added (optional) degen positions to ctleptop output
averagehat Aug 21, 2015
56b2113
bare-bones test for ctleptop
averagehat Aug 21, 2015
2fbff00
various fixes
averagehat Aug 21, 2015
8561f76
fixed incorrect imports
averagehat Aug 21, 2015
8416416
Update requirements.txt
averagehat Aug 21, 2015
0514867
code cleaning
averagehat Aug 21, 2015
ac0b54d
fix filepath
averagehat Aug 21, 2015
d315951
added zip
averagehat Aug 21, 2015
c87a308
reverted travis
averagehat Aug 21, 2015
c15a98f
Merge branch 'dev' of https://github.com/VDBWRAIR/bio_pieces into maaps
averagehat Aug 21, 2015
0793813
Merge branch 'maaps' of https://github.com/VDBWRAIR/bio_pieces into m…
averagehat Aug 21, 2015
8888304
fixes problem listed in PR #52 with AA index
averagehat Aug 26, 2015
5fc29da
replaced ctletop test file for pybot test
averagehat Oct 28, 2015
1a38328
changed pybot name
averagehat Oct 28, 2015
1d3f067
fixed float/integer division error in py3
averagehat Oct 28, 2015
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ script:
- nosetests tests --with-coverage --cover-erase --cover-package=bio_pieces -a '!download'
- pybot tests/*.robot
after_success:
- coveralls
- coveralls
32 changes: 32 additions & 0 deletions Den4_MAAPS_TestData16.fasta

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bio_pieces/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
except ImportError:
from io import BytesIO

from future.builtins import map, filter
from future.builtins import map, filter, zip

try:
import unittest2 as unittest
Expand Down
270 changes: 270 additions & 0 deletions bio_pieces/ctleptop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
#!/usr/bin/env python
# encoding: utf-8
"""
ctleptop.py -i [FASTA FILE] > Out_file.txt

Created by Dereje Jima on May 21, 2015
"""
from __future__ import division
from __future__ import print_function
from Bio.Seq import *
from Bio.Alphabet import IUPAC
from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
#from itertools import groupby
from Bio.Data import CodonTable
from Bio.Data.IUPACData import ambiguous_dna_values
#import yaml
import argparse
from bio_pieces import degen
from functools import partial
from tabulate import tabulate
from bio_pieces.compat import zip
import re

__docformat__ = "restructuredtext en"

AMBICODON = {"R": ["A", "G"], "Y": ["C", "T"],
"W": ["A", "T"], "S": ["G", "C"],
"K": ["T", "G"],
"M": ["C", "A"], "D": ["A", "T", "G"],
"V": ["A", "C", "G"], "H": ["A", "C", "T"],
"B": ["C", "G", "T"], "N": ["A", "C", "T", "G"]}

'''
def readFasta(fasta_name):
"""string -> tuple
Given a fasta file. Yield tuples of header, sequence
:fasta_name: Name of a file to read
:returns: tuple of fasta header and sequence line

"""
fh = open(fasta_name)
# ditch the boolean (x[0]) and just keep the header of sequence since we
# know they alternate
fasta_iter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
for header in fasta_iter:
# drop the ">"
header = header.next()[1:].strip()
# join all sequence line to one
seq = "".join(s.strip() for s in fasta_iter.next())
yield header, seq
'''


def getNearbyChars(nt):
"""(str)->(list)
>>>getNearbyChars("R")
['A', 'G']
>>>getNearbyChars("Y")
['C', 'T']
>>>getNearbyChars("A")
['A']
"""
return AMBICODON.get(nt) or nt


def nearbyPermutations(letters, index=0):
"""(str)->(set)
>>>nearbyPermutations("AAR")
set(['AAG', 'AAA'])
>>>nearbyPermutations("ARR")
set(['AGG', 'AAG', 'AAA', 'AGA'])
nearbyPermutations("AAA")
set(['AAA'])
"""
if (index >= len(letters)):
return set([''])
subWords = nearbyPermutations(letters, index + 1)
nearbyLetters = getNearbyChars(letters[index])
return permutations(subWords, nearbyLetters)


def permutations(subWords, nearbyLetters):
"""(set, list) -> (set)
>>>permutations(set(['CA']), ['A', 'T'])
set(['ACA', 'TCA'])
"""
permutations = set()
for subWord in subWords:
for letter in nearbyLetters:
permutations.add(letter + subWord)
return permutations


def getaalist(codonlist):
"""(list) -> (list)
Return aa list from a a given nt codon list.
>>>getaalist(['AAA','ACT'])
['K', 'T']
"""
aalist = []
for codon in codonlist:
aa = Seq(codon, IUPAC.unambiguous_dna)
aa = str(translate(aa))
aalist.append(aa)
return aalist


def list_overlap(list1, list2):
"""(str, list) -> bool
Return True if the two list hava element that overlaps.

>>>list_overlap('RAC',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
True
>>>list_overlap('ACT',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
False

"""
for i in list1:
if i in list2:
return True
return False


# define our method
#def replace_all(text, dic):
#"""(str, dict)-> (str)
#>>>replace_all()
#"""
#for i, j in dic.iteritems():
#text = text.replace(i, j)
#return text


def access_mixed_aa(file_name):
"""(str) ->(list,list,list,list).
Return a list of amino acide code for ambiguous dna codon, position of
ambiguous nt codon, aa name,seq id from fasta header by reading multifasta
nucleotide fasta file
"""
from Bio import SeqIO
aa = []
nucleotide_idx = []
nucl_codon = []
seqids = []
for seq_record in SeqIO.parse(file_name, 'fasta'):
seq_id = seq_record.id
seqline = str(seq_record.seq)
seqline = seqline.replace("-", "N")
n = 3
codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
ambi_nucl = AMBICODON.keys()
for key, codon in sorted(codon_list.items()):
if list_overlap(codon, ambi_nucl):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that there is some redundancy here

I think this can replace the if statement and should be quite a bit faster than the current implementation

items = set(m).intersection(ambi_nucl)
if items:

d, e, f = codon
m = [d, e, f]
items = [i for i in m if i in ambi_nucl]
indexm = m.index(items[0])
for idx, val in enumerate(items):
codonlist = list(nearbyPermutations(codon))
val = getaalist(codonlist)
# remove if aa codon is the same eg. ['D', 'D']
val = set(val)
val = "/".join(sorted(val)) # yeild 'I/L'

key = key - 2 + indexm
if '/' in val:
nucleotide_idx.append(key)
nucl_codon.append(codon)
seqids.append(seq_id)
# if "/" in val and indexm == 2:
# key = key
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# elif "/" in val and indexm == 1:
# key = key - 1
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# elif "/" in val and indexm == 0:
# key = key - 2
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# else:
# pass
aa.append(val)

else:
# print "codon3 ..." ,codon
aa1 = Seq(codon, IUPAC.unambiguous_dna)
aa1 = aa1.translate()
aa1 = str(aa1)
aa.append(aa1)
#print aa, nucleotide_idx, nucl_codon, seqids
return aa, nucleotide_idx, nucl_codon, seqids


def create_args():
"""
Return command line arguments

"""
parser = argparse.ArgumentParser(description='Convert inframe nucleotide \
fasta file to protein and report mixed \
(ambiguous codon) with its location in \
the sequence', epilog = 'ctleptop -i \
tests/Den4_MAAPS_TestData16.fasta -o \
out_file.txt')
parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
parser.add_argument("-o", type=str, help="output file name", required=True)
parser.add_argument("--gb-file", type=str, help="genbank file name")
parser.add_argument("--gb-id", type=str, help="genabnk accession id")
parser.add_argument("--tab-file", type=str, help="gene tab/csv file")
return parser.parse_args()


def isGap(aalist, nclist):
"""(list, list) -> (list)
Return an updated protien codon list if the gap found in the nc codon
>>>isGap(["K/R", "I/T"], ["ARG", "NNN"])
["K/R", "GAPFOUND"]
"""
uaalist = []
for indx, val in enumerate(nclist):
if "N" in val:
codon = "GAPFOUND"
uaalist.append(codon)
else:
codon = aalist[indx]
uaalist.append(codon)
return uaalist

def open_f(filename):
return open(filename, 'w+')

def main():
args = create_args()
file_name = args.i
outfile = args.o
#print("Start processing and writing the output file to", outfile, " please please wait ... ")
outf = open_f(outfile)
my_list = access_mixed_aa(file_name)
aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(file_name)
pattern = re.compile(r'.+\/.+')
amb_aa_codon = []
#amb_aa_indx = []
for indx, letter in enumerate(aa):
if pattern.match(letter):
amb_aa_codon.append(letter)
#amb_aa_indx.append(indx + 1)
amb_aa_codon=isGap(amb_aa_codon, nucl_codon)
amb_aa_indx = map(lambda x: x//3 + 1, nuc_idx)

if args.gb_id or args.gb_file or args.tab_file:
reference_genes = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
overlapped_genes = degen.get_degen_list_overlap( reference_genes, nuc_idx)
my_list = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
outf.write(tabulate(my_list, headers=['seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition', 'gene name']) + "\n")
else:
print("Warning, no gene information supplied.")
my_list = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon)
outf.write(tabulate(my_list, headers=['seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition']) + "\n")
outf.close()


if __name__ == '__main__':
main()
18 changes: 15 additions & 3 deletions bio_pieces/degen.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,28 @@ def open_generic_csv(csvfile):

csv_file_to_genes = compose(partial(map, row_to_gene), open_generic_csv, open)

def get_degen_list_overlap(genes, _degen_positions):
'''
:param iterable genes: iterable of genes with attributes `start`, `end`, `name`
:param itrable _degen_positions: degenerate positions
:return generator of tuples of form: (gene name, position, nt)... where degens and genes overlap.
'''
genes, _degen_positions = list(genes), list(_degen_positions)
def get_intersect(pos):
intersects = lambda gene, pos=pos: gene if gene.start <= pos <= gene.end else None
matches = list(filter(bool, map(intersects, genes)))
return '-' if not matches else matches[0].name
return map(get_intersect, _degen_positions)

def get_gene_degen_overlap_info(genes, seq):
'''
:param iterable genes: iterable of genes with attributes `start`, `end`, `name`
:param str seq: nucleotide sequence
:return list of tuples of form: (gene name, position, nt)... where degens and genes overlap.
:return generator of tuples of form: (gene name, position, nt)... where degens and genes overlap.
'''
_degen_positions = degen_positions(str(seq))
perms = product(genes, _degen_positions)
result = ((gene.name, pos, seq[pos]) for gene, pos in perms if gene.start <= pos <= gene.end)
return result
return ((gene.name, pos, seq[pos]) for gene, pos in perms if gene.start <= pos <= gene.end)

def get_genes(ref_id=None, genbank_file=None, user_file=None):
'''
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ schema
pyvcf
sh
funcy
tabulate
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
'beast_checkpoint = bio_pieces.beast_checkpoint:main',
'beast_wrapper = bio_pieces.beast_wrapper:beast_wrapper',
'beast_est_time = bio_pieces.beast_wrapper:beast_est_time',
'ctleptop = bio_pieces.ctleptop:main',
'parallel_blast = bio_pieces.parallel_blast:main',
'version = bio_pieces.version:main',
'degen = bio_pieces.degen:main'
Expand Down
Loading