Merge pull request #52 from VDBWRAIR/maaps

Maaps
VDBWRAIR · Oct 30, 2015 · 2f52f46 · 2f52f46
2 parents 64c555f + 1d3f067
commit 2f52f46
Show file tree

Hide file tree

Showing 13 changed files with 925 additions and 5 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -13,4 +13,4 @@ script:
     - nosetests tests --with-coverage --cover-erase --cover-package=bio_pieces -a '!download'
     - pybot tests/*.robot
 after_success:
-    - coveralls
+    - coveralls 
diff --git a/Den4_MAAPS_TestData16.fasta b/Den4_MAAPS_TestData16.fasta
diff --git a/bio_pieces/compat.py b/bio_pieces/compat.py
@@ -8,7 +8,7 @@
 except ImportError:
     from io import BytesIO
 
-from future.builtins import map, filter
+from future.builtins import map, filter, zip
 
 try:
     import unittest2 as unittest

diff --git a/bio_pieces/ctleptop.py b/bio_pieces/ctleptop.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+ctleptop.py -i [FASTA FILE] > Out_file.txt
+
+Created by Dereje Jima on May 21, 2015
+"""
+from __future__ import division
+from __future__ import print_function
+from Bio.Seq import *
+from Bio.Alphabet import IUPAC
+from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
+#from itertools import groupby
+from Bio.Data import CodonTable
+from Bio.Data.IUPACData import ambiguous_dna_values
+#import yaml
+import argparse
+from bio_pieces import degen
+from functools import partial
+from tabulate import tabulate
+from bio_pieces.compat import zip
+import re
+
+__docformat__ = "restructuredtext en"
+
+AMBICODON = {"R": ["A", "G"], "Y": ["C", "T"],
+             "W": ["A", "T"], "S": ["G", "C"],
+             "K": ["T", "G"],
+             "M": ["C", "A"], "D": ["A", "T", "G"],
+             "V": ["A", "C", "G"], "H": ["A", "C", "T"],
+             "B": ["C", "G", "T"], "N": ["A", "C", "T", "G"]}
+
+'''
+def readFasta(fasta_name):
+    """string -> tuple
+    Given a fasta file. Yield tuples of header, sequence
+    :fasta_name: Name of a file to read
+    :returns: tuple of fasta header and sequence line
+
+    """
+    fh = open(fasta_name)
+    # ditch the boolean (x[0]) and just keep the header of sequence since we
+    # know they alternate
+    fasta_iter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
+    for header in fasta_iter:
+        # drop the ">"
+        header = header.next()[1:].strip()
+        # join all sequence line to one
+        seq = "".join(s.strip() for s in fasta_iter.next())
+        yield header, seq
+'''
+
+
+def getNearbyChars(nt):
+    """(str)->(list)
+    >>>getNearbyChars("R")
+    ['A', 'G']
+    >>>getNearbyChars("Y")
+    ['C', 'T']
+    >>>getNearbyChars("A")
+    ['A']
+    """
+    return AMBICODON.get(nt) or nt
+
+
+def nearbyPermutations(letters, index=0):
+    """(str)->(set)
+    >>>nearbyPermutations("AAR")
+    set(['AAG', 'AAA'])
+    >>>nearbyPermutations("ARR")
+    set(['AGG', 'AAG', 'AAA', 'AGA'])
+    nearbyPermutations("AAA")
+    set(['AAA'])
+    """
+    if (index >= len(letters)):
+        return set([''])
+    subWords = nearbyPermutations(letters, index + 1)
+    nearbyLetters = getNearbyChars(letters[index])
+    return permutations(subWords, nearbyLetters)
+
+
+def permutations(subWords, nearbyLetters):
+    """(set, list) -> (set)
+    >>>permutations(set(['CA']), ['A', 'T'])
+    set(['ACA', 'TCA'])
+    """
+    permutations = set()
+    for subWord in subWords:
+        for letter in nearbyLetters:
+            permutations.add(letter + subWord)
+    return permutations
+
+
+def getaalist(codonlist):
+    """(list) -> (list)
+    Return aa list from a a given nt codon list.
+    >>>getaalist(['AAA','ACT'])
+    ['K', 'T']
+    """
+    aalist = []
+    for codon in codonlist:
+        aa = Seq(codon, IUPAC.unambiguous_dna)
+        aa = str(translate(aa))
+        aalist.append(aa)
+    return aalist
+
+
+def list_overlap(list1, list2):
+    """(str, list) -> bool
+    Return True  if the two list hava element that overlaps.
+
+    >>>list_overlap('RAC',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
+    True
+    >>>list_overlap('ACT',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
+    False
+
+    """
+    for i in list1:
+        if i in list2:
+            return True
+    return False
+
+
+# define our method
+#def replace_all(text, dic):
+    #"""(str, dict)-> (str)
+    #>>>replace_all()
+    #"""
+    #for i, j in dic.iteritems():
+        #text = text.replace(i, j)
+    #return text
+
+
+def access_mixed_aa(file_name):
+    """(str) ->(list,list,list,list).
+    Return a list of amino acide code for ambiguous dna codon, position of
+    ambiguous nt codon, aa name,seq id from fasta header  by reading multifasta
+    nucleotide fasta file
+    """
+    from Bio import SeqIO
+    aa = []
+    nucleotide_idx = []
+    nucl_codon = []
+    seqids = []
+    for seq_record in SeqIO.parse(file_name, 'fasta'):
+        seq_id = seq_record.id
+        seqline = str(seq_record.seq)
+        seqline = seqline.replace("-", "N")
+        n = 3
+        codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
+        ambi_nucl = AMBICODON.keys()
+        for key, codon in sorted(codon_list.items()):
+            if list_overlap(codon, ambi_nucl):
+                d, e, f = codon
+                m = [d, e, f]
+                items = [i for i in m if i in ambi_nucl]
+                indexm = m.index(items[0])
+                for idx, val in enumerate(items):
+                    codonlist = list(nearbyPermutations(codon))
+                    val = getaalist(codonlist)
+                    # remove if aa codon is the same eg. ['D', 'D']
+                    val = set(val)
+                    val = "/".join(sorted(val))   # yeild 'I/L'
+
+                    key = key - 2 + indexm
+                    if '/' in val:
+                        nucleotide_idx.append(key)
+                        nucl_codon.append(codon)
+                        seqids.append(seq_id)
+#                    if "/" in val and indexm == 2:
+#                        key = key
+#                        nucleotide_idx.append(key)
+#                        nucl_codon.append(codon)
+#                        seqids.append(seq_id)
+#                    elif "/" in val and indexm == 1:
+#                        key = key - 1
+#                        nucleotide_idx.append(key)
+#                        nucl_codon.append(codon)
+#                        seqids.append(seq_id)
+#                    elif "/" in val and indexm == 0:
+#                        key = key - 2
+#                        nucleotide_idx.append(key)
+#                        nucl_codon.append(codon)
+#                        seqids.append(seq_id)
+#                    else:
+#                        pass
+                    aa.append(val)
+
+            else:
+                # print "codon3 ..." ,codon
+                aa1 = Seq(codon, IUPAC.unambiguous_dna)
+                aa1 = aa1.translate()
+                aa1 = str(aa1)
+                aa.append(aa1)
+    #print aa, nucleotide_idx, nucl_codon, seqids
+    return aa, nucleotide_idx, nucl_codon, seqids
+
+
+def create_args():
+    """
+    Return command line arguments
+
+    """
+    parser = argparse.ArgumentParser(description='Convert inframe nucleotide \
+                                     fasta file to protein and report mixed \
+                                     (ambiguous codon) with its location in \
+                                     the sequence', epilog = 'ctleptop -i \
+                                     tests/Den4_MAAPS_TestData16.fasta -o \
+                                     out_file.txt')
+    parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
+    parser.add_argument("-o", type=str,  help="output file name", required=True)
+    parser.add_argument("--gb-file", type=str,  help="genbank file name")
+    parser.add_argument("--gb-id", type=str,  help="genabnk accession id")
+    parser.add_argument("--tab-file", type=str,  help="gene tab/csv file")
+    return parser.parse_args()
+
+
+def isGap(aalist, nclist):
+    """(list, list) -> (list)
+    Return an updated protien codon list if the gap found in the nc codon
+    >>>isGap(["K/R", "I/T"], ["ARG", "NNN"])
+    ["K/R", "GAPFOUND"]
+    """
+    uaalist = []
+    for indx, val in enumerate(nclist):
+        if "N" in val:
+            codon = "GAPFOUND"
+            uaalist.append(codon)
+        else:
+            codon = aalist[indx]
+            uaalist.append(codon)
+    return uaalist
+
+def open_f(filename):
+    return open(filename, 'w+')
+
+def main():
+    args = create_args()
+    file_name = args.i
+    outfile = args.o
+    #print("Start processing and writing the output file to", outfile, " please please wait ... ")
+    outf = open_f(outfile)
+    my_list = access_mixed_aa(file_name)
+    aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(file_name)
+    pattern = re.compile(r'.+\/.+')
+    amb_aa_codon = []
+    #amb_aa_indx = []
+    for indx, letter in enumerate(aa):
+        if pattern.match(letter):
+            amb_aa_codon.append(letter)
+            #amb_aa_indx.append(indx + 1)
+    amb_aa_codon=isGap(amb_aa_codon, nucl_codon)
+    amb_aa_indx = map(lambda x: x//3 + 1, nuc_idx)
+
+    if args.gb_id  or args.gb_file or args.tab_file:
+        reference_genes = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
+        overlapped_genes = degen.get_degen_list_overlap( reference_genes, nuc_idx)
+        my_list = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
+        outf.write(tabulate(my_list, headers=['seq id', 'nt Position', 'aa position',
+                                     'nt composition', 'aa composition', 'gene name']) + "\n")
+    else:
+        print("Warning, no gene information supplied.")
+        my_list = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon)
+        outf.write(tabulate(my_list, headers=['seq id', 'nt Position', 'aa position',
+                                     'nt composition', 'aa composition']) + "\n")
+    outf.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bio_pieces/degen.py b/bio_pieces/degen.py
@@ -83,16 +83,28 @@ def open_generic_csv(csvfile):
 
 csv_file_to_genes = compose(partial(map, row_to_gene), open_generic_csv, open)
 
+def get_degen_list_overlap(genes, _degen_positions):
+    '''
+    :param iterable genes: iterable of genes with attributes `start`, `end`, `name`
+    :param itrable _degen_positions: degenerate positions
+    :return generator of tuples of form: (gene name, position, nt)... where degens and genes overlap.
+    '''
+    genes, _degen_positions = list(genes), list(_degen_positions)
+    def get_intersect(pos):
+        intersects = lambda gene, pos=pos: gene if gene.start <= pos <= gene.end else None
+        matches = list(filter(bool, map(intersects, genes)))
+        return '-' if not matches else matches[0].name
+    return map(get_intersect, _degen_positions)
+
 def get_gene_degen_overlap_info(genes, seq):
     '''
     :param iterable genes: iterable of genes with attributes `start`, `end`, `name`
     :param str seq: nucleotide sequence
-    :return list of tuples of form: (gene name, position, nt)... where degens and genes overlap.
+    :return generator of tuples of form: (gene name, position, nt)... where degens and genes overlap.
     '''
     _degen_positions = degen_positions(str(seq))
     perms = product(genes, _degen_positions)
-    result = ((gene.name, pos, seq[pos]) for gene, pos in perms if  gene.start <= pos <= gene.end)
-    return result
+    return ((gene.name, pos, seq[pos]) for gene, pos in perms if  gene.start <= pos <= gene.end)
 
 def get_genes(ref_id=None, genbank_file=None, user_file=None):
     '''

diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,4 @@ schema
 pyvcf
 sh
 funcy
+tabulate
diff --git a/setup.py b/setup.py
@@ -21,6 +21,7 @@
             'beast_checkpoint = bio_pieces.beast_checkpoint:main',
             'beast_wrapper = bio_pieces.beast_wrapper:beast_wrapper',
             'beast_est_time = bio_pieces.beast_wrapper:beast_est_time',
+            'ctleptop = bio_pieces.ctleptop:main',
             'parallel_blast = bio_pieces.parallel_blast:main',
             'version = bio_pieces.version:main',
             'degen = bio_pieces.degen:main'
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,3 +7,4 @@ schema @@
     pyvcf
     sh
     funcy
+    tabulate