Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bam processing (WIP) #19

Open
wants to merge 45 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
6c66ad1
added tagreads (tags all reads, doesn't skip any)
averagehat Feb 10, 2016
ed0ac2b
added tagreads
averagehat Feb 11, 2016
4aa14ca
tag_bam now sorts and indexes afterwards
averagehat Feb 11, 2016
4d46b24
added freebayes.jip
averagehat Feb 11, 2016
4c99339
tagreads now writes to separate file or stdout
averagehat Feb 11, 2016
6244e29
foo
averagehat Feb 11, 2016
eb79428
fixing freebayes
averagehat Feb 11, 2016
44fbf74
added bioframes dependency from git url
averagehat Feb 11, 2016
2d7c592
added jip consensus
averagehat Feb 11, 2016
0a7976b
Merge branch 'dev' of https://github.com/VDBWRAIR/bioframework into b…
averagehat Feb 11, 2016
e21e14d
added new implementation of consensus
averagehat Feb 12, 2016
44be3d0
update
averagehat Feb 22, 2016
549a244
consensus now handles multiple alts
averagehat Feb 23, 2016
aefe452
various fixes
Feb 24, 2016
e36a4c1
refactoring to simplify and remove deprecated funcs
Feb 24, 2016
46f2df9
added tests,refactored vcf dict flattening
Feb 27, 2016
1894e0a
add missing dependencies
Mar 2, 2016
4633cc6
fixed ordering bug
Mar 2, 2016
05b04e8
fixed groupby bug
Mar 2, 2016
8eb0c38
fixed make_consensus
Mar 2, 2016
8e3062c
more tests
Mar 2, 2016
33d7db7
add failing test
Mar 2, 2016
d3bfe63
fixed git+ protocol of biotest dep.
averagehat Mar 3, 2016
2faecbf
fixed contracts to actual pypy name
averagehat Mar 3, 2016
613e825
10/10 working
averagehat Mar 3, 2016
0de7577
baic types and examples
averagehat Mar 4, 2016
bf1b67b
typechecking works.
averagehat Mar 4, 2016
c2aecb8
added readme for types
averagehat Mar 4, 2016
c09b94b
fixed tests, now passing
averagehat Mar 4, 2016
6a0bd96
removing old types-hypothesis generator file
averagehat Mar 4, 2016
6cbd2e1
added missing typing requirement
averagehat Mar 4, 2016
4427de8
missing deps.
averagehat Mar 4, 2016
50cf3c9
add typing module dep.
Mar 4, 2016
59e18a6
Merge branch 'bam-processing' of https://github.com/VDBWRAIR/bioframe…
Mar 4, 2016
23afece
fix failing test
Mar 4, 2016
bb7f937
add support for trimming reference
Mar 4, 2016
272a115
fix readme formatting
averagehat Mar 4, 2016
875e1aa
Merge pull request #21 from VDBWRAIR/types
averagehat Mar 4, 2016
a08ab6c
Merge branch 'bam-processing' of https://github.com/VDBWRAIR/bioframe…
Mar 22, 2016
bbd8af8
fix VCFRow instantiation
Mar 22, 2016
c556a76
more info in consensus's stuff
Mar 23, 2016
e770b20
add doc for low-depth handling
Mar 23, 2016
c77a809
restore lowdepth->N so tests can maybe pass
Mar 23, 2016
101f5aa
fixed duplcate args error
Mar 23, 2016
9795897
ignroe
Apr 17, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ install:
- conda config --add channels bioconda
- conda install --file requirements-conda.txt
- pip install -r tests/requirements-dev.txt
- pip install -r requirements.txt
- if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi
- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install robotframework-python3; else pip install robotframework; fi
- python setup.py install
Expand Down
287 changes: 287 additions & 0 deletions bioframework/consensus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
"""
Usage:
consensus --ref <ref> --vcf <vcf> [--mind <mind>] [--majority <majority>] [-o <output>]

Options:
--ref=<ref> Reference fasta file
--vcf=<vcf> VCF output
--majority=<majority> Percentage required [default: 80]
--mind=<mind> minimum depth to call base non-N [default: 10]
--output,-o=<output> output file [default: ]
"""
#stdlib
#TO type-check: (requires python3)
# $ MYPYPATH=$HOME/bioframework/mypy mypy --py2 --disallow-untyped-calls bioframework/consensus.py

from operator import itemgetter as get
from functools import partial
from itertools import ifilter, ifilterfalse, imap, groupby, takewhile, repeat, starmap, izip_longest
import os, sys
import collections

from typing import Tuple, Dict, List, Iterator, Iterable, Any, Callable, NamedTuple, BinaryIO

from Bio import SeqIO #done
from Bio.SeqRecord import SeqRecord #done
import vcf #done
from vcf.model import _Record
import sh #todo
#from toolz import compose
from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done
#from docopt import docopt #ignore
#from schema import Schema, Use #ignore
#from contracts import contract, new_contract #can ignore
#from mypy.types import VCFRow
#############
# Constants #
#############
VCFRow = NamedTuple("VCFRow",
[('ref', str),
('AO', List[int]),
('DP', int),
('QA', List[int]),
('QR', int),
('chrom',str),
('pos', int),
('alt', List[str])])
AMBIGUITY_TABLE = { 'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'N': 'N', 'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D', 'CGT': 'B', 'ACGT': 'N' }

MAJORITY_PERCENTAGE = 80
MIN_DEPTH = 10
PMut = NamedTuple('PMut', [('ALT', str), ('AO', int), ('DP', int), ('QA', int), ('QR', int)])
Mut = Tuple[str, str, int]
###########
# Reducer #
###########
#@contract(reference='string', muts='list(tuple(string, string, int))' )
def make_consensus(reference, muts):
# type: (str, List[Mut]) -> Tuple[str, List[Mut]]
''' Actually builds a consensus string by recursively applying
the mutations.'''
def _do_build(t1, t2): # type: (Tuple[str,str,int], Tuple[str,str,int]) -> Tuple[str,str,int]
(accString, string, lastPos), (x, y, bigPos) = t1, t2
pos = bigPos - lastPos
return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x))
result, remaining, _ = reduce(_do_build, muts, ('', reference, 0))
return result + remaining, muts


##############
# Mappers #
##############

#TODO: Failing Case:
# a = {'ref': u'AT', 'pos': 2, 'AO': (51771, 41537, 42398, 9342), 'alt': [u'A',
# u'TT', u'AATTG', u'AAGAA'], 'chrom': u'o', 'DP': 87288}
# bioframework.consensus.call_many(10, 80, a)

#@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100',dp='number,>=0', ref='string|None',alts='dict(string: number)')
def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref):
# type: (int, int, int, List[PMut], str) -> str
"""when there are multiple alts, zip through with each of them
zip(*alts), character by character. compare the percentages, and
sum the percentages for each base. (groupby, sum) pick each character
(call each base) based on the given rules (using call_base)."""
#TODO: majority_percentage gets ignored, so replace constants
#TODO: behavior is undefined if sum(AO) > dp.
if dp < min_depth: #could call REF here sometimes
return 'N'

# """
# 1) "if the total quality of the alternates is < 200 (25 * 8), don't call it an alternate."
# 2) "if the total quality of the references is < 200, don't call the reference."
# 3) "if neither the reference nor the alternate can be called, call an N."
# """
# altsTooLow = sum(map(lambda x: x.QA)) < 200
# refTooLow = alts[0].QR < 200
#
# if altsTooLow and refTooLow:
# return 'N'
#
# elif altsTooLow:
# return ref

get_ao = lambda x: x.AO
total_ao = sum(map(get_ao, alts))

if ref is None: # this is an insert
if total_ao/float(dp) < .50:
return ref
# if the insert is above threshold, keep going and call the insert like a normal base
is_insert = lambda x: x.ALT == '-' # type: Callable[[PMut],bool]
inserts = list(filter(is_insert, alts))
if inserts:
assert len(inserts) == 1
ins = inserts[0]
if ins.AO/float(dp) > .50: # a deletion
return ''
dp -= ins.AO
alts_without_insert = list(ifilterfalse(is_insert, alts))
else:
alts_without_insert = alts
over_depth = lambda x: lambda y: y.AO/float(dp) > x
picked_alt = filter(over_depth(0.8), alts_without_insert)
if picked_alt:
return picked_alt[0].ALT
#add ref so that it will be considered in creating ambiguous base
if ref:
alts.append(PMut(ALT=ref, AO=(dp-total_ao), DP=dp, QA=-1, QR=-1))
over20 = filter(over_depth(0.2), alts)
as_ambiguous = ''.join(sorted(map(lambda x: x.ALT, over20)))
# this could return a single base, (including the reference), becuase i.e. A => A in the ambiguity table
return AMBIGUITY_TABLE[as_ambiguous] if as_ambiguous != '' else ''

def pmuts_from_row(row):
# type: (VCFRow) -> List[PMut]
muts = zip(row.QA, row.AO, row.alt)
longest_len = max(map(lambda x: len(x[-1]), muts))
longest_len = max(longest_len, len(row.ref))
def fill_gap(r):
qa, ao, s = r
return (qa, ao, str(s) + (longest_len - len(s)) * '-')
def merge_sum(x,y):
def combine(tups):
return sum(map(get(0), tups)), sum(map(get(1), tups))
#return x if y is None else (y if x is None else merge_with(sum, x, y))
return x if y is None else (y if x is None else merge_with(combine, x, y))
def seq_count(acc, ao_and_nts):
qa, ao, nts = ao_and_nts
#return map(merge_sum, acc, [{nt:ao, 'qa': qa} for nt in nts])
return map(merge_sum, acc, [{nt:(ao, qa)} for nt in nts])
xs = map(fill_gap, muts) # fill in the shorter alts with '-'.
# create a list of {base : count}, where the index matches the position
mut_dicts = reduce(seq_count, xs, [{}]) # type: Iterable[Dict[str,int]]
def to_pmuts(d): # type: (Dict[str,int]) -> List[PMut]
#qa = d.pop('qa')
#assert len(d.values()) == 1
make = lambda k,v: PMut(k, AO=v[0], QA=v[1], DP=row.DP,QR=row.QR)
return list(starmap(make, d.items()))
# res = PMut(d.keys()[0], d.values()[0], row.DP, qa, row.QR)
# return res
res = list(map(to_pmuts, mut_dicts))
return res

def call_many(min_depth, majority_percentage, rec):
# type: (int, int, VCFRow) -> Mut
muts_by_column = pmuts_from_row(rec)
#TODO: switch to generators
base_caller = lambda m,r: call_base_multi_alts(
min_depth, majority_percentage, rec.DP, m, r) # # # ?Callable[[Dict[Any,Any], str], str]
res = map(base_caller, muts_by_column, rec.ref)
# trim None values at the end, (which indicate deletion)
result = takewhile(bool, res)
return (rec.ref, ''.join(result), rec.pos)

#@contract(rec='dict',returns='dict')
def flatten_vcf_record(rec):
# type: (_Record) -> VCFRow
fields = ['ref', 'AO', 'DP', 'QA', 'QR', 'chrom' 'pos', 'alt']
_rec = merge({
'alt' : rec.ALT, 'ref' : rec.REF,
'pos' : rec.POS, 'chrom' : rec.CHROM},
rec.INFO)
if not hasattr(_rec['alt'], '__iter__'): #TODO: put this somewhere else
d = merge(_rec, dict(alt=[_rec['alt']],
AO=[_rec['AO']],
QA=[_rec['QA']]))
else: d = _rec
d = keyfilter(fields.__contains__, d)
return VCFRow(**d)

##############
# Group By #
##############
#NOTE: could possibly drop lists, use fn.Stream all the time,
# and write a Stream instance for contracts like:
# https://github.com/AndreaCensi/contracts/blob/831ec7a5260ceb8960540ba0cb6cc26370cf2d82/src/contracts/library/lists.py
#@contract(references='list[N]($SeqRecord),N>0', muts='list(dict)',returns='tuple(list(dict))')
def group_muts_by_refs(references, muts):
# type: (List[SeqRecord], List[VCFRow]) -> List[List[VCFRow]]
'''group and sort the mutations so that they match the order of the references.'''
#NOTE: muts will already be "sorted" in that they are grouped together in the vcf
#fix the groupby so it doesn't incidentally drain the first object of the group
unzip = lambda x: zip(*x)
chroms, groups = unzip(map(lambda kv: (kv[0], list(kv[1])), groupby(muts, lambda x: x.chrom)))
#@contract(key='tuple(string,list)')
def index_of_ref(key): # type: (Tuple[str, List[SeqRecord]]) -> int
chrom=key[0]
index_of_chrom = map(lambda x: x.id, references).index(chrom)
return index_of_chrom
_, muts_by_ref = unzip(sorted(zip(chroms, groups), key=index_of_ref))
return muts_by_ref



###############
# Runner #
###############

#@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int)
def all_consensuses(references, muts, mind, majority):
# type: (List[SeqRecord], List[VCFRow], int, int) -> Tuple[List[SeqRecord], Iterable[Tuple[str, List[Mut]]]]
''' generates conesnsuses, including for flu and other mult-reference VCFs.
applies filters and base callers to the mutations.
then builds the consensus using these calls and `make_consensus`'''
muts_by_ref = group_muts_by_refs(references, muts)
def single_consensus(muts, ref):
# type: (List[VCFRow], SeqRecord) -> Tuple[str, List[Mut]]
#the_muts = map(partial(call_many, mind, majority), muts)
the_muts = map(lambda x: call_many(mind, majority, x), muts)
ref_and_alt_differ = lambda x: x[0] != x[1]
# vcf is index-starting-at-1
#real_muts = map(lambda (a,b,pos): (a,b,pos-1), filter(ref_and_alt_differ, the_muts))
real_muts = map(lambda x: (x[0], x[1], x[2] - 1), filter(ref_and_alt_differ, the_muts))
return make_consensus(str(ref.seq), real_muts)
return references, imap(single_consensus, muts_by_ref, references)


##########
# I/O #
##########
def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str
return ">{0}:Consensus\n{1}".format(ref.id, consensus)

def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int]
pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True)
get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int]
return imap(get_pos, pileup)

#TODO: is pileup 0-based or 1-based index?
def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str
start, end = next(positions), collections.deque(positions, 1)[0]
return '-'*start + ref[:start:end] + '-'*(len(ref) - end)



#@contract(ref_fasta=str, vcf=str, mind=int, majority=int)
def run(ref_fasta, freebayes_vcf, outfile, mind, majority):
# type: (str, str, BinaryIO, int, int) -> int
_refs = SeqIO.parse(ref_fasta, 'fasta')
with open(freebayes_vcf, 'r') as vcf_handle:
_muts = map(flatten_vcf_record, vcf.Reader(vcf_handle))
refs, muts = list(_refs), list(_muts)
the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority)
strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts))
result = '\n'.join(strings)
outfile.write(result)
outfile.close()
return 0





#def main(): # type: () -> None
# scheme = Schema(
# { '--vcf' : os.path.isfile,
# '--ref' : os.path.isfile,
# '--majority' : Use(int),
# '--mind' : Use(int),
# '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))})
# raw_args = docopt(__doc__, version='Version 1.0')
# args = scheme.validate(raw_args)
# run(args['--ref'], args['--vcf'], args['--output'],
# args['--mind'], args['--output'])

#if __name__ == '__main__':
# main()
Loading