Skip to content

Commit

Permalink
Merge pull request #20 from VDBWRAIR/dev
Browse files Browse the repository at this point in the history
update parse_contigs
  • Loading branch information
averagehat committed Apr 30, 2015
2 parents ad30391 + 375da8c commit 7e6af18
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 16 deletions.
22 changes: 15 additions & 7 deletions bio_pieces/parse_contigs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
'''
Usage: group_references <samfile>
Usage: group_references <samfile> --outdir <DIR>
Options:
--outdir=<DIR>,-o=<DIR> outupt directory [Default: parse_contigs_out]
Create separate fastq files for each reference in a samfile.
'''
Expand All @@ -8,6 +11,7 @@
import pandas as pd
from schema import Schema, Use
import sys
import os
if sys.version[0] == '3':
from io import StringIO as BytesIO
else:
Expand All @@ -24,24 +28,28 @@ def samview_to_df(rawtext):
def fixline(row):
return '\t'.join(row.split('\t')[:len(sam_columns)])

def get_seqs_by_ctg(rawtext):

def get_seqs_by_ctg(outdir, rawtext):
sam_df = samview_to_df(rawtext)
contig_groups = sam_df.groupby('RNAME')
fastq = "@{0}\n{1}\n+\n{2}".format
for group in contig_groups:
ref, reads = group[0], group[1]
with open("{0}.group.fq".format(ref), 'w') as out:
with open("{0}/{1}.group.fq".format(outdir, ref), 'w') as out:
map(out.writelines, '\n'.join(map(fastq, reads.QNAME, reads.SEQ, reads.QUAL)))
out.write('\n')


def main():
raw_args = docopt(__doc__)
scheme = Schema({'<samfile>' : Use(open, error='Samfile must be readable')})
scheme = Schema({
'<samfile>' : Use(open, error='Samfile must be readable'),
'--outdir' : str})
parsed_args = scheme.validate(raw_args)
get_seqs_by_ctg(parsed_args['<samfile>'].read())
outdir = parsed_args['--outdir']
if not os.path.exists(outdir):
os.mkdir(outdir)
get_seqs_by_ctg(outdir, parsed_args['<samfile>'].read())
return 0

if __name__ == '__main__':
main()

10 changes: 7 additions & 3 deletions bio_pieces/vcfcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ def match_records(left, right, tag, threshold):
:return bool: False if they are different according to threshold
'''
l, r = flatten_vcf(left), flatten_vcf(right)
assert tag in l and tag in r, "Tag not found in flattened record {0}".format(str(flatten_vcf))
#assert tag in l and tag in r, "Tag not found in flattened record {0}".format(str(flatten_vcf))
if operator.xor( (tag in l), (tag in r)):
return False
if tag not in l and tag not in r:
return True
if not threshold:
return l[tag] == r[tag]
else:
Expand Down Expand Up @@ -82,7 +86,7 @@ def not_members(vcf_list, tag, collection):
:param list collection: a list of objects
:return list: all records where record[tag] or record.INFO[tag] is not in collection
'''
return [rec for rec in vcf_list if flatten_vcf(rec)[tag] not in collection]
return [rec for rec in vcf_list if (tag not in flatten_vcf(rec)) or flatten_vcf(rec)[tag] not in collection]

ambiguous = partial(not_members, tag='CB', collection=['A', 'C', 'G', 'T'])
exists = partial(not_members, collection= [None, [None], '-'])
Expand All @@ -103,7 +107,7 @@ def filter(vcf_list, tag, opfunc, value):
if type(opfunc) is str:
opfunc = getattr(operator, opfunc)
check_value = partial(compare_value, opfunc, value)
return [rec for rec in vcf_list if check_value(rec, tag)]
return [rec for rec in vcf_list if (tag in flatten_vcf(rec)) and check_value(rec, tag)]

def flatten_list(A):
return A[0] if type(A) == list else A
Expand Down
2 changes: 2 additions & 0 deletions bio_pieces/vcfcat_main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
'''
Command-line utility for querying VCF files. By default, outputs a full vcf file matching the query.
Usage:
vcfcat filter <FILE1> ( --tag=<TAG> (--ge | --le | --gt | --lt | --eq | --neq) <VALUE> ) [-c]
vcfcat exists <FILE1> (--tag=<TAG> ) [-c]
Expand Down
12 changes: 6 additions & 6 deletions tests/test_parse_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,20 @@ def test_sam_to_df_with_extra_fields(self):
self.assertEquals(result.ix[2]['QUAL'], 'FF@@@F@F')

def test_main(self): #, margs):
olddir = abspath(os.getcwd())
os.chdir(THISD)
sys.argv = ['group_refs', 'out.samtext']
# olddir = abspath(os.getcwd())
# os.chdir(THISD)
sys.argv = ['group_refs', 'tests/out.samtext', '--outdir', 'tests/testoutput/pc']
with open('out.samtext', 'w') as out:
out.write(self.samtext)
#with mock.patch('__builtin__.open', mock.mock_open(read_data=self.samtext), create=True) as m:
rcode = pc.main()
os.chdir(olddir)
# os.chdir(olddir)
self.assertEquals(0, rcode)
expected_group1 = [self.seqrecs[0], self.seqrecs[2]]
actual_group1 = SeqIO.parse(join(THISD, 'chr1.group.fq'), format='fastq')
actual_group1 = SeqIO.parse(join(THISD, 'testoutput/pc/chr1.group.fq'), format='fastq')
for le, ri in zip(expected_group1, actual_group1):
self.assertEquals(*map(lambda r: (str(r.seq), r.letter_annotations, r.id), [le, ri]))
actual_group2 = SeqIO.parse(join(THISD, 'chr2.group.fq'), format='fastq')
actual_group2 = SeqIO.parse(join(THISD, 'testoutput/pc/chr2.group.fq'), format='fastq')
expected_group2 = [self.seqrecs[1]]
for le, ri in zip(expected_group2, actual_group2):
self.assertEquals(*map(lambda r: (str(r.seq), r.letter_annotations, r.id) , [le, ri]))
Expand Down
8 changes: 8 additions & 0 deletions tests/testoutput/pc/chr1.group.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@read1
TTTCGAATC
+
FFFFFFFFF
@read3
CCGATCAA
+
FF@@@F@F
4 changes: 4 additions & 0 deletions tests/testoutput/pc/chr2.group.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@read2
CTTCGATC
+
AFFDDDDD

0 comments on commit 7e6af18

Please sign in to comment.