Merge pull request #20 from VDBWRAIR/dev

update parse_contigs
VDBWRAIR · Apr 30, 2015 · 7e6af18 · 7e6af18
2 parents ad30391 + 375da8c
commit 7e6af18
Show file tree

Hide file tree

Showing 6 changed files with 42 additions and 16 deletions.
diff --git a/bio_pieces/parse_contigs.py b/bio_pieces/parse_contigs.py
@@ -1,5 +1,8 @@
 '''
-Usage: group_references <samfile>
+Usage: group_references <samfile> --outdir <DIR>
+
+Options:
+    --outdir=<DIR>,-o=<DIR>   outupt directory [Default: parse_contigs_out]
 
 Create separate fastq files for each reference in a samfile.
 '''
@@ -8,6 +11,7 @@
 import pandas as pd
 from schema import Schema, Use
 import sys
+import os
 if sys.version[0] == '3':
     from io import StringIO as BytesIO
 else:
@@ -24,24 +28,28 @@ def samview_to_df(rawtext):
 def fixline(row):
     return '\t'.join(row.split('\t')[:len(sam_columns)])
 
-def get_seqs_by_ctg(rawtext):
-
+def get_seqs_by_ctg(outdir, rawtext):
     sam_df = samview_to_df(rawtext)
     contig_groups = sam_df.groupby('RNAME')
     fastq = "@{0}\n{1}\n+\n{2}".format
     for group in contig_groups:
         ref, reads = group[0], group[1]
-        with open("{0}.group.fq".format(ref), 'w') as out:
+        with open("{0}/{1}.group.fq".format(outdir, ref), 'w') as out:
             map(out.writelines, '\n'.join(map(fastq, reads.QNAME, reads.SEQ, reads.QUAL)))
             out.write('\n')
 
+
 def main():
     raw_args = docopt(__doc__)
-    scheme = Schema({'<samfile>' : Use(open, error='Samfile must be readable')})
+    scheme = Schema({
+        '<samfile>' : Use(open, error='Samfile must be readable'),
+        '--outdir' : str})
     parsed_args = scheme.validate(raw_args)
-    get_seqs_by_ctg(parsed_args['<samfile>'].read())
+    outdir = parsed_args['--outdir']
+    if not os.path.exists(outdir):
+        os.mkdir(outdir)
+    get_seqs_by_ctg(outdir, parsed_args['<samfile>'].read())
     return 0
 
 if __name__ == '__main__':
     main()
-
diff --git a/bio_pieces/vcfcat.py b/bio_pieces/vcfcat.py
@@ -35,7 +35,11 @@ def match_records(left, right, tag, threshold):
     :return bool: False if they are different according to threshold
     '''
     l, r = flatten_vcf(left), flatten_vcf(right)
-    assert tag in l and tag in r, "Tag not found in flattened record {0}".format(str(flatten_vcf))
+    #assert tag in l and tag in r, "Tag not found in flattened record {0}".format(str(flatten_vcf))
+    if operator.xor( (tag in l), (tag in r)):
+        return False
+    if tag not in l and tag not in r:
+        return True
     if not threshold:
         return l[tag] == r[tag]
     else:
@@ -82,7 +86,7 @@ def not_members(vcf_list, tag, collection):
     :param list collection: a list of objects
     :return list: all records where record[tag] or record.INFO[tag] is  not in collection
     '''
-    return [rec for rec in vcf_list if flatten_vcf(rec)[tag] not in collection]
+    return [rec for rec in vcf_list if (tag not in flatten_vcf(rec)) or flatten_vcf(rec)[tag] not in collection]
 
 ambiguous = partial(not_members, tag='CB', collection=['A', 'C', 'G', 'T'])
 exists = partial(not_members, collection= [None, [None], '-'])
@@ -103,7 +107,7 @@ def filter(vcf_list, tag, opfunc, value):
     if type(opfunc) is str:
         opfunc = getattr(operator, opfunc)
     check_value = partial(compare_value, opfunc, value)
-    return [rec for rec in vcf_list if check_value(rec, tag)]
+    return [rec for rec in vcf_list if (tag in flatten_vcf(rec)) and check_value(rec, tag)]
 
 def flatten_list(A):
   return A[0] if type(A) == list else A

diff --git a/bio_pieces/vcfcat_main.py b/bio_pieces/vcfcat_main.py
@@ -1,4 +1,6 @@
 '''
+Command-line utility for querying VCF files. By default, outputs a full vcf file matching the query.
+
 Usage:
     vcfcat filter <FILE1> ( --tag=<TAG> (--ge | --le | --gt | --lt | --eq | --neq) <VALUE> ) [-c]
     vcfcat exists <FILE1> (--tag=<TAG> ) [-c]

diff --git a/tests/test_parse_contigs.py b/tests/test_parse_contigs.py
@@ -51,20 +51,20 @@ def test_sam_to_df_with_extra_fields(self):
         self.assertEquals(result.ix[2]['QUAL'],   'FF@@@F@F')
 
     def test_main(self): #, margs):
-        olddir = abspath(os.getcwd())
-        os.chdir(THISD)
-        sys.argv = ['group_refs', 'out.samtext']
+#        olddir = abspath(os.getcwd())
+#        os.chdir(THISD)
+        sys.argv = ['group_refs', 'tests/out.samtext', '--outdir', 'tests/testoutput/pc']
         with open('out.samtext', 'w') as out:
             out.write(self.samtext)
         #with mock.patch('__builtin__.open', mock.mock_open(read_data=self.samtext), create=True) as m:
         rcode = pc.main()
-        os.chdir(olddir)
+#        os.chdir(olddir)
         self.assertEquals(0, rcode)
         expected_group1 = [self.seqrecs[0], self.seqrecs[2]]
-        actual_group1 = SeqIO.parse(join(THISD, 'chr1.group.fq'), format='fastq')
+        actual_group1 = SeqIO.parse(join(THISD, 'testoutput/pc/chr1.group.fq'), format='fastq')
         for le, ri in zip(expected_group1, actual_group1):
             self.assertEquals(*map(lambda r: (str(r.seq), r.letter_annotations, r.id), [le, ri]))
-        actual_group2 = SeqIO.parse(join(THISD, 'chr2.group.fq'), format='fastq')
+        actual_group2 = SeqIO.parse(join(THISD, 'testoutput/pc/chr2.group.fq'), format='fastq')
         expected_group2 = [self.seqrecs[1]]
         for le, ri in zip(expected_group2, actual_group2):
             self.assertEquals(*map(lambda r: (str(r.seq), r.letter_annotations, r.id) , [le, ri]))

diff --git a/tests/testoutput/pc/chr1.group.fq b/tests/testoutput/pc/chr1.group.fq
@@ -0,0 +1,8 @@
+@read1
+TTTCGAATC
++
+FFFFFFFFF
+@read3
+CCGATCAA
++
+FF@@@F@F
diff --git a/tests/testoutput/pc/chr2.group.fq b/tests/testoutput/pc/chr2.group.fq
@@ -0,0 +1,4 @@
+@read2
+CTTCGATC
++
+AFFDDDDD