Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hypothesis #17

Merged
merged 6 commits into from
Feb 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions bioframework/index_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,21 @@ def filter_on_index_quality(input, output, minimum):
def filter_on_index_quality_interleaved(interleaved, index1, index2, output, minimum):
'''enpair the interleaved read file,zip that enpaired with the two indexes
drop pairs from the interleaved file if either *index* is below the minimum'''
return write_zip_results(partial(qual_filter, minimum), output, 'fastq', interleaved, index1, index2)

def qual_filter(minimum, interleaved, idx1, idx2):
above_min = lambda x: min(x.letter_annotations['phred_quality']) >= minimum
def indexes_above_min(seq_index):
seq, idx1, idx2 = seq_index
return above_min(idx1) and above_min(idx2)
def qual_filter(interleaved, idx1, idx2):
# pair together forward/reverse reads
# [SeqRecord(forward), SeqRecord(reverse)]
interleaved = partition(2, interleaved)
# Zip together indexes with pairs
# [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)]
zipped = izip(interleaved, idx1, idx2)
# Filter out all indexes with less than min qual and then grab
# only the interleaved sequence tuple
filtered = imap(first, ifilter(indexes_above_min, zipped))
# Chain together all sequences
return chain.from_iterable(filtered)
return write_zip_results(qual_filter, output, 'fastq', interleaved, index1, index2)
# pair together forward/reverse reads
# [SeqRecord(forward), SeqRecord(reverse)]
interleaved = partition(2, interleaved)
# Zip together indexes with pairs
# [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)]
zipped = izip(interleaved, idx1, idx2)
# Filter out all indexes with less than min qual and then grab
# only the interleaved sequence tuple
filtered = imap(first, ifilter(indexes_above_min, zipped))
# Chain together all sequences
return chain.from_iterable(filtered)
2 changes: 1 addition & 1 deletion jip_modules/interleaved_index_filter.jip
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@

#%begin command python
from bioframework.index_filter import filter_on_index_quality_interleaved
filter_on_index_quality_interleaved("${input}", "${index1}", "${index2}", "${output}", "${min_qual}")
filter_on_index_quality_interleaved("${input|else('/dev/stdin')}", "${index1}", "${index2}", "${output|else('/dev/stdout')}", "${min_qual}")
#%end

2 changes: 2 additions & 0 deletions tests/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ nose
mock
python-coveralls
testfixtures
sh
hypothesis
123 changes: 123 additions & 0 deletions tests/test_filter_hypothesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#TODO: make io test use commandline jip
#NOTE: this duplicates a lot of the tested code's logic (see IO, above_min, imports) and API dependencies (see imports)
from Bio.SeqRecord import SeqRecord
import os
import sh
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from functools import partial
import re
from bioframework.seqio import interleave
import unittest
from itertools import takewhile
import operator
from toolz.itertoolz import partition
import hypothesis
from hypothesis import strategies as st
from hypothesis import find, note, given, assume, example
from bioframework.index_filter import qual_filter, filter_on_index_quality_interleaved

above_min = lambda minimum: lambda x: min(x.letter_annotations['phred_quality']) >= minimum
make_seqrec = lambda id, seq, quals: \
SeqRecord(Seq(seq, IUPAC.ambiguous_dna), id=str(id), description='', letter_annotations={'phred_quality':quals})

rec = st.integers(min_value=1, max_value=10).flatmap(
lambda n:
st.builds(
make_seqrec,
st.integers(),
st.text(alphabet='ATGCN', min_size=n, max_size=n),
st.lists(st.integers(min_value=35, max_value=40), min_size=n, max_size=n)
)
)

reads_and_indices = st.integers(min_value=1,max_value=5).flatmap(
lambda n:
st.tuples(
st.integers(min_value=0, max_value=40),
st.lists(rec, min_size=n, max_size=n).map(lambda x: list(interleave(x, x))),
st.lists(rec, min_size=n, max_size=n),
st.lists(rec, min_size=n, max_size=n),
))

def ilen(seq): return sum(1 for _ in seq)

class TestIndexQualityFilter(unittest.TestCase):

@given(reads_and_indices)
def test_always_less_or_equal_size(self, seqs):
self.assertLessEqual(ilen(qual_filter(*seqs)), ilen(seqs[1]))

@given(reads_and_indices)
def test_result_is_even(self, seqs):
self.assertTrue(ilen(qual_filter(*seqs)) % 2 == 0)

@given(reads_and_indices)
def test_result_is_interleaved(self, seqs):
result = qual_filter(*seqs)
pairs = partition(2, result)
matches = map(lambda (x,y): x.id == y.id, pairs)
self.assertTrue(all(matches))

@given(reads_and_indices)
def test_drops_all_if_all_lower(self, seqs):
(_min, reads, i1, i2) = seqs
assume(not any(map(above_min(_min), i1)))
result = qual_filter(*seqs)
self.assertEqual([], list(result))

@given(reads_and_indices)
def test_drops_none_if_all_above(self, seqs):
(_min, reads, i1, i2) = seqs
assume(all(map(above_min(_min), i1)))
assume(all(map(above_min(_min), i2)))
result = qual_filter(*seqs)
self.assertEqual(reads, list(result))

@given(reads_and_indices)
def test_drops_some_if_any_lower(self, seqs):
(_min, reads, i1, i2) = seqs
assume(not all(map(above_min(_min), i1 + i2)))
result = qual_filter(*seqs)
self.assertLess(ilen(result), ilen(reads))
def run_io(self, seqs):
names = ['r', 'i1', 'i2']
outfile = 'foo'
map(partial(SeqIO.write, format='fastq'), seqs[1:], names)
cmd = sh.Command('jip_modules/interleaved_index_filter.jip')
cmd(i=names[0], index1=names[1], index2=names[2], minimum=seqs[1], o=outfile) # could use stdout
result = SeqIO.parse(outfile, 'fastq')
return result
#
@given(reads_and_indices)
@hypothesis.settings(max_examples=20)
def test_drops_some_if_any_lower_with_IO(self, seqs):
(_min, reads, i1, i2) = seqs
assume(not all(map(above_min(_min), i1 + i2)))
result = self.run_io(seqs)
self.assertLess(ilen(result), ilen(reads))

@given(reads_and_indices)
def test_drops_correct_seq(self, seqs):
(_min, reads, i1, i2) = seqs
assume(not all(map(above_min(_min), i1)))
first_failing_index = ilen(takewhile(above_min(_min), i1))
failing_reads = reads[first_failing_index*2:first_failing_index*2+2]
result = list(qual_filter(*seqs))
self.assertFalse(failing_reads[0] in result or failing_reads[1] in result)

@given(reads_and_indices)
@hypothesis.settings(max_examples=20)
def test_drops_correct_seq_IO(self, seqs):
(_min, reads, i1, i2) = seqs
assume(not all(map(above_min(_min), i1)))
first_failing_index = ilen(takewhile(above_min(_min), i1))
failing_reads = reads[first_failing_index*2:first_failing_index*2+2]
result = self.run_io(seqs)
self.assertFalse(failing_reads[0] in result or failing_reads[1] in result)
if __name__ == '__main__':
unittest.main()
# index files should be unchanged
# the number of reads left should be original - number of bad reads in each index which don't overlap
# can use find(strategy, boolean)