From 3249d9182e8538a0a61465a8b4e00b72e2b18bf1 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Tue, 9 Feb 2016 16:35:13 -0500 Subject: [PATCH 1/6] refactor index_filter --- bioframework/index_filter.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bioframework/index_filter.py b/bioframework/index_filter.py index 8aaaa15..dd72754 100644 --- a/bioframework/index_filter.py +++ b/bioframework/index_filter.py @@ -30,20 +30,21 @@ def filter_on_index_quality(input, output, minimum): def filter_on_index_quality_interleaved(interleaved, index1, index2, output, minimum): '''enpair the interleaved read file,zip that enpaired with the two indexes drop pairs from the interleaved file if either *index* is below the minimum''' + return write_zip_results(partial(qual_filter, minimum), output, 'fastq', interleaved, index1, index2) + +def qual_filter(minimum, interleaved, idx1, idx2): above_min = lambda x: min(x.letter_annotations['phred_quality']) >= minimum def indexes_above_min(seq_index): seq, idx1, idx2 = seq_index return above_min(idx1) and above_min(idx2) - def qual_filter(interleaved, idx1, idx2): - # pair together forward/reverse reads - # [SeqRecord(forward), SeqRecord(reverse)] - interleaved = partition(2, interleaved) - # Zip together indexes with pairs - # [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)] - zipped = izip(interleaved, idx1, idx2) - # Filter out all indexes with less than min qual and then grab - # only the interleaved sequence tuple - filtered = imap(first, ifilter(indexes_above_min, zipped)) - # Chain together all sequences - return chain.from_iterable(filtered) - return write_zip_results(qual_filter, output, 'fastq', interleaved, index1, index2) + # pair together forward/reverse reads + # [SeqRecord(forward), SeqRecord(reverse)] + interleaved = partition(2, interleaved) + # Zip together indexes with pairs + # [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)] + zipped = izip(interleaved, idx1, idx2) + # Filter out all indexes with less than min qual and then grab + # only the interleaved sequence tuple + filtered = imap(first, ifilter(indexes_above_min, zipped)) + # Chain together all sequences + return chain.from_iterable(filtered) From 60152ac528438dab622fb85cdc50054599769d69 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Tue, 9 Feb 2016 18:46:35 -0500 Subject: [PATCH 2/6] hypothesis tests --- tests/test_filter_hypothesis.py | 107 ++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/test_filter_hypothesis.py diff --git a/tests/test_filter_hypothesis.py b/tests/test_filter_hypothesis.py new file mode 100644 index 0000000..1493762 --- /dev/null +++ b/tests/test_filter_hypothesis.py @@ -0,0 +1,107 @@ +#TODO: make io test use commandline jip +#NOTE: this duplicates a lot of the tested code's logic (see IO, above_min, imports) and API dependencies (see imports) +from Bio.SeqRecord import SeqRecord +import sh +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC +from Bio import SeqIO +from functools import partial +import re +from bioframework.seqio import interleave +import unittest +import operator +from toolz.itertoolz import partition +import hypothesis +from hypothesis import strategies as st +from hypothesis import find, note, given, assume, example +from bioframework.index_filter import qual_filter, filter_on_index_quality_interleaved + +above_min = lambda minimum: lambda x: min(x.letter_annotations['phred_quality']) >= minimum +make_seqrec = lambda id, seq, quals: \ + SeqRecord(Seq(seq, IUPAC.ambiguous_dna), id=str(id), description='', letter_annotations={'phred_quality':quals}) + +rec = st.integers(min_value=1, max_value=10).flatmap( + lambda n: + st.builds( + make_seqrec, + st.integers(), + st.text(alphabet='ATGCN', min_size=n, max_size=n), + st.lists(st.integers(min_value=35, max_value=40), min_size=n, max_size=n) + ) +) + +reads_and_indices = st.integers(min_value=1,max_value=5).flatmap( + lambda n: + st.tuples( + st.integers(min_value=0, max_value=40), + st.lists(rec, min_size=n, max_size=n).map(lambda x: list(interleave(x, x))), + st.lists(rec, min_size=n, max_size=n), + st.lists(rec, min_size=n, max_size=n), + )) + +def ilen(seq): return sum(1 for _ in seq) + +class TestIndexQualityFilter(unittest.TestCase): + @given(reads_and_indices) + def test_idempotent(self, seqs): + self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:]))) + + @given(reads_and_indices) + def test_always_less_or_equal_size(self, seqs): + self.assertLessEqual(ilen(qual_filter(*seqs)), ilen(seqs[1])) + + @given(reads_and_indices) + def test_result_is_even(self, seqs): + self.assertTrue(ilen(qual_filter(*seqs)) % 2 == 0) + + @given(reads_and_indices) + def test_result_is_interleaved(self, seqs): + result = qual_filter(*seqs) + pairs = partition(2, result) + matches = map(lambda (x,y): x.id == y.id, pairs) + self.assertTrue(all(matches)) + + @given(reads_and_indices) + def test_drops_all_if_all_lower(self, seqs): + (_min, reads, i1, i2) = seqs + assume(not any(map(above_min(_min), i1))) + result = qual_filter(*seqs) + self.assertEqual([], list(result)) + + @given(reads_and_indices) + def test_drops_none_if_all_above(self, seqs): + (_min, reads, i1, i2) = seqs + assume(all(map(above_min(_min), i1))) + assume(all(map(above_min(_min), i2))) + result = qual_filter(*seqs) + self.assertEqual(reads, list(result)) + + @given(reads_and_indices) + def test_drops_some_if_any_lower(self, seqs): + (_min, reads, i1, i2) = seqs + assume(not all(map(above_min(_min), i1 + i2))) + result = qual_filter(*seqs) + self.assertLess(ilen(result), ilen(reads)) + + @given(reads_and_indices) + #@hypothesis.settings(max_examples=3) + def test_drops_some_if_any_lower_with_IO(self, seqs): + (_min, reads, i1, i2) = seqs + assume(not all(map(above_min(_min), i1 + i2))) + names = ['r', 'i1', 'i2'] + map(partial(SeqIO.write, format='fastq'), seqs[1:], names) + outfile = 'foo' + filter_on_index_quality_interleaved(names[0], names[1], names[2], outfile, _min) + # version using the jip module itself not working + #cmd = sh.Command('jip_modules/interleaved_index_filter.jip') + #cmd(i=names[0], index1=names[1], index2=names[2], minimum=_min, o=outfile) # could use stdout + result = SeqIO.parse(outfile, 'fastq') + self.assertLess(ilen(result), ilen(reads)) + + +if __name__ == '__main__': + unittest.main() +# index files should be unchanged +# the number of reads left should be original - number of bad reads in each index which don't overlap +# can use find(strategy, boolean) +#_not = lambda f: lambda x: not f(x) From a9cdb9bac2049547168896e403210f1f3f3c38d7 Mon Sep 17 00:00:00 2001 From: Mike Panciera Date: Tue, 9 Feb 2016 21:15:10 -0500 Subject: [PATCH 3/6] filtering--with index--is not idempotent --- tests/test_filter_hypothesis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_filter_hypothesis.py b/tests/test_filter_hypothesis.py index 1493762..9e80b78 100644 --- a/tests/test_filter_hypothesis.py +++ b/tests/test_filter_hypothesis.py @@ -42,9 +42,9 @@ def ilen(seq): return sum(1 for _ in seq) class TestIndexQualityFilter(unittest.TestCase): - @given(reads_and_indices) - def test_idempotent(self, seqs): - self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:]))) +# @given(reads_and_indices) +# def test_idempotent(self, seqs): +# self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:]))) @given(reads_and_indices) def test_always_less_or_equal_size(self, seqs): From ddeaa183846983edf0126f3cce48de2333597bf7 Mon Sep 17 00:00:00 2001 From: Mike Panciera Date: Tue, 9 Feb 2016 21:55:36 -0500 Subject: [PATCH 4/6] add hypothesis and sh to requirements --- tests/requirements-dev.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/requirements-dev.txt b/tests/requirements-dev.txt index 1758ead..d96de86 100644 --- a/tests/requirements-dev.txt +++ b/tests/requirements-dev.txt @@ -2,3 +2,5 @@ nose mock python-coveralls testfixtures +sh +hypothesis From 0bf52b6dab3cfef9ac74038f85857fb8eed6e447 Mon Sep 17 00:00:00 2001 From: Tyghe Vallard Date: Wed, 10 Feb 2016 09:43:32 -0500 Subject: [PATCH 5/6] fixed seqio error where output file could be empty string --- jip_modules/interleaved_index_filter.jip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jip_modules/interleaved_index_filter.jip b/jip_modules/interleaved_index_filter.jip index 9468c61..6f1ea71 100755 --- a/jip_modules/interleaved_index_filter.jip +++ b/jip_modules/interleaved_index_filter.jip @@ -15,6 +15,6 @@ #%begin command python from bioframework.index_filter import filter_on_index_quality_interleaved -filter_on_index_quality_interleaved("${input}", "${index1}", "${index2}", "${output}", "${min_qual}") +filter_on_index_quality_interleaved("${input|else('/dev/stdin')}", "${index1}", "${index2}", "${output|else('/dev/stdout')}", "${min_qual}") #%end From 2d2efd779fcf9c2957ee5ba9dc891e856bdd0cd9 Mon Sep 17 00:00:00 2001 From: averagehat Date: Wed, 10 Feb 2016 11:04:07 -0500 Subject: [PATCH 6/6] more tests, all working locally --- tests/test_filter_hypothesis.py | 50 ++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/tests/test_filter_hypothesis.py b/tests/test_filter_hypothesis.py index 9e80b78..46a15ed 100644 --- a/tests/test_filter_hypothesis.py +++ b/tests/test_filter_hypothesis.py @@ -1,6 +1,7 @@ #TODO: make io test use commandline jip #NOTE: this duplicates a lot of the tested code's logic (see IO, above_min, imports) and API dependencies (see imports) from Bio.SeqRecord import SeqRecord +import os import sh from Bio.Seq import Seq from Bio.Alphabet import IUPAC @@ -8,7 +9,8 @@ from functools import partial import re from bioframework.seqio import interleave -import unittest +import unittest +from itertools import takewhile import operator from toolz.itertoolz import partition import hypothesis @@ -42,9 +44,6 @@ def ilen(seq): return sum(1 for _ in seq) class TestIndexQualityFilter(unittest.TestCase): -# @given(reads_and_indices) -# def test_idempotent(self, seqs): -# self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:]))) @given(reads_and_indices) def test_always_less_or_equal_size(self, seqs): @@ -59,7 +58,7 @@ def test_result_is_interleaved(self, seqs): result = qual_filter(*seqs) pairs = partition(2, result) matches = map(lambda (x,y): x.id == y.id, pairs) - self.assertTrue(all(matches)) + self.assertTrue(all(matches)) @given(reads_and_indices) def test_drops_all_if_all_lower(self, seqs): @@ -82,26 +81,43 @@ def test_drops_some_if_any_lower(self, seqs): assume(not all(map(above_min(_min), i1 + i2))) result = qual_filter(*seqs) self.assertLess(ilen(result), ilen(reads)) - + def run_io(self, seqs): + names = ['r', 'i1', 'i2'] + outfile = 'foo' + map(partial(SeqIO.write, format='fastq'), seqs[1:], names) + cmd = sh.Command('jip_modules/interleaved_index_filter.jip') + cmd(i=names[0], index1=names[1], index2=names[2], minimum=seqs[1], o=outfile) # could use stdout + result = SeqIO.parse(outfile, 'fastq') + return result +# @given(reads_and_indices) - #@hypothesis.settings(max_examples=3) + @hypothesis.settings(max_examples=20) def test_drops_some_if_any_lower_with_IO(self, seqs): (_min, reads, i1, i2) = seqs assume(not all(map(above_min(_min), i1 + i2))) - names = ['r', 'i1', 'i2'] - map(partial(SeqIO.write, format='fastq'), seqs[1:], names) - outfile = 'foo' - filter_on_index_quality_interleaved(names[0], names[1], names[2], outfile, _min) - # version using the jip module itself not working - #cmd = sh.Command('jip_modules/interleaved_index_filter.jip') - #cmd(i=names[0], index1=names[1], index2=names[2], minimum=_min, o=outfile) # could use stdout - result = SeqIO.parse(outfile, 'fastq') + result = self.run_io(seqs) self.assertLess(ilen(result), ilen(reads)) + @given(reads_and_indices) + def test_drops_correct_seq(self, seqs): + (_min, reads, i1, i2) = seqs + assume(not all(map(above_min(_min), i1))) + first_failing_index = ilen(takewhile(above_min(_min), i1)) + failing_reads = reads[first_failing_index*2:first_failing_index*2+2] + result = list(qual_filter(*seqs)) + self.assertFalse(failing_reads[0] in result or failing_reads[1] in result) + @given(reads_and_indices) + @hypothesis.settings(max_examples=20) + def test_drops_correct_seq_IO(self, seqs): + (_min, reads, i1, i2) = seqs + assume(not all(map(above_min(_min), i1))) + first_failing_index = ilen(takewhile(above_min(_min), i1)) + failing_reads = reads[first_failing_index*2:first_failing_index*2+2] + result = self.run_io(seqs) + self.assertFalse(failing_reads[0] in result or failing_reads[1] in result) if __name__ == '__main__': unittest.main() # index files should be unchanged # the number of reads left should be original - number of bad reads in each index which don't overlap -# can use find(strategy, boolean) -#_not = lambda f: lambda x: not f(x) +# can use find(strategy, boolean)