From 3249d9182e8538a0a61465a8b4e00b72e2b18bf1 Mon Sep 17 00:00:00 2001
From: michaelpanciera <michael.panciera.work@gmail.com>
Date: Tue, 9 Feb 2016 16:35:13 -0500
Subject: [PATCH 1/6] refactor index_filter

---
 bioframework/index_filter.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/bioframework/index_filter.py b/bioframework/index_filter.py
index 8aaaa15..dd72754 100644
--- a/bioframework/index_filter.py
+++ b/bioframework/index_filter.py
@@ -30,20 +30,21 @@ def filter_on_index_quality(input, output, minimum):
 def filter_on_index_quality_interleaved(interleaved, index1, index2, output, minimum):
     '''enpair the interleaved read file,zip that enpaired with the two indexes
     drop pairs from the interleaved file if either *index* is below the minimum'''
+    return write_zip_results(partial(qual_filter, minimum), output, 'fastq', interleaved, index1, index2)
+
+def qual_filter(minimum, interleaved, idx1, idx2):
     above_min = lambda x: min(x.letter_annotations['phred_quality']) >= minimum
     def indexes_above_min(seq_index):
         seq, idx1, idx2 = seq_index
         return above_min(idx1) and above_min(idx2)
-    def qual_filter(interleaved, idx1, idx2):
-        # pair together forward/reverse reads
-        #  [SeqRecord(forward), SeqRecord(reverse)]
-        interleaved = partition(2, interleaved)
-        # Zip together indexes with pairs 
-        #  [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)]
-        zipped = izip(interleaved, idx1, idx2)
-        # Filter out all indexes with less than min qual and then grab
-        # only the interleaved sequence tuple
-        filtered = imap(first, ifilter(indexes_above_min, zipped))
-        # Chain together all sequences
-        return chain.from_iterable(filtered)
-    return write_zip_results(qual_filter, output, 'fastq', interleaved, index1, index2)
+    # pair together forward/reverse reads
+    #  [SeqRecord(forward), SeqRecord(reverse)]
+    interleaved = partition(2, interleaved)
+    # Zip together indexes with pairs 
+    #  [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)]
+    zipped = izip(interleaved, idx1, idx2)
+    # Filter out all indexes with less than min qual and then grab
+    # only the interleaved sequence tuple
+    filtered = imap(first, ifilter(indexes_above_min, zipped))
+    # Chain together all sequences
+    return chain.from_iterable(filtered)

From 60152ac528438dab622fb85cdc50054599769d69 Mon Sep 17 00:00:00 2001
From: michaelpanciera <michael.panciera.work@gmail.com>
Date: Tue, 9 Feb 2016 18:46:35 -0500
Subject: [PATCH 2/6] hypothesis tests

---
 tests/test_filter_hypothesis.py | 107 ++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 tests/test_filter_hypothesis.py

diff --git a/tests/test_filter_hypothesis.py b/tests/test_filter_hypothesis.py
new file mode 100644
index 0000000..1493762
--- /dev/null
+++ b/tests/test_filter_hypothesis.py
@@ -0,0 +1,107 @@
+#TODO: make io test use commandline jip
+#NOTE: this duplicates  a lot of the tested code's logic (see IO, above_min, imports) and API dependencies (see imports)
+from Bio.SeqRecord import SeqRecord
+import sh
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+from Bio import SeqIO
+from functools import partial
+import re
+from bioframework.seqio import interleave
+import unittest 
+import operator
+from toolz.itertoolz import partition
+import hypothesis
+from hypothesis import strategies as st
+from hypothesis import find, note, given, assume, example
+from bioframework.index_filter import qual_filter, filter_on_index_quality_interleaved
+
+above_min = lambda minimum: lambda x: min(x.letter_annotations['phred_quality']) >= minimum
+make_seqrec = lambda id, seq, quals: \
+                SeqRecord(Seq(seq, IUPAC.ambiguous_dna), id=str(id), description='', letter_annotations={'phred_quality':quals})
+
+rec = st.integers(min_value=1, max_value=10).flatmap(
+    lambda n:
+        st.builds(
+            make_seqrec,
+            st.integers(),
+            st.text(alphabet='ATGCN', min_size=n, max_size=n),
+            st.lists(st.integers(min_value=35, max_value=40), min_size=n, max_size=n)
+        )
+)
+
+reads_and_indices = st.integers(min_value=1,max_value=5).flatmap(
+   lambda n:
+   st.tuples(
+      st.integers(min_value=0, max_value=40),
+      st.lists(rec, min_size=n, max_size=n).map(lambda x: list(interleave(x, x))),
+      st.lists(rec, min_size=n, max_size=n),
+      st.lists(rec, min_size=n, max_size=n),
+   ))
+
+def ilen(seq): return sum(1 for _ in seq)
+
+class TestIndexQualityFilter(unittest.TestCase):
+    @given(reads_and_indices)
+    def test_idempotent(self, seqs):
+        self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:])))
+
+    @given(reads_and_indices)
+    def test_always_less_or_equal_size(self, seqs):
+        self.assertLessEqual(ilen(qual_filter(*seqs)), ilen(seqs[1]))
+
+    @given(reads_and_indices)
+    def test_result_is_even(self, seqs):
+        self.assertTrue(ilen(qual_filter(*seqs)) % 2 == 0)
+
+    @given(reads_and_indices)
+    def test_result_is_interleaved(self, seqs):
+        result = qual_filter(*seqs)
+        pairs = partition(2, result)
+        matches = map(lambda (x,y): x.id == y.id, pairs)
+        self.assertTrue(all(matches)) 
+
+    @given(reads_and_indices)
+    def test_drops_all_if_all_lower(self, seqs):
+        (_min, reads, i1, i2) = seqs
+        assume(not any(map(above_min(_min), i1)))
+        result = qual_filter(*seqs)
+        self.assertEqual([], list(result))
+
+    @given(reads_and_indices)
+    def test_drops_none_if_all_above(self, seqs):
+        (_min, reads, i1, i2) = seqs
+        assume(all(map(above_min(_min), i1)))
+        assume(all(map(above_min(_min), i2)))
+        result = qual_filter(*seqs)
+        self.assertEqual(reads, list(result))
+
+    @given(reads_and_indices)
+    def test_drops_some_if_any_lower(self, seqs):
+        (_min, reads, i1, i2) = seqs
+        assume(not all(map(above_min(_min), i1 + i2)))
+        result = qual_filter(*seqs)
+        self.assertLess(ilen(result), ilen(reads))
+
+    @given(reads_and_indices)
+    #@hypothesis.settings(max_examples=3)
+    def test_drops_some_if_any_lower_with_IO(self, seqs):
+        (_min, reads, i1, i2) = seqs
+        assume(not all(map(above_min(_min), i1 + i2)))
+        names =  ['r', 'i1', 'i2']
+        map(partial(SeqIO.write, format='fastq'), seqs[1:], names)
+        outfile = 'foo'
+        filter_on_index_quality_interleaved(names[0], names[1], names[2], outfile, _min)
+        # version using the jip module itself not working
+        #cmd = sh.Command('jip_modules/interleaved_index_filter.jip')
+        #cmd(i=names[0], index1=names[1], index2=names[2], minimum=_min, o=outfile) # could use stdout
+        result = SeqIO.parse(outfile, 'fastq')
+        self.assertLess(ilen(result), ilen(reads))
+
+
+if __name__ == '__main__':
+    unittest.main()
+# index files should be unchanged
+# the number of reads left should be original - number of bad reads in each index which don't overlap
+# can use find(strategy, boolean) 
+#_not = lambda f: lambda x: not f(x)

From a9cdb9bac2049547168896e403210f1f3f3c38d7 Mon Sep 17 00:00:00 2001
From: Mike Panciera <michael.panciera.work@gmail.com>
Date: Tue, 9 Feb 2016 21:15:10 -0500
Subject: [PATCH 3/6] filtering--with index--is not idempotent

---
 tests/test_filter_hypothesis.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_filter_hypothesis.py b/tests/test_filter_hypothesis.py
index 1493762..9e80b78 100644
--- a/tests/test_filter_hypothesis.py
+++ b/tests/test_filter_hypothesis.py
@@ -42,9 +42,9 @@
 def ilen(seq): return sum(1 for _ in seq)
 
 class TestIndexQualityFilter(unittest.TestCase):
-    @given(reads_and_indices)
-    def test_idempotent(self, seqs):
-        self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:])))
+#    @given(reads_and_indices)
+#    def test_idempotent(self, seqs):
+#        self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:])))
 
     @given(reads_and_indices)
     def test_always_less_or_equal_size(self, seqs):

From ddeaa183846983edf0126f3cce48de2333597bf7 Mon Sep 17 00:00:00 2001
From: Mike Panciera <michael.panciera.work@gmail.com>
Date: Tue, 9 Feb 2016 21:55:36 -0500
Subject: [PATCH 4/6] add hypothesis and sh to requirements

---
 tests/requirements-dev.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/requirements-dev.txt b/tests/requirements-dev.txt
index 1758ead..d96de86 100644
--- a/tests/requirements-dev.txt
+++ b/tests/requirements-dev.txt
@@ -2,3 +2,5 @@ nose
 mock
 python-coveralls 
 testfixtures
+sh
+hypothesis

From 0bf52b6dab3cfef9ac74038f85857fb8eed6e447 Mon Sep 17 00:00:00 2001
From: Tyghe Vallard <vallardt@gmail.com>
Date: Wed, 10 Feb 2016 09:43:32 -0500
Subject: [PATCH 5/6] fixed seqio error where output file could be empty string

---
 jip_modules/interleaved_index_filter.jip | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jip_modules/interleaved_index_filter.jip b/jip_modules/interleaved_index_filter.jip
index 9468c61..6f1ea71 100755
--- a/jip_modules/interleaved_index_filter.jip
+++ b/jip_modules/interleaved_index_filter.jip
@@ -15,6 +15,6 @@
 
 #%begin command python
 from bioframework.index_filter import filter_on_index_quality_interleaved
-filter_on_index_quality_interleaved("${input}", "${index1}", "${index2}", "${output}", "${min_qual}") 
+filter_on_index_quality_interleaved("${input|else('/dev/stdin')}", "${index1}", "${index2}", "${output|else('/dev/stdout')}", "${min_qual}") 
 #%end
 

From 2d2efd779fcf9c2957ee5ba9dc891e856bdd0cd9 Mon Sep 17 00:00:00 2001
From: averagehat <michael.panciera.work@gmail.com>
Date: Wed, 10 Feb 2016 11:04:07 -0500
Subject: [PATCH 6/6] more tests, all working locally

---
 tests/test_filter_hypothesis.py | 50 ++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/tests/test_filter_hypothesis.py b/tests/test_filter_hypothesis.py
index 9e80b78..46a15ed 100644
--- a/tests/test_filter_hypothesis.py
+++ b/tests/test_filter_hypothesis.py
@@ -1,6 +1,7 @@
 #TODO: make io test use commandline jip
 #NOTE: this duplicates  a lot of the tested code's logic (see IO, above_min, imports) and API dependencies (see imports)
 from Bio.SeqRecord import SeqRecord
+import os
 import sh
 from Bio.Seq import Seq
 from Bio.Alphabet import IUPAC
@@ -8,7 +9,8 @@
 from functools import partial
 import re
 from bioframework.seqio import interleave
-import unittest 
+import unittest
+from itertools import takewhile
 import operator
 from toolz.itertoolz import partition
 import hypothesis
@@ -42,9 +44,6 @@
 def ilen(seq): return sum(1 for _ in seq)
 
 class TestIndexQualityFilter(unittest.TestCase):
-#    @given(reads_and_indices)
-#    def test_idempotent(self, seqs):
-#        self.assertSequenceEqual(list(qual_filter(*seqs)), list(qual_filter(list(qual_filter(*seqs)), *seqs[1:])))
 
     @given(reads_and_indices)
     def test_always_less_or_equal_size(self, seqs):
@@ -59,7 +58,7 @@ def test_result_is_interleaved(self, seqs):
         result = qual_filter(*seqs)
         pairs = partition(2, result)
         matches = map(lambda (x,y): x.id == y.id, pairs)
-        self.assertTrue(all(matches)) 
+        self.assertTrue(all(matches))
 
     @given(reads_and_indices)
     def test_drops_all_if_all_lower(self, seqs):
@@ -82,26 +81,43 @@ def test_drops_some_if_any_lower(self, seqs):
         assume(not all(map(above_min(_min), i1 + i2)))
         result = qual_filter(*seqs)
         self.assertLess(ilen(result), ilen(reads))
-
+    def run_io(self, seqs):
+        names =  ['r', 'i1', 'i2']
+        outfile = 'foo'
+        map(partial(SeqIO.write, format='fastq'), seqs[1:], names)
+        cmd = sh.Command('jip_modules/interleaved_index_filter.jip')
+        cmd(i=names[0], index1=names[1], index2=names[2], minimum=seqs[1], o=outfile) # could use stdout
+        result = SeqIO.parse(outfile, 'fastq')
+        return result
+#
     @given(reads_and_indices)
-    #@hypothesis.settings(max_examples=3)
+    @hypothesis.settings(max_examples=20)
     def test_drops_some_if_any_lower_with_IO(self, seqs):
         (_min, reads, i1, i2) = seqs
         assume(not all(map(above_min(_min), i1 + i2)))
-        names =  ['r', 'i1', 'i2']
-        map(partial(SeqIO.write, format='fastq'), seqs[1:], names)
-        outfile = 'foo'
-        filter_on_index_quality_interleaved(names[0], names[1], names[2], outfile, _min)
-        # version using the jip module itself not working
-        #cmd = sh.Command('jip_modules/interleaved_index_filter.jip')
-        #cmd(i=names[0], index1=names[1], index2=names[2], minimum=_min, o=outfile) # could use stdout
-        result = SeqIO.parse(outfile, 'fastq')
+        result = self.run_io(seqs)
         self.assertLess(ilen(result), ilen(reads))
 
+    @given(reads_and_indices)
+    def test_drops_correct_seq(self, seqs):
+        (_min, reads, i1, i2) = seqs
+        assume(not all(map(above_min(_min), i1)))
+        first_failing_index = ilen(takewhile(above_min(_min), i1))
+        failing_reads = reads[first_failing_index*2:first_failing_index*2+2]
+        result = list(qual_filter(*seqs))
+        self.assertFalse(failing_reads[0] in result or failing_reads[1] in result)
 
+    @given(reads_and_indices)
+    @hypothesis.settings(max_examples=20)
+    def test_drops_correct_seq_IO(self, seqs):
+        (_min, reads, i1, i2) = seqs
+        assume(not all(map(above_min(_min), i1)))
+        first_failing_index = ilen(takewhile(above_min(_min), i1))
+        failing_reads = reads[first_failing_index*2:first_failing_index*2+2]
+        result = self.run_io(seqs)
+        self.assertFalse(failing_reads[0] in result or failing_reads[1] in result)
 if __name__ == '__main__':
     unittest.main()
 # index files should be unchanged
 # the number of reads left should be original - number of bad reads in each index which don't overlap
-# can use find(strategy, boolean) 
-#_not = lambda f: lambda x: not f(x)
+# can use find(strategy, boolean)