Merge pull request #9 from averagehat/master

More-random
VDBWRAIR · Mar 31, 2015 · ba4edec · ba4edec
2 parents b8a6f9e + a9fea0a
commit ba4edec
Show file tree

Hide file tree

Showing 8 changed files with 120 additions and 22 deletions.
diff --git a/parallel.sh b/parallel.sh
@@ -9,7 +9,7 @@ mkdir -p $outdir
 outfile=${bamfile//[\/\.]/_}.minimized.$depth
 ngs="/home/AMED/michael.panciera/projects/ngs_mapper/ngs_mapper" 
 compiled=$outdir/compiled.${outfile}.bam;  
-
+MERGE=true
 let i=0
 for ref  in $refs; 
 do  
@@ -18,13 +18,24 @@ do
     (
     out=${tmpdir}/${i}; #${outifile}
     python subsample_mindepth.py $bamfile $ref --subsample $depth $orphans > $out.sam;
-    samtools view -hSb $out.sam  > $out.bam; #| samtools sort - $out; samtools index $out.bam;
-    ) &
+    samtools view -hSb $out.sam  > $out.bam; #| samtools sort - $out; samtools index $out.bam; 
     ref=${ref//[\/\|]/_}; 
+    if [ ! "$MERGE" = true ]
+    then
+        singleout=$outdir/${ref}.minimized.$depth.bam
+        mv $out.bam $singleout
+        samtools sort $singleout $singleout;
+        samtools index  $singleout; 
+        python $ngs/bam_to_qualdepth.py $singleout > $singleout.json
+        python  $ngs/graph_qualdepth.py $singleout.json -o $singleout.png 
+    fi 
+    ) & 
     echo "$ref saved to $out.bam";
 done; 
 wait
 
+if [ ! "$MERGE" = true ]; then exit; fi
+
 if [ $i -ge 2 ];
 then  
     echo "merging files."

diff --git a/subsample_mindepth.py b/subsample_mindepth.py
@@ -4,9 +4,9 @@
 import subsamplebam
 import subprocess as sp
 import re
+import random
 
-''' Python3 compatibility '''
-
+''' Python3 compatibility ''' 
 from past.builtins import map, xrange, filter
 
 class CommonEqualityMixin(object): 
@@ -48,11 +48,12 @@ class DepthMatrix(CommonEqualityMixin):
     Keep track of the picked alignments (seq_matrix) and .depth_array, representing the coverage at each sequence position.
     '''
 
-    def __init__(self, min_depth, allow_orphans=False):
+    def __init__(self, min_depth, allow_orphans=False, more_random=False):
         self.seq_matrix = [[]]
         #self.depth_array = np.zeros(ref_length)
         self.min_depth = min_depth 
         self.allow_orphans = allow_orphans
+        self.more_random = more_random
 
 
     def allow(self, seq):
@@ -85,11 +86,15 @@ def yield_greatest_overlaps(self, under_index, num_needed):
         matches = 0
         while matches < num_needed and candidate_sequences: 
             # could instead keep sequences in order sorted by overlap 
-            farthest_overlap_seq = max(candidate_sequences, key=lambda seq: seq.overlap)
-            candidate_sequences.remove(farthest_overlap_seq)
-            farthest_overlap_seq.pick()
+            if self.more_random: 
+                next_seq = random.choice(candidate_sequences)
+            else: 
+                next_seq = max(candidate_sequences, key=lambda seq: seq.overlap)
+            candidate_sequences.remove(next_seq)
+            next_seq.pick()
             matches += 1
-            yield farthest_overlap_seq
+            yield next_seq
+
     def pickreads(self, pos, needed_depth):
         ''' 
         1. get all sequences which could overlap under_index, and which have not already been selected. (how get these, by using samtools view again? or store in memory? only need a UNIQUE identifier and the sequence length)
@@ -117,7 +122,6 @@ def get_depths(self, reads):
        '''
        #TODO:Needlessly creates a depth_array of equal size to self.depth_array. Also needlessly slow.
        depths = np.zeros(len(self.depth_array))
-       #import ipdb; ipdb.set_trace()
        for seq in reads: 
            ''' Even if a sequence would overlap past the length of depth-array, we don't include that in depth-array. 
            the numpy arrays must be the same size in order to add them.  '''
@@ -131,7 +135,6 @@ def make_seq_matrix(self, bamfile, regionstr):
         :param str regionstr: reference sequence and length as listed in .bam header i.e.  <refname>:1-1000
         Parse a bam file using sam tools, and store the alignments as a 2d matrix, where each row is a posiiton in the reference sequence.  
         '''
-        #import ipdb; ipdb.set_trace()
         all_alignments = get_alignments(bamfile, regionstr)
         max_pos = max([seq.pos for seq in all_alignments])
         max_overlap = max([seq.overlap for seq in all_alignments])
@@ -145,7 +148,6 @@ def minimize_depths(self):
         Trim self.seq_matrix to minimize coverage overflow.
         For each position in the reference sequence, pick reads until the minimum depth is met if possible.
         '''
-        #import ipdb; ipdb.set_trace()
         for pos, depth in enumerate(self.depth_array):
             if depth < self.min_depth:
                 needed_depth = self.min_depth - depth
@@ -194,7 +196,7 @@ def main():
     sys.stderr.write(str(args)+'\n') 
 #    ref_length = int(args.regionstr.split(':')[-1].split('-')[-1])
 #    region_str = args.regionstr.split(':')[0]
-    matrix = DepthMatrix(args.subsample, allow_orphans=args.count_orphans) 
+    matrix = DepthMatrix(args.subsample, allow_orphans=args.count_orphans, more_random=args.more_random)
     matrix.make_seq_matrix(args.bamfile, args.refseq)
     matrix.minimize_depths()
     ''' Flatten the matrix '''

diff --git a/subsamplebam.py b/subsamplebam.py
@@ -53,8 +53,13 @@ def parse_args(wrapper=False):
             action='store_true',
             help='Allow orphan/unpaired reads.'
             )
-
 
+    parser.add_argument(
+            '-r', '--more-random',
+            action='store_true',
+            help='Subsample with more random read-picking at expense of minimizing depth.',
+            default=False
+            ) 
 
     return parser.parse_args()
 

diff --git a/tests/result.sam b/tests/result.sam
@@ -0,0 +1,11 @@
+@HD	VN:1.0	SO:coordinate
+@SQ	SN:gi|110640213|ref|NC_008253.1|	LN:4938920
+@PG	ID:bowtie2	PN:bowtie2	VN:2.1.0 
+gi|110640213|ref|NC_008253.1|_2_451_1:0:0_1:0:0_12b/1	0	gi|110640213|ref|NC_008253.1|	0	42	3M	*	0	0	GCT	222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:45A24	YT:Z:UU
+gi|110640213|ref|NC_008253.1|_4_510_1:0:0_2:0:0_114/1	0	gi|110640213|ref|NC_008253.1|	0	42	3M	*	0	0	TTT	222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:26A43	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_5_452_2:0:0_1:0:0_1d/1	0	gi|110640213|ref|NC_008253.1|	0	42	2M	*	0	0	TT	22	AS:i:-6	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:31C23C14	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_6_397_0:0:0_0:0:0_2c1/1	0	gi|110640213|ref|NC_008253.1|	0	42	4M	*	0	0	TTTC	2222	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:70	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_7_500_3:0:0_1:0:0_250/1	0	gi|110640213|ref|NC_008253.1|	1	40	6M	*	0	0	TCATTC	222222	AS:i:-9	XN:i:0	XM:i:3	XO:i:0	XG:i:0	NM:i:3	MD:Z:9C27A5A26	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_9_557_0:0:0_2:0:0_13d/1	0	gi|110640213|ref|NC_008253.1|	1	42	2M	*	0	0	AT	22	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:70	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_10_554_1:0:0_1:0:0_2a/1	0	gi|110640213|ref|NC_008253.1|	2	42	6M	*	0	0	TTCTGA	222222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:15G54	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_11_515_1:0:0_1:0:0_335/1	0	gi|110640213|ref|NC_008253.1|	2	42	3M	*	0	0	TCT	222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:10A59	YT:Z:UU
diff --git a/tests/simplein.bam b/tests/simplein.bam
diff --git a/tests/simplein.bam.bai b/tests/simplein.bam.bai
diff --git a/tests/simplein.sam b/tests/simplein.sam
@@ -0,0 +1,13 @@
+@HD	VN:1.0	SO:coordinate
+@SQ	SN:gi|110640213|ref|NC_008253.1|	LN:4938920
+@PG	ID:bowtie2	PN:bowtie2	VN:2.1.0 
+gi|110640213|ref|NC_008253.1|_2_451_1:0:0_1:0:0_12b/1	0	gi|110640213|ref|NC_008253.1|	0	42	3M	*	0	0	GCT	222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:45A24	YT:Z:UU
+gi|110640213|ref|NC_008253.1|_4_510_1:0:0_2:0:0_114/1	0	gi|110640213|ref|NC_008253.1|	0	42	3M	*	0	0	TTT	222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:26A43	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_5_452_2:0:0_1:0:0_1d/1	0	gi|110640213|ref|NC_008253.1|	0	42	2M	*	0	0	TT	22	AS:i:-6	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:31C23C14	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_6_397_0:0:0_0:0:0_2c1/1	0	gi|110640213|ref|NC_008253.1|	0	42	4M	*	0	0	TTTC	2222	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:70	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_7_500_3:0:0_1:0:0_250/1	0	gi|110640213|ref|NC_008253.1|	1	40	6M	*	0	0	TCATTC	222222	AS:i:-9	XN:i:0	XM:i:3	XO:i:0	XG:i:0	NM:i:3	MD:Z:9C27A5A26	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_9_557_0:0:0_2:0:0_13d/1	0	gi|110640213|ref|NC_008253.1|	1	42	2M	*	0	0	AT	22	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:70	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_10_554_1:0:0_1:0:0_2a/1	0	gi|110640213|ref|NC_008253.1|	2	42	6M	*	0	0	TTCTGA	222222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:15G54	YT:Z:UU
+gi|10640213|ref|NC_008253.1|_11_515_1:0:0_1:0:0_335/1	0	gi|110640213|ref|NC_008253.1|	2	42	3M	*	0	0	TCT	222	AS:i:-3	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:10A59	YT:Z:UU
+
+
diff --git a/tests/test_mindepth.py b/tests/test_mindepth.py
@@ -25,9 +25,20 @@
 
 THISD = os.path.dirname(os.path.abspath(__file__))
 
+''' Instead of a random alignment, return the smallest alignment to make testing simpler '''
+def not_random_subsample(collection):
+    return min(collection, key=lambda seq: seq.seq_length)
+
 def mock_args(wrapper=False): 
         bamfile = os.path.join(THISD, 'ecoli.bam')
-        return Namespace(reflength=1000, subsample=40, count_orphans=True, bamfile=bamfile, refseq="gi|110640213|ref|NC_008253.1|")
+        return Namespace(reflength=1000, subsample=40, count_orphans=True, bamfile=bamfile, refseq="gi|110640213|ref|NC_008253.1|", more_random=False)
+
+
+def more_random_mock_args(wrapper=False): 
+        bamfile = os.path.join(THISD, 'simplein.bam')
+        # empty refseq hack to get around malformed .bam file
+        return Namespace(reflength=1000, subsample=3, count_orphans=True, bamfile=bamfile, refseq="", more_random=True)
+
 
 
 def mock_get_raw_reads( bamfile, regionstr=''):
@@ -116,7 +127,7 @@ def test_yield_greatest_overlaps_ecoli(self):
     def test_pickreads(self):
         pass
 
-    def test_minimize_depths_ecoli(self):
+    def test_minimize_depths_complex(self):
         self.complexSetUp()
         seqs = map(sub.parse_alignment, self.raw_reads) 
         expected_matrix =  [
@@ -143,16 +154,61 @@ def test_minimize_depths_ecoli(self):
 
 
 
+    def check_files(self, mock_stdout, expected_file):
+        samfile = os.path.join(THISD, expected_file)
+        expected = open(samfile, 'r').read().strip()
+        result = str(mock_stdout.getvalue().strip().decode('UTF-8'));
+        if expected_file == 'simplein.sam':
+            with open('tests/result.sam', 'w') as out: out.write(result)
+        return expected, result
+
+
+    @mock.patch('subsample_mindepth.random.choice', side_effect=not_random_subsample)
     @mock.patch("subsamplebam.parse_args", side_effect=mock_args)
     @mock.patch('subsamplebam.sys.stdout', new_callable=BytesIO)
-    def test_main(self, mock_stdout, func):
+    def test_main(self, mock_stdout, func, func2):
         sub.main() 
+        self.assertEquals(*self.check_files(mock_stdout, 'test40.sam'))
+
 
-        samfile = os.path.join(THISD, 'test40.sam')
-        expected = open(samfile, 'r').read().strip()
-        result = str(mock_stdout.getvalue().strip().decode('UTF-8'));
-        self.assertEquals(expected, result)
 
+    ''' The order of these mocks is the order they appear as arguments to the test method. '''
+#    @mock.patch('subsample_mindepth.random.sample', side_effect=not_random_subsample)
+#    @mock.patch("subsamplebam.parse_args", side_effect=more_random_mock_args)
+#    @mock.patch('subsamplebam.sys.stdout', new_callable=BytesIO)
+#    def test_main_more_random(self, mock_stdout, func, func2):
+#        #Expected will equal the input file becasue all reads are subsampled in this case
+#        sub.main()
+#        self.maxDiff = None
+#        expected, result = self.check_files(mock_stdout, 'simplein.sam')
+#        self.assertEquals(expected, result)
+
+
+    @mock.patch('subsample_mindepth.random.choice', side_effect=not_random_subsample)
+    def test_minimize_depths_complex_more_random(self, func):
+        #TODO: Refactor this so as not to duplicate code with non-random ecoli test
+        self.complexSetUp()
+        seqs = map(sub.parse_alignment, self.raw_reads) 
+        expected_matrix =  [
+                [seqs[0],  seqs[1],  seqs[2], seqs[3]],
+                [seqs[4], seqs[5]],
+                [seqs[6], seqs[7]]
+                ]
+        expected_list = seqs
+        for seq in expected_list:
+            seq.pick()
+        expected_depths = np.array([4, 6, 7, 4, 3, 2, 2, 1])
+        self.matrix.min_depth = 3
+        self.matrix.more_random = True
+        self.matrix.minimize_depths()
+        actual_matrix = self.matrix.seq_matrix
+        for row in actual_matrix:
+            for seq in row:
+                if not seq.picked:
+                    row.remove(seq) 
+        self.assertEquals(expected_matrix, actual_matrix)
+        assert_equal(expected_depths, self.matrix.depth_array)
+        self.assertEquals([seq.string for seq in expected_list], sub.flatten_and_filter_matrix(self.matrix.seq_matrix))