Skip to content

Commit

Permalink
Merge pull request #9 from averagehat/master
Browse files Browse the repository at this point in the history
More-random
  • Loading branch information
necrolyte2 committed Mar 31, 2015
2 parents b8a6f9e + a9fea0a commit ba4edec
Show file tree
Hide file tree
Showing 8 changed files with 120 additions and 22 deletions.
17 changes: 14 additions & 3 deletions parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ mkdir -p $outdir
outfile=${bamfile//[\/\.]/_}.minimized.$depth
ngs="/home/AMED/michael.panciera/projects/ngs_mapper/ngs_mapper"
compiled=$outdir/compiled.${outfile}.bam;

MERGE=true
let i=0
for ref in $refs;
do
Expand All @@ -18,13 +18,24 @@ do
(
out=${tmpdir}/${i}; #${outifile}
python subsample_mindepth.py $bamfile $ref --subsample $depth $orphans > $out.sam;
samtools view -hSb $out.sam > $out.bam; #| samtools sort - $out; samtools index $out.bam;
) &
samtools view -hSb $out.sam > $out.bam; #| samtools sort - $out; samtools index $out.bam;
ref=${ref//[\/\|]/_};
if [ ! "$MERGE" = true ]
then
singleout=$outdir/${ref}.minimized.$depth.bam
mv $out.bam $singleout
samtools sort $singleout $singleout;
samtools index $singleout;
python $ngs/bam_to_qualdepth.py $singleout > $singleout.json
python $ngs/graph_qualdepth.py $singleout.json -o $singleout.png
fi
) &
echo "$ref saved to $out.bam";
done;
wait

if [ ! "$MERGE" = true ]; then exit; fi

if [ $i -ge 2 ];
then
echo "merging files."
Expand Down
24 changes: 13 additions & 11 deletions subsample_mindepth.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import subsamplebam
import subprocess as sp
import re
import random

''' Python3 compatibility '''

''' Python3 compatibility '''
from past.builtins import map, xrange, filter

class CommonEqualityMixin(object):
Expand Down Expand Up @@ -48,11 +48,12 @@ class DepthMatrix(CommonEqualityMixin):
Keep track of the picked alignments (seq_matrix) and .depth_array, representing the coverage at each sequence position.
'''

def __init__(self, min_depth, allow_orphans=False):
def __init__(self, min_depth, allow_orphans=False, more_random=False):
self.seq_matrix = [[]]
#self.depth_array = np.zeros(ref_length)
self.min_depth = min_depth
self.allow_orphans = allow_orphans
self.more_random = more_random


def allow(self, seq):
Expand Down Expand Up @@ -85,11 +86,15 @@ def yield_greatest_overlaps(self, under_index, num_needed):
matches = 0
while matches < num_needed and candidate_sequences:
# could instead keep sequences in order sorted by overlap
farthest_overlap_seq = max(candidate_sequences, key=lambda seq: seq.overlap)
candidate_sequences.remove(farthest_overlap_seq)
farthest_overlap_seq.pick()
if self.more_random:
next_seq = random.choice(candidate_sequences)
else:
next_seq = max(candidate_sequences, key=lambda seq: seq.overlap)
candidate_sequences.remove(next_seq)
next_seq.pick()
matches += 1
yield farthest_overlap_seq
yield next_seq

def pickreads(self, pos, needed_depth):
'''
1. get all sequences which could overlap under_index, and which have not already been selected. (how get these, by using samtools view again? or store in memory? only need a UNIQUE identifier and the sequence length)
Expand Down Expand Up @@ -117,7 +122,6 @@ def get_depths(self, reads):
'''
#TODO:Needlessly creates a depth_array of equal size to self.depth_array. Also needlessly slow.
depths = np.zeros(len(self.depth_array))
#import ipdb; ipdb.set_trace()
for seq in reads:
''' Even if a sequence would overlap past the length of depth-array, we don't include that in depth-array.
the numpy arrays must be the same size in order to add them. '''
Expand All @@ -131,7 +135,6 @@ def make_seq_matrix(self, bamfile, regionstr):
:param str regionstr: reference sequence and length as listed in .bam header i.e. <refname>:1-1000
Parse a bam file using sam tools, and store the alignments as a 2d matrix, where each row is a posiiton in the reference sequence.
'''
#import ipdb; ipdb.set_trace()
all_alignments = get_alignments(bamfile, regionstr)
max_pos = max([seq.pos for seq in all_alignments])
max_overlap = max([seq.overlap for seq in all_alignments])
Expand All @@ -145,7 +148,6 @@ def minimize_depths(self):
Trim self.seq_matrix to minimize coverage overflow.
For each position in the reference sequence, pick reads until the minimum depth is met if possible.
'''
#import ipdb; ipdb.set_trace()
for pos, depth in enumerate(self.depth_array):
if depth < self.min_depth:
needed_depth = self.min_depth - depth
Expand Down Expand Up @@ -194,7 +196,7 @@ def main():
sys.stderr.write(str(args)+'\n')
# ref_length = int(args.regionstr.split(':')[-1].split('-')[-1])
# region_str = args.regionstr.split(':')[0]
matrix = DepthMatrix(args.subsample, allow_orphans=args.count_orphans)
matrix = DepthMatrix(args.subsample, allow_orphans=args.count_orphans, more_random=args.more_random)
matrix.make_seq_matrix(args.bamfile, args.refseq)
matrix.minimize_depths()
''' Flatten the matrix '''
Expand Down
7 changes: 6 additions & 1 deletion subsamplebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ def parse_args(wrapper=False):
action='store_true',
help='Allow orphan/unpaired reads.'
)


parser.add_argument(
'-r', '--more-random',
action='store_true',
help='Subsample with more random read-picking at expense of minimizing depth.',
default=False
)

return parser.parse_args()

Expand Down
11 changes: 11 additions & 0 deletions tests/result.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
@HD VN:1.0 SO:coordinate
@SQ SN:gi|110640213|ref|NC_008253.1| LN:4938920
@PG ID:bowtie2 PN:bowtie2 VN:2.1.0
gi|110640213|ref|NC_008253.1|_2_451_1:0:0_1:0:0_12b/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 GCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:45A24 YT:Z:UU
gi|110640213|ref|NC_008253.1|_4_510_1:0:0_2:0:0_114/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 TTT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:26A43 YT:Z:UU
gi|10640213|ref|NC_008253.1|_5_452_2:0:0_1:0:0_1d/1 0 gi|110640213|ref|NC_008253.1| 0 42 2M * 0 0 TT 22 AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:31C23C14 YT:Z:UU
gi|10640213|ref|NC_008253.1|_6_397_0:0:0_0:0:0_2c1/1 0 gi|110640213|ref|NC_008253.1| 0 42 4M * 0 0 TTTC 2222 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_7_500_3:0:0_1:0:0_250/1 0 gi|110640213|ref|NC_008253.1| 1 40 6M * 0 0 TCATTC 222222 AS:i:-9 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:9C27A5A26 YT:Z:UU
gi|10640213|ref|NC_008253.1|_9_557_0:0:0_2:0:0_13d/1 0 gi|110640213|ref|NC_008253.1| 1 42 2M * 0 0 AT 22 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_10_554_1:0:0_1:0:0_2a/1 0 gi|110640213|ref|NC_008253.1| 2 42 6M * 0 0 TTCTGA 222222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:15G54 YT:Z:UU
gi|10640213|ref|NC_008253.1|_11_515_1:0:0_1:0:0_335/1 0 gi|110640213|ref|NC_008253.1| 2 42 3M * 0 0 TCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:10A59 YT:Z:UU
Binary file added tests/simplein.bam
Binary file not shown.
Binary file added tests/simplein.bam.bai
Binary file not shown.
13 changes: 13 additions & 0 deletions tests/simplein.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
@HD VN:1.0 SO:coordinate
@SQ SN:gi|110640213|ref|NC_008253.1| LN:4938920
@PG ID:bowtie2 PN:bowtie2 VN:2.1.0
gi|110640213|ref|NC_008253.1|_2_451_1:0:0_1:0:0_12b/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 GCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:45A24 YT:Z:UU
gi|110640213|ref|NC_008253.1|_4_510_1:0:0_2:0:0_114/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 TTT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:26A43 YT:Z:UU
gi|10640213|ref|NC_008253.1|_5_452_2:0:0_1:0:0_1d/1 0 gi|110640213|ref|NC_008253.1| 0 42 2M * 0 0 TT 22 AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:31C23C14 YT:Z:UU
gi|10640213|ref|NC_008253.1|_6_397_0:0:0_0:0:0_2c1/1 0 gi|110640213|ref|NC_008253.1| 0 42 4M * 0 0 TTTC 2222 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_7_500_3:0:0_1:0:0_250/1 0 gi|110640213|ref|NC_008253.1| 1 40 6M * 0 0 TCATTC 222222 AS:i:-9 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:9C27A5A26 YT:Z:UU
gi|10640213|ref|NC_008253.1|_9_557_0:0:0_2:0:0_13d/1 0 gi|110640213|ref|NC_008253.1| 1 42 2M * 0 0 AT 22 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_10_554_1:0:0_1:0:0_2a/1 0 gi|110640213|ref|NC_008253.1| 2 42 6M * 0 0 TTCTGA 222222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:15G54 YT:Z:UU
gi|10640213|ref|NC_008253.1|_11_515_1:0:0_1:0:0_335/1 0 gi|110640213|ref|NC_008253.1| 2 42 3M * 0 0 TCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:10A59 YT:Z:UU


70 changes: 63 additions & 7 deletions tests/test_mindepth.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,20 @@

THISD = os.path.dirname(os.path.abspath(__file__))

''' Instead of a random alignment, return the smallest alignment to make testing simpler '''
def not_random_subsample(collection):
return min(collection, key=lambda seq: seq.seq_length)

def mock_args(wrapper=False):
bamfile = os.path.join(THISD, 'ecoli.bam')
return Namespace(reflength=1000, subsample=40, count_orphans=True, bamfile=bamfile, refseq="gi|110640213|ref|NC_008253.1|")
return Namespace(reflength=1000, subsample=40, count_orphans=True, bamfile=bamfile, refseq="gi|110640213|ref|NC_008253.1|", more_random=False)


def more_random_mock_args(wrapper=False):
bamfile = os.path.join(THISD, 'simplein.bam')
# empty refseq hack to get around malformed .bam file
return Namespace(reflength=1000, subsample=3, count_orphans=True, bamfile=bamfile, refseq="", more_random=True)



def mock_get_raw_reads( bamfile, regionstr=''):
Expand Down Expand Up @@ -116,7 +127,7 @@ def test_yield_greatest_overlaps_ecoli(self):
def test_pickreads(self):
pass

def test_minimize_depths_ecoli(self):
def test_minimize_depths_complex(self):
self.complexSetUp()
seqs = map(sub.parse_alignment, self.raw_reads)
expected_matrix = [
Expand All @@ -143,16 +154,61 @@ def test_minimize_depths_ecoli(self):



def check_files(self, mock_stdout, expected_file):
samfile = os.path.join(THISD, expected_file)
expected = open(samfile, 'r').read().strip()
result = str(mock_stdout.getvalue().strip().decode('UTF-8'));
if expected_file == 'simplein.sam':
with open('tests/result.sam', 'w') as out: out.write(result)
return expected, result


@mock.patch('subsample_mindepth.random.choice', side_effect=not_random_subsample)
@mock.patch("subsamplebam.parse_args", side_effect=mock_args)
@mock.patch('subsamplebam.sys.stdout', new_callable=BytesIO)
def test_main(self, mock_stdout, func):
def test_main(self, mock_stdout, func, func2):
sub.main()
self.assertEquals(*self.check_files(mock_stdout, 'test40.sam'))


samfile = os.path.join(THISD, 'test40.sam')
expected = open(samfile, 'r').read().strip()
result = str(mock_stdout.getvalue().strip().decode('UTF-8'));
self.assertEquals(expected, result)

''' The order of these mocks is the order they appear as arguments to the test method. '''
# @mock.patch('subsample_mindepth.random.sample', side_effect=not_random_subsample)
# @mock.patch("subsamplebam.parse_args", side_effect=more_random_mock_args)
# @mock.patch('subsamplebam.sys.stdout', new_callable=BytesIO)
# def test_main_more_random(self, mock_stdout, func, func2):
# #Expected will equal the input file becasue all reads are subsampled in this case
# sub.main()
# self.maxDiff = None
# expected, result = self.check_files(mock_stdout, 'simplein.sam')
# self.assertEquals(expected, result)


@mock.patch('subsample_mindepth.random.choice', side_effect=not_random_subsample)
def test_minimize_depths_complex_more_random(self, func):
#TODO: Refactor this so as not to duplicate code with non-random ecoli test
self.complexSetUp()
seqs = map(sub.parse_alignment, self.raw_reads)
expected_matrix = [
[seqs[0], seqs[1], seqs[2], seqs[3]],
[seqs[4], seqs[5]],
[seqs[6], seqs[7]]
]
expected_list = seqs
for seq in expected_list:
seq.pick()
expected_depths = np.array([4, 6, 7, 4, 3, 2, 2, 1])
self.matrix.min_depth = 3
self.matrix.more_random = True
self.matrix.minimize_depths()
actual_matrix = self.matrix.seq_matrix
for row in actual_matrix:
for seq in row:
if not seq.picked:
row.remove(seq)
self.assertEquals(expected_matrix, actual_matrix)
assert_equal(expected_depths, self.matrix.depth_array)
self.assertEquals([seq.string for seq in expected_list], sub.flatten_and_filter_matrix(self.matrix.seq_matrix))



Expand Down

0 comments on commit ba4edec

Please sign in to comment.