Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More-random #9

Merged
merged 5 commits into from
Mar 31, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion htslib
Submodule htslib deleted from 4b4349
17 changes: 14 additions & 3 deletions parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ mkdir -p $outdir
outfile=${bamfile//[\/\.]/_}.minimized.$depth
ngs="/home/AMED/michael.panciera/projects/ngs_mapper/ngs_mapper"
compiled=$outdir/compiled.${outfile}.bam;

MERGE=true
let i=0
for ref in $refs;
do
Expand All @@ -18,13 +18,24 @@ do
(
out=${tmpdir}/${i}; #${outifile}
python subsample_mindepth.py $bamfile $ref --subsample $depth $orphans > $out.sam;
samtools view -hSb $out.sam > $out.bam; #| samtools sort - $out; samtools index $out.bam;
) &
samtools view -hSb $out.sam > $out.bam; #| samtools sort - $out; samtools index $out.bam;
ref=${ref//[\/\|]/_};
if [ ! "$MERGE" = true ]
then
singleout=$outdir/${ref}.minimized.$depth.bam
mv $out.bam $singleout
samtools sort $singleout $singleout;
samtools index $singleout;
python $ngs/bam_to_qualdepth.py $singleout > $singleout.json
python $ngs/graph_qualdepth.py $singleout.json -o $singleout.png
fi
) &
echo "$ref saved to $out.bam";
done;
wait

if [ ! "$MERGE" = true ]; then exit; fi

if [ $i -ge 2 ];
then
echo "merging files."
Expand Down
24 changes: 13 additions & 11 deletions subsample_mindepth.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import subsamplebam
import subprocess as sp
import re
import random

''' Python3 compatibility '''

''' Python3 compatibility '''
from past.builtins import map, xrange, filter

class CommonEqualityMixin(object):
Expand Down Expand Up @@ -48,11 +48,12 @@ class DepthMatrix(CommonEqualityMixin):
Keep track of the picked alignments (seq_matrix) and .depth_array, representing the coverage at each sequence position.
'''

def __init__(self, min_depth, allow_orphans=False):
def __init__(self, min_depth, allow_orphans=False, more_random=False):
self.seq_matrix = [[]]
#self.depth_array = np.zeros(ref_length)
self.min_depth = min_depth
self.allow_orphans = allow_orphans
self.more_random = more_random


def allow(self, seq):
Expand Down Expand Up @@ -85,11 +86,15 @@ def yield_greatest_overlaps(self, under_index, num_needed):
matches = 0
while matches < num_needed and candidate_sequences:
# could instead keep sequences in order sorted by overlap
farthest_overlap_seq = max(candidate_sequences, key=lambda seq: seq.overlap)
candidate_sequences.remove(farthest_overlap_seq)
farthest_overlap_seq.pick()
if self.more_random:
next_seq = random.choice(candidate_sequences)
else:
next_seq = max(candidate_sequences, key=lambda seq: seq.overlap)
candidate_sequences.remove(next_seq)
next_seq.pick()
matches += 1
yield farthest_overlap_seq
yield next_seq

def pickreads(self, pos, needed_depth):
'''
1. get all sequences which could overlap under_index, and which have not already been selected. (how get these, by using samtools view again? or store in memory? only need a UNIQUE identifier and the sequence length)
Expand Down Expand Up @@ -117,7 +122,6 @@ def get_depths(self, reads):
'''
#TODO:Needlessly creates a depth_array of equal size to self.depth_array. Also needlessly slow.
depths = np.zeros(len(self.depth_array))
#import ipdb; ipdb.set_trace()
for seq in reads:
''' Even if a sequence would overlap past the length of depth-array, we don't include that in depth-array.
the numpy arrays must be the same size in order to add them. '''
Expand All @@ -131,7 +135,6 @@ def make_seq_matrix(self, bamfile, regionstr):
:param str regionstr: reference sequence and length as listed in .bam header i.e. <refname>:1-1000
Parse a bam file using sam tools, and store the alignments as a 2d matrix, where each row is a posiiton in the reference sequence.
'''
#import ipdb; ipdb.set_trace()
all_alignments = get_alignments(bamfile, regionstr)
max_pos = max([seq.pos for seq in all_alignments])
max_overlap = max([seq.overlap for seq in all_alignments])
Expand All @@ -145,7 +148,6 @@ def minimize_depths(self):
Trim self.seq_matrix to minimize coverage overflow.
For each position in the reference sequence, pick reads until the minimum depth is met if possible.
'''
#import ipdb; ipdb.set_trace()
for pos, depth in enumerate(self.depth_array):
if depth < self.min_depth:
needed_depth = self.min_depth - depth
Expand Down Expand Up @@ -194,7 +196,7 @@ def main():
sys.stderr.write(str(args)+'\n')
# ref_length = int(args.regionstr.split(':')[-1].split('-')[-1])
# region_str = args.regionstr.split(':')[0]
matrix = DepthMatrix(args.subsample, allow_orphans=args.count_orphans)
matrix = DepthMatrix(args.subsample, allow_orphans=args.count_orphans, more_random=args.more_random)
matrix.make_seq_matrix(args.bamfile, args.refseq)
matrix.minimize_depths()
''' Flatten the matrix '''
Expand Down
7 changes: 6 additions & 1 deletion subsamplebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ def parse_args(wrapper=False):
action='store_true',
help='Allow orphan/unpaired reads.'
)


parser.add_argument(
'-r', '--more-random',
action='store_true',
help='Subsample with more random read-picking at expense of minimizing depth.',
default=False
)

return parser.parse_args()

Expand Down
11 changes: 11 additions & 0 deletions tests/result.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
@HD VN:1.0 SO:coordinate
@SQ SN:gi|110640213|ref|NC_008253.1| LN:4938920
@PG ID:bowtie2 PN:bowtie2 VN:2.1.0
gi|110640213|ref|NC_008253.1|_2_451_1:0:0_1:0:0_12b/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 GCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:45A24 YT:Z:UU
gi|110640213|ref|NC_008253.1|_4_510_1:0:0_2:0:0_114/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 TTT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:26A43 YT:Z:UU
gi|10640213|ref|NC_008253.1|_5_452_2:0:0_1:0:0_1d/1 0 gi|110640213|ref|NC_008253.1| 0 42 2M * 0 0 TT 22 AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:31C23C14 YT:Z:UU
gi|10640213|ref|NC_008253.1|_6_397_0:0:0_0:0:0_2c1/1 0 gi|110640213|ref|NC_008253.1| 0 42 4M * 0 0 TTTC 2222 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_7_500_3:0:0_1:0:0_250/1 0 gi|110640213|ref|NC_008253.1| 1 40 6M * 0 0 TCATTC 222222 AS:i:-9 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:9C27A5A26 YT:Z:UU
gi|10640213|ref|NC_008253.1|_9_557_0:0:0_2:0:0_13d/1 0 gi|110640213|ref|NC_008253.1| 1 42 2M * 0 0 AT 22 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_10_554_1:0:0_1:0:0_2a/1 0 gi|110640213|ref|NC_008253.1| 2 42 6M * 0 0 TTCTGA 222222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:15G54 YT:Z:UU
gi|10640213|ref|NC_008253.1|_11_515_1:0:0_1:0:0_335/1 0 gi|110640213|ref|NC_008253.1| 2 42 3M * 0 0 TCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:10A59 YT:Z:UU
Binary file added tests/simplein.bam
Binary file not shown.
Binary file added tests/simplein.bam.bai
Binary file not shown.
13 changes: 13 additions & 0 deletions tests/simplein.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
@HD VN:1.0 SO:coordinate
@SQ SN:gi|110640213|ref|NC_008253.1| LN:4938920
@PG ID:bowtie2 PN:bowtie2 VN:2.1.0
gi|110640213|ref|NC_008253.1|_2_451_1:0:0_1:0:0_12b/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 GCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:45A24 YT:Z:UU
gi|110640213|ref|NC_008253.1|_4_510_1:0:0_2:0:0_114/1 0 gi|110640213|ref|NC_008253.1| 0 42 3M * 0 0 TTT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:26A43 YT:Z:UU
gi|10640213|ref|NC_008253.1|_5_452_2:0:0_1:0:0_1d/1 0 gi|110640213|ref|NC_008253.1| 0 42 2M * 0 0 TT 22 AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:31C23C14 YT:Z:UU
gi|10640213|ref|NC_008253.1|_6_397_0:0:0_0:0:0_2c1/1 0 gi|110640213|ref|NC_008253.1| 0 42 4M * 0 0 TTTC 2222 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_7_500_3:0:0_1:0:0_250/1 0 gi|110640213|ref|NC_008253.1| 1 40 6M * 0 0 TCATTC 222222 AS:i:-9 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:9C27A5A26 YT:Z:UU
gi|10640213|ref|NC_008253.1|_9_557_0:0:0_2:0:0_13d/1 0 gi|110640213|ref|NC_008253.1| 1 42 2M * 0 0 AT 22 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UU
gi|10640213|ref|NC_008253.1|_10_554_1:0:0_1:0:0_2a/1 0 gi|110640213|ref|NC_008253.1| 2 42 6M * 0 0 TTCTGA 222222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:15G54 YT:Z:UU
gi|10640213|ref|NC_008253.1|_11_515_1:0:0_1:0:0_335/1 0 gi|110640213|ref|NC_008253.1| 2 42 3M * 0 0 TCT 222 AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:10A59 YT:Z:UU


70 changes: 63 additions & 7 deletions tests/test_mindepth.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,20 @@

THISD = os.path.dirname(os.path.abspath(__file__))

''' Instead of a random alignment, return the smallest alignment to make testing simpler '''
def not_random_subsample(collection):
return min(collection, key=lambda seq: seq.seq_length)

def mock_args(wrapper=False):
bamfile = os.path.join(THISD, 'ecoli.bam')
return Namespace(reflength=1000, subsample=40, count_orphans=True, bamfile=bamfile, refseq="gi|110640213|ref|NC_008253.1|")
return Namespace(reflength=1000, subsample=40, count_orphans=True, bamfile=bamfile, refseq="gi|110640213|ref|NC_008253.1|", more_random=False)


def more_random_mock_args(wrapper=False):
bamfile = os.path.join(THISD, 'simplein.bam')
# empty refseq hack to get around malformed .bam file
return Namespace(reflength=1000, subsample=3, count_orphans=True, bamfile=bamfile, refseq="", more_random=True)



def mock_get_raw_reads( bamfile, regionstr=''):
Expand Down Expand Up @@ -116,7 +127,7 @@ def test_yield_greatest_overlaps_ecoli(self):
def test_pickreads(self):
pass

def test_minimize_depths_ecoli(self):
def test_minimize_depths_complex(self):
self.complexSetUp()
seqs = map(sub.parse_alignment, self.raw_reads)
expected_matrix = [
Expand All @@ -143,16 +154,61 @@ def test_minimize_depths_ecoli(self):



def check_files(self, mock_stdout, expected_file):
samfile = os.path.join(THISD, expected_file)
expected = open(samfile, 'r').read().strip()
result = str(mock_stdout.getvalue().strip().decode('UTF-8'));
if expected_file == 'simplein.sam':
with open('tests/result.sam', 'w') as out: out.write(result)
return expected, result


@mock.patch('subsample_mindepth.random.choice', side_effect=not_random_subsample)
@mock.patch("subsamplebam.parse_args", side_effect=mock_args)
@mock.patch('subsamplebam.sys.stdout', new_callable=BytesIO)
def test_main(self, mock_stdout, func):
def test_main(self, mock_stdout, func, func2):
sub.main()
self.assertEquals(*self.check_files(mock_stdout, 'test40.sam'))


samfile = os.path.join(THISD, 'test40.sam')
expected = open(samfile, 'r').read().strip()
result = str(mock_stdout.getvalue().strip().decode('UTF-8'));
self.assertEquals(expected, result)

''' The order of these mocks is the order they appear as arguments to the test method. '''
# @mock.patch('subsample_mindepth.random.sample', side_effect=not_random_subsample)
# @mock.patch("subsamplebam.parse_args", side_effect=more_random_mock_args)
# @mock.patch('subsamplebam.sys.stdout', new_callable=BytesIO)
# def test_main_more_random(self, mock_stdout, func, func2):
# #Expected will equal the input file becasue all reads are subsampled in this case
# sub.main()
# self.maxDiff = None
# expected, result = self.check_files(mock_stdout, 'simplein.sam')
# self.assertEquals(expected, result)


@mock.patch('subsample_mindepth.random.choice', side_effect=not_random_subsample)
def test_minimize_depths_complex_more_random(self, func):
#TODO: Refactor this so as not to duplicate code with non-random ecoli test
self.complexSetUp()
seqs = map(sub.parse_alignment, self.raw_reads)
expected_matrix = [
[seqs[0], seqs[1], seqs[2], seqs[3]],
[seqs[4], seqs[5]],
[seqs[6], seqs[7]]
]
expected_list = seqs
for seq in expected_list:
seq.pick()
expected_depths = np.array([4, 6, 7, 4, 3, 2, 2, 1])
self.matrix.min_depth = 3
self.matrix.more_random = True
self.matrix.minimize_depths()
actual_matrix = self.matrix.seq_matrix
for row in actual_matrix:
for seq in row:
if not seq.picked:
row.remove(seq)
self.assertEquals(expected_matrix, actual_matrix)
assert_equal(expected_depths, self.matrix.depth_array)
self.assertEquals([seq.string for seq in expected_list], sub.flatten_and_filter_matrix(self.matrix.seq_matrix))



Expand Down