-
Notifications
You must be signed in to change notification settings - Fork 15
/
sam2LongestBase.pl
executable file
·64 lines (59 loc) · 1.46 KB
/
sam2LongestBase.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/perl
## sam2LongestBase -- extracts a random sequence from the longest sequences at each base
my $pos = -1;
my $seqName = "";
my $bestFlags = "";
my $bestID = "";
my $bestSeq = "";
my $bestQual = "";
my $bestLine = "";
my $seenCount = 0;
my $output = "sam"; # can be "sam" or "fastq"
sub printSeq {
my ($id, $seq, $qual) = @_;
if($id){
printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual);
}
}
while(<>){
if(/^@/){
print;
next;
}
my $line = $_;
chomp;
my @F = split(/\t/);
if(($F[2] ne $seqName) || ($F[3] != $pos) || (length($bestSeq) <= length($F[9]))){
if(length($bestSeq) == length($F[9])){
## reservoir sampling with a reservoir size of 1
## See https://en.wikipedia.org/wiki/Reservoir_sampling
## * with probability 1/i, keep the new item instead of the current item
$seenCount++;
if(!rand($seenCount)){
## i.e. if rand($seenCount) == 0, then continue with replacement
next;
}
} else {
$seenCount = 1;
}
if(($F[2] ne $seqName) || ($F[3] != $pos)){
if($output eq "fastq"){
printSeq($bestID, $bestSeq, $bestQual);
} elsif($output eq "sam"){
print($bestLine);
}
}
$seqName = $F[2];
$pos = $F[3];
$bestLine = $line;
$bestID = $F[0];
$bestFlags = $F[1];
$bestSeq = $F[9];
$bestQual = $F[10];
}
}
if($output eq "fastq"){
printSeq($bestID, $bestSeq, $bestQual);
} elsif($output eq "sam"){
print($bestLine);
}