forked from vikas0633/perl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BPbtab.pl
174 lines (107 loc) · 4.39 KB
/
BPbtab.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env perl
=head1 NAME
BPbtab
script parses WU-BLAST or NCBI-BLAST output files into BTAB format where each HSP is reported as a single line with tab-delimited fields.
The original BTAB program is described here:
Dubnick M. (1992) Btab--a Blast output parser. Comput Appl Biosci 8(6):601-2
A Perl version which emulates the functionality of btab is provided here by BPbtab. BPbtab relies exclusively on the BioPerl 1.4 Bio::SearchIO functionality.
=cut
=head1 USAGE
Standard input is parsed and written to standard output.
BPbtab < blast.output > blast.output.btab
=cut
use strict;
use Bio::SearchIO;
my $in = new Bio::SearchIO(-format => 'blast',
-fh => \*STDIN);
# parse each blast record:
while( my $result = $in->next_result ) {
# parse each hit per record.
while( my $hit = $result->next_hit ) {
# a hit consists of one or more HSPs
while( my $hsp = $hit->next_hsp ) {
eval {
&process_blast_match ($result, $hit, $hsp);
};
if ($@) {
print STDERR "Error, couldn't process blast entry.\n";
}
}
}
}
exit(0);
####
sub process_blast_match {
my ($result, $hit, $hsp) = @_;
my @x;
$x[0] = $result->query_name();
# date
$x[2] = $result->query_length();
$x[3] = $hsp->algorithm();
$x[4] = $result->database_name();
$x[5] = $hit->name();
$x[6] = $hsp->start('query');
$x[7] = $hsp->end('query');
my $queryStrand = $hsp->strand('query');
if ($queryStrand == -1) {
($x[6], $x[7]) = ($x[7], $x[6]);
}
$x[8] = $hsp->start('hit');
$x[9] = $hsp->end('hit');
my $hitStrand = $hsp->strand('hit');
if ($hitStrand == -1) {
($x[8], $x[9]) = ($x[9], $x[8]);
}
$x[10] = sprintf ("%.1f", $hsp->percent_identity());
my $similarity = $hsp->frac_conserved('total') * 100;
$x[11] = sprintf("%.1f", $similarity);
$x[12] = $hsp->score();
$x[13] = $hsp->bits();
$x[15] = $hit->description();
$x[16] = ( ($hsp->query->frame + 1) * $hsp->query->strand); #blast frame (1, 2, 3, -1, -2, -3).
my $strandDescript = "null";
if ($queryStrand == 1) {
$strandDescript = "Plus";
} elsif ($queryStrand == -1) {
$strandDescript = "Minus";
}
$x[17] = $strandDescript;
$x[18] = $hit->length();
$x[19] = $hsp->evalue();
$x[20] = $hsp->pvalue();
my $outline = join ("\t", @x);
print "$outline\n";
}
=head1 DESCRIPTION
Fields
The parsed BLAST output is presented as single line HSP descriptions, with tab-delimited fields in the following order:
[0] Query Sequence Name
[2] Query Sequence Length
[3] Search Method -- Blast family application name
[4] Database Name
[5] Subject Sequence Name -- Database entry name
[6],[7] Query Left End, Query Right End -- The endpoints of the part
of the query sequence which Blast aligns with the subject sequence.
[8],[9] Subject Left End, Subject Right End -- The endpoints of the part
of the subject sequence which Blast aligns with the query sequence.
[10] Percent Identity -- The fraction of residues which are absolute
matches between the query and subject sequence, expressed in
percent.
[11] Percent Similarity -- The fraction of residues which are exact or
similar matches between the query and subject sequence, expressed in
percent.
[12] HSP score
[13] Bits score
[15] Description -- A freeform text field which contains the biological
description field from the database for the subject sequence. If
this text occupies more than one line in the Blast output file, the
NewLines are replaced by spaces. Commas may occur in this field even
if they are the field separator character, because this is the last
field in the record.
[16] Query Frame (1, 2, 3, -1, -2, -3)
[17] Query Strand -- Plus, Minus or null
[18] DB sequence length
[19] Expect -- expected value
[20] P-Value -- Poisson ratio
** Note ** Intervening field positions which are not described are not currently supported. These remain to support compatibility with other existing btab implementations.
=cut