-
Notifications
You must be signed in to change notification settings - Fork 28
/
filter_sequences_by_length.pl
executable file
·111 lines (90 loc) · 3.43 KB
/
filter_sequences_by_length.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/perl
#######################################################################################
### ###
### Copyright (C) 2017 Pawel Krawczyk ([email protected]) ###
### ###
### This program is free software: you can redistribute it and/or modify ###
### it under the terms of the GNU General Public License as published by ###
### the Free Software Foundation, either version 3 of the License, or ###
### (at your option) any later version. ###
### ###
### This program is distributed in the hope that it will be useful, ###
### but WITHOUT ANY WARRANTY; without even the implied warranty of ###
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ###
### GNU General Public License for more details. ###
### ###
### You should have received a copy of the GNU General Public License ###
### along with this program. If not, see <http://www.gnu.org/licenses/>. ###
### ###
#######################################################################################
use strict;
use Bio::SeqIO;
use Bio::SearchIO;
use Getopt::Long;
my $input_file;
my $output_file;
my $length_threshold;
# opis uzycia programu
my $USAGE =
"Filters sequences in multifasta file by length\n\nUsage of program:\n\n $0 [options] \n\t-input\t\tmultifasta file with sequences to be filtered\n\t-output\t\toutput file for filtered sequences \n\t-thresh\t\tsequence length threshold [bp]\n";
#pobranie zmiennych z linii polecen
unless (@ARGV) {
print $USAGE;
exit;
}
GetOptions(
'input=s' => \$input_file,
'thresh=s' => \$length_threshold,
'output=s' => \$output_file
);
if ( $input_file eq '' ) {
die "no input file specified\n";
}
if ( $output_file eq '' ) {
die "no output file specified\n";
}
if ( $length_threshold eq '' ) {
die "no length threshold provided\n";
}
my $inseq;
#properly treat gzipped files
if ( $input_file =~ /\.gz$/ ) {
$inseq = Bio::SeqIO->new(
-file => "gunzip -c $input_file |",
-format => 'fasta',
);
}
else {
$inseq = Bio::SeqIO->new(
-file => "<$input_file",
-format => 'fasta',
);
}
#define output fasta file
my $out_file = Bio::SeqIO->new(
-file => ">$output_file",
-format => 'fasta',
);
#counter variables:
my $no_seq = 0;
my $missed_seqs = 0;
my $seqLength;
#iterate over sequences in the input file
while ( my $seq = $inseq->next_seq ) {
$seqLength = $seq->length;
if ( $seqLength >= $length_threshold ) {
$out_file->write_seq($seq);
}
else {
# print "Sequence $seqID of length $seqLength will not be included\n";
$missed_seqs++;
}
$no_seq++;
}
#remaining sequences
my $remaining = $no_seq - $missed_seqs;
print
"$missed_seqs out of $no_seq sequences from $input_file were not included in the dataset $output_file.
Remaining $remaining seqeunces will be used for further processing steps
";
exit;