-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathget_full_length_NM_seqs.pl
executable file
·110 lines (98 loc) · 4.49 KB
/
get_full_length_NM_seqs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env perl
# FUTURE CHANGES:
# Ultimately, I should make this a more generalized script that takes as input an original sequencing results file (FASTQ, SCARF formats) and
# a file containing a list of headers (without the ">"), and returns either all entries from the results file that match the headers ('include'
# mode) or all entries from the results file that don't match the headers ('exclude' mode)
# Take as input the "no match" FASTA file produced by the COPRO-Seq pipeline, as well as the
# sequencing results file from which it was derived (SCARF or FASTQ format), and return the full-length sequences for use in a
# BLAST search
# This script is necessary because the trimmed Illumina reads are usually too short to give
# significant hits against large databases like nt
# Note: the headers for the files generated by this script may not exactly match those in your original sequencing results
# because in some cases sequencing platforms include whitespace in FASTA headers which the COPRO-Seq pipeline replaces
# with underscores (whitespace is undesirable because it breaks many third-party scripts/programs that read in FASTAs).
# IN PROCESS OF UPDATING THIS SCRIPT TO ACCOMMODATE FASTQ FILES -- DON'T USE IN CURRENT FORM UNTIL UPDATES ARE COMPLETE
# Usage: perl get_full_length_NM_seqs.pl [FASTA of trimmed NM seqs] [original SCARF/FASTQ sequencing results] [output file] [barcode length] [sequence file format ('fastq' or 'scarf')]
use strict;
use warnings;
my $seq_format;
if (defined $ARGV[4]) {
if(($ARGV[4] eq 'fastq') || ($ARGV[4] eq 'scarf')) { $seq_format = 'fastq'; }
else { die "Improper sequence file format specified at startup (options are 'scarf' and 'fastq')\n"; }
}
else {
$seq_format = 'fastq';
}
if ($seq_format eq 'fastq') {
my %NMheaders; # Will store a unique list of all sequence headers from the NM FASTA file (the headers you'll want to grab in original FASTQ)
# Cycle through input NM FASTA file to get all headers that need to be searched for in original FASTQ file
open(NM, $ARGV[0]) || die "Can't open input (NM) FASTA file $ARGV[0]!\n";
print "Opening FASTA of non-matching (NM) sequences for the retrieval of headers...\n";
while(<NM>) {
chomp;
if (/^>(.+)/) {
$NMheaders{$1} = 1;
}
}
print "Done grabbing headers.\n";
close NM;
open(FASTQ, $ARGV[1]) || die "Can't open input (original sequences) FASTQ file $ARGV[1]!\n";
open(OUTPUT, ">$ARGV[2]") || die "Can't open output file $ARGV[2]!\n";
# Check the first line and every four lines thereafter (FASTQ entries comprise four lines)
# If you find a header that was in your NM file (using the hash as a lookup), report the header
# and 2nd line (sequence) after trimming barcode to your output file, then ignore 3rd and 4th lines
# If you find a header that wasn't in your NM file, just ignore the next three lines without doing anything
my $trackprogress = 0;
while (<FASTQ>) {
$trackprogress++;
if ($trackprogress % 10000 == 0) { print "Done searching $trackprogress entries in your original sequencing results...\n"; }
if ($_ =~ /^@(.+)/) {
my $h = $1;
$h =~ s/\s/_/; # .NM FASTAs reported by COPRO-Seq pipeline have had all whitespace in original results converted to underscores
if (defined $NMheaders{$h}) {
my $untrimmedseq = <FASTQ>;
print OUTPUT ">$h\n";
print OUTPUT substr($untrimmedseq, $ARGV[3]);
<FASTQ>;
<FASTQ>;
}
else {
# Ignore next three lines
<FASTQ>;
<FASTQ>;
<FASTQ>;
}
}
else { die "There seems to be a problem with the format of your FASTQ file or with this script's efforts to parse it.\n"; }
}
close OUTPUT;
close FASTQ;
}
elsif ($seq_format eq 'scarf') {
# Get list of relevant headers that can be passed to grep via the -f option
open(NM, $ARGV[0]) || die "Can't open input (NM) FASTA file $ARGV[0]!\n";
open(PATTERNS, ">patterns.txt") || die "Can't open output file patterns.txt!\n";
while(<NM>) {
chomp;
my $a = $_;
$a =~ s/>//;
if(/^>/) {
print PATTERNS $a."\n";
}
}
close NM;
close PATTERNS;
# Grab all relevant lines from original SCARF file and pass to temporary text file
system("grep -F -f patterns.txt $ARGV[1] > tmp.txt");
# Cycle through each line of temporary text file, trimming barcode from sequence
open(TMP, "tmp.txt") || die "Can't open temporary file tmp.txt!\n";
open(OUTPUT, ">$ARGV[2]") || die "Can't open output file $ARGV[2]!\n";
while(<TMP>) {
my @line = split(/\t/, $_);
my $seq = substr($line[2], $ARGV[3]);
print OUTPUT "$line[0]\n$seq\n";
}
close TMP;
close OUTPUT;
}
exit;