get_full_length_NM_seqs.pl

#!/usr/bin/env perl

# FUTURE CHANGES:
#	Ultimately, I should make this a more generalized script that takes as input an original sequencing results file (FASTQ, SCARF formats) and 
#	a file containing a list of headers (without the ">"), and returns either all entries from the results file that match the headers ('include'
#	mode) or all entries from the results file that don't match the headers ('exclude' mode)	

# Take as input the "no match" FASTA file produced by the COPRO-Seq pipeline, as well as the
#	sequencing results file from which it was derived (SCARF or FASTQ format), and return the full-length sequences for use in a 
#	BLAST search
# This script is necessary because the trimmed Illumina reads are usually too short to give
#	significant hits against large databases like nt

# Note: the headers for the files generated by this script may not exactly match those in your original sequencing results
#	because in some cases sequencing platforms include whitespace in FASTA headers which the COPRO-Seq pipeline replaces
#	with underscores (whitespace is undesirable because it breaks many third-party scripts/programs that read in FASTAs).

# IN PROCESS OF UPDATING THIS SCRIPT TO ACCOMMODATE FASTQ FILES -- DON'T USE IN CURRENT FORM UNTIL UPDATES ARE COMPLETE

# Usage: perl get_full_length_NM_seqs.pl [FASTA of trimmed NM seqs] [original SCARF/FASTQ sequencing results] [output file] [barcode length] [sequence file format ('fastq' or 'scarf')]

use strict;
use warnings;

my $seq_format;

if (defined $ARGV[4]) {
	if(($ARGV[4] eq 'fastq') || ($ARGV[4] eq 'scarf')) { $seq_format = 'fastq'; }
	else { die "Improper sequence file format specified at startup (options are 'scarf' and 'fastq')\n"; }
}
else {
	$seq_format = 'fastq';
}

if ($seq_format eq 'fastq') {
	my %NMheaders;	# Will store a unique list of all sequence headers from the NM FASTA file (the headers you'll want to grab in original FASTQ)
	# Cycle through input NM FASTA file to get all headers that need to be searched for in original FASTQ file
	open(NM, $ARGV[0]) || die "Can't open input (NM) FASTA file $ARGV[0]!\n";
	print "Opening FASTA of non-matching (NM) sequences for the retrieval of headers...\n";
	while(<NM>) {
		chomp;
		if (/^>(.+)/) {
			$NMheaders{$1} = 1;
		}
	}
	print "Done grabbing headers.\n";
	close NM;
	open(FASTQ, $ARGV[1]) || die "Can't open input (original sequences) FASTQ file $ARGV[1]!\n";
	open(OUTPUT, ">$ARGV[2]") || die "Can't open output file $ARGV[2]!\n";
	# Check the first line and every four lines thereafter (FASTQ entries comprise four lines)
	#	If you find a header that was in your NM file (using the hash as a lookup), report the header
	#		and 2nd line (sequence) after trimming barcode to your output file, then ignore 3rd and 4th lines
	#	If you find a header that wasn't in your NM file, just ignore the next three lines without doing anything
	my $trackprogress = 0;
	while (<FASTQ>) {
		$trackprogress++;
		if ($trackprogress % 10000 == 0) { print "Done searching $trackprogress entries in your original sequencing results...\n"; }
		if ($_ =~ /^@(.+)/) {
 			my $h = $1;
 			$h =~ s/\s/_/;	# .NM FASTAs reported by COPRO-Seq pipeline have had all whitespace in original results converted to underscores
			if (defined $NMheaders{$h}) {
 				my $untrimmedseq = <FASTQ>;
 				print OUTPUT ">$h\n";
 				print OUTPUT substr($untrimmedseq, $ARGV[3]);
				<FASTQ>;
				<FASTQ>;
 			}
 			else {
 				# Ignore next three lines 
 				<FASTQ>;
 				<FASTQ>;
				<FASTQ>;
 			}
 		}
 		else { die	"There seems to be a problem with the format of your FASTQ file or with this script's efforts to parse it.\n"; }
	}
	close OUTPUT;
	close FASTQ;
}

elsif ($seq_format eq 'scarf') {
	# Get list of relevant headers that can be passed to grep via the -f option
	open(NM, $ARGV[0]) || die "Can't open input (NM) FASTA file $ARGV[0]!\n";
	open(PATTERNS, ">patterns.txt") || die "Can't open output file patterns.txt!\n";
	while(<NM>) {
		chomp;
		my $a = $_;
		$a =~ s/>//;
		if(/^>/) {
			print PATTERNS $a."\n";
		}
	}
	close NM;
	close PATTERNS;
	# Grab all relevant lines from original SCARF file and pass to temporary text file
	system("grep -F -f patterns.txt $ARGV[1] > tmp.txt");
	
	# Cycle through each line of temporary text file, trimming barcode from sequence
	open(TMP, "tmp.txt") || die "Can't open temporary file tmp.txt!\n";
	open(OUTPUT, ">$ARGV[2]") || die "Can't open output file $ARGV[2]!\n";
	while(<TMP>) {
		my @line = split(/\t/, $_);
		my $seq = substr($line[2], $ARGV[3]);
		print OUTPUT "$line[0]\n$seq\n";
	}
	close TMP;
	close OUTPUT;
}

exit;