From fe458c86f57dc524223d11d6ba6fd88c5e998b2c Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Mon, 29 Apr 2024 15:57:36 -0400 Subject: [PATCH] numcpus option added; new bash script to download and format --- bin/downloadKalamari.pl | 55 ++++++++++++++++++++++++++++++++++++++--- bin/downloadKalamari.sh | 11 +++++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index b623cd3..c87f8ba 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -6,6 +6,9 @@ use File::Path qw/make_path/; use File::Copy qw/mv/; use Data::Dumper qw/Dumper/; +use POSIX qw/ceil/; + +use threads; local $0 = basename $0; sub logmsg{ print STDERR "$0: @_\n";} @@ -46,6 +49,8 @@ sub downloadKalamari{ my $header = <$fh>; chomp($header); my @header = split /\t/, $header; + + my @queue; while(<$fh>){ chomp; my %F; @@ -56,14 +61,39 @@ sub downloadKalamari{ next; } - my $fasta = downloadEntry(\%F, $settings); - $download_counter++; + push(@queue, \%F); } close $fh; + my @thr; + my $numPerThread = ceil(scalar(@queue)/$$settings{numcpus}); + for(my $i=0;$i<$$settings{numcpus};$i++){ + my @subQueue = splice(@queue, 0, $numPerThread); + logmsg "Sending ".scalar(@subQueue)." entries to thread $i"; + $thr[$i] = threads->new(\&downloadEntryWorker, \@subQueue, $settings); + } + + # Close out the threads + for(my $i=0;$i<@thr;$i++){ + logmsg "Joining thread $i"; + my $fastas = $thr[$i]->join; + } + return $download_counter; } +sub downloadEntryWorker{ + my($queue, $settings) = @_; + + my @fasta = (); + for my $fields(@$queue){ + my $fasta = downloadEntry($fields, $settings); + push(@fasta, $fasta); + } + + return \@fasta; +} + sub downloadEntry{ my($fields,$settings) = @_; logmsg "Downloading $$fields{scientificName}:$$fields{nuccoreAcc}"; @@ -79,7 +109,7 @@ sub downloadEntry{ # Get the esearch xml in place for at least one downstream query my $esearchXml = "$dir/$acc.esearch.xml"; if(! -e $esearchXml){ - system("esearch -db nuccore -query '$acc' > $esearchXml.tmp"); + command("esearch -db nuccore -query '$acc' > $esearchXml.tmp"); if($?){ die "ERROR running esearch: $!"; } @@ -279,6 +309,25 @@ sub which{ return ""; } +sub command{ + my($command) = @_; + + my $maxTries = 3; + my $numTries = 0; + do{{ + system($command); + + my $exit_code = $? >> 8; + if($exit_code){ + logmsg "ERROR on command (numTries: $numTries):\n $command"; + sleep 1; + } else { + last; + } + }} while($numTries++ < $maxTries); + +} + sub usage{ print "Usage: $0 [options] spreadsheet.tsv diff --git a/bin/downloadKalamari.sh b/bin/downloadKalamari.sh index 4c03b8b..e59d2d0 100644 --- a/bin/downloadKalamari.sh +++ b/bin/downloadKalamari.sh @@ -1,6 +1,14 @@ #!/bin/bash set -e + +if [[ "$1" =~ -h ]]; then + echo "Usage: $0 " + echo " Downloads the standard chromosomes and plasmids for Kalamari" + echo " from source and formats the kraken1 and kraken2 databases" + exit 0 +fi + set -u thisdir=$(dirname $0) @@ -35,6 +43,8 @@ function build_kraken1(){ kraken-build --db $DB --build --threads 1 kraken-build --db $DB --clean du -shc $DB + + echo "DONE. Set KRAKEN_DEFAULT_DB=$(realpath $DB)" } function build_kraken2(){ @@ -49,6 +59,7 @@ function build_kraken2(){ kraken2-build --db $DB --build --threads 1 kraken2-build --db $DB --clean du -shc $DB + echo "DONE. Set KRAKEN2_DEFAULT_DB=$(realpath $DB)" } perl $thisdir/downloadKalamari.pl $TSV \