Skip to content

Commit

Permalink
vast speed increase with batch downloads; cleaned up chromosomes.tsv
Browse files Browse the repository at this point in the history
  • Loading branch information
lskatz committed May 3, 2024
1 parent 8c7403d commit 45bf89d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 23 deletions.
44 changes: 28 additions & 16 deletions bin/downloadKalamari.pl
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@
use threads;

local $0 = basename $0;
sub logmsg{ print STDERR "$0: @_\n";}
# Write a log line to STDERR, prefixed with the program name and the ID of
# the calling thread. Arguments are joined with the list separator (a single
# space by default) and a trailing newline is appended.
sub logmsg{
  my $tid = threads->tid();
  print STDERR "$0(TID$tid): @_\n";
}

exit main();

sub main{
my $settings={};
GetOptions($settings,qw(numcpus=i tempdir=s and=s@ outdir=s help)) or die $!;
GetOptions($settings,qw(numcpus=i buffersize|buffer-size=i tempdir=s and=s@ outdir=s help)) or die $!;
usage() if($$settings{help} || !@ARGV);
$$settings{outdir} //= "Kalamari";
$$settings{tempdir} //= tempdir("$0.XXXXXX", CLEANUP=>1, TMPDIR=>1);
$$settings{numcpus}||= 1;
$$settings{and} //= [];
$$settings{buffersize} //= 10;
logmsg "Outdir will be $$settings{outdir}";

# Check for prerequisites
Expand All @@ -37,7 +38,7 @@ sub main{
my @spreadsheet = @ARGV;

for my $s(@spreadsheet){
downloadKalamari($s, $$settings{outdir}, $settings);
my $downloadCount = downloadKalamari($s, $$settings{outdir}, $settings);
}

return 0;
Expand Down Expand Up @@ -66,13 +67,18 @@ sub downloadKalamari{
push(@queue, \%F);
}
close $fh;
@queue = sort {$$a{nuccoreAcc} cmp $$b{nuccoreAcc} } @queue;

my @thr;
my $numPerThread = ceil(scalar(@queue)/$$settings{numcpus});
for(my $i=0;$i<$$settings{numcpus};$i++){
my @subQueue = splice(@queue, 0, $numPerThread);
logmsg "Sending ".scalar(@subQueue)." entries to thread $i";
$thr[$i] = threads->new(\&downloadEntryWorker, \@subQueue, $settings);
logmsg "Sent ".scalar(@subQueue)." entries to thread ".$thr[$i]->tid;

# Offset the threads to help avoid exceeding the API
# rate limit.
sleep 1;
}

# Close out the threads
Expand All @@ -84,17 +90,21 @@ sub downloadKalamari{
push(@errors, @$errors);
}

logmsg "Done downloading for $spreadsheet";
for my $acc(@errors){
logmsg "ERROR downloading: $acc";
}
if(!@errors){
logmsg "NOTE I did not detect any missing downloads.";
}

return $download_counter;
}

sub downloadEntryWorker{
my($queue, $settings) = @_;

my $bufferSize = 30;
my $bufferSize = $$settings{buffersize} || 10;

my @fasta = ();
my @err;
Expand Down Expand Up @@ -380,17 +390,19 @@ sub usage{
print
"Usage: $0 [options] spreadsheet.tsv
--outdir '' Output directory of Kalamari database
--numcpus 1
--tempdir Directory for temporary files, if you would
but default in TMPDIR
--and (currently not used)
Download additional files. Multiple --and
flags are allowed.
Possible values: protein, nucleotide
where either protein or nucleotide will
return files with CDS entries.
E.g., $0 --and protein --and nucleotide
--outdir '' Output directory of Kalamari database
--numcpus 1 How many threads
--bufferSize 10 How many genomes to download at the same
time, per thread
--tempdir Directory for temporary files; defaults to
a new directory under TMPDIR
--and (currently not used)
Download additional files. Multiple --and
flags are allowed.
Possible values: protein, nucleotide
where either protein or nucleotide will
return files with CDS entries.
E.g., $0 --and protein --and nucleotide
";
exit 0;
}
7 changes: 0 additions & 7 deletions src/chromosomes-todo.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,4 @@ Helicobacter pullorum XXXXXX 35818 209
Helicobacter winghamensis XXXXXX 157268 209
Helicobacter valdiviensis XXXXXX 1458358 209
Caulobacter CP000927 366602 2648921
Francisella philomiragia CP000937 28110 262
Pseudomonas putida CP000949 390235 303
Yersinia pseudotuberculosis CP000950 502800 633
Polynucleobacter necessarius CP001010 576610 44013
Fusobacterium nucleatum NC_003454 851 848
Streptomyces coelicolor NC_003888 1902 1477431
Moorella thermoacetica NC_007644 1525 44260
Yersinia pseudotuberculosis NC_010465 502800 633
6 changes: 6 additions & 0 deletions src/chromosomes.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ Escherichia fergusonii CP042945 564 561
Exiguobacterium antarcticum CP003063 132920 33986
Finegoldia magna AP008971 1260 150022
Flavobacterium psychrophilum NC_009613 96345 237
Francisella philomiragia CP063138 28110 262
Francisella tularensis NC_006570 263 262
Fusobacterium nucleatum CP028101 851 848
Gallus gallus HQ857211 9031 9030
Gardnerella vaginalis NC_014644 2702 2701
Geobacter sulfurreducens NC_002939 35554 28231
Expand Down Expand Up @@ -165,6 +167,7 @@ Mesorhizobium ciceri NC_014923 39645 68287
Methylobacterium CP000943 426117 2615210
Methylobacterium radiotolerans CP001001 31998 407
Micrococcus luteus CP001628 1270 1269
Moorella thermoacetica CP012370 1525 44260
Morganella morganii morganii CP004345 180434 582
Mycobacterium abscessus NC_010397 36809 670516
Mycobacterium leprae NC_002677 1769 1763
Expand All @@ -182,9 +185,11 @@ Parabacteroides distasonis CP000140 823 375288
Photobacterium damselae CP046752 38293 657
Photobacterium damselae CP046751 38293 657
Pollachius virens FR751399 8060 8059
Polynucleobacter necessarius LT615228 576610 44013
Prochlorococcus marinus NC_005042 1219 1218
Proteus mirabilis NC_022000 584 583
Pseudomonas aeruginosa NC_002516 287 136841
Pseudomonas putida AP013070 390235 303
Pseudomonas syringae group genomosp. 3 NC_004578 251701 136849
Pyrobaculum neutrophilum CP001014 70771 2276
Rhodobacter sphaeroides NC_007494 1063 1060
Expand Down Expand Up @@ -275,5 +280,6 @@ Yersinia intermedia CP009801 631 629
Yersinia kristensenii CP054049 631 629
Yersinia massiliensis CP054048 33060 629
Yersinia mollaretii CP054043 33060 629
Yersinia pseudotuberculosis CP009712 502800 633
Yersinia rochesterensis CP032482 1604335 629
Yersinia rohdei CP009787 29485 629

0 comments on commit 45bf89d

Please sign in to comment.