Skip to content

Commit

Permalink
Validate taxonomy (#43)
Browse files Browse the repository at this point in the history
* validateTaxonomy update for just taxdirs; add 1 for filtered taxonomy; added DEBUG option for downloadKalamari.sh

* updated unit tests

* updated unit tests

* remove taxonomy stuff from downloadKalamari.sh

* fix validateTaxonomy syscall

* check on filtered tax in unit test
  • Loading branch information
lskatz authored May 17, 2024
1 parent 9104f76 commit 68f3fff
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 113 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/unit-testing.Listeria.Kraken1.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This is a subsampling unit test to get early results
on:
push:
branches: [master, dev, build-taxonomy]
branches: [master, dev, validate-taxonomy]
name: Listeria-with-Kraken1

env:
Expand Down Expand Up @@ -63,7 +63,9 @@ jobs:
column -ts $'\t' ${{ env.TSV }}
hexdump -c ${{ env.TSV }}
- name: download
run: perl Kalamari/bin/downloadKalamari.pl --outdir ${{ env.OUTDIR }} ${{ env.TSV }}
run: |
perl Kalamari/bin/downloadKalamari.pl --outdir ${{ env.OUTDIR }} ${{ env.TSV }}
find ${{ env.OUTDIR }} -name '*.fasta.gz' | xargs gunzip -v
- name: check-results
run: tree ${{ env.OUTDIR }}
- name: install kraken
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/unit-testing.Yersinia.Kraken2.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This is a subsampling unit test to get early results
on:
push:
branches: [master, dev, build-taxonomy]
branches: [master, dev, validate-taxonomy]
name: Genera-with-Kraken2

env:
Expand Down Expand Up @@ -45,7 +45,9 @@ jobs:
column -ts $'\t' ${{ env.TSV }}
hexdump -c ${{ env.TSV }}
- name: download
run: perl Kalamari/bin/downloadKalamari.pl --outdir ${{ matrix.GENUS }} ${{ env.TSV }}
run: |
perl Kalamari/bin/downloadKalamari.pl --outdir ${{ matrix.GENUS }} ${{ env.TSV }}
find ${{ matrix.GENUS }} -name '*.fasta.gz' | xargs gunzip -v
- name: check-results
run: |
tree ${{ matrix.GENUS }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/unit-testing.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
on:
push:
branches: [master, dev]
branches: [master, dev, validate-taxonomy]
name: Pull-down-all-accessions

jobs:
Expand Down Expand Up @@ -61,6 +61,6 @@ jobs:
run: tree kalamari.out
- name: check-file-sizes
run: |
find kalamari.out -name '*.fasta' > fasta.sizes
find kalamari.out -name '*.fasta.gz' > fasta.sizes
#cat fasta.sizes | xargs -n 100 ls -lhS
12 changes: 8 additions & 4 deletions .github/workflows/validateTaxonomy.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
on:
push:
branches: [master, dev, build-taxonomy]
branches: [master, dev, validate-taxonomy]
name: Validate taxonomy

jobs:
Expand Down Expand Up @@ -31,10 +31,14 @@ jobs:
run: |
echo $PATH
bash Kalamari/bin/buildTaxonomy.sh
bash Kalamari/bin/filterTaxonomy.sh
ls -lhR Kalamari/share/kalamari-*/taxonomy
#- name: validate taxonomy
# run: |
# perl Kalamari/bin/validateTaxonomy.pl Kalamari/share/kalamari-*/taxonomy/nodes.dmp Kalamari/share/kalamari-*/taxonomy/names.dmp
- name: validate taxonomy
run: |
perl Kalamari/bin/validateTaxonomy.pl Kalamari/share/kalamari-*/taxonomy
- name: validate filtered taxonomy
run: |
perl Kalamari/bin/validateTaxonomy.pl Kalamari/share/kalamari-*/taxonomy/filtered
- name: matching taxids
run: |
export taxdir=$(\ls -d Kalamari/share/kalamari-*/taxonomy)
Expand Down
2 changes: 1 addition & 1 deletion bin/downloadKalamari.pl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
use IO::Compress::Gzip;
use version 0.77;

our $VERSION = version->parse("5.5.1");
our $VERSION = version->parse("5.6.0");

use threads;

Expand Down
43 changes: 7 additions & 36 deletions bin/downloadKalamari.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,45 +33,16 @@ if [[ ${KALAMARI_DEBUG:-} ]]; then
echo "DEBUG" >&2
mv $TSV $TSV.tmp
head -n 1 $TSV.tmp > $TSV
grep -m 5 Salmonella >> $TSV
grep -m 5 Listeria >> $TSV
grep -m 5 Escherichia>> $TSV
grep -m 5 Campylobacter >> $TSV
grep -m 5 Salmonella $TSV.tmp >> $TSV
grep -m 5 Listeria $TSV.tmp >> $TSV
grep -m 5 Escherichia $TSV.tmp >> $TSV
grep -m 5 Campylobacter $TSV.tmp >> $TSV
grep -m 5 Vibrio $TSV.tmp >> $TSV
grep -m 5 Legionella $TSV.tmp >> $TSV
fi

function build_kraken1(){
in_dir=$1
DB=$2

rm -rvf $DB
mkdir -v $DB
cp -rv $tempdir/taxonomy $DB/
find $in_dir -name '*.fasta' -exec kraken-build -db $DB \
--add-to-library {} \;
kraken-build --db $DB --build --threads 1
kraken-build --db $DB --clean
du -shc $DB

echo "DONE. Set KRAKEN_DEFAULT_DB=$(realpath $DB)"
}

function build_kraken2(){
in_dir=$1
DB=$2

rm -rvf $DB
mkdir -v $DB
cp -rv $tempdir/taxonomy $DB/
find $in_dir -name '*.fasta' -exec kraken2-build -db $DB \
--add-to-library {} \;
kraken2-build --db $DB --build --threads 1
kraken2-build --db $DB --clean
du -shc $DB
echo "DONE. Set KRAKEN2_DEFAULT_DB=$(realpath $DB)"
}

perl $thisdir/downloadKalamari.pl $TSV \
--outdir $tempdir/kalamari
--outdir $tempdir/kalamari --buffersize 100

rm -rf $outdir_prefix
mkdir -v $outdir_prefix
Expand Down
1 change: 1 addition & 0 deletions bin/filterTaxonomy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ taxid=$(cut -f 3,4 $thisdir/../src/chromosomes.tsv $thisdir/../src/plasmids.tsv

# Getting all necessary taxids
alltaxids=$(echo "$taxid" | taxonkit --data-dir=$srcdir lineage -t | cut -f 3 | tr ';' '\n' | grep . | sort -n | uniq)
alltaxids=$'1\n'"$alltaxids"
numtaxids=$(wc -c <<< $alltaxids)
echo "found $numtaxids taxids after calculating each taxon's lineage"

Expand Down
80 changes: 14 additions & 66 deletions bin/validateTaxonomy.pl
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ sub main{
GetOptions($settings,qw(help debug-fasta)) or die $!;
die usage() if($$settings{help} || !@ARGV);

for my $krakendir (@ARGV){
my $is_valid = validateTaxonomy($krakendir, $settings);
logmsg "Valid $krakendir? $is_valid";
for my $dir (@ARGV){
my $is_valid = validateTaxonomy($dir, $settings);
logmsg "Valid $dir? $is_valid";
}

return 0;
Expand All @@ -28,14 +28,11 @@ sub main{

# Return 1 if the taxonomy is good and 0 if not
sub validateTaxonomy{
my($krakendir, $settings) = @_;
my($taxdir, $settings) = @_;

my $taxtree_valid = 1;
my $seqs_have_taxids = 1;

my $taxdir = "$krakendir/taxonomy";
my $libdir = "$krakendir/library";

my $names = readDmp("$taxdir/names.dmp", $settings);
my $nodes = readDmp("$taxdir/nodes.dmp", $settings);

Expand All @@ -44,76 +41,28 @@ sub validateTaxonomy{
while(my($taxid, $taxinfo) = each(%$nodes)){
my $parent = $$taxinfo[0];

#if($parent == 1){
# die Dumper $taxid,$taxinfo,$$names{$taxid},$$nodes{$parent};
#}

# Die with a useful message if the parent node is not present
# and if the parent node is not 1 or 0
if(! $$nodes{$parent} && $parent > 1){
if(! $$nodes{$parent}){
logmsg "ERROR: could not find node $parent which is the parent of $taxid";
$taxtree_valid = 0;
}

# Find matching entries in names.dmp
if($taxid > 1 && !$$names{$taxid}){
if(!$$names{$taxid}){
logmsg "ERROR: could not find an entry in names.dmp for $taxid";
$taxtree_valid = 0;
}
if($parent > 1 && !$$names{$parent} ){
if(!$$names{$parent} ){
logmsg "ERROR: could not find an entry in names.dmp for $parent";
$taxtree_valid = 0;
}
# TODO check the parents up the chain
}

# Validate that every fasta entry has a taxonomy representation
my %fastadir = ();
my @fasta = ();
logmsg "Finding all fasta files in $libdir";
find({wanted => sub{
print STDERR ".";
if($$settings{'debug-fasta'} && @fasta > 5){
logmsg "DEBUG";
goto END_FIND;
}
push(@fasta, $File::Find::name);
}, no_chdir=>1, follow => 0, }, $libdir);
# Exit from the find function to here if we are debugging
END_FIND:
{
print STDERR "\n";
}

logmsg "Validating fasta files against the taxonomy";
for my $file(sort @fasta){
next if(-d $file);
next if($file !~ /\.f\w+$/);

# Get the taxid
open(my $fh, $file) or die "ERROR: could not read from $file: $!";
while(<$fh>){
next if(!/^>/);
chomp;
my($seqname, $krakenColonTaxid, $taxid) = split /\|/;
$seqname =~ s/^>//;
if($krakenColonTaxid ne "kraken:taxid"){
logmsg "Warning: middle field is not kraken:taxid for seq $seqname. Full line is '$_'";
}

if(!$taxid){
logmsg "Warning: no taxid for $seqname in $file";
}

if(!$$nodes{$taxid}){
logmsg "Warning: taxid $taxid is not represented in nodes.dmp ($file)";
$seqs_have_taxids = 0;
}
if(!$$names{$taxid}){
logmsg "Warning: taxid $taxid is not represented in names.dmp ($file)";
$seqs_have_taxids = 0;
}
}
close $fh;
}

my $is_good = $taxtree_valid && $seqs_have_taxids;
my $is_good = $taxtree_valid;
return $is_good;
}

Expand All @@ -133,9 +82,8 @@ sub readDmp{
}

sub usage{
print "Validate a kraken database for taxonomy
Usage: $0 krakendb1 [krakendb2...]
--debug-fasta Just look at 5 fasta entries
print "Validate a taxonomy folder with dmp files
Usage: $0 folder [folder2...]
";
exit 0;
}
Expand Down

0 comments on commit 68f3fff

Please sign in to comment.