From 68f3fff9e33b3339ed3f3854b878d08fa0c85a5e Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Fri, 17 May 2024 09:58:17 -0400 Subject: [PATCH] Validate taxonomy (#43) * validateTaxonomy update for just taxdirs; add 1 for filtered taxonomy; added DEBUG option for downloadKalamari.sh * updated unit tests * updated unit tests * remove taxonomy stuff from downloadKalamari.sh * fix validateTaxonomy syscall * check on filtered tax in unit test --- .../unit-testing.Listeria.Kraken1.yml | 6 +- .../unit-testing.Yersinia.Kraken2.yml | 6 +- .github/workflows/unit-testing.yml | 4 +- .github/workflows/validateTaxonomy.yml | 12 ++- bin/downloadKalamari.pl | 2 +- bin/downloadKalamari.sh | 43 ++-------- bin/filterTaxonomy.sh | 1 + bin/validateTaxonomy.pl | 80 ++++--------------- 8 files changed, 41 insertions(+), 113 deletions(-) diff --git a/.github/workflows/unit-testing.Listeria.Kraken1.yml b/.github/workflows/unit-testing.Listeria.Kraken1.yml index 9f3235e..e06f425 100644 --- a/.github/workflows/unit-testing.Listeria.Kraken1.yml +++ b/.github/workflows/unit-testing.Listeria.Kraken1.yml @@ -1,7 +1,7 @@ # This is a subsampling unit test to get early results on: push: - branches: [master, dev, build-taxonomy] + branches: [master, dev, validate-taxonomy] name: Listeria-with-Kraken1 env: @@ -63,7 +63,9 @@ jobs: column -ts $'\t' ${{ env.TSV }} hexdump -c ${{ env.TSV }} - name: download - run: perl Kalamari/bin/downloadKalamari.pl --outdir ${{ env.OUTDIR }} ${{ env.TSV }} + run: | + perl Kalamari/bin/downloadKalamari.pl --outdir ${{ env.OUTDIR }} ${{ env.TSV }} + find ${{ env.OUTDIR }} -name '*.fasta.gz' | xargs gunzip -v - name: check-results run: tree ${{ env.OUTDIR }} - name: install kraken diff --git a/.github/workflows/unit-testing.Yersinia.Kraken2.yml b/.github/workflows/unit-testing.Yersinia.Kraken2.yml index 9a4f39d..0f7efde 100644 --- a/.github/workflows/unit-testing.Yersinia.Kraken2.yml +++ b/.github/workflows/unit-testing.Yersinia.Kraken2.yml @@ -1,7 +1,7 @@ # This is a subsampling unit test to get early results on: push: - branches: [master, dev, build-taxonomy] + branches: [master, dev, validate-taxonomy] name: Genera-with-Kraken2 env: @@ -45,7 +45,9 @@ jobs: column -ts $'\t' ${{ env.TSV }} hexdump -c ${{ env.TSV }} - name: download - run: perl Kalamari/bin/downloadKalamari.pl --outdir ${{ matrix.GENUS }} ${{ env.TSV }} + run: | + perl Kalamari/bin/downloadKalamari.pl --outdir ${{ matrix.GENUS }} ${{ env.TSV }} + find ${{ matrix.GENUS }} -name '*.fasta.gz' | xargs gunzip -v - name: check-results run: | tree ${{ matrix.GENUS }} diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 2b7fd69..2cb5dd3 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -1,6 +1,6 @@ on: push: - branches: [master, dev] + branches: [master, dev, validate-taxonomy] name: Pull-down-all-accessions jobs: @@ -61,6 +61,6 @@ jobs: run: tree kalamari.out - name: check-file-sizes run: | - find kalamari.out -name '*.fasta' > fasta.sizes + find kalamari.out -name '*.fasta.gz' > fasta.sizes #cat fasta.sizes | xargs -n 100 ls -lhS diff --git a/.github/workflows/validateTaxonomy.yml b/.github/workflows/validateTaxonomy.yml index 849dabb..5147f40 100644 --- a/.github/workflows/validateTaxonomy.yml +++ b/.github/workflows/validateTaxonomy.yml @@ -1,6 +1,6 @@ on: push: - branches: [master, dev, build-taxonomy] + branches: [master, dev, validate-taxonomy] name: Validate taxonomy jobs: @@ -31,10 +31,14 @@ jobs: run: | echo $PATH bash Kalamari/bin/buildTaxonomy.sh + bash Kalamari/bin/filterTaxonomy.sh ls -lhR Kalamari/share/kalamari-*/taxonomy - #- name: validate taxonomy - # run: | - # perl Kalamari/bin/validateTaxonomy.pl Kalamari/share/kalamari-*/taxonomy/nodes.dmp Kalamari/share/kalamari-*/taxonomy/names.dmp + - name: validate taxonomy + run: | + perl Kalamari/bin/validateTaxonomy.pl Kalamari/share/kalamari-*/taxonomy + - name: validate filtered taxonomy + run: | + perl Kalamari/bin/validateTaxonomy.pl Kalamari/share/kalamari-*/taxonomy/filtered - name: matching taxids run: | export taxdir=$(\ls -d Kalamari/share/kalamari-*/taxonomy) diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index 5b8f29e..314a3de 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -11,7 +11,7 @@ use IO::Compress::Gzip; use version 0.77; -our $VERSION = version->parse("5.5.1"); +our $VERSION = version->parse("5.6.0"); use threads; diff --git a/bin/downloadKalamari.sh b/bin/downloadKalamari.sh index 6f4ba26..71e72ea 100755 --- a/bin/downloadKalamari.sh +++ b/bin/downloadKalamari.sh @@ -33,45 +33,16 @@ if [[ ${KALAMARI_DEBUG:-} ]]; then echo "DEBUG" >&2 mv $TSV $TSV.tmp head -n 1 $TSV.tmp > $TSV - grep -m 5 Salmonella >> $TSV - grep -m 5 Listeria >> $TSV - grep -m 5 Escherichia>> $TSV - grep -m 5 Campylobacter >> $TSV + grep -m 5 Salmonella $TSV.tmp >> $TSV + grep -m 5 Listeria $TSV.tmp >> $TSV + grep -m 5 Escherichia $TSV.tmp >> $TSV + grep -m 5 Campylobacter $TSV.tmp >> $TSV + grep -m 5 Vibrio $TSV.tmp >> $TSV + grep -m 5 Legionella $TSV.tmp >> $TSV fi -function build_kraken1(){ - in_dir=$1 - DB=$2 - - rm -rvf $DB - mkdir -v $DB - cp -rv $tempdir/taxonomy $DB/ - find $in_dir -name '*.fasta' -exec kraken-build -db $DB \ - --add-to-library {} \; - kraken-build --db $DB --build --threads 1 - kraken-build --db $DB --clean - du -shc $DB - - echo "DONE. Set KRAKEN_DEFAULT_DB=$(realpath $DB)" -} - -function build_kraken2(){ - in_dir=$1 - DB=$2 - - rm -rvf $DB - mkdir -v $DB - cp -rv $tempdir/taxonomy $DB/ - find $in_dir -name '*.fasta' -exec kraken2-build -db $DB \ - --add-to-library {} \; - kraken2-build --db $DB --build --threads 1 - kraken2-build --db $DB --clean - du -shc $DB - echo "DONE. Set KRAKEN2_DEFAULT_DB=$(realpath $DB)" -} - perl $thisdir/downloadKalamari.pl $TSV \ - --outdir $tempdir/kalamari + --outdir $tempdir/kalamari --buffersize 100 rm -rf $outdir_prefix mkdir -v $outdir_prefix diff --git a/bin/filterTaxonomy.sh b/bin/filterTaxonomy.sh index e65dcb3..48fbb3f 100755 --- a/bin/filterTaxonomy.sh +++ b/bin/filterTaxonomy.sh @@ -26,6 +26,7 @@ taxid=$(cut -f 3,4 $thisdir/../src/chromosomes.tsv $thisdir/../src/plasmids.tsv # Getting all necessary taxids alltaxids=$(echo "$taxid" | taxonkit --data-dir=$srcdir lineage -t | cut -f 3 | tr ';' '\n' | grep . | sort -n | uniq) +alltaxids=$'1\n'"$alltaxids" numtaxids=$(wc -c <<< $alltaxids) echo "found $numtaxids taxids after calculating each taxon's lineage" diff --git a/bin/validateTaxonomy.pl b/bin/validateTaxonomy.pl index f0db56d..c3af352 100755 --- a/bin/validateTaxonomy.pl +++ b/bin/validateTaxonomy.pl @@ -17,9 +17,9 @@ sub main{ GetOptions($settings,qw(help debug-fasta)) or die $!; die usage() if($$settings{help} || !@ARGV); - for my $krakendir (@ARGV){ - my $is_valid = validateTaxonomy($krakendir, $settings); - logmsg "Valid $krakendir? $is_valid"; + for my $dir (@ARGV){ + my $is_valid = validateTaxonomy($dir, $settings); + logmsg "Valid $dir? $is_valid"; } return 0; @@ -28,14 +28,11 @@ sub main{ # Return 1 if the taxonomy is good and 0 if not sub validateTaxonomy{ - my($krakendir, $settings) = @_; + my($taxdir, $settings) = @_; my $taxtree_valid = 1; my $seqs_have_taxids = 1; - my $taxdir = "$krakendir/taxonomy"; - my $libdir = "$krakendir/library"; - my $names = readDmp("$taxdir/names.dmp", $settings); my $nodes = readDmp("$taxdir/nodes.dmp", $settings); @@ -44,76 +41,28 @@ sub validateTaxonomy{ while(my($taxid, $taxinfo) = each(%$nodes)){ my $parent = $$taxinfo[0]; + #if($parent == 1){ + # die Dumper $taxid,$taxinfo,$$names{$taxid},$$nodes{$parent}; + #} + # Die with a useful message if the parent node is not present - # and if the parent node is not 1 or 0 - if(! $$nodes{$parent} && $parent > 1){ + if(! $$nodes{$parent}){ logmsg "ERROR: could not find node $parent which is the parent of $taxid"; $taxtree_valid = 0; } # Find matching entries in names.dmp - if($taxid > 1 && !$$names{$taxid}){ + if(!$$names{$taxid}){ logmsg "ERROR: could not find an entry in names.dmp for $taxid"; $taxtree_valid = 0; } - if($parent > 1 && !$$names{$parent} ){ + if(!$$names{$parent} ){ logmsg "ERROR: could not find an entry in names.dmp for $parent"; $taxtree_valid = 0; } - # TODO check the parents up the chain - } - - # Validate that every fasta entry has a taxonomy representation - my %fastadir = (); - my @fasta = (); - logmsg "Finding all fasta files in $libdir"; - find({wanted => sub{ - print STDERR "."; - if($$settings{'debug-fasta'} && @fasta > 5){ - logmsg "DEBUG"; - goto END_FIND; - } - push(@fasta, $File::Find::name); - }, no_chdir=>1, follow => 0, }, $libdir); - # Exit from the find function to here if we are debugging - END_FIND: - { - print STDERR "\n"; - } - - logmsg "Validating fasta files against the taxonomy"; - for my $file(sort @fasta){ - next if(-d $file); - next if($file !~ /\.f\w+$/); - - # Get the taxid - open(my $fh, $file) or die "ERROR: could not read from $file: $!"; - while(<$fh>){ - next if(!/^>/); - chomp; - my($seqname, $krakenColonTaxid, $taxid) = split /\|/; - $seqname =~ s/^>//; - if($krakenColonTaxid ne "kraken:taxid"){ - logmsg "Warning: middle field is not kraken:taxid for seq $seqname. Full line is '$_'"; - } - - if(!$taxid){ - logmsg "Warning: no taxid for $seqname in $file"; - } - - if(!$$nodes{$taxid}){ - logmsg "Warning: taxid $taxid is not represented in nodes.dmp ($file)"; - $seqs_have_taxids = 0; - } - if(!$$names{$taxid}){ - logmsg "Warning: taxid $taxid is not represented in names.dmp ($file)"; - $seqs_have_taxids = 0; - } - } - close $fh; } - my $is_good = $taxtree_valid && $seqs_have_taxids; + my $is_good = $taxtree_valid; return $is_good; } @@ -133,9 +82,8 @@ sub readDmp{ } sub usage{ - print "Validate a kraken database for taxonomy - Usage: $0 krakendb1 [krakendb2...] - --debug-fasta Just look at 5 fasta entries + print "Validate a taxonomy folder with dmp files + Usage: $0 folder [folder2...] "; exit 0; }