Building taxonomy (#38)

* building taxonomy files but this script will be deprecated right away * deprecated * script to build taxonomy with src files * m * move old taxonomy to deprecated * remove old 'versioned' files outside of git versioning * filter taxonomy script * complete the taxonomy * updated scripts for compiling databases * dev branch testing * fix lmono test a bit * .
lskatz · May 7, 2024 · 15989d9 · 15989d9
1 parent 3314965
commit 15989d9
Show file tree

Hide file tree

Showing 20 changed files with 230 additions and 340 deletions.
diff --git a/.github/workflows/unit-testing.Listeria.Kraken1.yml b/.github/workflows/unit-testing.Listeria.Kraken1.yml
@@ -1,7 +1,7 @@
 # This is a subsampling unit test to get early results
 on: 
   push:
-    branches: [master]
+    branches: [master, dev]
 name: Listeria-with-Kraken1
 
 env:

diff --git a/.github/workflows/unit-testing.Listeria.Kraken2.yml b/.github/workflows/unit-testing.Listeria.Kraken2.yml
@@ -1,7 +1,7 @@
 # This is a subsampling unit test to get early results
 on: 
   push:
-    branches: [master]
+    branches: [master, dev]
 name: Listeria-with-Kraken2
 
 env:
@@ -65,11 +65,14 @@ jobs:
           tree ${{ env.DB }}
           echo ".....Building the database....."
           kraken2-build --build --db ${{ env.DB }} --threads 2
-      - name: Kraken2 view results
+      - name: check out the directory structure
         run:  |
-          export PATH=$PATH:kraken2-2.1.2/target
           tree ${{ env.DB }}
           ls -lhSR ${{ env.DB }}
+          du -shc ${{ env.DB }}/* | sort -h
+      - name: Kraken2 view results
+        run:  |
+          export PATH=$PATH:kraken2-2.1.2/target
           QUERY=$(find ${{ env.OUTDIR }} -name '*.fasta' | head -n 1)
           echo "QUERY is $QUERY"
           head -n 2 $QUERY

diff --git a/.github/workflows/unit-testing.Yersinia.Kraken2.yml b/.github/workflows/unit-testing.Yersinia.Kraken2.yml
@@ -1,7 +1,7 @@
 # This is a subsampling unit test to get early results
 on: 
   push:
-    branches: [master]
+    branches: [master, dev]
 name: Yersinia-with-Kraken2
 
 env:

diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml
@@ -1,6 +1,6 @@
 on: 
   push:
-    branches: [master]
+    branches: [master, dev]
 name: Pull-down-all-accessions
 
 jobs:

diff --git a/.github/workflows/validateTaxonomy.yml b/.github/workflows/validateTaxonomy.yml
@@ -1,6 +1,6 @@
 on: 
   push:
-    branches: [fix-CI, master]
+    branches: [master, dev]
 name: Validate taxonomy
 
 jobs:

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+edirect
+share
diff --git a/README.md b/README.md
@@ -39,20 +39,22 @@ using your own email address instead of `[email protected]`.
 
 ## Download instructions
 
-For usage, run `perl bin/downloadKalamari.pl --help`
+First, build the taxonomy.
+The script `buildTaxonomy.sh` uses the diffs in Kalamari to enhance the default NCBI taxonomy.
+Next, `filterTaxonomy.sh` reduces the taxonomy files to just those found in Kalamari.
+`filterTaxonomy.sh` uses `taxonkit` and so this needs to be in your
+environment before starting.
 
-    SRC=Kalamari
-    perl bin/downloadKalamari.pl -o $SRC src/chromosomes.tsv
+    bash bin/buildTaxonomy.sh
+    bash bin/filterTaxonomy.sh
 
-### ...with plasmids
+To download the chromosomes and plasmids, pass `src/chromosomes.tsv` and `src/plasmids.tsv`, respectively, to `downloadKalamari.pl`.
+Run `downloadKalamari.pl --help` for usage.
+However, to download the files to a standard location,
+please simply use `downloadKalamari.sh` which uses
+`downloadKalamari.pl` internally.
 
-    SRC=Kalamari
-    perl bin/downloadKalamari.pl -o $SRC src/chromosomes.tsv src/plasmids.tsv
-
-### taxonomy
-
-The taxonomy files `nodes.dmp` and `names.dmp` are under `src/taxonomy-VER` 
-where `VER` is the version of Kalamari.
+    bash bin/downloadKalamari.sh
 
 ## Database formatting instructions
 
@@ -80,4 +82,4 @@ Please see [CONTRIBUTING.md](CONTRIBUTING.md)
 
 ## Citation
 
-Please refer to the ASM 2018 poster under docs
+Please refer to the ASM 2018 poster under docs.
diff --git a/bin/buildKraken1.sh b/bin/buildKraken1.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Build a Kraken1 database from the Kalamari fasta files and the filtered taxonomy under share/.
+set -eu
+
+thisdir=$(dirname "$0")
+KALAMARI_VER=$(downloadKalamari.pl --version)
+
+sharedir="$thisdir/../share/kalamari-$KALAMARI_VER"
+SRC="$sharedir/kalamari"
+TAXDIR="$sharedir/taxonomy/filtered"
+
+# Test prereqs: under `set -e`, a missing tool aborts the script here
+which kraken-build
+which jellyfish
+
+DB="$sharedir/kalamari-kraken1"
+mkdir -pv "$DB/taxonomy"
+cp -rv "$TAXDIR/." "$DB/taxonomy/"  # copy the contents so a re-run does not nest taxonomy/filtered
+find "$SRC" -name '*.fasta' \
+  -exec kraken-build --db "$DB" --add-to-library {} \;
+kraken-build --db "$DB" --build --threads 1
+kraken-build --db "$DB" --clean
diff --git a/bin/buildKraken2.sh b/bin/buildKraken2.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Build a Kraken2 database from the Kalamari fasta files and the filtered taxonomy under share/.
+set -eu
+
+thisdir=$(dirname "$0")
+KALAMARI_VER=$(downloadKalamari.pl --version)
+
+sharedir="$thisdir/../share/kalamari-$KALAMARI_VER"
+SRC="$sharedir/kalamari"
+TAXDIR="$sharedir/taxonomy/filtered"
+
+# Test prereqs: under `set -e`, a missing tool aborts the script here
+which kraken2-build
+# (no jellyfish check: unlike Kraken1, Kraken2 does not depend on it)
+
+DB="$sharedir/kalamari-kraken2"
+mkdir -pv "$DB/taxonomy"
+cp -rv "$TAXDIR/." "$DB/taxonomy/"  # copy the contents so a re-run does not nest taxonomy/filtered
+find "$SRC" -name '*.fasta' \
+  -exec kraken2-build --db "$DB" --add-to-library {} \;
+kraken2-build --db "$DB" --build --threads 1
+kraken2-build --db "$DB" --clean
diff --git a/bin/buildTaxonomy.sh b/bin/buildTaxonomy.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Download the NCBI taxonomy dump and merge Kalamari's removals/additions into share/.../taxonomy.
+set -eu
+
+thisdir=$(dirname "$0")
+thisfile=$(basename "$0")
+KALAMARI_VER=$(downloadKalamari.pl --version)
+
+# Set up some directories (the temp dir is created in the current working directory)
+tempdir=$(mktemp -d "$thisfile.XXXXXX")
+trap 'rm -rf "$tempdir"' EXIT
+outdir="$thisdir/../share/kalamari-$KALAMARI_VER/taxonomy"
+mkdir -pv "$outdir"
+
+# output files
+outnodes="$outdir/nodes.dmp"
+outnames="$outdir/names.dmp"
+
+# Build files
+delnodes="$thisdir/../src/taxonomy/build/delnodes.txt"
+addnodes="$thisdir/../src/taxonomy/build/nodes.dmp"
+addnames="$thisdir/../src/taxonomy/build/names.dmp"
+
+# Source files
+srcnodes="$tempdir/nodes.dmp"
+srcnames="$tempdir/names.dmp"
+
+# First, download the standard taxonomy dump tar.gz file
+curl ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz > "$tempdir/taxonomy.tar.gz"
+tar -C "$tempdir" -xzf "$tempdir/taxonomy.tar.gz"
+
+# Next, build the taxonomy database.
+# Remove taxids in $delnodes from the source nodes file
+while read -r line; do
+    # If we see a comment line, skip it
+    if [[ "$line" =~ ^# ]]; then
+        continue
+    fi
+
+    # Read each 'word' as a taxid (so $line is deliberately left unquoted)
+    # and remove it from $srcnodes using sed /d
+    for taxid in $line; do
+        echo "Removing taxid $taxid from $srcnodes"
+        sed -i -e "/^$taxid\t/d" "$srcnodes"
+    done
+done < "$delnodes"
+
+# Add in new nodes and names
+echo "Combining NCBI taxonomy with new additions from Kalamari"
+cat "$srcnodes" "$addnodes" > "$outnodes"
+cat "$srcnames" "$addnames" > "$outnames"
+
+# Copy the remaining files without overwriting; an explicit test replaces `cp -n`, whose exit status on a skip varies by coreutils release
+echo "Copying any remaining taxonomy files to the target"
+for i in "$tempdir"/*.dmp; do
+    [[ -e "$outdir/$(basename "$i")" ]] || cp -v "$i" "$outdir/"
+done
+
+echo "Output can be found in $outdir"
diff --git a/bin/filterTaxonomy.sh b/bin/filterTaxonomy.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Reduce the built taxonomy to the taxids (and their lineages) listed in Kalamari's .tsv files.
+set -eu
+
+thisdir=$(dirname "$0")
+thisfile=$(basename "$0")
+KALAMARI_VER=$(downloadKalamari.pl --version)
+
+# Set up some directories (the temp dir is created in the current working directory)
+tempdir=$(mktemp -d "$thisfile.XXXXXX")
+trap 'rm -rf "$tempdir"' EXIT
+outdir="$thisdir/../share/kalamari-$KALAMARI_VER/taxonomy/filtered"
+srcdir="$thisdir/../share/kalamari-$KALAMARI_VER/taxonomy"
+mkdir -pv "$outdir"
+
+# output files
+outnodes="$outdir/nodes.dmp"
+outnames="$outdir/names.dmp"
+
+# source taxonomy
+srcnodes="$srcdir/nodes.dmp"
+srcnames="$srcdir/names.dmp"
+
+# source leaf taxids: columns 3 and 4 of both .tsv files, minus header lines
+taxid=$(cut -f 3,4 "$thisdir/../src/chromosomes.tsv" "$thisdir/../src/plasmids.tsv" | grep -v taxid | tr '\t' '\n' | sort -n | uniq)
+
+# Getting all necessary taxids
+alltaxids=$(echo "$taxid" | taxonkit --data-dir="$srcdir" lineage -t | cut -f 3 | tr ';' '\n' | grep . | sort -n | uniq)
+numtaxids=$(wc -l <<< "$alltaxids")  # one taxid per line, so count lines rather than bytes
+echo "found $numtaxids taxids after calculating each taxon's lineage"
+
+# Filter nodes.dmp and names.dmp for $alltaxids
+echo "Finding all filtered taxids in $srcnodes"
+# Build one alternation regex that anchors each taxid at the start of a line and
+# requires a trailing tab, so a single grep pass replaces a per-taxid loop
+regex=$(echo "$alltaxids" | perl -plane 's/(\d+)/^$1\t/' | tr '\n' '|' | sed 's/|$//');
+
+grep -E "$regex" "$srcnodes" > "$outnodes"
+grep -E "$regex" "$srcnames" > "$outnames"
+
+# Copy the remaining files without overwriting; an explicit test replaces `cp -n`, whose exit status on a skip varies by coreutils release
+echo "Copying any remaining taxonomy files to the target"
+for i in "$srcdir"/*.dmp; do
+    [[ -e "$outdir/$(basename "$i")" ]] || cp -v "$i" "$outdir/"
+done
diff --git a/docs/DATABASES.md b/docs/DATABASES.md
@@ -8,22 +8,26 @@ If you don't really want to read any of this, then there is
 a script for you.
 This script downloads and formats for Kraken1 and Kraken2.
 
+    bash bin/buildTaxonomy.sh
+    bash bin/filterTaxonomy.sh # optional, reduces footprint
     bash bin/downloadKalamari.sh
 
 If you want to know more details, move onto the next section.
 
 ## Init
 
 Start off with a few environmental variables, regardless of your target database.
-If you have not already downloaded the Kalamari fasta files, please see the main [README](../README.md) file.
 
     # assuming source folder is "Kalamari", where you downloaded all fasta files
-    VERSION=5.0  # or whichever version you are building
+    VERSION=$(downloadKalamari.pl --version)
     CPUS=4       # Define how many threads to use
-    SRC=Kalamari # The folder where fasta files were downloaded
+    # The folder where fasta files were downloaded
+    sharedir=Kalamari/share/kalamari-$VERSION
+    SRC="$sharedir/kalamari"
+    TAXDIR="$sharedir/taxonomy/filtered"
+
     # Find a test fasta file for querying if you don't have one already
     FASTA=$(find $SRC -name '*.fasta' | head -n 1) 
-
     # Make a fake fastq file if you don't already have a test fastq file
     FASTQ="$FASTA.fastq.gz"
     head -n 2 $FASTA | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ
@@ -36,18 +40,11 @@ Please follow the Init section before continuing. These instructions assume that
 
 #### Build
 
-    DB=kraken1.kalamari_$VERSION
-
-    mkdir -pv $DB
-    cp -rv src/taxonomy $DB/taxonomy
-    find $SRC -name '*.fasta' -exec kraken-build --db $DB --add-to-library {} \;
-    kraken-build --db $DB --build --threads $CPUS
-    # Optional: reduce the size of the database folder
-    kraken-build --db $DB --clean
-    du -shc $DB # view final size of database
+    bash bin/buildKraken1.sh
 
 #### Query
 
+    DB="$sharedir/kalamari-kraken1"
     # fasta input
     kraken --db kraken -output kraken.raw --fasta-input $FASTA
     # fastq input
@@ -57,18 +54,11 @@ Please follow the Init section before continuing. These instructions assume that
 
 #### Build
 
-    DB=kraken2.kalamari_$VERSION
-
-    mkdir -pv $DB
-    cp -rv src/taxonomy $DB/taxonomy
-    find $SRC -name '*.fasta' -exec kraken2-build --db $DB --add-to-library {} \;
-    kraken2-build --db $DB --build --threads $CPUS
-    # Optional: reduce the size of the database folder
-    kraken2-build --db $DB --clean
-    du -shc $DB # view final size of database
+    bash bin/buildKraken2.sh
 
 #### Query
 
+    DB="$sharedir/kalamari-kraken2"
     # Same command for either fasta or fastq
     kraken2 --db $DB --report kraken2.report --use-mpa-style --output kraken2.raw $FASTA
     kraken2 --db $DB --report kraken2.report --use-mpa-style --output kraken2.raw $FASTQ
@@ -77,7 +67,7 @@ Please follow the Init section before continuing. These instructions assume that
 
 #### Build
 
-    DB=kalamari_$VERSION.sepia
+    DB=$sharedir/kalamari-sepia
     # Create the Sepia references file with two columns: path, taxonomy
     python3 bin/generate_sepia_reference.py --taxonomy src/taxonomy -o sepia.ref.tsv --fastadir ./Kalamari src/chromosomes.tsv src/plasmids.tsv
     sepia build --index $DB --refs sepia.ref.tsv --kmer 41 --minimizer 31 --batch 300 --gamma 5.0 --threads $CPUS 
@@ -99,7 +89,7 @@ Please follow the Init section before continuing. These instructions assume that
 
 Using Mash version 2
 
-    DB=Kalamari.msh
+    DB=$sharedir/kalamari.msh
 
     find $SRC -name '*.fasta' -exec mash sketch {} \;
     find $SRC -name '*.msh' > $DB.fofn
@@ -109,7 +99,7 @@ Using Mash version 2
 
 #### Build
 
-    DB=Kalamari.blast
+    DB=$sharedir/Kalamari-blast
     mkdir $DB
 
     find $SRC -name '*.fasta' -exec cat {} \; > $DB/kalamari.fasta
@@ -124,7 +114,7 @@ Using Mash version 2
 #### Build
 
     # Can use the same folder; just add file of filename
-    DB=$SRC/reference.fofn
+    DB=$sharedir/reference.fofn
     find $SRC -name '*.fasta' > $DB
 
 #### Query
@@ -135,7 +125,7 @@ Using Mash version 2
 
 #### Build
 
-    DB=$SRC.mmseqs2
+    DB=$sharedir/kalamari-mmseqs2
     Kalamari]$ find $SRC -name '*.fasta' | xargs -n 100 gzip -c > $SRC.cat.gz
     mmseqs createdb Kalamari.cat.gz $DB