Skip to content

Commit

Permalink
Building taxonomy (#38)
Browse files Browse the repository at this point in the history
* building taxonomy files but this script will be deprecated right away

* deprecated

* script to build taxonomy with src files

* m

* move old taxonomy to deprecated

* remove old 'versioned' files outside of git versioning

* filter taxonomy script

* complete the taxonomy

* updated scripts for compiling databases

* dev branch testing

* fix lmono test a bit

* .
  • Loading branch information
lskatz authored May 7, 2024
1 parent 3314965 commit 15989d9
Show file tree
Hide file tree
Showing 20 changed files with 230 additions and 340 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-testing.Listeria.Kraken1.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This is a subsampling unit test to get early results
on:
push:
branches: [master]
branches: [master, dev]
name: Listeria-with-Kraken1

env:
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/unit-testing.Listeria.Kraken2.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This is a subsampling unit test to get early results
on:
push:
branches: [master]
branches: [master, dev]
name: Listeria-with-Kraken2

env:
Expand Down Expand Up @@ -65,11 +65,14 @@ jobs:
tree ${{ env.DB }}
echo ".....Building the database....."
kraken2-build --build --db ${{ env.DB }} --threads 2
- name: Kraken2 view results
- name: check out the directory structure
run: |
export PATH=$PATH:kraken2-2.1.2/target
tree ${{ env.DB }}
ls -lhSR ${{ env.DB }}
du -shc ${{ env.DB }}/* | sort -h
- name: Kraken2 view results
run: |
export PATH=$PATH:kraken2-2.1.2/target
QUERY=$(find ${{ env.OUTDIR }} -name '*.fasta' | head -n 1)
echo "QUERY is $QUERY"
head -n 2 $QUERY
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unit-testing.Yersinia.Kraken2.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This is a subsampling unit test to get early results
on:
push:
branches: [master]
branches: [master, dev]
name: Yersinia-with-Kraken2

env:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unit-testing.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
on:
push:
branches: [master]
branches: [master, dev]
name: Pull-down-all-accessions

jobs:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/validateTaxonomy.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
on:
push:
branches: [fix-CI, master]
branches: [master, dev]
name: Validate taxonomy

jobs:
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
edirect
share
26 changes: 14 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,22 @@ using your own email address instead of `[email protected]`.

## Download instructions

For usage, run `perl bin/downloadKalamari.pl --help`
First, build the taxonomy.
The script `buildTaxonomy.sh` uses the diffs in Kalamari to enhance the default NCBI taxonomy.
Next, `filterTaxonomy.sh` reduces the taxonomy files to just those found in Kalamari.
`filterTaxonomy.sh` uses `taxonkit`, so `taxonkit` must be installed
and available on your `PATH` before you start.

SRC=Kalamari
perl bin/downloadKalamari.pl -o $SRC src/chromosomes.tsv
bash bin/buildTaxonomy.sh
bash bin/filterTaxonomy.sh

### ...with plasmids
To download the chromosomes and plasmids, pass the corresponding `.tsv` files (`src/chromosomes.tsv` and `src/plasmids.tsv`) to `downloadKalamari.pl`.
Run `downloadKalamari.pl --help` for usage.
However, to download the files to a standard location,
please simply use `downloadKalamari.sh` which uses
`downloadKalamari.pl` internally.

SRC=Kalamari
perl bin/downloadKalamari.pl -o $SRC src/chromosomes.tsv src/plasmids.tsv

### taxonomy

The taxonomy files `nodes.dmp` and `names.dmp` are under `src/taxonomy-VER`
where `VER` is the version of Kalamari.
bash bin/downloadKalamari.sh

## Database formatting instructions

Expand Down Expand Up @@ -80,4 +82,4 @@ Please see [CONTRIBUTING.md](CONTRIBUTING.md)

## Citation

Please refer to the ASM 2018 poster under docs
Please refer to the ASM 2018 poster under docs.
22 changes: 22 additions & 0 deletions bin/buildKraken1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Build a Kraken1 database from Kalamari.
#
# Usage: bash bin/buildKraken1.sh
#
# Inputs (relative to the directory holding this script):
#   ../share/kalamari-VER/kalamari            searched for *.fasta files
#   ../share/kalamari-VER/taxonomy/filtered   taxonomy copied into the database
# Output:
#   ../share/kalamari-VER/kalamari-kraken1
#
# VER is whatever `downloadKalamari.pl --version` prints, so
# downloadKalamari.pl must be on PATH, along with kraken-build and jellyfish.
#
# Environment:
#   CPUS  number of threads passed to the build step (default: 1)

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

thisdir=$(dirname "$0")
KALAMARI_VER=$(downloadKalamari.pl --version)

sharedir="$thisdir/../share/kalamari-$KALAMARI_VER"
SRC="$sharedir/kalamari"
TAXDIR="$sharedir/taxonomy/filtered"
THREADS="${CPUS:-1}"

# Test prereqs (prints the resolved path of each tool, as `which` did)
for prereq in kraken-build jellyfish; do
  command -v "$prereq" || die "$prereq was not found in PATH"
done

# Fail early with a clear message instead of partway through the build
[[ -d "$SRC" ]] || die "fasta directory not found: $SRC"
[[ -d "$TAXDIR" ]] || die "taxonomy directory not found: $TAXDIR"

DB="$sharedir/kalamari-kraken1"
mkdir -pv "$DB/taxonomy"
# Copy the *contents* of $TAXDIR. Copying the directory itself would create
# $DB/taxonomy/filtered whenever $DB/taxonomy already exists (e.g. a rerun).
cp -rv "$TAXDIR/." "$DB/taxonomy/"
find "$SRC" -name '*.fasta' \
  -exec kraken-build --db "$DB" --add-to-library {} \;
kraken-build --db "$DB" --build --threads "$THREADS"
kraken-build --db "$DB" --clean
22 changes: 22 additions & 0 deletions bin/buildKraken2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Build a Kraken2 database from Kalamari.
#
# Usage: bash bin/buildKraken2.sh
#
# Inputs (relative to the directory holding this script):
#   ../share/kalamari-VER/kalamari            searched for *.fasta files
#   ../share/kalamari-VER/taxonomy/filtered   taxonomy copied into the database
# Output:
#   ../share/kalamari-VER/kalamari-kraken2
#
# VER is whatever `downloadKalamari.pl --version` prints, so
# downloadKalamari.pl must be on PATH, along with kraken2-build and jellyfish.
#
# Environment:
#   CPUS  number of threads passed to the build step (default: 1)

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

thisdir=$(dirname "$0")
KALAMARI_VER=$(downloadKalamari.pl --version)

sharedir="$thisdir/../share/kalamari-$KALAMARI_VER"
SRC="$sharedir/kalamari"
TAXDIR="$sharedir/taxonomy/filtered"
THREADS="${CPUS:-1}"

# Test prereqs (prints the resolved path of each tool, as `which` did)
# NOTE(review): jellyfish is checked here exactly as in the Kraken1 script;
# confirm whether the Kraken2 build actually needs it.
for prereq in kraken2-build jellyfish; do
  command -v "$prereq" || die "$prereq was not found in PATH"
done

# Fail early with a clear message instead of partway through the build
[[ -d "$SRC" ]] || die "fasta directory not found: $SRC"
[[ -d "$TAXDIR" ]] || die "taxonomy directory not found: $TAXDIR"

DB="$sharedir/kalamari-kraken2"
mkdir -pv "$DB/taxonomy"
# Copy the *contents* of $TAXDIR. Copying the directory itself would create
# $DB/taxonomy/filtered whenever $DB/taxonomy already exists (e.g. a rerun).
cp -rv "$TAXDIR/." "$DB/taxonomy/"
find "$SRC" -name '*.fasta' \
  -exec kraken2-build --db "$DB" --add-to-library {} \;
kraken2-build --db "$DB" --build --threads "$THREADS"
kraken2-build --db "$DB" --clean
59 changes: 59 additions & 0 deletions bin/buildTaxonomy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Build the Kalamari taxonomy from the NCBI taxonomy dump.
#
# Usage: bash bin/buildTaxonomy.sh
#
# Steps:
#   1. Download and unpack NCBI's taxdump.tar.gz into a temporary directory.
#   2. Delete from the downloaded nodes.dmp every taxid listed in
#      src/taxonomy/build/delnodes.txt.
#   3. Append Kalamari's own nodes.dmp/names.dmp additions.
#   4. Copy any other *.dmp files from the dump alongside the result.
#
# Output: ../share/kalamari-VER/taxonomy (relative to this script), where VER
# is whatever `downloadKalamari.pl --version` prints. downloadKalamari.pl,
# curl, tar and a sed that supports -i must be available.

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

thisdir=$(dirname "$0")
thisfile=$(basename "$0")
KALAMARI_VER=$(downloadKalamari.pl --version)

# Set up some directories
tempdir=$(mktemp -d "$thisfile.XXXXXX")
# Single quotes: $tempdir is expanded (and stays quoted) when the trap fires
trap 'rm -rf -- "$tempdir"' EXIT
outdir="$thisdir/../share/kalamari-$KALAMARI_VER/taxonomy"
mkdir -pv "$outdir"

# output files
outnodes="$outdir/nodes.dmp"
outnames="$outdir/names.dmp"

# Build files
delnodes="$thisdir/../src/taxonomy/build/delnodes.txt"
addnodes="$thisdir/../src/taxonomy/build/nodes.dmp"
addnames="$thisdir/../src/taxonomy/build/names.dmp"

# Check the local build files before spending time on the download
for buildfile in "$delnodes" "$addnodes" "$addnames"; do
  [[ -r "$buildfile" ]] || die "cannot read build file: $buildfile"
done

# Source files
srcnodes="$tempdir/nodes.dmp"
srcnames="$tempdir/names.dmp"

# First, download the standard taxonomy dump tar.gz file
# --fail makes curl exit non-zero on a server-side error
curl --fail ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz > "$tempdir/taxonomy.tar.gz"
tar -C "$tempdir" -xzf "$tempdir/taxonomy.tar.gz"

# Next, build the taxonomy database.
# Remove taxids in $delnodes from the source nodes file.
# All deletions are collected into one sed script so that the (large)
# nodes file is rewritten once rather than once per taxid.
sedscript="$tempdir/delnodes.sed"
: > "$sedscript"
# The `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline
while IFS= read -r line || [[ -n "$line" ]]; do
  # If we see a comment line, skip it
  if [[ "$line" =~ ^# ]]; then
    continue
  fi
  # Skip blank lines
  if [[ -z "${line//[[:space:]]/}" ]]; then
    continue
  fi

  # Read each whitespace-separated 'word' as a taxid
  read -r -a taxids <<< "$line"
  for taxid in "${taxids[@]}"; do
    # Only plain integers may be interpolated into the sed pattern
    if [[ ! "$taxid" =~ ^[0-9]+$ ]]; then
      echo "Skipping non-numeric entry '$taxid' in $delnodes" >&2
      continue
    fi
    echo "Removing taxid $taxid from $srcnodes"
    # printf turns \t into a literal tab: delete lines starting "<taxid><TAB>"
    printf '/^%s\t/d\n' "$taxid" >> "$sedscript"
  done
done < "$delnodes"

if [[ -s "$sedscript" ]]; then
  sed -i -f "$sedscript" "$srcnodes"
fi

# Add in new nodes and names
echo "Combining NCBI taxonomy with new additions from Kalamari"
cat "$srcnodes" "$addnodes" > "$outnodes"
cat "$srcnames" "$addnames" > "$outnames"

# Copy in the rest of the source files
echo "Copying any remaining taxonomy files to the target"
for dmp in "$tempdir"/*.dmp; do
  # Never overwrite files already in the target (notably the nodes/names
  # files written above). An explicit test is used instead of `cp -n`
  # because some GNU coreutils releases exit non-zero when `cp -n` skips an
  # existing file, which would abort this script under `set -e`.
  if [[ -e "$outdir/$(basename "$dmp")" ]]; then
    continue
  fi
  cp -v "$dmp" "$outdir/"
done

echo "Output can be found in $outdir"
45 changes: 45 additions & 0 deletions bin/filterTaxonomy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Reduce the Kalamari taxonomy to the taxids that Kalamari actually uses.
#
# Usage: bash bin/filterTaxonomy.sh
#
# Reads the taxonomy in ../share/kalamari-VER/taxonomy (relative to this
# script), collects the taxids found in columns 3 and 4 of
# src/chromosomes.tsv and src/plasmids.tsv, expands each to its full lineage
# with taxonkit, and writes nodes.dmp/names.dmp restricted to those taxids
# into ../share/kalamari-VER/taxonomy/filtered. Any other *.dmp file in the
# source taxonomy is copied over unchanged.
#
# VER is whatever `downloadKalamari.pl --version` prints. Both
# downloadKalamari.pl and taxonkit must be on PATH.

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

thisdir=$(dirname "$0")
thisfile=$(basename "$0")
KALAMARI_VER=$(downloadKalamari.pl --version)

# Without this check a missing taxonkit would only surface as an empty
# taxid list further down.
command -v taxonkit > /dev/null || die "taxonkit was not found in PATH"

# Set up some directories
tempdir=$(mktemp -d "$thisfile.XXXXXX")
# Single quotes: $tempdir is expanded (and stays quoted) when the trap fires
trap 'rm -rf -- "$tempdir"' EXIT
outdir="$thisdir/../share/kalamari-$KALAMARI_VER/taxonomy/filtered"
srcdir="$thisdir/../share/kalamari-$KALAMARI_VER/taxonomy"
mkdir -pv "$outdir"

# output files
outnodes="$outdir/nodes.dmp"
outnames="$outdir/names.dmp"

# source taxonomy
srcnodes="$srcdir/nodes.dmp"
srcnames="$srcdir/names.dmp"

# source leaf taxids: columns 3 and 4 of the accession tables, one value per
# line, minus any line containing "taxid" (the header).
# awk is used for line filtering because, unlike grep, it does not exit
# non-zero when nothing is selected (which would trip pipefail silently).
taxid=$(
  cut -f 3,4 "$thisdir/../src/chromosomes.tsv" "$thisdir/../src/plasmids.tsv" \
    | awk '!/taxid/' \
    | tr '\t' '\n' \
    | sort -n \
    | uniq
)
[[ -n "$taxid" ]] || die "no taxids found in chromosomes.tsv/plasmids.tsv"

# Getting all necessary taxids: every taxid on the lineage of each leaf
alltaxids=$(
  printf '%s\n' "$taxid" \
    | taxonkit --data-dir="$srcdir" lineage -t \
    | cut -f 3 \
    | tr ';' '\n' \
    | awk 'length($0) > 0' \
    | sort -n \
    | uniq
)
# An empty list must be fatal: it would otherwise produce a filter that
# keeps nothing (or, with an empty regex, everything).
[[ -n "$alltaxids" ]] || die "taxonkit returned no lineage taxids"

# Count lines (one taxid per line), not bytes
numtaxids=$(wc -l <<< "$alltaxids")
echo "found $numtaxids taxids after calculating each taxon's lineage"

# Filter nodes.dmp and names.dmp for $alltaxids
echo "Finding all filtered taxids in $srcnodes"
# The taxids go through a file and an awk lookup table rather than one giant
# alternation regex on the command line, which can exceed the operating
# system's per-argument size limit as the taxid list grows.
taxidfile="$tempdir/taxids.txt"
printf '%s\n' "$alltaxids" > "$taxidfile"

# Keep lines whose first tab-delimited field is one of the wanted taxids
for pair in "$srcnodes:$outnodes" "$srcnames:$outnames"; do
  src="${pair%%:*}"
  out="${pair#*:}"
  awk -F '\t' 'NR == FNR { keep[$1]; next } ($1 in keep)' \
    "$taxidfile" "$src" > "$out"
  # Mirror grep's behavior of failing when nothing matched
  [[ -s "$out" ]] || die "no taxids from the lineage list were found in $src"
done

# Copy in the rest of the source files
echo "Copying any remaining taxonomy files to the target"
for dmp in "$srcdir"/*.dmp; do
  # Never overwrite files already in the target (notably the filtered
  # nodes/names files written above). An explicit test is used instead of
  # `cp -n` because some GNU coreutils releases exit non-zero when `cp -n`
  # skips an existing file, which would abort this script under `set -e`.
  if [[ -e "$outdir/$(basename "$dmp")" ]]; then
    continue
  fi
  cp -v "$dmp" "$outdir/"
done
44 changes: 17 additions & 27 deletions docs/DATABASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,26 @@ If you don't really want to read any of this, then there is
a script for you.
This script downloads and formats for Kraken1 and Kraken2.

bash bin/buildTaxonomy.sh
bash bin/filterTaxonomy.sh # optional, reduces footprint
bash bin/downloadKalamari.sh

If you want to know more details, move onto the next section.

## Init

Start off with a few environment variables, regardless of your target database.
If you have not already downloaded the Kalamari fasta files, please see the main [README](../README.md) file.

# assuming source folder is "Kalamari", where you downloaded all fasta files
VERSION=5.0 # or whichever version you are building
VERSION=$(downloadKalamari.pl --version)
CPUS=4 # Define how many threads to use
SRC=Kalamari # The folder where fasta files were downloaded
# The folder where fasta files were downloaded
sharedir=Kalamari/share/kalamari-$VERSION
SRC="$sharedir/kalamari"
TAXDIR="$sharedir/taxonomy/filtered"

# Find a test fasta file for querying if you don't have one already
FASTA=$(find $SRC -name '*.fasta' | head -n 1)

# Make a fake fastq file if you don't already have a test fastq file
FASTQ="$FASTA.fastq.gz"
head -n 2 $FASTA | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ
Expand All @@ -36,18 +40,11 @@ Please follow the Init section before continuing. These instructions assume that

#### Build

DB=kraken1.kalamari_$VERSION

mkdir -pv $DB
cp -rv src/taxonomy $DB/taxonomy
find $SRC -name '*.fasta' -exec kraken-build --db $DB --add-to-library {} \;
kraken-build --db $DB --build --threads $CPUS
# Optional: reduce the size of the database folder
kraken-build --db $DB --clean
du -shc $DB # view final size of database
bash bin/buildKraken1.sh

#### Query

DB="$sharedir/kalamari-kraken1"
# fasta input
kraken --db $DB --output kraken.raw --fasta-input $FASTA
# fastq input
Expand All @@ -57,18 +54,11 @@ Please follow the Init section before continuing. These instructions assume that

#### Build

DB=kraken2.kalamari_$VERSION

mkdir -pv $DB
cp -rv src/taxonomy $DB/taxonomy
find $SRC -name '*.fasta' -exec kraken2-build --db $DB --add-to-library {} \;
kraken2-build --db $DB --build --threads $CPUS
# Optional: reduce the size of the database folder
kraken2-build --db $DB --clean
du -shc $DB # view final size of database
bash bin/buildKraken2.sh

#### Query

DB="$sharedir/kalamari-kraken2"
# Same command for either fasta or fastq
kraken2 --db $DB --report kraken2.report --use-mpa-style --output kraken2.raw $FASTA
kraken2 --db $DB --report kraken2.report --use-mpa-style --output kraken2.raw $FASTQ
Expand All @@ -77,7 +67,7 @@ Please follow the Init section before continuing. These instructions assume that

#### Build

DB=kalamari_$VERSION.sepia
DB=$sharedir/kalamari-sepia
# Create the Sepia references file with two columns: path, taxonomy
python3 bin/generate_sepia_reference.py --taxonomy src/taxonomy -o sepia.ref.tsv --fastadir ./Kalamari src/chromosomes.tsv src/plasmids.tsv
sepia build --index $DB --refs sepia.ref.tsv --kmer 41 --minimizer 31 --batch 300 --gamma 5.0 --threads $CPUS
Expand All @@ -99,7 +89,7 @@ Please follow the Init section before continuing. These instructions assume that

Using Mash version 2

DB=Kalamari.msh
DB=$sharedir/kalamari.msh

find $SRC -name '*.fasta' -exec mash sketch {} \;
find $SRC -name '*.msh' > $DB.fofn
Expand All @@ -109,7 +99,7 @@ Using Mash version 2

#### Build

DB=Kalamari.blast
DB=$sharedir/Kalamari-blast
mkdir $DB

find $SRC -name '*.fasta' -exec cat {} \; > $DB/kalamari.fasta
Expand All @@ -124,7 +114,7 @@ Using Mash version 2
#### Build

# Can use the same folder; just add file of filename
DB=$SRC/reference.fofn
DB=$sharedir/reference.fofn
find $SRC -name '*.fasta' > $DB

#### Query
Expand All @@ -135,7 +125,7 @@ Using Mash version 2

#### Build

DB=$SRC.mmseqs2
DB=$sharedir/kalamari-mmseqs2
find $SRC -name '*.fasta' | xargs -n 100 gzip -c > $SRC.cat.gz
mmseqs createdb $SRC.cat.gz $DB

Expand Down
Loading

0 comments on commit 15989d9

Please sign in to comment.