Skip to content

Commit

Permalink
Build sepia (#25)
Browse files Browse the repository at this point in the history
* fixed bacillus genus back to bacteria in the plasmids

* sepia building v1

* m

* sepia documentation and reference generation script

* m

Co-authored-by: Lee Katz - Aspen <[email protected]>
  • Loading branch information
lskatz and lskatz authored Aug 20, 2021
1 parent 21d9e70 commit 603a89a
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 7 deletions.
92 changes: 92 additions & 0 deletions bin/generate_sepia_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = "Henk den Bakker"
__version__ = "1.1"
__credits__ = ["Henk den Bakker", "Lee Katz"]
__date__ = "2021-08-20"

import os
import argparse

def main(args):

#lets check ranks in file
ranks = set()
with open('src/taxonomy/nodes.dmp') as nodes:
for line in nodes:
line = line.strip().split('\t')
ranks.add(line[4])


lineage_graph =dict()
ranks_dict = dict()
scientific_name = dict()
accession_taxid = dict()


for tsv in args.tsv:
with open(tsv) as data:
for line in data:
line = line.strip().split('\t')
accession_taxid[line[1]] = line[2]

#fill lineage_graph and ranks_dict

with open(args.taxonomy + '/nodes.dmp') as nodes:
for line in nodes:
line = line.strip().split('\t')
lineage_graph[int(line[0])] = int(line[2])
ranks_dict[int(line[0])] = line[4]
ranks_dict[1] = 'root'

with open(args.taxonomy + '/names.dmp') as names: #scientific
for line in names:
if 'scientific' in line:
line = line.strip().split('\t')
scientific_name[int(line[0])] = line[2]
scientific_name[1] = 'root'

def get_lineage(taxid, lineage_graph):
lineage = [taxid]
ancestor = 0
while ancestor != 1:
ancestor = lineage_graph[taxid]
lineage.append(ancestor)
taxid = ancestor
lineage.reverse()
return lineage


paths = []

for root, dirs, files in os.walk(args.fastadir):
for file in files:
if file.endswith(".fasta"):
paths.append(os.path.join(root, file))

outfile = open(args.outfile, 'w')

for path in paths:
accession = path.split('/')[-1].split('.')[0]
try:
taxid = accession_taxid[accession]
lineage = get_lineage(int(taxid), lineage_graph)
outfile.write(path + '\t' + ';'.join([scientific_name[i] for i in lineage]) + '\n')
except KeyError:
print('Could not find accession ' + accession + '. Check if it is present in you .tsv files')

outfile.close()

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create the references file for Sepia. This is a two-column format with file path and taxonomy string separated by semicolons.")

parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
parser.add_argument("--taxonomy", "-t", metavar="taxdir", help="The directory with names.dmp and nodes.dmp", required=True)
parser.add_argument("--outfile", "-o", metavar="out.tsv", help="The sepia reference file", required=True)
parser.add_argument("--fastadir","-f", metavar="fastadir",help="The directory containing files with extension .fasta. The directory will be iteratively parsed.", required=True)
parser.add_argument("tsv", metavar="kalamari.tsv", help="Kalamari tsv file(s)", nargs='+')
args = parser.parse_args()

main(args)

16 changes: 9 additions & 7 deletions docs/DATABASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ If you have not already downloaded the Kalamari fasta files, please see the main

# Make a fake fastq file
FASTQ="$FASTA.fastq.gz"
head -n 2 $QUERY | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ
head -n 2 $FASTA | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ

## Different databases

Expand Down Expand Up @@ -67,18 +67,20 @@ Please follow the Init section before continuing. These instructions assume that
#### Build

DB=kalamari_$VERSION.sepia
find $SRC -name '*.fasta' > kalamari.fofn
# Create a reference file for Sepia with two columns:
# path name
cat kalamari.fofn | perl -F'/' -lane 'chomp; ($genus,$species,$fasta)=(@F[-3,-2,-1]); $name=join("_", $genus, $species); print join("\t", $_, $name);' > kalamari.tsv
sepia build --index $DB --refs kalamari.tsv --kmer 31 --batch 300 --gamma 5.0 --threads 12 --minimizer 15
# Create the Sepia references file with two columns: path, taxonomy
python3 bin/generate_sepia_reference.py --taxonomy src/taxonomy -o sepia.ref.tsv --fastadir ./Kalamari src/chromosomes.tsv src/plasmids.tsv
sepia build --index $DB --refs sepia.ref.tsv --kmer 41 --minimizer 31 --batch 300 --gamma 5.0 --threads $CPUS
ls -lhS $DB # view a directory representing the database

#### Query

sepia classify --index $DB --prefix test_kalamari --query $FASTQ
sort -k2,2nr test_kalamari_summary.txt | head -n 20 | column -t
# View the summary:
# First column is the classification, second read count and third average kmer similarity
sort -k2,2nr test_kalamari_summary.txt | head -n 20 | column -t

# View taxonomic classification results per read
zcat test_kalamari_classification.gz | head

### ColorID with Kalamari

Expand Down

0 comments on commit 603a89a

Please sign in to comment.