Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build sepia #25

Merged
merged 6 commits into from
Aug 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions bin/generate_sepia_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = "Henk den Bakker"
__version__ = "1.1"
__credits__ = ["Henk den Bakker", "Lee Katz"]
__date__ = "2021-08-20"

import os
import argparse

def main(args):

#lets check ranks in file
ranks = set()
with open('src/taxonomy/nodes.dmp') as nodes:
for line in nodes:
line = line.strip().split('\t')
ranks.add(line[4])


lineage_graph =dict()
ranks_dict = dict()
scientific_name = dict()
accession_taxid = dict()


for tsv in args.tsv:
with open(tsv) as data:
for line in data:
line = line.strip().split('\t')
accession_taxid[line[1]] = line[2]

#fill lineage_graph and ranks_dict

with open(args.taxonomy + '/nodes.dmp') as nodes:
for line in nodes:
line = line.strip().split('\t')
lineage_graph[int(line[0])] = int(line[2])
ranks_dict[int(line[0])] = line[4]
ranks_dict[1] = 'root'

with open(args.taxonomy + '/names.dmp') as names: #scientific
for line in names:
if 'scientific' in line:
line = line.strip().split('\t')
scientific_name[int(line[0])] = line[2]
scientific_name[1] = 'root'

def get_lineage(taxid, lineage_graph):
lineage = [taxid]
ancestor = 0
while ancestor != 1:
ancestor = lineage_graph[taxid]
lineage.append(ancestor)
taxid = ancestor
lineage.reverse()
return lineage


paths = []

for root, dirs, files in os.walk(args.fastadir):
for file in files:
if file.endswith(".fasta"):
paths.append(os.path.join(root, file))

outfile = open(args.outfile, 'w')

for path in paths:
accession = path.split('/')[-1].split('.')[0]
try:
taxid = accession_taxid[accession]
lineage = get_lineage(int(taxid), lineage_graph)
outfile.write(path + '\t' + ';'.join([scientific_name[i] for i in lineage]) + '\n')
except KeyError:
print('Could not find accession ' + accession + '. Check if it is present in you .tsv files')

outfile.close()

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create the references file for Sepia. This is a two-column format with file path and taxonomy string separated by semicolons.")

parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
parser.add_argument("--taxonomy", "-t", metavar="taxdir", help="The directory with names.dmp and nodes.dmp", required=True)
parser.add_argument("--outfile", "-o", metavar="out.tsv", help="The sepia reference file", required=True)
parser.add_argument("--fastadir","-f", metavar="fastadir",help="The directory containing files with extension .fasta. The directory will be iteratively parsed.", required=True)
parser.add_argument("tsv", metavar="kalamari.tsv", help="Kalamari tsv file(s)", nargs='+')
args = parser.parse_args()

main(args)

16 changes: 9 additions & 7 deletions docs/DATABASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ If you have not already downloaded the Kalamari fasta files, please see the main

# Make a fake fastq file
FASTQ="$FASTA.fastq.gz"
head -n 2 $QUERY | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ
head -n 2 $FASTA | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ

## Different databases

Expand Down Expand Up @@ -67,18 +67,20 @@ Please follow the Init section before continuing. These instructions assume that
#### Build

DB=kalamari_$VERSION.sepia
find $SRC -name '*.fasta' > kalamari.fofn
# Create a reference file for Sepia with two columns:
# path name
cat kalamari.fofn | perl -F'/' -lane 'chomp; ($genus,$species,$fasta)=(@F[-3,-2,-1]); $name=join("_", $genus, $species); print join("\t", $_, $name);' > kalamari.tsv
sepia build --index $DB --refs kalamari.tsv --kmer 31 --batch 300 --gamma 5.0 --threads 12 --minimizer 15
# Create the Sepia references file with two columns: path, taxonomy
python3 bin/generate_sepia_reference.py --taxonomy src/taxonomy -o sepia.ref.tsv --fastadir ./Kalamari src/chromosomes.tsv src/plasmids.tsv
sepia build --index $DB --refs sepia.ref.tsv --kmer 41 --minimizer 31 --batch 300 --gamma 5.0 --threads $CPUS
ls -lhS $DB # view a directory representing the database

#### Query

sepia classify --index $DB --prefix test_kalamari --query $FASTQ
sort -k2,2nr test_kalamari_summary.txt | head -n 20 | column -t
# View the summary:
# First column is the classification, second read count and third average kmer similarity
sort -k2,2nr test_kalamari_summary.txt | head -n 20 | column -t

# View taxonomic classification results per read
zcat test_kalamari_classification.gz | head

### ColorID with Kalamari

Expand Down