Build sepia (#25)

* fixed bacillus genus back to bacteria in the plasmids * sepia building v1 * m * sepia documentation and reference generation script * m Co-authored-by: Lee Katz - Aspen <[email protected]>
lskatz · Aug 20, 2021 · 603a89a · 603a89a
1 parent 21d9e70
commit 603a89a
Show file tree

Hide file tree

Showing 2 changed files with 101 additions and 7 deletions.
diff --git a/bin/generate_sepia_reference.py b/bin/generate_sepia_reference.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+__author__  = "Henk den Bakker"
+__version__ = "1.1"
+__credits__ = ["Henk den Bakker", "Lee Katz"]
+__date__    = "2021-08-20"
+
+import os
+import argparse
+
+def main(args):
+
+  #lets check ranks in file
+  ranks = set()
+  with open('src/taxonomy/nodes.dmp') as nodes:
+      for line in nodes:
+          line = line.strip().split('\t')
+          ranks.add(line[4])
+
+
+  lineage_graph =dict()
+  ranks_dict = dict()
+  scientific_name = dict()
+  accession_taxid = dict()
+
+
+  for tsv in args.tsv:
+    with open(tsv) as data:
+        for line in data:
+            line = line.strip().split('\t')
+            accession_taxid[line[1]] = line[2]
+
+  #fill lineage_graph and ranks_dict
+
+  with open(args.taxonomy + '/nodes.dmp') as nodes:
+      for line in nodes:
+          line = line.strip().split('\t')
+          lineage_graph[int(line[0])] = int(line[2])
+          ranks_dict[int(line[0])] = line[4]
+      ranks_dict[1] = 'root'
+
+  with open(args.taxonomy + '/names.dmp') as names: #scientific
+      for line in names:
+          if 'scientific' in line:
+              line = line.strip().split('\t')
+              scientific_name[int(line[0])] = line[2]
+      scientific_name[1] = 'root'        
+
+  def get_lineage(taxid, lineage_graph):
+      lineage = [taxid]
+      ancestor = 0
+      while ancestor != 1:
+          ancestor = lineage_graph[taxid]
+          lineage.append(ancestor)
+          taxid = ancestor
+      lineage.reverse()
+      return lineage
+
+
+  paths = []
+
+  for root, dirs, files in os.walk(args.fastadir):
+      for file in files:
+          if file.endswith(".fasta"):
+              paths.append(os.path.join(root, file))
+
+  outfile = open(args.outfile, 'w')
+
+  for path in paths:
+      accession = path.split('/')[-1].split('.')[0]
+      try:
+          taxid = accession_taxid[accession]
+          lineage = get_lineage(int(taxid), lineage_graph)
+          outfile.write(path + '\t' + ';'.join([scientific_name[i] for i in lineage]) + '\n')
+      except KeyError:
+          print('Could not find accession ' + accession + '. Check if it is present in you .tsv files')
+
+  outfile.close()     
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(description="Create the references file for Sepia. This is a two-column format with file path and taxonomy string separated by semicolons.")
+
+  parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
+  parser.add_argument("--taxonomy", "-t", metavar="taxdir", help="The directory with names.dmp and nodes.dmp", required=True) 
+  parser.add_argument("--outfile", "-o", metavar="out.tsv", help="The sepia reference file", required=True) 
+  parser.add_argument("--fastadir","-f", metavar="fastadir",help="The directory containing files with extension .fasta. The directory will be iteratively parsed.", required=True)
+  parser.add_argument("tsv", metavar="kalamari.tsv", help="Kalamari tsv file(s)", nargs='+') 
+  args = parser.parse_args()
+
+  main(args)
+
diff --git a/docs/DATABASES.md b/docs/DATABASES.md
@@ -15,7 +15,7 @@ If you have not already downloaded the Kalamari fasta files, please see the main
 
     # Make a fake fastq file
     FASTQ="$FASTA.fastq.gz"
-    head -n 2 $QUERY | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ
+    head -n 2 $FASTA | perl -e '$id=<>; $seq=<>; chomp($id, $seq); $qual="I" x length($seq); $id=~s/^>/@/; print "$id\n$seq\n+\n$qual\n";' | gzip -c > $FASTQ
 
 ## Different databases
 
@@ -67,18 +67,20 @@ Please follow the Init section before continuing. These instructions assume that
 #### Build
 
     DB=kalamari_$VERSION.sepia
-    find $SRC -name '*.fasta' > kalamari.fofn
-    # Create a reference file for Sepia with two columns:
-    # path name
-    cat kalamari.fofn | perl -F'/' -lane 'chomp; ($genus,$species,$fasta)=(@F[-3,-2,-1]); $name=join("_", $genus, $species); print join("\t", $_, $name);' > kalamari.tsv 
-    sepia build --index $DB --refs kalamari.tsv --kmer 31 --batch 300 --gamma 5.0 --threads 12 --minimizer 15
+    # Create the Sepia references file with two columns: path, taxonomy
+    python3 bin/generate_sepia_reference.py --taxonomy src/taxonomy -o sepia.ref.tsv --fastadir ./Kalamari src/chromosomes.tsv src/plasmids.tsv
+    sepia build --index $DB --refs sepia.ref.tsv --kmer 41 --minimizer 31 --batch 300 --gamma 5.0 --threads $CPUS 
     ls -lhS $DB # view a directory representing the database    
 
 #### Query
 
     sepia classify --index $DB --prefix test_kalamari --query $FASTQ
-    sort -k2,2nr test_kalamari_summary.txt | head -n 20 | column -t
+    # View the summary:
     # First column is the classification, second read count and third average kmer similarity
+    sort -k2,2nr test_kalamari_summary.txt | head -n 20 | column -t
+
+    # View taxonomic classification results per read
+    zcat test_kalamari_classification.gz | head
 
 ### ColorID with Kalamari