Skip to content

Commit

Permalink
Allow data dir to be independent of install dir
Browse files Browse the repository at this point in the history
This commit proposes a few changes that should make it easier to install
eggnog-mapper via Bioconda and to use it in a Galaxy installation. It
allows the data directory to be independent of the installation directory,
and it also allows a custom path to be given for the DIAMOND database.
  • Loading branch information
jj-umn committed Apr 28, 2017
1 parent 7245490 commit 622b57b
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 37 deletions.
46 changes: 30 additions & 16 deletions download_eggnog_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
import os
from argparse import ArgumentParser
from eggnogmapper.common import EGGNOG_DATABASES, DATA_PATH, HMMDB_PATH, pexists, pjoin, get_level_base_path
from eggnogmapper.common import EGGNOG_DATABASES, get_data_path, get_hmmdb_path, pexists, pjoin, get_level_base_path, set_data_path, existing_dir, get_db_present
from eggnogmapper.utils import ask, colorify

def run(cmd):
Expand All @@ -16,22 +16,22 @@ def download_hmm_database(level):
flag = '-N'
else:
flag = ''
cmd = 'mkdir -p %s; cd %s; wget %s -nH --user-agent=Mozilla/5.0 --relative -r --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off %s' %(HMMDB_PATH, HMMDB_PATH, flag, url)
cmd = 'mkdir -p %s; cd %s; wget %s -nH --user-agent=Mozilla/5.0 --relative -r --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off %s' %(get_hmmdb_path(), get_hmmdb_path(), flag, url)
run(cmd)

def download_annotations():
url = 'http://eggnogdb.embl.de/download/eggnog_4.5/eggnog-mapper-data/eggnog.db.gz'
cmd = 'cd %s && wget -nH --user-agent=Mozilla/5.0 --relative --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off -O eggnog.db.gz %s && echo Decompressing... && gunzip eggnog.db.gz' %(DATA_PATH, url)
cmd = 'cd %s && wget -nH --user-agent=Mozilla/5.0 --relative --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off -O eggnog.db.gz %s && echo Decompressing... && gunzip eggnog.db.gz' %(get_data_path(), url)
run(cmd)

def download_groups():
url = 'http://eggnogdb.embl.de/download/eggnog_4.5/eggnog-mapper-data/OG_fasta.tar.gz'
cmd = 'cd %s && wget -nH --user-agent=Mozilla/5.0 --relative --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off -O OG_fasta.tar.gz %s && echo Decompressing... && tar -zxf OG_fasta.tar.gz && rm OG_fasta.tar.gz' %(DATA_PATH, url)
cmd = 'cd %s && wget -nH --user-agent=Mozilla/5.0 --relative --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off -O OG_fasta.tar.gz %s && echo Decompressing... && tar -zxf OG_fasta.tar.gz && rm OG_fasta.tar.gz' %(get_data_path(), url)
run(cmd)

def download_diamond_db():
url = 'http://eggnogdb.embl.de/download/eggnog_4.5/eggnog-mapper-data/eggnog_proteins.dmnd.gz'
cmd = 'cd %s && wget -nH --user-agent=Mozilla/5.0 --relative --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off -O eggnog_proteins.dmnd.gz %s && echo Decompressing... && gunzip eggnog_proteins.dmnd.gz' %(DATA_PATH, url)
cmd = 'cd %s && wget -nH --user-agent=Mozilla/5.0 --relative --no-parent --reject "index.html*" --cut-dirs=4 -e robots=off -O eggnog_proteins.dmnd.gz %s && echo Decompressing... && gunzip eggnog_proteins.dmnd.gz' %(get_data_path(), url)
run(cmd)


Expand All @@ -49,44 +49,58 @@ def download_diamond_db():
parser.add_argument('-s', action="store_true", dest='simulate',
help='simulate and print commands. Nothing is downloaded')

parser.add_argument('-q', action="store_true", dest='quiet',
help='quiet_mode')

parser.add_argument("--data_dir", metavar='', type=existing_dir,
help='Directory to use for DATA_PATH.')


args = parser.parse_args()

if args.data_dir:
set_data_path(args.data_dir)

if 'all' in args.dbs:
args.dbs = EGGNOG_DATABASES

if args.force or not pexists(pjoin(DATA_PATH, 'eggnog.db')):
if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')):
if args.allyes or ask("Download main annotation database?") == 'y':
print colorify('Downloading "eggnog.db" at %s...' %DATA_PATH, 'green')
print colorify('Downloading "eggnog.db" at %s...' %get_data_path(), 'green')
download_annotations()
else:
print 'Skipping'

else:
print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')
if not args.quiet:
print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')

if args.force or not pexists(pjoin(DATA_PATH, 'OG_fasta')):
if args.force or not pexists(pjoin(get_data_path(), 'OG_fasta')):
if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
print colorify('Downloading fasta files " at %s/OG_fasta...' %DATA_PATH, 'green')
print colorify('Downloading fasta files " at %s/OG_fasta...' %get_data_path(), 'green')
download_groups()
else:
print 'Skipping'

else:
print colorify('Skipping OG_fasta/ database (already present). Use -f to force download', 'lblue')
if not args.quiet:
print colorify('Skipping OG_fasta/ database (already present). Use -f to force download', 'lblue')

if args.force or not pexists(pjoin(DATA_PATH, 'eggnog_proteins.dmnd')):
if args.force or not pexists(pjoin(get_data_path(), 'eggnog_proteins.dmnd')):
if args.allyes or ask("Download diamond database (~4GB after decompression)?") == 'y':
print colorify('Downloading fasta files " at %s/eggnog_proteins.dmnd...' %DATA_PATH, 'green')
print colorify('Downloading fasta files " at %s/eggnog_proteins.dmnd...' %get_data_path(), 'green')
download_diamond_db()
else:
print 'Skipping'
else:
print colorify('Skipping diamond database (or already present). Use -f to force download', 'lblue')
if not args.quiet:
print colorify('Skipping diamond database (or already present). Use -f to force download', 'lblue')

if set(args.dbs) != set(['none']):
if args.allyes or ask("Download %d HMM database(s): %s?"%(len(args.dbs), ','.join(args.dbs))) == 'y':
for db in args.dbs:
print colorify('Downloading %s HMM database " at %s/%s\_hmm ...' %(db, HMMDB_PATH, db), 'green')
download_hmm_database(db)
if args.force or not get_db_present(db):
print colorify('Downloading %s HMM database " at %s/%s\_hmm ...' %(db, get_hmmdb_path(), db), 'green')
download_hmm_database(db)
else:
print 'Skipping'
4 changes: 2 additions & 2 deletions eggnogmapper/annota.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time
import multiprocessing

from .common import EGGNOGDB_FILE
from .common import get_eggnogdb_file
from .utils import timeit

conn = None
Expand All @@ -15,7 +15,7 @@

def connect():
global conn, db
conn = sqlite3.connect(EGGNOGDB_FILE)
conn = sqlite3.connect(get_eggnogdb_file())
db = conn.cursor()

def get_og_annotations(ogname):
Expand Down
32 changes: 22 additions & 10 deletions eggnogmapper/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,23 @@
DIAMOND = find_executable('diamond') or pjoin(BASE_PATH, 'bin', 'diamond')

DATA_PATH = pjoin(BASE_PATH, "data")
FASTA_PATH = pjoin(DATA_PATH, "OG_fasta")
HMMDB_PATH = pjoin(DATA_PATH, "hmmdb_levels")
EGGNOGDB_FILE = pjoin(DATA_PATH, "eggnog.db")
OGLEVELS_FILE = pjoin(DATA_PATH, "og2level.tsv.gz")
EGGNOG_DMND_DB = pjoin(DATA_PATH, "eggnog_proteins.dmnd")
# Path accessors. Each one is evaluated at call time, so the value it
# returns follows whatever DATA_PATH currently holds (see set_data_path).

def get_data_path():
    """Return the data directory currently stored in DATA_PATH."""
    return DATA_PATH


def get_fasta_path():
    """Return the "OG_fasta" path inside the current data directory."""
    return pjoin(get_data_path(), "OG_fasta")


def get_hmmdb_path():
    """Return the "hmmdb_levels" path inside the current data directory."""
    return pjoin(get_data_path(), "hmmdb_levels")


def get_eggnogdb_file():
    """Return the "eggnog.db" path inside the current data directory."""
    return pjoin(get_data_path(), "eggnog.db")


def get_oglevels_file():
    """Return the "og2level.tsv.gz" path inside the current data directory."""
    return pjoin(get_data_path(), "og2level.tsv.gz")


def get_eggnog_dmnd_db():
    """Return the "eggnog_proteins.dmnd" path inside the current data directory."""
    return pjoin(get_data_path(), "eggnog_proteins.dmnd")

def set_data_path(data_path):
    """Rebind the module-level DATA_PATH to *data_path*.

    The argument is first passed through existing_dir() and whatever that
    returns is what gets stored (presumably existing_dir rejects paths that
    are not existing directories -- confirm against its definition).
    """
    global DATA_PATH
    checked_path = existing_dir(data_path)
    DATA_PATH = checked_path
    # Debugging aid: show_binaries() can be called here to print the
    # resolved tool and data locations after the switch.



def show_binaries():
for e in (HMMSEARCH, HMMSCAN, HMMSTAT, HMMPGMD, PHMMER, DIAMOND, DATA_PATH,
FASTA_PATH, HMMDB_PATH, EGGNOGDB_FILE, OGLEVELS_FILE, EGGNOG_DMND_DB):
get_fasta_path(), get_hmmdb_path(), get_eggnogdb_file(), get_oglevels_file(), get_eggnog_dmnd_db()):
print "% 65s" %e, pexists(e)

def get_call_info():
Expand Down Expand Up @@ -151,14 +159,18 @@ def get_level_base_path(level):

def get_db_info(level):
if level == 'euk':
return (pjoin(HMMDB_PATH,"euk_500/euk_500.hmm"), EGGNOG_DATABASES[level])
return (pjoin(get_hmmdb_path(),"euk_500/euk_500.hmm"), EGGNOG_DATABASES[level])
elif level == 'bact':
return (pjoin(HMMDB_PATH,"bact_50/bact_50.hmm"), EGGNOG_DATABASES[level])
return (pjoin(get_hmmdb_path(),"bact_50/bact_50.hmm"), EGGNOG_DATABASES[level])
elif level == 'arch':
return (pjoin(HMMDB_PATH,"arch_1/arch_1.hmm"), EGGNOG_DATABASES[level])
return (pjoin(get_hmmdb_path(),"arch_1/arch_1.hmm"), EGGNOG_DATABASES[level])
else:
return (pjoin(HMMDB_PATH, level+"_hmm", level + "_hmm.all_hmm"), EGGNOG_DATABASES[level])
return (pjoin(get_hmmdb_path(), level+"_hmm", level + "_hmm.all_hmm"), EGGNOG_DATABASES[level])

def get_db_present(level):
    """Tell whether the HMM database for *level* is completely present.

    The base path is taken from get_db_info(level). The database counts as
    present only if every companion file -- the base path followed by
    ".h3f", ".h3i", ".h3m", ".h3p" and ".idmap" -- exists according to
    pexists().

    Returns:
        bool: True when all five files exist, False otherwise.

    A single boolean is returned rather than the per-file list of flags,
    because a non-empty list is always truthy: callers testing
    ``not get_db_present(level)`` would otherwise never see a missing
    database.
    """
    # The second element of get_db_info()'s result is not needed here.
    dbpath, _port = get_db_info(level)
    return all(pexists(dbpath + "." + ext)
               for ext in ('h3f', 'h3i', 'h3m', 'h3p', 'idmap'))

def get_citation(addons=['hmmer']):
CITATION = """
Expand Down
27 changes: 18 additions & 9 deletions emapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def setup_hmm_search(args):
raise ValueError('Database not found')

if not args.no_refine:
if not pexists(pjoin(DATA_PATH, 'OG_fasta')):
if not pexists(pjoin(get_data_path(), 'OG_fasta')):
print colorify('Database data/OG_fasta/ not present. Use download_eggnog_database.py to fetch it', 'red')
raise ValueError('Database not found')

Expand Down Expand Up @@ -288,6 +288,7 @@ def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
score_thr = args.seed_ortholog_score
evalue_thr = args.seed_ortholog_evalue
excluded_taxa = args.excluded_taxa if args.excluded_taxa else None
dmnd_db = args.db if args.db else get_eggnog_dmnd_db()

if not DIAMOND:
raise ValueError("diamond not found in path")
Expand All @@ -297,10 +298,10 @@ def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
raw_output_file = pjoin(tempdir, uuid.uuid4().hex)
if excluded_taxa:
cmd = '%s blastp -d %s -q %s --more-sensitive --threads %s -e %f -o %s --max-target-seqs 25' %\
(DIAMOND, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr, raw_output_file)
(DIAMOND, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file)
else:
cmd = '%s blastp -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3' %\
(DIAMOND, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr, raw_output_file)
(DIAMOND, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file)

print colorify(' '+cmd, 'yellow')
status = subprocess.call(cmd, shell=True,
Expand Down Expand Up @@ -495,7 +496,7 @@ def refine_matches(fasta_file, refine_file, hits_file, args):
print colorify("Hit refinement starts now", 'green')
start_time = time.time()
og2level = dict([tuple(map(str.strip, line.split('\t')))
for line in gopen(OGLEVELS_FILE)])
for line in gopen(get_oglevels_file())])
OUT = open(refine_file, "w")

if not args.no_file_comments:
Expand Down Expand Up @@ -562,7 +563,7 @@ def process_nog_hits_file(hits_file, query_fasta, og2level, skip_queries=None,

seq = sequences[seqname]
visited_queries.add(seqname)
target_fasta = os.path.join(FASTA_PATH, level, "%s.fa" % hitname)
target_fasta = os.path.join(get_fasta_path(), level, "%s.fa" % hitname)
cmds.append([seqname, seq, target_fasta, excluded_taxa, tempdir])

if cmds:
Expand Down Expand Up @@ -857,13 +858,18 @@ def parse_args(parser):
print get_version()
sys.exit(0)

if not args.no_annot and not pexists(EGGNOGDB_FILE):
if args.data_dir:
set_data_path(args.data_dir)

if not args.no_annot and not pexists(get_eggnogdb_file()):
print colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red')
raise emapperException()

if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB):
print colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red')
raise emapperException()
if args.mode == 'diamond':
dmnd_db = args.db if args.db else get_eggnog_dmnd_db()
if not pexists(dmnd_db):
print colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red')
raise emapperException()

if args.cpu == 0:
args.cpu = multiprocessing.cpu_count()
Expand Down Expand Up @@ -943,6 +949,9 @@ def parse_args(parser):
pg_db.add_argument('--dbtype', dest="dbtype",
choices=["hmmdb", "seqdb"], default="hmmdb")

pg_db.add_argument("--data_dir", metavar='', type=existing_dir,
help='Directory to use for DATA_PATH.')

pg_db.add_argument('--qtype', choices=["hmm", "seq"], default="seq")


Expand Down

0 comments on commit 622b57b

Please sign in to comment.