-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Parallel blast improvements #59
Changes from 5 commits
ab69738
afd46b1
e95008e
f9b7093
214f892
69fd86b
2084553
4c5ab46
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,21 +6,19 @@ | |
import shlex | ||
import subprocess | ||
import sys | ||
try: | ||
from __builtin__ import open | ||
except ImportError: | ||
from builtins import open | ||
|
||
try: | ||
from collections import OrderedDict | ||
except ImportError: | ||
from ordereddict import OrderedDict | ||
from bio_pieces.compat import OrderedDict, open | ||
|
||
import sh | ||
|
||
# Statically set options for blast | ||
MAX_TARGET_SEQS = 10 | ||
BLAST_FORMAT = "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" | ||
# Users cannot have these in the other args passed | ||
STATIC_BLAST_ARGS = [ | ||
'-num_threads', '-db', '-query' | ||
] | ||
|
||
# Users cannot have these in the other args passed | ||
STATIC_DIAMOND_ARGS = [ | ||
'-t', '--threads', '-d', '--db', '-q', '--query', '--daa', '-a' | ||
] | ||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser() | ||
|
@@ -73,23 +71,39 @@ def parallel_blast(inputfile, outfile, ninst, db, blasttype, task, blastoptions) | |
None if blastx/blastp | ||
:param str blastoptions: other options to pass to blast | ||
''' | ||
if has_duplicate_args(blastoptions, STATIC_BLAST_ARGS): | ||
raise ValueError("You cannot supply any of the arguments inside of {0} as" \ | ||
" optional arguments to blast".format(STATIC_BLAST_ARGS)) | ||
blast_path = sh.which(blasttype) | ||
if blast_path is None: | ||
raise ValueError("{0} is not in your path(Maybe not installed?)".format( | ||
blasttype | ||
)) | ||
args = ['-u', '--pipe', '--block', '10', '--recstart', '>'] | ||
args = ['-u', '--pipe', '--block', '100k', '--recstart', '>'] | ||
args += generate_sshlogins(ninst) | ||
args += [blast_path] | ||
if task is not None: | ||
args += ['-task', task] | ||
args += ['-db', db, '-max_target_seqs', str(MAX_TARGET_SEQS), | ||
'-outfmt', '"'+BLAST_FORMAT+'"' | ||
] | ||
args += ['-db', db,] | ||
args += shlex.split(blastoptions) | ||
args += ['-query', '-'] | ||
cmd = sh.Command('parallel') | ||
run(cmd, args, inputfile, outfile) | ||
run(cmd, *args, _in=open(inputfile), _out=open(outfile,'w')) | ||
|
||
def has_duplicate_args(argstring, staticarglist):
    '''
    Check whether argstring uses any option listed in staticarglist

    The string is tokenized the way a shell would split it and every token is
    compared as a whole option name. A plain substring test would wrongly
    flag '-task' for containing '-t', or '-db_soft_mask' for containing '-db'.
    The '--option=value' spelling is recognised by comparing only the part
    of each token that precedes the first '='.

    :param str argstring: argument string for some command; None or an empty
        string is treated as "no arguments"
    :param list staticarglist: list of static args that should be checked to see
        if they are contained in argstring
    :returns: True if any static arg is used in argstring, otherwise False
    '''
    if not argstring:
        return False
    try:
        tokens = shlex.split(argstring)
    except ValueError:
        # Unbalanced quotes: still screen the string instead of failing here
        tokens = argstring.split()
    reserved = set(staticarglist)
    return any(token.split('=', 1)[0] in reserved for token in tokens)
|
||
def parallel_diamond(inputfile, outfile, ninst, db, task, diamondoptions): | ||
''' | ||
|
@@ -109,6 +123,9 @@ def parallel_diamond(inputfile, outfile, ninst, db, task, diamondoptions): | |
diamond -task blastx -compress 0 -db /path/nt -o outfile -query inputfile -o outfile | ||
my $cmd = "$type $task_option $options -q $query -d $db -o $out"; | ||
''' | ||
if has_duplicate_args(diamondoptions, STATIC_DIAMOND_ARGS): | ||
raise ValueError("You cannot supply any of the arguments inside of {0} as" \ | ||
" optional arguments to diamond".format(STATIC_DIAMOND_ARGS)) | ||
# This seems kinda stupid that we are just replacing cpu count for each | ||
# node with 1, but it is easier than refactoring other code to be better | ||
sshlogins = generate_sshlogins(ninst) | ||
|
@@ -118,19 +135,40 @@ def parallel_diamond(inputfile, outfile, ninst, db, task, diamondoptions): | |
dmnd_path = sh.which('diamond') | ||
if dmnd_path is None: | ||
raise ValueError("diamond is not in your path(Maybe not installed?)") | ||
args = ['-u', '--pipe', '--block', '10', '--recstart', '>', '--cat'] | ||
args += sshlogins | ||
args += [ | ||
diamond_cmd = [ | ||
dmnd_path, task, '--threads', str(ninst), '--db', db, '--query', '{}', | ||
'--compress', '0', '-a', outfile | ||
] + shlex.split(diamondoptions) | ||
cmd = sh.Command('parallel') | ||
run(cmd, args, inputfile, outfile) | ||
'-a', '{}.{#}', ';', dmnd_path, 'view', '{}.{#}.daa' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a little cryptic; I would mention the need to do |
||
] | ||
if len(sshlogins) > 2: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because sshlogins is a flat list of ['--sshlogin', 'something'] pairs, so when running locally the list looks like ['--sshlogin', '1/:'] and has length 2; a longer list means more than one login was given. |
||
args = ['-u', '--pipe', '--block', '10', '--recstart', '>', '--cat'] | ||
args += sshlogins | ||
diamond_cmd_str = ' '.join(diamond_cmd) + diamondoptions | ||
args += [diamond_cmd_str] | ||
cmd = sh.Command('parallel') | ||
run(cmd, *args, _in=open(inputfile), _out=open(outfile,'w')) | ||
else: | ||
dcmd = sh.Command('diamond') | ||
args = [task] | ||
if diamondoptions: | ||
args += shlex.split(diamondoptions) | ||
p = run( | ||
dcmd, *args, threads=ninst, db=db, query=inputfile, a=outfile | ||
) | ||
p = run( | ||
dcmd, 'view', a=outfile+'.daa', _out=open(outfile,'w') | ||
) | ||
|
||
def run(cmd, args, infile, outfile): | ||
print("[cmd] {0} {1}".format(cmd._path, ' '.join(args))) | ||
def run(cmd, *args, **kwargs): | ||
''' | ||
Runs and prints what is being run to stdout | ||
''' | ||
kwargsignore = ['_in', '_out'] | ||
kwargs_str = ' '.join(['--'+a+' '+str(v) for a,v in kwargs.items() | ||
if a not in kwargsignore]) | ||
args_str = ' '.join(args) | ||
print("[cmd] {0} {1} {2}".format(cmd._path, args_str, kwargs_str)) | ||
try: | ||
p = cmd(*args, _in=open(infile), _out=open(outfile,'w')) | ||
p = cmd(*args, **kwargs) | ||
print(p) | ||
except sh.ErrorReturnCode as e: | ||
print("There was an error") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`set(a).intersection(b)` or `filter(a.__contains__, b)` would do the same thing inline; using `has_duplicate_args` forces me to look for that function definition elsewhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
blastoptions is a string, so for both of your cases you would need to convert blastoptions to a list using shlex.split.
Typically I think this is frowned upon, as you have too many things going on in one line, making it hard to read and non-pythonic.
I'll remove the function, though, because it doesn't really have a name that tells what it is doing, and removing it also removes a test class.