-
-
Notifications
You must be signed in to change notification settings - Fork 74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add support for LSF #74
Conversation
Ben, export LSFManager, addprocs_lsf,master_worker_dict
# Cluster manager describing a set of workers to be started through LSF's `bsub`.
immutable LSFManager <: ClusterManager
# Number of `bsub` jobs that `launch` opens.
np::Integer
# Extra arguments interpolated into the `bsub` command line by `launch`.
flags::Cmd
# Copied by `launch` into the `count` field of every `WorkerConfig` it creates.
workers_per_proc::Integer
end
# Convenience constructors that fill in the omitted fields with their defaults
# (no extra `bsub` flags, one worker per job).
#
# The two-argument forms have to be told apart by the type of the second
# argument: two untyped two-argument methods share a single signature, so the
# later definition silently replaces the earlier one and `LSFManager(np, flags)`
# would forward the flags into the `workers_per_proc` slot. Dispatching on
# `flags::Cmd` keeps both forms usable; any non-`Cmd` second argument is still
# treated as `workers_per_proc`, exactly as before.
LSFManager(np, flags::Cmd) = LSFManager(np, flags, 1)
LSFManager(np, wpc) = LSFManager(np, ``, wpc)
LSFManager(np) = LSFManager(np, ``, 1)
# Maps a process id to the ids of the workers recorded under it.
# `launch_n_additional_processes` fills the entries; `kill` reads them to remove
# those workers before the process itself is told to exit.
global master_worker_dict=Dict(1=>[1])
# Open `manager.np` interactive `bsub` jobs and register one `WorkerConfig` per job.
#
# `params` supplies `:dir`, `:exename` and `:exeflags`. Each config is pushed onto
# `launched`, and `c` is notified after every push. Any exception raised along the
# way is caught and printed instead of being propagated to the caller.
function launch(manager::LSFManager, params::Dict, launched::Array, c::Condition)
    try
        dir = params[:dir]
        exename = params[:exename]
        exeflags = params[:exeflags]
        jobname = `julia-$(getpid())`

        # bsub -x is important, otherwise can fall on the same host as another job!
        cmd = `cd $dir ";" hostname -i "|" xargs $exename $exeflags $(worker_arg) --bind-to `
        bsub_cmd = `bsub -I -x $(manager.flags) -cwd $dir -J $jobname "$cmd"`
        info(bsub_cmd)

        # Start every job up front, then describe each one to the caller in order.
        handles = [open(bsub_cmd) for k in 1:manager.np]

        for (task, handle) in enumerate(handles)
            stream, proc = handle

            wconfig = WorkerConfig()
            wconfig.io = stream
            wconfig.userdata = Dict{Symbol, Any}(:task => task, :process => proc)
            wconfig.count = manager.workers_per_proc
            wconfig.exename = exename
            wconfig.exeflags = `$worker_arg`

            push!(launched, wconfig)
            notify(c)
        end
    catch err
        println("Error launching workers")
        println(err)
    end
end
# Nothing is tracked per worker for LSF: every `op` is ignored.
function manage(manager::LSFManager, id::Int64, config::WorkerConfig, op::Symbol)
    return nothing
end
# Ask worker `id` to exit. When workers are recorded under `id` in
# `master_worker_dict`, they are removed with `rmprocs` first.
function kill(manager::LSFManager, id::Int64, config::WorkerConfig)
    warn("When bsub head process is terminated, all workers on process will be terminated too",once=true)
    haskey(master_worker_dict, id) && rmprocs(master_worker_dict[id])
    remote_do(exit, id)
    return nothing
end
# Shorthand for `addprocs` with an `LSFManager` built from `np`, `flags` and
# `workers_per_proc`.
function addprocs_lsf(np::Integer; workers_per_proc::Int=1, flags::Cmd=``)
    manager = LSFManager(np, flags, workers_per_proc)
    return addprocs(manager)
end
# LSF-specific method of `Base.Distributed.launch_n_additional_processes`.
#
# Asks process `frompid` to start `cnt` additional workers with the executable and
# flags taken from `fromconfig`, connects to each of them, and records their ids
# under `frompid` in `master_worker_dict` so that `kill` can remove them together
# with `frompid`. Every new pid is also pushed onto `launched_q`.
#
# The method is restricted to `manager::LSFManager`: with an untyped `manager` it
# has the same signature as the method owned by Base and would replace it for
# every cluster manager in the session, not only for LSF.
function Base.Distributed.launch_n_additional_processes(manager::LSFManager, frompid, fromconfig, cnt, launched_q)
    @sync begin
        exename = get(fromconfig.exename)
        exeflags = get(fromconfig.exeflags, ``)
        cmd = `$exename $exeflags --bind-to $(fromconfig.host.value)`

        # Start from an empty list; the pids are appended as the workers connect.
        master_worker_dict[frompid] = []

        new_addresses = remotecall_fetch(Base.Distributed.launch_additional, frompid, cnt, cmd)
        for address in new_addresses
            (bind_addr, port) = address

            # Carry the connection settings of the originating worker over to the new one.
            wconfig = WorkerConfig()
            for x in [:host, :tunnel, :sshflags, :exeflags, :exename, :enable_threaded_blas]
                setfield!(wconfig, x, getfield(fromconfig, x))
            end
            wconfig.bind_addr = bind_addr
            wconfig.port = port

            # `let` gives each task its own `wconfig` binding.
            let wconfig=wconfig
                @async begin
                    pid = Base.Distributed.create_worker(manager, wconfig)
                    remote_do(Base.Distributed.redirect_output_from_additional_worker, frompid, pid, port)
                    push!(master_worker_dict[frompid], pid)
                    push!(launched_q, pid)
                end
            end
        end
    end
end
# I realize that all of this may not be necessary/appropriate for all the systems, but in case it helps someone. Cheers! |
thanks for this. it'd be really nice though if we could make a couple questions: did you try passing the also, by saying you couldn't figure out a way around |
If I debug, For reference the command is: If I manually remove the quotes and run the |
you need to quote the space in between hostname and -i. but that still might not work. --bind-to needs the address of the worker, right? so it somehow needs to be quoted so it doesn't run on the master process. LSF has an environment variable called LSB_HOSTS, which for a job with a single task, like these, contains just a single host name, that of the worker. problem is that --bind-to needs the ip address. |
I had thought about mapping |
I got it to work (with your original PR, and an almost cleaner solution) with the following: ss="";
for i=1:length(worker_arg.exec);ss=string(ss,worker_arg[i]," ");end
cmd = "cd $dir;$exename $ss $exeflags"
bsub_cmd = `bsub -I $(manager.flags) -J $jobname "$cmd"` And then passing addprocs(ClusterManagers.LSFManager(10,``),exeflags="--bind-to \$(hostname --ip-address)",dir="/tmp/") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When count > 1, rmprocs will almost bkill the job. The other processes that were launched will be killed too on the bsub side, but julia will think they are still connected.
Hence my circuitous global dict and the changes to the kill function.
to get this working we need JuliaLang/julia#29770 |
i've pushed some changes which make this work on julia 1.0.2. thanks @vchuravy ! there is one warning though that we need to take care of:
|
everything works for me, despite the "failed worker startup". if i'm reading the code correctly, that text will be printed even for successful workers. am i missing something? @raminammour can you please test? |
I do not have access to an lsf cluster at the moment. However, I had to support someone who did two weeks ago. I cherry picked the pr mentioned above, built Julia and the lsf manager worked. I did get the weird warning too, but it worked, and we were in a hurry :) |
@amitmurthy this is ready to merge. can you please review? thanks. |
Thanks @bjarthur |
closes #72