Skip to content

Commit

Permalink
Merge pull request #1126 from pyiron/add_gpu_support
Browse files Browse the repository at this point in the history
Add GPU support
  • Loading branch information
jan-janssen authored Jun 8, 2023
2 parents 615609f + 12af539 commit c034d0b
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 16 deletions.
21 changes: 9 additions & 12 deletions pyiron_base/jobs/job/extension/executable.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,32 +217,29 @@ def executable_path(self, new_path):
else:
self.storage.mpi = False

def get_input_for_subprocess_call(self, cores, threads, gpus=None):
    """
    Get the input parameters for the subprocess call to execute the job

    Args:
        cores (int): number of cores
        threads (int): number of threads
        gpus (int/None): number of gpus; only appended to the command when it is not None

    Returns:
        str/ list, boolean: executable and shell variables
    """
    if cores == 1 or not self.mpi:
        # Single core or non-MPI executable: hand over the string form of this
        # object and let the caller run it with shell=True.
        executable = self.__str__()
        shell = True
    else:
        if isinstance(self.executable_path, list):
            # Copy the list so the "+=" below never mutates self.executable_path.
            executable = self.executable_path[:]
        else:
            executable = [self.executable_path]
        # Resource counts are passed as positional string arguments.
        executable += [str(cores), str(threads)]
        if gpus is not None:
            executable += [str(gpus)]
        shell = False
    return executable, shell

Expand Down
22 changes: 21 additions & 1 deletion pyiron_base/jobs/job/extension/server/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,18 @@ class Server: # add the option to return the job id and the hold id to the serv
"""

def __init__(
self, host=None, queue=None, cores=1, threads=1, run_mode="modal", new_hdf=True
self,
host=None,
queue=None,
cores=1,
threads=1,
gpus=None,
run_mode="modal",
new_hdf=True,
):
self._cores = cores
self._threads = threads
# Keep the caller-supplied GPU count instead of discarding it; None means no GPUs requested.
self._gpus = gpus
self._run_time = None
self._memory_limit = None
self._host = self._init_host(host=host)
Expand Down Expand Up @@ -230,6 +238,14 @@ def threads(self):
def threads(self, number_of_threads):
self._threads = number_of_threads

@property
def gpus(self):
    """
    Number of GPUs assigned to the job

    Returns:
        int/None: the stored GPU count, or None if none was set
    """
    return self._gpus

@gpus.setter
def gpus(self, number_of_gpus):
    """
    Store the number of GPUs for the job

    Args:
        number_of_gpus (int/None): GPU count, stored unchanged
    """
    self._gpus = number_of_gpus

@property
def cores(self):
"""
Expand Down Expand Up @@ -447,6 +463,8 @@ def to_hdf(self, hdf, group_name=None):
hdf_dict["accept_crash"] = self.accept_crash
if len(self.additional_arguments) > 0:
hdf_dict["additional_arguments"] = self.additional_arguments
if self._gpus is not None:
    # Store the GPU count under its own key so the "accept_crash" entry is not overwritten.
    hdf_dict["gpus"] = self._gpus

if group_name is not None:
with hdf.open(group_name) as hdf_group:
Expand Down Expand Up @@ -490,6 +508,8 @@ def from_hdf(self, hdf, group_name=None):
self._threads = hdf_dict["threads"]
if "additional_arguments" in hdf_dict.keys():
self.additional_arguments = hdf_dict["additional_arguments"]
if "gpus" in hdf_dict.keys():
# Read the same key the surrounding guard checks for, not the unrelated "accept_crash" entry.
self._gpus = hdf_dict["gpus"]
self._new_hdf = hdf_dict["new_h5"] == 1

def db_entry(self):
Expand Down
15 changes: 12 additions & 3 deletions pyiron_base/jobs/job/runfunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,15 @@ def run_job_with_status_created(job):
elif job.server.run_mode.srun:
run_job_with_runmode_srun(job=job)
elif job.server.run_mode.flux:
return run_job_with_runmode_flux(job=job, executor=job.flux_executor)
if job.server.gpus is not None:
gpus_per_slot = int(job.server.gpus / job.server.cores)
else:
gpus_per_slot = None
return run_job_with_runmode_flux(
job=job,
executor=job.flux_executor,
gpus_per_slot=gpus_per_slot,
)
elif (
job.server.run_mode.non_modal
or job.server.run_mode.thread
Expand Down Expand Up @@ -443,7 +451,7 @@ def run_job_with_runmode_srun(job):
)


def run_job_with_runmode_flux(job, executor):
def run_job_with_runmode_flux(job, executor, gpus_per_slot=None):
if not flux_available:
raise ModuleNotFoundError(
"No module named 'flux'. No linux you can install flux via conda."
Expand Down Expand Up @@ -480,6 +488,7 @@ def run_job_with_runmode_flux(job, executor):
script=exeuctable_str,
num_nodes=1,
cores_per_slot=1,
gpus_per_slot=gpus_per_slot,
num_slots=job.server.cores,
)
jobspec.cwd = job.project_hdf5.working_directory
Expand Down Expand Up @@ -515,7 +524,7 @@ def execute_job_with_external_executable(job):
raise ValueError("No executable set!")
job.status.running = True
executable, shell = job.executable.get_input_for_subprocess_call(
cores=job.server.cores, threads=job.server.threads
cores=job.server.cores, threads=job.server.threads, gpus=job.server.gpus
)
job_crashed, out = False, None
try:
Expand Down

0 comments on commit c034d0b

Please sign in to comment.